Don't consider textual characters to be emoji (#12582)
* Don't consider textual characters to be emoji

  We were using emojibase-regex to match emoji within messages. However, the docs (https://emojibase.dev/docs/regex/) state that this regex matches both emoji and text presentation characters. This is not what we want, and will result in false positives for characters like '↔' that could turn into an emoji if paired with a variation selector.

  Unfortunately, none of the other regexes provided by Emojibase do what we want either (https://github.com/milesj/emojibase/issues/174). In the meantime, browser support for the RGI_Emoji character sequence class has made it feasible to write an emoji regex by hand, so that's what I've done.

* Add a fallback for BIGEMOJI_REGEX as well

pull/28217/head
							parent
							
								
									489bc32674
								
							
						
					
					
						commit
						c61eca8c24
					
				
							
								
								
									
										10
									
								
								.eslintrc.js
								
								
								
								
							
							
						
						
									
										10
									
								
								.eslintrc.js
								
								
								
								
							|  | @ -78,6 +78,11 @@ module.exports = { | |||
|                         name: "matrix-react-sdk/", | ||||
|                         message: "Please use matrix-react-sdk/src/index instead", | ||||
|                     }, | ||||
|                     { | ||||
|                         name: "emojibase-regex", | ||||
|                         message: | ||||
|                             "This regex doesn't actually test for emoji. See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.", | ||||
|                     }, | ||||
|                 ], | ||||
|                 patterns: [ | ||||
|                     { | ||||
|  | @ -141,6 +146,11 @@ module.exports = { | |||
|                         ], | ||||
|                         message: "Please use matrix-js-sdk/src/matrix instead", | ||||
|                     }, | ||||
|                     { | ||||
|                         group: ["emojibase-regex/emoji*"], | ||||
|                         message: | ||||
|                             "This regex doesn't actually test for emoji. See the docs at https://emojibase.dev/docs/regex/ and prefer our own EMOJI_REGEX from HtmlUtils.", | ||||
|                     }, | ||||
|                 ], | ||||
|             }, | ||||
|         ], | ||||
|  |  | |||
|  | @ -20,7 +20,6 @@ limitations under the License. | |||
| import React, { LegacyRef, ReactNode } from "react"; | ||||
| import sanitizeHtml from "sanitize-html"; | ||||
| import classNames from "classnames"; | ||||
| import EMOJIBASE_REGEX from "emojibase-regex"; | ||||
| import katex from "katex"; | ||||
| import { decode } from "html-entities"; | ||||
| import { IContent } from "matrix-js-sdk/src/matrix"; | ||||
|  | @ -46,10 +45,35 @@ const SURROGATE_PAIR_PATTERN = /([\ud800-\udbff])([\udc00-\udfff])/; | |||
| const SYMBOL_PATTERN = /([\u2100-\u2bff])/; | ||||
| 
 | ||||
| // Regex pattern for non-emoji characters that can appear in an "all-emoji" message
 | ||||
| // (Zero-Width Joiner, Zero-Width Space, Emoji presentation character, other whitespace)
 | ||||
| const EMOJI_SEPARATOR_REGEX = /[\u200D\u200B\s]|\uFE0F/g; | ||||
| // (Zero-Width Space, other whitespace)
 | ||||
| const EMOJI_SEPARATOR_REGEX = /[\u200B\s]/g; | ||||
| 
 | ||||
| const BIGEMOJI_REGEX = new RegExp(`^(${EMOJIBASE_REGEX.source})+$`, "i"); | ||||
| // Regex for emoji. This includes any RGI_Emoji sequence followed by an optional
 | ||||
| // emoji presentation VS (U+FE0F), but not those sequences that are followed by
 | ||||
| // a text presentation VS (U+FE0E). We also count lone regional indicators
 | ||||
| // (U+1F1E6-U+1F1FF). Technically this regex produces false negatives for emoji
 | ||||
| // followed by U+FE0E when the emoji doesn't have a text variant, but in
 | ||||
| // practice this doesn't matter.
 | ||||
| export const EMOJI_REGEX = (() => { | ||||
|     try { | ||||
|         // Per our support policy, v mode is available to us, but we still don't
 | ||||
|         // want the app to completely crash on older platforms. We use the
 | ||||
|         // constructor here to avoid a syntax error on such platforms.
 | ||||
|         return new RegExp("\\p{RGI_Emoji}(?!\\uFE0E)(?:(?<!\\uFE0F)\\uFE0F)?|[\\u{1f1e6}-\\u{1f1ff}]", "v"); | ||||
|     } catch (_e) { | ||||
|         // v mode not supported; fall back to matching nothing
 | ||||
|         return /(?!)/; | ||||
|     } | ||||
| })(); | ||||
| 
 | ||||
| const BIGEMOJI_REGEX = (() => { | ||||
|     try { | ||||
|         return new RegExp(`^(${EMOJI_REGEX.source})+$`, "iv"); | ||||
|     } catch (_e) { | ||||
|         // Fall back, just like for EMOJI_REGEX
 | ||||
|         return /(?!)/; | ||||
|     } | ||||
| })(); | ||||
| 
 | ||||
| /* | ||||
|  * Return true if the given string contains emoji | ||||
|  | @ -266,7 +290,7 @@ export function formatEmojis(message: string | undefined, isHtmlMessage?: boolea | |||
|     let key = 0; | ||||
| 
 | ||||
|     for (const data of graphemeSegmenter.segment(message)) { | ||||
|         if (EMOJIBASE_REGEX.test(data.segment)) { | ||||
|         if (EMOJI_REGEX.test(data.segment)) { | ||||
|             if (text) { | ||||
|                 result.push(text); | ||||
|                 text = ""; | ||||
|  |  | |||
|  | @ -15,7 +15,6 @@ limitations under the License. | |||
| */ | ||||
| 
 | ||||
| import React, { createRef, KeyboardEvent, SyntheticEvent } from "react"; | ||||
| import EMOJI_REGEX from "emojibase-regex"; | ||||
| import { | ||||
|     IContent, | ||||
|     MatrixEvent, | ||||
|  | @ -70,6 +69,7 @@ import { doMaybeLocalRoomAction } from "../../../utils/local-room"; | |||
| import { Caret } from "../../../editor/caret"; | ||||
| import { IDiff } from "../../../editor/diff"; | ||||
| import { getBlobSafeMimeType } from "../../../utils/blobs"; | ||||
| import { EMOJI_REGEX } from "../../../HtmlUtils"; | ||||
| 
 | ||||
| /** | ||||
|  * Build the mentions information based on the editor model (and any related events): | ||||
|  |  | |||
|  | @ -15,11 +15,10 @@ See the License for the specific language governing permissions and | |||
| limitations under the License. | ||||
| */ | ||||
| 
 | ||||
| import EMOJIBASE_REGEX from "emojibase-regex"; | ||||
| import { MatrixClient, RoomMember, Room } from "matrix-js-sdk/src/matrix"; | ||||
| 
 | ||||
| import AutocompleteWrapperModel, { GetAutocompleterComponent, UpdateCallback, UpdateQuery } from "./autocomplete"; | ||||
| import { unicodeToShortcode } from "../HtmlUtils"; | ||||
| import { EMOJI_REGEX, unicodeToShortcode } from "../HtmlUtils"; | ||||
| import * as Avatar from "../Avatar"; | ||||
| import defaultDispatcher from "../dispatcher/dispatcher"; | ||||
| import { Action } from "../dispatcher/actions"; | ||||
|  | @ -197,7 +196,7 @@ abstract class BasePart { | |||
| 
 | ||||
| abstract class PlainBasePart extends BasePart { | ||||
|     protected acceptsInsertion(chr: string, offset: number, inputType: string): boolean { | ||||
|         if (chr === "\n" || EMOJIBASE_REGEX.test(chr)) { | ||||
|         if (chr === "\n" || EMOJI_REGEX.test(chr)) { | ||||
|             return false; | ||||
|         } | ||||
|         // when not pasting or dropping text, reject characters that should start a pill candidate
 | ||||
|  | @ -375,7 +374,7 @@ class NewlinePart extends BasePart implements IBasePart { | |||
| 
 | ||||
| export class EmojiPart extends BasePart implements IBasePart { | ||||
|     protected acceptsInsertion(chr: string, offset: number): boolean { | ||||
|         return EMOJIBASE_REGEX.test(chr); | ||||
|         return EMOJI_REGEX.test(chr); | ||||
|     } | ||||
| 
 | ||||
|     protected acceptsRemoval(position: number, chr: string): boolean { | ||||
|  | @ -573,7 +572,7 @@ export class PartCreator { | |||
|             case "\n": | ||||
|                 return new NewlinePart(); | ||||
|             default: | ||||
|                 if (EMOJIBASE_REGEX.test(getFirstGrapheme(input))) { | ||||
|                 if (EMOJI_REGEX.test(getFirstGrapheme(input))) { | ||||
|                     return new EmojiPart(); | ||||
|                 } | ||||
|                 return new PlainPart(); | ||||
|  | @ -650,7 +649,7 @@ export class PartCreator { | |||
|         let plainText = ""; | ||||
| 
 | ||||
|         for (const data of graphemeSegmenter.segment(text)) { | ||||
|             if (EMOJIBASE_REGEX.test(data.segment)) { | ||||
|             if (EMOJI_REGEX.test(data.segment)) { | ||||
|                 if (plainText) { | ||||
|                     parts.push(this.plain(plainText)); | ||||
|                     plainText = ""; | ||||
|  |  | |||
|  | @ -107,6 +107,12 @@ describe("bodyToHtml", () => { | |||
|         expect(html).toMatchInlineSnapshot(`"<span class="mx_EventTile_searchHighlight">test</span> foo <b>bar"`); | ||||
|     }); | ||||
| 
 | ||||
|     it("generates big emoji for emoji made of multiple characters", () => { | ||||
|         const { asFragment } = render(bodyToHtml({ body: "👨👩👧👦 ↔️ 🇮🇸", msgtype: "m.text" }, [], {}) as ReactElement); | ||||
| 
 | ||||
|         expect(asFragment()).toMatchSnapshot(); | ||||
|     }); | ||||
| 
 | ||||
|     it("should generate big emoji for an emoji-only reply to a message", () => { | ||||
|         const { asFragment } = render( | ||||
|             bodyToHtml( | ||||
|  | @ -132,6 +138,12 @@ describe("bodyToHtml", () => { | |||
|         expect(asFragment()).toMatchSnapshot(); | ||||
|     }); | ||||
| 
 | ||||
|     it("does not mistake characters in text presentation mode for emoji", () => { | ||||
|         const { asFragment } = render(bodyToHtml({ body: "↔ ❗︎", msgtype: "m.text" }, [], {}) as ReactElement); | ||||
| 
 | ||||
|         expect(asFragment()).toMatchSnapshot(); | ||||
|     }); | ||||
| 
 | ||||
|     describe("feature_latex_maths", () => { | ||||
|         beforeEach(() => { | ||||
|             jest.spyOn(SettingsStore, "getValue").mockImplementation((feature) => feature === "feature_latex_maths"); | ||||
|  |  | |||
|  | @ -1,5 +1,16 @@ | |||
| // Jest Snapshot v1, https://goo.gl/fbAQLP | ||||
| 
 | ||||
| exports[`bodyToHtml does not mistake characters in text presentation mode for emoji 1`] = ` | ||||
| <DocumentFragment> | ||||
|   <span | ||||
|     class="mx_EventTile_body" | ||||
|     dir="auto" | ||||
|   > | ||||
|     ↔ ❗︎ | ||||
|   </span> | ||||
| </DocumentFragment> | ||||
| `; | ||||
| 
 | ||||
| exports[`bodyToHtml feature_latex_maths should not mangle code blocks 1`] = `"<p>hello</p><pre><code>$\\xi$</code></pre><p>world</p>"`; | ||||
| 
 | ||||
| exports[`bodyToHtml feature_latex_maths should not mangle divs 1`] = `"<p>hello</p><div>world</div>"`; | ||||
|  | @ -8,6 +19,36 @@ exports[`bodyToHtml feature_latex_maths should render block katex 1`] = `"<p>hel | |||
| 
 | ||||
| exports[`bodyToHtml feature_latex_maths should render inline katex 1`] = `"hello <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>ξ</mi></mrow><annotation encoding="application/x-tex">\\xi</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8889em;vertical-align:-0.1944em;"></span><span class="mord mathnormal" style="margin-right:0.04601em;">ξ</span></span></span></span> world"`; | ||||
| 
 | ||||
| exports[`bodyToHtml generates big emoji for emoji made of multiple characters 1`] = ` | ||||
| <DocumentFragment> | ||||
|   <span | ||||
|     class="mx_EventTile_body mx_EventTile_bigEmoji" | ||||
|     dir="auto" | ||||
|   > | ||||
|     <span | ||||
|       class="mx_Emoji" | ||||
|       title=":man-woman-girl-boy:" | ||||
|     > | ||||
|       👨👩👧👦 | ||||
|     </span> | ||||
|       | ||||
|     <span | ||||
|       class="mx_Emoji" | ||||
|       title=":left_right_arrow:" | ||||
|     > | ||||
|       ↔️ | ||||
|     </span> | ||||
|       | ||||
|     <span | ||||
|       class="mx_Emoji" | ||||
|       title=":flag-is:" | ||||
|     > | ||||
|       🇮🇸 | ||||
|     </span> | ||||
|   </span> | ||||
| </DocumentFragment> | ||||
| `; | ||||
| 
 | ||||
| exports[`bodyToHtml should generate big emoji for an emoji-only reply to a message 1`] = ` | ||||
| <DocumentFragment> | ||||
|   <span | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Robin
						Robin