From 72d1bd910a64a2c46ff2414684a3e0ebde9d32de Mon Sep 17 00:00:00 2001 From: Michael Telatynski <7t3chguy@gmail.com> Date: Tue, 23 May 2023 14:31:05 +0100 Subject: [PATCH] Switch from cheerio to DOMParser (#10929) * Add tests around feature_latex_maths * Switch from cheerio to DOMParser * strict * Iterate --- package.json | 1 - src/HtmlUtils.tsx | 21 +++----- src/editor/serialize.ts | 30 ++++------- test/HtmlUtils-test.tsx | 36 +++++++++++++ test/__snapshots__/HtmlUtils-test.tsx.snap | 6 +++ test/editor/serialize-test.ts | 40 ++++++++++++++ yarn.lock | 63 +--------------------- 7 files changed, 98 insertions(+), 99 deletions(-) diff --git a/package.json b/package.json index 60d68649bb..2fd292973d 100644 --- a/package.json +++ b/package.json @@ -68,7 +68,6 @@ "@testing-library/react-hooks": "^8.0.1", "await-lock": "^2.1.0", "blurhash": "^1.1.3", - "cheerio": "^1.0.0-rc.9", "classnames": "^2.2.6", "commonmark": "^0.30.0", "counterpart": "^0.18.6", diff --git a/src/HtmlUtils.tsx b/src/HtmlUtils.tsx index 6593a308c8..0d910bfeca 100644 --- a/src/HtmlUtils.tsx +++ b/src/HtmlUtils.tsx @@ -19,7 +19,6 @@ limitations under the License. import React, { LegacyRef, ReactElement, ReactNode } from "react"; import sanitizeHtml from "sanitize-html"; -import { load as cheerio } from "cheerio"; import classNames from "classnames"; import EMOJIBASE_REGEX from "emojibase-regex"; import { merge, split } from "lodash"; @@ -549,27 +548,19 @@ export function bodyToHtml(content: IContent, highlights: Optional, op } safeBody = sanitizeHtml(formattedBody!, sanitizeParams); - const phtml = cheerio(safeBody, { - // @ts-ignore: The `_useHtmlParser2` internal option is the - // simplest way to both parse and render using `htmlparser2`. - _useHtmlParser2: true, - decodeEntities: false, - }); - const isPlainText = phtml.html() === phtml.root().text(); + const phtml = new DOMParser().parseFromString(safeBody, "text/html"); + const isPlainText = phtml.body.innerHTML === phtml.body.textContent; isHtmlMessage = !isPlainText; if (isHtmlMessage && SettingsStore.getValue("feature_latex_maths")) { - // @ts-ignore - The types for `replaceWith` wrongly expect - // Cheerio instance to be returned. - phtml('div, span[data-mx-maths!=""]').replaceWith(function (i, e) { - return katex.renderToString(decode(phtml(e).attr("data-mx-maths")), { + [...phtml.querySelectorAll("div, span[data-mx-maths]")].forEach((e) => { + e.outerHTML = katex.renderToString(decode(e.getAttribute("data-mx-maths")), { throwOnError: false, - // @ts-ignore - `e` can be an Element, not just a Node - displayMode: e.name == "div", + displayMode: e.tagName == "DIV", output: "htmlAndMathml", }); }); - safeBody = phtml.html(); + safeBody = phtml.body.innerHTML; } } else if (highlighter) { safeBody = highlighter.applyHighlights(escapeHtml(plainBody), safeHighlights!).join(""); diff --git a/src/editor/serialize.ts b/src/editor/serialize.ts index b01719afb8..6bc45a5657 100644 --- a/src/editor/serialize.ts +++ b/src/editor/serialize.ts @@ -16,7 +16,6 @@ limitations under the License. */ import { encode } from "html-entities"; -import { load as cheerio } from "cheerio"; import escapeHtml from "escape-html"; import Markdown from "../Markdown"; @@ -133,8 +132,7 @@ export function htmlSerializeFromMdIfNeeded(md: string, { forceHTML = false } = }); }); - // make sure div tags always start on a new line, otherwise it will confuse - // the markdown parser + // make sure div tags always start on a new line, otherwise it will confuse the markdown parser md = md.replace(/(.)
{ + phtml.getElementsByTagName("code").item(i)!.textContent = e.textContent; }); // add fallback output for latex math, which should not be interpreted as markdown - phtml("div, span").each(function (i, e) { - const tex = phtml(e).attr("data-mx-maths"); + [...phtml.querySelectorAll("div, span")].forEach((e, i) => { + const tex = e.getAttribute("data-mx-maths"); if (tex) { - phtml(e).html(`${tex}`); + e.innerHTML = `${tex}`; } }); } - return phtml.html(); + return phtml.body.innerHTML; } // ensure removal of escape backslashes in non-Markdown messages if (md.indexOf("\\") > -1) { diff --git a/test/HtmlUtils-test.tsx b/test/HtmlUtils-test.tsx index d90e35c088..bca781792a 100644 --- a/test/HtmlUtils-test.tsx +++ b/test/HtmlUtils-test.tsx @@ -131,4 +131,40 @@ describe("bodyToHtml", () => { expect(asFragment()).toMatchSnapshot(); }); + + describe("feature_latex_maths", () => { + beforeEach(() => { + jest.spyOn(SettingsStore, "getValue").mockImplementation((feature) => feature === "feature_latex_maths"); + }); + + it("should render inline katex", () => { + const html = getHtml({ + body: "hello \\xi world", + msgtype: "m.text", + formatted_body: 'hello \\xi world', + format: "org.matrix.custom.html", + }); + expect(html).toMatchSnapshot(); + }); + + it("should render block katex", () => { + const html = getHtml({ + body: "hello \\xi world", + msgtype: "m.text", + formatted_body: '

hello

\\xi

world

', + format: "org.matrix.custom.html", + }); + expect(html).toMatchSnapshot(); + }); + + it("should not mangle code blocks", () => { + const html = getHtml({ + body: "hello \\xi world", + msgtype: "m.text", + formatted_body: "

hello

$\\xi$

world

", + format: "org.matrix.custom.html", + }); + expect(html).toMatchSnapshot(); + }); + }); }); diff --git a/test/__snapshots__/HtmlUtils-test.tsx.snap b/test/__snapshots__/HtmlUtils-test.tsx.snap index 0101de429e..0714120676 100644 --- a/test/__snapshots__/HtmlUtils-test.tsx.snap +++ b/test/__snapshots__/HtmlUtils-test.tsx.snap @@ -15,3 +15,9 @@ exports[`bodyToHtml should generate big emoji for an emoji-only reply to a messa `; + +exports[`bodyToHtml feature_latex_maths should not mangle code blocks 1`] = `"

hello

$\\xi$

world

"`; + +exports[`bodyToHtml feature_latex_maths should render block katex 1`] = `"

hello

ξ\\xi

world

"`; + +exports[`bodyToHtml feature_latex_maths should render inline katex 1`] = `"hello ξ\\xi world"`; diff --git a/test/editor/serialize-test.ts b/test/editor/serialize-test.ts index 25bfd17c93..ce4815658f 100644 --- a/test/editor/serialize-test.ts +++ b/test/editor/serialize-test.ts @@ -17,6 +17,7 @@ limitations under the License. import EditorModel from "../../src/editor/model"; import { htmlSerializeIfNeeded } from "../../src/editor/serialize"; import { createPartCreator } from "./mock"; +import SettingsStore from "../../src/settings/SettingsStore"; describe("editor/serialize", function () { describe("with markdown", function () { @@ -75,6 +76,7 @@ describe("editor/serialize", function () { expect(html).toBe("*hello* world < hey world!"); }); }); + describe("with plaintext", function () { it("markdown remains plaintext", function () { const pc = createPartCreator(); @@ -102,4 +104,42 @@ describe("editor/serialize", function () { expect(html).toBe("hello world"); }); }); + + describe("feature_latex_maths", () => { + beforeEach(() => { + jest.spyOn(SettingsStore, "getValue").mockImplementation((feature) => feature === "feature_latex_maths"); + }); + + it("should support inline katex", () => { + const pc = createPartCreator(); + const model = new EditorModel([pc.plain("hello $\\xi$ world")], pc); + const html = htmlSerializeIfNeeded(model, {}); + expect(html).toMatchInlineSnapshot(`"hello \\xi world"`); + }); + + it("should support block katex", () => { + const pc = createPartCreator(); + const model = new EditorModel([pc.plain("hello \n$$\\xi$$\n world")], pc); + const html = htmlSerializeIfNeeded(model, {}); + expect(html).toMatchInlineSnapshot(` + "

hello

+
\\xi
+

world

+ " + `); + }); + + it("should not mangle code blocks", () => { + const pc = createPartCreator(); + const model = new EditorModel([pc.plain("hello\n```\n$\\xi$\n```\nworld")], pc); + const html = htmlSerializeIfNeeded(model, {}); + expect(html).toMatchInlineSnapshot(` + "

hello

+
$\\xi$
+                
+

world

+ " + `); + }); + }); }); diff --git a/yarn.lock b/yarn.lock index 0d4e6252d3..6f901611a0 100644 --- a/yarn.lock +++ b/yarn.lock @@ -3028,11 +3028,6 @@ blurhash@^1.1.3: resolved "https://registry.yarnpkg.com/blurhash/-/blurhash-1.1.5.tgz#3034104cd5dce5a3e5caa871ae2f0f1f2d0ab566" integrity sha512-a+LO3A2DfxTaTztsmkbLYmUzUeApi0LZuKalwbNmqAHR6HhJGMt1qSV/R3wc+w4DL28holjqO3Bg74aUGavGjg== -boolbase@^1.0.0: - version "1.0.0" - resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e" - integrity sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww== - brace-expansion@^1.1.7: version "1.1.11" resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" @@ -3189,31 +3184,6 @@ check-more-types@^2.24.0: resolved "https://registry.yarnpkg.com/check-more-types/-/check-more-types-2.24.0.tgz#1420ffb10fd444dcfc79b43891bbfffd32a84600" integrity sha512-Pj779qHxV2tuapviy1bSZNEL1maXr13bPYpsvSDB68HlYcYuhlDrmGd63i0JHMCLKzc7rUSNIrpdJlhVlNwrxA== -cheerio-select@^2.1.0: - version "2.1.0" - resolved "https://registry.yarnpkg.com/cheerio-select/-/cheerio-select-2.1.0.tgz#4d8673286b8126ca2a8e42740d5e3c4884ae21b4" - integrity sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g== - dependencies: - boolbase "^1.0.0" - css-select "^5.1.0" - css-what "^6.1.0" - domelementtype "^2.3.0" - domhandler "^5.0.3" - domutils "^3.0.1" - -cheerio@^1.0.0-rc.9: - version "1.0.0-rc.12" - resolved "https://registry.yarnpkg.com/cheerio/-/cheerio-1.0.0-rc.12.tgz#788bf7466506b1c6bf5fae51d24a2c4d62e47683" - integrity sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q== - dependencies: - cheerio-select "^2.1.0" - dom-serializer "^2.0.0" - domhandler "^5.0.3" - domutils "^3.0.1" - htmlparser2 "^8.0.1" - parse5 "^7.0.0" - parse5-htmlparser2-tree-adapter "^7.0.0" - chokidar@^3.4.0, chokidar@^3.5.1: version "3.5.3" resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.5.3.tgz#1cf37c8707b932bd1af1ae22c0432e2acd1903bd" @@ -3519,17 +3489,6 @@ css-functions-list@^3.1.0: resolved "https://registry.yarnpkg.com/css-functions-list/-/css-functions-list-3.1.0.tgz#cf5b09f835ad91a00e5959bcfc627cd498e1321b" integrity sha512-/9lCvYZaUbBGvYUgYGFJ4dcYiyqdhSjG7IPVluoV8A1ILjkF7ilmhp1OGUz8n+nmBcu0RNrQAzgD8B6FJbrt2w== -css-select@^5.1.0: - version "5.1.0" - resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6" - integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg== - dependencies: - boolbase "^1.0.0" - css-what "^6.1.0" - domhandler "^5.0.2" - domutils "^3.0.1" - nth-check "^2.0.1" - css-tree@^2.3.1: version "2.3.1" resolved "https://registry.yarnpkg.com/css-tree/-/css-tree-2.3.1.tgz#10264ce1e5442e8572fc82fbe490644ff54b5c20" @@ -3538,11 +3497,6 @@ css-tree@^2.3.1: mdn-data "2.0.30" source-map-js "^1.0.1" -css-what@^6.1.0: - version "6.1.0" - resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4" - integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw== - css.escape@^1.5.1: version "1.5.1" resolved "https://registry.yarnpkg.com/css.escape/-/css.escape-1.5.1.tgz#42e27d4fa04ae32f931a4b4d4191fa9cddee97cb" @@ -5080,7 +5034,7 @@ html-tags@^3.3.1: resolved "https://registry.yarnpkg.com/html-tags/-/html-tags-3.3.1.tgz#a04026a18c882e4bba8a01a3d39cfe465d40b5ce" integrity sha512-ztqyC3kLto0e9WbNp0aeP+M3kTt+nbaIveGmUxAtZa+8iFgKLUOD4YKM5j+f3QD89bra7UeumolZHKuOXnTmeQ== -htmlparser2@^8.0.0, htmlparser2@^8.0.1: +htmlparser2@^8.0.0: version "8.0.2" resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-8.0.2.tgz#f002151705b383e62433b5cf466f5b716edaec21" integrity sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA== @@ -6767,13 +6721,6 @@ npm-run-path@^4.0.0, npm-run-path@^4.0.1: dependencies: path-key "^3.0.0" -nth-check@^2.0.1: - version "2.1.1" - resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-2.1.1.tgz#c9eab428effce36cd6b92c924bdb000ef1f1ed1d" - integrity sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w== - dependencies: - boolbase "^1.0.0" - nwsapi@^2.2.2: version "2.2.3" resolved "https://registry.yarnpkg.com/nwsapi/-/nwsapi-2.2.3.tgz#00e04dfd5a4a751e5ec2fecdc75dfd2f0db820fa" @@ -6982,14 +6929,6 @@ parse-srcset@^1.0.2: resolved "https://registry.yarnpkg.com/parse-srcset/-/parse-srcset-1.0.2.tgz#f2bd221f6cc970a938d88556abc589caaaa2bde1" integrity sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q== -parse5-htmlparser2-tree-adapter@^7.0.0: - version "7.0.0" - resolved "https://registry.yarnpkg.com/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz#23c2cc233bcf09bb7beba8b8a69d46b08c62c2f1" - integrity sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g== - dependencies: - domhandler "^5.0.2" - parse5 "^7.0.0" - parse5@^7.0.0, parse5@^7.1.1: version "7.1.2" resolved "https://registry.yarnpkg.com/parse5/-/parse5-7.1.2.tgz#0736bebbfd77793823240a23b7fc5e010b7f8e32"