Fix links being parsed as markdown links improperly (#7200)

* Fix links being parsed as markdown links improperly Fixes #4674 * Fix a typo * Fix overriding too much stuff * Fix parsing * Remove useless console.log * Remove unnecessary emph function * Properly fix tests * Add some better docs * Add missing license header
2021-11-30 19:09:05 +01:00 · 2021-11-30 19:09:05 +01:00 · e3187ed15c
parent 8fe582b094
commit e3187ed15c
3 changed files with 268 additions and 4 deletions
--- a/src/Markdown.ts
+++ b/src/Markdown.ts
@ -17,6 +17,8 @@ limitations under the License.

 import * as commonmark from 'commonmark';
 import { escape } from "lodash";
+import { logger } from 'matrix-js-sdk/src/logger';
+import * as linkify from 'linkifyjs';

 const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];

@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
    link: (node: commonmark.Node, entering: boolean) => void;
    html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
    html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
+    text: (node: commonmark.Node) => void;
+    out: (text: string) => void;
+    emph: (node: commonmark.Node) => void;
 }

 function isAllowedHtmlTag(node: commonmark.Node): boolean {
@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
    return par.firstChild != par.lastChild;
 }

+function getTextUntilEndOrLinebreak(node: commonmark.Node) {
+    let currentNode = node;
+    let text = '';
+    while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') {
+        const { literal, type } = currentNode;
+        if (type === 'text' && literal) {
+            let n = 0;
+            let char = literal[n];
+            while (char !== ' ' && char !== null && n <= literal.length) {
+                if (char === ' ') {
+                    break;
+                }
+                if (char) {
+                    text += char;
+                }
+                n += 1;
+                char = literal[n];
+            }
+            if (char === ' ') {
+                break;
+            }
+        }
+        currentNode = currentNode.next;
+    }
+    return text;
+}
+
 /**
 * Class that wraps commonmark, adding the ability to see whether
 * a given message actually uses any markdown syntax or whether
@ -70,11 +102,103 @@ export default class Markdown {
    private input: string;
    private parsed: commonmark.Node;

-    constructor(input) {
+    constructor(input: string) {
        this.input = input;

        const parser = new commonmark.Parser();
        this.parsed = parser.parse(this.input);
+        this.parsed = this.repairLinks(this.parsed);
+    }
+
+    /**
+     * This method is modifying the parsed AST in such a way that links are always
+     * properly linkified instead of sometimes being wrongly emphasised in case
+     * if you were to write a link like the example below:
+     * https://my_weird-link_domain.domain.com
+     * ^ this link would be parsed to something like this:
+     * <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
+     * This method makes it so the link gets properly modified to a version where it is
+     * not emphasised until it actually ends.
+     * See: https://github.com/vector-im/element-web/issues/4674
+     * @param parsed
+     */
+    private repairLinks(parsed: commonmark.Node) {
+        const walker = parsed.walker();
+        let event: commonmark.NodeWalkingStep = null;
+        let text = '';
+        let isInPara = false;
+        let previousNode: commonmark.Node | null = null;
+        let shouldUnlinkEmphasisNode = false;
+        while ((event = walker.next())) {
+            const { node } = event;
+            if (node.type === 'paragraph') {
+                if (event.entering) {
+                    isInPara = true;
+                } else {
+                    isInPara = false;
+                }
+            }
+            if (isInPara) {
+                // Clear saved string when line ends
+                if (
+                    node.type === 'softbreak' ||
+                    node.type === 'linebreak' ||
+                    // Also start calculating the text from the beginning on any spaces
+                    (node.type === 'text' && node.literal === ' ')
+                ) {
+                    text = '';
+                }
+                if (node.type === 'text') {
+                    text += node.literal;
+                }
+                // We should not do this if previous node was not a textnode, as we can't combine it then.
+                if (node.type === 'emph' && previousNode.type === 'text') {
+                    if (event.entering) {
+                        const foundLinks = linkify.find(text);
+                        for (const { value } of foundLinks) {
+                            if (node.firstChild.literal) {
+                                /**
+                                 * NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
+                                 * but this solution seems to work well and is hopefully slightly easier to understand too
+                                 */
+                                const nonEmphasizedText = `_${node.firstChild.literal}_`;
+                                const f = getTextUntilEndOrLinebreak(node);
+                                const newText = value + nonEmphasizedText + f;
+                                const newLinks = linkify.find(newText);
+                                // Should always find only one link here, if it finds more it means that the algorithm is broken
+                                if (newLinks.length === 1) {
+                                    const emphasisTextNode = new commonmark.Node('text');
+                                    emphasisTextNode.literal = nonEmphasizedText;
+                                    previousNode.insertAfter(emphasisTextNode);
+                                    node.firstChild.literal = '';
+                                    event = node.walker().next();
+                                    // Remove `em` opening and closing nodes
+                                    node.unlink();
+                                    previousNode.insertAfter(event.node);
+                                    shouldUnlinkEmphasisNode = true;
+                                } else {
+                                    logger.error(
+                                        "Markdown links escaping found too many links for following text: ",
+                                        text,
+                                    );
+                                    logger.error(
+                                        "Markdown links escaping found too many links for modified text: ",
+                                        newText,
+                                    );
+                                }
+                            }
+                        }
+                    } else {
+                        if (shouldUnlinkEmphasisNode) {
+                            node.unlink();
+                            shouldUnlinkEmphasisNode = false;
+                        }
+                    }
+                }
+            }
+            previousNode = node;
+        }
+        return parsed;
    }

    isPlainText(): boolean {
@ -120,9 +244,7 @@ export default class Markdown {
        // you can nest them.
        //
        // Let's try sending with <p/>s anyway for now, though.
-
        const realParagraph = renderer.paragraph;
-
        renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
            // If there is only one top level node, just return the
            // bare text: it's a single line of text and so should be
--- a/test/Markdown-test.ts
+++ b/test/Markdown-test.ts
@ -0,0 +1,143 @@
+/*
+Copyright 2021 The Matrix.org Foundation C.I.C.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+import * as linkifyjs from 'linkifyjs';
+import Markdown from "../src/Markdown";
+import matrixLinkify from '../src/linkify-matrix';
+
+beforeAll(() => {
+    // We need to call linkifier plugins before running those tests
+    matrixLinkify(linkifyjs);
+});
+
+describe("Markdown parser test", () => {
+    describe("fixing HTML links", () => {
+        const testString = [
+            "Test1:",
+            "#_foonetic_xkcd:matrix.org",
+            "http://google.com/_thing_",
+            "https://matrix.org/_matrix/client/foo/123_",
+            "#_foonetic_xkcd:matrix.org",
+            "",
+            "Test1A:",
+            "#_foonetic_xkcd:matrix.org",
+            "http://google.com/_thing_",
+            "https://matrix.org/_matrix/client/foo/123_",
+            "#_foonetic_xkcd:matrix.org",
+            "",
+            "Test2:",
+            "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
+            "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
+            "",
+            "Test3:",
+            "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
+            "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
+        ].join("\n");
+
+        it('tests that links are getting properly HTML formatted', () => {
+            /* eslint-disable max-len */
+            const expectedResult = [
+                "<p>Test1:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
+                "<p>Test1A:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
+                "<p>Test2:<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</p>",
+                "<p>Test3:<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</p>",
+                "",
+            ].join("\n");
+            /* eslint-enable max-len */
+            const md = new Markdown(testString);
+            expect(md.toHTML()).toEqual(expectedResult);
+        });
+        it('tests that links with autolinks are not touched at all and are still properly formatted', () => {
+            const test = [
+                "Test1:",
+                "<#_foonetic_xkcd:matrix.org>",
+                "<http://google.com/_thing_>",
+                "<https://matrix.org/_matrix/client/foo/123_>",
+                "<#_foonetic_xkcd:matrix.org>",
+                "",
+                "Test1A:",
+                "<#_foonetic_xkcd:matrix.org>",
+                "<http://google.com/_thing_>",
+                "<https://matrix.org/_matrix/client/foo/123_>",
+                "<#_foonetic_xkcd:matrix.org>",
+                "",
+                "Test2:",
+                "<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
+                "<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
+                "",
+                "Test3:",
+                "<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
+                "<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
+            ].join("\n");
+            /* eslint-disable max-len */
+            /**
+             * NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org
+             * but it seems to be actually working properly
+             */
+            const expectedResult = [
+                "<p>Test1:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
+                "<p>Test1A:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
+                "<p>Test2:<br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a><br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a></p>",
+                "<p>Test3:<br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a><br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a></p>",
+                "",
+            ].join("\n");
+            /* eslint-enable max-len */
+            const md = new Markdown(test);
+            expect(md.toHTML()).toEqual(expectedResult);
+        });
+
+        it('expects that links in codeblock are not modified', () => {
+            const expectedResult = [
+                '<pre><code class="language-Test1:">#_foonetic_xkcd:matrix.org',
+                'http://google.com/_thing_',
+                'https://matrix.org/_matrix/client/foo/123_',
+                '#_foonetic_xkcd:matrix.org',
+                '',
+                'Test1A:',
+                '#_foonetic_xkcd:matrix.org',
+                'http://google.com/_thing_',
+                'https://matrix.org/_matrix/client/foo/123_',
+                '#_foonetic_xkcd:matrix.org',
+                '',
+                'Test2:',
+                'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
+                'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
+                '',
+                'Test3:',
+                'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org',
+                'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```',
+                '</code></pre>',
+                '',
+            ].join('\n');
+            const md = new Markdown("```" + testString + "```");
+            expect(md.toHTML()).toEqual(expectedResult);
+        });
+
+        it('expects that links in one line will be "escaped" properly', () => {
+            /* eslint-disable max-len */
+            const testString = [
+                'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
+                'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
+            ].join('\n');
+            const expectedResult = [
+                "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
+                "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
+            ].join('<br />');
+            /* eslint-enable max-len */
+            const md = new Markdown(testString);
+            expect(md.toHTML()).toEqual(expectedResult);
+        });
+    });
+});
--- a/test/editor/deserialize-test.js
+++ b/test/editor/deserialize-test.js
@ -197,7 +197,6 @@ describe('editor/deserialize', function() {
        it('code block with no trailing text', function() {
            const html = "<pre><code>0xDEADBEEF\n</code></pre>\n";
            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
-            console.log(parts);
            expect(parts.length).toBe(5);
            expect(parts[0]).toStrictEqual({ type: "plain", text: "```" });
            expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });