Fix links being parsed as markdown links improperly (#7200)

* Fix links being parsed as markdown links improperly

Fixes #4674

* Fix a typo

* Fix overriding too much stuff

* Fix parsing

* Remove useless console.log

* Remove unnecessary emph function

* Properly fix tests

* Add some better docs

* Add missing license header
pull/21833/head
Dariusz Niemczyk 2021-11-30 19:09:05 +01:00 committed by GitHub
parent 8fe582b094
commit e3187ed15c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 268 additions and 4 deletions

View File

@ -17,6 +17,8 @@ limitations under the License.
import * as commonmark from 'commonmark';
import { escape } from "lodash";
import { logger } from 'matrix-js-sdk/src/logger';
import * as linkify from 'linkifyjs';
const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];
@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
link: (node: commonmark.Node, entering: boolean) => void;
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
text: (node: commonmark.Node) => void;
out: (text: string) => void;
emph: (node: commonmark.Node) => void;
}
function isAllowedHtmlTag(node: commonmark.Node): boolean {
@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
return par.firstChild != par.lastChild;
}
function getTextUntilEndOrLinebreak(node: commonmark.Node) {
let currentNode = node;
let text = '';
while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') {
const { literal, type } = currentNode;
if (type === 'text' && literal) {
let n = 0;
let char = literal[n];
while (char !== ' ' && char !== null && n <= literal.length) {
if (char === ' ') {
break;
}
if (char) {
text += char;
}
n += 1;
char = literal[n];
}
if (char === ' ') {
break;
}
}
currentNode = currentNode.next;
}
return text;
}
/**
* Class that wraps commonmark, adding the ability to see whether
* a given message actually uses any markdown syntax or whether
@ -70,11 +102,103 @@ export default class Markdown {
private input: string;
private parsed: commonmark.Node;
constructor(input) {
constructor(input: string) {
this.input = input;
const parser = new commonmark.Parser();
this.parsed = parser.parse(this.input);
this.parsed = this.repairLinks(this.parsed);
}
/**
* This method is modifying the parsed AST in such a way that links are always
* properly linkified instead of sometimes being wrongly emphasised in case
* if you were to write a link like the example below:
* https://my_weird-link_domain.domain.com
* ^ this link would be parsed to something like this:
* <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
* This method makes it so the link gets properly modified to a version where it is
* not emphasised until it actually ends.
* See: https://github.com/vector-im/element-web/issues/4674
* @param parsed
*/
private repairLinks(parsed: commonmark.Node) {
const walker = parsed.walker();
let event: commonmark.NodeWalkingStep = null;
let text = '';
let isInPara = false;
let previousNode: commonmark.Node | null = null;
let shouldUnlinkEmphasisNode = false;
while ((event = walker.next())) {
const { node } = event;
if (node.type === 'paragraph') {
if (event.entering) {
isInPara = true;
} else {
isInPara = false;
}
}
if (isInPara) {
// Clear saved string when line ends
if (
node.type === 'softbreak' ||
node.type === 'linebreak' ||
// Also start calculating the text from the beginning on any spaces
(node.type === 'text' && node.literal === ' ')
) {
text = '';
}
if (node.type === 'text') {
text += node.literal;
}
// We should not do this if previous node was not a textnode, as we can't combine it then.
if (node.type === 'emph' && previousNode.type === 'text') {
if (event.entering) {
const foundLinks = linkify.find(text);
for (const { value } of foundLinks) {
if (node.firstChild.literal) {
/**
* NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
* but this solution seems to work well and is hopefully slightly easier to understand too
*/
const nonEmphasizedText = `_${node.firstChild.literal}_`;
const f = getTextUntilEndOrLinebreak(node);
const newText = value + nonEmphasizedText + f;
const newLinks = linkify.find(newText);
// Should always find only one link here, if it finds more it means that the algorithm is broken
if (newLinks.length === 1) {
const emphasisTextNode = new commonmark.Node('text');
emphasisTextNode.literal = nonEmphasizedText;
previousNode.insertAfter(emphasisTextNode);
node.firstChild.literal = '';
event = node.walker().next();
// Remove `em` opening and closing nodes
node.unlink();
previousNode.insertAfter(event.node);
shouldUnlinkEmphasisNode = true;
} else {
logger.error(
"Markdown links escaping found too many links for following text: ",
text,
);
logger.error(
"Markdown links escaping found too many links for modified text: ",
newText,
);
}
}
}
} else {
if (shouldUnlinkEmphasisNode) {
node.unlink();
shouldUnlinkEmphasisNode = false;
}
}
}
}
previousNode = node;
}
return parsed;
}
isPlainText(): boolean {
@ -120,9 +244,7 @@ export default class Markdown {
// you can nest them.
//
// Let's try sending with <p/>s anyway for now, though.
const realParagraph = renderer.paragraph;
renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
// If there is only one top level node, just return the
// bare text: it's a single line of text and so should be

143
test/Markdown-test.ts Normal file
View File

@ -0,0 +1,143 @@
/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import * as linkifyjs from 'linkifyjs';
import Markdown from "../src/Markdown";
import matrixLinkify from '../src/linkify-matrix';
beforeAll(() => {
// We need to call linkifier plugins before running those tests
matrixLinkify(linkifyjs);
});
describe("Markdown parser test", () => {
describe("fixing HTML links", () => {
const testString = [
"Test1:",
"#_foonetic_xkcd:matrix.org",
"http://google.com/_thing_",
"https://matrix.org/_matrix/client/foo/123_",
"#_foonetic_xkcd:matrix.org",
"",
"Test1A:",
"#_foonetic_xkcd:matrix.org",
"http://google.com/_thing_",
"https://matrix.org/_matrix/client/foo/123_",
"#_foonetic_xkcd:matrix.org",
"",
"Test2:",
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
"",
"Test3:",
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
].join("\n");
it('tests that links are getting properly HTML formatted', () => {
/* eslint-disable max-len */
const expectedResult = [
"<p>Test1:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
"<p>Test1A:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
"<p>Test2:<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</p>",
"<p>Test3:<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</p>",
"",
].join("\n");
/* eslint-enable max-len */
const md = new Markdown(testString);
expect(md.toHTML()).toEqual(expectedResult);
});
it('tests that links with autolinks are not touched at all and are still properly formatted', () => {
const test = [
"Test1:",
"<#_foonetic_xkcd:matrix.org>",
"<http://google.com/_thing_>",
"<https://matrix.org/_matrix/client/foo/123_>",
"<#_foonetic_xkcd:matrix.org>",
"",
"Test1A:",
"<#_foonetic_xkcd:matrix.org>",
"<http://google.com/_thing_>",
"<https://matrix.org/_matrix/client/foo/123_>",
"<#_foonetic_xkcd:matrix.org>",
"",
"Test2:",
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
"",
"Test3:",
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
].join("\n");
/* eslint-disable max-len */
/**
* NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org
* but it seems to be actually working properly
*/
const expectedResult = [
"<p>Test1:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
"<p>Test1A:<br />&lt;#_foonetic_xkcd:matrix.org&gt;<br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br />&lt;#_foonetic_xkcd:matrix.org&gt;</p>",
"<p>Test2:<br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a><br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a></p>",
"<p>Test3:<br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a><br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a></p>",
"",
].join("\n");
/* eslint-enable max-len */
const md = new Markdown(test);
expect(md.toHTML()).toEqual(expectedResult);
});
it('expects that links in codeblock are not modified', () => {
const expectedResult = [
'<pre><code class="language-Test1:">#_foonetic_xkcd:matrix.org',
'http://google.com/_thing_',
'https://matrix.org/_matrix/client/foo/123_',
'#_foonetic_xkcd:matrix.org',
'',
'Test1A:',
'#_foonetic_xkcd:matrix.org',
'http://google.com/_thing_',
'https://matrix.org/_matrix/client/foo/123_',
'#_foonetic_xkcd:matrix.org',
'',
'Test2:',
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
'',
'Test3:',
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org',
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```',
'</code></pre>',
'',
].join('\n');
const md = new Markdown("```" + testString + "```");
expect(md.toHTML()).toEqual(expectedResult);
});
it('expects that links in one line will be "escaped" properly', () => {
/* eslint-disable max-len */
const testString = [
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
].join('\n');
const expectedResult = [
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
].join('<br />');
/* eslint-enable max-len */
const md = new Markdown(testString);
expect(md.toHTML()).toEqual(expectedResult);
});
});
});

View File

@ -197,7 +197,6 @@ describe('editor/deserialize', function() {
it('code block with no trailing text', function() {
const html = "<pre><code>0xDEADBEEF\n</code></pre>\n";
const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
console.log(parts);
expect(parts.length).toBe(5);
expect(parts[0]).toStrictEqual({ type: "plain", text: "```" });
expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });