Fix links being parsed as markdown links improperly (#7200)
* Fix links being parsed as markdown links improperly Fixes #4674 * Fix a typo * Fix overriding too much stuff * Fix parsing * Remove useless console.log * Remove unnecessary emph function * Properly fix tests * Add some better docs * Add missing license headerpull/21833/head
parent
8fe582b094
commit
e3187ed15c
128
src/Markdown.ts
128
src/Markdown.ts
|
@ -17,6 +17,8 @@ limitations under the License.
|
||||||
|
|
||||||
import * as commonmark from 'commonmark';
|
import * as commonmark from 'commonmark';
|
||||||
import { escape } from "lodash";
|
import { escape } from "lodash";
|
||||||
|
import { logger } from 'matrix-js-sdk/src/logger';
|
||||||
|
import * as linkify from 'linkifyjs';
|
||||||
|
|
||||||
const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];
|
const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u'];
|
||||||
|
|
||||||
|
@ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer {
|
||||||
link: (node: commonmark.Node, entering: boolean) => void;
|
link: (node: commonmark.Node, entering: boolean) => void;
|
||||||
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
|
html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
|
||||||
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
|
html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
|
||||||
|
text: (node: commonmark.Node) => void;
|
||||||
|
out: (text: string) => void;
|
||||||
|
emph: (node: commonmark.Node) => void;
|
||||||
}
|
}
|
||||||
|
|
||||||
function isAllowedHtmlTag(node: commonmark.Node): boolean {
|
function isAllowedHtmlTag(node: commonmark.Node): boolean {
|
||||||
|
@ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean {
|
||||||
return par.firstChild != par.lastChild;
|
return par.firstChild != par.lastChild;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function getTextUntilEndOrLinebreak(node: commonmark.Node) {
|
||||||
|
let currentNode = node;
|
||||||
|
let text = '';
|
||||||
|
while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') {
|
||||||
|
const { literal, type } = currentNode;
|
||||||
|
if (type === 'text' && literal) {
|
||||||
|
let n = 0;
|
||||||
|
let char = literal[n];
|
||||||
|
while (char !== ' ' && char !== null && n <= literal.length) {
|
||||||
|
if (char === ' ') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (char) {
|
||||||
|
text += char;
|
||||||
|
}
|
||||||
|
n += 1;
|
||||||
|
char = literal[n];
|
||||||
|
}
|
||||||
|
if (char === ' ') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
currentNode = currentNode.next;
|
||||||
|
}
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class that wraps commonmark, adding the ability to see whether
|
* Class that wraps commonmark, adding the ability to see whether
|
||||||
* a given message actually uses any markdown syntax or whether
|
* a given message actually uses any markdown syntax or whether
|
||||||
|
@ -70,11 +102,103 @@ export default class Markdown {
|
||||||
private input: string;
|
private input: string;
|
||||||
private parsed: commonmark.Node;
|
private parsed: commonmark.Node;
|
||||||
|
|
||||||
constructor(input) {
|
constructor(input: string) {
|
||||||
this.input = input;
|
this.input = input;
|
||||||
|
|
||||||
const parser = new commonmark.Parser();
|
const parser = new commonmark.Parser();
|
||||||
this.parsed = parser.parse(this.input);
|
this.parsed = parser.parse(this.input);
|
||||||
|
this.parsed = this.repairLinks(this.parsed);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method is modifying the parsed AST in such a way that links are always
|
||||||
|
* properly linkified instead of sometimes being wrongly emphasised in case
|
||||||
|
* if you were to write a link like the example below:
|
||||||
|
* https://my_weird-link_domain.domain.com
|
||||||
|
* ^ this link would be parsed to something like this:
|
||||||
|
* <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
|
||||||
|
* This method makes it so the link gets properly modified to a version where it is
|
||||||
|
* not emphasised until it actually ends.
|
||||||
|
* See: https://github.com/vector-im/element-web/issues/4674
|
||||||
|
* @param parsed
|
||||||
|
*/
|
||||||
|
private repairLinks(parsed: commonmark.Node) {
|
||||||
|
const walker = parsed.walker();
|
||||||
|
let event: commonmark.NodeWalkingStep = null;
|
||||||
|
let text = '';
|
||||||
|
let isInPara = false;
|
||||||
|
let previousNode: commonmark.Node | null = null;
|
||||||
|
let shouldUnlinkEmphasisNode = false;
|
||||||
|
while ((event = walker.next())) {
|
||||||
|
const { node } = event;
|
||||||
|
if (node.type === 'paragraph') {
|
||||||
|
if (event.entering) {
|
||||||
|
isInPara = true;
|
||||||
|
} else {
|
||||||
|
isInPara = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (isInPara) {
|
||||||
|
// Clear saved string when line ends
|
||||||
|
if (
|
||||||
|
node.type === 'softbreak' ||
|
||||||
|
node.type === 'linebreak' ||
|
||||||
|
// Also start calculating the text from the beginning on any spaces
|
||||||
|
(node.type === 'text' && node.literal === ' ')
|
||||||
|
) {
|
||||||
|
text = '';
|
||||||
|
}
|
||||||
|
if (node.type === 'text') {
|
||||||
|
text += node.literal;
|
||||||
|
}
|
||||||
|
// We should not do this if previous node was not a textnode, as we can't combine it then.
|
||||||
|
if (node.type === 'emph' && previousNode.type === 'text') {
|
||||||
|
if (event.entering) {
|
||||||
|
const foundLinks = linkify.find(text);
|
||||||
|
for (const { value } of foundLinks) {
|
||||||
|
if (node.firstChild.literal) {
|
||||||
|
/**
|
||||||
|
* NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings
|
||||||
|
* but this solution seems to work well and is hopefully slightly easier to understand too
|
||||||
|
*/
|
||||||
|
const nonEmphasizedText = `_${node.firstChild.literal}_`;
|
||||||
|
const f = getTextUntilEndOrLinebreak(node);
|
||||||
|
const newText = value + nonEmphasizedText + f;
|
||||||
|
const newLinks = linkify.find(newText);
|
||||||
|
// Should always find only one link here, if it finds more it means that the algorithm is broken
|
||||||
|
if (newLinks.length === 1) {
|
||||||
|
const emphasisTextNode = new commonmark.Node('text');
|
||||||
|
emphasisTextNode.literal = nonEmphasizedText;
|
||||||
|
previousNode.insertAfter(emphasisTextNode);
|
||||||
|
node.firstChild.literal = '';
|
||||||
|
event = node.walker().next();
|
||||||
|
// Remove `em` opening and closing nodes
|
||||||
|
node.unlink();
|
||||||
|
previousNode.insertAfter(event.node);
|
||||||
|
shouldUnlinkEmphasisNode = true;
|
||||||
|
} else {
|
||||||
|
logger.error(
|
||||||
|
"Markdown links escaping found too many links for following text: ",
|
||||||
|
text,
|
||||||
|
);
|
||||||
|
logger.error(
|
||||||
|
"Markdown links escaping found too many links for modified text: ",
|
||||||
|
newText,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (shouldUnlinkEmphasisNode) {
|
||||||
|
node.unlink();
|
||||||
|
shouldUnlinkEmphasisNode = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
previousNode = node;
|
||||||
|
}
|
||||||
|
return parsed;
|
||||||
}
|
}
|
||||||
|
|
||||||
isPlainText(): boolean {
|
isPlainText(): boolean {
|
||||||
|
@ -120,9 +244,7 @@ export default class Markdown {
|
||||||
// you can nest them.
|
// you can nest them.
|
||||||
//
|
//
|
||||||
// Let's try sending with <p/>s anyway for now, though.
|
// Let's try sending with <p/>s anyway for now, though.
|
||||||
|
|
||||||
const realParagraph = renderer.paragraph;
|
const realParagraph = renderer.paragraph;
|
||||||
|
|
||||||
renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
|
renderer.paragraph = function(node: commonmark.Node, entering: boolean) {
|
||||||
// If there is only one top level node, just return the
|
// If there is only one top level node, just return the
|
||||||
// bare text: it's a single line of text and so should be
|
// bare text: it's a single line of text and so should be
|
||||||
|
|
|
@ -0,0 +1,143 @@
|
||||||
|
/*
|
||||||
|
Copyright 2021 The Matrix.org Foundation C.I.C.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
import * as linkifyjs from 'linkifyjs';
|
||||||
|
import Markdown from "../src/Markdown";
|
||||||
|
import matrixLinkify from '../src/linkify-matrix';
|
||||||
|
|
||||||
|
beforeAll(() => {
|
||||||
|
// We need to call linkifier plugins before running those tests
|
||||||
|
matrixLinkify(linkifyjs);
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("Markdown parser test", () => {
|
||||||
|
describe("fixing HTML links", () => {
|
||||||
|
const testString = [
|
||||||
|
"Test1:",
|
||||||
|
"#_foonetic_xkcd:matrix.org",
|
||||||
|
"http://google.com/_thing_",
|
||||||
|
"https://matrix.org/_matrix/client/foo/123_",
|
||||||
|
"#_foonetic_xkcd:matrix.org",
|
||||||
|
"",
|
||||||
|
"Test1A:",
|
||||||
|
"#_foonetic_xkcd:matrix.org",
|
||||||
|
"http://google.com/_thing_",
|
||||||
|
"https://matrix.org/_matrix/client/foo/123_",
|
||||||
|
"#_foonetic_xkcd:matrix.org",
|
||||||
|
"",
|
||||||
|
"Test2:",
|
||||||
|
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
|
||||||
|
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
|
||||||
|
"",
|
||||||
|
"Test3:",
|
||||||
|
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
|
||||||
|
"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org",
|
||||||
|
].join("\n");
|
||||||
|
|
||||||
|
it('tests that links are getting properly HTML formatted', () => {
|
||||||
|
/* eslint-disable max-len */
|
||||||
|
const expectedResult = [
|
||||||
|
"<p>Test1:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
|
||||||
|
"<p>Test1A:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>",
|
||||||
|
"<p>Test2:<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</p>",
|
||||||
|
"<p>Test3:<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</p>",
|
||||||
|
"",
|
||||||
|
].join("\n");
|
||||||
|
/* eslint-enable max-len */
|
||||||
|
const md = new Markdown(testString);
|
||||||
|
expect(md.toHTML()).toEqual(expectedResult);
|
||||||
|
});
|
||||||
|
it('tests that links with autolinks are not touched at all and are still properly formatted', () => {
|
||||||
|
const test = [
|
||||||
|
"Test1:",
|
||||||
|
"<#_foonetic_xkcd:matrix.org>",
|
||||||
|
"<http://google.com/_thing_>",
|
||||||
|
"<https://matrix.org/_matrix/client/foo/123_>",
|
||||||
|
"<#_foonetic_xkcd:matrix.org>",
|
||||||
|
"",
|
||||||
|
"Test1A:",
|
||||||
|
"<#_foonetic_xkcd:matrix.org>",
|
||||||
|
"<http://google.com/_thing_>",
|
||||||
|
"<https://matrix.org/_matrix/client/foo/123_>",
|
||||||
|
"<#_foonetic_xkcd:matrix.org>",
|
||||||
|
"",
|
||||||
|
"Test2:",
|
||||||
|
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
|
||||||
|
"<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>",
|
||||||
|
"",
|
||||||
|
"Test3:",
|
||||||
|
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
|
||||||
|
"<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>",
|
||||||
|
].join("\n");
|
||||||
|
/* eslint-disable max-len */
|
||||||
|
/**
|
||||||
|
* NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org
|
||||||
|
* but it seems to be actually working properly
|
||||||
|
*/
|
||||||
|
const expectedResult = [
|
||||||
|
"<p>Test1:<br /><#_foonetic_xkcd:matrix.org><br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br /><#_foonetic_xkcd:matrix.org></p>",
|
||||||
|
"<p>Test1A:<br /><#_foonetic_xkcd:matrix.org><br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br /><#_foonetic_xkcd:matrix.org></p>",
|
||||||
|
"<p>Test2:<br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a><br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a></p>",
|
||||||
|
"<p>Test3:<br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a><br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a></p>",
|
||||||
|
"",
|
||||||
|
].join("\n");
|
||||||
|
/* eslint-enable max-len */
|
||||||
|
const md = new Markdown(test);
|
||||||
|
expect(md.toHTML()).toEqual(expectedResult);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('expects that links in codeblock are not modified', () => {
|
||||||
|
const expectedResult = [
|
||||||
|
'<pre><code class="language-Test1:">#_foonetic_xkcd:matrix.org',
|
||||||
|
'http://google.com/_thing_',
|
||||||
|
'https://matrix.org/_matrix/client/foo/123_',
|
||||||
|
'#_foonetic_xkcd:matrix.org',
|
||||||
|
'',
|
||||||
|
'Test1A:',
|
||||||
|
'#_foonetic_xkcd:matrix.org',
|
||||||
|
'http://google.com/_thing_',
|
||||||
|
'https://matrix.org/_matrix/client/foo/123_',
|
||||||
|
'#_foonetic_xkcd:matrix.org',
|
||||||
|
'',
|
||||||
|
'Test2:',
|
||||||
|
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
|
||||||
|
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
|
||||||
|
'',
|
||||||
|
'Test3:',
|
||||||
|
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org',
|
||||||
|
'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```',
|
||||||
|
'</code></pre>',
|
||||||
|
'',
|
||||||
|
].join('\n');
|
||||||
|
const md = new Markdown("```" + testString + "```");
|
||||||
|
expect(md.toHTML()).toEqual(expectedResult);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('expects that links in one line will be "escaped" properly', () => {
|
||||||
|
/* eslint-disable max-len */
|
||||||
|
const testString = [
|
||||||
|
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
|
||||||
|
'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg',
|
||||||
|
].join('\n');
|
||||||
|
const expectedResult = [
|
||||||
|
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
|
||||||
|
"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg",
|
||||||
|
].join('<br />');
|
||||||
|
/* eslint-enable max-len */
|
||||||
|
const md = new Markdown(testString);
|
||||||
|
expect(md.toHTML()).toEqual(expectedResult);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
|
@ -197,7 +197,6 @@ describe('editor/deserialize', function() {
|
||||||
it('code block with no trailing text', function() {
|
it('code block with no trailing text', function() {
|
||||||
const html = "<pre><code>0xDEADBEEF\n</code></pre>\n";
|
const html = "<pre><code>0xDEADBEEF\n</code></pre>\n";
|
||||||
const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
|
const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
|
||||||
console.log(parts);
|
|
||||||
expect(parts.length).toBe(5);
|
expect(parts.length).toBe(5);
|
||||||
expect(parts[0]).toStrictEqual({ type: "plain", text: "```" });
|
expect(parts[0]).toStrictEqual({ type: "plain", text: "```" });
|
||||||
expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });
|
expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });
|
||||||
|
|
Loading…
Reference in New Issue