Fix links being parsed as markdown links improperly (#7200)
* Fix links being parsed as markdown links improperly Fixes #4674 * Fix a typo * Fix overriding too much stuff * Fix parsing * Remove useless console.log * Remove unnecessary emph function * Properly fix tests * Add some better docs * Add missing license headerpull/21833/head
							parent
							
								
									8fe582b094
								
							
						
					
					
						commit
						e3187ed15c
					
				
							
								
								
									
										128
									
								
								src/Markdown.ts
								
								
								
								
							
							
						
						
									
										128
									
								
								src/Markdown.ts
								
								
								
								
							|  | @ -17,6 +17,8 @@ limitations under the License. | |||
| 
 | ||||
| import * as commonmark from 'commonmark'; | ||||
| import { escape } from "lodash"; | ||||
| import { logger } from 'matrix-js-sdk/src/logger'; | ||||
| import * as linkify from 'linkifyjs'; | ||||
| 
 | ||||
| const ALLOWED_HTML_TAGS = ['sub', 'sup', 'del', 'u']; | ||||
| 
 | ||||
|  | @ -29,6 +31,9 @@ interface CommonmarkHtmlRendererInternal extends commonmark.HtmlRenderer { | |||
|     link: (node: commonmark.Node, entering: boolean) => void; | ||||
|     html_inline: (node: commonmark.Node) => void; // eslint-disable-line camelcase
 | ||||
|     html_block: (node: commonmark.Node) => void; // eslint-disable-line camelcase
 | ||||
|     text: (node: commonmark.Node) => void; | ||||
|     out: (text: string) => void; | ||||
|     emph: (node: commonmark.Node) => void; | ||||
| } | ||||
| 
 | ||||
| function isAllowedHtmlTag(node: commonmark.Node): boolean { | ||||
|  | @ -61,6 +66,33 @@ function isMultiLine(node: commonmark.Node): boolean { | |||
|     return par.firstChild != par.lastChild; | ||||
| } | ||||
| 
 | ||||
| function getTextUntilEndOrLinebreak(node: commonmark.Node) { | ||||
|     let currentNode = node; | ||||
|     let text = ''; | ||||
|     while (currentNode !== null && currentNode.type !== 'softbreak' && currentNode.type !== 'linebreak') { | ||||
|         const { literal, type } = currentNode; | ||||
|         if (type === 'text' && literal) { | ||||
|             let n = 0; | ||||
|             let char = literal[n]; | ||||
|             while (char !== ' ' && char !== null && n <= literal.length) { | ||||
|                 if (char === ' ') { | ||||
|                     break; | ||||
|                 } | ||||
|                 if (char) { | ||||
|                     text += char; | ||||
|                 } | ||||
|                 n += 1; | ||||
|                 char = literal[n]; | ||||
|             } | ||||
|             if (char === ' ') { | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|         currentNode = currentNode.next; | ||||
|     } | ||||
|     return text; | ||||
| } | ||||
| 
 | ||||
| /** | ||||
|  * Class that wraps commonmark, adding the ability to see whether | ||||
|  * a given message actually uses any markdown syntax or whether | ||||
|  | @ -70,11 +102,103 @@ export default class Markdown { | |||
|     private input: string; | ||||
|     private parsed: commonmark.Node; | ||||
| 
 | ||||
|     constructor(input) { | ||||
|     constructor(input: string) { | ||||
|         this.input = input; | ||||
| 
 | ||||
|         const parser = new commonmark.Parser(); | ||||
|         this.parsed = parser.parse(this.input); | ||||
|         this.parsed = this.repairLinks(this.parsed); | ||||
|     } | ||||
| 
 | ||||
|     /** | ||||
|      * This method is modifying the parsed AST in such a way that links are always | ||||
|      * properly linkified instead of sometimes being wrongly emphasised in case | ||||
|      * if you were to write a link like the example below: | ||||
|      * https://my_weird-link_domain.domain.com
 | ||||
|      * ^ this link would be parsed to something like this: | ||||
|      * <a href="https://my">https://my</a><b>weird-link</b><a href="https://domain.domain.com">domain.domain.com</a>
 | ||||
|      * This method makes it so the link gets properly modified to a version where it is | ||||
|      * not emphasised until it actually ends. | ||||
|      * See: https://github.com/vector-im/element-web/issues/4674
 | ||||
|      * @param parsed | ||||
|      */ | ||||
|     private repairLinks(parsed: commonmark.Node) { | ||||
|         const walker = parsed.walker(); | ||||
|         let event: commonmark.NodeWalkingStep = null; | ||||
|         let text = ''; | ||||
|         let isInPara = false; | ||||
|         let previousNode: commonmark.Node | null = null; | ||||
|         let shouldUnlinkEmphasisNode = false; | ||||
|         while ((event = walker.next())) { | ||||
|             const { node } = event; | ||||
|             if (node.type === 'paragraph') { | ||||
|                 if (event.entering) { | ||||
|                     isInPara = true; | ||||
|                 } else { | ||||
|                     isInPara = false; | ||||
|                 } | ||||
|             } | ||||
|             if (isInPara) { | ||||
|                 // Clear saved string when line ends
 | ||||
|                 if ( | ||||
|                     node.type === 'softbreak' || | ||||
|                     node.type === 'linebreak' || | ||||
|                     // Also start calculating the text from the beginning on any spaces
 | ||||
|                     (node.type === 'text' && node.literal === ' ') | ||||
|                 ) { | ||||
|                     text = ''; | ||||
|                 } | ||||
|                 if (node.type === 'text') { | ||||
|                     text += node.literal; | ||||
|                 } | ||||
|                 // We should not do this if previous node was not a textnode, as we can't combine it then.
 | ||||
|                 if (node.type === 'emph' && previousNode.type === 'text') { | ||||
|                     if (event.entering) { | ||||
|                         const foundLinks = linkify.find(text); | ||||
|                         for (const { value } of foundLinks) { | ||||
|                             if (node.firstChild.literal) { | ||||
|                                 /** | ||||
|                                  * NOTE: This technically should unlink the emph node and create LINK nodes instead, adding all the next elements as siblings | ||||
|                                  * but this solution seems to work well and is hopefully slightly easier to understand too | ||||
|                                  */ | ||||
|                                 const nonEmphasizedText = `_${node.firstChild.literal}_`; | ||||
|                                 const f = getTextUntilEndOrLinebreak(node); | ||||
|                                 const newText = value + nonEmphasizedText + f; | ||||
|                                 const newLinks = linkify.find(newText); | ||||
|                                 // Should always find only one link here, if it finds more it means that the algorithm is broken
 | ||||
|                                 if (newLinks.length === 1) { | ||||
|                                     const emphasisTextNode = new commonmark.Node('text'); | ||||
|                                     emphasisTextNode.literal = nonEmphasizedText; | ||||
|                                     previousNode.insertAfter(emphasisTextNode); | ||||
|                                     node.firstChild.literal = ''; | ||||
|                                     event = node.walker().next(); | ||||
|                                     // Remove `em` opening and closing nodes
 | ||||
|                                     node.unlink(); | ||||
|                                     previousNode.insertAfter(event.node); | ||||
|                                     shouldUnlinkEmphasisNode = true; | ||||
|                                 } else { | ||||
|                                     logger.error( | ||||
|                                         "Markdown links escaping found too many links for following text: ", | ||||
|                                         text, | ||||
|                                     ); | ||||
|                                     logger.error( | ||||
|                                         "Markdown links escaping found too many links for modified text: ", | ||||
|                                         newText, | ||||
|                                     ); | ||||
|                                 } | ||||
|                             } | ||||
|                         } | ||||
|                     } else { | ||||
|                         if (shouldUnlinkEmphasisNode) { | ||||
|                             node.unlink(); | ||||
|                             shouldUnlinkEmphasisNode = false; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             previousNode = node; | ||||
|         } | ||||
|         return parsed; | ||||
|     } | ||||
| 
 | ||||
|     isPlainText(): boolean { | ||||
|  | @ -120,9 +244,7 @@ export default class Markdown { | |||
|         // you can nest them.
 | ||||
|         //
 | ||||
|         // Let's try sending with <p/>s anyway for now, though.
 | ||||
| 
 | ||||
|         const realParagraph = renderer.paragraph; | ||||
| 
 | ||||
|         renderer.paragraph = function(node: commonmark.Node, entering: boolean) { | ||||
|             // If there is only one top level node, just return the
 | ||||
|             // bare text: it's a single line of text and so should be
 | ||||
|  |  | |||
|  | @ -0,0 +1,143 @@ | |||
| /* | ||||
| Copyright 2021 The Matrix.org Foundation C.I.C. | ||||
| 
 | ||||
| Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| you may not use this file except in compliance with the License. | ||||
| You may obtain a copy of the License at | ||||
| 
 | ||||
|     http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| 
 | ||||
| Unless required by applicable law or agreed to in writing, software | ||||
| distributed under the License is distributed on an "AS IS" BASIS, | ||||
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| See the License for the specific language governing permissions and | ||||
| limitations under the License. | ||||
| */ | ||||
| import * as linkifyjs from 'linkifyjs'; | ||||
| import Markdown from "../src/Markdown"; | ||||
| import matrixLinkify from '../src/linkify-matrix'; | ||||
| 
 | ||||
| beforeAll(() => { | ||||
|     // We need to call linkifier plugins before running those tests
 | ||||
|     matrixLinkify(linkifyjs); | ||||
| }); | ||||
| 
 | ||||
| describe("Markdown parser test", () => { | ||||
|     describe("fixing HTML links", () => { | ||||
|         const testString = [ | ||||
|             "Test1:", | ||||
|             "#_foonetic_xkcd:matrix.org", | ||||
|             "http://google.com/_thing_", | ||||
|             "https://matrix.org/_matrix/client/foo/123_", | ||||
|             "#_foonetic_xkcd:matrix.org", | ||||
|             "", | ||||
|             "Test1A:", | ||||
|             "#_foonetic_xkcd:matrix.org", | ||||
|             "http://google.com/_thing_", | ||||
|             "https://matrix.org/_matrix/client/foo/123_", | ||||
|             "#_foonetic_xkcd:matrix.org", | ||||
|             "", | ||||
|             "Test2:", | ||||
|             "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg", | ||||
|             "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg", | ||||
|             "", | ||||
|             "Test3:", | ||||
|             "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org", | ||||
|             "https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org", | ||||
|         ].join("\n"); | ||||
| 
 | ||||
|         it('tests that links are getting properly HTML formatted', () => { | ||||
|             /* eslint-disable max-len */ | ||||
|             const expectedResult = [ | ||||
|                 "<p>Test1:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>", | ||||
|                 "<p>Test1A:<br />#_foonetic_xkcd:matrix.org<br />http://google.com/_thing_<br />https://matrix.org/_matrix/client/foo/123_<br />#_foonetic_xkcd:matrix.org</p>", | ||||
|                 "<p>Test2:<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg<br />http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</p>", | ||||
|                 "<p>Test3:<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org<br />https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</p>", | ||||
|                 "", | ||||
|             ].join("\n"); | ||||
|             /* eslint-enable max-len */ | ||||
|             const md = new Markdown(testString); | ||||
|             expect(md.toHTML()).toEqual(expectedResult); | ||||
|         }); | ||||
|         it('tests that links with autolinks are not touched at all and are still properly formatted', () => { | ||||
|             const test = [ | ||||
|                 "Test1:", | ||||
|                 "<#_foonetic_xkcd:matrix.org>", | ||||
|                 "<http://google.com/_thing_>", | ||||
|                 "<https://matrix.org/_matrix/client/foo/123_>", | ||||
|                 "<#_foonetic_xkcd:matrix.org>", | ||||
|                 "", | ||||
|                 "Test1A:", | ||||
|                 "<#_foonetic_xkcd:matrix.org>", | ||||
|                 "<http://google.com/_thing_>", | ||||
|                 "<https://matrix.org/_matrix/client/foo/123_>", | ||||
|                 "<#_foonetic_xkcd:matrix.org>", | ||||
|                 "", | ||||
|                 "Test2:", | ||||
|                 "<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>", | ||||
|                 "<http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg>", | ||||
|                 "", | ||||
|                 "Test3:", | ||||
|                 "<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>", | ||||
|                 "<https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org>", | ||||
|             ].join("\n"); | ||||
|             /* eslint-disable max-len */ | ||||
|             /** | ||||
|              * NOTE: I'm not entirely sure if those "<"" and ">" should be visible in here for #_foonetic_xkcd:matrix.org | ||||
|              * but it seems to be actually working properly | ||||
|              */ | ||||
|             const expectedResult = [ | ||||
|                 "<p>Test1:<br /><#_foonetic_xkcd:matrix.org><br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br /><#_foonetic_xkcd:matrix.org></p>", | ||||
|                 "<p>Test1A:<br /><#_foonetic_xkcd:matrix.org><br /><a href=\"http://google.com/_thing_\">http://google.com/_thing_</a><br /><a href=\"https://matrix.org/_matrix/client/foo/123_\">https://matrix.org/_matrix/client/foo/123_</a><br /><#_foonetic_xkcd:matrix.org></p>", | ||||
|                 "<p>Test2:<br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a><br /><a href=\"http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg\">http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg</a></p>", | ||||
|                 "<p>Test3:<br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a><br /><a href=\"https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org\">https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org</a></p>", | ||||
|                 "", | ||||
|             ].join("\n"); | ||||
|             /* eslint-enable max-len */ | ||||
|             const md = new Markdown(test); | ||||
|             expect(md.toHTML()).toEqual(expectedResult); | ||||
|         }); | ||||
| 
 | ||||
|         it('expects that links in codeblock are not modified', () => { | ||||
|             const expectedResult = [ | ||||
|                 '<pre><code class="language-Test1:">#_foonetic_xkcd:matrix.org', | ||||
|                 'http://google.com/_thing_', | ||||
|                 'https://matrix.org/_matrix/client/foo/123_', | ||||
|                 '#_foonetic_xkcd:matrix.org', | ||||
|                 '', | ||||
|                 'Test1A:', | ||||
|                 '#_foonetic_xkcd:matrix.org', | ||||
|                 'http://google.com/_thing_', | ||||
|                 'https://matrix.org/_matrix/client/foo/123_', | ||||
|                 '#_foonetic_xkcd:matrix.org', | ||||
|                 '', | ||||
|                 'Test2:', | ||||
|                 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg', | ||||
|                 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg', | ||||
|                 '', | ||||
|                 'Test3:', | ||||
|                 'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org', | ||||
|                 'https://riot.im/app/#/room/#_foonetic_xkcd:matrix.org```', | ||||
|                 '</code></pre>', | ||||
|                 '', | ||||
|             ].join('\n'); | ||||
|             const md = new Markdown("```" + testString + "```"); | ||||
|             expect(md.toHTML()).toEqual(expectedResult); | ||||
|         }); | ||||
| 
 | ||||
|         it('expects that links in one line will be "escaped" properly', () => { | ||||
|             /* eslint-disable max-len */ | ||||
|             const testString = [ | ||||
|                 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg', | ||||
|                 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg' + " " + 'http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg', | ||||
|             ].join('\n'); | ||||
|             const expectedResult = [ | ||||
|                 "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg", | ||||
|                 "http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg http://domain.xyz/foo/bar-_stuff-like-this_-in-it.jpg", | ||||
|             ].join('<br />'); | ||||
|             /* eslint-enable max-len */ | ||||
|             const md = new Markdown(testString); | ||||
|             expect(md.toHTML()).toEqual(expectedResult); | ||||
|         }); | ||||
|     }); | ||||
| }); | ||||
|  | @ -197,7 +197,6 @@ describe('editor/deserialize', function() { | |||
|         it('code block with no trailing text', function() { | ||||
|             const html = "<pre><code>0xDEADBEEF\n</code></pre>\n"; | ||||
|             const parts = normalize(parseEvent(htmlMessage(html), createPartCreator())); | ||||
|             console.log(parts); | ||||
|             expect(parts.length).toBe(5); | ||||
|             expect(parts[0]).toStrictEqual({ type: "plain", text: "```" }); | ||||
|             expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" }); | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Dariusz Niemczyk
						Dariusz Niemczyk