Merge pull request from GHSA-22p3-qrh9-cx32
* Make _iterate_over_text easier to read by using simple data structures
* Prefer a set of tags to ignore. In my tests, it's 4x faster to check for containment in a set of this size
* Add a stack size limit to _iterate_over_text
* Continue accepting the case where there is no body element
* Use an early return instead for None
Co-authored-by: Richard van der Hoff <richard@matrix.org>
release-v1.61
parent
21e6c0ed64
commit
fa13080618
|
@ -12,10 +12,9 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
import codecs
|
import codecs
|
||||||
import itertools
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
|
from typing import TYPE_CHECKING, Dict, Generator, Iterable, List, Optional, Set, Union
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
@ -276,7 +275,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
TAGS_TO_REMOVE = (
|
TAGS_TO_REMOVE = {
|
||||||
"header",
|
"header",
|
||||||
"nav",
|
"nav",
|
||||||
"aside",
|
"aside",
|
||||||
|
@ -291,32 +290,43 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
|
||||||
"img",
|
"img",
|
||||||
"picture",
|
"picture",
|
||||||
etree.Comment,
|
etree.Comment,
|
||||||
)
|
}
|
||||||
|
|
||||||
# Split all the text nodes into paragraphs (by splitting on new
|
# Split all the text nodes into paragraphs (by splitting on new
|
||||||
# lines)
|
# lines)
|
||||||
text_nodes = (
|
text_nodes = (
|
||||||
re.sub(r"\s+", "\n", el).strip()
|
re.sub(r"\s+", "\n", el).strip()
|
||||||
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
|
for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
|
||||||
)
|
)
|
||||||
return summarize_paragraphs(text_nodes)
|
return summarize_paragraphs(text_nodes)
|
||||||
|
|
||||||
|
|
||||||
def _iterate_over_text(
|
def _iterate_over_text(
|
||||||
tree: "etree.Element", *tags_to_ignore: Union[str, "etree.Comment"]
|
tree: Optional["etree.Element"],
|
||||||
|
tags_to_ignore: Set[Union[str, "etree.Comment"]],
|
||||||
|
stack_limit: int = 1024,
|
||||||
) -> Generator[str, None, None]:
|
) -> Generator[str, None, None]:
|
||||||
"""Iterate over the tree returning text nodes in a depth first fashion,
|
"""Iterate over the tree returning text nodes in a depth first fashion,
|
||||||
skipping text nodes inside certain tags.
|
skipping text nodes inside certain tags.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tree: The parent element to iterate. Can be None if there isn't one.
|
||||||
|
tags_to_ignore: Set of tags to ignore
|
||||||
|
stack_limit: Maximum stack size limit for depth-first traversal.
|
||||||
|
Nodes will be dropped if this limit is hit, which may truncate the
|
||||||
|
textual result.
|
||||||
|
Intended to limit the maximum working memory when generating a preview.
|
||||||
"""
|
"""
|
||||||
# This is basically a stack that we extend using itertools.chain.
|
|
||||||
# This will either consist of an element to iterate over *or* a string
|
if tree is None:
|
||||||
# to be returned.
|
|
||||||
elements = iter([tree])
|
|
||||||
while True:
|
|
||||||
el = next(elements, None)
|
|
||||||
if el is None:
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# This is a stack whose items are elements to iterate over *or* strings
|
||||||
|
# to be returned.
|
||||||
|
elements: List[Union[str, "etree.Element"]] = [tree]
|
||||||
|
while elements:
|
||||||
|
el = elements.pop()
|
||||||
|
|
||||||
if isinstance(el, str):
|
if isinstance(el, str):
|
||||||
yield el
|
yield el
|
||||||
elif el.tag not in tags_to_ignore:
|
elif el.tag not in tags_to_ignore:
|
||||||
|
@ -329,17 +339,22 @@ def _iterate_over_text(
|
||||||
if el.text:
|
if el.text:
|
||||||
yield el.text
|
yield el.text
|
||||||
|
|
||||||
# We add to the stack all the elements children, interspersed with
|
# We add to the stack all the element's children, interspersed with
|
||||||
# each child's tail text (if it exists). The tail text of a node
|
# each child's tail text (if it exists).
|
||||||
# is text that comes *after* the node, so we always include it even
|
#
|
||||||
# if we ignore the child node.
|
# We iterate in reverse order so that earlier pieces of text appear
|
||||||
elements = itertools.chain(
|
# closer to the top of the stack.
|
||||||
itertools.chain.from_iterable( # Basically a flatmap
|
for child in el.iterchildren(reversed=True):
|
||||||
[child, child.tail] if child.tail else [child]
|
if len(elements) > stack_limit:
|
||||||
for child in el.iterchildren()
|
# We've hit our limit for working memory
|
||||||
),
|
break
|
||||||
elements,
|
|
||||||
)
|
if child.tail:
|
||||||
|
# The tail text of a node is text that comes *after* the node,
|
||||||
|
# so we always include it even if we ignore the child node.
|
||||||
|
elements.append(child.tail)
|
||||||
|
|
||||||
|
elements.append(child)
|
||||||
|
|
||||||
|
|
||||||
def summarize_paragraphs(
|
def summarize_paragraphs(
|
||||||
|
|
|
@ -370,6 +370,23 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
|
||||||
og = parse_html_to_open_graph(tree)
|
og = parse_html_to_open_graph(tree)
|
||||||
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
|
||||||
|
|
||||||
|
def test_nested_nodes(self) -> None:
|
||||||
|
"""A body with some nested nodes. Tests that we iterate over children
|
||||||
|
in the right order (and don't reverse the order of the text)."""
|
||||||
|
html = b"""
|
||||||
|
<a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
|
||||||
|
with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
|
||||||
|
"""
|
||||||
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
|
og = parse_html_to_open_graph(tree)
|
||||||
|
self.assertEqual(
|
||||||
|
og,
|
||||||
|
{
|
||||||
|
"og:title": None,
|
||||||
|
"og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class MediaEncodingTestCase(unittest.TestCase):
|
class MediaEncodingTestCase(unittest.TestCase):
|
||||||
def test_meta_charset(self) -> None:
|
def test_meta_charset(self) -> None:
|
||||||
|
|
Loading…
Reference in New Issue