Add stubs package for lxml. (#15697)

The stubs have some issues so this has some generous cast and ignores in it, but it is better than not having stubs. Note that confusing that Element is a function which creates _Element instances (and similarly for Comment).
2023-05-31 13:06:57 -04:00 · 2023-05-31 13:06:57 -04:00 · 6f18812bb0
parent 874378c052
commit 6f18812bb0
10 changed files with 117 additions and 48 deletions
--- a/changelog.d/15697.misc
+++ b/changelog.d/15697.misc
@ -0,0 +1 @@
+Improve type hints.
--- a/mypy.ini
+++ b/mypy.ini
@ -60,9 +60,6 @@ ignore_missing_imports = True
 [mypy-ijson.*]
 ignore_missing_imports = True

-[mypy-lxml]
-ignore_missing_imports = True
-
 # https://github.com/msgpack/msgpack-python/issues/448
 [mypy-msgpack]
 ignore_missing_imports = True
--- a/poetry.lock
+++ b/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.

 [[package]]
 name = "alabaster"
@ -1215,6 +1215,21 @@ html5 = ["html5lib"]
 htmlsoup = ["BeautifulSoup4"]
 source = ["Cython (>=0.29.7)"]

+[[package]]
+name = "lxml-stubs"
+version = "0.4.0"
+description = "Type annotations for the lxml package"
+category = "dev"
+optional = false
+python-versions = "*"
+files = [
+    {file = "lxml-stubs-0.4.0.tar.gz", hash = "sha256:184877b42127256abc2b932ba8bd0ab5ea80bd0b0fee618d16daa40e0b71abee"},
+    {file = "lxml_stubs-0.4.0-py3-none-any.whl", hash = "sha256:3b381e9e82397c64ea3cc4d6f79d1255d015f7b114806d4826218805c10ec003"},
+]
+
+[package.extras]
+test = ["coverage[toml] (==5.2)", "pytest (>=6.0.0)", "pytest-mypy-plugins (==1.9.3)"]
+
 [[package]]
 name = "markdown-it-py"
 version = "2.2.0"
@ -3409,22 +3424,22 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"]
 test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"]

 [extras]
-all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler", "pyicu"]
+all = ["Pympler", "authlib", "hiredis", "jaeger-client", "lxml", "matrix-synapse-ldap3", "opentracing", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pyicu", "pysaml2", "sentry-sdk", "txredisapi"]
 cache-memory = ["Pympler"]
 jwt = ["authlib"]
 matrix-synapse-ldap3 = ["matrix-synapse-ldap3"]
 oidc = ["authlib"]
 opentracing = ["jaeger-client", "opentracing"]
 postgres = ["psycopg2", "psycopg2cffi", "psycopg2cffi-compat"]
-redis = ["txredisapi", "hiredis"]
+redis = ["hiredis", "txredisapi"]
 saml2 = ["pysaml2"]
 sentry = ["sentry-sdk"]
 systemd = ["systemd-python"]
-test = ["parameterized", "idna"]
+test = ["idna", "parameterized"]
 url-preview = ["lxml"]
 user-search = ["pyicu"]

 [metadata]
 lock-version = "2.0"
 python-versions = "^3.7.1"
-content-hash = "ef3a16dd66177f7141239e1a2d3e07cc14c08f1e4e0c5127184d022bc062da52"
+content-hash = "7ad11e62a675e09444cf33ca2de3216fc4efc5874a2575e54d95d577a52439d3"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -314,6 +314,7 @@ black = ">=22.3.0"
 ruff = "0.0.265"

 # Typechecking
+lxml-stubs = ">=0.4.0"
 mypy = "*"
 mypy-zope = "*"
 types-bleach = ">=4.1.0"
--- a/synapse/media/oembed.py
+++ b/synapse/media/oembed.py
@ -14,7 +14,7 @@
 import html
 import logging
 import urllib.parse
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, List, Optional, cast

 import attr

@ -98,7 +98,7 @@ class OEmbedProvider:
        # No match.
        return None

-    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+    def autodiscover_from_html(self, tree: "etree._Element") -> Optional[str]:
        """
        Search an HTML document for oEmbed autodiscovery information.

@ -109,18 +109,22 @@ class OEmbedProvider:
            The URL to use for oEmbed information, or None if no URL was found.
        """
        # Search for link elements with the proper rel and type attributes.
-        for tag in tree.xpath(
-            "//link[@rel='alternate'][@type='application/json+oembed']"
+        # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+        for tag in cast(
+            List["etree._Element"],
+            tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']"),
        ):
            if "href" in tag.attrib:
-                return tag.attrib["href"]
+                return cast(str, tag.attrib["href"])

        # Some providers (e.g. Flickr) use alternative instead of alternate.
-        for tag in tree.xpath(
-            "//link[@rel='alternative'][@type='application/json+oembed']"
+        # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+        for tag in cast(
+            List["etree._Element"],
+            tree.xpath("//link[@rel='alternative'][@type='application/json+oembed']"),
        ):
            if "href" in tag.attrib:
-                return tag.attrib["href"]
+                return cast(str, tag.attrib["href"])

        return None

@ -212,11 +216,12 @@ class OEmbedProvider:
        return OEmbedResult(open_graph_response, author_name, cache_age)


-def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
+def _fetch_urls(tree: "etree._Element", tag_name: str) -> List[str]:
    results = []
-    for tag in tree.xpath("//*/" + tag_name):
+    # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+    for tag in cast(List["etree._Element"], tree.xpath("//*/" + tag_name)):
        if "src" in tag.attrib:
-            results.append(tag.attrib["src"])
+            results.append(cast(str, tag.attrib["src"]))
    return results


@ -244,11 +249,12 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
    parser = etree.HTMLParser(recover=True, encoding="utf-8")

    # Attempt to parse the body. If this fails, log and return no metadata.
-    tree = etree.fromstring(html_body, parser)
+    # TODO Develop of lxml-stubs has this correct.
+    tree = etree.fromstring(html_body, parser)  # type: ignore[arg-type]

    # The data was successfully parsed, but no tree was found.
    if tree is None:
-        return
+        return  # type: ignore[unreachable]

    # Attempt to find interesting URLs (images, videos, embeds).
    if "og:image" not in open_graph_response:
--- a/synapse/media/preview_html.py
+++ b/synapse/media/preview_html.py
@ -24,6 +24,7 @@ from typing import (
    Optional,
    Set,
    Union,
+    cast,
 )

 if TYPE_CHECKING:
@ -115,7 +116,7 @@ def _get_html_media_encodings(

 def decode_body(
    body: bytes, uri: str, content_type: Optional[str] = None
-) -> Optional["etree.Element"]:
+) -> Optional["etree._Element"]:
    """
    This uses lxml to parse the HTML document.

@ -152,11 +153,12 @@ def decode_body(

    # Attempt to parse the body. Returns None if the body was successfully
    # parsed, but no tree was found.
-    return etree.fromstring(body, parser)
+    # TODO Develop of lxml-stubs has this correct.
+    return etree.fromstring(body, parser)  # type: ignore[arg-type]


 def _get_meta_tags(
-    tree: "etree.Element",
+    tree: "etree._Element",
    property: str,
    prefix: str,
    property_mapper: Optional[Callable[[str], Optional[str]]] = None,
@ -175,9 +177,15 @@ def _get_meta_tags(
    Returns:
        A map of tag name to value.
    """
+    # This actually returns Dict[str, str], but the caller sets this as a variable
+    # which is Dict[str, Optional[str]].
    results: Dict[str, Optional[str]] = {}
-    for tag in tree.xpath(
-        f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
+    # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+    for tag in cast(
+        List["etree._Element"],
+        tree.xpath(
+            f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
+        ),
    ):
        # if we've got more than 50 tags, someone is taking the piss
        if len(results) >= 50:
@ -187,14 +195,15 @@ def _get_meta_tags(
            )
            return {}

-        key = tag.attrib[property]
+        key = cast(str, tag.attrib[property])
        if property_mapper:
-            key = property_mapper(key)
+            new_key = property_mapper(key)
            # None is a special value used to ignore a value.
-            if key is None:
+            if new_key is None:
                continue
+            key = new_key

-        results[key] = tag.attrib["content"]
+        results[key] = cast(str, tag.attrib["content"])

    return results

@ -219,7 +228,7 @@ def _map_twitter_to_open_graph(key: str) -> Optional[str]:
    return "og" + key[7:]


-def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
+def parse_html_to_open_graph(tree: "etree._Element") -> Dict[str, Optional[str]]:
    """
    Parse the HTML document into an Open Graph response.

@ -276,24 +285,36 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:

    if "og:title" not in og:
        # Attempt to find a title from the title tag, or the biggest header on the page.
-        title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
+        # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+        title = cast(
+            List["etree._ElementUnicodeResult"],
+            tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()"),
+        )
        if title:
            og["og:title"] = title[0].strip()
        else:
            og["og:title"] = None

    if "og:image" not in og:
-        meta_image = tree.xpath(
-            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
+        # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+        meta_image = cast(
+            List["etree._ElementUnicodeResult"],
+            tree.xpath(
+                "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
+            ),
        )
        # If a meta image is found, use it.
        if meta_image:
            og["og:image"] = meta_image[0]
        else:
            # Try to find images which are larger than 10px by 10px.
+            # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
            #
            # TODO: consider inlined CSS styles as well as width & height attribs
-            images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
+            images = cast(
+                List["etree._Element"],
+                tree.xpath("//img[@src][number(@width)>10][number(@height)>10]"),
+            )
            images = sorted(
                images,
                key=lambda i: (
@ -302,20 +323,29 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
            )
            # If no images were found, try to find *any* images.
            if not images:
-                images = tree.xpath("//img[@src][1]")
+                # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+                images = cast(List["etree._Element"], tree.xpath("//img[@src][1]"))
            if images:
-                og["og:image"] = images[0].attrib["src"]
+                og["og:image"] = cast(str, images[0].attrib["src"])

            # Finally, fallback to the favicon if nothing else.
            else:
-                favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
+                # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+                favicons = cast(
+                    List["etree._ElementUnicodeResult"],
+                    tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]"),
+                )
                if favicons:
                    og["og:image"] = favicons[0]

    if "og:description" not in og:
        # Check the first meta description tag for content.
-        meta_description = tree.xpath(
-            "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
+        # Cast: the type returned by xpath depends on the xpath expression: mypy can't deduce this.
+        meta_description = cast(
+            List["etree._ElementUnicodeResult"],
+            tree.xpath(
+                "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
+            ),
        )
        # If a meta description is found with content, use it.
        if meta_description:
@ -332,7 +362,7 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
    return og


-def parse_html_description(tree: "etree.Element") -> Optional[str]:
+def parse_html_description(tree: "etree._Element") -> Optional[str]:
    """
    Calculate a text description based on an HTML document.

@ -368,6 +398,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
        "canvas",
        "img",
        "picture",
+        # etree.Comment is a function which creates an etree._Comment element.
+        # The "tag" attribute of an etree._Comment instance is confusingly the
+        # etree.Comment function instead of a string.
        etree.Comment,
    }

@ -381,8 +414,8 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:


 def _iterate_over_text(
-    tree: Optional["etree.Element"],
-    tags_to_ignore: Set[Union[str, "etree.Comment"]],
+    tree: Optional["etree._Element"],
+    tags_to_ignore: Set[object],
    stack_limit: int = 1024,
 ) -> Generator[str, None, None]:
    """Iterate over the tree returning text nodes in a depth first fashion,
@ -402,7 +435,7 @@ def _iterate_over_text(

    # This is a stack whose items are elements to iterate over *or* strings
    # to be returned.
-    elements: List[Union[str, "etree.Element"]] = [tree]
+    elements: List[Union[str, "etree._Element"]] = [tree]
    while elements:
        el = elements.pop()

--- a/tests/media/test_html_preview.py
+++ b/tests/media/test_html_preview.py
@ -24,7 +24,7 @@ from tests import unittest
 try:
    import lxml
 except ImportError:
-    lxml = None
+    lxml = None  # type: ignore[assignment]


 class SummarizeTestCase(unittest.TestCase):
@ -160,6 +160,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -176,6 +177,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -195,6 +197,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(
@ -217,6 +220,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -231,6 +235,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@ -246,6 +251,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
@ -261,6 +267,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
@ -281,6 +288,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
@ -296,6 +304,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        """

        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)

        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@ -324,6 +333,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        <head><title>Foo</title></head><body>Some text.</body></html>
        """.strip()
        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)
        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

@ -338,6 +348,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        </html>
        """
        tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)
        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

@ -353,6 +364,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        </html>
        """
        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)
        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})

@ -367,6 +379,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        </html>
        """
        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)
        self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})

@ -380,6 +393,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        </html>
        """
        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)
        self.assertEqual(
            og,
@ -401,6 +415,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        </html>
        """
        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)
        self.assertEqual(
            og,
@ -419,6 +434,7 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
        with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
        """
        tree = decode_body(html, "http://example.com/test.html")
+        assert tree is not None
        og = parse_html_to_open_graph(tree)
        self.assertEqual(
            og,
--- a/tests/media/test_oembed.py
+++ b/tests/media/test_oembed.py
@ -28,7 +28,7 @@ from tests.unittest import HomeserverTestCase
 try:
    import lxml
 except ImportError:
-    lxml = None
+    lxml = None  # type: ignore[assignment]


 class OEmbedTests(HomeserverTestCase):
--- a/tests/media/test_url_previewer.py
+++ b/tests/media/test_url_previewer.py
@ -24,7 +24,7 @@ from tests.unittest import override_config
 try:
    import lxml
 except ImportError:
-    lxml = None
+    lxml = None  # type: ignore[assignment]


 class URLPreviewTests(unittest.HomeserverTestCase):
--- a/tests/rest/media/test_url_preview.py
+++ b/tests/rest/media/test_url_preview.py
@ -40,7 +40,7 @@ from tests.test_utils import SMALL_PNG
 try:
    import lxml
 except ImportError:
-    lxml = None
+    lxml = None  # type: ignore[assignment]


 class URLPreviewTests(unittest.HomeserverTestCase):