Support oEmbed for media previews. (#7920)

Fixes previews of Twitter URLs by using their oEmbed endpoint to grab content.
2020-07-27 07:50:44 -04:00 · 2020-07-27 07:50:44 -04:00 · 3fc8fdd150
parent b975fa2e99
commit 3fc8fdd150
3 changed files with 355 additions and 53 deletions
--- a/changelog.d/7920.feature
+++ b/changelog.d/7920.feature
@ -0,0 +1 @@
+Support oEmbed for media previews.
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@ -26,6 +26,7 @@ import traceback
 from typing import Dict, Optional
 from urllib import parse as urlparse

+import attr
 from canonicaljson import json

 from twisted.internet import defer
@ -56,6 +57,65 @@ _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 OG_TAG_NAME_MAXLEN = 50
 OG_TAG_VALUE_MAXLEN = 1000

+ONE_HOUR = 60 * 60 * 1000
+
+# A map of globs to API endpoints.
+_oembed_globs = {
+    # Twitter.
+    "https://publish.twitter.com/oembed": [
+        "https://twitter.com/*/status/*",
+        "https://*.twitter.com/*/status/*",
+        "https://twitter.com/*/moments/*",
+        "https://*.twitter.com/*/moments/*",
+        # Include the HTTP versions too.
+        "http://twitter.com/*/status/*",
+        "http://*.twitter.com/*/status/*",
+        "http://twitter.com/*/moments/*",
+        "http://*.twitter.com/*/moments/*",
+    ],
+}
+# Convert the globs to regular expressions.
+_oembed_patterns = {}
+for endpoint, globs in _oembed_globs.items():
+    for glob in globs:
+        # Convert the glob into a sane regular expression to match against. The
+        # rules followed will be slightly different for the domain portion vs.
+        # the rest.
+        #
+        # 1. The scheme must be one of HTTP / HTTPS (and have no globs).
+        # 2. The domain can have globs, but we limit it to characters that can
+        #    reasonably be a domain part.
+        #    TODO: This does not attempt to handle Unicode domain names.
+        # 3. Other parts allow a glob to be any one, or more, characters.
+        results = urlparse.urlparse(glob)
+
+        # Ensure the scheme does not have wildcards (and is a sane scheme).
+        if results.scheme not in {"http", "https"}:
+            raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,))
+
+        pattern = urlparse.urlunparse(
+            [
+                results.scheme,
+                re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),
+            ]
+            + [re.escape(part).replace("\\*", ".+") for part in results[2:]]
+        )
+        _oembed_patterns[re.compile(pattern)] = endpoint
+
+
+@attr.s
+class OEmbedResult:
+    # Either HTML content or URL must be provided.
+    html = attr.ib(type=Optional[str])
+    url = attr.ib(type=Optional[str])
+    title = attr.ib(type=Optional[str])
+    # Number of seconds to cache the content.
+    cache_age = attr.ib(type=int)
+
+
+class OEmbedError(Exception):
+    """An error occurred processing the oEmbed object."""
+

 class PreviewUrlResource(DirectServeJsonResource):
    isLeaf = True
@ -99,7 +159,7 @@ class PreviewUrlResource(DirectServeJsonResource):
            cache_name="url_previews",
            clock=self.clock,
            # don't spider URLs more often than once an hour
-            expiry_ms=60 * 60 * 1000,
+            expiry_ms=ONE_HOUR,
        )

        if self._worker_run_media_background_jobs:
@ -310,6 +370,87 @@ class PreviewUrlResource(DirectServeJsonResource):

        return jsonog.encode("utf8")

+    def _get_oembed_url(self, url: str) -> Optional[str]:
+        """
+        Check whether the URL should be downloaded as oEmbed content instead.
+
+        Params:
+            url: The URL to check.
+
+        Returns:
+            A URL to use instead or None if the original URL should be used.
+        """
+        for url_pattern, endpoint in _oembed_patterns.items():
+            if url_pattern.fullmatch(url):
+                return endpoint
+
+        # No match.
+        return None
+
+    async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
+        """
+        Request content from an oEmbed endpoint.
+
+        Params:
+            endpoint: The oEmbed API endpoint.
+            url: The URL to pass to the API.
+
+        Returns:
+            An object representing the metadata returned.
+
+        Raises:
+            OEmbedError if fetching or parsing of the oEmbed information fails.
+        """
+        try:
+            logger.debug("Trying to get oEmbed content for url '%s'", url)
+            result = await self.client.get_json(
+                endpoint,
+                # TODO Specify max height / width.
+                # Note that only the JSON format is supported.
+                args={"url": url},
+            )
+
+            # Ensure there's a version of 1.0.
+            if result.get("version") != "1.0":
+                raise OEmbedError("Invalid version: %s" % (result.get("version"),))
+
+            oembed_type = result.get("type")
+
+            # Ensure the cache age is None or an int.
+            cache_age = result.get("cache_age")
+            if cache_age:
+                cache_age = int(cache_age)
+
+            oembed_result = OEmbedResult(None, None, result.get("title"), cache_age)
+
+            # HTML content.
+            if oembed_type == "rich":
+                oembed_result.html = result.get("html")
+                return oembed_result
+
+            if oembed_type == "photo":
+                oembed_result.url = result.get("url")
+                return oembed_result
+
+            # TODO Handle link and video types.
+
+            if "thumbnail_url" in result:
+                oembed_result.url = result.get("thumbnail_url")
+                return oembed_result
+
+            raise OEmbedError("Incompatible oEmbed information.")
+
+        except OEmbedError as e:
+            # Trap OEmbedErrors first so we can directly re-raise them.
+            logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
+            raise
+
+        except Exception as e:
+            # Trap any exception and let the code follow as usual.
+            # FIXME: pass through 404s and other error messages nicely
+            logger.warning("Error downloading oEmbed metadata from %s: %r", url, e)
+            raise OEmbedError() from e
+
    async def _download_url(self, url, user):
        # TODO: we should probably honour robots.txt... except in practice
        # we're most likely being explicitly triggered by a human rather than a
@ -319,54 +460,90 @@ class PreviewUrlResource(DirectServeJsonResource):

        file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

-        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+        # If this URL can be accessed via oEmbed, use that instead.
+        url_to_download = url
+        oembed_url = self._get_oembed_url(url)
+        if oembed_url:
+            # The result might be a new URL to download, or it might be HTML content.
            try:
-                logger.debug("Trying to get preview for url '%s'", url)
-                length, headers, uri, code = await self.client.get_file(
-                    url,
-                    output_stream=f,
-                    max_size=self.max_spider_size,
-                    headers={"Accept-Language": self.url_preview_accept_language},
-                )
-            except SynapseError:
-                # Pass SynapseErrors through directly, so that the servlet
-                # handler will return a SynapseError to the client instead of
-                # blank data or a 500.
-                raise
-            except DNSLookupError:
-                # DNS lookup returned no results
-                # Note: This will also be the case if one of the resolved IP
-                # addresses is blacklisted
-                raise SynapseError(
-                    502,
-                    "DNS resolution failure during URL preview generation",
-                    Codes.UNKNOWN,
-                )
-            except Exception as e:
-                # FIXME: pass through 404s and other error messages nicely
-                logger.warning("Error downloading %s: %r", url, e)
+                oembed_result = await self._get_oembed_content(oembed_url, url)
+                if oembed_result.url:
+                    url_to_download = oembed_result.url
+                elif oembed_result.html:
+                    url_to_download = None
+            except OEmbedError:
+                # If an error occurs, try doing a normal preview.
+                pass

-                raise SynapseError(
-                    500,
-                    "Failed to download content: %s"
-                    % (traceback.format_exception_only(sys.exc_info()[0], e),),
-                    Codes.UNKNOWN,
-                )
-            await finish()
+        if url_to_download:
+            with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+                try:
+                    logger.debug("Trying to get preview for url '%s'", url_to_download)
+                    length, headers, uri, code = await self.client.get_file(
+                        url_to_download,
+                        output_stream=f,
+                        max_size=self.max_spider_size,
+                        headers={"Accept-Language": self.url_preview_accept_language},
+                    )
+                except SynapseError:
+                    # Pass SynapseErrors through directly, so that the servlet
+                    # handler will return a SynapseError to the client instead of
+                    # blank data or a 500.
+                    raise
+                except DNSLookupError:
+                    # DNS lookup returned no results
+                    # Note: This will also be the case if one of the resolved IP
+                    # addresses is blacklisted
+                    raise SynapseError(
+                        502,
+                        "DNS resolution failure during URL preview generation",
+                        Codes.UNKNOWN,
+                    )
+                except Exception as e:
+                    # FIXME: pass through 404s and other error messages nicely
+                    logger.warning("Error downloading %s: %r", url_to_download, e)
+
+                    raise SynapseError(
+                        500,
+                        "Failed to download content: %s"
+                        % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                        Codes.UNKNOWN,
+                    )
+                await finish()
+
+                if b"Content-Type" in headers:
+                    media_type = headers[b"Content-Type"][0].decode("ascii")
+                else:
+                    media_type = "application/octet-stream"
+
+                download_name = get_filename_from_headers(headers)
+
+                # FIXME: we should calculate a proper expiration based on the
+                # Cache-Control and Expire headers.  But for now, assume 1 hour.
+                expires = ONE_HOUR
+                etag = headers["ETag"][0] if "ETag" in headers else None
+        else:
+            html_bytes = oembed_result.html.encode("utf-8")  # type: ignore
+            with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+                f.write(html_bytes)
+                await finish()
+
+            media_type = "text/html"
+            download_name = oembed_result.title
+            length = len(html_bytes)
+            # If a specific cache age was not given, assume 1 hour.
+            expires = oembed_result.cache_age or ONE_HOUR
+            uri = oembed_url
+            code = 200
+            etag = None

        try:
-            if b"Content-Type" in headers:
-                media_type = headers[b"Content-Type"][0].decode("ascii")
-            else:
-                media_type = "application/octet-stream"
            time_now_ms = self.clock.time_msec()

-            download_name = get_filename_from_headers(headers)
-
            await self.store.store_local_media(
                media_id=file_id,
                media_type=media_type,
-                time_now_ms=self.clock.time_msec(),
+                time_now_ms=time_now_ms,
                upload_name=download_name,
                media_length=length,
                user_id=user,
@ -389,10 +566,8 @@ class PreviewUrlResource(DirectServeJsonResource):
            "filename": fname,
            "uri": uri,
            "response_code": code,
-            # FIXME: we should calculate a proper expiration based on the
-            # Cache-Control and Expire headers.  But for now, assume 1 hour.
-            "expires": 60 * 60 * 1000,
-            "etag": headers["ETag"][0] if "ETag" in headers else None,
+            "expires": expires,
+            "etag": etag,
        }

    def _start_expire_url_cache_data(self):
@ -449,7 +624,7 @@ class PreviewUrlResource(DirectServeJsonResource):
        # These may be cached for a bit on the client (i.e., they
        # may have a room open with a preview url thing open).
        # So we wait a couple of days before deleting, just in case.
-        expire_before = now - 2 * 24 * 60 * 60 * 1000
+        expire_before = now - 2 * 24 * ONE_HOUR
        media_ids = await self.store.get_url_cache_media_before(expire_before)

        removed_media = []
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@ -12,8 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import json
 import os
+import re
+
+from mock import patch

 import attr

@ -131,7 +134,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        self.reactor.nameResolver = Resolver()

    def test_cache_returns_correct_type(self):
-        self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

        request, channel = self.make_request(
            "GET", "url_preview?url=http://matrix.org", shorthand=False
@ -187,7 +190,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        )

    def test_non_ascii_preview_httpequiv(self):
-        self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

        end_content = (
            b"<html><head>"
@ -221,7 +224,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

    def test_non_ascii_preview_content_type(self):
-        self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

        end_content = (
            b"<html><head>"
@ -254,7 +257,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

    def test_overlong_title(self):
-        self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
+        self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

        end_content = (
            b"<html><head>"
@ -292,7 +295,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        """
        IP addresses can be previewed directly.
        """
-        self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")]
+        self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

        request, channel = self.make_request(
            "GET", "url_preview?url=http://example.com", shorthand=False
@ -439,7 +442,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        # Hardcode the URL resolving to the IP we want.
        self.lookups["example.com"] = [
            (IPv4Address, "1.1.1.2"),
-            (IPv4Address, "8.8.8.8"),
+            (IPv4Address, "10.1.2.3"),
        ]

        request, channel = self.make_request(
@ -518,7 +521,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        """
        Accept-Language header is sent to the remote server
        """
-        self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")]
+        self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

        # Build and make a request to the server
        request, channel = self.make_request(
@ -562,3 +565,126 @@ class URLPreviewTests(unittest.HomeserverTestCase):
            ),
            server.data,
        )
+
+    def test_oembed_photo(self):
+        """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
+        # Route the HTTP version to an HTTP endpoint so that the tests work.
+        with patch.dict(
+            "synapse.rest.media.v1.preview_url_resource._oembed_patterns",
+            {
+                re.compile(
+                    r"http://twitter\.com/.+/status/.+"
+                ): "http://publish.twitter.com/oembed",
+            },
+            clear=True,
+        ):
+
+            self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+            self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+            result = {
+                "version": "1.0",
+                "type": "photo",
+                "url": "http://cdn.twitter.com/matrixdotorg",
+            }
+            oembed_content = json.dumps(result).encode("utf-8")
+
+            end_content = (
+                b"<html><head>"
+                b"<title>Some Title</title>"
+                b'<meta property="og:description" content="hi" />'
+                b"</head></html>"
+            )
+
+            request, channel = self.make_request(
+                "GET",
+                "url_preview?url=http://twitter.com/matrixdotorg/status/12345",
+                shorthand=False,
+            )
+            request.render(self.preview_url)
+            self.pump()
+
+            client = self.reactor.tcpClients[0][2].buildProtocol(None)
+            server = AccumulatingProtocol()
+            server.makeConnection(FakeTransport(client, self.reactor))
+            client.makeConnection(FakeTransport(server, self.reactor))
+            client.dataReceived(
+                (
+                    b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                    b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+                )
+                % (len(oembed_content),)
+                + oembed_content
+            )
+
+            self.pump()
+
+            client = self.reactor.tcpClients[1][2].buildProtocol(None)
+            server = AccumulatingProtocol()
+            server.makeConnection(FakeTransport(client, self.reactor))
+            client.makeConnection(FakeTransport(server, self.reactor))
+            client.dataReceived(
+                (
+                    b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                    b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+                )
+                % (len(end_content),)
+                + end_content
+            )
+
+            self.pump()
+
+            self.assertEqual(channel.code, 200)
+            self.assertEqual(
+                channel.json_body, {"og:title": "Some Title", "og:description": "hi"}
+            )
+
+    def test_oembed_rich(self):
+        """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
+        # Route the HTTP version to an HTTP endpoint so that the tests work.
+        with patch.dict(
+            "synapse.rest.media.v1.preview_url_resource._oembed_patterns",
+            {
+                re.compile(
+                    r"http://twitter\.com/.+/status/.+"
+                ): "http://publish.twitter.com/oembed",
+            },
+            clear=True,
+        ):
+
+            self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+            result = {
+                "version": "1.0",
+                "type": "rich",
+                "html": "<div>Content Preview</div>",
+            }
+            end_content = json.dumps(result).encode("utf-8")
+
+            request, channel = self.make_request(
+                "GET",
+                "url_preview?url=http://twitter.com/matrixdotorg/status/12345",
+                shorthand=False,
+            )
+            request.render(self.preview_url)
+            self.pump()
+
+            client = self.reactor.tcpClients[0][2].buildProtocol(None)
+            server = AccumulatingProtocol()
+            server.makeConnection(FakeTransport(client, self.reactor))
+            client.makeConnection(FakeTransport(server, self.reactor))
+            client.dataReceived(
+                (
+                    b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                    b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+                )
+                % (len(end_content),)
+                + end_content
+            )
+
+            self.pump()
+            self.assertEqual(channel.code, 200)
+            self.assertEqual(
+                channel.json_body,
+                {"og:title": None, "og:description": "Content Preview"},
+            )