Refactor oEmbed previews (#10814)

The major change is moving the decision of whether to use oEmbed further up the call-stack. This reverts the _download_url method to being a "dumb" functionwhich takes a single URL and downloads it (as it was before #7920). This also makes more minor refactorings: * Renames internal variables for clarity. * Factors out shared code between the HTML and rich oEmbed previews. * Fixes tests to preview an oEmbed image.
2021-09-21 12:09:57 -04:00 · 2021-09-21 12:09:57 -04:00 · ba7a91aea5
parent 2843058a8b
commit ba7a91aea5
5 changed files with 299 additions and 220 deletions
--- a/changelog.d/10814.feature
+++ b/changelog.d/10814.feature
@ -0,0 +1 @@
 Improve oEmbed previews by processing the author name, photo, and video information.
--- a/docs/development/url_previews.md
+++ b/docs/development/url_previews.md
@ -25,16 +25,14 @@ When Synapse is asked to preview a URL it does the following:
 3. Kicks off a background process to generate a preview:
   1. Checks the database cache by URL and timestamp and returns the result if it
      has not expired and was successful (a 2xx return code).
-   2. Checks if the URL matches an oEmbed pattern. If it does, fetch the oEmbed
+   2. Checks if the URL matches an [oEmbed](https://oembed.com/) pattern. If it
-      response. If this is an image, replace the URL to fetch and continue. If
+      does, update the URL to download.
-      if it is HTML content, use the HTML as the document and continue.
+   3. Downloads the URL and stores it into a file via the media storage provider
-   3. If it doesn't match an oEmbed pattern, downloads the URL and stores it
+      and saves the local media metadata.
-      into a file via the media storage provider and saves the local media
+   4. If the media is an image:
      metadata.
   5. If the media is an image:
      1. Generates thumbnails.
      2. Generates an Open Graph response based on image properties.
-   6. If the media is HTML:
+   5. If the media is HTML:
      1. Decodes the HTML via the stored file.
      2. Generates an Open Graph response from the HTML.
      3. If an image exists in the Open Graph response:
@ -42,6 +40,13 @@ When Synapse is asked to preview a URL it does the following:
            provider and saves the local media metadata.
         2. Generates thumbnails.
         3. Updates the Open Graph response based on image properties.
   6. If the media is JSON and an oEmbed URL was found:
      1. Convert the oEmbed response to an Open Graph response.
      2. If a thumbnail or image is in the oEmbed response:
         1. Downloads the URL and stores it into a file via the media storage
            provider and saves the local media metadata.
         2. Generates thumbnails.
         3. Updates the Open Graph response based on image properties.
   7. Stores the result in the database cache.
 4. Returns the result.
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@ -12,11 +12,14 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 import logging
 import urllib.parse
 from typing import TYPE_CHECKING, Optional
 import attr
 from synapse.http.client import SimpleHttpClient
 from synapse.types import JsonDict
 from synapse.util import json_decoder
 if TYPE_CHECKING:
    from synapse.server import HomeServer
@ -24,18 +27,15 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
-@attr.s(slots=True, auto_attribs=True)
+@attr.s(slots=True, frozen=True, auto_attribs=True)
 class OEmbedResult:
-    # Either HTML content or URL must be provided.
+    # The Open Graph result (converted from the oEmbed result).
-    html: Optional[str]
+    open_graph_result: JsonDict
-    url: Optional[str]
+    # Number of seconds to cache the content, according to the oEmbed response.
-    title: Optional[str]
+    #
-    # Number of seconds to cache the content.
+    # This will be None if no cache-age is provided in the oEmbed response (or
-    cache_age: int
+    # if the oEmbed response cannot be turned into an Open Graph response).
-
+    cache_age: Optional[int]
 class OEmbedError(Exception):
    """An error occurred processing the oEmbed object."""
 class OEmbedProvider:
@ -81,75 +81,106 @@ class OEmbedProvider:
        """
        for url_pattern, endpoint in self._oembed_patterns.items():
            if url_pattern.fullmatch(url):
-                return endpoint
+                # TODO Specify max height / width.
                # Note that only the JSON format is supported, some endpoints want
                # this in the URL, others want it as an argument.
                endpoint = endpoint.replace("{format}", "json")
                args = {"url": url, "format": "json"}
                query_str = urllib.parse.urlencode(args, True)
                return f"{endpoint}?{query_str}"
        # No match.
        return None
-    async def get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
+    def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
        """
-        Request content from an oEmbed endpoint.
+        Parse the oEmbed response into an Open Graph response.
        Args:
-            endpoint: The oEmbed API endpoint.
+            url: The URL which is being previewed (not the one which was
-            url: The URL to pass to the API.
+                requested).
            raw_body: The oEmbed response as JSON encoded as bytes.
        Returns:
-            An object representing the metadata returned.
+            json-encoded Open Graph data
        Raises:
            OEmbedError if fetching or parsing of the oEmbed information fails.
        """
        try:
-            logger.debug("Trying to get oEmbed content for url '%s'", url)
+            # oEmbed responses *must* be UTF-8 according to the spec.
-
+            oembed = json_decoder.decode(raw_body.decode("utf-8"))
            # Note that only the JSON format is supported, some endpoints want
            # this in the URL, others want it as an argument.
            endpoint = endpoint.replace("{format}", "json")
            result = await self._client.get_json(
                endpoint,
                # TODO Specify max height / width.
                args={"url": url, "format": "json"},
            )
            # Ensure there's a version of 1.0.
-            if result.get("version") != "1.0":
+            oembed_version = oembed["version"]
-                raise OEmbedError("Invalid version: %s" % (result.get("version"),))
+            if oembed_version != "1.0":
-
+                raise RuntimeError(f"Invalid version: {oembed_version}")
            oembed_type = result.get("type")
            # Ensure the cache age is None or an int.
-            cache_age = result.get("cache_age")
+            cache_age = oembed.get("cache_age")
            if cache_age:
                cache_age = int(cache_age)
-            oembed_result = OEmbedResult(None, None, result.get("title"), cache_age)
+            # The results.
            open_graph_response = {"og:title": oembed.get("title")}
-            # HTML content.
+            # If a thumbnail exists, use it. Note that dimensions will be calculated later.
            if "thumbnail_url" in oembed:
                open_graph_response["og:image"] = oembed["thumbnail_url"]
            # Process each type separately.
            oembed_type = oembed["type"]
            if oembed_type == "rich":
-                oembed_result.html = result.get("html")
+                calc_description_and_urls(open_graph_response, oembed["html"])
                return oembed_result
-            if oembed_type == "photo":
+            elif oembed_type == "photo":
-                oembed_result.url = result.get("url")
+                # If this is a photo, use the full image, not the thumbnail.
-                return oembed_result
+                open_graph_response["og:image"] = oembed["url"]
-            # TODO Handle link and video types.
+            else:
-
+                raise RuntimeError(f"Unknown oEmbed type: {oembed_type}")
            if "thumbnail_url" in result:
                oembed_result.url = result.get("thumbnail_url")
                return oembed_result
            raise OEmbedError("Incompatible oEmbed information.")
        except OEmbedError as e:
            # Trap OEmbedErrors first so we can directly re-raise them.
            logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
            raise
        except Exception as e:
            # Trap any exception and let the code follow as usual.
-            # FIXME: pass through 404s and other error messages nicely
+            logger.warning(f"Error parsing oEmbed metadata from {url}: {e:r}")
-            logger.warning("Error downloading oEmbed metadata from %s: %r", url, e)
+            open_graph_response = {}
-            raise OEmbedError() from e
+            cache_age = None
        return OEmbedResult(open_graph_response, cache_age)
 def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
    """
    Calculate description for an HTML document.
    This uses lxml to convert the HTML document into plaintext. If errors
    occur during processing of the document, an empty response is returned.
    Args:
        open_graph_response: The current Open Graph summary. This is updated with additional fields.
        html_body: The HTML document, as bytes.
    Returns:
        The summary
    """
    # If there's no body, nothing useful is going to be found.
    if not html_body:
        return
    from lxml import etree
    # Create an HTML parser. If this fails, log and return no metadata.
    parser = etree.HTMLParser(recover=True, encoding="utf-8")
    # Attempt to parse the body. If this fails, log and return no metadata.
    tree = etree.fromstring(html_body, parser)
    # The data was successfully parsed, but no tree was found.
    if tree is None:
        return
    from synapse.rest.media.v1.preview_url_resource import _calc_description
    description = _calc_description(tree)
    if description:
        open_graph_response["og:description"] = description
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@ -44,7 +44,7 @@ from synapse.logging.context import make_deferred_yieldable, run_in_background
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.rest.media.v1._base import get_filename_from_headers
 from synapse.rest.media.v1.media_storage import MediaStorage
-from synapse.rest.media.v1.oembed import OEmbedError, OEmbedProvider
+from synapse.rest.media.v1.oembed import OEmbedProvider
 from synapse.types import JsonDict
 from synapse.util import json_encoder
 from synapse.util.async_helpers import ObservableDeferred
@ -73,6 +73,7 @@ OG_TAG_NAME_MAXLEN = 50
 OG_TAG_VALUE_MAXLEN = 1000
 ONE_HOUR = 60 * 60 * 1000
 ONE_DAY = 24 * ONE_HOUR
@attr.s(slots=True, frozen=True, auto_attribs=True)
@ -255,10 +256,19 @@ class PreviewUrlResource(DirectServeJsonResource):
                og = og.encode("utf8")
            return og
-        media_info = await self._download_url(url, user)
+        # If this URL can be accessed via oEmbed, use that instead.
        url_to_download = url
        oembed_url = self._oembed.get_oembed_url(url)
        if oembed_url:
            url_to_download = oembed_url
        media_info = await self._download_url(url_to_download, user)
        logger.debug("got media_info of '%s'", media_info)
        # The number of milliseconds that the response should be considered valid.
        expiration_ms = media_info.expires
        if _is_media(media_info.media_type):
            file_id = media_info.filesystem_id
            dims = await self.media_repo._generate_thumbnails(
@ -288,34 +298,22 @@ class PreviewUrlResource(DirectServeJsonResource):
            encoding = get_html_media_encoding(body, media_info.media_type)
            og = decode_and_calc_og(body, media_info.uri, encoding)
-            # pre-cache the image for posterity
+            await self._precache_image_url(user, media_info, og)
            # FIXME: it might be cleaner to use the same flow as the main /preview_url
            # request itself and benefit from the same caching etc.  But for now we
            # just rely on the caching on the master request to speed things up.
            if "og:image" in og and og["og:image"]:
                image_info = await self._download_url(
                    _rebase_url(og["og:image"], media_info.uri), user
                )
-                if _is_media(image_info.media_type):
+        elif oembed_url and _is_json(media_info.media_type):
-                    # TODO: make sure we don't choke on white-on-transparent images
+            # Handle an oEmbed response.
-                    file_id = image_info.filesystem_id
+            with open(media_info.filename, "rb") as file:
-                    dims = await self.media_repo._generate_thumbnails(
+                body = file.read()
-                        None, file_id, file_id, image_info.media_type, url_cache=True
+
-                    )
+            oembed_response = self._oembed.parse_oembed_response(media_info.uri, body)
-                    if dims:
+            og = oembed_response.open_graph_result
-                        og["og:image:width"] = dims["width"]
+
-                        og["og:image:height"] = dims["height"]
+            # Use the cache age from the oEmbed result, instead of the HTTP response.
-                    else:
+            if oembed_response.cache_age is not None:
-                        logger.warning("Couldn't get dims for %s", og["og:image"])
+                expiration_ms = oembed_response.cache_age
            await self._precache_image_url(user, media_info, og)
                    og[
                        "og:image"
                    ] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
                    og["og:image:type"] = image_info.media_type
                    og["matrix:image:size"] = image_info.media_length
                else:
                    del og["og:image"]
        else:
            logger.warning("Failed to find any OG data in %s", url)
            og = {}
@ -336,12 +334,15 @@ class PreviewUrlResource(DirectServeJsonResource):
        jsonog = json_encoder.encode(og)
        # Cap the amount of time to consider a response valid.
        expiration_ms = min(expiration_ms, ONE_DAY)
        # store OG in history-aware DB cache
        await self.store.store_url_cache(
            url,
            media_info.response_code,
            media_info.etag,
-            media_info.expires + media_info.created_ts_ms,
+            media_info.created_ts_ms + expiration_ms,
            jsonog,
            media_info.filesystem_id,
            media_info.created_ts_ms,
@ -358,88 +359,52 @@ class PreviewUrlResource(DirectServeJsonResource):
        file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
-        # If this URL can be accessed via oEmbed, use that instead.
+        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
        url_to_download: Optional[str] = url
        oembed_url = self._oembed.get_oembed_url(url)
        if oembed_url:
            # The result might be a new URL to download, or it might be HTML content.
            try:
-                oembed_result = await self._oembed.get_oembed_content(oembed_url, url)
+                logger.debug("Trying to get preview for url '%s'", url)
-                if oembed_result.url:
+                length, headers, uri, code = await self.client.get_file(
-                    url_to_download = oembed_result.url
+                    url,
-                elif oembed_result.html:
+                    output_stream=f,
-                    url_to_download = None
+                    max_size=self.max_spider_size,
-            except OEmbedError:
+                    headers={"Accept-Language": self.url_preview_accept_language},
                # If an error occurs, try doing a normal preview.
                pass
        if url_to_download:
            with self.media_storage.store_into_file(file_info) as (f, fname, finish):
                try:
                    logger.debug("Trying to get preview for url '%s'", url_to_download)
                    length, headers, uri, code = await self.client.get_file(
                        url_to_download,
                        output_stream=f,
                        max_size=self.max_spider_size,
                        headers={"Accept-Language": self.url_preview_accept_language},
                    )
                except SynapseError:
                    # Pass SynapseErrors through directly, so that the servlet
                    # handler will return a SynapseError to the client instead of
                    # blank data or a 500.
                    raise
                except DNSLookupError:
                    # DNS lookup returned no results
                    # Note: This will also be the case if one of the resolved IP
                    # addresses is blacklisted
                    raise SynapseError(
                        502,
                        "DNS resolution failure during URL preview generation",
                        Codes.UNKNOWN,
                    )
                except Exception as e:
                    # FIXME: pass through 404s and other error messages nicely
                    logger.warning("Error downloading %s: %r", url_to_download, e)
                    raise SynapseError(
                        500,
                        "Failed to download content: %s"
                        % (traceback.format_exception_only(sys.exc_info()[0], e),),
                        Codes.UNKNOWN,
                    )
                await finish()
                if b"Content-Type" in headers:
                    media_type = headers[b"Content-Type"][0].decode("ascii")
                else:
                    media_type = "application/octet-stream"
                download_name = get_filename_from_headers(headers)
                # FIXME: we should calculate a proper expiration based on the
                # Cache-Control and Expire headers.  But for now, assume 1 hour.
                expires = ONE_HOUR
                etag = (
                    headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
                )
-        else:
+            except SynapseError:
-            # we can only get here if we did an oembed request and have an oembed_result.html
+                # Pass SynapseErrors through directly, so that the servlet
-            assert oembed_result.html is not None
+                # handler will return a SynapseError to the client instead of
-            assert oembed_url is not None
+                # blank data or a 500.
                raise
            except DNSLookupError:
                # DNS lookup returned no results
                # Note: This will also be the case if one of the resolved IP
                # addresses is blacklisted
                raise SynapseError(
                    502,
                    "DNS resolution failure during URL preview generation",
                    Codes.UNKNOWN,
                )
            except Exception as e:
                # FIXME: pass through 404s and other error messages nicely
                logger.warning("Error downloading %s: %r", url, e)
-            html_bytes = oembed_result.html.encode("utf-8")
+                raise SynapseError(
-            with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+                    500,
-                f.write(html_bytes)
+                    "Failed to download content: %s"
-                await finish()
+                    % (traceback.format_exception_only(sys.exc_info()[0], e),),
                    Codes.UNKNOWN,
                )
            await finish()
-            media_type = "text/html"
+            if b"Content-Type" in headers:
-            download_name = oembed_result.title
+                media_type = headers[b"Content-Type"][0].decode("ascii")
-            length = len(html_bytes)
+            else:
-            # If a specific cache age was not given, assume 1 hour.
+                media_type = "application/octet-stream"
-            expires = oembed_result.cache_age or ONE_HOUR
+
-            uri = oembed_url
+            download_name = get_filename_from_headers(headers)
-            code = 200
+
-            etag = None
+            # FIXME: we should calculate a proper expiration based on the
            # Cache-Control and Expire headers.  But for now, assume 1 hour.
            expires = ONE_HOUR
            etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
        try:
            time_now_ms = self.clock.time_msec()
@ -474,6 +439,46 @@ class PreviewUrlResource(DirectServeJsonResource):
            etag=etag,
        )
    async def _precache_image_url(
        self, user: str, media_info: MediaInfo, og: JsonDict
    ) -> None:
        """
        Pre-cache the image (if one exists) for posterity
        Args:
            user: The user requesting the preview.
            media_info: The media being previewed.
            og: The Open Graph dictionary. This is modified with image information.
        """
        # If there's no image or it is blank, there's nothing to do.
        if "og:image" not in og or not og["og:image"]:
            return
        # FIXME: it might be cleaner to use the same flow as the main /preview_url
        # request itself and benefit from the same caching etc.  But for now we
        # just rely on the caching on the master request to speed things up.
        image_info = await self._download_url(
            _rebase_url(og["og:image"], media_info.uri), user
        )
        if _is_media(image_info.media_type):
            # TODO: make sure we don't choke on white-on-transparent images
            file_id = image_info.filesystem_id
            dims = await self.media_repo._generate_thumbnails(
                None, file_id, file_id, image_info.media_type, url_cache=True
            )
            if dims:
                og["og:image:width"] = dims["width"]
                og["og:image:height"] = dims["height"]
            else:
                logger.warning("Couldn't get dims for %s", og["og:image"])
            og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
            og["og:image:type"] = image_info.media_type
            og["matrix:image:size"] = image_info.media_length
        else:
            del og["og:image"]
    def _start_expire_url_cache_data(self) -> Deferred:
        return run_as_background_process(
            "expire_url_cache_data", self._expire_url_cache_data
@ -527,7 +532,7 @@ class PreviewUrlResource(DirectServeJsonResource):
        # These may be cached for a bit on the client (i.e., they
        # may have a room open with a preview url thing open).
        # So we wait a couple of days before deleting, just in case.
-        expire_before = now - 2 * 24 * ONE_HOUR
+        expire_before = now - 2 * ONE_DAY
        media_ids = await self.store.get_url_cache_media_before(expire_before)
        removed_media = []
@ -669,7 +674,18 @@ def decode_and_calc_og(
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
-    # suck our tree into lxml and define our OG response.
+    """
    Calculate metadata for an HTML document.
    This uses lxml to search the HTML document for Open Graph data.
    Args:
        tree: The parsed HTML document.
        media_url: The URI used to download the body.
    Returns:
        The Open Graph response as a dictionary.
    """
    # if we see any image URLs in the OG response, then spider them
    # (although the client could choose to do this by asking for previews of those
@ -743,35 +759,7 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
        if meta_description:
            og["og:description"] = meta_description[0]
        else:
-            # grab any text nodes which are inside the <body/> tag...
+            og["og:description"] = _calc_description(tree)
            # unless they are within an HTML5 semantic markup tag...
            # <header/>, <nav/>, <aside/>, <footer/>
            # ...or if they are within a <script/> or <style/> tag.
            # This is a very very very coarse approximation to a plain text
            # render of the page.
            # We don't just use XPATH here as that is slow on some machines.
            from lxml import etree
            TAGS_TO_REMOVE = (
                "header",
                "nav",
                "aside",
                "footer",
                "script",
                "noscript",
                "style",
                etree.Comment,
            )
            # Split all the text nodes into paragraphs (by splitting on new
            # lines)
            text_nodes = (
                re.sub(r"\s+", "\n", el).strip()
                for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
            )
            og["og:description"] = summarize_paragraphs(text_nodes)
    elif og["og:description"]:
        # This must be a non-empty string at this point.
        assert isinstance(og["og:description"], str)
@ -782,6 +770,46 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
    return og
 def _calc_description(tree: "etree.Element") -> Optional[str]:
    """
    Calculate a text description based on an HTML document.
    Grabs any text nodes which are inside the <body/> tag, unless they are within
    an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or
    if they are within a <script/> or <style/> tag.
    This is a very very very coarse approximation to a plain text render of the page.
    Args:
        tree: The parsed HTML document.
    Returns:
        The plain text description, or None if one cannot be generated.
    """
    # We don't just use XPATH here as that is slow on some machines.
    from lxml import etree
    TAGS_TO_REMOVE = (
        "header",
        "nav",
        "aside",
        "footer",
        "script",
        "noscript",
        "style",
        etree.Comment,
    )
    # Split all the text nodes into paragraphs (by splitting on new
    # lines)
    text_nodes = (
        re.sub(r"\s+", "\n", el).strip()
        for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
    )
    return summarize_paragraphs(text_nodes)
 def _iterate_over_text(
    tree: "etree.Element", *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
 ) -> Generator[str, None, None]:
@ -841,11 +869,25 @@ def _is_html(content_type: str) -> bool:
    )
 def _is_json(content_type: str) -> bool:
    return content_type.lower().startswith("application/json")
 def summarize_paragraphs(
    text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
 ) -> Optional[str]:
-    # Try to get a summary of between 200 and 500 words, respecting
+    """
-    # first paragraph and then word boundaries.
+    Try to get a summary respecting first paragraph and then word boundaries.
    Args:
        text_nodes: The paragraphs to summarize.
        min_size: The minimum number of words to include.
        max_size: The maximum number of words to include.
    Returns:
        A summary of the text nodes, or None if that was not possible.
    """
    # TODO: Respect sentences?
    description = ""
@ -868,7 +910,7 @@ def summarize_paragraphs(
        new_desc = ""
        # This splits the paragraph into words, but keeping the
-        # (preceeding) whitespace intact so we can easily concat
+        # (preceding) whitespace intact so we can easily concat
        # words back together.
        for match in re.finditer(r"\s*\S+", description):
            word = match.group()
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@ -24,6 +24,7 @@ from synapse.config.oembed import OEmbedEndpointConfig
 from tests import unittest
 from tests.server import FakeTransport
 from tests.test_utils import SMALL_PNG
 try:
    import lxml
@ -576,13 +577,6 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        }
        oembed_content = json.dumps(result).encode("utf-8")
        end_content = (
            b"<html><head>"
            b"<title>Some Title</title>"
            b'<meta property="og:description" content="hi" />'
            b"</head></html>"
        )
        channel = self.make_request(
            "GET",
            "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
@ -606,6 +600,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        self.pump()
        # Ensure a second request is made to the photo URL.
        client = self.reactor.tcpClients[1][2].buildProtocol(None)
        server = AccumulatingProtocol()
        server.makeConnection(FakeTransport(client, self.reactor))
@ -613,18 +608,23 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        client.dataReceived(
            (
                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
-                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+                b"Content-Type: image/png\r\n\r\n"
            )
-            % (len(end_content),)
+            % (len(SMALL_PNG),)
-            + end_content
+            + SMALL_PNG
        )
        self.pump()
        # Ensure the URL is what was requested.
        self.assertIn(b"/matrixdotorg", server.data)
        self.assertEqual(channel.code, 200)
-        self.assertEqual(
+        self.assertIsNone(channel.json_body["og:title"])
-            channel.json_body, {"og:title": "Some Title", "og:description": "hi"}
+        self.assertTrue(channel.json_body["og:image"].startswith("mxc://"))
-        )
+        self.assertEqual(channel.json_body["og:image:height"], 1)
        self.assertEqual(channel.json_body["og:image:width"], 1)
        self.assertEqual(channel.json_body["og:image:type"], "image/png")
    def test_oembed_rich(self):
        """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
		`@ -0,0 +1 @@`
							`Improve oEmbed previews by processing the author name, photo, and video information.`