Fix preview of imgur and Tenor URLs. (#11669)
By scraping Open Graph information from the HTML even when an autodiscovery endpoint is found. The results are then combined to capture as much information as possible from the page.pull/11768/head
parent
9eab71aa93
commit
15ffc4143c
|
@ -0,0 +1 @@
|
|||
Fix preview of some gif URLs (like tenor.com). Contributed by Philippe Daouadi.
|
|
@ -35,7 +35,12 @@ When Synapse is asked to preview a URL it does the following:
|
|||
5. If the media is HTML:
|
||||
1. Decodes the HTML via the stored file.
|
||||
2. Generates an Open Graph response from the HTML.
|
||||
3. If an image exists in the Open Graph response:
|
||||
3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
|
||||
1. Downloads the URL and stores it into a file via the media storage provider
|
||||
and saves the local media metadata.
|
||||
2. Convert the oEmbed response to an Open Graph response.
|
||||
3. Override any Open Graph data from the HTML with data from oEmbed.
|
||||
4. If an image exists in the Open Graph response:
|
||||
1. Downloads the URL and stores it into a file via the media storage
|
||||
provider and saves the local media metadata.
|
||||
2. Generates thumbnails.
|
||||
|
|
|
@ -33,6 +33,8 @@ logger = logging.getLogger(__name__)
|
|||
class OEmbedResult:
|
||||
# The Open Graph result (converted from the oEmbed result).
|
||||
open_graph_result: JsonDict
|
||||
# The author_name of the oEmbed result
|
||||
author_name: Optional[str]
|
||||
# Number of milliseconds to cache the content, according to the oEmbed response.
|
||||
#
|
||||
# This will be None if no cache-age is provided in the oEmbed response (or
|
||||
|
@ -154,11 +156,12 @@ class OEmbedProvider:
|
|||
"og:url": url,
|
||||
}
|
||||
|
||||
# Use either title or author's name as the title.
|
||||
title = oembed.get("title") or oembed.get("author_name")
|
||||
title = oembed.get("title")
|
||||
if title:
|
||||
open_graph_response["og:title"] = title
|
||||
|
||||
author_name = oembed.get("author_name")
|
||||
|
||||
# Use the provider name and as the site.
|
||||
provider_name = oembed.get("provider_name")
|
||||
if provider_name:
|
||||
|
@ -193,9 +196,10 @@ class OEmbedProvider:
|
|||
# Trap any exception and let the code follow as usual.
|
||||
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
|
||||
open_graph_response = {}
|
||||
author_name = None
|
||||
cache_age = None
|
||||
|
||||
return OEmbedResult(open_graph_response, cache_age)
|
||||
return OEmbedResult(open_graph_response, author_name, cache_age)
|
||||
|
||||
|
||||
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
|
||||
|
|
|
@ -262,6 +262,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
|
||||
# The number of milliseconds that the response should be considered valid.
|
||||
expiration_ms = media_info.expires
|
||||
author_name: Optional[str] = None
|
||||
|
||||
if _is_media(media_info.media_type):
|
||||
file_id = media_info.filesystem_id
|
||||
|
@ -294,17 +295,25 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
# Check if this HTML document points to oEmbed information and
|
||||
# defer to that.
|
||||
oembed_url = self._oembed.autodiscover_from_html(tree)
|
||||
og = {}
|
||||
og_from_oembed: JsonDict = {}
|
||||
if oembed_url:
|
||||
oembed_info = await self._download_url(oembed_url, user)
|
||||
og, expiration_ms = await self._handle_oembed_response(
|
||||
(
|
||||
og_from_oembed,
|
||||
author_name,
|
||||
expiration_ms,
|
||||
) = await self._handle_oembed_response(
|
||||
url, oembed_info, expiration_ms
|
||||
)
|
||||
|
||||
# If there was no oEmbed URL (or oEmbed parsing failed), attempt
|
||||
# to generate the Open Graph information from the HTML.
|
||||
if not oembed_url or not og:
|
||||
og = parse_html_to_open_graph(tree, media_info.uri)
|
||||
# Parse Open Graph information from the HTML in case the oEmbed
|
||||
# response failed or is incomplete.
|
||||
og_from_html = parse_html_to_open_graph(tree, media_info.uri)
|
||||
|
||||
# Compile the Open Graph response by using the scraped
|
||||
# information from the HTML and overlaying any information
|
||||
# from the oEmbed response.
|
||||
og = {**og_from_html, **og_from_oembed}
|
||||
|
||||
await self._precache_image_url(user, media_info, og)
|
||||
else:
|
||||
|
@ -312,7 +321,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
|
||||
elif oembed_url:
|
||||
# Handle the oEmbed information.
|
||||
og, expiration_ms = await self._handle_oembed_response(
|
||||
og, author_name, expiration_ms = await self._handle_oembed_response(
|
||||
url, media_info, expiration_ms
|
||||
)
|
||||
await self._precache_image_url(user, media_info, og)
|
||||
|
@ -321,6 +330,11 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
logger.warning("Failed to find any OG data in %s", url)
|
||||
og = {}
|
||||
|
||||
# If we don't have a title but we have author_name, copy it as
|
||||
# title
|
||||
if not og.get("og:title") and author_name:
|
||||
og["og:title"] = author_name
|
||||
|
||||
# filter out any stupidly long values
|
||||
keys_to_remove = []
|
||||
for k, v in og.items():
|
||||
|
@ -484,7 +498,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
|
||||
async def _handle_oembed_response(
|
||||
self, url: str, media_info: MediaInfo, expiration_ms: int
|
||||
) -> Tuple[JsonDict, int]:
|
||||
) -> Tuple[JsonDict, Optional[str], int]:
|
||||
"""
|
||||
Parse the downloaded oEmbed info.
|
||||
|
||||
|
@ -497,11 +511,12 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
Returns:
|
||||
A tuple of:
|
||||
The Open Graph dictionary, if the oEmbed info can be parsed.
|
||||
The author name if it could be retrieved from oEmbed.
|
||||
The (possibly updated) length of time, in milliseconds, the media is valid for.
|
||||
"""
|
||||
# If JSON was not returned, there's nothing to do.
|
||||
if not _is_json(media_info.media_type):
|
||||
return {}, expiration_ms
|
||||
return {}, None, expiration_ms
|
||||
|
||||
with open(media_info.filename, "rb") as file:
|
||||
body = file.read()
|
||||
|
@ -513,7 +528,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
|||
if open_graph_result and oembed_response.cache_age is not None:
|
||||
expiration_ms = oembed_response.cache_age
|
||||
|
||||
return open_graph_result, expiration_ms
|
||||
return open_graph_result, oembed_response.author_name, expiration_ms
|
||||
|
||||
def _start_expire_url_cache_data(self) -> Deferred:
|
||||
return run_as_background_process(
|
||||
|
|
Loading…
Reference in New Issue