Fix preview of imgur and Tenor URLs. (#11669)
By scraping Open Graph information from the HTML even when an autodiscovery endpoint is found. The results are then combined to capture as much information as possible from the page.
pull/11768/head
parent
9eab71aa93
commit
15ffc4143c
|
@ -0,0 +1 @@
|
||||||
|
Fix preview of some gif URLs (like tenor.com). Contributed by Philippe Daouadi.
|
|
@ -35,7 +35,12 @@ When Synapse is asked to preview a URL it does the following:
|
||||||
5. If the media is HTML:
|
5. If the media is HTML:
|
||||||
1. Decodes the HTML via the stored file.
|
1. Decodes the HTML via the stored file.
|
||||||
2. Generates an Open Graph response from the HTML.
|
2. Generates an Open Graph response from the HTML.
|
||||||
3. If an image exists in the Open Graph response:
|
3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
|
||||||
|
1. Downloads the URL and stores it into a file via the media storage provider
|
||||||
|
and saves the local media metadata.
|
||||||
|
2. Convert the oEmbed response to an Open Graph response.
|
||||||
|
3. Override any Open Graph data from the HTML with data from oEmbed.
|
||||||
|
4. If an image exists in the Open Graph response:
|
||||||
1. Downloads the URL and stores it into a file via the media storage
|
1. Downloads the URL and stores it into a file via the media storage
|
||||||
provider and saves the local media metadata.
|
provider and saves the local media metadata.
|
||||||
2. Generates thumbnails.
|
2. Generates thumbnails.
|
||||||
|
|
|
@ -33,6 +33,8 @@ logger = logging.getLogger(__name__)
|
||||||
class OEmbedResult:
|
class OEmbedResult:
|
||||||
# The Open Graph result (converted from the oEmbed result).
|
# The Open Graph result (converted from the oEmbed result).
|
||||||
open_graph_result: JsonDict
|
open_graph_result: JsonDict
|
||||||
|
# The author_name of the oEmbed result
|
||||||
|
author_name: Optional[str]
|
||||||
# Number of milliseconds to cache the content, according to the oEmbed response.
|
# Number of milliseconds to cache the content, according to the oEmbed response.
|
||||||
#
|
#
|
||||||
# This will be None if no cache-age is provided in the oEmbed response (or
|
# This will be None if no cache-age is provided in the oEmbed response (or
|
||||||
|
@ -154,11 +156,12 @@ class OEmbedProvider:
|
||||||
"og:url": url,
|
"og:url": url,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Use either title or author's name as the title.
|
title = oembed.get("title")
|
||||||
title = oembed.get("title") or oembed.get("author_name")
|
|
||||||
if title:
|
if title:
|
||||||
open_graph_response["og:title"] = title
|
open_graph_response["og:title"] = title
|
||||||
|
|
||||||
|
author_name = oembed.get("author_name")
|
||||||
|
|
||||||
# Use the provider name and as the site.
|
# Use the provider name and as the site.
|
||||||
provider_name = oembed.get("provider_name")
|
provider_name = oembed.get("provider_name")
|
||||||
if provider_name:
|
if provider_name:
|
||||||
|
@ -193,9 +196,10 @@ class OEmbedProvider:
|
||||||
# Trap any exception and let the code follow as usual.
|
# Trap any exception and let the code follow as usual.
|
||||||
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
|
logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
|
||||||
open_graph_response = {}
|
open_graph_response = {}
|
||||||
|
author_name = None
|
||||||
cache_age = None
|
cache_age = None
|
||||||
|
|
||||||
return OEmbedResult(open_graph_response, cache_age)
|
return OEmbedResult(open_graph_response, author_name, cache_age)
|
||||||
|
|
||||||
|
|
||||||
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
|
def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
|
||||||
|
|
|
@ -262,6 +262,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
|
|
||||||
# The number of milliseconds that the response should be considered valid.
|
# The number of milliseconds that the response should be considered valid.
|
||||||
expiration_ms = media_info.expires
|
expiration_ms = media_info.expires
|
||||||
|
author_name: Optional[str] = None
|
||||||
|
|
||||||
if _is_media(media_info.media_type):
|
if _is_media(media_info.media_type):
|
||||||
file_id = media_info.filesystem_id
|
file_id = media_info.filesystem_id
|
||||||
|
@ -294,17 +295,25 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
# Check if this HTML document points to oEmbed information and
|
# Check if this HTML document points to oEmbed information and
|
||||||
# defer to that.
|
# defer to that.
|
||||||
oembed_url = self._oembed.autodiscover_from_html(tree)
|
oembed_url = self._oembed.autodiscover_from_html(tree)
|
||||||
og = {}
|
og_from_oembed: JsonDict = {}
|
||||||
if oembed_url:
|
if oembed_url:
|
||||||
oembed_info = await self._download_url(oembed_url, user)
|
oembed_info = await self._download_url(oembed_url, user)
|
||||||
og, expiration_ms = await self._handle_oembed_response(
|
(
|
||||||
|
og_from_oembed,
|
||||||
|
author_name,
|
||||||
|
expiration_ms,
|
||||||
|
) = await self._handle_oembed_response(
|
||||||
url, oembed_info, expiration_ms
|
url, oembed_info, expiration_ms
|
||||||
)
|
)
|
||||||
|
|
||||||
# If there was no oEmbed URL (or oEmbed parsing failed), attempt
|
# Parse Open Graph information from the HTML in case the oEmbed
|
||||||
# to generate the Open Graph information from the HTML.
|
# response failed or is incomplete.
|
||||||
if not oembed_url or not og:
|
og_from_html = parse_html_to_open_graph(tree, media_info.uri)
|
||||||
og = parse_html_to_open_graph(tree, media_info.uri)
|
|
||||||
|
# Compile the Open Graph response by using the scraped
|
||||||
|
# information from the HTML and overlaying any information
|
||||||
|
# from the oEmbed response.
|
||||||
|
og = {**og_from_html, **og_from_oembed}
|
||||||
|
|
||||||
await self._precache_image_url(user, media_info, og)
|
await self._precache_image_url(user, media_info, og)
|
||||||
else:
|
else:
|
||||||
|
@ -312,7 +321,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
|
|
||||||
elif oembed_url:
|
elif oembed_url:
|
||||||
# Handle the oEmbed information.
|
# Handle the oEmbed information.
|
||||||
og, expiration_ms = await self._handle_oembed_response(
|
og, author_name, expiration_ms = await self._handle_oembed_response(
|
||||||
url, media_info, expiration_ms
|
url, media_info, expiration_ms
|
||||||
)
|
)
|
||||||
await self._precache_image_url(user, media_info, og)
|
await self._precache_image_url(user, media_info, og)
|
||||||
|
@ -321,6 +330,11 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
logger.warning("Failed to find any OG data in %s", url)
|
logger.warning("Failed to find any OG data in %s", url)
|
||||||
og = {}
|
og = {}
|
||||||
|
|
||||||
|
# If we don't have a title but we have author_name, copy it as
|
||||||
|
# title
|
||||||
|
if not og.get("og:title") and author_name:
|
||||||
|
og["og:title"] = author_name
|
||||||
|
|
||||||
# filter out any stupidly long values
|
# filter out any stupidly long values
|
||||||
keys_to_remove = []
|
keys_to_remove = []
|
||||||
for k, v in og.items():
|
for k, v in og.items():
|
||||||
|
@ -484,7 +498,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
|
|
||||||
async def _handle_oembed_response(
|
async def _handle_oembed_response(
|
||||||
self, url: str, media_info: MediaInfo, expiration_ms: int
|
self, url: str, media_info: MediaInfo, expiration_ms: int
|
||||||
) -> Tuple[JsonDict, int]:
|
) -> Tuple[JsonDict, Optional[str], int]:
|
||||||
"""
|
"""
|
||||||
Parse the downloaded oEmbed info.
|
Parse the downloaded oEmbed info.
|
||||||
|
|
||||||
|
@ -497,11 +511,12 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
Returns:
|
Returns:
|
||||||
A tuple of:
|
A tuple of:
|
||||||
The Open Graph dictionary, if the oEmbed info can be parsed.
|
The Open Graph dictionary, if the oEmbed info can be parsed.
|
||||||
|
The author name if it could be retrieved from oEmbed.
|
||||||
The (possibly updated) length of time, in milliseconds, the media is valid for.
|
The (possibly updated) length of time, in milliseconds, the media is valid for.
|
||||||
"""
|
"""
|
||||||
# If JSON was not returned, there's nothing to do.
|
# If JSON was not returned, there's nothing to do.
|
||||||
if not _is_json(media_info.media_type):
|
if not _is_json(media_info.media_type):
|
||||||
return {}, expiration_ms
|
return {}, None, expiration_ms
|
||||||
|
|
||||||
with open(media_info.filename, "rb") as file:
|
with open(media_info.filename, "rb") as file:
|
||||||
body = file.read()
|
body = file.read()
|
||||||
|
@ -513,7 +528,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
if open_graph_result and oembed_response.cache_age is not None:
|
if open_graph_result and oembed_response.cache_age is not None:
|
||||||
expiration_ms = oembed_response.cache_age
|
expiration_ms = oembed_response.cache_age
|
||||||
|
|
||||||
return open_graph_result, expiration_ms
|
return open_graph_result, oembed_response.author_name, expiration_ms
|
||||||
|
|
||||||
def _start_expire_url_cache_data(self) -> Deferred:
|
def _start_expire_url_cache_data(self) -> Deferred:
|
||||||
return run_as_background_process(
|
return run_as_background_process(
|
||||||
|
|
Loading…
Reference in New Issue