Improve URL previews for some pages (#12951)
* Skip `og` and `meta` tags where the value is empty. * Fallback to the favicon if there are no other images. * Ignore tags meant for navigation.pull/12957/head
							parent
							
								
									888a29f412
								
							
						
					
					
						commit
						01df5bacac
					
				|  | @ -0,0 +1 @@ | |||
| Improve URL previews for pages with empty elements. | ||||
|  | @ -30,6 +30,9 @@ _xml_encoding_match = re.compile( | |||
| ) | ||||
| _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) | ||||
| 
 | ||||
| # Certain elements aren't meant for display. | ||||
| ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"} | ||||
| 
 | ||||
| 
 | ||||
| def _normalise_encoding(encoding: str) -> Optional[str]: | ||||
|     """Use the Python codec's name as the normalised entry.""" | ||||
|  | @ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: | |||
|     # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", | ||||
| 
 | ||||
|     og: Dict[str, Optional[str]] = {} | ||||
|     for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): | ||||
|         if "content" in tag.attrib: | ||||
|             # if we've got more than 50 tags, someone is taking the piss | ||||
|             if len(og) >= 50: | ||||
|                 logger.warning("Skipping OG for page with too many 'og:' tags") | ||||
|                 return {} | ||||
|             og[tag.attrib["property"]] = tag.attrib["content"] | ||||
|     for tag in tree.xpath( | ||||
|         "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]" | ||||
|     ): | ||||
|         # if we've got more than 50 tags, someone is taking the piss | ||||
|         if len(og) >= 50: | ||||
|             logger.warning("Skipping OG for page with too many 'og:' tags") | ||||
|             return {} | ||||
| 
 | ||||
|         og[tag.attrib["property"]] = tag.attrib["content"] | ||||
| 
 | ||||
|     # TODO: grab article: meta tags too, e.g.: | ||||
| 
 | ||||
|  | @ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: | |||
|     # "article:modified_time" content="2016-04-01T18:31:53+00:00" /> | ||||
| 
 | ||||
|     if "og:title" not in og: | ||||
|         # do some basic spidering of the HTML | ||||
|         title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") | ||||
|         if title and title[0].text is not None: | ||||
|             og["og:title"] = title[0].text.strip() | ||||
|         # Attempt to find a title from the title tag, or the biggest header on the page. | ||||
|         title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()") | ||||
|         if title: | ||||
|             og["og:title"] = title[0].strip() | ||||
|         else: | ||||
|             og["og:title"] = None | ||||
| 
 | ||||
|     if "og:image" not in og: | ||||
|         # TODO: extract a favicon failing all else | ||||
|         meta_image = tree.xpath( | ||||
|             "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content" | ||||
|             "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]" | ||||
|         ) | ||||
|         # If a meta image is found, use it. | ||||
|         if meta_image: | ||||
|             og["og:image"] = meta_image[0] | ||||
|         else: | ||||
|             # Try to find images which are larger than 10px by 10px. | ||||
|             # | ||||
|             # TODO: consider inlined CSS styles as well as width & height attribs | ||||
|             images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") | ||||
|             images = sorted( | ||||
|  | @ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]: | |||
|                     -1 * float(i.attrib["width"]) * float(i.attrib["height"]) | ||||
|                 ), | ||||
|             ) | ||||
|             # If no images were found, try to find *any* images. | ||||
|             if not images: | ||||
|                 images = tree.xpath("//img[@src]") | ||||
|                 images = tree.xpath("//img[@src][1]") | ||||
|             if images: | ||||
|                 og["og:image"] = images[0].attrib["src"] | ||||
| 
 | ||||
|             # Finally, fallback to the favicon if nothing else. | ||||
|             else: | ||||
|                 favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]") | ||||
|                 if favicons: | ||||
|                     og["og:image"] = favicons[0] | ||||
| 
 | ||||
|     if "og:description" not in og: | ||||
|         # Check the first meta description tag for content. | ||||
|         meta_description = tree.xpath( | ||||
|             "//*/meta" | ||||
|             "[translate(@name, 'DESCRIPTION', 'description')='description']" | ||||
|             "/@content" | ||||
|             "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]" | ||||
|         ) | ||||
|         # If a meta description is found with content, use it. | ||||
|         if meta_description: | ||||
|             og["og:description"] = meta_description[0] | ||||
|         else: | ||||
|  | @ -306,6 +320,10 @@ def _iterate_over_text( | |||
|         if isinstance(el, str): | ||||
|             yield el | ||||
|         elif el.tag not in tags_to_ignore: | ||||
|             # If the element isn't meant for display, ignore it. | ||||
|             if el.get("role") in ARIA_ROLES_TO_IGNORE: | ||||
|                 continue | ||||
| 
 | ||||
|             # el.text is the text before the first child, so we can immediately | ||||
|             # return it if the text exists. | ||||
|             if el.text: | ||||
|  |  | |||
|  | @ -145,7 +145,7 @@ class SummarizeTestCase(unittest.TestCase): | |||
|         ) | ||||
| 
 | ||||
| 
 | ||||
| class CalcOgTestCase(unittest.TestCase): | ||||
| class OpenGraphFromHtmlTestCase(unittest.TestCase): | ||||
|     if not lxml: | ||||
|         skip = "url preview feature requires lxml" | ||||
| 
 | ||||
|  | @ -235,6 +235,21 @@ class CalcOgTestCase(unittest.TestCase): | |||
| 
 | ||||
|         self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) | ||||
| 
 | ||||
|         # Another variant is a title with no content. | ||||
|         html = b""" | ||||
|         <html> | ||||
|         <head><title></title></head> | ||||
|         <body> | ||||
|         <h1>Title</h1> | ||||
|         </body> | ||||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = parse_html_to_open_graph(tree) | ||||
| 
 | ||||
|         self.assertEqual(og, {"og:title": "Title", "og:description": "Title"}) | ||||
| 
 | ||||
|     def test_h1_as_title(self) -> None: | ||||
|         html = b""" | ||||
|         <html> | ||||
|  | @ -250,6 +265,26 @@ class CalcOgTestCase(unittest.TestCase): | |||
| 
 | ||||
|         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) | ||||
| 
 | ||||
|     def test_empty_description(self) -> None: | ||||
|         """Description tags with empty content should be ignored.""" | ||||
|         html = b""" | ||||
|         <html> | ||||
|         <meta property="og:description" content=""/> | ||||
|         <meta property="og:description"/> | ||||
|         <meta name="description" content=""/> | ||||
|         <meta name="description"/> | ||||
|         <meta name="description" content="Finally!"/> | ||||
|         <body> | ||||
|         <h1>Title</h1> | ||||
|         </body> | ||||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = parse_html_to_open_graph(tree) | ||||
| 
 | ||||
|         self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"}) | ||||
| 
 | ||||
|     def test_missing_title_and_broken_h1(self) -> None: | ||||
|         html = b""" | ||||
|         <html> | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Patrick Cloke
						Patrick Cloke