Merge pull request #973 from matrix-org/erikj/xpath_fix

Change the way we summarize URLs
pull/984/head
Erik Johnston 2016-08-04 16:37:25 +01:00 committed by GitHub
commit a5d7968b3e
2 changed files with 211 additions and 14 deletions

View File

@ -29,6 +29,8 @@ from synapse.http.server import (
from synapse.util.async import ObservableDeferred
from synapse.util.stringutils import is_ascii
from copy import deepcopy
import os
import re
import fnmatch
@ -329,20 +331,23 @@ class PreviewUrlResource(Resource):
# ...or if they are within a <script/> or <style/> tag.
# This is a very very very coarse approximation to a plain text
# render of the page.
text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
"ancestor::aside | ancestor::footer | "
"ancestor::script | ancestor::style)]" +
"[ancestor::body]")
text = ''
for text_node in text_nodes:
if len(text) < 500:
text += text_node + ' '
else:
break
text = re.sub(r'[\t ]+', ' ', text)
text = re.sub(r'[\t \r\n]*[\r\n]+', '\n', text)
text = text.strip()[:500]
og['og:description'] = text if text else None
# We don't just use XPATH here as that is slow on some machines.
# We clone `tree` as we modify it.
cloned_tree = deepcopy(tree.find("body"))
TAGS_TO_REMOVE = ("header", "nav", "aside", "footer", "script", "style",)
for el in cloned_tree.iter(TAGS_TO_REMOVE):
el.getparent().remove(el)
# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r'\s+', '\n', el.text).strip()
for el in cloned_tree.iter() if el.text
)
og['og:description'] = summarize_paragraphs(text_nodes)
# TODO: delete the url downloads to stop diskfilling,
# as we only ever cared about its OG
@ -450,3 +455,56 @@ class PreviewUrlResource(Resource):
content_type.startswith("application/xhtml")
):
return True
def summarize_paragraphs(text_nodes, min_size=200, max_size=500):
    """Build a short plain-text summary from a sequence of paragraphs.

    Paragraphs are appended, separated by a blank line, until the summary is
    at least ``min_size`` characters long.  If that takes the summary over
    ``max_size`` characters it is cut back on a word boundary and an
    ellipsis is appended to show that text was dropped.

    Args:
        text_nodes: Iterable of paragraph strings.  It is consumed lazily and
            only as far as needed, so a generator is fine.
        min_size: Keep adding paragraphs while the summary is shorter than
            this many characters.
        max_size: Maximum number of characters kept before the ellipsis.

    Returns:
        The summary string, or None if there was no text to summarize.
    """
    # TODO: Respect sentences?
    description = ''

    # Keep adding paragraphs until we get to min_size.
    for text_node in text_nodes:
        if len(description) >= min_size:
            break
        # Collapse all runs of whitespace so each paragraph is a single line.
        text_node = re.sub(r'[\t \r\n]+', ' ', text_node)
        description += text_node + '\n\n'

    description = description.strip()
    description = re.sub(r'[\t ]+', ' ', description)
    # Normalise every paragraph break to exactly one blank line.
    description = re.sub(r'[\t \r\n]*[\r\n]+', '\n\n', description)

    # If the concatenation of paragraphs to get above min_size took us over
    # max_size, then we need to truncate mid paragraph.
    if len(description) > max_size:
        new_desc = ""

        # This splits the text into words, but keeps the (preceding)
        # whitespace attached so we can simply concatenate the words back
        # together.
        for match in re.finditer(r"\s*\S+", description):
            word = match.group()

            # Keep adding words while the total length is less than
            # max_size.
            if len(word) + len(new_desc) < max_size:
                new_desc += word
            else:
                # At this point the next word *will* take us over max_size,
                # but we also want to ensure that it's not a huge word. If
                # it is, add it anyway and we'll truncate below.
                if len(new_desc) < min_size:
                    new_desc += word
                break

        # Double check that we're not over the limit.
        if len(new_desc) > max_size:
            new_desc = new_desc[:max_size]

        # We always add an ellipsis (U+2026) because at the very least we
        # chopped mid paragraph.
        description = new_desc.strip() + u"\u2026"

    return description if description else None

139
tests/test_preview.py Normal file
View File

@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
# Copyright 2014-2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import unittest
from synapse.rest.media.v1.preview_url_resource import summarize_paragraphs
class PreviewTestCase(unittest.TestCase):
    """Tests for summarize_paragraphs, the URL preview description builder."""

    def test_long_summarize(self):
        """Long paragraphs: one is enough, and an over-long one is truncated.

        The first paragraph alone already satisfies min_size and fits within
        max_size, so it is returned whole.  The second paragraph is longer
        than max_size, so it is cut on a word boundary and ends with an
        ellipsis.
        """
        # Line breaks inside these paragraphs are expected to be collapsed
        # into single spaces by summarize_paragraphs (see expected values).
        example_paras = [
            """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
Troms county, Norway. The administrative centre of the municipality is
the city of Tromsø. Outside of Norway, Tromso and Tromsö are
alternative spellings of the city.Tromsø is considered the northernmost
city in the world with a population above 50,000. The most populous town
north of it is Alta, Norway, with a population of 14,272 (2013).""",

            """Tromsø lies in Northern Norway. The municipality has a population of
(2015) 72,066, but with an annual influx of students it has over 75,000
most of the year. It is the largest urban area in Northern Norway and the
third largest north of the Arctic Circle (following Murmansk and Norilsk).
Most of Tromsø, including the city centre, is located on the island of
Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,
Tromsøya had a population of 36,088. Substantial parts of the urban area
are also situated on the mainland to the east, and on parts of Kvaløyaa
large island to the west. Tromsøya is connected to the mainland by the Tromsø
Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the
Sandnessund Bridge. Tromsø Airport connects the city to many destinations
in Europe. The city is warmer than most other places located on the same
latitude, due to the warming effect of the Gulf Stream.""",

            """The city centre of Tromsø contains the highest number of old wooden
houses in Northern Norway, the oldest house dating from 1789. The Arctic
Cathedral, a modern church from 1965, is probably the most famous landmark
in Tromsø. The city is a cultural centre for its region, with several
festivals taking place in the summer. Some of Norway's best-known
musicians, Torbjørn Brundtland and Svein Berge of the electronica duo
Röyksopp and Lene Marlin grew up and started their careers in Tromsø.
Noted electronic musician Geir Jenssen also hails from Tromsø.""",
        ]

        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)

        self.assertEqual(
            desc,
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway. The administrative centre of the municipality is"
            " the city of Tromsø. Outside of Norway, Tromso and Tromsö are"
            " alternative spellings of the city.Tromsø is considered the northernmost"
            " city in the world with a population above 50,000. The most populous town"
            " north of it is Alta, Norway, with a population of 14,272 (2013)."
        )

        # Skip the first paragraph so the summary starts with the over-long
        # second one and has to be truncated.
        desc = summarize_paragraphs(example_paras[1:], min_size=200, max_size=500)

        self.assertEqual(
            desc,
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year. It is the largest urban area in Northern Norway and the"
            " third largest north of the Arctic Circle (following Murmansk and Norilsk)."
            " Most of Tromsø, including the city centre, is located on the island of"
            " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,"
            " Tromsøya had a population of 36,088. Substantial parts of the…"
        )

    def test_short_summarize(self):
        """Short paragraphs are joined by a blank line until min_size is met.

        The first paragraph is shorter than min_size, so the second one is
        appended; the third is then no longer needed and is left out.
        """
        example_paras = [
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.",

            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year.",

            "The city centre of Tromsø contains the highest number of old wooden"
            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
            " Cathedral, a modern church from 1965, is probably the most famous landmark"
            " in Tromsø.",
        ]

        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)

        self.assertEqual(
            desc,
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.\n"
            "\n"
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year."
        )

    def test_small_then_large_summarize(self):
        """A short paragraph followed by a long one is truncated mid paragraph.

        Both paragraphs are needed to reach min_size, but together they
        exceed max_size, so the result is cut on a word boundary inside the
        second paragraph and ends with an ellipsis.
        """
        example_paras = [
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.",

            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year."
            " The city centre of Tromsø contains the highest number of old wooden"
            " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
            " Cathedral, a modern church from 1965, is probably the most famous landmark"
            " in Tromsø.",
        ]

        desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)

        self.assertEqual(
            desc,
            "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
            " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
            " Troms county, Norway.\n"
            "\n"
            "Tromsø lies in Northern Norway. The municipality has a population of"
            " (2015) 72,066, but with an annual influx of students it has over 75,000"
            " most of the year. The city centre of Tromsø contains the highest number"
            " of old wooden houses in Northern Norway, the oldest house dating from"
            " 1789. The Arctic Cathedral, a modern church…"
        )