Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. (#15079)
Co-authored-by: David Robertson <davidr@element.io>pull/15114/head
							parent
							
								
									7ee7f49316
								
							
						
					
					
						commit
						1cbc3f197c
					
				|  | @ -0,0 +1 @@ | |||
| Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. | ||||
|  | @ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: | |||
|     We use this so that we can add prefix matching, which isn't something | ||||
|     that is supported by default. | ||||
|     """ | ||||
|     results = _parse_words(search_term) | ||||
|     escaped_words = [] | ||||
|     for word in _parse_words(search_term): | ||||
|         # Postgres tsvector and tsquery quoting rules: | ||||
|         # words potentially containing punctuation should be quoted | ||||
|         # and then existing quotes and backslashes should be doubled | ||||
|         # See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY | ||||
| 
 | ||||
|     both = " & ".join("(%s:* | %s)" % (result, result) for result in results) | ||||
|     exact = " & ".join("%s" % (result,) for result in results) | ||||
|     prefix = " & ".join("%s:*" % (result,) for result in results) | ||||
|         quoted_word = word.replace("'", "''").replace("\\", "\\\\") | ||||
|         escaped_words.append(f"'{quoted_word}'") | ||||
| 
 | ||||
|     both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words) | ||||
|     exact = " & ".join("%s" % (word,) for word in escaped_words) | ||||
|     prefix = " & ".join("%s:*" % (word,) for word in escaped_words) | ||||
| 
 | ||||
|     return both, exact, prefix | ||||
| 
 | ||||
|  | @ -944,6 +952,14 @@ def _parse_words(search_term: str) -> List[str]: | |||
|     if USE_ICU: | ||||
|         return _parse_words_with_icu(search_term) | ||||
| 
 | ||||
|     return _parse_words_with_regex(search_term) | ||||
| 
 | ||||
| 
 | ||||
| def _parse_words_with_regex(search_term: str) -> List[str]: | ||||
|     """ | ||||
|     Break down search term into words, when we don't have ICU available. | ||||
|     See: `_parse_words` | ||||
|     """ | ||||
|     return re.findall(r"([\w\-]+)", search_term, re.UNICODE) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -192,6 +192,13 @@ class UserDirectoryTestCase(unittest.HomeserverTestCase): | |||
|         self.helper.join(room, self.appservice.sender, tok=self.appservice.token) | ||||
|         self._check_only_one_user_in_directory(user, room) | ||||
| 
 | ||||
|     def test_search_term_with_colon_in_it_does_not_raise(self) -> None: | ||||
|         """ | ||||
|         Regression test: Test that search terms with colons in them are acceptable. | ||||
|         """ | ||||
|         u1 = self.register_user("user1", "pass") | ||||
|         self.get_success(self.handler.search_users(u1, "haha:paamayim-nekudotayim", 10)) | ||||
| 
 | ||||
|     def test_user_not_in_users_table(self) -> None: | ||||
|         """Unclear how it happens, but on matrix.org we've seen join events | ||||
|         for users who aren't in the users table. Test that we don't fall over | ||||
|  |  | |||
|  | @ -25,6 +25,11 @@ from synapse.rest.client import login, register, room | |||
| from synapse.server import HomeServer | ||||
| from synapse.storage import DataStore | ||||
| from synapse.storage.background_updates import _BackgroundUpdateHandler | ||||
| from synapse.storage.databases.main import user_directory | ||||
| from synapse.storage.databases.main.user_directory import ( | ||||
|     _parse_words_with_icu, | ||||
|     _parse_words_with_regex, | ||||
| ) | ||||
| from synapse.storage.roommember import ProfileInfo | ||||
| from synapse.util import Clock | ||||
| 
 | ||||
|  | @ -42,7 +47,7 @@ ALICE = "@alice:a" | |||
| BOB = "@bob:b" | ||||
| BOBBY = "@bobby:a" | ||||
| # The localpart isn't 'Bela' on purpose so we can test looking up display names. | ||||
| BELA = "@somenickname:a" | ||||
| BELA = "@somenickname:example.org" | ||||
| 
 | ||||
| 
 | ||||
| class GetUserDirectoryTables: | ||||
|  | @ -423,6 +428,8 @@ class UserDirectoryInitialPopulationTestcase(HomeserverTestCase): | |||
| 
 | ||||
| 
 | ||||
| class UserDirectoryStoreTestCase(HomeserverTestCase): | ||||
|     use_icu = False | ||||
| 
 | ||||
|     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: | ||||
|         self.store = hs.get_datastores().main | ||||
| 
 | ||||
|  | @ -434,6 +441,12 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): | |||
|         self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) | ||||
|         self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) | ||||
| 
 | ||||
|         self._restore_use_icu = user_directory.USE_ICU | ||||
|         user_directory.USE_ICU = self.use_icu | ||||
| 
 | ||||
|     def tearDown(self) -> None: | ||||
|         user_directory.USE_ICU = self._restore_use_icu | ||||
| 
 | ||||
|     def test_search_user_dir(self) -> None: | ||||
|         # normally when alice searches the directory she should just find | ||||
|         # bob because bobby doesn't share a room with her. | ||||
|  | @ -478,6 +491,26 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): | |||
|             {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, | ||||
|         ) | ||||
| 
 | ||||
|     @override_config({"user_directory": {"search_all_users": True}}) | ||||
|     def test_search_user_dir_start_of_user_id(self) -> None: | ||||
|         """Tests that a user can look up another user by searching for the start | ||||
|         of their user ID. | ||||
|         """ | ||||
|         r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10)) | ||||
|         self.assertFalse(r["limited"]) | ||||
|         self.assertEqual(1, len(r["results"])) | ||||
|         self.assertDictEqual( | ||||
|             r["results"][0], | ||||
|             {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, | ||||
|         ) | ||||
| 
 | ||||
| 
 | ||||
| class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): | ||||
|     use_icu = True | ||||
| 
 | ||||
|     if not icu: | ||||
|         skip = "Requires PyICU" | ||||
| 
 | ||||
| 
 | ||||
| class UserDirectoryICUTestCase(HomeserverTestCase): | ||||
|     if not icu: | ||||
|  | @ -513,3 +546,31 @@ class UserDirectoryICUTestCase(HomeserverTestCase): | |||
|             r["results"][0], | ||||
|             {"user_id": ALICE, "display_name": display_name, "avatar_url": None}, | ||||
|         ) | ||||
| 
 | ||||
|     def test_icu_word_boundary_punctuation(self) -> None: | ||||
|         """ | ||||
|         Tests the behaviour of punctuation with the ICU tokeniser. | ||||
| 
 | ||||
|         Seems to depend on underlying version of ICU. | ||||
|         """ | ||||
| 
 | ||||
|         # Note: either tokenisation is fine, because Postgres actually splits | ||||
|         # words itself afterwards. | ||||
|         self.assertIn( | ||||
|             _parse_words_with_icu("lazy'fox jumped:over the.dog"), | ||||
|             ( | ||||
|                 # ICU 66 on Ubuntu 20.04 | ||||
|                 ["lazy'fox", "jumped", "over", "the", "dog"], | ||||
|                 # ICU 70 on Ubuntu 22.04 | ||||
|                 ["lazy'fox", "jumped:over", "the.dog"], | ||||
|             ), | ||||
|         ) | ||||
| 
 | ||||
|     def test_regex_word_boundary_punctuation(self) -> None: | ||||
|         """ | ||||
|         Tests the behaviour of punctuation with the non-ICU tokeniser | ||||
|         """ | ||||
|         self.assertEqual( | ||||
|             _parse_words_with_regex("lazy'fox jumped:over the.dog"), | ||||
|             ["lazy", "fox", "jumped", "over", "the", "dog"], | ||||
|         ) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 reivilibre
						reivilibre