From 6748f0a57962fb9657cab60083d94b4c97a0526c Mon Sep 17 00:00:00 2001
From: David Baker <dave@matrix.org>
Date: Thu, 5 Oct 2017 11:33:30 +0100
Subject: [PATCH 1/3] Fix notif kws that start/end with non-word chars

Only prepend / append word boundary characters if the search
expression starts or ends with a word character; otherwise they
don't work, because there's no word boundary between whitespace and
a non-word char.
---
 synapse/push/push_rule_evaluator.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index 172c27c137..5a34d60abb 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -26,6 +26,8 @@ logger = logging.getLogger(__name__)
 GLOB_REGEX = re.compile(r'\\\[(\\\!|)(.*)\\\]')
 IS_GLOB = re.compile(r'[\?\*\[\]]')
 INEQUALITY_EXPR = re.compile("^([=<>]*)([0-9]*)$")
+STARTS_WITH_WORD_CHAR_REGEX = re.compile(r"^\w")
+ENDS_WITH_WORD_CHAR_REGEX = re.compile(r"\w$")
 
 
 def _room_member_count(ev, condition, room_member_count):
@@ -183,7 +185,7 @@ def _glob_to_re(glob, word_boundary):
             r,
         )
         if word_boundary:
-            r = r"\b%s\b" % (r,)
+            r = _re_word_boundary(r)
 
             return re.compile(r, flags=re.IGNORECASE)
         else:
@@ -192,13 +194,30 @@ def _glob_to_re(glob, word_boundary):
             return re.compile(r, flags=re.IGNORECASE)
     elif word_boundary:
         r = re.escape(glob)
-        r = r"\b%s\b" % (r,)
+        r = _re_word_boundary(r)
 
         return re.compile(r, flags=re.IGNORECASE)
     else:
         r = "^" + re.escape(glob) + "$"
         return re.compile(r, flags=re.IGNORECASE)
 
+def _re_word_boundary(r):
+    """
+    Adds word boundary characters to the start and end of an
+    expression to require that the match occur as a whole word,
+    but do so respecting the fact that strings starting or ending
+    with non-word characters will change word boundaries.
+    """
+    # Matching a regex string aginst a regex, since by definition
+    # \b is the boundary between a \w and a \W, so match \w at the
+    # start or end of the expression (although this will miss, eg.
+    # "[dl]og")
+    if STARTS_WITH_WORD_CHAR_REGEX.search(r):
+        r = r"\b%s" % (r,)
+    if ENDS_WITH_WORD_CHAR_REGEX.search(r):
+        r = r"%s\b" % (r,)
+    return r
+
 
 def _flatten_dict(d, prefix=[], result=None):
     if result is None:

From cbe3c3fdd49b87a452a9a9a229abfdf8dbe45922 Mon Sep 17 00:00:00 2001
From: David Baker <dave@matrix.org>
Date: Thu, 5 Oct 2017 11:43:10 +0100
Subject: [PATCH 2/3] pep8

---
 synapse/push/push_rule_evaluator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index 5a34d60abb..b78f2d90d7 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -201,6 +201,7 @@ def _glob_to_re(glob, word_boundary):
         r = "^" + re.escape(glob) + "$"
         return re.compile(r, flags=re.IGNORECASE)
 
+
 def _re_word_boundary(r):
     """
     Adds word boundary characters to the start and end of an

From 0c8da8b519fbd8bca984117e354fe57c3a76e154 Mon Sep 17 00:00:00 2001
From: David Baker <dave@matrix.org>
Date: Thu, 5 Oct 2017 11:57:43 +0100
Subject: [PATCH 3/3] Use better method for word boundary searching

Approach taken from https://github.com/matrix-org/matrix-js-sdk/commit/ebc95667b8a5777d13e5d3c679972bedae022fd5
---
 synapse/push/push_rule_evaluator.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index b78f2d90d7..65f9a63fd8 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -26,8 +26,6 @@ logger = logging.getLogger(__name__)
 GLOB_REGEX = re.compile(r'\\\[(\\\!|)(.*)\\\]')
 IS_GLOB = re.compile(r'[\?\*\[\]]')
 INEQUALITY_EXPR = re.compile("^([=<>]*)([0-9]*)$")
-STARTS_WITH_WORD_CHAR_REGEX = re.compile(r"^\w")
-ENDS_WITH_WORD_CHAR_REGEX = re.compile(r"\w$")
 
 
 def _room_member_count(ev, condition, room_member_count):
@@ -209,15 +207,9 @@ def _re_word_boundary(r):
     but do so respecting the fact that strings starting or ending
     with non-word characters will change word boundaries.
     """
-    # Matching a regex string aginst a regex, since by definition
-    # \b is the boundary between a \w and a \W, so match \w at the
-    # start or end of the expression (although this will miss, eg.
-    # "[dl]og")
-    if STARTS_WITH_WORD_CHAR_REGEX.search(r):
-        r = r"\b%s" % (r,)
-    if ENDS_WITH_WORD_CHAR_REGEX.search(r):
-        r = r"%s\b" % (r,)
-    return r
+    # We can't use \b as it chokes on unicode; however, \W seems to be okay
+    # as shorthand for [^0-9A-Za-z_].
+    return r"(^|\W)%s(\W|$)" % (r,)
 
 
 def _flatten_dict(d, prefix=[], result=None):