The escape function no longer escapes \x00. It's not necessary.

Inline flags can now be turned off and apply to what follows. Added \R to match line endings.
2025-10-05 20:02:39 +02:00 · 2023-12-24 00:13:19 +00:00
parent 34333d5009
commit cdcbf36aeb
7 changed files with 54 additions and 58 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -45,7 +45,7 @@ jobs:

    env:
      # macOS archs
-      CIBW_ARCHS_MACOS: "x86_64 arm64"
+      CIBW_ARCHS_MACOS: "x86_64 arm64 universal2"

    steps:
      - uses: actions/checkout@v3
@@ -64,6 +64,14 @@ jobs:
          name: regex-files
          path: wheelhouse/*.whl

+      - name: Create GitHub release
+        uses: actions/create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
+        with:
+          tag_name: ${{ github.ref }}
+          release_name: Release ${{ github.ref }}
+
  # Build source distribution & manylinux1_x86_64 wheels
  # These two jobs build:
  #   1, build_wheels (above): manylinux1_i686 / manylinux2014_x86_64
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,3 +1,11 @@
+Version: 2023.12.23
+
+    The escape function no longer escapes \x00. It's not necessary.
+
+    Inline flags can now be turned off and apply to what follows.
+
+    Added \R to match line endings.
+
 Version: 2023.10.3

    Updated to Unicode 15.1.0.
--- a/regex_3/_regex_core.py
+++ b/regex_3/_regex_core.py
@@ -1150,22 +1150,7 @@ def parse_flags_subpattern(source, info):

 def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags."
-    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
-    if version == VERSION0:
-        # Positional flags are global and can only be turned on.
-        if flags_off:
-            raise error("bad inline flags: cannot turn flags off",
-              source.string, source.pos)
-
-        new_global_flags = flags_on & ~info.global_flags
-        if new_global_flags:
-            info.global_flags |= new_global_flags
-
-            # A global has been turned on, so reparse the pattern.
-            raise _UnscopedFlagSet(info.global_flags)
-    else:
-        info.flags = (info.flags | flags_on) & ~flags_off
-
+    info.flags = (info.flags | flags_on) & ~flags_off
    source.ignore_space = bool(info.flags & VERBOSE)

 def parse_name(source, allow_numeric=False, allow_group_0=False):
@@ -1233,6 +1218,14 @@ def parse_escape(source, info, in_set):
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
+    elif ch == "R" and not in_set:
+        # A line ending.
+        charset = [0x0A, 0x0B, 0x0C, 0x0D]
+        if info.guess_encoding == UNICODE:
+            charset.extend([0x85, 0x2028, 0x2029])
+
+        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
+          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
--- a/regex_3/regex.py
+++ b/regex_3/regex.py
@@ -241,7 +241,7 @@ __all__ = ["cache_all", "compile", "DEFAULT_VERSION", "escape", "findall",
  "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex", "__version__",
  "__doc__", "RegexFlag"]

-__version__ = "2.5.135"
+__version__ = "2.5.136"

 # --------------------------------------------------------------------
 # Public interface.
@@ -392,8 +392,6 @@ def escape(pattern, special_only=True, literal_spaces=False):
            elif c in _METACHARS or c.isspace():
                s.append("\\")
                s.append(c)
-            elif c == "\x00":
-                s.append("\\000")
            else:
                s.append(c)
    else:
@@ -402,8 +400,6 @@ def escape(pattern, special_only=True, literal_spaces=False):
                s.append(c)
            elif c in _ALNUM:
                s.append(c)
-            elif c == "\x00":
-                s.append("\\000")
            else:
                s.append("\\")
                s.append(c)
--- a/regex_3/test_regex.py
+++ b/regex_3/test_regex.py
@@ -911,10 +911,9 @@ class RegexTests(unittest.TestCase):
        p = regex.compile('(?iu)' + lower_char)
        self.assertEqual(bool(p.match(upper_char)), True)

+        # Changed to positional flags in regex 2023.12.23.
        self.assertEqual(bool(regex.match(r"(?i)a", "A")), True)
-        self.assertEqual(bool(regex.match(r"a(?i)", "A")), True)
-        self.assertEqual(bool(regex.match(r"(?iV1)a", "A")), True)
-        self.assertEqual(regex.match(r"a(?iV1)", "A"), None)
+        self.assertEqual(regex.match(r"a(?i)", "A"), None)

    def test_dollar_matches_twice(self):
        # $ matches the end of string, and just before the terminating \n.
@@ -1396,18 +1395,15 @@ class RegexTests(unittest.TestCase):
        # Issues 433028, 433024, 433027.
        self.assertEqual(regex.search(r"(?i)Ab", "ab").span(), (0, 2))
        self.assertEqual(regex.search(r"(?i:A)b", "ab").span(), (0, 2))
-        self.assertEqual(regex.search(r"A(?i)b", "ab").span(), (0, 2))
-        self.assertEqual(regex.search(r"A(?iV1)b", "ab"), None)
-
-        self.assertRaisesRegex(regex.error, self.CANT_TURN_OFF, lambda:
-          regex.search(r"(?V0-i)Ab", "ab", flags=regex.I))
+        # Changed to positional flags in regex 2023.12.23.
+        self.assertEqual(regex.search(r"A(?i)b", "ab"), None)

        self.assertEqual(regex.search(r"(?V0)Ab", "ab"), None)
        self.assertEqual(regex.search(r"(?V1)Ab", "ab"), None)
-        self.assertEqual(regex.search(r"(?V1-i)Ab", "ab", flags=regex.I), None)
+        self.assertEqual(regex.search(r"(?-i)Ab", "ab", flags=regex.I), None)
        self.assertEqual(regex.search(r"(?-i:A)b", "ab", flags=regex.I), None)
-        self.assertEqual(regex.search(r"A(?V1-i)b", "ab",
-          flags=regex.I).span(), (0, 2))
+        self.assertEqual(regex.search(r"A(?-i)b", "ab", flags=regex.I).span(),
+          (0, 2))

    def test_repeated_repeats(self):
        # Issue 2537.
@@ -1820,12 +1816,10 @@ class RegexTests(unittest.TestCase):
            ('a.*b', 'acc\nccb', '', ascii(None)),
            ('a.{4,5}b', 'acc\nccb', '', ascii(None)),
            ('a.b', 'a\rb', '0', ascii('a\rb')),
-            # The new behaviour is that the inline flag affects only what follows.
-            ('a.b(?s)', 'a\nb', '0', ascii('a\nb')),
-            ('a.b(?sV1)', 'a\nb', '', ascii(None)),
+            # Changed to positional flags in regex 2023.12.23.
+            ('a.b(?s)', 'a\nb', '', ascii(None)),
            ('(?s)a.b', 'a\nb', '0', ascii('a\nb')),
-            ('a.*(?s)b', 'acc\nccb', '0', ascii('acc\nccb')),
-            ('a.*(?sV1)b', 'acc\nccb', '', ascii(None)),
+            ('a.*(?s)b', 'acc\nccb', '', ascii(None)),
            ('(?s)a.*b', 'acc\nccb', '0', ascii('acc\nccb')),
            ('(?s)a.{4,5}b', 'acc\nccb', '0', ascii('acc\nccb')),

@@ -2345,12 +2339,9 @@ class RegexTests(unittest.TestCase):
            # Not an error under PCRE/PRE:
            # When the new behaviour is turned on positional inline flags affect
            # only what follows.
-            ('w(?i)', 'W', '0', ascii('W')),
-            ('w(?iV1)', 'W', '0', ascii(None)),
+            ('w(?i)', 'W', '0', ascii(None)),
            ('w(?i)', 'w', '0', ascii('w')),
-            ('w(?iV1)', 'w', '0', ascii('w')),
            ('(?i)w', 'W', '0', ascii('W')),
-            ('(?iV1)w', 'W', '0', ascii('W')),

            # Comments using the x embedded pattern modifier.
            ("""(?x)w# comment 1
@@ -2403,14 +2394,10 @@ xyzabc
            # Bug 114033: nothing to repeat.
            (r'(x?)?', 'x', '0', ascii('x')),
            # Bug 115040: rescan if flags are modified inside pattern.
-            # If the new behaviour is turned on then positional inline flags
-            # affect only what follows.
-            (r' (?x)foo ', 'foo', '0', ascii('foo')),
-            (r' (?V1x)foo ', 'foo', '0', ascii(None)),
+            # Changed to positional flags in regex 2023.12.23.
+            (r' (?x)foo ', 'foo', '0', ascii(None)),
            (r'(?x) foo ', 'foo', '0', ascii('foo')),
-            (r'(?V1x) foo ', 'foo', '0', ascii('foo')),
            (r'(?x)foo ', 'foo', '0', ascii('foo')),
-            (r'(?V1x)foo ', 'foo', '0', ascii('foo')),
            # Bug 115618: negative lookahead.
            (r'(?<!abc)(d.f)', 'abcdefdof', '0', ascii('dof')),
            # Bug 116251: character class bug.
@@ -3154,10 +3141,8 @@ xyzabc

        # Hg issue 39: regex.search("((?i)blah)\\s+\\1", "blah BLAH") doesn't
        # return None
-        self.assertEqual(regex.search(r"(?V0)((?i)blah)\s+\1",
-          "blah BLAH").group(0, 1), ("blah BLAH", "blah"))
-        self.assertEqual(regex.search(r"(?V1)((?i)blah)\s+\1", "blah BLAH"),
-          None)
+        # Changed to positional flags in regex 2023.12.23.
+        self.assertEqual(regex.search(r"((?i)blah)\s+\1", "blah BLAH"), None)

        # Hg issue 40: regex.search("(\()?[^()]+(?(1)\)|)", "(abcd").group(0)
        # returns "bcd" instead of "abcd"
@@ -4336,10 +4321,10 @@ thing
        self.assertEqual(regex.search(r"^a?(a?)b?c\1$", "abca").span(), (0, 4))

        # Git issue 498: Conditional negative lookahead inside positive lookahead fails to match
-        self.assertEqual(regex.match(r"(?(?=a).|..)", "ab").span(), (0, 1))
-        self.assertEqual(regex.match(r"(?(?=b).|..)", "ab").span(), (0, 2))
-        self.assertEqual(regex.match(r"(?(?!a).|..)", "ab").span(), (0, 2))
-        self.assertEqual(regex.match(r"(?(?!b).|..)", "ab").span(), (0, 1))
+        self.assertEqual(regex.match(r'(?(?=a).|..)', 'ab').span(), (0, 1))
+        self.assertEqual(regex.match(r'(?(?=b).|..)', 'ab').span(), (0, 2))
+        self.assertEqual(regex.match(r'(?(?!a).|..)', 'ab').span(), (0, 2))
+        self.assertEqual(regex.match(r'(?(?!b).|..)', 'ab').span(), (0, 1))

    def test_fuzzy_ext(self):
        self.assertEqual(bool(regex.fullmatch(r'(?r)(?:a){e<=1:[a-z]}', 'e')),
@@ -4460,6 +4445,12 @@ thing
            self.assertEqual([m.span() for m in regex.finditer(r'(?m)^\s*?$',
              'foo\n\n\nbar')], [(4, 4), (4, 5), (5, 5)])

+    def test_line_ending(self):
+      self.assertEqual(regex.findall(r'\R', '\r\n\n\x0B\f\r\x85\u2028\u2029'),
+        ['\r\n', '\n', '\x0B', '\f', '\r', '\x85', '\u2028', '\u2029'])
+      self.assertEqual(regex.findall(br'\R', b'\r\n\n\x0B\f\r\x85'), [b'\r\n',
+        b'\n', b'\x0B', b'\f', b'\r'])
+
 def test_main():
    unittest.main(verbosity=2)

--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ with open('README.rst', encoding='utf-8') as file:

 setup(
    name='regex',
-    version='2023.10.3',
+    version='2023.12.23',
    description='Alternative regular expression module, to replace re.',
    long_description=long_description,
    long_description_content_type='text/x-rst',
--- a/tools/build_regex_unicode.py
+++ b/tools/build_regex_unicode.py
@@ -1781,4 +1781,4 @@ binary_dict = make_binary_dict()

 generate_code(unicode_data, UNICODE_VERSION, this_folder)

-print('\nSuccessfully generated _reges_unicode.h and _reges_unicode.c in %s' % tools_folder)
+print('\nSuccessfully generated _regex_unicode.h and _regex_unicode.c in %s' % tools_folder)