Updated to Unicode 12.1.0.

2025-10-05 20:02:39 +02:00 · 2019-06-02 02:32:45 +01:00
parent 0f8add10ce
commit d1b92998d5
13 changed files with 13151 additions and 12466 deletions
--- a/PyPI/PKG-INFO
+++ b/PyPI/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 1.1
 Name: regex
-Version: 2019.05.25
+Version: 2019.06.02
 Summary: Alternative regular expression module, to replace re.
 Home-page: https://bitbucket.org/mrabarnett/mrab-regex
 Author: Matthew Barnett
--- a/PyPI/setup.py
+++ b/PyPI/setup.py
@@ -16,7 +16,7 @@ DOCS_DIR = join(BASE_DIR, 'docs')

 setup(
    name='regex',
-    version='2019.05.25',
+    version='2019.06.02',
    description='Alternative regular expression module, to replace re.',
    long_description=open(join(DOCS_DIR, 'Features.rst')).read(),

--- a/docs/UnicodeProperties.rst
+++ b/docs/UnicodeProperties.rst
@@ -138,7 +138,9 @@ Block [blk]
    Duployan
    Early_Dynastic_Cuneiform
    Egyptian_Hieroglyphs
+    Egyptian_Hieroglyph_Format_Controls
    Elbasan
+    Elymaic
    Emoticons
    Enclosed_Alphanumerics [Enclosed_Alphanum]
    Enclosed_Alphanumeric_Supplement [Enclosed_Alphanum_Sup]
@@ -251,12 +253,14 @@ Block [blk]
    Myanmar_Extended_A [Myanmar_Ext_A]
    Myanmar_Extended_B [Myanmar_Ext_B]
    Nabataean
+    Nandinagari
    Newa
    New_Tai_Lue
    NKo
    No_Block [NB]
    Number_Forms
    Nushu
+    Nyiakeng_Puachue_Hmong
    Ogham
    Old_Hungarian
    Old_Italic
@@ -272,6 +276,7 @@ Block [blk]
    Ornamental_Dingbats
    Osage
    Osmanya
+    Ottoman_Siyaq_Numbers
    Pahawh_Hmong
    Palmyrene
    Pau_Cin_Hau
@@ -295,6 +300,7 @@ Block [blk]
    Sinhala
    Sinhala_Archaic_Numbers
    Small_Form_Variants [Small_Forms]
+    Small_Kana_Extension [Small_Kana_Ext]
    Sogdian
    Sora_Sompeng
    Soyombo
@@ -313,6 +319,7 @@ Block [blk]
    Supplementary_Private_Use_Area_B [Sup_PUA_B]
    Sutton_SignWriting
    Syloti_Nagri
+    Symbols_And_Pictographs_Extended_A [Symbols_And_Pictographs_Ext_A]
    Syriac
    Syriac_Supplement [Syriac_Sup]
    Tagalog
@@ -324,6 +331,7 @@ Block [blk]
    Tai_Xuan_Jing_Symbols [Tai_Xuan_Jing]
    Takri
    Tamil
+    Tamil_Supplement [Tamil_Sup]
    Tangut
    Tangut_Components
    Telugu
@@ -341,6 +349,7 @@ Block [blk]
    Variation_Selectors_Supplement [VS_Sup]
    Vedic_Extensions [Vedic_Ext]
    Vertical_Forms
+    Wancho
    Warang_Citi
    Yijing_Hexagram_Symbols [Yijing]
    Yi_Radicals
@@ -886,12 +895,16 @@ Numeric_Value [nv]
    1/2
    1/20
    1/3
+    1/32
+    1/320
    1/4
    1/40
    1/5
    1/6
+    1/64
    1/7
    1/8
+    1/80
    1/9
    10
    100
@@ -942,6 +955,7 @@ Numeric_Value [nv]
    3/20
    3/4
    3/5
+    3/64
    3/8
    3/80
    30
@@ -1126,6 +1140,7 @@ Script [sc]
    Duployan [Dupl]
    Egyptian_Hieroglyphs [Egyp]
    Elbasan [Elba]
+    Elymaic [Elym]
    Ethiopic [Ethi]
    Georgian [Geor]
    Glagolitic [Glag]
@@ -1184,10 +1199,12 @@ Script [sc]
    Multani [Mult]
    Myanmar [Mymr]
    Nabataean [Nbat]
+    Nandinagari [Nand]
    Newa
    New_Tai_Lue [Talu]
    Nko [Nkoo]
    Nushu [Nshu]
+    Nyiakeng_Puachue_Hmong [Hmnp]
    Ogham [Ogam]
    Old_Hungarian [Hung]
    Old_Italic [Ital]
@@ -1239,6 +1256,7 @@ Script [sc]
    Ugaritic [Ugar]
    Unknown [Zzzz]
    Vai [Vaii]
+    Wancho [Wcho]
    Warang_Citi [Wara]
    Yi [Yiii]
    Zanabazar_Square [Zanb]
@@ -1264,11 +1282,12 @@ Script_Extensions [scx]
    Batak [Batk]
    Beng Cakm Sylo
    Beng Deva
-    Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
-    Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
+    Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
+    Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
    Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
    Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
    Beng Deva Gran Knda
+    Beng Deva Gran Knda Nand Orya Telu Tirh
    Bengali [Beng]
    Bhaiksuki [Bhks]
    Bopo Hang Hani Hira Kana
@@ -1299,14 +1318,15 @@ Script_Extensions [scx]
    Cyrl Latn
    Cyrl Perm
    Deseret [Dsrt]
-    Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh
-    Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh
+    Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh
+    Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh
    Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
    Deva Dogr Kthi Mahj
    Deva Gran
    Deva Gran Knda
    Deva Gran Latn
    Deva Knda Mlym Orya Taml Telu
+    Deva Nand
    Deva Shrd
    Deva Taml
    Devanagari [Deva]
@@ -1314,6 +1334,7 @@ Script_Extensions [scx]
    Duployan [Dupl]
    Egyptian_Hieroglyphs [Egyp]
    Elbasan [Elba]
+    Elymaic [Elym]
    Ethiopic [Ethi]
    Geor Latn
    Georgian [Geor]
@@ -1350,8 +1371,10 @@ Script_Extensions [scx]
    Khmer [Khmr]
    Khojki [Khoj]
    Khudawadi [Sind]
+    Knda Nand
    Lao [Laoo]
    Latin [Latn]
+    Latn Mong
    Lepcha [Lepc]
    Limbu [Limb]
    Linear_A [Lina]
@@ -1379,10 +1402,12 @@ Script_Extensions [scx]
    Multani [Mult]
    Myanmar [Mymr]
    Nabataean [Nbat]
+    Nandinagari [Nand]
    Newa
    New_Tai_Lue [Talu]
    Nko [Nkoo]
    Nushu [Nshu]
+    Nyiakeng_Puachue_Hmong [Hmnp]
    Ogham [Ogam]
    Old_Hungarian [Hung]
    Old_Italic [Ital]
@@ -1434,6 +1459,7 @@ Script_Extensions [scx]
    Ugaritic [Ugar]
    Unknown [Zzzz]
    Vai [Vaii]
+    Wancho [Wcho]
    Warang_Citi [Wara]
    Yi [Yiii]
    Zanabazar_Square [Zanb]
--- a/regex_2/_regex.h
+++ b/regex_2/_regex.h
@@ -11,7 +11,7 @@
 * 2010-01-16 mrab Re-written
 */

-/* Supports Unicode version 11.0.0. */
+/* Supports Unicode version 12.1.0. */

 #define RE_MAGIC 20100116

--- a/regex_2/_regex_unicode.c
+++ b/regex_2/_regex_unicode.c
--- a/regex_2/_regex_unicode.h
+++ b/regex_2/_regex_unicode.h
@@ -17,7 +17,7 @@ typedef unsigned char BOOL;

 #define RE_MAX_CASES 4
 #define RE_MAX_FOLDED 3
-#define RE_MAX_SCX 19
+#define RE_MAX_SCX 21

 typedef struct RE_Property {
    RE_UINT16 name;
@@ -194,9 +194,9 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
 #define RE_LBREAK_EMODIFIER 41
 #define RE_LBREAK_ZWJ 42

-extern char* re_strings[1404];
+extern char* re_strings[1425];
 extern RE_Property re_properties[168];
-extern RE_PropertyValue re_property_values[1543];
+extern RE_PropertyValue re_property_values[1568];
 extern RE_UINT16 re_expand_on_folding[104];
 extern RE_GetPropertyFunc re_get_property[94];

--- a/regex_2/regex.py
+++ b/regex_2/regex.py
@@ -239,7 +239,7 @@ __all__ = ["compile", "DEFAULT_VERSION", "escape", "findall", "finditer",
  "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X",
  "VERBOSE", "W", "WORD", "error", "Regex", "__version__", "__doc__"]

-__version__ = "2.5.30"
+__version__ = "2.5.31"

 # --------------------------------------------------------------------
 # Public interface.
--- a/regex_3/_regex.h
+++ b/regex_3/_regex.h
@@ -11,7 +11,7 @@
 * 2010-01-16 mrab Re-written
 */

-/* Supports Unicode version 11.0.0. */
+/* Supports Unicode version 12.1.0. */

 #define RE_MAGIC 20100116

--- a/regex_3/_regex_unicode.c
+++ b/regex_3/_regex_unicode.c
--- a/regex_3/_regex_unicode.h
+++ b/regex_3/_regex_unicode.h
@@ -17,7 +17,7 @@ typedef unsigned char BOOL;

 #define RE_MAX_CASES 4
 #define RE_MAX_FOLDED 3
-#define RE_MAX_SCX 19
+#define RE_MAX_SCX 21

 typedef struct RE_Property {
    RE_UINT16 name;
@@ -194,9 +194,9 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
 #define RE_LBREAK_EMODIFIER 41
 #define RE_LBREAK_ZWJ 42

-extern char* re_strings[1404];
+extern char* re_strings[1425];
 extern RE_Property re_properties[168];
-extern RE_PropertyValue re_property_values[1543];
+extern RE_PropertyValue re_property_values[1568];
 extern RE_UINT16 re_expand_on_folding[104];
 extern RE_GetPropertyFunc re_get_property[94];

--- a/regex_3/regex.py
+++ b/regex_3/regex.py
@@ -239,7 +239,7 @@ __all__ = ["compile", "DEFAULT_VERSION", "escape", "findall", "finditer",
  "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X",
  "VERBOSE", "W", "WORD", "error", "Regex", "__version__", "__doc__"]

-__version__ = "2.5.30"
+__version__ = "2.5.31"

 # --------------------------------------------------------------------
 # Public interface.
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ DOCS_DIR = join(BASE_DIR, 'docs')

 setup(
    name='regex',
-    version='2019.05.25',
+    version='2019.06.02',
    description='Alternative regular expression module, to replace re.',
    long_description=open(join(DOCS_DIR, 'Features.rst')).read(),

--- a/tools/build_regex_unicode.py
+++ b/tools/build_regex_unicode.py
@@ -11,6 +11,7 @@ from json import dump, load
 from os import remove, rename
 from os.path import dirname, exists, isfile, join, splitext
 from shutil import copy2, move
+from urllib.error import HTTPError
 from urllib.parse import urljoin
 from urllib.request import urlretrieve

@@ -1702,12 +1703,13 @@ def store_properties(unicode_version, properties, path):
 def check_unicode_version(unicode_data_files):
    'Checks the Unicode version in the data files.'

-    versions = set()
+    versions = defaultdict(set)

    # Read the version and filename from the first line of each data file.
    for line in unicode_data_files.splitlines():
        if line and line[0] != '#' and line[0] != '[':
-            path = join(unicode_folder, line.rpartition('/')[-1])
+            name = line.rpartition('/')[-1]
+            path = join(unicode_folder, name)

            with open(path, encoding='utf-8') as file:
                line = file.readline()
@@ -1717,16 +1719,26 @@ def check_unicode_version(unicode_data_files):
                        if line.startswith('# Version:'):
                            ver = line.split()[-1]
                            ver += '.0' * (2 - ver.count('.'))
-                            versions.add(ver)
+                            versions[ver].add(name)
                            break
                elif line.endswith('.txt\n'):
-                    versions.add(line[ : -5].rpartition('-')[2])
+                    ver = line[ : -5].rpartition('-')[2]
+                    versions[ver].add(name)

    if len(versions) != 1:
-        raise ValueError('expected 1 version of Unicode, but found {}: {}'.format(len(versions),
-          versions))
+        if set(versions) == {'12.0.0', '12.1.0'} and versions['12.0.0'] == {'emoji-data.txt'}:
+            # Cannot find emoji-data.txt for version 12.1.0.
+            pass
+        else:
+            for ver, names in versions.items():
+                print(ver, '=>', [name.strip('# \n') for name in names])

-    return versions.pop()
+            raise ValueError('expected 1 version of Unicode, but found {}'.format(len(versions)))
+
+    def make_key(ver):
+        return tuple(map(int, ver.split('.')))
+
+    return max(versions, key=make_key)

 def download_files(unicode_version, unicode_data_files):
    'Downloads the Unicode data files from the website.'
@@ -1739,7 +1751,11 @@ def download_files(unicode_version, unicode_data_files):
            if not isfile(versioned_path):
                url = urljoin(unicode_data_base, line)
                path = join(unicode_folder, line.rpartition('/')[-1])
-                download_unicode_file(url, path)
+
+                try:
+                    download_unicode_file(url, path)
+                except HTTPError:
+                    print('{} not found'.format(url))

 def merge_ranges(ranges):
    'Sorts and merges a list of codepoint ranges.'
@@ -2200,7 +2216,8 @@ def smallest_datatype(min_value, max_value):

 # Whether to update the Unicode data files from the Unicode website.
 UPDATE = False
-UNICODE_VERSION = '11.0.0'
+#UPDATE = True
+UNICODE_VERSION = '12.1.0'

 # Whether to recalculate the best block sizes for the tables.
 RECALC = False
@@ -2209,7 +2226,7 @@ RECALC = False
 unicode_data_base = 'http://www.unicode.org/Public/UNIDATA/'

 # The local folder in which the Unicode data files are stored.
-unicode_folder = join(dirname(__file__), 'Unicode')
+unicode_folder = r'D:\projects\Unicode\Data'

 # The local folder in which the generated C files should be written.
 code_folder = join(dirname(__file__), 'regex')
@@ -2257,7 +2274,7 @@ PropertyAliases.txt
 PropertyValueAliases.txt
 PropList.txt
 CaseFolding.txt
-UnicodeData.txt
+#UnicodeData.txt
 '''

 # Ensure that we have downloaded the Unicode data files for UNICODE_VERSION