1
1
mirror of https://github.com/mrabarnett/mrab-regex.git synced 2025-10-05 20:02:39 +02:00

Updated to Unicode 12.1.0.

This commit is contained in:
mrab
2019-06-02 02:32:45 +01:00
parent 0f8add10ce
commit d1b92998d5
13 changed files with 13151 additions and 12466 deletions

View File

@@ -1,6 +1,6 @@
Metadata-Version: 1.1
Name: regex
Version: 2019.05.25
Version: 2019.06.02
Summary: Alternative regular expression module, to replace re.
Home-page: https://bitbucket.org/mrabarnett/mrab-regex
Author: Matthew Barnett

View File

@@ -16,7 +16,7 @@ DOCS_DIR = join(BASE_DIR, 'docs')
setup(
name='regex',
version='2019.05.25',
version='2019.06.02',
description='Alternative regular expression module, to replace re.',
long_description=open(join(DOCS_DIR, 'Features.rst')).read(),

View File

@@ -138,7 +138,9 @@ Block [blk]
Duployan
Early_Dynastic_Cuneiform
Egyptian_Hieroglyphs
Egyptian_Hieroglyph_Format_Controls
Elbasan
Elymaic
Emoticons
Enclosed_Alphanumerics [Enclosed_Alphanum]
Enclosed_Alphanumeric_Supplement [Enclosed_Alphanum_Sup]
@@ -251,12 +253,14 @@ Block [blk]
Myanmar_Extended_A [Myanmar_Ext_A]
Myanmar_Extended_B [Myanmar_Ext_B]
Nabataean
Nandinagari
Newa
New_Tai_Lue
NKo
No_Block [NB]
Number_Forms
Nushu
Nyiakeng_Puachue_Hmong
Ogham
Old_Hungarian
Old_Italic
@@ -272,6 +276,7 @@ Block [blk]
Ornamental_Dingbats
Osage
Osmanya
Ottoman_Siyaq_Numbers
Pahawh_Hmong
Palmyrene
Pau_Cin_Hau
@@ -295,6 +300,7 @@ Block [blk]
Sinhala
Sinhala_Archaic_Numbers
Small_Form_Variants [Small_Forms]
Small_Kana_Extension [Small_Kana_Ext]
Sogdian
Sora_Sompeng
Soyombo
@@ -313,6 +319,7 @@ Block [blk]
Supplementary_Private_Use_Area_B [Sup_PUA_B]
Sutton_SignWriting
Syloti_Nagri
Symbols_And_Pictographs_Extended_A [Symbols_And_Pictographs_Ext_A]
Syriac
Syriac_Supplement [Syriac_Sup]
Tagalog
@@ -324,6 +331,7 @@ Block [blk]
Tai_Xuan_Jing_Symbols [Tai_Xuan_Jing]
Takri
Tamil
Tamil_Supplement [Tamil_Sup]
Tangut
Tangut_Components
Telugu
@@ -341,6 +349,7 @@ Block [blk]
Variation_Selectors_Supplement [VS_Sup]
Vedic_Extensions [Vedic_Ext]
Vertical_Forms
Wancho
Warang_Citi
Yijing_Hexagram_Symbols [Yijing]
Yi_Radicals
@@ -886,12 +895,16 @@ Numeric_Value [nv]
1/2
1/20
1/3
1/32
1/320
1/4
1/40
1/5
1/6
1/64
1/7
1/8
1/80
1/9
10
100
@@ -942,6 +955,7 @@ Numeric_Value [nv]
3/20
3/4
3/5
3/64
3/8
3/80
30
@@ -1126,6 +1140,7 @@ Script [sc]
Duployan [Dupl]
Egyptian_Hieroglyphs [Egyp]
Elbasan [Elba]
Elymaic [Elym]
Ethiopic [Ethi]
Georgian [Geor]
Glagolitic [Glag]
@@ -1184,10 +1199,12 @@ Script [sc]
Multani [Mult]
Myanmar [Mymr]
Nabataean [Nbat]
Nandinagari [Nand]
Newa
New_Tai_Lue [Talu]
Nko [Nkoo]
Nushu [Nshu]
Nyiakeng_Puachue_Hmong [Hmnp]
Ogham [Ogam]
Old_Hungarian [Hung]
Old_Italic [Ital]
@@ -1239,6 +1256,7 @@ Script [sc]
Ugaritic [Ugar]
Unknown [Zzzz]
Vai [Vaii]
Wancho [Wcho]
Warang_Citi [Wara]
Yi [Yiii]
Zanabazar_Square [Zanb]
@@ -1264,11 +1282,12 @@ Script_Extensions [scx]
Batak [Batk]
Beng Cakm Sylo
Beng Deva
Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
Beng Deva Gran Knda
Beng Deva Gran Knda Nand Orya Telu Tirh
Bengali [Beng]
Bhaiksuki [Bhks]
Bopo Hang Hani Hira Kana
@@ -1299,14 +1318,15 @@ Script_Extensions [scx]
Cyrl Latn
Cyrl Perm
Deseret [Dsrt]
Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh
Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh
Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh
Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh
Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
Deva Dogr Kthi Mahj
Deva Gran
Deva Gran Knda
Deva Gran Latn
Deva Knda Mlym Orya Taml Telu
Deva Nand
Deva Shrd
Deva Taml
Devanagari [Deva]
@@ -1314,6 +1334,7 @@ Script_Extensions [scx]
Duployan [Dupl]
Egyptian_Hieroglyphs [Egyp]
Elbasan [Elba]
Elymaic [Elym]
Ethiopic [Ethi]
Geor Latn
Georgian [Geor]
@@ -1350,8 +1371,10 @@ Script_Extensions [scx]
Khmer [Khmr]
Khojki [Khoj]
Khudawadi [Sind]
Knda Nand
Lao [Laoo]
Latin [Latn]
Latn Mong
Lepcha [Lepc]
Limbu [Limb]
Linear_A [Lina]
@@ -1379,10 +1402,12 @@ Script_Extensions [scx]
Multani [Mult]
Myanmar [Mymr]
Nabataean [Nbat]
Nandinagari [Nand]
Newa
New_Tai_Lue [Talu]
Nko [Nkoo]
Nushu [Nshu]
Nyiakeng_Puachue_Hmong [Hmnp]
Ogham [Ogam]
Old_Hungarian [Hung]
Old_Italic [Ital]
@@ -1434,6 +1459,7 @@ Script_Extensions [scx]
Ugaritic [Ugar]
Unknown [Zzzz]
Vai [Vaii]
Wancho [Wcho]
Warang_Citi [Wara]
Yi [Yiii]
Zanabazar_Square [Zanb]

View File

@@ -11,7 +11,7 @@
* 2010-01-16 mrab Re-written
*/
/* Supports Unicode version 11.0.0. */
/* Supports Unicode version 12.1.0. */
#define RE_MAGIC 20100116

File diff suppressed because it is too large Load Diff

View File

@@ -17,7 +17,7 @@ typedef unsigned char BOOL;
#define RE_MAX_CASES 4
#define RE_MAX_FOLDED 3
#define RE_MAX_SCX 19
#define RE_MAX_SCX 21
typedef struct RE_Property {
RE_UINT16 name;
@@ -194,9 +194,9 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_LBREAK_EMODIFIER 41
#define RE_LBREAK_ZWJ 42
extern char* re_strings[1404];
extern char* re_strings[1425];
extern RE_Property re_properties[168];
extern RE_PropertyValue re_property_values[1543];
extern RE_PropertyValue re_property_values[1568];
extern RE_UINT16 re_expand_on_folding[104];
extern RE_GetPropertyFunc re_get_property[94];

View File

@@ -239,7 +239,7 @@ __all__ = ["compile", "DEFAULT_VERSION", "escape", "findall", "finditer",
"T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X",
"VERBOSE", "W", "WORD", "error", "Regex", "__version__", "__doc__"]
__version__ = "2.5.30"
__version__ = "2.5.31"
# --------------------------------------------------------------------
# Public interface.

View File

@@ -11,7 +11,7 @@
* 2010-01-16 mrab Re-written
*/
/* Supports Unicode version 11.0.0. */
/* Supports Unicode version 12.1.0. */
#define RE_MAGIC 20100116

File diff suppressed because it is too large Load Diff

View File

@@ -17,7 +17,7 @@ typedef unsigned char BOOL;
#define RE_MAX_CASES 4
#define RE_MAX_FOLDED 3
#define RE_MAX_SCX 19
#define RE_MAX_SCX 21
typedef struct RE_Property {
RE_UINT16 name;
@@ -194,9 +194,9 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
#define RE_LBREAK_EMODIFIER 41
#define RE_LBREAK_ZWJ 42
extern char* re_strings[1404];
extern char* re_strings[1425];
extern RE_Property re_properties[168];
extern RE_PropertyValue re_property_values[1543];
extern RE_PropertyValue re_property_values[1568];
extern RE_UINT16 re_expand_on_folding[104];
extern RE_GetPropertyFunc re_get_property[94];

View File

@@ -239,7 +239,7 @@ __all__ = ["compile", "DEFAULT_VERSION", "escape", "findall", "finditer",
"T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X",
"VERBOSE", "W", "WORD", "error", "Regex", "__version__", "__doc__"]
__version__ = "2.5.30"
__version__ = "2.5.31"
# --------------------------------------------------------------------
# Public interface.

View File

@@ -16,7 +16,7 @@ DOCS_DIR = join(BASE_DIR, 'docs')
setup(
name='regex',
version='2019.05.25',
version='2019.06.02',
description='Alternative regular expression module, to replace re.',
long_description=open(join(DOCS_DIR, 'Features.rst')).read(),

View File

@@ -11,6 +11,7 @@ from json import dump, load
from os import remove, rename
from os.path import dirname, exists, isfile, join, splitext
from shutil import copy2, move
from urllib.error import HTTPError
from urllib.parse import urljoin
from urllib.request import urlretrieve
@@ -1702,12 +1703,13 @@ def store_properties(unicode_version, properties, path):
def check_unicode_version(unicode_data_files):
'Checks the Unicode version in the data files.'
versions = set()
versions = defaultdict(set)
# Read the version and filename from the first line of each data file.
for line in unicode_data_files.splitlines():
if line and line[0] != '#' and line[0] != '[':
path = join(unicode_folder, line.rpartition('/')[-1])
name = line.rpartition('/')[-1]
path = join(unicode_folder, name)
with open(path, encoding='utf-8') as file:
line = file.readline()
@@ -1717,16 +1719,26 @@ def check_unicode_version(unicode_data_files):
if line.startswith('# Version:'):
ver = line.split()[-1]
ver += '.0' * (2 - ver.count('.'))
versions.add(ver)
versions[ver].add(name)
break
elif line.endswith('.txt\n'):
versions.add(line[ : -5].rpartition('-')[2])
ver = line[ : -5].rpartition('-')[2]
versions[ver].add(name)
if len(versions) != 1:
raise ValueError('expected 1 version of Unicode, but found {}: {}'.format(len(versions),
versions))
if set(versions) == {'12.0.0', '12.1.0'} and versions['12.0.0'] == {'emoji-data.txt'}:
# Cannot find emoji-data.txt for version 12.1.0.
pass
else:
for ver, names in versions.items():
print(ver, '=>', [name.strip('# \n') for name in names])
return versions.pop()
raise ValueError('expected 1 version of Unicode, but found {}'.format(len(versions)))
def make_key(ver):
return tuple(map(int, ver.split('.')))
return max(versions, key=make_key)
def download_files(unicode_version, unicode_data_files):
'Downloads the Unicode data files from the website.'
@@ -1739,7 +1751,11 @@ def download_files(unicode_version, unicode_data_files):
if not isfile(versioned_path):
url = urljoin(unicode_data_base, line)
path = join(unicode_folder, line.rpartition('/')[-1])
download_unicode_file(url, path)
try:
download_unicode_file(url, path)
except HTTPError:
print('{} not found'.format(url))
def merge_ranges(ranges):
'Sorts and merges a list of codepoint ranges.'
@@ -2200,7 +2216,8 @@ def smallest_datatype(min_value, max_value):
# Whether to update the Unicode data files from the Unicode website.
UPDATE = False
UNICODE_VERSION = '11.0.0'
#UPDATE = True
UNICODE_VERSION = '12.1.0'
# Whether to recalculate the best block sizes for the tables.
RECALC = False
@@ -2209,7 +2226,7 @@ RECALC = False
unicode_data_base = 'http://www.unicode.org/Public/UNIDATA/'
# The local folder in which the Unicode data files are stored.
unicode_folder = join(dirname(__file__), 'Unicode')
unicode_folder = r'D:\projects\Unicode\Data'
# The local folder in which the generated C files should be written.
code_folder = join(dirname(__file__), 'regex')
@@ -2257,7 +2274,7 @@ PropertyAliases.txt
PropertyValueAliases.txt
PropList.txt
CaseFolding.txt
UnicodeData.txt
#UnicodeData.txt
'''
# Ensure that we have downloaded the Unicode data files for UNICODE_VERSION