mirror of
https://github.com/mrabarnett/mrab-regex.git
synced 2025-10-05 20:02:39 +02:00
Updated to Unicode 12.1.0.
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
Metadata-Version: 1.1
|
||||
Name: regex
|
||||
Version: 2019.05.25
|
||||
Version: 2019.06.02
|
||||
Summary: Alternative regular expression module, to replace re.
|
||||
Home-page: https://bitbucket.org/mrabarnett/mrab-regex
|
||||
Author: Matthew Barnett
|
||||
|
@@ -16,7 +16,7 @@ DOCS_DIR = join(BASE_DIR, 'docs')
|
||||
|
||||
setup(
|
||||
name='regex',
|
||||
version='2019.05.25',
|
||||
version='2019.06.02',
|
||||
description='Alternative regular expression module, to replace re.',
|
||||
long_description=open(join(DOCS_DIR, 'Features.rst')).read(),
|
||||
|
||||
|
@@ -138,7 +138,9 @@ Block [blk]
|
||||
Duployan
|
||||
Early_Dynastic_Cuneiform
|
||||
Egyptian_Hieroglyphs
|
||||
Egyptian_Hieroglyph_Format_Controls
|
||||
Elbasan
|
||||
Elymaic
|
||||
Emoticons
|
||||
Enclosed_Alphanumerics [Enclosed_Alphanum]
|
||||
Enclosed_Alphanumeric_Supplement [Enclosed_Alphanum_Sup]
|
||||
@@ -251,12 +253,14 @@ Block [blk]
|
||||
Myanmar_Extended_A [Myanmar_Ext_A]
|
||||
Myanmar_Extended_B [Myanmar_Ext_B]
|
||||
Nabataean
|
||||
Nandinagari
|
||||
Newa
|
||||
New_Tai_Lue
|
||||
NKo
|
||||
No_Block [NB]
|
||||
Number_Forms
|
||||
Nushu
|
||||
Nyiakeng_Puachue_Hmong
|
||||
Ogham
|
||||
Old_Hungarian
|
||||
Old_Italic
|
||||
@@ -272,6 +276,7 @@ Block [blk]
|
||||
Ornamental_Dingbats
|
||||
Osage
|
||||
Osmanya
|
||||
Ottoman_Siyaq_Numbers
|
||||
Pahawh_Hmong
|
||||
Palmyrene
|
||||
Pau_Cin_Hau
|
||||
@@ -295,6 +300,7 @@ Block [blk]
|
||||
Sinhala
|
||||
Sinhala_Archaic_Numbers
|
||||
Small_Form_Variants [Small_Forms]
|
||||
Small_Kana_Extension [Small_Kana_Ext]
|
||||
Sogdian
|
||||
Sora_Sompeng
|
||||
Soyombo
|
||||
@@ -313,6 +319,7 @@ Block [blk]
|
||||
Supplementary_Private_Use_Area_B [Sup_PUA_B]
|
||||
Sutton_SignWriting
|
||||
Syloti_Nagri
|
||||
Symbols_And_Pictographs_Extended_A [Symbols_And_Pictographs_Ext_A]
|
||||
Syriac
|
||||
Syriac_Supplement [Syriac_Sup]
|
||||
Tagalog
|
||||
@@ -324,6 +331,7 @@ Block [blk]
|
||||
Tai_Xuan_Jing_Symbols [Tai_Xuan_Jing]
|
||||
Takri
|
||||
Tamil
|
||||
Tamil_Supplement [Tamil_Sup]
|
||||
Tangut
|
||||
Tangut_Components
|
||||
Telugu
|
||||
@@ -341,6 +349,7 @@ Block [blk]
|
||||
Variation_Selectors_Supplement [VS_Sup]
|
||||
Vedic_Extensions [Vedic_Ext]
|
||||
Vertical_Forms
|
||||
Wancho
|
||||
Warang_Citi
|
||||
Yijing_Hexagram_Symbols [Yijing]
|
||||
Yi_Radicals
|
||||
@@ -886,12 +895,16 @@ Numeric_Value [nv]
|
||||
1/2
|
||||
1/20
|
||||
1/3
|
||||
1/32
|
||||
1/320
|
||||
1/4
|
||||
1/40
|
||||
1/5
|
||||
1/6
|
||||
1/64
|
||||
1/7
|
||||
1/8
|
||||
1/80
|
||||
1/9
|
||||
10
|
||||
100
|
||||
@@ -942,6 +955,7 @@ Numeric_Value [nv]
|
||||
3/20
|
||||
3/4
|
||||
3/5
|
||||
3/64
|
||||
3/8
|
||||
3/80
|
||||
30
|
||||
@@ -1126,6 +1140,7 @@ Script [sc]
|
||||
Duployan [Dupl]
|
||||
Egyptian_Hieroglyphs [Egyp]
|
||||
Elbasan [Elba]
|
||||
Elymaic [Elym]
|
||||
Ethiopic [Ethi]
|
||||
Georgian [Geor]
|
||||
Glagolitic [Glag]
|
||||
@@ -1184,10 +1199,12 @@ Script [sc]
|
||||
Multani [Mult]
|
||||
Myanmar [Mymr]
|
||||
Nabataean [Nbat]
|
||||
Nandinagari [Nand]
|
||||
Newa
|
||||
New_Tai_Lue [Talu]
|
||||
Nko [Nkoo]
|
||||
Nushu [Nshu]
|
||||
Nyiakeng_Puachue_Hmong [Hmnp]
|
||||
Ogham [Ogam]
|
||||
Old_Hungarian [Hung]
|
||||
Old_Italic [Ital]
|
||||
@@ -1239,6 +1256,7 @@ Script [sc]
|
||||
Ugaritic [Ugar]
|
||||
Unknown [Zzzz]
|
||||
Vai [Vaii]
|
||||
Wancho [Wcho]
|
||||
Warang_Citi [Wara]
|
||||
Yi [Yiii]
|
||||
Zanabazar_Square [Zanb]
|
||||
@@ -1264,11 +1282,12 @@ Script_Extensions [scx]
|
||||
Batak [Batk]
|
||||
Beng Cakm Sylo
|
||||
Beng Deva
|
||||
Beng Deva Dogr Gong Gran Gujr Guru Knda Limb Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
|
||||
Beng Deva Dogr Gong Gran Gujr Guru Knda Mahj Mlym Orya Sind Sinh Sylo Takr Taml Telu Tirh
|
||||
Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
|
||||
Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
|
||||
Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
|
||||
Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
|
||||
Beng Deva Gran Knda
|
||||
Beng Deva Gran Knda Nand Orya Telu Tirh
|
||||
Bengali [Beng]
|
||||
Bhaiksuki [Bhks]
|
||||
Bopo Hang Hani Hira Kana
|
||||
@@ -1299,14 +1318,15 @@ Script_Extensions [scx]
|
||||
Cyrl Latn
|
||||
Cyrl Perm
|
||||
Deseret [Dsrt]
|
||||
Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Sind Takr Tirh
|
||||
Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Sind Takr Tirh
|
||||
Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh
|
||||
Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh
|
||||
Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
|
||||
Deva Dogr Kthi Mahj
|
||||
Deva Gran
|
||||
Deva Gran Knda
|
||||
Deva Gran Latn
|
||||
Deva Knda Mlym Orya Taml Telu
|
||||
Deva Nand
|
||||
Deva Shrd
|
||||
Deva Taml
|
||||
Devanagari [Deva]
|
||||
@@ -1314,6 +1334,7 @@ Script_Extensions [scx]
|
||||
Duployan [Dupl]
|
||||
Egyptian_Hieroglyphs [Egyp]
|
||||
Elbasan [Elba]
|
||||
Elymaic [Elym]
|
||||
Ethiopic [Ethi]
|
||||
Geor Latn
|
||||
Georgian [Geor]
|
||||
@@ -1350,8 +1371,10 @@ Script_Extensions [scx]
|
||||
Khmer [Khmr]
|
||||
Khojki [Khoj]
|
||||
Khudawadi [Sind]
|
||||
Knda Nand
|
||||
Lao [Laoo]
|
||||
Latin [Latn]
|
||||
Latn Mong
|
||||
Lepcha [Lepc]
|
||||
Limbu [Limb]
|
||||
Linear_A [Lina]
|
||||
@@ -1379,10 +1402,12 @@ Script_Extensions [scx]
|
||||
Multani [Mult]
|
||||
Myanmar [Mymr]
|
||||
Nabataean [Nbat]
|
||||
Nandinagari [Nand]
|
||||
Newa
|
||||
New_Tai_Lue [Talu]
|
||||
Nko [Nkoo]
|
||||
Nushu [Nshu]
|
||||
Nyiakeng_Puachue_Hmong [Hmnp]
|
||||
Ogham [Ogam]
|
||||
Old_Hungarian [Hung]
|
||||
Old_Italic [Ital]
|
||||
@@ -1434,6 +1459,7 @@ Script_Extensions [scx]
|
||||
Ugaritic [Ugar]
|
||||
Unknown [Zzzz]
|
||||
Vai [Vaii]
|
||||
Wancho [Wcho]
|
||||
Warang_Citi [Wara]
|
||||
Yi [Yiii]
|
||||
Zanabazar_Square [Zanb]
|
||||
|
@@ -11,7 +11,7 @@
|
||||
* 2010-01-16 mrab Re-written
|
||||
*/
|
||||
|
||||
/* Supports Unicode version 11.0.0. */
|
||||
/* Supports Unicode version 12.1.0. */
|
||||
|
||||
#define RE_MAGIC 20100116
|
||||
|
||||
|
12759
regex_2/_regex_unicode.c
12759
regex_2/_regex_unicode.c
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,7 @@ typedef unsigned char BOOL;
|
||||
|
||||
#define RE_MAX_CASES 4
|
||||
#define RE_MAX_FOLDED 3
|
||||
#define RE_MAX_SCX 19
|
||||
#define RE_MAX_SCX 21
|
||||
|
||||
typedef struct RE_Property {
|
||||
RE_UINT16 name;
|
||||
@@ -194,9 +194,9 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
|
||||
#define RE_LBREAK_EMODIFIER 41
|
||||
#define RE_LBREAK_ZWJ 42
|
||||
|
||||
extern char* re_strings[1404];
|
||||
extern char* re_strings[1425];
|
||||
extern RE_Property re_properties[168];
|
||||
extern RE_PropertyValue re_property_values[1543];
|
||||
extern RE_PropertyValue re_property_values[1568];
|
||||
extern RE_UINT16 re_expand_on_folding[104];
|
||||
extern RE_GetPropertyFunc re_get_property[94];
|
||||
|
||||
|
@@ -239,7 +239,7 @@ __all__ = ["compile", "DEFAULT_VERSION", "escape", "findall", "finditer",
|
||||
"T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X",
|
||||
"VERBOSE", "W", "WORD", "error", "Regex", "__version__", "__doc__"]
|
||||
|
||||
__version__ = "2.5.30"
|
||||
__version__ = "2.5.31"
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Public interface.
|
||||
|
@@ -11,7 +11,7 @@
|
||||
* 2010-01-16 mrab Re-written
|
||||
*/
|
||||
|
||||
/* Supports Unicode version 11.0.0. */
|
||||
/* Supports Unicode version 12.1.0. */
|
||||
|
||||
#define RE_MAGIC 20100116
|
||||
|
||||
|
12759
regex_3/_regex_unicode.c
12759
regex_3/_regex_unicode.c
File diff suppressed because it is too large
Load Diff
@@ -17,7 +17,7 @@ typedef unsigned char BOOL;
|
||||
|
||||
#define RE_MAX_CASES 4
|
||||
#define RE_MAX_FOLDED 3
|
||||
#define RE_MAX_SCX 19
|
||||
#define RE_MAX_SCX 21
|
||||
|
||||
typedef struct RE_Property {
|
||||
RE_UINT16 name;
|
||||
@@ -194,9 +194,9 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 ch);
|
||||
#define RE_LBREAK_EMODIFIER 41
|
||||
#define RE_LBREAK_ZWJ 42
|
||||
|
||||
extern char* re_strings[1404];
|
||||
extern char* re_strings[1425];
|
||||
extern RE_Property re_properties[168];
|
||||
extern RE_PropertyValue re_property_values[1543];
|
||||
extern RE_PropertyValue re_property_values[1568];
|
||||
extern RE_UINT16 re_expand_on_folding[104];
|
||||
extern RE_GetPropertyFunc re_get_property[94];
|
||||
|
||||
|
@@ -239,7 +239,7 @@ __all__ = ["compile", "DEFAULT_VERSION", "escape", "findall", "finditer",
|
||||
"T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", "VERSION1", "X",
|
||||
"VERBOSE", "W", "WORD", "error", "Regex", "__version__", "__doc__"]
|
||||
|
||||
__version__ = "2.5.30"
|
||||
__version__ = "2.5.31"
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Public interface.
|
||||
|
2
setup.py
2
setup.py
@@ -16,7 +16,7 @@ DOCS_DIR = join(BASE_DIR, 'docs')
|
||||
|
||||
setup(
|
||||
name='regex',
|
||||
version='2019.05.25',
|
||||
version='2019.06.02',
|
||||
description='Alternative regular expression module, to replace re.',
|
||||
long_description=open(join(DOCS_DIR, 'Features.rst')).read(),
|
||||
|
||||
|
@@ -11,6 +11,7 @@ from json import dump, load
|
||||
from os import remove, rename
|
||||
from os.path import dirname, exists, isfile, join, splitext
|
||||
from shutil import copy2, move
|
||||
from urllib.error import HTTPError
|
||||
from urllib.parse import urljoin
|
||||
from urllib.request import urlretrieve
|
||||
|
||||
@@ -1702,12 +1703,13 @@ def store_properties(unicode_version, properties, path):
|
||||
def check_unicode_version(unicode_data_files):
|
||||
'Checks the Unicode version in the data files.'
|
||||
|
||||
versions = set()
|
||||
versions = defaultdict(set)
|
||||
|
||||
# Read the version and filename from the first line of each data file.
|
||||
for line in unicode_data_files.splitlines():
|
||||
if line and line[0] != '#' and line[0] != '[':
|
||||
path = join(unicode_folder, line.rpartition('/')[-1])
|
||||
name = line.rpartition('/')[-1]
|
||||
path = join(unicode_folder, name)
|
||||
|
||||
with open(path, encoding='utf-8') as file:
|
||||
line = file.readline()
|
||||
@@ -1717,16 +1719,26 @@ def check_unicode_version(unicode_data_files):
|
||||
if line.startswith('# Version:'):
|
||||
ver = line.split()[-1]
|
||||
ver += '.0' * (2 - ver.count('.'))
|
||||
versions.add(ver)
|
||||
versions[ver].add(name)
|
||||
break
|
||||
elif line.endswith('.txt\n'):
|
||||
versions.add(line[ : -5].rpartition('-')[2])
|
||||
ver = line[ : -5].rpartition('-')[2]
|
||||
versions[ver].add(name)
|
||||
|
||||
if len(versions) != 1:
|
||||
raise ValueError('expected 1 version of Unicode, but found {}: {}'.format(len(versions),
|
||||
versions))
|
||||
if set(versions) == {'12.0.0', '12.1.0'} and versions['12.0.0'] == {'emoji-data.txt'}:
|
||||
# Cannot find emoji-data.txt for version 12.1.0.
|
||||
pass
|
||||
else:
|
||||
for ver, names in versions.items():
|
||||
print(ver, '=>', [name.strip('# \n') for name in names])
|
||||
|
||||
return versions.pop()
|
||||
raise ValueError('expected 1 version of Unicode, but found {}'.format(len(versions)))
|
||||
|
||||
def make_key(ver):
|
||||
return tuple(map(int, ver.split('.')))
|
||||
|
||||
return max(versions, key=make_key)
|
||||
|
||||
def download_files(unicode_version, unicode_data_files):
|
||||
'Downloads the Unicode data files from the website.'
|
||||
@@ -1739,7 +1751,11 @@ def download_files(unicode_version, unicode_data_files):
|
||||
if not isfile(versioned_path):
|
||||
url = urljoin(unicode_data_base, line)
|
||||
path = join(unicode_folder, line.rpartition('/')[-1])
|
||||
download_unicode_file(url, path)
|
||||
|
||||
try:
|
||||
download_unicode_file(url, path)
|
||||
except HTTPError:
|
||||
print('{} not found'.format(url))
|
||||
|
||||
def merge_ranges(ranges):
|
||||
'Sorts and merges a list of codepoint ranges.'
|
||||
@@ -2200,7 +2216,8 @@ def smallest_datatype(min_value, max_value):
|
||||
|
||||
# Whether to update the Unicode data files from the Unicode website.
|
||||
UPDATE = False
|
||||
UNICODE_VERSION = '11.0.0'
|
||||
#UPDATE = True
|
||||
UNICODE_VERSION = '12.1.0'
|
||||
|
||||
# Whether to recalculate the best block sizes for the tables.
|
||||
RECALC = False
|
||||
@@ -2209,7 +2226,7 @@ RECALC = False
|
||||
unicode_data_base = 'http://www.unicode.org/Public/UNIDATA/'
|
||||
|
||||
# The local folder in which the Unicode data files are stored.
|
||||
unicode_folder = join(dirname(__file__), 'Unicode')
|
||||
unicode_folder = r'D:\projects\Unicode\Data'
|
||||
|
||||
# The local folder in which the generated C files should be written.
|
||||
code_folder = join(dirname(__file__), 'regex')
|
||||
@@ -2257,7 +2274,7 @@ PropertyAliases.txt
|
||||
PropertyValueAliases.txt
|
||||
PropList.txt
|
||||
CaseFolding.txt
|
||||
UnicodeData.txt
|
||||
#UnicodeData.txt
|
||||
'''
|
||||
|
||||
# Ensure that we have downloaded the Unicode data files for UNICODE_VERSION
|
||||
|
Reference in New Issue
Block a user