diff --git a/.gitignore b/.gitignore index e8bf2cc..292b38e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ htmlcov data .DS_Store .idea +.venv diff --git a/bin/update-tables.py b/bin/update-tables.py index 0ff6ffc..2b6810b 100644 --- a/bin/update-tables.py +++ b/bin/update-tables.py @@ -24,23 +24,26 @@ import logging import datetime import functools +import collections import unicodedata from pathlib import Path from dataclasses import field, fields, dataclass from typing import Any, Mapping, Iterable, Iterator, Sequence, Container, Collection -from typing_extensions import Self # 3rd party import jinja2 import requests import urllib3.util import dateutil.parser +from typing_extensions import Self URL_UNICODE_DERIVED_AGE = 'https://www.unicode.org/Public/UCD/latest/ucd/DerivedAge.txt' URL_EASTASIAN_WIDTH = 'https://www.unicode.org/Public/{version}/ucd/EastAsianWidth.txt' URL_DERIVED_CATEGORY = 'https://www.unicode.org/Public/{version}/ucd/extracted/DerivedGeneralCategory.txt' +URL_EMOJI_ZWJ_SEQUENCES = 'https://unicode.org/Public/emoji/{version}/emoji-zwj-sequences.txt' EXCLUDE_VERSIONS = ['2.0.0', '2.1.2', '3.0.0', '3.1.0', '3.2.0', '4.0.0'] +EMOJI_LEGACY_ZWJ_VERSIONS = ['2.0', '3.0', '4.0', '5.0'] PATH_UP = os.path.relpath(os.path.join(os.path.dirname(__file__), os.path.pardir)) PATH_DATA = os.path.join(PATH_UP, 'data') @@ -52,7 +55,7 @@ JINJA_ENV = jinja2.Environment( loader=jinja2.FileSystemLoader(os.path.join(PATH_UP, 'code_templates')), keep_trailing_newline=True) -UTC_NOW = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC") +UTC_NOW = datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S UTC") CONNECT_TIMEOUT = int(os.environ.get('CONNECT_TIMEOUT', '10')) FETCH_BLOCKSIZE = int(os.environ.get('FETCH_BLOCKSIZE', '4096')) @@ -64,36 +67,63 @@ @dataclass(order=True, frozen=True) class UnicodeVersion: - """A class for camparable unicode version.""" + """A class for comparing 2 and 3-digit versions used in unicode data files""" 
major: int minor: int - micro: int + micro: int | None = None @classmethod def parse(cls, version_str: str) -> UnicodeVersion: """ - parse a version string. + Parse version strings used by unicode data files. >>> UnicodeVersion.parse("14.0.0") UnicodeVersion(major=14, minor=0, micro=0) + + >>> UnicodeVersion.parse("15.1") + UnicodeVersion(major=15, minor=1, micro=None) """ - return cls(*map(int, version_str.split(".")[:3])) + versions = list(map(int, version_str.split(".")[:3])) + while len(versions) < 3: + versions.append(None) + return cls(*versions) def __str__(self) -> str: """ >>> str(UnicodeVersion(12, 1, 0)) '12.1.0' + >>> str(UnicodeVersion(15, 1, None)) + '15.1' + """ - return f'{self.major}.{self.minor}.{self.micro}' + maybe_micro = '' + if self.micro is not None: + maybe_micro = f'.{self.micro}' + return f'{self.major}.{self.minor}{maybe_micro}' + + @property + def major_minor(self) -> str: + """ + >>> UnicodeVersion(11, 0, 0).major_minor + '11.0' + """ + return f'{self.major}.{self.minor}' @dataclass(frozen=True) class TableEntry: """An entry of a unicode table.""" - code_range: range | None + code_range: tuple[int, int] | None properties: tuple[str, ...] 
comment: str +@dataclass(frozen=True) +class SequenceEntry: + """An entry of a unicode sequence.""" + code_seq: str | None + description: str + comment: str + @dataclass class TableDef: @@ -102,9 +132,15 @@ class TableDef: values: list[tuple[str, str, str]] +@dataclass +class SequenceDef: + filename: str + date: str + sequences: dict[int, list[tuple[str, str]]] + + @dataclass(frozen=True) class RenderContext: - def to_dict(self) -> dict[str, Any]: return {field.name: getattr(self, field.name) for field in fields(self)} @@ -126,6 +162,12 @@ class UnicodeTableRenderCtx(RenderContext): table: Mapping[UnicodeVersion, TableDef] +@dataclass(frozen=True) +class UnicodeRegexRenderCtx(RenderContext): + variable_name: str + patterns: Mapping[UnicodeVersion, str] + + @dataclass class RenderDefinition: """Base class, do not instantiate it directly.""" @@ -200,6 +242,44 @@ def new(cls, filename: str, context: UnicodeTableRenderCtx) -> Self: render_context=context, ) +@dataclass +class UnicodeSequenceRenderDef(RenderDefinition): + render_context: UnicodeTableRenderCtx + + @classmethod + def new(cls, filename: str, context: UnicodeTableRenderCtx) -> Self: + _, ext = os.path.splitext(filename) + if ext == '.py': + jinja_filename = 'emoji_zwj_sequences.py.j2' + else: + raise ValueError('filename must be Python') + + return cls( + jinja_filename=jinja_filename, + output_filename=os.path.join(PATH_UP, 'wcwidth', filename), + render_context=context, + ) + +@dataclass +class UnicodeRegexRenderDef(RenderDefinition): + render_context: UnicodeRegexRenderCtx + + @classmethod + def new(cls, filename: str, context: UnicodeTableRenderCtx) -> Self: + _, ext = os.path.splitext(filename) + if ext == '.py': + jinja_filename = 're_patterns.py.j2' + else: + raise ValueError('filename must be Python') + + return cls( + jinja_filename=jinja_filename, + output_filename=os.path.join(PATH_UP, 'wcwidth', filename), + render_context=context, + ) + + + @functools.cache def fetch_unicode_versions() -> 
list[UnicodeVersion]: @@ -217,6 +297,28 @@ def fetch_unicode_versions() -> list[UnicodeVersion]: versions.sort() return versions +def fetch_zwj_versions() -> list[UnicodeVersion]: + """Determine Unicode Versions with Emoji Zero Width Join character support.""" + # From Unicode® Technical Standard #51 + # + # > Starting with Version 11.0 of this specification, the repertoire of + # > emoji characters is synchronized with the Unicode Standard, and has the + # > same version numbering system. For details, see Section 1.5.2, Versioning. + # + # http://www.unicode.org/reports/tr51/#Versioning + # + fname = os.path.join(PATH_DATA, URL_EMOJI_ZWJ_SEQUENCES.rsplit('/', 1)[-1]) + filename, ext = os.path.splitext(fname) + fname = filename + '-latest' + ext + do_retrieve(url=URL_EMOJI_ZWJ_SEQUENCES.format(version='latest'), fname=fname) + pattern = re.compile(r'.*# E([0-9.]+)') + versions = set() + with open(fname, encoding='utf-8') as f: + for line in f: + if match := re.match(pattern, line): + version = match.group(1) + versions.add(UnicodeVersion.parse(version)) + return sorted(versions) def fetch_source_headers() -> UnicodeVersionRstRenderCtx: # find all filenames with a version number in it, @@ -260,13 +362,25 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx: for version in fetch_unicode_versions(): fname = os.path.join(PATH_DATA, f'DerivedGeneralCategory-{version}.txt') do_retrieve(url=URL_DERIVED_CATEGORY.format(version=version), fname=fname) - # TODO: test whether all of category, 'Cf' should be 'zero - # width', or, just the subset 2060..2064, see open issue - # https://github.com/jquast/wcwidth/issues/26 - table[version] = parse_category(fname=fname, category_codes=('Me', 'Mn',)) + # Determine values of zero-width character lookup table by the following category codes + table[version] = parse_category(fname=fname, category_codes=('Me', 'Mn', 'Cf', 'Zl', 'Zp', 'Sk')) + + # Inject NULL into all table versions. 
+ table[version].values.append(('0x00000', '0x00000', name_ucs('\x00'))) + table[version].values.sort() return UnicodeTableRenderCtx('ZERO_WIDTH', table) +def fetch_emoji_zero_data() -> UnicodeTableRenderCtx: + """Fetch the latest emoji zero width joiner (ZWJ).""" + table: dict[UnicodeVersion, SequenceDef] = {} + for version in fetch_zwj_versions(): + fname = os.path.join(PATH_DATA, f'emoji-zwj-sequences-{version}.txt') + do_retrieve(url=URL_EMOJI_ZWJ_SEQUENCES.format(version=version), fname=fname) + table[version] = parse_zwj(fname=fname, version=version) + return UnicodeTableRenderCtx('EMOJI_ZWJ_SEQUENCES', table) + + def cite_source_description(filename: str) -> tuple[str, str]: """Return unicode.org source data file's own description as citation.""" with open(filename, encoding='utf-8') as f: @@ -276,13 +390,12 @@ def cite_source_description(filename: str) -> tuple[str, str]: return fname, date - -def make_table(values: Collection[int]) -> tuple[tuple[int, int], ...]: +def make_table(values: Collection[int]) -> list[tuple[int, int]]: """ - Return a tuple of lookup tables for given values. + Return a tuple of (start, end) lookup pairs for given sequence of sorted values. 
>>> make_table([0,1,2,5,6,7,9]) - ((0, 2), (5, 7), (9, 9)) + [(0, 2), (5, 7), (9, 9)] """ table: list[tuple[int, int]] = [] values_iter = iter(values) @@ -295,12 +408,17 @@ def make_table(values: Collection[int]) -> tuple[tuple[int, int], ...]: # continuation of existing range table.append((start, value,)) else: - # put back existing range, + # insert back previous range, table.append((start, end,)) # and start a new one table.append((value, value,)) - return tuple(table) + return table +def name_ucs(ucs: str) -> str | None: + try: + return string.capwords(unicodedata.name(ucs)) + except ValueError: + return None def convert_values_to_string_table( values: Collection[tuple[int, int]], @@ -308,17 +426,10 @@ def convert_values_to_string_table( """Convert integers into string table of (hex_start, hex_end, txt_description).""" pytable_values: list[tuple[str, str, str]] = [] for start, end in values: - hex_start, hex_end = (f'0x{start:05x}', f'0x{end:05x}') + hex_start, hex_end = f'0x{start:05x}', f'0x{end:05x}' ucs_start, ucs_end = chr(start), chr(end) - name_start, name_end = '(nil)', '(nil)' - try: - name_start = string.capwords(unicodedata.name(ucs_start)) - except ValueError: - pass - try: - name_end = string.capwords(unicodedata.name(ucs_end)) - except ValueError: - pass + name_start = name_ucs(ucs_start) or '(nil)' + name_end = name_ucs(ucs_end) or '(nil)' if name_start != name_end: txt_description = f'{name_start[:24].rstrip():24s}..{name_end[:24].rstrip()}' else: @@ -346,8 +457,7 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]: start, end = code_points_str.split('..') else: start = end = code_points_str - code_range = range(int(start, base=16), - int(end, base=16) + 1) + code_range = (int(start, base=16), int(end, base=16) + 1) yield TableEntry(code_range, tuple(properties), comment) @@ -364,17 +474,59 @@ def parse_category(fname: str, category_codes: Container[str]) -> TableDef: # and "date string" from second line date = 
next(table_iter).comment.split(':', 1)[1].strip() - values: set[int] = set() - for entry in table_iter: - if (entry.code_range is not None - and entry.properties[0] in category_codes): - values.update(entry.code_range) - - txt_values = convert_values_to_string_table(make_table(sorted(values))) + values: Iterator[int] = ( + codepoint + for entry in table_iter + if entry.code_range is not None and entry.properties[0] in category_codes + for codepoint in range(*entry.code_range) + ) + txt_values = convert_values_to_string_table(make_table(values)) print('ok') return TableDef(version, date, txt_values) +def parse_zwj_file(file: Iterable[str]) -> Iterator[SequenceEntry]: + """ + Parse Emoji ZWJ Sequences + + Format: + code_point(s) ; type_field ; description # comments + """ + for line in file: + data, _, comment = line.partition('#') + data_fields: Iterator[str] = (field.strip() for field in data.split(';')) + code_points_str, *type_description = data_fields + description = '' + if len(type_description) > 1: + description = type_description[1] + + if not code_points_str: + # ignore comments or empty lines, except for 'Date:' -- a marker + # found across all releases so far. 
+ if 'Date:' in comment: + yield SequenceEntry(None, None, comment) + continue + + hex_values = tuple(f'0x{int(code_point, 16):05x}' + for code_point in code_points_str.split()) + yield SequenceEntry(hex_values, description, comment) + + +def parse_zwj(fname: str, version: str) -> SequenceDef: + print(f'parsing {fname}: ', end='', flush=True) + with open(fname, encoding='utf-8') as f: + table_iter = parse_zwj_file(f) + date = next(table_iter).comment.split(':', 1)[1].strip() + # sequences are keyed by length + sequences = collections.defaultdict(list) + for entry in table_iter: + if entry.code_seq is not None: + sequences[len(entry.code_seq)].append((entry.code_seq, entry.description)) + sorted_sequences = collections.OrderedDict([(k, v) for k, v in sorted(sequences.items())]) + print('ok') + return SequenceDef(fname, date, sorted_sequences) + + @functools.cache def get_http_session() -> requests.Session: session = requests.Session() @@ -401,7 +553,7 @@ def is_url_newer(url: str, fname: str) -> bool: def do_retrieve(url: str, fname: str) -> None: """Retrieve given url to target filepath fname.""" folder = os.path.dirname(fname) - if not os.path.exists(folder): + if folder and not os.path.exists(folder): os.makedirs(folder) if not is_url_newer(url, fname): return @@ -429,11 +581,11 @@ def main() -> None: # code. 
def get_codegen_definitions() -> Iterator[RenderDefinition]: yield UnicodeVersionPyRenderDef.new( - UnicodeVersionPyRenderCtx(fetch_unicode_versions()) - ) + UnicodeVersionPyRenderCtx(versions=fetch_unicode_versions())) yield UnicodeVersionRstRenderDef.new(fetch_source_headers()) yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data()) yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data()) + yield UnicodeSequenceRenderDef.new('emoji_zwj_sequences.py', fetch_emoji_zero_data()) for render_def in get_codegen_definitions(): with open(render_def.output_filename, 'w', encoding='utf-8', newline='\n') as fout: @@ -442,6 +594,5 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]: fout.write(data) print('ok') - if __name__ == '__main__': main() diff --git a/bin/wcwidth-browser.py b/bin/wcwidth-browser.py index 51bcc21..86adac3 100755 --- a/bin/wcwidth-browser.py +++ b/bin/wcwidth-browser.py @@ -703,3 +703,6 @@ def main(opts): if __name__ == '__main__': sys.exit(main(validate_args(docopt.docopt(__doc__)))) + + +# TODO: --emoji-zwj --automatic-test !! diff --git a/code_templates/unicode_versions.py.j2 b/code_templates/unicode_versions.py.j2 index 09794f1..664c1e7 100644 --- a/code_templates/unicode_versions.py.j2 +++ b/code_templates/unicode_versions.py.j2 @@ -20,3 +20,15 @@ def list_versions(): "{{version}}", {%- endfor %} ) + +def list_zwj_versions(): + """ + Return Unicode Emoji version levels supported by this module release. 
+ + :rtype: list[str] + """ + return ( +{%- for zjw_version in zjw_versions %} + "{{zjw_version}}", +{%- endfor %} + ) diff --git a/docs/intro.rst b/docs/intro.rst index 8a02758..72ab930 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -217,6 +217,14 @@ Other Languages History ======= +0.3.0 **unreleased experimental** + * **Bugfix**: ``wcswidth()`` for zero-width join characters such as used in emoji + sequences, and variations such as emoji skin tone, as well as many other + non-printable characters now correctly identified as zero-width. + * **Enhancement**: new function ``width()``, a copy of ``wcswidth()`` without + the POSIX compliance to return total width of -1 for any C0 or C1 control + characters, those characters measured as width 0 by this new function. + 0.2.8 *2023-09-30* * Include requirements files in the source distibution (`PR #82`). diff --git a/requirements-update.in b/requirements-update.in index 65817a1..08efe1b 100644 --- a/requirements-update.in +++ b/requirements-update.in @@ -2,3 +2,4 @@ typing-extensions jinja2 requests python-dateutil +#trieregex diff --git a/wcwidth/__init__.py b/wcwidth/__init__.py index 97c423d..3fae280 100644 --- a/wcwidth/__init__.py +++ b/wcwidth/__init__.py @@ -4,23 +4,31 @@ https://github.com/jquast/wcwidth """ # re-export all functions & definitions, even private ones, from top-level -# module path, to allow for 'from wcwidth import _private_func'. Of course, -# user beware that any _private function may disappear or change signature at -# any future version. +# module path, to allow for 'from wcwidth import _private_func' if necessary. +# Of course, user beware that any _private function may disappear or change +# signature at any future version. This is also a bit odd in that. +# +# This effort flattens the statement, 'from wcwidth.wcwidth import wcwidth' into +# 'from wcwidth import wcwidth'. 
# local -from .wcwidth import ZERO_WIDTH # noqa -from .wcwidth import (WIDE_EASTASIAN, - wcwidth, - wcswidth, - _bisearch, - list_versions, - _wcmatch_version, - _wcversion_value) +from .wcwidth import ( + width, + wcwidth, + wcswidth, + list_versions, + _wcwidth, + _wcswidth, + _bisearch, + _wcmatch_version, + _wcversion_value, + ZERO_WIDTH, + WIDE_EASTASIAN) # The __all__ attribute defines the items exported from statement, -# 'from wcwidth import *', but also to say, "This is the public API". -__all__ = ('wcwidth', 'wcswidth', 'list_versions') +# 'from wcwidth import *', but also to say, "This is the public API" +# that is qualified by semver version number management. +__all__ = ('width', 'wcwidth', 'wcswidth', 'list_versions') # We also used pkg_resources to load unicode version tables from version.json, # generated by bin/update-tables.py, but some environments are unable to diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py index 02afd5c..d951bb6 100644 --- a/wcwidth/table_wide.py +++ b/wcwidth/table_wide.py @@ -1,7 +1,7 @@ """ Exports WIDE_EASTASIAN table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2023-09-14 15:45:33 UTC. +This code generated by wcwidth/bin/update-tables.py on 2023-09-30 05:01:27 UTC. 
""" WIDE_EASTASIAN = { '4.1.0': ( @@ -201,7 +201,8 @@ (0x1f210, 0x1f23a,), # Squared Cjk Unified Ideo..Squared Cjk Unified Ideo (0x1f240, 0x1f248,), # Tortoise Shell Bracketed..Tortoise Shell Bracketed (0x1f250, 0x1f251,), # Circled Ideograph Advant..Circled Ideograph Accept - (0x20000, 0x2fffd,), # Cjk Unified Ideograph-20..(nil) + (0x20000, 0x2f73f,), # Cjk Unified Ideograph-20..(nil) + (0x2b740, 0x2fffd,), # Cjk Unified Ideograph-2b..(nil) (0x30000, 0x3fffd,), # Cjk Unified Ideograph-30..(nil) ), '6.1.0': ( diff --git a/wcwidth/table_zero.py b/wcwidth/table_zero.py index 1eef1ce..c63dd54 100644 --- a/wcwidth/table_zero.py +++ b/wcwidth/table_zero.py @@ -1,14 +1,28 @@ """ Exports ZERO_WIDTH table keyed by supporting unicode version level. -This code generated by wcwidth/bin/update-tables.py on 2023-09-14 15:45:33 UTC. +This code generated by wcwidth/bin/update-tables.py on 2023-09-30 05:01:27 UTC. """ ZERO_WIDTH = { '4.1.0': ( # Source: DerivedGeneralCategory-4.1.0.txt # Date: 2005-02-26, 02:35:50 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002ed,), # Modifier Letter Extra-hi..Modifier Letter Unaspira + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le + (0x00374, 0x00375,), # Greek Numeral Sign ..Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos (0x00483, 0x00486,), # Combining Cyrillic Titlo..Combining Cyrillic Psili (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005b9,), # Hebrew 
Accent Etnahta ..Hebrew Point Holam @@ -17,13 +31,17 @@ (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00603,), # Arabic Number Sign ..Arabic Sign Safha (0x00610, 0x00615,), # Arabic Sign Sallallahou ..Arabic Small High Tah (0x0064b, 0x0065e,), # Arabic Fathatan ..Arabic Fatha With Two Do (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen - (0x006de, 0x006e4,), # Arabic Start Of Rub El H..Arabic Small High Madda + (0x006dd, 0x006dd,), # Arabic End Of Ayah + (0x006de, 0x006de,), # Arabic Start Of Rub El Hizb + (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -99,6 +117,7 @@ (0x01732, 0x01734,), # Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod (0x01752, 0x01753,), # Buhid Vowel Sign I ..Buhid Vowel Sign U (0x01772, 0x01773,), # Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U + (0x017b4, 0x017b5,), # Khmer Vowel Inherent Aq ..Khmer Vowel Inherent Aa (0x017b7, 0x017bd,), # Khmer Vowel Sign I ..Khmer Vowel Sign Ua (0x017c6, 0x017c6,), # Khmer Sign Nikahit (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat @@ -111,32 +130,71 @@ (0x01939, 0x0193b,), # Limbu Sign Mukphreng ..Limbu Sign Sa-i (0x01a17, 0x01a18,), # Buginese Vowel Sign I ..Buginese Vowel Sign U (0x01dc0, 0x01dc3,), # Combining Dotted Grave A..Combining Suspension Mar - (0x020d0, 0x020eb,), # Combining Left Harpoon A..Combining Long Double So + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 
0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02063,), # Word Joiner ..Invisible Separator + (0x0206a, 0x0206f,), # Inhibit Symmetric Swappi..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020eb,), # Combining Reverse Solidu..Combining Long Double So (0x0302a, 0x0302f,), # Ideographic Level Tone M..Hangul Double Dot Tone M (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara (0x0a825, 0x0a826,), # Syloti Nagri Vowel Sign ..Syloti Nagri Vowel Sign (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe23,), # Combining Ligature Left ..Combining Double Tilde R + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x10a01, 0x10a03,), # Kharoshthi Vowel Sign I 
..Kharoshthi Vowel Sign Vo (0x10a05, 0x10a06,), # Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O (0x10a0c, 0x10a0f,), # Kharoshthi Vowel Length ..Kharoshthi Sign Visarga (0x10a38, 0x10a3a,), # Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo (0x10a3f, 0x10a3f,), # Kharoshthi Virama (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '5.0.0': ( # Source: DerivedGeneralCategory-5.0.0.txt # Date: 2006-02-27, 23:41:27 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002ed,), # Modifier Letter Extra-hi..Modifier Letter Unaspira + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le + (0x00374, 0x00375,), # Greek Numeral Sign ..Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos (0x00483, 0x00486,), # Combining Cyrillic Titlo..Combining Cyrillic Psili (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg @@ 
-144,13 +202,17 @@ (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00603,), # Arabic Number Sign ..Arabic Sign Safha (0x00610, 0x00615,), # Arabic Sign Sallallahou ..Arabic Small High Tah (0x0064b, 0x0065e,), # Arabic Fathatan ..Arabic Fatha With Two Do (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen - (0x006de, 0x006e4,), # Arabic Start Of Rub El H..Arabic Small High Madda + (0x006dd, 0x006dd,), # Arabic End Of Ayah + (0x006de, 0x006de,), # Arabic Start Of Rub El Hizb + (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -228,6 +290,7 @@ (0x01732, 0x01734,), # Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod (0x01752, 0x01753,), # Buhid Vowel Sign I ..Buhid Vowel Sign U (0x01772, 0x01773,), # Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U + (0x017b4, 0x017b5,), # Khmer Vowel Inherent Aq ..Khmer Vowel Inherent Aa (0x017b7, 0x017bd,), # Khmer Vowel Sign I ..Khmer Vowel Sign Ua (0x017c6, 0x017c6,), # Khmer Sign Nikahit (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat @@ -247,45 +310,91 @@ (0x01b6b, 0x01b73,), # Balinese Musical Symbol ..Balinese Musical Symbol (0x01dc0, 0x01dca,), # Combining Dotted Grave A..Combining Latin Small Le (0x01dfe, 0x01dff,), # Combining Left Arrowhead..Combining Right Arrowhea - (0x020d0, 0x020ef,), # Combining Left Harpoon A..Combining Right Arrow Be + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek 
Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02063,), # Word Joiner ..Invisible Separator + (0x0206a, 0x0206f,), # Inhibit Symmetric Swappi..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020ef,), # Combining Reverse Solidu..Combining Right Arrow Be (0x0302a, 0x0302f,), # Ideographic Level Tone M..Hangul Double Dot Tone M (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara (0x0a825, 0x0a826,), # Syloti Nagri Vowel Sign ..Syloti Nagri Vowel Sign (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe23,), # Combining Ligature Left ..Combining Double Tilde R + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear 
Annotation T (0x10a01, 0x10a03,), # Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo (0x10a05, 0x10a06,), # Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O (0x10a0c, 0x10a0f,), # Kharoshthi Vowel Length ..Kharoshthi Sign Visarga (0x10a38, 0x10a3a,), # Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo (0x10a3f, 0x10a3f,), # Kharoshthi Virama (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '5.1.0': ( # Source: DerivedGeneralCategory-5.1.0.txt # Date: 2008-03-20, 17:54:57 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic 
Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00603,), # Arabic Number Sign ..Arabic Sign Safha (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra (0x0064b, 0x0065e,), # Arabic Fathatan ..Arabic Fatha With Two Do (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen - (0x006de, 0x006e4,), # Arabic Start Of Rub El H..Arabic Small High Madda + (0x006dd, 0x006dd,), # Arabic End Of Ayah + (0x006de, 0x006de,), # Arabic Start Of Rub El Hizb + (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -373,6 +482,7 @@ (0x01732, 0x01734,), # Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod (0x01752, 0x01753,), # Buhid Vowel Sign I ..Buhid Vowel Sign U (0x01772, 0x01773,), # Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U + (0x017b4, 0x017b5,), # Khmer Vowel Inherent Aq ..Khmer Vowel Inherent Aa (0x017b7, 0x017bd,), # Khmer Vowel Sign I ..Khmer Vowel Sign Ua (0x017c6, 0x017c6,), # Khmer Sign Nikahit (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat @@ -397,12 +507,32 @@ (0x01c36, 0x01c37,), # Lepcha Sign Ran ..Lepcha Sign Nukta (0x01dc0, 0x01de6,), # Combining Dotted Grave A..Combining Latin Small Le (0x01dfe, 0x01dff,), # Combining Left 
Arrowhead..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x0206a, 0x0206f,), # Inhibit Symmetric Swappi..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302f,), # Ideographic Level Tone M..Hangul Double Dot Tone M (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a67c, 0x0a67d,), # Combining Cyrillic Kavyk..Combining Cyrillic Payer + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri 
Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -418,6 +548,11 @@ (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe26,), # Combining Ligature Left ..Combining Conjoining Mac + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x10a01, 0x10a03,), # Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo (0x10a05, 0x10a06,), # Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O @@ -425,30 +560,53 @@ (0x10a38, 0x10a3a,), # Kharoshthi Sign Bar Abov..Kharoshthi Sign Dot Belo (0x10a3f, 0x10a3f,), # Kharoshthi Virama (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '5.2.0': ( # Source: DerivedGeneralCategory-5.2.0.txt # Date: 2009-08-22, 04:58:21 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier 
Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00603,), # Arabic Number Sign ..Arabic Sign Safha (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra (0x0064b, 0x0065e,), # Arabic Fathatan ..Arabic Fatha With Two Do (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen - (0x006de, 0x006e4,), # Arabic Start Of Rub El H..Arabic Small High Madda + (0x006dd, 0x006dd,), # Arabic End Of Ayah + (0x006de, 0x006de,), # Arabic Start Of Rub El Hizb + (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -541,6 +699,7 @@ (0x01732, 0x01734,), # Hanunoo Vowel Sign I 
..Hanunoo Sign Pamudpod (0x01752, 0x01753,), # Buhid Vowel Sign I ..Buhid Vowel Sign U (0x01772, 0x01773,), # Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U + (0x017b4, 0x017b5,), # Khmer Vowel Inherent Aq ..Khmer Vowel Inherent Aa (0x017b7, 0x017bd,), # Khmer Vowel Sign I ..Khmer Vowel Sign Ua (0x017c6, 0x017c6,), # Khmer Sign Nikahit (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat @@ -576,14 +735,34 @@ (0x01ced, 0x01ced,), # Vedic Sign Tiryak (0x01dc0, 0x01de6,), # Combining Dotted Grave A..Combining Latin Small Le (0x01dfd, 0x01dff,), # Combining Almost Equal T..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x0206a, 0x0206f,), # Inhibit Symmetric Swappi..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302f,), # Ideographic Level Tone M..Hangul Double Dot Tone M (0x03099, 0x0309a,), # Combining 
Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a67c, 0x0a67d,), # Combining Cyrillic Kavyk..Combining Cyrillic Payer (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -612,6 +791,11 @@ (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe26,), # Combining Ligature Left ..Combining Conjoining Mac + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x10a01, 0x10a03,), # Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo (0x10a05, 0x10a06,), # Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O @@ -621,31 +805,54 @@ (0x11080, 0x11081,), # Kaithi Sign Candrabindu ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol 
Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '6.0.0': ( # Source: DerivedGeneralCategory-6.0.0.txt # Date: 2010-08-19, 00:48:09 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00603,), # Arabic Number Sign ..Arabic Sign Safha (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra (0x0064b, 
0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -740,6 +947,7 @@ (0x01732, 0x01734,), # Hanunoo Vowel Sign I ..Hanunoo Sign Pamudpod (0x01752, 0x01753,), # Buhid Vowel Sign I ..Buhid Vowel Sign U (0x01772, 0x01773,), # Tagbanwa Vowel Sign I ..Tagbanwa Vowel Sign U + (0x017b4, 0x017b5,), # Khmer Vowel Inherent Aq ..Khmer Vowel Inherent Aa (0x017b7, 0x017bd,), # Khmer Vowel Sign I ..Khmer Vowel Sign Ua (0x017c6, 0x017c6,), # Khmer Sign Nikahit (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat @@ -779,15 +987,35 @@ (0x01ced, 0x01ced,), # Vedic Sign Tiryak (0x01dc0, 0x01de6,), # Combining Dotted Grave A..Combining Latin Small Le (0x01dfc, 0x01dff,), # Combining Double Inverte..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 
0x02064,), # Word Joiner ..Invisible Plus + (0x0206a, 0x0206f,), # Inhibit Symmetric Swappi..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302f,), # Ideographic Level Tone M..Hangul Double Dot Tone M (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a67c, 0x0a67d,), # Combining Cyrillic Kavyk..Combining Cyrillic Payer (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -814,8 +1042,14 @@ (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe26,), # Combining Ligature Left 
..Combining Conjoining Mac + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x10a01, 0x10a03,), # Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo (0x10a05, 0x10a06,), # Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O @@ -827,31 +1061,54 @@ (0x11080, 0x11081,), # Kaithi Sign Candrabindu ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '6.1.0': ( # Source: DerivedGeneralCategory-6.1.0.txt # Date: 2011-11-27, 05:10:22 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 
0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00604,), # Arabic Number Sign ..Arabic Sign Samvat (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -989,16 +1246,36 @@ (0x01cf4, 0x01cf4,), # Vedic Tone Candra Above (0x01dc0, 0x01de6,), # Combining Dotted Grave A..Combining Latin Small Le (0x01dfc, 0x01dff,), # Combining Double Inverte..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek 
Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x0206a, 0x0206f,), # Inhibit Symmetric Swappi..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69f, 0x0a69f,), # Combining Cyrillic Letter Iotified E (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon 
..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -1027,8 +1304,14 @@ (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe26,), # Combining Ligature Left ..Combining Conjoining Mac + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x10a01, 0x10a03,), # Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo (0x10a05, 0x10a06,), # Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O @@ -1040,6 +1323,7 @@ (0x11080, 0x11081,), # Kaithi Sign Candrabindu ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -1051,30 +1335,52 @@ (0x116b7, 0x116b7,), # Takri Sign Nukta (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol 
Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '6.2.0': ( # Source: DerivedGeneralCategory-6.2.0.txt # Date: 2012-05-20, 00:42:34 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00604,), # Arabic Number Sign ..Arabic Sign Samvat (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small 
High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -1212,16 +1518,36 @@ (0x01cf4, 0x01cf4,), # Vedic Tone Candra Above (0x01dc0, 0x01de6,), # Combining Dotted Grave A..Combining Latin Small Le (0x01dfc, 0x01dff,), # Combining Double Inverte..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x0206a, 0x0206f,), # Inhibit Symmetric Swappi..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic 
Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69f, 0x0a69f,), # Combining Cyrillic Letter Iotified E (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -1250,8 +1576,14 @@ (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe26,), # Combining Ligature Left ..Combining Conjoining Mac + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x10a01, 0x10a03,), # Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo (0x10a05, 0x10a06,), # Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O @@ -1263,6 +1595,7 @@ (0x11080, 0x11081,), # Kaithi Sign Candrabindu ..Kaithi 
Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -1274,30 +1607,53 @@ (0x116b7, 0x116b7,), # Takri Sign Nukta (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '6.3.0': ( # Source: DerivedGeneralCategory-6.3.0.txt # Date: 2013-07-05, 14:08:45 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + 
(0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00604,), # Arabic Number Sign ..Arabic Sign Samvat (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -1399,6 +1755,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga (0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U (0x01927, 0x01928,), # Limbu Vowel Sign E ..Limbu Vowel Sign O @@ -1436,16 +1793,36 @@ (0x01cf4, 0x01cf4,), # Vedic Tone Candra Above (0x01dc0, 0x01de6,), # Combining Dotted Grave A..Combining Latin Small Le (0x01dfc, 
0x01dff,), # Combining Double Inverte..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69f, 0x0a69f,), # Combining Cyrillic Letter Iotified E (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark 
Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -1474,8 +1851,14 @@ (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe26,), # Combining Ligature Left ..Combining Conjoining Mac + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x10a01, 0x10a03,), # Kharoshthi Vowel Sign I ..Kharoshthi Vowel Sign Vo (0x10a05, 0x10a06,), # Kharoshthi Vowel Sign E ..Kharoshthi Vowel Sign O @@ -1487,6 +1870,7 @@ (0x11080, 0x11081,), # Kaithi Sign Candrabindu ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -1498,30 +1882,53 @@ (0x116b7, 0x116b7,), # Takri Sign Nukta (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical 
Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '7.0.0': ( # Source: DerivedGeneralCategory-7.0.0.txt # Date: 2014-02-07, 18:42:12 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 
0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -1625,6 +2032,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga (0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U (0x01927, 0x01928,), # Limbu Vowel Sign E ..Limbu Vowel Sign O @@ -1639,7 +2047,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01abe,), # Combining Doubled Circum..Combining Parentheses Ov + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -1664,16 +2073,36 @@ (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01df5,), # Combining Dotted Grave A..Combining Up Tack 
Above (0x01dfc, 0x01dff,), # Combining Double Inverte..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69f, 0x0a69f,), # Combining Cyrillic Letter Iotified E (0x0a6f0, 0x0a6f1,), # Bamum 
Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -1700,12 +2129,19 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2d,), # Combining Ligature Left ..Combining Conjoining Mac + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -1720,6 +2156,7 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma 
Maayyaa @@ -1754,32 +2191,56 @@ (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining (0x1d242, 0x1d244,), # Combining Greek Musical ..Combining Greek Musical (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '8.0.0': ( # Source: DerivedGeneralCategory-8.0.0.txt # Date: 2015-02-13, 13:47:11 GMT [MD] # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 
0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -1883,6 +2344,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga (0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U (0x01927, 0x01928,), # Limbu Vowel Sign E ..Limbu Vowel Sign O @@ -1897,7 +2359,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01abe,), # Combining Doubled 
Circum..Combining Parentheses Ov + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -1922,16 +2385,36 @@ (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01df5,), # Combining Dotted Grave A..Combining Up Tack Above (0x01dfc, 0x01dff,), # Combining Double Inverte..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton 
(0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -1958,12 +2441,19 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact 
Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -1978,6 +2468,7 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -2017,7 +2508,9 @@ (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -2029,26 +2522,49 @@ (0x1da9b, 0x1da9f,), # Signwriting Fill Modifie..Signwriting Fill Modifie (0x1daa1, 0x1daaf,), # Signwriting Rotation Mod..Signwriting Rotation Mod (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '9.0.0': ( # Source: DerivedGeneralCategory-9.0.0.txt # Date: 2016-06-01, 10:34:26 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # 
Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ 
-2059,6 +2575,7 @@ (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark (0x008d4, 0x008e1,), # Arabic Small High Word A..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -2153,6 +2670,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga (0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U @@ -2168,7 +2686,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01abe,), # Combining Doubled Circum..Combining Parentheses Ov + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -2193,16 +2712,36 @@ (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01df5,), # Combining Dotted Grave A..Combining Up Tack Above (0x01dfb, 0x01dff,), # Combining Deletion Mark ..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And 
Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # 
Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -2229,12 +2768,19 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -2249,6 +2795,7 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -2299,7 +2846,9 @@ (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand 
Format Up Step (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -2317,26 +2866,49 @@ (0x1e026, 0x1e02a,), # Combining Glagolitic Let..Combining Glagolitic Let (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '10.0.0': ( # Source: DerivedGeneralCategory-10.0.0.txt # Date: 2017-03-08, 08:41:49 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli 
(0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -2347,6 +2919,7 @@ (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark (0x008d4, 0x008e1,), # Arabic Small High Word A..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -2443,6 +3016,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga 
(0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U @@ -2458,7 +3032,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01abe,), # Combining Doubled Circum..Combining Parentheses Ov + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -2483,16 +3058,36 @@ (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01df9,), # Combining Dotted Grave A..Combining Wide Inverted (0x01dfb, 0x01dff,), # Combining Deletion Mark ..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 
0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -2519,12 +3114,19 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break 
Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -2539,6 +3141,7 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -2603,7 +3206,9 @@ (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -2621,26 +3226,49 @@ (0x1e026, 0x1e02a,), # Combining Glagolitic Let..Combining Glagolitic Let (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation 
Selector-256 ), '11.0.0': ( # Source: DerivedGeneralCategory-11.0.0.txt # Date: 2018-02-21, 05:34:04 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic 
Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -2652,6 +3280,7 @@ (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark (0x008d3, 0x008e1,), # Arabic Small Low Waw ..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -2750,6 +3379,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga (0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U @@ -2765,7 +3395,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01abe,), # Combining Doubled Circum..Combining Parentheses Ov + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -2790,16 +3421,36 @@ (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01df9,), # Combining Dotted Grave A..Combining Wide 
Inverted (0x01dfb, 0x01dff,), # Combining Deletion Mark ..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), 
# Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -2827,12 +3478,19 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -2849,6 +3507,8 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign + (0x110cd, 0x110cd,), # Kaithi Number Sign Above (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel 
Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -2919,7 +3579,9 @@ (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -2937,26 +3599,49 @@ (0x1e026, 0x1e02a,), # Combining Glagolitic Let..Combining Glagolitic Let (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '12.0.0': ( # Source: DerivedGeneralCategory-12.0.0.txt # Date: 2019-01-22, 08:18:28 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining 
Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -2968,6 +3653,7 @@ (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark (0x008d3, 0x008e1,), # Arabic Small Low Waw ..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -3065,6 +3751,7 @@ (0x017c9, 0x017d3,), # 
Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga (0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U @@ -3080,7 +3767,8 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01abe,), # Combining Doubled Circum..Combining Parentheses Ov + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -3105,16 +3793,36 @@ (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01df9,), # Combining Dotted Grave A..Combining Wide Inverted (0x01dfb, 0x01dff,), # Combining Deletion Mark ..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # 
Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -3142,12 +3850,19 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # 
Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -3164,6 +3879,8 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign + (0x110cd, 0x110cd,), # Kaithi Number Sign Above (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -3233,12 +3950,15 @@ (0x11d95, 0x11d95,), # Gunjala Gondi Sign Anusvara (0x11d97, 0x11d97,), # Gunjala Gondi Virama (0x11ef3, 0x11ef4,), # Makasar Vowel Sign I ..Makasar Vowel Sign U + (0x13430, 0x13438,), # Egyptian Hieroglyph Vert..Egyptian Hieroglyph End (0x16af0, 0x16af4,), # Bassa Vah Combining High..Bassa Vah Combining High (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f4f, 0x16f4f,), # Miao Sign Consonant Modifier Bar (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol 
Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -3258,26 +3978,49 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '12.1.0': ( # Source: DerivedGeneralCategory-12.1.0.txt # Date: 2019-03-10, 10:53:08 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew 
Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -3289,6 +4032,7 @@ (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark (0x008d3, 0x008e1,), # Arabic Small Low Waw ..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -3386,6 +4130,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga (0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U @@ -3401,7 +4146,8 @@ (0x01a65, 
0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01abe,), # Combining Doubled Circum..Combining Parentheses Ov + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -3426,16 +4172,36 @@ (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01df9,), # Combining Dotted Grave A..Combining Wide Inverted (0x01dfb, 0x01dff,), # Combining Deletion Mark ..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining 
Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -3463,12 +4229,19 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave 
Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -3485,6 +4258,8 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign + (0x110cd, 0x110cd,), # Kaithi Number Sign Above (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -3554,12 +4329,15 @@ (0x11d95, 0x11d95,), # Gunjala Gondi Sign Anusvara (0x11d97, 0x11d97,), # Gunjala Gondi Virama (0x11ef3, 0x11ef4,), # Makasar Vowel Sign I ..Makasar Vowel Sign U + (0x13430, 0x13438,), # Egyptian Hieroglyph Vert..Egyptian Hieroglyph End (0x16af0, 0x16af4,), # Bassa Vah Combining High..Bassa Vah Combining High (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f4f, 0x16f4f,), # Miao Sign Consonant Modifier Bar (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -3579,26 +4357,49 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining 
..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '13.0.0': ( # Source: DerivedGeneralCategory-13.0.0.txt # Date: 2019-10-21, 14:30:32 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy 
Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -3610,6 +4411,7 @@ (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark (0x008d3, 0x008e1,), # Arabic Small Low Waw ..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -3708,6 +4510,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga (0x01920, 0x01922,), # Limbu Vowel Sign A ..Limbu Vowel Sign U @@ -3723,7 +4526,9 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01ac0,), # Combining Doubled Circum..Combining Latin Small Le + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay + (0x01abf, 0x01ac0,), # Combining 
Latin Small Le..Combining Latin Small Le (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -3748,16 +4553,36 @@ (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01df9,), # Combining Dotted Grave A..Combining Wide Inverted (0x01dfb, 0x01dff,), # Combining Deletion Mark ..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic 
Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -3786,12 +4611,20 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve + (0x0ab6a, 0x0ab6b,), # Modifier Letter Left Tac..Modifier Letter Right Ta (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc1,), # Arabic Symbol Dot Above ..Arabic Symbol Small Tah (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining 
Old Permic Let..Combining Old Permic Let @@ -3809,6 +4642,8 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign + (0x110cd, 0x110cd,), # Kaithi Number Sign Above (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -3882,13 +4717,16 @@ (0x11d95, 0x11d95,), # Gunjala Gondi Sign Anusvara (0x11d97, 0x11d97,), # Gunjala Gondi Virama (0x11ef3, 0x11ef4,), # Makasar Vowel Sign I ..Makasar Vowel Sign U + (0x13430, 0x13438,), # Egyptian Hieroglyph Vert..Egyptian Hieroglyph End (0x16af0, 0x16af4,), # Bassa Vah Combining High..Bassa Vah Combining High (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f4f, 0x16f4f,), # Miao Sign Consonant Modifier Bar (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x16fe4, 0x16fe4,), # Khitan Small Script Filler (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -3908,26 +4746,49 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag 
Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '14.0.0': ( # Source: DerivedGeneralCategory-14.0.0.txt # Date: 2021-07-10, 00:35:08 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High 
Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -3938,8 +4799,11 @@ (0x00825, 0x00827,), # Samaritan Vowel Sign Sho..Samaritan Vowel Sign U (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark + (0x00888, 0x00888,), # Arabic Raised Round Dot + (0x00890, 0x00891,), # Arabic Pound Mark Above ..Arabic Piastre Mark Abov (0x00898, 0x0089f,), # Arabic Small High Word A..Arabic Half Madda Over M (0x008ca, 0x008e1,), # Arabic Small High Farsi ..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -4039,6 +4903,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x0180f, 0x0180f,), # Mongolian Free Variation Selector Four (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga @@ -4055,7 +4920,9 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01ace,), # Combining Doubled Circum..Combining Latin Small Le + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining 
Parentheses Overlay + (0x01abf, 0x01ace,), # Combining Latin Small Le..Combining Latin Small Le (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -4079,16 +4946,36 @@ (0x01cf4, 0x01cf4,), # Vedic Tone Candra Above (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01dff,), # Combining Dotted Grave A..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # 
Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -4117,12 +5004,20 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve + (0x0ab6a, 0x0ab6b,), # Modifier Letter Left Tac..Modifier Letter Right Ta (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc2,), # Arabic Symbol Dot Above ..Arabic Symbol Wasla Abov (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 
0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -4143,7 +5038,9 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x110c2, 0x110c2,), # Kaithi Vowel Sign Vocalic R + (0x110cd, 0x110cd,), # Kaithi Number Sign Above (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -4217,15 +5114,18 @@ (0x11d95, 0x11d95,), # Gunjala Gondi Sign Anusvara (0x11d97, 0x11d97,), # Gunjala Gondi Virama (0x11ef3, 0x11ef4,), # Makasar Vowel Sign I ..Makasar Vowel Sign U + (0x13430, 0x13438,), # Egyptian Hieroglyph Vert..Egyptian Hieroglyph End (0x16af0, 0x16af4,), # Bassa Vah Combining High..Bassa Vah Combining High (0x16b30, 0x16b36,), # Pahawh Hmong Mark Cim Tu..Pahawh Hmong Mark Cim Ta (0x16f4f, 0x16f4f,), # Miao Sign Consonant Modifier Bar (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x16fe4, 0x16fe4,), # Khitan Small Script Filler (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1cf00, 0x1cf2d,), # Znamenny Combining Mark ..Znamenny Combining Mark (0x1cf30, 0x1cf46,), # Znamenny Combining Tonal..Znamenny Priznak Modifie (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -4246,26 +5146,49 @@ (0x1e2ec, 0x1e2ef,), # Wancho Tone Tup ..Wancho Tone Koini (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende 
Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '15.0.0': ( # Source: DerivedGeneralCategory-15.0.0.txt # Date: 2022-04-26, 23:14:35 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below 
(0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -4276,8 +5199,11 @@ (0x00825, 0x00827,), # Samaritan Vowel Sign Sho..Samaritan Vowel Sign U (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark + (0x00888, 0x00888,), # Arabic Raised Round Dot + (0x00890, 0x00891,), # Arabic Pound Mark Above ..Arabic Piastre Mark Abov (0x00898, 0x0089f,), # Arabic Small High Word A..Arabic Half Madda Over M (0x008ca, 0x008e1,), # Arabic Small High Farsi ..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -4377,6 +5303,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x0180f, 0x0180f,), # Mongolian Free Variation Selector Four (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali Gali Dagalga @@ -4393,7 +5320,9 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining 
Cryptogrammic Dot - (0x01ab0, 0x01ace,), # Combining Doubled Circum..Combining Latin Small Le + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay + (0x01abf, 0x01ace,), # Combining Latin Small Le..Combining Latin Small Le (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -4417,16 +5346,36 @@ (0x01cf4, 0x01cf4,), # Vedic Tone Candra Above (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01dff,), # Combining Dotted Grave A..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic 
Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -4455,12 +5404,20 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve + (0x0ab6a, 0x0ab6b,), # Modifier Letter Left Tac..Modifier Letter Right Ta (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc2,), # Arabic Symbol Dot Above ..Arabic Symbol Wasla Abov (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + 
(0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -4482,7 +5439,9 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x110c2, 0x110c2,), # Kaithi Vowel Sign Vocalic R + (0x110cd, 0x110cd,), # Kaithi Number Sign Above (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -4561,6 +5520,7 @@ (0x11f36, 0x11f3a,), # Kawi Vowel Sign I ..Kawi Vowel Sign Vocalic (0x11f40, 0x11f40,), # Kawi Vowel Sign Eu (0x11f42, 0x11f42,), # Kawi Conjoiner + (0x13430, 0x1343f,), # Egyptian Hieroglyph Vert..Egyptian Hieroglyph End (0x13440, 0x13440,), # Egyptian Hieroglyph Mirror Horizontally (0x13447, 0x13455,), # Egyptian Hieroglyph Modi..Egyptian Hieroglyph Modi (0x16af0, 0x16af4,), # Bassa Vah Combining High..Bassa Vah Combining High @@ -4569,9 +5529,11 @@ (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x16fe4, 0x16fe4,), # Khitan Small Script Filler (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1cf00, 0x1cf2d,), # Znamenny Combining Mark ..Znamenny Combining Mark (0x1cf30, 0x1cf46,), # Znamenny Combining Tonal..Znamenny Priznak Modifie (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining 
(0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -4594,26 +5556,49 @@ (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), '15.1.0': ( # Source: DerivedGeneralCategory-15.1.0.txt # Date: 2023-07-28, 23:34:02 GMT # + (0x00000, 0x00000,), # None + (0x0005e, 0x0005e,), # Circumflex Accent + (0x00060, 0x00060,), # Grave Accent + (0x000a8, 0x000a8,), # Diaeresis + (0x000ad, 0x000ad,), # Soft Hyphen + (0x000af, 0x000af,), # Macron + (0x000b4, 0x000b4,), # Acute Accent + (0x000b8, 0x000b8,), # Cedilla + (0x002c2, 0x002c5,), # Modifier Letter Left Arr..Modifier Letter Down Arr + (0x002d2, 0x002df,), # Modifier Letter Centred ..Modifier Letter Cross Ac + (0x002e5, 0x002eb,), # Modifier Letter Extra-hi..Modifier Letter Yang Dep + (0x002ed, 0x002ed,), # Modifier Letter Unaspirated + (0x002ef, 0x002ff,), # Modifier Letter Low Down..Modifier Letter Low Left (0x00300, 0x0036f,), # Combining Grave Accent ..Combining Latin Small Le - (0x00483, 0x00489,), # Combining Cyrillic Titlo..Combining Cyrillic Milli + (0x00375, 0x00375,), # Greek Lower Numeral Sign + (0x00384, 0x00385,), # Greek Tonos ..Greek Dialytika Tonos + (0x00483, 0x00487,), # Combining Cyrillic Titlo..Combining Cyrillic Pokry + (0x00488, 0x00489,), # Combining Cyrillic Hundr..Combining Cyrillic Milli (0x00591, 0x005bd,), # Hebrew Accent Etnahta ..Hebrew Point Meteg (0x005bf, 0x005bf,), # Hebrew Point Rafe (0x005c1, 0x005c2,), # Hebrew Point Shin Dot ..Hebrew Point Sin Dot (0x005c4, 0x005c5,), # Hebrew Mark Upper Dot ..Hebrew Mark Lower Dot (0x005c7, 0x005c7,), # Hebrew Point Qamats Qatan + (0x00600, 0x00605,), # 
Arabic Number Sign ..Arabic Number Mark Above (0x00610, 0x0061a,), # Arabic Sign Sallallahou ..Arabic Small Kasra + (0x0061c, 0x0061c,), # Arabic Letter Mark (0x0064b, 0x0065f,), # Arabic Fathatan ..Arabic Wavy Hamza Below (0x00670, 0x00670,), # Arabic Letter Superscript Alef (0x006d6, 0x006dc,), # Arabic Small High Ligatu..Arabic Small High Seen + (0x006dd, 0x006dd,), # Arabic End Of Ayah (0x006df, 0x006e4,), # Arabic Small High Rounde..Arabic Small High Madda (0x006e7, 0x006e8,), # Arabic Small High Yeh ..Arabic Small High Noon (0x006ea, 0x006ed,), # Arabic Empty Centre Low ..Arabic Small Low Meem + (0x0070f, 0x0070f,), # Syriac Abbreviation Mark (0x00711, 0x00711,), # Syriac Letter Superscript Alaph (0x00730, 0x0074a,), # Syriac Pthaha Above ..Syriac Barrekh (0x007a6, 0x007b0,), # Thaana Abafili ..Thaana Sukun @@ -4624,8 +5609,11 @@ (0x00825, 0x00827,), # Samaritan Vowel Sign Sho..Samaritan Vowel Sign U (0x00829, 0x0082d,), # Samaritan Vowel Sign Lon..Samaritan Mark Nequdaa (0x00859, 0x0085b,), # Mandaic Affrication Mark..Mandaic Gemination Mark + (0x00888, 0x00888,), # Arabic Raised Round Dot + (0x00890, 0x00891,), # Arabic Pound Mark Above ..Arabic Piastre Mark Abov (0x00898, 0x0089f,), # Arabic Small High Word A..Arabic Half Madda Over M (0x008ca, 0x008e1,), # Arabic Small High Farsi ..Arabic Small High Sign S + (0x008e2, 0x008e2,), # Arabic Disputed End Of Ayah (0x008e3, 0x00902,), # Arabic Turned Damma Belo..Devanagari Sign Anusvara (0x0093a, 0x0093a,), # Devanagari Vowel Sign Oe (0x0093c, 0x0093c,), # Devanagari Sign Nukta @@ -4725,6 +5713,7 @@ (0x017c9, 0x017d3,), # Khmer Sign Muusikatoan ..Khmer Sign Bathamasat (0x017dd, 0x017dd,), # Khmer Sign Atthacan (0x0180b, 0x0180d,), # Mongolian Free Variation..Mongolian Free Variation + (0x0180e, 0x0180e,), # Mongolian Vowel Separator (0x0180f, 0x0180f,), # Mongolian Free Variation Selector Four (0x01885, 0x01886,), # Mongolian Letter Ali Gal..Mongolian Letter Ali Gal (0x018a9, 0x018a9,), # Mongolian Letter Ali 
Gali Dagalga @@ -4741,7 +5730,9 @@ (0x01a65, 0x01a6c,), # Tai Tham Vowel Sign I ..Tai Tham Vowel Sign Oa B (0x01a73, 0x01a7c,), # Tai Tham Vowel Sign Oa A..Tai Tham Sign Khuen-lue (0x01a7f, 0x01a7f,), # Tai Tham Combining Cryptogrammic Dot - (0x01ab0, 0x01ace,), # Combining Doubled Circum..Combining Latin Small Le + (0x01ab0, 0x01abd,), # Combining Doubled Circum..Combining Parentheses Be + (0x01abe, 0x01abe,), # Combining Parentheses Overlay + (0x01abf, 0x01ace,), # Combining Latin Small Le..Combining Latin Small Le (0x01b00, 0x01b03,), # Balinese Sign Ulu Ricem ..Balinese Sign Surang (0x01b34, 0x01b34,), # Balinese Sign Rerekan (0x01b36, 0x01b3a,), # Balinese Vowel Sign Ulu ..Balinese Vowel Sign Ra R @@ -4765,16 +5756,36 @@ (0x01cf4, 0x01cf4,), # Vedic Tone Candra Above (0x01cf8, 0x01cf9,), # Vedic Tone Ring Above ..Vedic Tone Double Ring A (0x01dc0, 0x01dff,), # Combining Dotted Grave A..Combining Right Arrowhea - (0x020d0, 0x020f0,), # Combining Left Harpoon A..Combining Asterisk Above + (0x01fbd, 0x01fbd,), # Greek Koronis + (0x01fbf, 0x01fc1,), # Greek Psili ..Greek Dialytika And Peri + (0x01fcd, 0x01fcf,), # Greek Psili And Varia ..Greek Psili And Perispom + (0x01fdd, 0x01fdf,), # Greek Dasia And Varia ..Greek Dasia And Perispom + (0x01fed, 0x01fef,), # Greek Dialytika And Vari..Greek Varia + (0x01ffd, 0x01ffe,), # Greek Oxia ..Greek Dasia + (0x0200b, 0x0200f,), # Zero Width Space ..Right-to-left Mark + (0x02028, 0x02029,), # Line Separator ..Paragraph Separator + (0x0202a, 0x0202e,), # Left-to-right Embedding ..Right-to-left Override + (0x02060, 0x02064,), # Word Joiner ..Invisible Plus + (0x02066, 0x0206f,), # Left-to-right Isolate ..Nominal Digit Shapes + (0x020d0, 0x020dc,), # Combining Left Harpoon A..Combining Four Dots Abov + (0x020dd, 0x020e0,), # Combining Enclosing Circ..Combining Enclosing Circ + (0x020e1, 0x020e1,), # Combining Left Right Arrow Above + (0x020e2, 0x020e4,), # Combining Enclosing Scre..Combining Enclosing Upwa + (0x020e5, 
0x020f0,), # Combining Reverse Solidu..Combining Asterisk Above (0x02cef, 0x02cf1,), # Coptic Combining Ni Abov..Coptic Combining Spiritu (0x02d7f, 0x02d7f,), # Tifinagh Consonant Joiner (0x02de0, 0x02dff,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0302a, 0x0302d,), # Ideographic Level Tone M..Ideographic Entering Ton (0x03099, 0x0309a,), # Combining Katakana-hirag..Combining Katakana-hirag - (0x0a66f, 0x0a672,), # Combining Cyrillic Vzmet..Combining Cyrillic Thous + (0x0309b, 0x0309c,), # Katakana-hiragana Voiced..Katakana-hiragana Semi-v + (0x0a66f, 0x0a66f,), # Combining Cyrillic Vzmet + (0x0a670, 0x0a672,), # Combining Cyrillic Ten M..Combining Cyrillic Thous (0x0a674, 0x0a67d,), # Combining Cyrillic Lette..Combining Cyrillic Payer (0x0a69e, 0x0a69f,), # Combining Cyrillic Lette..Combining Cyrillic Lette (0x0a6f0, 0x0a6f1,), # Bamum Combining Mark Koq..Bamum Combining Mark Tuk + (0x0a700, 0x0a716,), # Modifier Letter Chinese ..Modifier Letter Extra-lo + (0x0a720, 0x0a721,), # Modifier Letter Stress A..Modifier Letter Stress A + (0x0a789, 0x0a78a,), # Modifier Letter Colon ..Modifier Letter Short Eq (0x0a802, 0x0a802,), # Syloti Nagri Sign Dvisvara (0x0a806, 0x0a806,), # Syloti Nagri Sign Hasanta (0x0a80b, 0x0a80b,), # Syloti Nagri Sign Anusvara @@ -4803,12 +5814,20 @@ (0x0aac1, 0x0aac1,), # Tai Viet Tone Mai Tho (0x0aaec, 0x0aaed,), # Meetei Mayek Vowel Sign ..Meetei Mayek Vowel Sign (0x0aaf6, 0x0aaf6,), # Meetei Mayek Virama + (0x0ab5b, 0x0ab5b,), # Modifier Breve With Inverted Breve + (0x0ab6a, 0x0ab6b,), # Modifier Letter Left Tac..Modifier Letter Right Ta (0x0abe5, 0x0abe5,), # Meetei Mayek Vowel Sign Anap (0x0abe8, 0x0abe8,), # Meetei Mayek Vowel Sign Unap (0x0abed, 0x0abed,), # Meetei Mayek Apun Iyek (0x0fb1e, 0x0fb1e,), # Hebrew Point Judeo-spanish Varika + (0x0fbb2, 0x0fbc2,), # Arabic Symbol Dot Above ..Arabic Symbol Wasla Abov (0x0fe00, 0x0fe0f,), # Variation Selector-1 ..Variation Selector-16 (0x0fe20, 0x0fe2f,), # Combining Ligature 
Left ..Combining Cyrillic Titlo + (0x0feff, 0x0feff,), # Zero Width No-break Space + (0x0ff3e, 0x0ff3e,), # Fullwidth Circumflex Accent + (0x0ff40, 0x0ff40,), # Fullwidth Grave Accent + (0x0ffe3, 0x0ffe3,), # Fullwidth Macron + (0x0fff9, 0x0fffb,), # Interlinear Annotation A..Interlinear Annotation T (0x101fd, 0x101fd,), # Phaistos Disc Sign Combining Oblique Stroke (0x102e0, 0x102e0,), # Coptic Epact Thousands Mark (0x10376, 0x1037a,), # Combining Old Permic Let..Combining Old Permic Let @@ -4830,7 +5849,9 @@ (0x1107f, 0x11081,), # Brahmi Number Joiner ..Kaithi Sign Anusvara (0x110b3, 0x110b6,), # Kaithi Vowel Sign U ..Kaithi Vowel Sign Ai (0x110b9, 0x110ba,), # Kaithi Sign Virama ..Kaithi Sign Nukta + (0x110bd, 0x110bd,), # Kaithi Number Sign (0x110c2, 0x110c2,), # Kaithi Vowel Sign Vocalic R + (0x110cd, 0x110cd,), # Kaithi Number Sign Above (0x11100, 0x11102,), # Chakma Sign Candrabindu ..Chakma Sign Visarga (0x11127, 0x1112b,), # Chakma Vowel Sign A ..Chakma Vowel Sign Uu (0x1112d, 0x11134,), # Chakma Vowel Sign Ai ..Chakma Maayyaa @@ -4909,6 +5930,7 @@ (0x11f36, 0x11f3a,), # Kawi Vowel Sign I ..Kawi Vowel Sign Vocalic (0x11f40, 0x11f40,), # Kawi Vowel Sign Eu (0x11f42, 0x11f42,), # Kawi Conjoiner + (0x13430, 0x1343f,), # Egyptian Hieroglyph Vert..Egyptian Hieroglyph End (0x13440, 0x13440,), # Egyptian Hieroglyph Mirror Horizontally (0x13447, 0x13455,), # Egyptian Hieroglyph Modi..Egyptian Hieroglyph Modi (0x16af0, 0x16af4,), # Bassa Vah Combining High..Bassa Vah Combining High @@ -4917,9 +5939,11 @@ (0x16f8f, 0x16f92,), # Miao Tone Right ..Miao Tone Below (0x16fe4, 0x16fe4,), # Khitan Small Script Filler (0x1bc9d, 0x1bc9e,), # Duployan Thick Letter Se..Duployan Double Mark + (0x1bca0, 0x1bca3,), # Shorthand Format Letter ..Shorthand Format Up Step (0x1cf00, 0x1cf2d,), # Znamenny Combining Mark ..Znamenny Combining Mark (0x1cf30, 0x1cf46,), # Znamenny Combining Tonal..Znamenny Priznak Modifie (0x1d167, 0x1d169,), # Musical Symbol Combining..Musical Symbol 
Combining + (0x1d173, 0x1d17a,), # Musical Symbol Begin Bea..Musical Symbol End Phras (0x1d17b, 0x1d182,), # Musical Symbol Combining..Musical Symbol Combining (0x1d185, 0x1d18b,), # Musical Symbol Combining..Musical Symbol Combining (0x1d1aa, 0x1d1ad,), # Musical Symbol Combining..Musical Symbol Combining @@ -4942,6 +5966,9 @@ (0x1e4ec, 0x1e4ef,), # Nag Mundari Sign Muhor ..Nag Mundari Sign Sutuh (0x1e8d0, 0x1e8d6,), # Mende Kikakui Combining ..Mende Kikakui Combining (0x1e944, 0x1e94a,), # Adlam Alif Lengthener ..Adlam Nukta + (0x1f3fb, 0x1f3ff,), # Emoji Modifier Fitzpatri..Emoji Modifier Fitzpatri + (0xe0001, 0xe0001,), # Language Tag + (0xe0020, 0xe007f,), # Tag Space ..Cancel Tag (0xe0100, 0xe01ef,), # Variation Selector-17 ..Variation Selector-256 ), } diff --git a/wcwidth/unicode_versions.py b/wcwidth/unicode_versions.py index 4e9ccbf..3906e17 100644 --- a/wcwidth/unicode_versions.py +++ b/wcwidth/unicode_versions.py @@ -1,7 +1,7 @@ """ Exports function list_versions() for unicode version level support. -This code generated by wcwidth/bin/update-tables.py on 2023-09-14 15:45:33 UTC. +This code generated by wcwidth/bin/update-tables.py on 2023-09-30 05:01:27 UTC. """ @@ -36,3 +36,12 @@ def list_versions(): "15.0.0", "15.1.0", ) + +def list_zwj_versions(): + """ + Return Unicode Emoji version levels supported by this module release. 
+ + :rtype: list[str] + """ + return ( + ) diff --git a/wcwidth/wcwidth.py b/wcwidth/wcwidth.py index f83e410..2570e3f 100644 --- a/wcwidth/wcwidth.py +++ b/wcwidth/wcwidth.py @@ -63,6 +63,7 @@ from __future__ import division # std imports +import re import os import sys import warnings @@ -70,7 +71,7 @@ # local from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH -from .unicode_versions import list_versions +from .unicode_versions import list_versions, list_zwj_versions try: # std imports @@ -81,34 +82,9 @@ from backports.functools_lru_cache import lru_cache # global cache -_UNICODE_CMPTABLE = None _PY3 = (sys.version_info[0] >= 3) -# NOTE: created by hand, there isn't anything identifiable other than -# general Cf category code to identify these, and some characters in Cf -# category code are of non-zero width. -# Also includes some Cc, Mn, Zl, and Zp characters -ZERO_WIDTH_CF = set([ - 0, # Null (Cc) - 0x034F, # Combining grapheme joiner (Mn) - 0x200B, # Zero width space - 0x200C, # Zero width non-joiner - 0x200D, # Zero width joiner - 0x200E, # Left-to-right mark - 0x200F, # Right-to-left mark - 0x2028, # Line separator (Zl) - 0x2029, # Paragraph separator (Zp) - 0x202A, # Left-to-right embedding - 0x202B, # Right-to-left embedding - 0x202C, # Pop directional formatting - 0x202D, # Left-to-right override - 0x202E, # Right-to-left override - 0x2060, # Word joiner - 0x2061, # Function application - 0x2062, # Invisible times - 0x2063, # Invisible separator -]) def _bisearch(ucs, table): @@ -138,6 +114,82 @@ def _bisearch(ucs, table): return 0 +@lru_cache(maxsize=1000) +def _wcwidth(ucs, unicode_version): + r""" + Given one Unicode point, return its printable length on a terminal. + + :param int ucs: A Unicode codepoint value. + :param str unicode_version: Return value of :func:`_wcmatch_version`. + + :return: The width, in cells, necessary to display the character of + Unicode string character, ``wc``. 
Returns 0 if the ``wc`` argument has + no printable effect on a terminal (such as NUL '\0'), -1 if ``wc`` is + not printable, or has an indeterminate effect on the terminal, such as + a control character. Otherwise, the number of column positions the + character occupies on a graphic terminal (1 or 2) is returned. + :rtype: int + + This function is precisely the same as :func:`wcwidth`, but receives the + ordinal value (unicode point) rather than a string character, and, + ``unicode_version`` is already resolved. + """ + # NULL + if ucs == 0: + return 0 + + # C0/C1 control characters are -1 for compatibility with POSIX-like calls + if ucs and ucs < 32 or 0x07F <= ucs < 0x0A0: + return -1 + + # Zero width + if _bisearch(ucs, ZERO_WIDTH[unicode_version]): + return 0 + + # 1 or 2 width + return 1 + _bisearch(ucs, WIDE_EASTASIAN[unicode_version]) + + +def _wcswidth(ucs, unicode_version, errors='ignore'): + """ + :param str ucs: A unicode string to measure. + :param str unicode_version: Return value of :func:`_wcmatch_version`. + :param str errors: for POSIX compatibility in :func:`wcswidth`, + setting argument ``errors='strict'`` will raise a UnicodeError + on any C0/C1 Control Characters. + + Any ZWJ character is measured as though the current character + width is zero, and the subsequent character will occupy the same + cell as the preceding. + + Although a list of Recommended Emoji ZWJ Sequences is made available by + unicode data files and as tests in this library, for performance reasons we + trust any combination. This library is not intended to defend against + non-recommended combinations ("glitch text"). + + A C1 Control character is abnormal and meant for machine-machine communication. + C0 Control characters, such as Tab ('\t') cannot be accounted for, as the + current screen position and tabstop value is not known. Both C0 and C1 + control characters are measured as width 0.
+ """ + idx = 0 + measured_width = 0 + while idx < len(ucs): + if ucs[idx] == '\u200D': + # Zero Width Joiner, do not measure this or next character + idx += 2 + continue + # measure width at current index + result = wcwidth(ucs[idx], unicode_version) + if result < 0: + if errors == 'strict': + raise UnicodeError('Control character {0!r} at index {1}'.format(ucs[idx], idx)) + else: + measured_width += result + idx += 1 + return measured_width + + @lru_cache(maxsize=1000) def wcwidth(wc, unicode_version='auto'): r""" @@ -145,8 +197,8 @@ def wcwidth(wc, unicode_version='auto'): :param str wc: A single Unicode character. :param str unicode_version: A Unicode version number, such as - ``'6.0.0'``, the list of available version levels may be - listed by pairing function :func:`list_versions`. + ``'6.0.0'``. A list of version levels supported by wcwidth + is returned by :func:`list_versions`. Any version string may be specified without error -- the nearest matching version is selected. When ``latest`` (default), the @@ -165,31 +217,17 @@ def wcwidth(wc, unicode_version='auto'): - C1 control characters and DEL (U+07F through U+0A0). - The following have a column width of 0: - - - Non-spacing and enclosing combining characters (general - category code Mn or Me in the Unicode database). - - - NULL (``U+0000``). - - - COMBINING GRAPHEME JOINER (``U+034F``). - - - ZERO WIDTH SPACE (``U+200B``) *through* - RIGHT-TO-LEFT MARK (``U+200F``). - - - LINE SEPARATOR (``U+2028``) *and* - PARAGRAPH SEPARATOR (``U+2029``). - - - LEFT-TO-RIGHT EMBEDDING (``U+202A``) *through* - RIGHT-TO-LEFT OVERRIDE (``U+202E``). - - - WORD JOINER (``U+2060``) *through* - INVISIBLE SEPARATOR (``U+2063``). 
+ The following have a column width of 0, by Unicode General Category code: + - ``Me``: an enclosing combining mark + - ``Mn``: a nonspacing combining mark (zero advance width) + - ``Cf``: a format control character + - ``Zl``: U+2028 LINE SEPARATOR only + - ``Zp``: U+2029 PARAGRAPH SEPARATOR only + - ``Sk``: a non-letterlike modifier symbol + - and NULL (``U+0000``). The following have a column width of 1: - - SOFT HYPHEN (``U+00AD``). - - All remaining characters, including all printable ISO 8859-1 and WGL4 characters, Unicode control characters, etc. @@ -201,25 +239,7 @@ def wcwidth(wc, unicode_version='auto'): - Some kinds of Emoji or symbols. """ - # NOTE: created by hand, there isn't anything identifiable other than - # general Cf category code to identify these, and some characters in Cf - # category code are of non-zero width. - ucs = ord(wc) - if ucs in ZERO_WIDTH_CF: - return 0 - - # C0/C1 control characters - if ucs < 32 or 0x07F <= ucs < 0x0A0: - return -1 - - _unicode_version = _wcmatch_version(unicode_version) - - # combining characters with zero width - if _bisearch(ucs, ZERO_WIDTH[_unicode_version]): - return 0 - - # "Wide EastAsian" (and emojis) - return 1 + _bisearch(ucs, WIDE_EASTASIAN[_unicode_version]) + return _wcwidth(ucs=ord(wc), unicode_version=_wcmatch_version(unicode_version)) def wcswidth(pwcs, n=None, unicode_version='auto'): @@ -234,22 +254,40 @@ def wcswidth(pwcs, n=None, unicode_version='auto'): the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest available unicode version, otherwise. :rtype: int - :returns: The width, in cells, necessary to display the first ``n`` - characters of the unicode string ``pwcs``. Returns ``-1`` if - a non-printable character is encountered. + :returns: The width, in cells, needed to display the first ``n`` characters + of the unicode string ``pwcs``. Returns ``-1`` if a non-printable + character is encountered. 
""" - # pylint: disable=C0103 - # Invalid argument name "n" + try: + return _wcswidth(pwcs[:n], + unicode_version=_wcmatch_version(unicode_version), + errors='strict') + except UnicodeError: + # this final -1 return value is an ugly C POSIX holdover for C0/C1 + # control characters, only -- see function width() for a version that + # never returns -1! + return -1 - end = len(pwcs) if n is None else n - idx = slice(0, end) - width = 0 - for char in pwcs[idx]: - wcw = wcwidth(char, unicode_version) - if wcw < 0: - return -1 - width += wcw - return width + +def width(text, unicode_version='auto', emoji_zwj_version='auto'): + """ + Given a unicode string, return its printable length on a terminal. + + Unlike :func:`wcswidth`, ``-1`` is never returned when a non-printable + character is encountered, and, Emoji Zero Width Joiner (ZWJ) Sequences are + handled. + + :param str text: Measure width of given unicode string. + :param str unicode_version: An explicit definition of the unicode version + level to target for measurement, may be ``auto`` (default), which uses + the Environment Variable, ``UNICODE_VERSION`` if defined, or the latest + available unicode version, otherwise. + :param str emoji_zwj_version: An explicit definition of the unicode emoji + version to use, may be 'None' for no support. + :rtype: int + :returns: Approximate number cells needed to display the characters of ``text``. + """ + unicode_version = _wcmatch_version(unicode_version) @lru_cache(maxsize=128) @@ -292,10 +330,12 @@ def _wcmatch_version(given_version): """ # Design note: the choice to return the same type that is given certainly # complicates it for python 2 str-type, but allows us to define an api that - # to use 'string-type', for unicode version level definitions, so all of our - # example code works with all versions of python. That, along with the - # string-to-numeric and comparisons of earliest, latest, matching, or - # nearest, greatly complicates this function. 
+ # uses 'string-type' for unicode version level definitions, so all of our + # example code works with all versions of python. + # + # That, along with the string-to-numeric and comparisons of earliest, + # latest, matching, or nearest, greatly complicates this function. + # The performance cost is somewhat curbed by memoization. _return_str = not _PY3 and isinstance(given_version, str) if _return_str: