Skip to content

Commit

Permalink
Rework script extension handling (#64)
Browse files Browse the repository at this point in the history
Co-authored-by: Zoltan Herczeg <[email protected]>
  • Loading branch information
zherczeg and Zoltan Herczeg authored Dec 29, 2021
1 parent 7713f33 commit afa4756
Show file tree
Hide file tree
Showing 13 changed files with 3,693 additions and 3,684 deletions.
40 changes: 40 additions & 0 deletions maint/GenerateCommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,46 @@
'Extended_Pictographic', '14'
]

# ---------------------------------------------------------------------------
# REORDERING SCRIPT NAMES
# ---------------------------------------------------------------------------

import re

def reorder_scripts():
    """Move the scripts used in ScriptExtensions.txt to the front of the lists.

    Rebinds the module-level ``script_names`` and ``script_abbrevs`` lists so
    that every script whose abbreviation occurs in
    ``Unicode.tables/ScriptExtensions.txt`` comes first, followed by all the
    remaining scripts. The relative order inside each of the two groups is
    preserved, and the two lists stay index-aligned.

    Raises:
        OSError: if the ScriptExtensions.txt file cannot be opened.
    """
    global script_names
    global script_abbrevs

    # Matches a data line made of a code point (or "first..last" range), a
    # semicolon, and a blank-separated list of script abbreviations that is
    # terminated by " #". Group 1 is the abbreviation list.
    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')

    extended_script_abbrevs = set()
    # Explicit encoding: the default depends on the locale of the machine
    # running the script, which must not influence the generated tables.
    with open("Unicode.tables/ScriptExtensions.txt", encoding="utf-8") as f:
        for line in f:
            match_obj = names_re.match(line)
            if match_obj is None:
                continue  # Not a data line (comment, blank line, ...)

            # split() without an argument never yields empty strings, even if
            # the captured list contains repeated or trailing blanks.
            extended_script_abbrevs.update(match_obj.group(1).split())

    # Single pass: partition the (name, abbreviation) pairs into the scripts
    # that take part in script extensions and those that do not.
    extended_pairs = []
    other_pairs = []
    for pair in zip(script_names, script_abbrevs):
        if pair[1] in extended_script_abbrevs:
            extended_pairs.append(pair)
        else:
            other_pairs.append(pair)

    ordered_pairs = extended_pairs + other_pairs
    script_names = [name for name, _ in ordered_pairs]
    script_abbrevs = [abbrev for _, abbrev in ordered_pairs]

reorder_scripts()

# ---------------------------------------------------------------------------
# DERIVED LISTS
Expand Down
80 changes: 22 additions & 58 deletions maint/GenerateUcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,30 +252,15 @@ def get_other_case(chardata):
# Parse a line of ScriptExtensions.txt

def get_script_extension(chardata):
this_script_list = list(chardata[1].split(' '))
if len(this_script_list) == 1:
return script_abbrevs.index(this_script_list[0])
global last_script_extension

script_numbers = []
for d in this_script_list:
script_numbers.append(script_abbrevs.index(d))
script_numbers.append(0)
script_numbers_length = len(script_numbers)
offset = len(script_lists) * script_list_item_size
if last_script_extension == chardata[1]:
return offset - script_list_item_size

for i in range(1, len(script_lists) - script_numbers_length + 1):
for j in range(0, script_numbers_length):
found = True
if script_lists[i+j] != script_numbers[j]:
found = False
break
if found:
return -i

# Not found in existing lists

return_value = len(script_lists)
script_lists.extend(script_numbers)
return -return_value
last_script_extension = chardata[1]
script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in last_script_extension.split(' ')))
return offset


# Read a whole table in memory, setting/checking the Unicode version
Expand Down Expand Up @@ -538,34 +523,18 @@ def write_records(records, record_size):
# multiple scripts. Initialize this list with a single entry, as the zeroth
# element is never used.

script_lists = [0]
script_abbrevs_default = script_abbrevs.index('Zzzz')
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)

# Scan all characters and set their default script extension to the main
# script. We also have to adjust negative scriptx values, following a change in
# the way these work. They are currently negated offsets into the script_lists
# list, but have to be changed into indices in the new ucd_script_sets vector,
# which has fixed-size entries. We can compute the new offset by counting the
# zeros that precede the current offset.

for i in range(0, MAX_UNICODE):
if scriptx[i] == script_abbrevs_default:
scriptx[i] = script[i]
elif scriptx[i] < 0:
count = 1
for j in range(-scriptx[i], 0, -1):
if script_lists[j] == 0:
count += 1
scriptx[i] = -count * (int(len(script_names)/32) + 1)
script_lists = [[]]
script_list_item_size = (script_names.index('Unknown') + 31) // 32
last_script_extension = ""
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)

# With the addition of the Script Extensions field, we needed some padding to
# get the Unicode records up to 12 bytes (multiple of 4). Originally this was a
# 16-bit field and padding_dummy[0] was set to 256 to ensure this; 8 bits were
# later taken by the bidi class. Now that scriptx needs only 8 bits, the padding
# is a 16-bit field again, so padding_dummy[0] must once more exceed 255.

padding_dummy = [0] * MAX_UNICODE
padding_dummy[0] = 0
padding_dummy[0] = 256

# This block of code was added by PH in September 2012. It scans the other_case
# table to find sets of more than two characters that must all match each other
Expand Down Expand Up @@ -806,24 +775,19 @@ def write_records(records, record_size):
const uint32_t PRIV(ucd_script_sets)[] = {
""")

bitword_count = len(script_names)/32 + 1
bitwords = [0] * int(bitword_count)

for d in script_lists:
if d == 0:
s = " "
f.write(" ")
for x in bitwords:
f.write("%s" % s)
s = ", "
f.write("0x%08xu" % x)
f.write(",\n")
bitwords = [0] * int(bitword_count)
bitwords = [0] * script_list_item_size

else:
x = int(d/32)
y = int(d%32)
bitwords[x] = bitwords[x] | (1 << y)
# Set one bit per script in this list: script number idx lives in 32-bit word
# idx // 32 at bit position idx % 32. This must mirror the C-side MAPBIT()
# macro, which tests (map)[(script)/32] & (1u << ((script)%32)).
for idx in d:
    bitwords[idx // 32] |= 1 << (idx % 32)

s = " "
for x in bitwords:
f.write("%s" % s)
s = ", "
f.write("0x%08xu" % x)
f.write(",\n")

f.write("};\n\n")

Expand Down
4 changes: 3 additions & 1 deletion maint/GenerateUcpHeader.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@
f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
f.write("};\n\n")

f.write("/* These are the script identifications, additions happen at the end. */\n\nenum {\n")
f.write("/* These are the script identifications. */\n\nenum {\n /* Scripts which has characters in other scripts. */\n")
for i in script_names:
if i == "Unknown":
f.write("\n /* Scripts which has no characters in other scripts. */\n")
f.write(" ucp_%s,\n" % i)
f.write("\n")

Expand Down
8 changes: 6 additions & 2 deletions maint/GenerateUcpTables.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,12 @@ def stdnames(x):
# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
# still use the full original names.

utt_table = list(zip(std_script_names, script_names, ['PT_SCX'] * len(script_names)))
utt_table += list(zip(std_script_abbrevs, script_names, ['PT_SCX'] * len(script_abbrevs)))
scx_end = script_names.index('Unknown')

utt_table = list(zip(std_script_names[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))
utt_table += list(zip(std_script_names[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))
utt_table += list(zip(std_script_abbrevs[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))
utt_table += list(zip(std_script_abbrevs[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))

# At least one script abbreviation is the same as the full name of the script,
# so we must remove duplicates. It doesn't matter if this operation changes the
Expand Down
7 changes: 2 additions & 5 deletions src/pcre2_auto_possess.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,6 @@ check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
BOOL negated)
{
BOOL ok;
int scriptx;
const uint32_t *p;
const ucd_record *prop = GET_UCD(c);

Expand All @@ -221,10 +220,8 @@ switch(ptype)
return (pdata == prop->script) == negated;

case PT_SCX:
scriptx = prop->scriptx;
ok = pdata == prop->script || pdata == (unsigned int)scriptx;
if (!ok && scriptx < 0)
ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, pdata) != 0;
ok = (pdata == prop->script
|| MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, pdata) != 0);
return ok == negated;

/* These are specials */
Expand Down
16 changes: 13 additions & 3 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -2206,13 +2206,23 @@ while (bot < top)
{
*pdataptr = PRIV(utt)[i].value;
if (vptr == NULL || ptscript == PT_NOTSCRIPT)
{
*ptypeptr = PRIV(utt)[i].type;
else
return TRUE;
}

switch (PRIV(utt)[i].type)
{
if (PRIV(utt)[i].type != PT_SCX) break; /* Non-script found */
case PT_SC:
*ptypeptr = PT_SC;
return TRUE;

case PT_SCX:
*ptypeptr = ptscript;
return TRUE;
}
return TRUE;

break; /* Non-script found */
}

if (r > 0) bot = i + 1; else top = i;
Expand Down
23 changes: 9 additions & 14 deletions src/pcre2_dfa_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -1194,9 +1194,8 @@ for (;;)
break;

case PT_SCX:
OK = prop->script == code[2] || prop->scriptx == (int)code[2];
if (!OK && prop->scriptx < 0)
OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[2]) != 0;
OK = (prop->script == code[2] ||
MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, code[2]) != 0);
break;

/* These are specials for combination cases. */
Expand Down Expand Up @@ -1466,9 +1465,8 @@ for (;;)
break;

case PT_SCX:
OK = prop->script == code[3] || prop->scriptx == (int)code[3];
if (!OK && prop->scriptx < 0)
OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[3]) != 0;
OK = (prop->script == code[3] ||
MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, code[3]) != 0);
break;

/* These are specials for combination cases. */
Expand Down Expand Up @@ -1721,9 +1719,8 @@ for (;;)
break;

case PT_SCX:
OK = prop->script == code[3] || prop->scriptx == (int)code[3];
if (!OK && prop->scriptx < 0)
OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[3]) != 0;
OK = (prop->script == code[3] ||
MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, code[3]) != 0);
break;

/* These are specials for combination cases. */
Expand Down Expand Up @@ -2001,11 +1998,9 @@ for (;;)
break;

case PT_SCX:
OK = prop->script == code[1 + IMM2_SIZE + 2] ||
prop->scriptx == (int)code[1 + IMM2_SIZE + 2];
if (!OK && prop->scriptx < 0)
OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx,
code[1 + IMM2_SIZE + 2]) != 0;
OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
MAPBIT(PRIV(ucd_script_sets) + prop->scriptx,
code[1 + IMM2_SIZE + 2]) != 0);
break;

/* These are specials for combination cases. */
Expand Down
8 changes: 4 additions & 4 deletions src/pcre2_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1822,9 +1822,9 @@ typedef struct {
uint8_t gbprop; /* ucp_gbControl, etc. (grapheme break property) */
uint8_t caseset; /* offset to multichar other cases or zero */
int32_t other_case; /* offset to other case, or zero if none */
int16_t scriptx; /* script extension value */
uint8_t scriptx; /* script extension value */
uint8_t bidi; /* bidi class and control flag */
uint8_t dummy; /* spare - to round to multiple of 4 bytes */
uint16_t dummy; /* spare - to round to multiple of 4 bytes */
} ucd_record;

/* UCD access macros */
Expand All @@ -1849,8 +1849,8 @@ typedef struct {
#define UCD_OTHERCASE(ch) ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
#define UCD_SCRIPTX(ch) GET_UCD(ch)->scriptx

/* The "scriptx" field, when negative, gives an offset into a vector of 32-bit
words that form a bitmap representing a list of scripts. This macro tests for a
/* The "scriptx" field gives an offset into a vector of 32-bit words that
form a bitmap representing a list of scripts. This macro tests for a
script in the map by number. */

#define MAPBIT(map,script) ((map)[(script)/32]&(1u<<((script)%32)))
Expand Down
28 changes: 8 additions & 20 deletions src/pcre2_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -2454,11 +2454,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);

case PT_SCX:
{
int scriptx = prop->scriptx;
BOOL ok = Fecode[2] == prop->script ||
Fecode[2] == (unsigned int)scriptx;
if (!ok && scriptx < 0)
ok = MAPBIT((PRIV(ucd_script_sets) - scriptx), Fecode[2]) != 0;
BOOL ok = (Fecode[2] == prop->script ||
MAPBIT((PRIV(ucd_script_sets) + prop->scriptx), Fecode[2]) != 0);
if (ok == notmatch) RRETURN(MATCH_NOMATCH);
}
break;
Expand Down Expand Up @@ -2728,7 +2725,6 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
for (i = 1; i <= Lmin; i++)
{
BOOL ok;
int scriptx;
const ucd_record *prop;
if (Feptr >= mb->end_subject)
{
Expand All @@ -2737,10 +2733,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
GETCHARINCTEST(fc, Feptr);
prop = GET_UCD(fc);
scriptx = prop->scriptx;
ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue;
if (!ok && scriptx < 0)
ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0;
ok = (prop->script == Lpropvalue ||
MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, Lpropvalue) != 0);
if (ok == notmatch)
RRETURN(MATCH_NOMATCH);
}
Expand Down Expand Up @@ -3521,7 +3515,6 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
for (;;)
{
BOOL ok;
int scriptx;
const ucd_record *prop;
RMATCH(Fecode, RM225);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
Expand All @@ -3533,10 +3526,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
GETCHARINCTEST(fc, Feptr);
prop = GET_UCD(fc);
scriptx = prop->scriptx;
ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue;
if (!ok && scriptx < 0)
ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0;
ok = (prop->script == Lpropvalue
|| MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, Lpropvalue) != 0);
if (ok == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
Expand Down Expand Up @@ -4104,7 +4095,6 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
BOOL ok;
const ucd_record *prop;
int scriptx;
int len = 1;
if (Feptr >= mb->end_subject)
{
Expand All @@ -4113,10 +4103,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
}
GETCHARLENTEST(fc, Feptr, len);
prop = GET_UCD(fc);
scriptx = prop->scriptx;
ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue;
if (!ok && scriptx < 0)
ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0;
ok = (prop->script == Lpropvalue ||
MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, Lpropvalue) != 0);
if (ok == notmatch) break;
Feptr+= len;
}
Expand Down
Loading

0 comments on commit afa4756

Please sign in to comment.