Rework script extension handling (#64)

Co-authored-by: Zoltan Herczeg <[email protected]>
PCRE2Project · Dec 29, 2021 · afa4756 · afa4756
1 parent 7713f33
commit afa4756
Show file tree

Hide file tree

Showing 13 changed files with 3,693 additions and 3,684 deletions.
diff --git a/maint/GenerateCommon.py b/maint/GenerateCommon.py
@@ -184,6 +184,46 @@
   'Extended_Pictographic', '14'
   ]
 
+# ---------------------------------------------------------------------------
+#                      REORDERING SCRIPT NAMES
+# ---------------------------------------------------------------------------
+
+import re
+
+def reorder_scripts():
+  """Move the scripts used in ScriptExtensions.txt to the front of the lists.
+
+  Rebinds the module-level script_names and script_abbrevs lists so that
+  every script whose abbreviation appears in the script list of an entry in
+  Unicode.tables/ScriptExtensions.txt comes first, followed by all the other
+  scripts. The relative order inside each group is preserved and the two
+  lists stay index-aligned.
+  """
+  global script_names
+  global script_abbrevs
+
+  # Collect every abbreviation that occurs in some script extension list.
+  extended_script_abbrevs = set()
+  with open("Unicode.tables/ScriptExtensions.txt") as f:
+    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')
+
+    for line in f:
+      match_obj = names_re.match(line)
+
+      # Lines without a leading code point (or range) are not data entries.
+      if match_obj is None:
+        continue
+
+      # split() without an argument never yields empty strings, even if the
+      # captured list carries stray spaces.
+      extended_script_abbrevs.update(match_obj.group(1).split())
+
+  # Stable partition of the indices: extended scripts first, the rest after.
+  extended = [idx for idx, abbrev in enumerate(script_abbrevs)
+              if abbrev in extended_script_abbrevs]
+  others = [idx for idx, abbrev in enumerate(script_abbrevs)
+            if abbrev not in extended_script_abbrevs]
+  order = extended + others
+
+  script_names = [script_names[idx] for idx in order]
+  script_abbrevs = [script_abbrevs[idx] for idx in order]
+
+reorder_scripts()
 
 # ---------------------------------------------------------------------------
 #                         DERIVED LISTS

diff --git a/maint/GenerateUcd.py b/maint/GenerateUcd.py
@@ -252,30 +252,15 @@ def get_other_case(chardata):
 # Parse a line of ScriptExtensions.txt
 
 def get_script_extension(chardata):
-  this_script_list = list(chardata[1].split(' '))
-  if len(this_script_list) == 1:
-    return script_abbrevs.index(this_script_list[0])
+  global last_script_extension
 
-  script_numbers = []
-  for d in this_script_list:
-    script_numbers.append(script_abbrevs.index(d))
-  script_numbers.append(0)
-  script_numbers_length = len(script_numbers)
+  offset = len(script_lists) * script_list_item_size
+  if last_script_extension == chardata[1]:
+    return offset - script_list_item_size
 
-  for i in range(1, len(script_lists) - script_numbers_length + 1):
-    for j in range(0, script_numbers_length):
-      found = True
-      if script_lists[i+j] != script_numbers[j]:
-        found = False
-        break
-    if found:
-      return -i
-
-  # Not found in existing lists
-
-  return_value = len(script_lists)
-  script_lists.extend(script_numbers)
-  return -return_value
+  last_script_extension = chardata[1]
+  script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in last_script_extension.split(' ')))
+  return offset
 
 
 # Read a whole table in memory, setting/checking the Unicode version
@@ -538,34 +523,18 @@ def write_records(records, record_size):
 # multiple scripts. Initialize this list with a single empty entry: offset zero
 # is the default for characters that have no script extensions.
 
-script_lists = [0]
-script_abbrevs_default = script_abbrevs.index('Zzzz')
-scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)
-
-# Scan all characters and set their default script extension to the main
-# script. We also have to adjust negative scriptx values, following a change in
-# the way these work. They are currently negated offsets into the script_lists
-# list, but have to be changed into indices in the new ucd_script_sets vector,
-# which has fixed-size entries. We can compute the new offset by counting the
-# zeros that precede the current offset.
-
-for i in range(0, MAX_UNICODE):
-  if scriptx[i] == script_abbrevs_default:
-    scriptx[i] = script[i]
-  elif scriptx[i] < 0:
-    count = 1
-    for j in range(-scriptx[i], 0, -1):
-      if script_lists[j] == 0:
-        count += 1
-    scriptx[i] = -count * (int(len(script_names)/32) + 1)
+script_lists = [[]]
+script_list_item_size = (script_names.index('Unknown') + 31) // 32
+last_script_extension = ""
+scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)
 
 # With the addition of the Script Extensions field, we needed some padding to
 # get the Unicode records up to 12 bytes (multiple of 4). The script extension
 # value is now an 8-bit offset and 8 bits are used for the bidi class, so the
 # padding is a 16-bit field; padding_dummy[0] is set to 256 to ensure this.
 
 padding_dummy = [0] * MAX_UNICODE
-padding_dummy[0] = 0
+padding_dummy[0] = 256
 
 # This block of code was added by PH in September 2012. It scans the other_case
 # table to find sets of more than two characters that must all match each other
@@ -806,24 +775,19 @@ def write_records(records, record_size):
 const uint32_t PRIV(ucd_script_sets)[] = {
 """)
 
-bitword_count = len(script_names)/32 + 1
-bitwords = [0] * int(bitword_count)
 
 for d in script_lists:
-  if d == 0:
-    s = " "
-    f.write("  ")
-    for x in bitwords:
-      f.write("%s" % s)
-      s = ", "
-      f.write("0x%08xu" % x)
-    f.write(",\n")
-    bitwords = [0] * int(bitword_count)
+  # Each entry of script_lists is a collection of script numbers; emit it as
+  # script_list_item_size 32-bit words forming a bitmap of those scripts.
+  bitwords = [0] * script_list_item_size
 
-  else:
-    x = int(d/32)
-    y = int(d%32)
-    bitwords[x] = bitwords[x] | (1 << y)
+  # Script number idx lives in word idx // 32 at bit idx % 32. This has to
+  # agree with the MAPBIT macro in pcre2_internal.h, which tests
+  # (map)[(script)/32] & (1u << ((script)%32)).
+  for idx in d:
+    bitwords[idx // 32] |= 1 << (idx % 32)
+
+  s = " "
+  for x in bitwords:
+    f.write("%s" % s)
+    s = ", "
+    f.write("0x%08xu" % x)
+  f.write(",\n")
 
 f.write("};\n\n")
 

diff --git a/maint/GenerateUcpHeader.py b/maint/GenerateUcpHeader.py
@@ -64,8 +64,10 @@
     f.write("  ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1]))
 f.write("};\n\n")
 
-f.write("/* These are the script identifications, additions happen at the end. */\n\nenum {\n")
+f.write("/* These are the script identifications. */\n\nenum {\n  /* Scripts which has characters in other scripts. */\n")
 for i in script_names:
+    if i == "Unknown":
+      f.write("\n  /* Scripts which has no characters in other scripts. */\n")
     f.write("  ucp_%s,\n" % i)
 f.write("\n")
 

diff --git a/maint/GenerateUcpTables.py b/maint/GenerateUcpTables.py
@@ -92,8 +92,12 @@ def stdnames(x):
 # latter is used for the ucp_xx names. NOTE: for the script abbreviations, we
 # still use the full original names.
 
-utt_table  = list(zip(std_script_names, script_names, ['PT_SCX'] * len(script_names)))
-utt_table += list(zip(std_script_abbrevs, script_names, ['PT_SCX'] * len(script_abbrevs)))
+scx_end = script_names.index('Unknown')
+
+utt_table  = list(zip(std_script_names[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))
+utt_table += list(zip(std_script_names[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))
+utt_table += list(zip(std_script_abbrevs[0:scx_end], script_names[0:scx_end], ['PT_SCX'] * scx_end))
+utt_table += list(zip(std_script_abbrevs[scx_end:], script_names[scx_end:], ['PT_SC'] * (len(script_names) - scx_end)))
 
 # At least one script abbreviation is the same as the full name of the script,
 # so we must remove duplicates. It doesn't matter if this operation changes the

diff --git a/src/pcre2_auto_possess.c b/src/pcre2_auto_possess.c
@@ -200,7 +200,6 @@ check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
   BOOL negated)
 {
 BOOL ok;
-int scriptx;
 const uint32_t *p;
 const ucd_record *prop = GET_UCD(c);
 
@@ -221,10 +220,8 @@ switch(ptype)
   return (pdata == prop->script) == negated;
 
   case PT_SCX:
-  scriptx = prop->scriptx;
-  ok = pdata == prop->script || pdata == (unsigned int)scriptx;
-  if (!ok && scriptx < 0)
-    ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, pdata) != 0;
+  ok = (pdata == prop->script
+        || MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, pdata) != 0);
   return ok == negated;
 
   /* These are specials */

diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
@@ -2206,13 +2206,23 @@ while (bot < top)
     {
     *pdataptr = PRIV(utt)[i].value;
     if (vptr == NULL || ptscript == PT_NOTSCRIPT)
+      {
       *ptypeptr = PRIV(utt)[i].type;
-    else
+      return TRUE;
+      }
+
+    switch (PRIV(utt)[i].type)
       {
-      if (PRIV(utt)[i].type != PT_SCX) break;  /* Non-script found */
+      case PT_SC:
+      *ptypeptr = PT_SC;
+      return TRUE;
+
+      case PT_SCX:
       *ptypeptr = ptscript;
+      return TRUE;
       }
-    return TRUE;
+
+    break;  /* Non-script found */
     }
 
   if (r > 0) bot = i + 1; else top = i;

diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c
@@ -1194,9 +1194,8 @@ for (;;)
           break;
 
           case PT_SCX:
-          OK = prop->script == code[2] || prop->scriptx == (int)code[2];
-          if (!OK && prop->scriptx < 0)
-            OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[2]) != 0;
+          OK = (prop->script == code[2] ||
+                MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, code[2]) != 0);
           break;
 
           /* These are specials for combination cases. */
@@ -1466,9 +1465,8 @@ for (;;)
           break;
 
           case PT_SCX:
-          OK = prop->script == code[3] || prop->scriptx == (int)code[3];
-          if (!OK && prop->scriptx < 0)
-            OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[3]) != 0;
+          OK = (prop->script == code[3] ||
+                MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, code[3]) != 0);
           break;
 
           /* These are specials for combination cases. */
@@ -1721,9 +1719,8 @@ for (;;)
           break;
 
           case PT_SCX:
-          OK = prop->script == code[3] || prop->scriptx == (int)code[3];
-          if (!OK && prop->scriptx < 0)
-            OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx, code[3]) != 0;
+          OK = (prop->script == code[3] ||
+                MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, code[3]) != 0);
           break;
 
           /* These are specials for combination cases. */
@@ -2001,11 +1998,9 @@ for (;;)
           break;
 
           case PT_SCX:
-          OK = prop->script == code[1 + IMM2_SIZE + 2] ||
-               prop->scriptx == (int)code[1 + IMM2_SIZE + 2];
-          if (!OK && prop->scriptx < 0)
-            OK = MAPBIT(PRIV(ucd_script_sets) - prop->scriptx,
-              code[1 + IMM2_SIZE + 2]) != 0;
+          OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
+                MAPBIT(PRIV(ucd_script_sets) + prop->scriptx,
+                  code[1 + IMM2_SIZE + 2]) != 0);
           break;
 
           /* These are specials for combination cases. */

diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
@@ -1822,9 +1822,9 @@ typedef struct {
   uint8_t gbprop;     /* ucp_gbControl, etc. (grapheme break property) */
   uint8_t caseset;    /* offset to multichar other cases or zero */
   int32_t other_case; /* offset to other case, or zero if none */
-  int16_t scriptx;    /* script extension value */
+  uint8_t scriptx;    /* script extension value */
   uint8_t bidi;       /* bidi class and control flag */
-  uint8_t dummy;      /* spare - to round to multiple of 4 bytes */
+  uint16_t dummy;     /* spare - to round to multiple of 4 bytes */
 } ucd_record;
 
 /* UCD access macros */
@@ -1849,8 +1849,8 @@ typedef struct {
 #define UCD_OTHERCASE(ch)   ((uint32_t)((int)ch + (int)(GET_UCD(ch)->other_case)))
 #define UCD_SCRIPTX(ch)     GET_UCD(ch)->scriptx
 
-/* The "scriptx" field, when negative, gives an offset into a vector of 32-bit
-words that form a bitmap representing a list of scripts. This macro tests for a
+/* The "scriptx" field gives an offset into a vector of 32-bit words that
+form a bitmap representing a list of scripts. This macro tests for a
 script in the map by number. */
 
 #define MAPBIT(map,script) ((map)[(script)/32]&(1u<<((script)%32)))

diff --git a/src/pcre2_match.c b/src/pcre2_match.c
@@ -2454,11 +2454,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
 
         case PT_SCX:
           {
-          int scriptx = prop->scriptx;
-          BOOL ok = Fecode[2] == prop->script ||
-                    Fecode[2] == (unsigned int)scriptx;
-          if (!ok && scriptx < 0)
-            ok = MAPBIT((PRIV(ucd_script_sets) - scriptx), Fecode[2]) != 0;
+          BOOL ok = (Fecode[2] == prop->script ||
+                     MAPBIT((PRIV(ucd_script_sets) + prop->scriptx), Fecode[2]) != 0);
           if (ok == notmatch) RRETURN(MATCH_NOMATCH);
           }
         break;
@@ -2728,7 +2725,6 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
           for (i = 1; i <= Lmin; i++)
             {
             BOOL ok;
-            int scriptx;
             const ucd_record *prop;
             if (Feptr >= mb->end_subject)
               {
@@ -2737,10 +2733,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
               }
             GETCHARINCTEST(fc, Feptr);
             prop = GET_UCD(fc);
-            scriptx = prop->scriptx;
-            ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue;
-            if (!ok && scriptx < 0)
-              ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0;
+            ok = (prop->script == Lpropvalue ||
+                  MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, Lpropvalue) != 0);
             if (ok == notmatch)
               RRETURN(MATCH_NOMATCH);
             }
@@ -3521,7 +3515,6 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
           for (;;)
             {
             BOOL ok;
-            int scriptx;
             const ucd_record *prop;
             RMATCH(Fecode, RM225);
             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@@ -3533,10 +3526,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
               }
             GETCHARINCTEST(fc, Feptr);
             prop = GET_UCD(fc);
-            scriptx = prop->scriptx;
-            ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue;
-            if (!ok && scriptx < 0)
-              ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0;
+            ok = (prop->script == Lpropvalue
+                  || MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, Lpropvalue) != 0);
             if (ok == (Lctype == OP_NOTPROP))
               RRETURN(MATCH_NOMATCH);
             }
@@ -4104,7 +4095,6 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
             {
             BOOL ok;
             const ucd_record *prop;
-            int scriptx;
             int len = 1;
             if (Feptr >= mb->end_subject)
               {
@@ -4113,10 +4103,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
               }
             GETCHARLENTEST(fc, Feptr, len);
             prop = GET_UCD(fc);
-            scriptx = prop->scriptx;
-            ok = prop->script == Lpropvalue || scriptx == (int)Lpropvalue;
-            if (!ok && scriptx < 0)
-              ok = MAPBIT(PRIV(ucd_script_sets) - scriptx, Lpropvalue) != 0;
+            ok = (prop->script == Lpropvalue ||
+                  MAPBIT(PRIV(ucd_script_sets) + prop->scriptx, Lpropvalue) != 0);
             if (ok == notmatch) break;
             Feptr+= len;
             }