Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Auto generate unicode property tests. #67

Merged
merged 1 commit into from
Dec 31, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions RunTest
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ title22="Test 22: \C tests with UTF (not supported for DFA matching)"
title23="Test 23: \C disabled test"
title24="Test 24: Non-UTF pattern conversion tests"
title25="Test 25: UTF pattern conversion tests"
maxtest=25
title26="Test 26: Auto-generated unicode property tests"
maxtest=26

if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title0
Expand Down Expand Up @@ -109,6 +110,7 @@ if [ $# -eq 1 -a "$1" = "list" ]; then
echo $title23
echo $title24
echo $title25
echo $title26
exit 0
fi

Expand Down Expand Up @@ -238,6 +240,7 @@ do22=no
do23=no
do24=no
do25=no
do26=no

while [ $# -gt 0 ] ; do
case $1 in
Expand Down Expand Up @@ -267,6 +270,7 @@ while [ $# -gt 0 ] ; do
23) do23=yes;;
24) do24=yes;;
25) do25=yes;;
26) do26=yes;;
-8) arg8=yes;;
-16) arg16=yes;;
-32) arg32=yes;;
Expand Down Expand Up @@ -417,7 +421,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
$do12 = no -a $do13 = no -a $do14 = no -a $do15 = no -a \
$do16 = no -a $do17 = no -a $do18 = no -a $do19 = no -a \
$do20 = no -a $do21 = no -a $do22 = no -a $do23 = no -a \
$do24 = no -a $do25 = no \
$do24 = no -a $do25 = no -a $do26 = no \
]; then
do0=yes
do1=yes
Expand Down Expand Up @@ -445,6 +449,7 @@ if [ $do0 = no -a $do1 = no -a $do2 = no -a $do3 = no -a \
do23=yes
do24=yes
do25=yes
do26=yes
fi

# Handle any explicit skips at this stage, so that an argument list may consist
Expand Down Expand Up @@ -863,6 +868,20 @@ for bmode in "$test8" "$test16" "$test32"; do
fi
fi

# Auto-generated unicode property tests (test 26). The input file used here,
# testinput26, is the one written by maint/GenerateTest26.py.

if [ $do26 = yes ] ; then
echo $title26
# The generated patterns all use the /utf modifier, so the test is skipped
# when $utf is 0.
if [ $utf -eq 0 ] ; then
echo " Skipped because UTF-$bits support is not available"
else
# Run once with no extra option ("") and once per word in $jitopt.
# ${opt:+$vjs} expands to $vjs only when $opt is non-empty.
for opt in "" $jitopt; do
$sim $valgrind ${opt:+$vjs} ./pcre2test -q $setstack $bmode $opt $testdata/testinput26 testtry
# NOTE(review): checkresult is defined outside this hunk; presumably it
# compares testtry with the expected output for test 26 - confirm.
checkresult $? 26 "$opt"
done
fi
fi

# End of loop for 8/16/32-bit tests
done

Expand Down
188 changes: 188 additions & 0 deletions maint/GenerateTest26.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This script auto-generates the Unicode property tests and their expected
# output. Re-run it whenever the Unicode data files are updated. The generated
# files are named `testinput26` and `testoutput26`.

import re
import sys

from GenerateCommon import \
script_names, \
script_abbrevs

def write_both(text):
    """Append *text* to both generated files.

    The text is written to the module-level ``input_file`` first and then to
    ``output_file``, so anything emitted through this helper appears
    identically in the test input and in the expected output.
    """
    for sink in (input_file, output_file):
        sink.write(text)

def to_string_char(ch_idx):
    """Return the textual form of code point *ch_idx* used in the test files.

    Code points 32..127 are returned as the literal character. Everything
    else is returned as a ``\\x{...}`` escape with lowercase hex digits;
    values below 16 get a leading zero so the escape has two digits.
    """
    if 32 <= ch_idx < 128:
        return chr(ch_idx)

    # Only single-digit values need the zero padding.
    template = "\\x{0%x}" if ch_idx < 16 else "\\x{%x}"
    return template % ch_idx

# Prefix prepended to the generated file names. The empty string means the
# files are created in the current working directory.
output_directory = ""

# The only accepted command-line argument is an optional output directory.
if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)
if len(sys.argv) == 2:
    output_directory = sys.argv[1]
    # Only add the separator to a non-empty name: turning an empty argument
    # into "/" would redirect the output to the filesystem root instead of
    # the current directory.
    if output_directory and not output_directory.endswith("/"):
        output_directory += "/"

# Both files are opened for writing: "testinput26" receives the test input
# and "testoutput26" receives the same text plus the expected results. The
# handles stay open for the remainder of the script and are used through
# write_both() and direct output_file.write() calls.
try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")

def _parse_code_point_range(match_obj):
    """Return the inclusive (low, high) code point range of a matched line.

    Group 1 of the match is the first code point and optional group 2 the
    last one, both in hexadecimal; a line without group 2 describes a single
    code point.
    """
    low = int(match_obj.group(1), 16)
    last = match_obj.group(2)
    high = low if last is None else int(last, 16)
    return low, high


def _write_property_test(property_spec, ch_idx, expect_match=True):
    """Emit one test: a ``\\p{...}`` pattern, a subject, and its expected result.

    The pattern and subject lines go to both files; the expected result
    (the matched text, or "No match") goes to the output file only.
    """
    write_both("/^\\p{%s}/utf\n" % property_spec)
    subject = to_string_char(ch_idx)
    write_both(" %s\n" % subject)
    if expect_match:
        output_file.write(" 0: %s\n" % subject)
    else:
        output_file.write("No match\n")
    write_both("\n")


def gen_script_tests():
    """Generate the script and script-extension property tests.

    Reads ``Unicode.tables/Scripts.txt`` and
    ``Unicode.tables/ScriptExtensions.txt`` (paths are relative to the
    current working directory) and, for every script in ``script_names``
    except "Unknown", writes tests for:

    * the first and last code point listed for the script in Scripts.txt;
    * the lowest and highest code point listed for it in
      ScriptExtensions.txt, when it appears there;
    * one code point that belongs to the script only through its extensions
      (it must match ``\\p{name}`` but not ``\\p{sc=name}``), when one exists;
    * the code point just above the script's highest one, which must not
      match.

    Raises ValueError (from ``list.index``) if a data file names a script or
    abbreviation missing from ``script_names`` / ``script_abbrevs``.
    """
    # One record per script, indexed like script_names:
    #   [0] first base code point     [1] last base code point
    #   [2] lowest extension point    [3] highest extension point
    #   [4] a code point that is in the extensions but has another base script
    script_data = [None] * len(script_names)
    # Base script name of every code point (None when not listed).
    char_data = [None] * 0x110000

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            # Consecutive lines usually share a script, so only look the
            # index up again when the name changes.
            name = match_obj.group(3)
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low, high = _parse_code_point_range(match_obj)
            char_data[low] = name
            for code_point in range(low + 1, high + 1):
                char_data[code_point] = name

            # The first range seen supplies slot 0; slot 1 always tracks the
            # most recent range, i.e. the last one in the file.
            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            script_data[script_idx][1] = high

    # Cache of abbreviation -> script index for abbreviations already seen.
    extended_script_indices = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)
            if match_obj is None:
                continue

            low, high = _parse_code_point_range(match_obj)

            # Group 3 is a space-separated list of script abbreviations.
            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indices:
                    idx = script_abbrevs.index(abbrev)
                    extended_script_indices[abbrev] = idx
                    rec = script_data[idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    idx = extended_script_indices[abbrev]
                    rec = script_data[idx]
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                # Remember the first code point of this range whose base
                # script differs: it is in the script via extension only.
                if rec[4] is None:
                    name = script_names[idx]
                    for code_point in range(low, high + 1):
                        if char_data[code_point] != name:
                            rec[4] = code_point
                            break

    # Alternate between the short and long property names so that both
    # spellings are exercised.
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]
        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        _write_property_test("sc=%s" % script_name, rec[0])
        _write_property_test("Script=%s" % script_abbrev, rec[1])

        if rec[2] is not None:
            property_name = "Script_Extensions" if long_property_name else "scx"

            write_both("# Script extension check\n")
            _write_property_test(script_name, rec[2])
            _write_property_test("%s=%s" % (property_name, script_abbrev), rec[3])

            long_property_name = not long_property_name

            if rec[4] is not None:
                write_both("# Script extension only character\n")
                _write_property_test(script_name, rec[4])
                _write_property_test("sc=%s" % script_name, rec[4], expect_match=False)
            else:
                print("External character has not found for %s" % script_name)

        # NOTE(review): assumes no tested script ends at U+10FFFF, otherwise
        # high + 1 is not a valid code point - confirm against the data.
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        _write_property_test(script_name, high + 1, expect_match=False)


# Emit all script and script-extension tests into both generated files.
gen_script_tests()

# Closing marker line; write_both() puts it in the input and the output file.
write_both("# End of testinput26\n")
Loading