Skip to content

Commit

Permalink
implement ONIGENC_CASE_FOLD_ASCII_ONLY
Browse files Browse the repository at this point in the history
  • Loading branch information
kkos committed Nov 28, 2020
1 parent b3cc984 commit c9599e0
Show file tree
Hide file tree
Showing 19 changed files with 208 additions and 101 deletions.
8 changes: 6 additions & 2 deletions src/cp1251.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,16 @@ static const unsigned short EncCP1251_CtypeTable[256] = {
};

static int
cp1251_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
cp1251_mbc_case_fold(OnigCaseFoldType flag,
const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
{
const UChar* p = *pp;

*lower = ENC_CP1251_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_CP1251_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
94 changes: 51 additions & 43 deletions src/iso8859_1.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ apply_all_case_fold(OnigCaseFoldType flag,
}

static int
get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
get_case_fold_codes_by_str(OnigCaseFoldType flag,
const OnigUChar* p, const OnigUChar* end,
OnigCaseFoldCodeItem items[])
{
Expand All @@ -123,7 +123,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,

if (0x41 <= *p && *p <= 0x5a) {
if (*p == LARGE_S && end > p + 1
&& (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */
&& (*(p+1) == LARGE_S || *(p+1) == SMALL_S)
&& CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* SS */
ss_combination:
items[0].byte_len = 2;
items[0].code_len = 1;
Expand Down Expand Up @@ -152,7 +153,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
}
else if (0x61 <= *p && *p <= 0x7a) {
if (*p == SMALL_S && end > p + 1
&& (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { /* ss */
&& (*(p+1) == SMALL_S || *(p+1) == LARGE_S)
&& CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* ss */
goto ss_combination;
}

Expand All @@ -161,56 +163,58 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
return 1;
}
else if (0xc0 <= *p && *p <= 0xcf) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p + 0x20);
return 1;
}
else if (0xd0 <= *p && *p <= 0xdf) {
if (*p == 0xdf) {
else if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {
if (0xc0 <= *p && *p <= 0xcf) {
items[0].byte_len = 1;
items[0].code_len = 2;
items[0].code[0] = (OnigCodePoint )'s';
items[0].code[1] = (OnigCodePoint )'s';
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p + 0x20);
return 1;
}
else if (0xd0 <= *p && *p <= 0xdf) {
if (*p == 0xdf) {
items[0].byte_len = 1;
items[0].code_len = 2;
items[0].code[0] = (OnigCodePoint )'s';
items[0].code[1] = (OnigCodePoint )'s';

items[1].byte_len = 1;
items[1].code_len = 2;
items[1].code[0] = (OnigCodePoint )'S';
items[1].code[1] = (OnigCodePoint )'S';
items[1].byte_len = 1;
items[1].code_len = 2;
items[1].code[0] = (OnigCodePoint )'S';
items[1].code[1] = (OnigCodePoint )'S';

items[2].byte_len = 1;
items[2].code_len = 2;
items[2].code[0] = (OnigCodePoint )'s';
items[2].code[1] = (OnigCodePoint )'S';
items[2].byte_len = 1;
items[2].code_len = 2;
items[2].code[0] = (OnigCodePoint )'s';
items[2].code[1] = (OnigCodePoint )'S';

items[3].byte_len = 1;
items[3].code_len = 2;
items[3].code[0] = (OnigCodePoint )'S';
items[3].code[1] = (OnigCodePoint )'s';
items[3].byte_len = 1;
items[3].code_len = 2;
items[3].code[0] = (OnigCodePoint )'S';
items[3].code[1] = (OnigCodePoint )'s';

return 4;
}
else if (*p != 0xd7) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p + 0x20);
return 1;
return 4;
}
else if (*p != 0xd7) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p + 0x20);
return 1;
}
}
}
else if (0xe0 <= *p && *p <= 0xef) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
return 1;
}
else if (0xf0 <= *p && *p <= 0xfe) {
if (*p != 0xf7) {
else if (0xe0 <= *p && *p <= 0xef) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
return 1;
}
else if (0xf0 <= *p && *p <= 0xfe) {
if (*p != 0xf7) {
items[0].byte_len = 1;
items[0].code_len = 1;
items[0].code[0] = (OnigCodePoint )(*p - 0x20);
return 1;
}
}
}

return 0;
Expand All @@ -229,7 +233,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
return 2;
}

*lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_10.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}

*lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_13.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}

*lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_14.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}

*lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1; /* return byte length of converted char to lower */
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_15.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}

*lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1; /* return byte length of converted char to lower */
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_16.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}

*lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1; /* return byte length of converted char to lower */
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_2.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}

*lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1; /* return byte length of converted char to lower */
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_3.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,
return 2;
}

*lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_4.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}

*lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1; /* return byte length of converted char to lower */
}
Expand Down
8 changes: 6 additions & 2 deletions src/iso8859_5.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_5_CtypeTable[256] = {
};

static int
mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
mbc_case_fold(OnigCaseFoldType flag,
const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
{
const UChar* p = *pp;

*lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
8 changes: 6 additions & 2 deletions src/iso8859_7.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_7_CtypeTable[256] = {
};

static int
mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
mbc_case_fold(OnigCaseFoldType flag,
const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
{
const UChar* p = *pp;

*lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
6 changes: 5 additions & 1 deletion src/iso8859_9.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,
return 2;
}

*lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
8 changes: 6 additions & 2 deletions src/koi8.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,16 @@ static const unsigned short EncKOI8_CtypeTable[256] = {


static int
koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
koi8_mbc_case_fold(OnigCaseFoldType flag,
const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)
{
const UChar* p = *pp;

*lower = ENC_KOI8_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_KOI8_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
6 changes: 5 additions & 1 deletion src/koi8_r.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,11 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,
{
const UChar* p = *pp;

*lower = ENC_KOI8_R_TO_LOWER_CASE(*p);
if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p))
*lower = ENC_KOI8_R_TO_LOWER_CASE(*p);
else
*lower = *p;

(*pp)++;
return 1;
}
Expand Down
3 changes: 2 additions & 1 deletion src/regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,8 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,
StrNode* sn = STR_(node);
UChar *s = sn->s;

if (NODE_IS_REAL_IGNORECASE(node)) {
if (NODE_IS_REAL_IGNORECASE(node) &&
CASE_FOLD_IS_NOT_ASCII_ONLY(env->case_fold_flag)) {
/* Such a case is possible.
ex. /(?i)(?<=\1)(a)/
Backref node refers to a capture group, but it has not been tuned yet.
Expand Down
Loading

0 comments on commit c9599e0

Please sign in to comment.