From c9599e05d7ed1d8e64aa0184259795f6b1633aa5 Mon Sep 17 00:00:00 2001 From: "K.Kosako" Date: Fri, 27 Nov 2020 10:13:06 +0900 Subject: [PATCH] implement ONIGENC_CASE_FOLD_ASCII_ONLY --- src/cp1251.c | 8 +++-- src/iso8859_1.c | 94 ++++++++++++++++++++++++++---------------------- src/iso8859_10.c | 6 +++- src/iso8859_13.c | 6 +++- src/iso8859_14.c | 6 +++- src/iso8859_15.c | 6 +++- src/iso8859_16.c | 6 +++- src/iso8859_2.c | 6 +++- src/iso8859_3.c | 6 +++- src/iso8859_4.c | 6 +++- src/iso8859_5.c | 8 +++-- src/iso8859_7.c | 8 +++-- src/iso8859_9.c | 6 +++- src/koi8.c | 8 +++-- src/koi8_r.c | 6 +++- src/regcomp.c | 3 +- src/regenc.c | 17 ++++++--- src/regparse.c | 10 +++++- src/unicode.c | 93 +++++++++++++++++++++++++++++------------------ 19 files changed, 208 insertions(+), 101 deletions(-) diff --git a/src/cp1251.c b/src/cp1251.c index fa207809..a73e541f 100644 --- a/src/cp1251.c +++ b/src/cp1251.c @@ -105,12 +105,16 @@ static const unsigned short EncCP1251_CtypeTable[256] = { }; static int -cp1251_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +cp1251_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_CP1251_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_CP1251_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_1.c b/src/iso8859_1.c index d75509e4..7e3ab1bc 100644 --- a/src/iso8859_1.c +++ b/src/iso8859_1.c @@ -114,7 +114,7 @@ apply_all_case_fold(OnigCaseFoldType flag, } static int -get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, +get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { @@ -123,7 +123,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, if (0x41 <= *p && *p <= 0x5a) { if (*p == LARGE_S && end > p + 1 - && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ + && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* SS */ ss_combination: items[0].byte_len = 2; items[0].code_len = 1; @@ -152,7 +153,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, } else if (0x61 <= *p && *p <= 0x7a) { if (*p == SMALL_S && end > p + 1 - && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { /* ss */ + && (*(p+1) == SMALL_S || *(p+1) == LARGE_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* ss */ goto ss_combination; } @@ -161,56 +163,58 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } - else if (0xc0 <= *p && *p <= 0xcf) { - items[0].byte_len = 1; - items[0].code_len = 1; - items[0].code[0] = (OnigCodePoint )(*p + 0x20); - return 1; - } - else if (0xd0 <= *p && *p <= 0xdf) { - if (*p == 0xdf) { + else if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { + if (0xc0 <= *p && *p <= 0xcf) { items[0].byte_len = 1; - items[0].code_len = 2; - items[0].code[0] = (OnigCodePoint )'s'; - items[0].code[1] = (OnigCodePoint )'s'; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p + 0x20); + return 1; + } + else if (0xd0 <= *p && *p <= 0xdf) { + if (*p == 0xdf) { + items[0].byte_len = 1; + items[0].code_len = 2; + items[0].code[0] = (OnigCodePoint )'s'; + items[0].code[1] = (OnigCodePoint )'s'; - items[1].byte_len = 1; - items[1].code_len = 2; - items[1].code[0] = (OnigCodePoint )'S'; - items[1].code[1] = (OnigCodePoint )'S'; + items[1].byte_len = 1; + items[1].code_len = 2; + items[1].code[0] = (OnigCodePoint )'S'; + items[1].code[1] = (OnigCodePoint )'S'; - items[2].byte_len = 1; - items[2].code_len = 2; - items[2].code[0] = (OnigCodePoint )'s'; - items[2].code[1] = (OnigCodePoint )'S'; + items[2].byte_len = 1; + items[2].code_len = 2; + items[2].code[0] = (OnigCodePoint )'s'; + items[2].code[1] = (OnigCodePoint )'S'; - items[3].byte_len = 1; - items[3].code_len = 2; - items[3].code[0] = (OnigCodePoint )'S'; - items[3].code[1] = (OnigCodePoint )'s'; + items[3].byte_len = 1; + items[3].code_len = 2; + items[3].code[0] = (OnigCodePoint )'S'; + items[3].code[1] = (OnigCodePoint )'s'; - return 4; - } - else if (*p != 0xd7) { - items[0].byte_len = 1; - items[0].code_len = 1; - items[0].code[0] = (OnigCodePoint )(*p + 0x20); - return 1; + return 4; + } + else if (*p != 0xd7) { + items[0].byte_len = 1; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p + 0x20); + return 1; + } } - } - else if (0xe0 <= *p && *p <= 0xef) { - items[0].byte_len = 1; - items[0].code_len = 1; - items[0].code[0] = (OnigCodePoint )(*p - 0x20); - return 1; - } - else if (0xf0 <= *p && *p <= 0xfe) { - if (*p != 0xf7) { + else if (0xe0 <= *p && *p <= 0xef) { items[0].byte_len = 1; items[0].code_len = 1; items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } + else if (0xf0 <= *p && *p <= 0xfe) { + if (*p != 0xf7) { + items[0].byte_len = 1; + items[0].code_len = 1; + items[0].code[0] = (OnigCodePoint )(*p - 0x20); + return 1; + } + } } return 0; @@ -229,7 +233,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 2; } - *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_10.c b/src/iso8859_10.c index e98cffb8..6919a9ab 100644 --- a/src/iso8859_10.c +++ b/src/iso8859_10.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_13.c b/src/iso8859_13.c index 2bd460fe..648bc404 100644 --- a/src/iso8859_13.c +++ b/src/iso8859_13.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_14.c b/src/iso8859_14.c index 5030b557..d172a37c 100644 --- a/src/iso8859_14.c +++ b/src/iso8859_14.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_15.c b/src/iso8859_15.c index f32c3ded..f6e62065 100644 --- a/src/iso8859_15.c +++ b/src/iso8859_15.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_16.c b/src/iso8859_16.c index 22a653a5..d5682515 100644 --- a/src/iso8859_16.c +++ b/src/iso8859_16.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_2.c b/src/iso8859_2.c index dc3d0a14..833b94da 100644 --- a/src/iso8859_2.c +++ b/src/iso8859_2.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_3.c b/src/iso8859_3.c index 49dc6b20..8e529729 100644 --- a/src/iso8859_3.c +++ b/src/iso8859_3.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, return 2; } - *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_4.c b/src/iso8859_4.c index f3f6ba96..a14e6d58 100644 --- a/src/iso8859_4.c +++ b/src/iso8859_4.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; /* return byte length of converted char to lower */ } diff --git a/src/iso8859_5.c b/src/iso8859_5.c index a5f587c7..0399af9c 100644 --- a/src/iso8859_5.c +++ b/src/iso8859_5.c @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_5_CtypeTable[256] = { }; static int -mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_7.c b/src/iso8859_7.c index 018efac7..22ac2238 100644 --- a/src/iso8859_7.c +++ b/src/iso8859_7.c @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_7_CtypeTable[256] = { }; static int -mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/iso8859_9.c b/src/iso8859_9.c index 1f9bdead..f0080d2e 100644 --- a/src/iso8859_9.c +++ b/src/iso8859_9.c @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, return 2; } - *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/koi8.c b/src/koi8.c index 37023c69..bdd39cbc 100644 --- a/src/koi8.c +++ b/src/koi8.c @@ -105,12 +105,16 @@ static const unsigned short EncKOI8_CtypeTable[256] = { static int -koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +koi8_mbc_case_fold(OnigCaseFoldType flag, const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower) { const UChar* p = *pp; - *lower = ENC_KOI8_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_KOI8_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/koi8_r.c b/src/koi8_r.c index c77302fc..61117776 100644 --- a/src/koi8_r.c +++ b/src/koi8_r.c @@ -109,7 +109,11 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, { const UChar* p = *pp; - *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) + *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); + else + *lower = *p; + (*pp)++; return 1; } diff --git a/src/regcomp.c b/src/regcomp.c index 08487630..74a5bf08 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -771,7 +771,8 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, StrNode* sn = STR_(node); UChar *s = sn->s; - if (NODE_IS_REAL_IGNORECASE(node)) { + if (NODE_IS_REAL_IGNORECASE(node) && + CASE_FOLD_IS_NOT_ASCII_ONLY(env->case_fold_flag)) { /* Such a case is possible. ex. /(?i)(?<=\1)(a)/ Backref node refer to capture group, but it doesn't tune yet. diff --git a/src/regenc.c b/src/regenc.c index 27e45493..a6b174ca 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -569,6 +569,9 @@ onigenc_apply_all_case_fold_with_map(int map_size, r = onigenc_ascii_apply_all_case_fold(flag, f, arg); if (r != 0) return r; + if (CASE_FOLD_IS_ASCII_ONLY(flag)) + return 0; + for (i = 0; i < map_size; i++) { code = map[i].to; r = (*f)(map[i].from, &code, 1, arg); @@ -588,7 +591,7 @@ onigenc_apply_all_case_fold_with_map(int map_size, extern int onigenc_get_case_fold_codes_by_str_with_map(int map_size, const OnigPairCaseFoldCodes map[], - int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED, + int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]) { int i, j, n; @@ -596,7 +599,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, if (0x41 <= *p && *p <= 0x5a) { /* A - Z */ if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1 - && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ + && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) /* SS */ + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { ss_combination: items[0].byte_len = 2; items[0].code_len = 1; @@ -625,7 +629,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, } else if (0x61 <= *p && *p <= 0x7a) { /* a - z */ if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1 - && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { + && (*(p+1) == SMALL_S || *(p+1) == LARGE_S) + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { goto ss_combination; } @@ -634,7 +639,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, items[0].code[0] = (OnigCodePoint )(*p - 0x20); return 1; } - else if (*p == 0xdf && ess_tsett_flag != 0) { + else if (*p == 0xdf && ess_tsett_flag != 0 + && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { items[0].byte_len = 1; items[0].code_len = 2; items[0].code[0] = (OnigCodePoint )'s'; @@ -660,6 +666,9 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size, else { int i; + if (CASE_FOLD_IS_ASCII_ONLY(flag)) + return 0; + for (i = 0; i < map_size; i++) { if (*p == map[i].from) { items[0].byte_len = 1; diff --git a/src/regparse.c b/src/regparse.c index 343650bd..c8dbd75d 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -8268,7 +8268,8 @@ typedef struct { } IApplyCaseFoldArg; static int -i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) +i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, + void* arg) { IApplyCaseFoldArg* iarg; ScanEnv* env; @@ -8278,6 +8279,13 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) env = iarg->env; cc = iarg->cc; +#if 0 + if (CASE_FOLD_IS_ASCII_ONLY(env->case_fold_flag)) { + if (! ONIGENC_IS_ASCII_CODE(from) || to_len != 1 || ! ONIGENC_IS_ASCII_CODE(*to)) + return 0; + } +#endif + if (to_len == 1) { int is_in = onig_is_code_in_cc(env->enc, from, cc); #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS diff --git a/src/unicode.c b/src/unicode.c index 6703d4b9..2fd9b204 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -77,9 +77,8 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = { #include "unicode_fold_data.c" extern int -onigenc_unicode_mbc_case_fold(OnigEncoding enc, - OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, - UChar* fold) +onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag, + const UChar** pp, const UChar* end, UChar* fold) { const struct ByUnfoldKey* buk; @@ -104,23 +103,27 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, } #endif - buk = onigenc_unicode_unfold_key(code); - if (buk != 0) { - if (buk->fold_len == 1) { - return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold); - } - else { - OnigCodePoint* addr; - - FOLDS_FOLD_ADDR_BUK(buk, addr); - rlen = 0; - for (i = 0; i < buk->fold_len; i++) { - OnigCodePoint c = addr[i]; - len = ONIGENC_CODE_TO_MBC(enc, c, fold); - fold += len; - rlen += len; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(code)) { + buk = onigenc_unicode_unfold_key(code); + if (buk != 0) { + if (buk->fold_len == 1) { + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk->index))) + return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold); + } + else { + OnigCodePoint* addr; + + FOLDS_FOLD_ADDR_BUK(buk, addr); + rlen = 0; + for (i = 0; i < buk->fold_len; i++) { + OnigCodePoint c = addr[i]; + len = ONIGENC_CODE_TO_MBC(enc, c, fold); + fold += len; + rlen += len; + } + return rlen; } - return rlen; } } @@ -131,16 +134,22 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc, } static int -apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg) +apply_case_fold1(OnigCaseFoldType flag, int from, int to, + OnigApplyAllCaseFoldFunc f, void* arg) { int i, j, k, n, r; for (i = from; i < to; ) { OnigCodePoint fold = *FOLDS1_FOLD(i); + if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(fold)) break; + n = FOLDS1_UNFOLDS_NUM(i); for (j = 0; j < n; j++) { OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j]; + if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(unfold)) + continue; + r = (*f)(fold, &unfold, 1, arg); if (r != 0) return r; r = (*f)(unfold, &fold, 1, arg); @@ -148,6 +157,9 @@ apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg) for (k = 0; k < j; k++) { OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k]; + if (CASE_FOLD_IS_ASCII_ONLY(flag) && + ! ONIGENC_IS_ASCII_CODE(unfold2)) continue; + r = (*f)(unfold, &unfold2, 1, arg); if (r != 0) return r; r = (*f)(unfold2, &unfold, 1, arg); @@ -225,7 +237,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, { int r; - r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg); + r = apply_case_fold1(flag, 0, FOLDS1_NORMAL_END_INDEX, f, arg); if (r != 0) return r; #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -246,7 +258,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, } else { #endif - r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg); + r = apply_case_fold1(flag, FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg); if (r != 0) return r; #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI } @@ -288,6 +300,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, n = 0; code = ONIGENC_MBC_TO_CODE(enc, p, end); + if (CASE_FOLD_IS_ASCII_ONLY(flag)) { + if (! ONIGENC_IS_ASCII_CODE(code)) return n; + } len = enclen(enc, p); #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -449,19 +464,26 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, if (buk1 != 0) { if (buk1->fold_len == 1) { int un; - items[0].byte_len = lens[0]; - items[0].code_len = 1; - items[0].code[0] = *FOLDS1_FOLD(buk1->index); - n++; + + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk1->index))) { + items[0].byte_len = lens[0]; + items[0].code_len = 1; + items[0].code[0] = *FOLDS1_FOLD(buk1->index); + n++; + } un = FOLDS1_UNFOLDS_NUM(buk1->index); for (i = 0; i < un; i++) { OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i]; if (unfold != orig_codes[0]) { - items[n].byte_len = lens[0]; - items[n].code_len = 1; - items[n].code[0] = unfold; - n++; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || + ONIGENC_IS_ASCII_CODE(unfold)) { + items[n].byte_len = lens[0]; + items[n].code_len = 1; + items[n].code[0] = unfold; + n++; + } } } } @@ -548,10 +570,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { - items[n].byte_len = lens[0]; - items[n].code_len = 1; - items[n].code[0] = FOLDS1_UNFOLDS(index)[i]; - n++; + code = FOLDS1_UNFOLDS(index)[i]; + if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)||ONIGENC_IS_ASCII_CODE(code)) { + items[n].byte_len = lens[0]; + items[n].code_len = 1; + items[n].code[0] = code; + n++; + } } } }