Skip to content

Commit

Permalink
Merge branch 'PHP-8.1'
Browse files Browse the repository at this point in the history
* PHP-8.1:
  Reintroduce legacy 'SJIS-win' text encoding in mbstring
  • Loading branch information
alexdowad committed Aug 16, 2022
2 parents c8f6c7d + 371367c commit 5f8993b
Show file tree
Hide file tree
Showing 6 changed files with 215 additions and 23 deletions.
181 changes: 167 additions & 14 deletions ext/mbstring/libmbfl/filters/mbfilter_cp932.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,36 @@
*
*/

/* CP932 is Microsoft's version of Shift-JIS.
*
* What we call "SJIS-win" is a variant of CP932 which maps U+00A5
* and U+203E the same way as eucJP-win; namely, instead of mapping
* U+00A5 (YEN SIGN) to 0x5C and U+203E (OVERLINE) to 0x7E,
* these codepoints are mapped to appropriate JIS X 0208 characters.
*
* When converting from Shift-JIS to Unicode, there is no difference
* between CP932 and "SJIS-win".
*
* Additional facts:
*
* • In the libmbfl library which formed the base for mbstring, "CP932" and
* "SJIS-win" were originally aliases. The differing mappings were added in
* December 2002. The libmbfl author later stated that this was done so that
* "CP932" would comply with a certain specification, while "SJIS-win" would
* maintain the existing mappings. He does not remember which specification
* it was.
* • The WHATWG specification for "Shift_JIS" (followed by web browsers)
* agrees with our mappings for "CP932".
* • Microsoft Windows' "best-fit" mappings for CP932 (via the
* WideCharToMultiByte API) convert U+00A5 to 0x5C, which also agrees with
* our mappings for "CP932".
* • glibc's iconv converts U+203E to CP932 0x7E, which again agrees with
* our mappings for "CP932".
* • When converting Shift-JIS to CP932, the conversion goes through Unicode.
* Shift-JIS 0x7E converts to U+203E, so mapping U+203E to 0x7E means that
* 0x7E will go to 0x7E when converting Shift-JIS to CP932.
*/

#include "mbfilter.h"
#include "mbfilter_cp932.h"

Expand All @@ -36,6 +66,7 @@
static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_cp932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static void mb_wchar_to_sjiswin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);

static const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Expand All @@ -56,7 +87,8 @@ static const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};

static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", "SJIS-win", "SJIS-ms", "SJIS-open", NULL};
static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", NULL};
static const char *mbfl_encoding_sjiswin_aliases[] = {"SJIS-ms", "SJIS-open", NULL};

const mbfl_encoding mbfl_encoding_cp932 = {
mbfl_no_encoding_cp932,
Expand Down Expand Up @@ -91,6 +123,39 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
NULL,
};

const mbfl_encoding mbfl_encoding_sjiswin = {
mbfl_no_encoding_sjiswin,
"SJIS-win",
"Shift_JIS",
mbfl_encoding_sjiswin_aliases,
mblen_table_sjis,
MBFL_ENCTYPE_GL_UNSAFE,
&vtbl_sjiswin_wchar,
&vtbl_wchar_sjiswin,
mb_cp932_to_wchar,
mb_wchar_to_sjiswin
};

const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
mbfl_no_encoding_sjiswin,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_cp932_wchar,
mbfl_filt_conv_cp932_wchar_flush,
NULL,
};

const struct mbfl_convert_vtbl vtbl_wchar_sjiswin = {
mbfl_no_encoding_wchar,
mbfl_no_encoding_sjiswin,
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_wchar_sjiswin,
mbfl_filt_conv_common_flush,
NULL,
};

#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)

#define SJIS_ENCODE(c1,c2,s1,s2) \
Expand Down Expand Up @@ -136,12 +201,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp932 = {
} \
} while (0)


/*
* SJIS-win => wchar
*/
int
mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
int mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter)
{
int c1, s, s1, s2, w;

Expand Down Expand Up @@ -227,18 +287,16 @@ static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter)
return 0;
}

/*
* wchar => SJIS-win
*/
int
mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
int mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
{
int c1, c2, s1, s2;

s1 = 0;
s2 = 0;
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
} else if (c == 0x203E) {
s1 = 0x7E;
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
} else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
Expand All @@ -254,7 +312,7 @@ mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
}
if (s1 <= 0) {
if (c == 0xa5) { /* YEN SIGN */
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
s1 = 0x5C;
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
s1 = 0x2140;
} else if (c == 0x2225) { /* PARALLEL TO */
Expand Down Expand Up @@ -314,6 +372,20 @@ mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
return 0;
}

int mbfl_filt_conv_wchar_sjiswin(int c, mbfl_convert_filter *filter)
{
if (c == 0xA5) {
CK((*filter->output_function)(0x81, filter->data));
CK((*filter->output_function)(0x8F, filter->data));
} else if (c == 0x203E) {
CK((*filter->output_function)(0x81, filter->data));
CK((*filter->output_function)(0x50, filter->data));
} else {
return mbfl_filt_conv_wchar_cp932(c, filter);
}
return 0;
}

static size_t mb_cp932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
{
unsigned char *p = *in, *e = p + *in_len;
Expand Down Expand Up @@ -389,6 +461,87 @@ static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, boo
MB_CONVERT_BUF_LOAD(buf, out, limit);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

while (len--) {
uint32_t w = *in++;
unsigned int s1 = 0, s2 = 0, c1, c2;

if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) {
s1 = ucs_a1_jis_table[w - ucs_a1_jis_table_min];
} else if (w == 0x203E) {
s1 = 0x7E;
} else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) {
s1 = ucs_a2_jis_table[w - ucs_a2_jis_table_min];
} else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) {
s1 = ucs_i_jis_table[w - ucs_i_jis_table_min];
} else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) {
s1 = ucs_r_jis_table[w - ucs_r_jis_table_min];
} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
s1 = w - 0xE000;
c1 = s1/94 + 0x7F;
c2 = s1%94 + 0x21;
s1 = (c1 << 8) | c2;
s2 = 1;
}

if (w == 0xA5) { /* YEN SIGN */
s1 = 0x5C;
} else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
s1 = 0x2140;
} else if (w == 0x2225) { /* PARALLEL TO */
s1 = 0x2142;
} else if (w == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */
s1 = 0x215D;
} else if (w == 0xFFE0) { /* FULLWIDTH CENT SIGN */
s1 = 0x2171;
} else if (w == 0xFFE1) { /* FULLWIDTH POUND SIGN */
s1 = 0x2172;
} else if (w == 0xFFE2) { /* FULLWIDTH NOT SIGN */
s1 = 0x224C;
} else if (w == 0) {
out = mb_convert_buf_add(out, 0);
continue;
}

if (!s1 || (s1 >= 0x8080 && !s2)) { /* not found or X 0212 */
for (unsigned int i = 0; i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
if (cp932ext1_ucs_table[i] == w) {
s1 = ((i/94 + 0x2D) << 8) + (i%94 + 0x21);
goto emit_output;
}
}

for (unsigned int i = 0; i < cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; i++) {
if (cp932ext3_ucs_table[i] == w) {
s1 = ((i/94 + 0x93) << 8) + (i%94 + 0x21);
goto emit_output;
}
}

MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp932);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);
continue;
}

emit_output:
if (s1 < 0x100) {
out = mb_convert_buf_add(out, s1);
} else {
c1 = (s1 >> 8) & 0xFF;
c2 = s1 & 0xFF;
SJIS_ENCODE(c1, c2, s1, s2);
out = mb_convert_buf_add2(out, s1, s2);
}
}

MB_CONVERT_BUF_STORE(buf, out, limit);
}

static void mb_wchar_to_sjiswin(uint32_t *in, size_t len, mb_convert_buf *buf, bool end)
{
unsigned char *out, *limit;
MB_CONVERT_BUF_LOAD(buf, out, limit);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2);

while (len--) {
uint32_t w = *in++;
unsigned int s1 = 0, s2 = 0, c1, c2;
Expand All @@ -403,7 +556,7 @@ static void mb_wchar_to_cp932(uint32_t *in, size_t len, mb_convert_buf *buf, boo
s1 = ucs_r_jis_table[w - ucs_r_jis_table_min];
} else if (w >= 0xE000 && w < (0xE000 + 20*94)) {
s1 = w - 0xE000;
c1 = s1/94 + 0x7f;
c1 = s1/94 + 0x7F;
c2 = s1%94 + 0x21;
s1 = (c1 << 8) | c2;
s2 = 1;
Expand Down
5 changes: 5 additions & 0 deletions ext/mbstring/libmbfl/filters/mbfilter_cp932.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@ extern const mbfl_encoding mbfl_encoding_cp932;
extern const struct mbfl_convert_vtbl vtbl_cp932_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_cp932;

extern const mbfl_encoding mbfl_encoding_sjiswin;
extern const struct mbfl_convert_vtbl vtbl_sjiswin_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_sjiswin;

int mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_sjiswin(int c, mbfl_convert_filter *filter);

#endif /* MBFL_MBFILTER_CP932_H */
1 change: 1 addition & 0 deletions ext/mbstring/libmbfl/mbfl/mbfl_encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
&mbfl_encoding_utf8_kddi_b,
&mbfl_encoding_utf8_sb,
&mbfl_encoding_cp932,
&mbfl_encoding_sjiswin,
&mbfl_encoding_cp51932,
&mbfl_encoding_jis,
&mbfl_encoding_2022jp,
Expand Down
1 change: 1 addition & 0 deletions ext/mbstring/libmbfl/mbfl/mbfl_encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ enum mbfl_no_encoding {
mbfl_no_encoding_sjis_mac,
mbfl_no_encoding_sjis2004,
mbfl_no_encoding_cp932,
mbfl_no_encoding_sjiswin,
mbfl_no_encoding_cp51932,
mbfl_no_encoding_jis,
mbfl_no_encoding_2022jp,
Expand Down
46 changes: 39 additions & 7 deletions ext/mbstring/tests/cp932_encoding.phpt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
--TEST--
Exhaustive test of CP932 encoding verification and conversion
Exhaustive test of CP932 encoding verification and conversion (including 'SJIS-win' variant)
--EXTENSIONS--
mbstring
--SKIPIF--
Expand Down Expand Up @@ -34,8 +34,10 @@ for ($i = 0xF0; $i <= 0xF9; $i++) {
$fromUnicode["\x00\xA2"] = "\x81\x91";
/* U+00A3 is POUND SIGN; convert to FULLWIDTH POUND SIGN */
$fromUnicode["\x00\xA3"] = "\x81\x92";
/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */
$fromUnicode["\x00\xA5"] = "\x81\x8F";
/* U+00A5 is YEN SIGN; convert to 0x5C, which has conflicting uses
* (either as backslash or as Yen sign) */
$fromUnicode["\x00\xA5"] = "\x5C";


/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE)
* But when converting Unicode to CP932, we also accept U+301C (WAVE DASH) */
Expand All @@ -51,12 +53,13 @@ $fromUnicode["\x20\x16"] = "\x81\x61";
* but when converting Unicode to CP932, we also accept U+00AC (NOT SIGN) */
$fromUnicode["\x00\xAC"] = "\x81\xCA";

/* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */
$fromUnicode["\x20\x3E"] = "\x81\x50";

/* U+00AF is MACRON; it can also go to FULLWIDTH MACRON */
/* U+00AF is MACRON; convert to FULLWIDTH MACRON */
$fromUnicode["\x00\xAF"] = "\x81\x50";

/* U+203E is OVERLINE; convert to 0x7E, which has conflicting uses
* (either as tilde or as overline) */
$fromUnicode["\x20\x3E"] = "\x7E";

findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));

findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
Expand Down Expand Up @@ -106,17 +109,46 @@ echo "CP932 verification and conversion works on all invalid characters\n";
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
echo "Unicode -> CP932 conversion works on all invalid codepoints\n";

/* Now test 'SJIS-win' variant of CP932, which is really CP932 but with
* two different mappings
* Instead of mapping U+00A5 and U+203E to the single bytes 0x5C and 07E
* (which have conflicting uses), 'SJIS-win' maps them to appropriate
* JIS X 0208 characters */

/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */
$fromUnicode["\x00\xA5"] = "\x81\x8F";
/* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */
$fromUnicode["\x20\x3E"] = "\x81\x50";

testAllValidChars($validChars, 'SJIS-win', 'UTF-16BE');
foreach ($nonInvertible as $cp932 => $unicode)
testValidString($cp932, $unicode, 'SJIS-win', 'UTF-16BE', false);
echo "SJIS-win verification and conversion works on all valid characters\n";

testAllInvalidChars($invalidChars, $validChars, 'SJIS-win', 'UTF-16BE', "\x00%");
echo "SJIS-win verification and conversion works on all invalid characters\n";

convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'SJIS-win', '%');
echo "Unicode -> SJIS-win conversion works on all invalid codepoints\n";

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "%", "CP932", "UTF-8");
convertInvalidString("\xEA", "%", "CP932", "UTF-8");
convertInvalidString("\x81\x20", "%", "CP932", "UTF-8");
convertInvalidString("\xEA\xA9", "%", "CP932", "UTF-8");
convertInvalidString("\x80", "%", "SJIS-win", "UTF-8");
convertInvalidString("\xEA", "%", "SJIS-win", "UTF-8");
convertInvalidString("\x81\x20", "%", "SJIS-win", "UTF-8");
convertInvalidString("\xEA\xA9", "%", "SJIS-win", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
CP932 verification and conversion works on all valid characters
CP932 verification and conversion works on all invalid characters
Unicode -> CP932 conversion works on all invalid codepoints
SJIS-win verification and conversion works on all valid characters
SJIS-win verification and conversion works on all invalid characters
Unicode -> SJIS-win conversion works on all invalid codepoints
Done!
4 changes: 2 additions & 2 deletions ext/mbstring/tests/mb_internal_encoding_variation2.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,10 @@ string(9) "eucJP-win"
-- Iteration 20 --
string(9) "eucJP-win"
bool(true)
string(5) "CP932"
string(8) "SJIS-win"

-- Iteration 21 --
string(5) "CP932"
string(8) "SJIS-win"
bool(true)
string(11) "ISO-2022-JP"

Expand Down

0 comments on commit 5f8993b

Please sign in to comment.