Skip to content

Commit

Permalink
Add EncodingService test cases
Browse files Browse the repository at this point in the history
Signed-off-by: Raul <[email protected]>
  • Loading branch information
Raudius committed May 16, 2022
1 parent 303d867 commit 749cc0a
Show file tree
Hide file tree
Showing 8 changed files with 63 additions and 89 deletions.
46 changes: 19 additions & 27 deletions lib/Service/EncodingService.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,15 @@
namespace OCA\Text\Service;

class EncodingService {
/**
 * Names of frequently encountered text encodings.
 * NOTE(review): presumably the default candidate list that getEncodings() hands to
 * mb_detect_encoding(); that method is outside this view, so confirm, including
 * whether the order of the entries is significant.
 */
public const COMMON_ENCODINGS = ['UTF-8', 'GB2312', 'GBK', 'BIG-5', 'SJIS-win', 'EUC-JP', 'Windows-1252', 'ISO-8859-15', 'ISO-8859-1', 'ASCII'];

/**
 * Byte-order marks keyed by the encoding they identify.
 * detectUtfBom() walks this map in declaration order and returns the first entry whose
 * bytes prefix the input, so the 4-byte UTF-32 marks must stay ahead of the 2-byte
 * UTF-16 marks: "\xff\xfe\x00\x00" (UTF-32LE) itself begins with "\xff\xfe" (UTF-16LE).
 */
public const UTF_BOMs = [
'UTF-32BE' => "\x00\x00\xfe\xff",
'UTF-32LE' => "\xff\xfe\x00\x00",
'UTF-16BE' => "\xfe\xff",
'UTF-16LE' => "\xff\xfe",
'UTF-8' => "\xef\xbb\xbf"
];
/**
 * Encoding names treated as "common" by this service.
 * NOTE(review): how this list is consumed is not visible here (getEncodings() is cut
 * off); it looks like the fallback candidate set for detection, so verify before
 * reordering or extending it.
 */
public const COMMON_ENCODINGS = [ 'UTF-8', 'GB2312', 'GBK', 'BIG-5', 'SJIS-win', 'EUC-JP', 'Windows-1252', 'ISO-8859-15', 'ISO-8859-1', 'ASCII'];

/**
 * Encoding name => raw byte-order mark.
 * Order is load-bearing: detectUtfBom() returns the first entry that prefixes the input,
 * and the UTF-32LE mark "\xff\xfe\x00\x00" starts with the UTF-16LE mark "\xff\xfe",
 * so the longer UTF-32 marks have to be tested before the UTF-16 ones.
 */
public const UTF_BOMs = [
'UTF-32BE' => "\x00\x00\xfe\xff",
'UTF-32LE' => "\xff\xfe\x00\x00",
'UTF-16BE' => "\xfe\xff",
'UTF-16LE' => "\xff\xfe",
'UTF-8' => "\xef\xbb\xbf"
];

public function encodeToUtf8(string $string): ?string {
$encoding = $this->detectEncoding($string);
Expand All @@ -47,32 +46,25 @@ public function encodeToUtf8(string $string): ?string {
}

/**
 * Guess the character encoding of a raw byte string.
 *
 * A leading UTF byte-order mark is checked first and, when present, decides the
 * result on its own. Otherwise the candidates from getEncodings() are passed to
 * mb_detect_encoding() in strict mode.
 *
 * @param string $string Raw bytes to inspect
 * @return string|null Name of the detected encoding, or null when nothing matched
 */
public function detectEncoding(string $string): ?string {
    // The BOM lookup only needs to run once; its result cannot change between calls.
    $bomEncoding = $this->detectUtfBom($string);
    if ($bomEncoding !== null) {
        return $bomEncoding;
    }

    // mb_detect_encoding() reports "no match" as false; this method's contract is null.
    return mb_detect_encoding($string, $this->getEncodings(), true) ?: null;
}

$encodings = $this->getEncodings();
foreach ($encodings as $encoding) {
if (mb_check_encoding($string, $encoding)) {
/**
 * Identify a UTF encoding from the byte-order mark at the start of the input.
 *
 * Entries of self::UTF_BOMs are tried in declaration order and the first one whose
 * bytes prefix the input wins.
 *
 * @param string $string Raw bytes to inspect
 * @return string|null Matching encoding name, or null when no known mark is present
 */
private function detectUtfBom(string $string): ?string {
    foreach (self::UTF_BOMs as $name => $marker) {
        // Binary-safe prefix comparison limited to the marker's own length.
        if (strncmp($string, $marker, strlen($marker)) === 0) {
            return $name;
        }
    }

    return null;
}

/**
 * Look for a known UTF byte-order mark at the very beginning of the input.
 *
 * Candidates come from self::UTF_BOMs and are checked in the order they are declared.
 *
 * @param string $string Raw bytes to inspect
 * @return string|null Encoding name of the first mark that matches, otherwise null
 */
public function detectUtfBom(string $string): ?string {
    foreach (self::UTF_BOMs as $candidate => $signature) {
        $prefix = substr($string, 0, strlen($signature));
        if ($prefix !== $signature) {
            continue;
        }

        return $candidate;
    }

    return null;
}

/**
* @return string[]
*/
Expand Down
31 changes: 1 addition & 30 deletions tests/data/big5.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/data/cp936.txt

Large diffs are not rendered by default.

22 changes: 0 additions & 22 deletions tests/data/gbk.txt

This file was deleted.

File renamed without changes.
12 changes: 12 additions & 0 deletions tests/data/iso-8859-5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Статья 1
Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны поступать в отношении друг друга в духе братства.

Статья 2
Каждый человек должен обладать всеми правами и всеми свободами, провозглашенными настоящей Декларацией, без какого бы то ни было различия, как-то в отношении расы, цвета кожи, пола, языка, религии, политических или иных убеждений, национального или социального происхождения, имущественного, сословного или иного положения.

Кроме того, не должно проводиться никакого различия на основе политического, правового или международного статуса страны или территории, к которой человек принадлежит, независимо от того, является ли эта территория независимой, подопечной, несамоуправляющейся или как-либо иначе ограниченной в своем суверенитете.

Статья 3
Каждый человек имеет право на жизнь, на свободу и на личную неприкосновенность.


Binary file added tests/data/utf-16.txt
Binary file not shown.
40 changes: 30 additions & 10 deletions tests/unit/Service/EncodingServiceTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,42 +15,62 @@ protected function setUp(): void {
/**
 * Attempt to decode the file using the default decoding order.
 * For files with encodings not included in the COMMON_ENCODINGS array encoding to UTF-8 may fail.
 * We then do the conversion again after setting the mb_detect_order value: all conversions should succeed.
 * @dataProvider dataFileEncodings
 */
public function testDefault(string $file, string $encoding) {
    $contents = file_get_contents($file);
    $utf8_string = $this->encodingService->encodeToUtf8($contents);

    // If encoding is not part of the default encodings we can expect it to fail.
    // It might still succeed because encoding detection is not precise, in which
    // case the result must still be valid UTF-8.
    if ($utf8_string !== null || $this->isSupportedEncoding($encoding)) {
        $this->assertNotNull($utf8_string);
        $this->assertNotFalse(mb_detect_encoding($utf8_string, 'UTF-8', true));
    }

    $original_order = mb_detect_order();
    try {
        $this->assertNotFalse(mb_detect_order($encoding));

        $utf8_string = $this->encodingService->encodeToUtf8($contents);
        $this->assertNotNull($utf8_string);
        $this->assertNotFalse(mb_detect_encoding($utf8_string, 'UTF-8', true));
    } finally {
        // mb_detect_order is process-wide state: restore it even when an assertion
        // above throws, so later tests do not inherit this test's order.
        mb_detect_order($original_order);
    }
}

/**
 * If the encoding is in the list of common encodings, or is one identified by a UTF BOM, we should be able
 * to detect an encoding (it might not be the correct encoding due to detection inaccuracies).
 * After the encoding has been added to mb_detect_order, detection is expected to return exactly that encoding.
 * @dataProvider dataFileEncodings
 */
public function testDetection(string $file, string $encoding) {
    $contents = file_get_contents($file);

    $detected_encoding = $this->encodingService->detectEncoding($contents);
    if ($this->isSupportedEncoding($encoding)) {
        $this->assertNotNull($detected_encoding);
    }

    $original_order = mb_detect_order();
    try {
        $this->assertNotFalse(mb_detect_order($encoding));

        $detected_encoding = $this->encodingService->detectEncoding($contents);
        $this->assertSame($encoding, $detected_encoding);
    } finally {
        // mb_detect_order is process-wide state: restore it even when an assertion
        // above throws, so later tests do not inherit this test's order.
        mb_detect_order($original_order);
    }
}


/**
 * Fixture files paired with the encoding each one is expected to be detected as.
 *
 * Datasets are keyed by encoding name so a failing case is identifiable in the
 * PHPUnit output. Paths are relative to the working directory of the test run.
 *
 * @return array<string, array{0: string, 1: string}>
 */
public function dataFileEncodings(): array {
    return [
        'ISO-8859-1' => ['./tests/data/iso-8859.txt', 'ISO-8859-1'],
        'ISO-8859-15' => ['./tests/data/iso-8859-15.txt', 'ISO-8859-15'],
        'BIG-5' => ['./tests/data/big5.txt', 'BIG-5'],
        'CP936' => ['./tests/data/cp936.txt', 'CP936'],
        'UTF-16LE' => ['./tests/data/utf-16.txt', 'UTF-16LE'],
        'ISO-8859-5' => ['./tests/data/iso-8859-5.txt', 'ISO-8859-5'],
    ];
}

/**
 * Tell whether an encoding is one the service knows about by name.
 *
 * @param string $encoding Encoding name to look up
 * @return bool True when the name is listed in EncodingService::COMMON_ENCODINGS
 *              or is a key of EncodingService::UTF_BOMs
 */
private function isSupportedEncoding(string $encoding): bool {
    if (in_array($encoding, EncodingService::COMMON_ENCODINGS, true)) {
        return true;
    }

    return isset(EncodingService::UTF_BOMs[$encoding]);
}
}

0 comments on commit 749cc0a

Please sign in to comment.