Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSV - Guess Encoding, Handle Null-string Escape #1717

Merged
merged 5 commits into from
Dec 25, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions docs/topics/reading-and-writing-to-file.md
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,24 @@ $reader->setSheetIndex(0);

$spreadsheet = $reader->load("sample.csv");
```
You may also let PhpSpreadsheet attempt to guess the input encoding.
It first checks for a BOM (UTF-8, UTF-16BE, UTF-16LE, UTF-32BE,
or UTF-32LE);
if no BOM is present, it applies heuristic tests for those same encodings,
and falls back to an encoding of your choice (CP1252 by default) if all of those tests fail.

```php
// Create the CSV reader that will load the file.
$reader = new \PhpOffice\PhpSpreadsheet\Reader\Csv();
// Ask PhpSpreadsheet to guess the file's encoding; with no second argument
// the fallback is CP1252 when neither a BOM nor a heuristic matches.
$encoding = \PhpOffice\PhpSpreadsheet\Reader\Csv::guessEncoding('sample.csv');
// or, e.g. $encoding = \PhpOffice\PhpSpreadsheet\Reader\Csv::guessEncoding(
// 'sample.csv', 'ISO-8859-2');
// (the second argument replaces the CP1252 fallback)
// Pass the guessed encoding to the reader before loading.
$reader->setInputEncoding($encoding);
$reader->setDelimiter(';');
$reader->setEnclosure('');
$reader->setSheetIndex(0);

$spreadsheet = $reader->load('sample.csv');
```

#### Read a specific worksheet

Expand Down
76 changes: 68 additions & 8 deletions src/PhpSpreadsheet/Reader/Csv.php
Original file line number Diff line number Diff line change
Expand Up @@ -83,19 +83,18 @@ public function getInputEncoding()
return $this->inputEncoding;
}

const UTF8_BOM = "\xEF\xBB\xBF";
const UTF8_BOM_LEN = 3;

/**
oleibman marked this conversation as resolved.
Show resolved Hide resolved
* Move filepointer past any BOM marker.
*/
protected function skipBOM(): void
{
rewind($this->fileHandle);

switch ($this->inputEncoding) {
case 'UTF-8':
fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ?
fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0);

break;
if (fgets($this->fileHandle, self::UTF8_BOM_LEN + 1) !== self::UTF8_BOM) {
rewind($this->fileHandle);
}
}

Expand Down Expand Up @@ -213,7 +212,9 @@ function ($sum, $value) use ($median) {
private function getNextLine()
{
$line = '';
$enclosure = '(?<!' . preg_quote($this->escapeCharacter, '/') . ')' . preg_quote($this->enclosure, '/');
$enclosure = ($this->escapeCharacter === '' ? ''
: ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
. preg_quote($this->enclosure, '/');

do {
// Get the next line in the file
Expand Down Expand Up @@ -307,7 +308,7 @@ private function openFileOrMemory($pFilename): void
$this->fileHandle = fopen('php://memory', 'r+b');
$data = StringHelper::convertEncoding($entireFile, 'UTF-8', $this->inputEncoding);
fwrite($this->fileHandle, $data);
rewind($this->fileHandle);
$this->skipBOM();
}
}

Expand Down Expand Up @@ -531,4 +532,63 @@ public function canRead($pFilename)

return in_array($type, $supportedTypes, true);
}

/**
 * Heuristic probe for a single encoding in content that carries no BOM.
 *
 * Does nothing when an encoding has already been chosen. Otherwise the first
 * occurrence of $compare in $contents is located, and the encoding is accepted
 * only if that occurrence starts on a multiple of strlen($compare), i.e. on a
 * code-unit boundary for that byte pattern.
 *
 * @param string $encoding current guess; '' means undecided, updated in place on a match
 * @param string $contents raw bytes to inspect (taken by reference, not modified here)
 * @param string $compare byte pattern to search for
 * @param string $setEncoding encoding name to record when the pattern is aligned
 */
private static function guessEncodingTestNoBom(string &$encoding, string &$contents, string $compare, string $setEncoding): void
{
    if ($encoding !== '') {
        return;
    }

    $offset = strpos($contents, $compare);
    if ($offset === false) {
        return;
    }

    if ($offset % strlen($compare) === 0) {
        $encoding = $setEncoding;
    }
}

/**
 * Guess the encoding of a file that has no BOM by inspecting its contents.
 *
 * The whole file is read and searched for an aligned line feed (0x0A) encoded
 * as UTF-32BE, UTF-32LE, UTF-16BE or UTF-16LE, in that order. If none of those
 * match and the contents are valid UTF-8, 'UTF-8' is returned.
 *
 * @param string $filename path of the file to inspect
 *
 * @return string the guessed encoding, or '' when no guess could be made
 *                (including when the file cannot be read)
 */
private static function guessEncodingNoBom(string $filename): string
{
    $contents = file_get_contents($filename);
    if ($contents === false) {
        // Unreadable file: report "no guess" rather than letting false be
        // coerced to '' and mistaken for valid UTF-8 below.
        return '';
    }

    $encoding = '';
    // The UTF-32 patterns must be tried first: the UTF-16 line-feed patterns
    // are byte-wise contained in them and would otherwise match UTF-32 data.
    self::guessEncodingTestNoBom($encoding, $contents, "\x00\x00\x00\x0a", 'UTF-32BE');
    self::guessEncodingTestNoBom($encoding, $contents, "\x0a\x00\x00\x00", 'UTF-32LE');
    self::guessEncodingTestNoBom($encoding, $contents, "\x00\x0a", 'UTF-16BE');
    self::guessEncodingTestNoBom($encoding, $contents, "\x0a\x00", 'UTF-16LE');
    // An empty pattern with the /u modifier matches only if the subject is valid UTF-8.
    if ($encoding === '' && preg_match('//u', $contents) === 1) {
        $encoding = 'UTF-8';
    }

    return $encoding;
}

/**
 * Probe the leading bytes of a file for one specific BOM.
 *
 * Does nothing when an encoding has already been chosen. Otherwise records
 * $setEncoding if $first4 begins with the byte sequence $compare.
 *
 * @param string $encoding current guess; '' means undecided, updated in place on a match
 * @param string $first4 leading bytes of the file
 * @param string $compare BOM byte sequence to look for
 * @param string $setEncoding encoding name to record when the BOM matches
 */
private static function guessEncodingTestBom(string &$encoding, string $first4, string $compare, string $setEncoding): void
{
    if ($encoding !== '') {
        return;
    }

    $prefix = substr($first4, 0, strlen($compare));
    if ($prefix === $compare) {
        $encoding = $setEncoding;
    }
}

/**
 * Guess a file's encoding from a byte order mark in its first four bytes.
 *
 * @param string $filename path of the file to inspect
 *
 * @return string the encoding named by the BOM, or '' when the file cannot be
 *                read or does not start with a recognised BOM
 */
private static function guessEncodingBom(string $filename): string
{
    $detected = '';
    $header = file_get_contents($filename, false, null, 0, 4);
    if ($header === false) {
        return $detected;
    }

    // Probed in order; the first match wins. UTF-32LE must come before
    // UTF-16LE because its BOM begins with the UTF-16LE BOM bytes.
    $signatures = [
        [self::UTF8_BOM, 'UTF-8'],
        ["\xfe\xff", 'UTF-16BE'],
        ["\x00\x00\xfe\xff", 'UTF-32BE'],
        ["\xff\xfe\x00\x00", 'UTF-32LE'],
        ["\xff\xfe", 'UTF-16LE'],
    ];
    foreach ($signatures as [$bom, $name]) {
        self::guessEncodingTestBom($detected, $header, $bom, $name);
    }

    return $detected;
}

/**
 * Guess the character encoding of a CSV file.
 *
 * A BOM check is tried first; if it yields nothing, the content heuristics
 * are applied; if those also yield nothing, $dflt is returned.
 *
 * @param string $filename path of the file to inspect
 * @param string $dflt encoding to return when no guess can be made
 *
 * @return string the guessed encoding, or $dflt
 */
public static function guessEncoding(string $filename, string $dflt = 'CP1252'): string
{
    $guess = self::guessEncodingBom($filename);
    if ($guess !== '') {
        return $guess;
    }

    $guess = self::guessEncodingNoBom($filename);

    return ($guess !== '') ? $guess : $dflt;
}
}
62 changes: 62 additions & 0 deletions tests/PhpSpreadsheetTests/Reader/CsvTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -275,4 +275,66 @@ public function testReadNonexistentFileName(): void
$reader = new Csv();
$reader->load('tests/data/Reader/CSV/encoding.utf8.csvxxx');
}

/**
 * The delimiter reported after listing worksheet info must match the value
 * expected for the configured escape character.
 *
 * @dataProvider providerEscapes
 */
public function testInferSeparator(string $escape, string $delimiter): void
{
    $csvReader = new Csv();
    $csvReader->setEscapeCharacter($escape);
    // No delimiter is set explicitly; the reader is expected to work it out here.
    $csvReader->listWorksheetInfo('tests/data/Reader/CSV/escape.csv');

    self::assertEquals($delimiter, $csvReader->getDelimiter());
}

/**
 * Data sets for testInferSeparator: [escape character, expected delimiter].
 */
public function providerEscapes()
{
    // The empty escape character is only supplied on PHP 7.4 and later;
    // older versions get a NUL byte instead.
    $emptyOrNul = (version_compare(PHP_VERSION, '7.4') >= 0) ? '' : "\x0";

    return [
        ['\\', ';'],
        ["\x0", ','],
        [$emptyOrNul, ','],
    ];
}

/**
 * Loading a file with its guessed encoding must yield the expected
 * accented text in cells A1 and C2.
 *
 * @dataProvider providerGuessEncoding
 */
public function testGuessEncoding(string $filename): void
{
    $guessed = Csv::guessEncoding($filename);

    $csvReader = new Csv();
    $csvReader->setInputEncoding($guessed);
    $worksheet = $csvReader->load($filename)->getActiveSheet();

    self::assertEquals('première', $worksheet->getCell('A1')->getValue());
    self::assertEquals('sixième', $worksheet->getCell('C2')->getValue());
}

/**
 * Data sets for testGuessEncoding: one sample file per encoding variant,
 * with and without a BOM, plus a Windows-1252 file.
 */
public function providerGuessEncoding()
{
    $variants = [
        'utf8',
        'utf8bom',
        'utf16be',
        'utf16bebom',
        'utf16le',
        'utf16lebom',
        'utf32be',
        'utf32bebom',
        'utf32le',
        'utf32lebom',
        'win1252',
    ];

    $dataSets = [];
    foreach ($variants as $variant) {
        $dataSets[] = ['tests/data/Reader/CSV/premiere.' . $variant . '.csv'];
    }

    return $dataSets;
}

/**
 * When ISO-8859-2 is supplied as the fallback encoding, the Windows-1252
 * sample is expected to load with 'č' where the other samples have 'è'.
 */
public function testGuessEncodingDefltIso2(): void
{
    $csvPath = 'tests/data/Reader/CSV/premiere.win1252.csv';
    $fallbackGuess = Csv::guessEncoding($csvPath, 'ISO-8859-2');

    $csvReader = new Csv();
    $csvReader->setInputEncoding($fallbackGuess);
    $worksheet = $csvReader->load($csvPath)->getActiveSheet();

    self::assertEquals('premičre', $worksheet->getCell('A1')->getValue());
    self::assertEquals('sixičme', $worksheet->getCell('C2')->getValue());
}
}
4 changes: 4 additions & 0 deletions tests/data/Reader/CSV/escape.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\"
a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\",d
a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\"
a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\"
Binary file added tests/data/Reader/CSV/premiere.utf16be.csv
Binary file not shown.
Binary file added tests/data/Reader/CSV/premiere.utf16bebom.csv
Binary file not shown.
Binary file added tests/data/Reader/CSV/premiere.utf16le.csv
Binary file not shown.
Binary file added tests/data/Reader/CSV/premiere.utf16lebom.csv
Binary file not shown.
Binary file added tests/data/Reader/CSV/premiere.utf32be.csv
Binary file not shown.
Binary file added tests/data/Reader/CSV/premiere.utf32bebom.csv
Binary file not shown.
Binary file added tests/data/Reader/CSV/premiere.utf32le.csv
Binary file not shown.
Binary file added tests/data/Reader/CSV/premiere.utf32lebom.csv
Binary file not shown.
2 changes: 2 additions & 0 deletions tests/data/Reader/CSV/premiere.utf8.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
première,second,troisième
Quatrième,cinquième,sixième
2 changes: 2 additions & 0 deletions tests/data/Reader/CSV/premiere.utf8bom.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
première,second,troisième
Quatrième,cinquième,sixième
2 changes: 2 additions & 0 deletions tests/data/Reader/CSV/premiere.win1252.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
premi�re,second,troisi�me
Quatri�me,cinqui�me,sixi�me