diff --git a/.codeclimate.yml b/.codeclimate.yml index 641e74d..99a2ccf 100644 --- a/.codeclimate.yml +++ b/.codeclimate.yml @@ -2,6 +2,9 @@ version: "2" plugins: phpmd: enabled: true + checks: + CleanCode/StaticAccess: + enabled: false phpcodesniffer: enabled: true sonar-php: diff --git a/.travis.yml b/.travis.yml index 16ba0c7..74ff91c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,10 @@ language: php php: - 5.6 - 7.0 - - hhvm + - 7.1 + - 7.2 + - 7.3 + - 7.4 before_script: - composer install diff --git a/composer.json b/composer.json index f18e95e..812d5d3 100644 --- a/composer.json +++ b/composer.json @@ -29,6 +29,7 @@ }, "require-dev": { "phpunit/phpunit": "^5.7", - "squizlabs/php_codesniffer": "^3.2" + "squizlabs/php_codesniffer": "^3.2", + "ext-json": "*" } } diff --git a/src/CsvReader.php b/src/CsvReader.php index a3dc7aa..ee1fa9a 100644 --- a/src/CsvReader.php +++ b/src/CsvReader.php @@ -111,29 +111,10 @@ protected function openCsvFile($fileName) */ protected function detectLineBreak() { - rewind($this->getFilePointer()); - $sample = fread($this->getFilePointer(), 10000); - - $possibleLineBreaks = [ - "\r\n", // win - "\r", // mac - "\n", // unix - ]; - - $lineBreaksPositions = []; - foreach ($possibleLineBreaks as $lineBreak) { - $position = strpos($sample, $lineBreak); - if ($position === false) { - continue; - } - $lineBreaksPositions[$lineBreak] = $position; - } - - - asort($lineBreaksPositions); - reset($lineBreaksPositions); + @rewind($this->getFilePointer()); + $sample = @fread($this->getFilePointer(), 10000); - return empty($lineBreaksPositions) ? "\n" : key($lineBreaksPositions); + return LineBreaksHelper::detectLineBreaks($sample, $this->getEnclosure(), $this->getEscapedBy()); } /** @@ -148,7 +129,7 @@ protected function readLine() // allow empty enclosure hack $enclosure = !$this->getEnclosure() ? chr(0) : $this->getEnclosure(); $escapedBy = !$this->getEscapedBy() ? chr(0) : $this->getEscapedBy(); - return fgetcsv($this->getFilePointer(), null, $this->getDelimiter(), $enclosure, $escapedBy); + return @fgetcsv($this->getFilePointer(), null, $this->getDelimiter(), $enclosure, $escapedBy); } /** diff --git a/src/CsvWriter.php b/src/CsvWriter.php index 45c683d..454a571 100644 --- a/src/CsvWriter.php +++ b/src/CsvWriter.php @@ -90,7 +90,7 @@ public function writeRow(array $row) "Cannot write to CSV file " . $this->fileName . ($ret === false && error_get_last() ? 'Error: ' . error_get_last()['message'] : '') . ' Return: ' . json_encode($ret) . - ' To write: ' . strlen($str) . ' Written: ' . $ret, + ' To write: ' . strlen($str) . ' Written: ' . (int) $ret, Exception::WRITE_ERROR ); } diff --git a/src/LineBreaksHelper.php b/src/LineBreaksHelper.php new file mode 100644 index 0000000..a6e18d6 --- /dev/null +++ b/src/LineBreaksHelper.php @@ -0,0 +1,99 @@ +(?>"")|[^"])*"~ + * enclosure: |"|, escapedBy: |\|, regexp: ~"(?>(?>\\"|\\\\)|[^"])*"~ + */ + // @formatter:off + $regexp = + // regexp start + self::REGEXP_DELIMITER . + // enclosure start + preg_quote($enclosure, self::REGEXP_DELIMITER) . + /* + * Once-only group => if there is a match, do not try other alternatives + * See: https://www.php.net/manual/en/regexp.reference.onlyonce.php + * Without once-only group will be |"abc\"| false positive, + * because |\| is matched by group and |"| at the end of regexp. + */ + // repeated once-only group start + '(?>' . + // once-only group start + '(?>' . + // escaped enclosure + preg_quote($escapedEnclosure, self::REGEXP_DELIMITER) . + // OR escaped escape char + ($escapedEscape ? '|' . preg_quote($escapedEscape, self::REGEXP_DELIMITER) : '') . + // group end + ')' . + // OR not enclosure + '|[^' . preg_quote($enclosure, self::REGEXP_DELIMITER) . ']' . + // group end + ')*' . + // enclosure end + preg_quote($enclosure, self::REGEXP_DELIMITER) . + // regexp end + self::REGEXP_DELIMITER; + // @formatter:on + + return preg_replace($regexp, $doubleEnclosure, $sample); + } +} diff --git a/tests/CsvReadTest.php b/tests/CsvReadTest.php index c6bff66..dec42c5 100644 --- a/tests/CsvReadTest.php +++ b/tests/CsvReadTest.php @@ -137,6 +137,24 @@ public function testParseEscapedBy() self::assertEquals($expected, iterator_to_array($csvFile)); } + public function testParseMacLineEndsInField() + { + $csvFile = new CsvReader(__DIR__ . '/data/test-input.lineBreaks.csv', ",", '"', '\\'); + + $expected = [ + [ + 'test', + "some text\rwith\r\\r line breaks\rinside\rbut\rrows\rare\rusing \\n \\\"line\\\" break\r", + ], + [ + 'name', 'data' + ] + ]; + + self::assertEquals($expected, iterator_to_array($csvFile)); + } + + public function testEmptyHeader() { $csvFile = new CsvReader(__DIR__ . '/data/test-input.empty.csv', ',', '"'); @@ -364,14 +382,6 @@ public function invalidSkipLinesProvider() ]; } - public function testInvalidNewLines() - { - self::expectException(Exception::class); - self::expectExceptionMessage('Invalid line break. Please use unix \n or win \r\n line breaks.'); - new CsvReader(__DIR__ . DIRECTORY_SEPARATOR . 'data/binary'); - } - - public function testValidWithoutRewind() { $fileName = __DIR__ . '/data/simple.csv'; @@ -479,4 +489,54 @@ public function testInvalidFile() self::expectExceptionMessage('Invalid file: array'); new CsvReader(['bad']); } + + /** + * @dataProvider getPerformanceTestInputs + * @param string $fileContent + * @param int $expectedRows + * @param float $maxDuration + */ + public function testPerformance($fileContent, $expectedRows, $maxDuration) + { + self::markTestSkipped( + 'Run this test only manually. Because the duration is very different in local CI environment.' + ); + + try { + $fileName = sys_get_temp_dir() . DIRECTORY_SEPARATOR . uniqid('perf-test'); + file_put_contents($fileName, $fileContent); + $startTime = microtime(true); + $reader = new CsvReader($fileName); + $rows = 0; + foreach ($reader as $line){ + $rows++; + } + $duration = microtime(true) - $startTime; + self::assertSame($expectedRows, $rows); + self::assertLessThanOrEqual($maxDuration, $duration); + } finally { + @unlink($fileName); + } + } + + public function getPerformanceTestInputs() + { + yield '1M-simple-rows' => [ + str_repeat("abc,def,\"xyz\"\n", 1000000), + 1000000, + 8.0 + ]; + + yield '1M-empty-lines-n' => [ + str_repeat("\n", 1000000), + 1000000, + 8.0 + ]; + + yield '1M-no-separators' => [ + str_repeat(md5('abc') . "\n", 1000000), + 1000000, + 8.0 + ]; + } } diff --git a/tests/CsvWriteTest.php b/tests/CsvWriteTest.php index a3323a4..24f77f8 100644 --- a/tests/CsvWriteTest.php +++ b/tests/CsvWriteTest.php @@ -6,6 +6,8 @@ use Keboola\Csv\CsvWriter; use Keboola\Csv\Exception; use PHPUnit\Framework\TestCase; +use PHPUnit_Framework_Constraint_Or; +use PHPUnit_Framework_Constraint_StringContains; class CsvWriteTest extends TestCase { @@ -87,9 +89,19 @@ public function testWriteInvalidObject() ]; $csvFile->writeRow($rows[0]); - self::expectException(Exception::class); - self::expectExceptionMessage("Cannot write data into column: stdClass::"); - $csvFile->writeRow($rows[1]); + + try { + $csvFile->writeRow($rows[1]); + self::fail('Expected exception was not thrown.'); + } catch (Exception $e) { + // Exception message differs between PHP versions. + $or = new PHPUnit_Framework_Constraint_Or(); + $or->setConstraints([ + new PHPUnit_Framework_Constraint_StringContains("Cannot write data into column: stdClass::"), + new PHPUnit_Framework_Constraint_StringContains("Cannot write data into column: (object) array(\n)") + ]); + self::assertThat($e->getMessage(), $or); + } } public function testWriteValidObject() @@ -182,9 +194,24 @@ public function testInvalidPointer() $pointer = fopen($fileName, 'r'); $csvFile = new CsvWriter($pointer); $rows = [['col1', 'col2']]; - self::expectException(Exception::class); - self::expectExceptionMessage('Cannot write to CSV file Return: 0 To write: 14 Written: 0'); - $csvFile->writeRow($rows[0]); + + try { + $csvFile->writeRow($rows[0]); + self::fail('Expected exception was not thrown.'); + } catch (Exception $e) { + // Exception message differs between PHP versions. + $or = new PHPUnit_Framework_Constraint_Or(); + $or->setConstraints([ + new PHPUnit_Framework_Constraint_StringContains( + 'Cannot write to CSV file Return: 0 To write: 14 Written: 0' + ), + new PHPUnit_Framework_Constraint_StringContains( + 'Cannot write to CSV file Error: fwrite(): ' . + 'write of 14 bytes failed with errno=9 Bad file descriptor Return: false To write: 14 Written: 0' + ) + ]); + self::assertThat($e->getMessage(), $or); + } } public function testInvalidPointer2() diff --git a/tests/LineBreaksHelperTest.php b/tests/LineBreaksHelperTest.php new file mode 100644 index 0000000..5f22caa --- /dev/null +++ b/tests/LineBreaksHelperTest.php @@ -0,0 +1,140 @@ + "\n", + 'r' => "\r", + "r-n" => "\r\n", + ]; + + yield "empty" => [ + '"', + '', + '', + '', + "\n" + ]; + + yield "empty-escaped-by" => [ + '"', + '\\', + '', + '', + "\n" + ]; + + foreach ($lineEnds as $prefix => $lineEnd) { + yield "$prefix-simple" => [ + '"', + '', + implode($lineEnd, [ + 'col1,col2', + 'line without enclosure,second column', + '"enclosure "" in column","hello \"', + '"line with enclosure","second column"', + '"column with enclosure "", and comma inside text","second column enclosure in text """', + ]), + implode($lineEnd, [ + 'col1,col2', + 'line without enclosure,second column', + '"",""', + '"",""', + '"",""', + ]), + $lineEnd + ]; + + yield "$prefix-simple-escaped-by" => [ + '"', + '\\', + implode($lineEnd, [ + 'col1,col2', + 'line without enclosure,second column', + '"enclosure \" in column","hello \\\\"', + '"line with enclosure","second column"', + '"column with enclosure \", and comma inside text","second column enclosure in text \""' + ]), + implode($lineEnd, [ + 'col1,col2', + 'line without enclosure,second column', + '"",""', + '"",""', + '"",""', + ]), + $lineEnd + ]; + + yield "$prefix-multiline-n" => [ + '"', + '', + implode($lineEnd, [ + "\"xyz\",\"\n\n\nabc\n\n\n\"\"\n\n\nxyz\n\n\n\"", + '"abc","def"', + ]), + implode($lineEnd, [ + '"",""', + '"",""', + ]), + $lineEnd + ]; + + yield "$prefix-multiline-r" => [ + '"', + '', + implode($lineEnd, [ + "\"xyz\",\"\r\r\rabc\r\r\r\"\"\r\r\rxyz\r\r\r\"", + '"abc","def"', + ]), + implode($lineEnd, [ + '"",""', + '"",""', + ]), + $lineEnd + ]; + + yield "$prefix-multiline-r-n" => [ + '"', + '', + implode($lineEnd, [ + "\"xyz\",\"\r\n\r\n\r\nabc\r\n\r\n\r\n\"\"\r\n\r\n\r\nxyz\r\n\r\n\r\n\"", + '"abc","def"', + ]), + implode($lineEnd, [ + '"",""', + '"",""', + ]), + $lineEnd + ]; + } + } +} diff --git a/tests/data/binary b/tests/data/binary deleted file mode 100644 index 421f1a7..0000000 Binary files a/tests/data/binary and /dev/null differ diff --git a/tests/data/test-input.lineBreaks.csv b/tests/data/test-input.lineBreaks.csv new file mode 100644 index 0000000..ed4ce9d --- /dev/null +++ b/tests/data/test-input.lineBreaks.csv @@ -0,0 +1,2 @@ +"test","some text with \r line breaks inside but rows are using \n \"line\" break " +"name","data"