Skip to content

Commit

Permalink
Merge pull request #41 from keboola/webrouse-COM-350-fix-line-breaks-…
Browse files Browse the repository at this point in the history
…detect

Fix line breaks detection in edge-case
  • Loading branch information
michaljurecko authored Jul 28, 2020
2 parents b77f1c7 + 2811a35 commit a439656
Show file tree
Hide file tree
Showing 11 changed files with 356 additions and 40 deletions.
3 changes: 3 additions & 0 deletions .codeclimate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ version: "2"
plugins:
phpmd:
enabled: true
checks:
CleanCode/StaticAccess:
enabled: false
phpcodesniffer:
enabled: true
sonar-php:
Expand Down
5 changes: 4 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ language: php
php:
- 5.6
- 7.0
- hhvm
- 7.1
- 7.2
- 7.3
- 7.4

before_script:
- composer install
Expand Down
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
},
"require-dev": {
"phpunit/phpunit": "^5.7",
"squizlabs/php_codesniffer": "^3.2"
"squizlabs/php_codesniffer": "^3.2",
"ext-json": "*"
}
}
27 changes: 4 additions & 23 deletions src/CsvReader.php
Original file line number Diff line number Diff line change
Expand Up @@ -111,29 +111,10 @@ protected function openCsvFile($fileName)
*/
protected function detectLineBreak()
{
rewind($this->getFilePointer());
$sample = fread($this->getFilePointer(), 10000);

$possibleLineBreaks = [
"\r\n", // win
"\r", // mac
"\n", // unix
];

$lineBreaksPositions = [];
foreach ($possibleLineBreaks as $lineBreak) {
$position = strpos($sample, $lineBreak);
if ($position === false) {
continue;
}
$lineBreaksPositions[$lineBreak] = $position;
}


asort($lineBreaksPositions);
reset($lineBreaksPositions);
@rewind($this->getFilePointer());
$sample = @fread($this->getFilePointer(), 10000);

return empty($lineBreaksPositions) ? "\n" : key($lineBreaksPositions);
return LineBreaksHelper::detectLineBreaks($sample, $this->getEnclosure(), $this->getEscapedBy());
}

/**
Expand All @@ -148,7 +129,7 @@ protected function readLine()
// allow empty enclosure hack
$enclosure = !$this->getEnclosure() ? chr(0) : $this->getEnclosure();
$escapedBy = !$this->getEscapedBy() ? chr(0) : $this->getEscapedBy();
return fgetcsv($this->getFilePointer(), null, $this->getDelimiter(), $enclosure, $escapedBy);
return @fgetcsv($this->getFilePointer(), null, $this->getDelimiter(), $enclosure, $escapedBy);
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/CsvWriter.php
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public function writeRow(array $row)
"Cannot write to CSV file " . $this->fileName .
($ret === false && error_get_last() ? 'Error: ' . error_get_last()['message'] : '') .
' Return: ' . json_encode($ret) .
' To write: ' . strlen($str) . ' Written: ' . $ret,
' To write: ' . strlen($str) . ' Written: ' . (int) $ret,
Exception::WRITE_ERROR
);
}
Expand Down
99 changes: 99 additions & 0 deletions src/LineBreaksHelper.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
<?php

namespace Keboola\Csv;

class LineBreaksHelper
{
const REGEXP_DELIMITER = '~';

/**
* Detect line-breaks style in CSV file
* @param string $sample
* @param string $enclosure
* @param string $escapedBy
* @return string
*/
public static function detectLineBreaks($sample, $enclosure, $escapedBy)
{
$cleared = self::clearCsvValues($sample, $enclosure, $escapedBy);

$possibleLineBreaks = [
"\r\n", // win
"\r", // mac
"\n", // unix
];

$lineBreaksPositions = [];
foreach ($possibleLineBreaks as $lineBreak) {
$position = strpos($cleared, $lineBreak);
if ($position === false) {
continue;
}
$lineBreaksPositions[$lineBreak] = $position;
}


asort($lineBreaksPositions);
reset($lineBreaksPositions);

return empty($lineBreaksPositions) ? "\n" : key($lineBreaksPositions);
}

/**
* Clear enclosured values in CSV eg. "abc" to "",
* because these values can contain line breaks eg, "abc\n\r\n\r\r\r\r",
* and this makes it difficult to detect line breaks style in CSV,
* if are another line breaks present in first line.
* @internal Should be used only in detectLineBreaks, public for easier testing.
* @param string $sample
* @param string $enclosure
* @param string $escapedBy eg. empty string or \
* @return string
*/
public static function clearCsvValues($sample, $enclosure, $escapedBy)
{
// Usually an enclosure character is escaped by doubling it, however, the escapeBy can be used
$doubleEnclosure = $enclosure . $enclosure;
$escapedEnclosure = empty($escapedBy) ? $doubleEnclosure : $escapedBy . $enclosure;
$escapedEscape = empty($escapedBy) ? null : $escapedBy . $escapedBy;

/*
* Regexp examples:
* enclosure: |"|, escapedBy: none, regexp: ~"(?>(?>"")|[^"])*"~
* enclosure: |"|, escapedBy: |\|, regexp: ~"(?>(?>\\"|\\\\)|[^"])*"~
*/
// @formatter:off
$regexp =
// regexp start
self::REGEXP_DELIMITER .
// enclosure start
preg_quote($enclosure, self::REGEXP_DELIMITER) .
/*
* Once-only group => if there is a match, do not try other alternatives
* See: https://www.php.net/manual/en/regexp.reference.onlyonce.php
* Without once-only group will be |"abc\"| false positive,
* because |\| is matched by group and |"| at the end of regexp.
*/
// repeated once-only group start
'(?>' .
// once-only group start
'(?>' .
// escaped enclosure
preg_quote($escapedEnclosure, self::REGEXP_DELIMITER) .
// OR escaped escape char
($escapedEscape ? '|' . preg_quote($escapedEscape, self::REGEXP_DELIMITER) : '') .
// group end
')' .
// OR not enclosure
'|[^' . preg_quote($enclosure, self::REGEXP_DELIMITER) . ']' .
// group end
')*' .
// enclosure end
preg_quote($enclosure, self::REGEXP_DELIMITER) .
// regexp end
self::REGEXP_DELIMITER;
// @formatter:on

return preg_replace($regexp, $doubleEnclosure, $sample);
}
}
76 changes: 68 additions & 8 deletions tests/CsvReadTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,24 @@ public function testParseEscapedBy()
self::assertEquals($expected, iterator_to_array($csvFile));
}

public function testParseMacLineEndsInField()
{
$csvFile = new CsvReader(__DIR__ . '/data/test-input.lineBreaks.csv', ",", '"', '\\');

$expected = [
[
'test',
"some text\rwith\r\\r line breaks\rinside\rbut\rrows\rare\rusing \\n \\\"line\\\" break\r",
],
[
'name', 'data'
]
];

self::assertEquals($expected, iterator_to_array($csvFile));
}


public function testEmptyHeader()
{
$csvFile = new CsvReader(__DIR__ . '/data/test-input.empty.csv', ',', '"');
Expand Down Expand Up @@ -364,14 +382,6 @@ public function invalidSkipLinesProvider()
];
}

public function testInvalidNewLines()
{
self::expectException(Exception::class);
self::expectExceptionMessage('Invalid line break. Please use unix \n or win \r\n line breaks.');
new CsvReader(__DIR__ . DIRECTORY_SEPARATOR . 'data/binary');
}


public function testValidWithoutRewind()
{
$fileName = __DIR__ . '/data/simple.csv';
Expand Down Expand Up @@ -479,4 +489,54 @@ public function testInvalidFile()
self::expectExceptionMessage('Invalid file: array');
new CsvReader(['bad']);
}

/**
* @dataProvider getPerformanceTestInputs
* @param string $fileContent
* @param int $expectedRows
* @param float $maxDuration
*/
public function testPerformance($fileContent, $expectedRows, $maxDuration)
{
self::markTestSkipped(
'Run this test only manually. Because the duration is very different in local CI environment.'
);

try {
$fileName = sys_get_temp_dir() . DIRECTORY_SEPARATOR . uniqid('perf-test');
file_put_contents($fileName, $fileContent);
$startTime = microtime(true);
$reader = new CsvReader($fileName);
$rows = 0;
foreach ($reader as $line){
$rows++;
}
$duration = microtime(true) - $startTime;
self::assertSame($expectedRows, $rows);
self::assertLessThanOrEqual($maxDuration, $duration);
} finally {
@unlink($fileName);
}
}

public function getPerformanceTestInputs()
{
yield '1M-simple-rows' => [
str_repeat("abc,def,\"xyz\"\n", 1000000),
1000000,
8.0
];

yield '1M-empty-lines-n' => [
str_repeat("\n", 1000000),
1000000,
8.0
];

yield '1M-no-separators' => [
str_repeat(md5('abc') . "\n", 1000000),
1000000,
8.0
];
}
}
39 changes: 33 additions & 6 deletions tests/CsvWriteTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
use Keboola\Csv\CsvWriter;
use Keboola\Csv\Exception;
use PHPUnit\Framework\TestCase;
use PHPUnit_Framework_Constraint_Or;
use PHPUnit_Framework_Constraint_StringContains;

class CsvWriteTest extends TestCase
{
Expand Down Expand Up @@ -87,9 +89,19 @@ public function testWriteInvalidObject()
];

$csvFile->writeRow($rows[0]);
self::expectException(Exception::class);
self::expectExceptionMessage("Cannot write data into column: stdClass::");
$csvFile->writeRow($rows[1]);

try {
$csvFile->writeRow($rows[1]);
self::fail('Expected exception was not thrown.');
} catch (Exception $e) {
// Exception message differs between PHP versions.
$or = new PHPUnit_Framework_Constraint_Or();
$or->setConstraints([
new PHPUnit_Framework_Constraint_StringContains("Cannot write data into column: stdClass::"),
new PHPUnit_Framework_Constraint_StringContains("Cannot write data into column: (object) array(\n)")
]);
self::assertThat($e->getMessage(), $or);
}
}

public function testWriteValidObject()
Expand Down Expand Up @@ -182,9 +194,24 @@ public function testInvalidPointer()
$pointer = fopen($fileName, 'r');
$csvFile = new CsvWriter($pointer);
$rows = [['col1', 'col2']];
self::expectException(Exception::class);
self::expectExceptionMessage('Cannot write to CSV file Return: 0 To write: 14 Written: 0');
$csvFile->writeRow($rows[0]);

try {
$csvFile->writeRow($rows[0]);
self::fail('Expected exception was not thrown.');
} catch (Exception $e) {
// Exception message differs between PHP versions.
$or = new PHPUnit_Framework_Constraint_Or();
$or->setConstraints([
new PHPUnit_Framework_Constraint_StringContains(
'Cannot write to CSV file Return: 0 To write: 14 Written: 0'
),
new PHPUnit_Framework_Constraint_StringContains(
'Cannot write to CSV file Error: fwrite(): ' .
'write of 14 bytes failed with errno=9 Bad file descriptor Return: false To write: 14 Written: 0'
)
]);
self::assertThat($e->getMessage(), $or);
}
}

public function testInvalidPointer2()
Expand Down
Loading

0 comments on commit a439656

Please sign in to comment.