-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix line breaks detection in edge-case #41
Changes from all commits
ee66641
2595325
c25316e
f9eaab3
e78714d
5bee503
bba5f75
a6d0826
86f24a0
602fa66
ff9aed8
40d85d0
2811a35
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,10 @@ language: php | |
php: | ||
- 5.6 | ||
- 7.0 | ||
- hhvm | ||
- 7.1 | ||
- 7.2 | ||
- 7.3 | ||
- 7.4 | ||
|
||
before_script: | ||
- composer install | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -111,29 +111,10 @@ protected function openCsvFile($fileName) | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
*/ | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
protected function detectLineBreak() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
{ | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
rewind($this->getFilePointer()); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$sample = fread($this->getFilePointer(), 10000); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$possibleLineBreaks = [ | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
"\r\n", // win | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
"\r", // mac | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
"\n", // unix | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
]; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$lineBreaksPositions = []; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
foreach ($possibleLineBreaks as $lineBreak) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$position = strpos($sample, $lineBreak); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
if ($position === false) { | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
continue; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$lineBreaksPositions[$lineBreak] = $position; | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
asort($lineBreaksPositions); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
reset($lineBreaksPositions); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
@rewind($this->getFilePointer()); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$sample = @fread($this->getFilePointer(), 10000); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
return empty($lineBreaksPositions) ? "\n" : key($lineBreaksPositions); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
return LineBreaksHelper::detectLineBreaks($sample, $this->getEnclosure(), $this->getEscapedBy()); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
/** | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
@@ -148,7 +129,7 @@ protected function readLine() | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
// allow empty enclosure hack | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$enclosure = !$this->getEnclosure() ? chr(0) : $this->getEnclosure(); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
$escapedBy = !$this->getEscapedBy() ? chr(0) : $this->getEscapedBy(); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
return fgetcsv($this->getFilePointer(), null, $this->getDelimiter(), $enclosure, $escapedBy); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
return @fgetcsv($this->getFilePointer(), null, $this->getDelimiter(), $enclosure, $escapedBy); | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. V PHP 7.4 (pridane do CI) funkcie Z pohladu uzivatela sa vrati prazdne pole, testuje sa to tu (tento test padal): Lines 477 to 484 in 86f24a0
V php-csv/tests/CsvWriteTest.php Lines 190 to 215 in 86f24a0
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
/** | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,7 +90,7 @@ public function writeRow(array $row) | |
"Cannot write to CSV file " . $this->fileName . | ||
($ret === false && error_get_last() ? 'Error: ' . error_get_last()['message'] : '') . | ||
' Return: ' . json_encode($ret) . | ||
' To write: ' . strlen($str) . ' Written: ' . $ret, | ||
' To write: ' . strlen($str) . ' Written: ' . (int) $ret, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. $ret moze byt aj false, no v chybovej sprave chceme cislo. |
||
Exception::WRITE_ERROR | ||
); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
<?php | ||
|
||
namespace Keboola\Csv; | ||
|
||
class LineBreaksHelper
{
    const REGEXP_DELIMITER = '~';

    /**
     * Detect the line-break style used in a CSV sample.
     *
     * Enclosed values are stripped from the sample first (see clearCsvValues),
     * so line breaks that are part of a field value do not influence the result.
     * The line break that occurs first in the remaining text wins; when no
     * line break is present at all, "\n" is returned.
     *
     * @param string $sample beginning of the CSV content
     * @param string $enclosure enclosure character, may be an empty string
     * @param string $escapedBy escape character, may be an empty string
     * @return string one of "\r\n", "\r" or "\n"
     */
    public static function detectLineBreaks($sample, $enclosure, $escapedBy)
    {
        $cleared = self::clearCsvValues($sample, $enclosure, $escapedBy);

        // Ordered by priority: "\r\n" is listed before "\r" because both are
        // always found at the same offset and the longer sequence has to win.
        $possibleLineBreaks = [
            "\r\n", // win
            "\r", // mac
            "\n", // unix
        ];

        $detected = "\n";
        $firstPosition = null;
        foreach ($possibleLineBreaks as $lineBreak) {
            $position = strpos($cleared, $lineBreak);
            if ($position === false) {
                continue;
            }

            // Strict "<" keeps the earlier (higher priority) candidate on a tie,
            // so the result does not depend on the stability of a sort.
            if ($firstPosition === null || $position < $firstPosition) {
                $firstPosition = $position;
                $detected = $lineBreak;
            }
        }

        return $detected;
    }

    /**
     * Replace every enclosed value in the sample by an empty enclosed value,
     * e.g. "abc" becomes "".
     *
     * Enclosed values may contain line breaks, e.g. "abc\n\r\n\r\r\r\r", which
     * makes it difficult to detect the line-break style of the CSV when such
     * a value appears before the first real row separator.
     *
     * The sample is returned unchanged when the enclosure is empty (there are
     * no enclosed values to clear) or when the regular expression fails.
     *
     * @internal Should be used only in detectLineBreaks, public for easier testing.
     * @param string $sample
     * @param string $enclosure
     * @param string $escapedBy eg. empty string or \
     * @return string
     */
    public static function clearCsvValues($sample, $enclosure, $escapedBy)
    {
        $sample = (string) $sample;
        $enclosure = (string) $enclosure;

        // Without an enclosure the generated pattern would end in an
        // unterminated character class ("[^]") that PCRE refuses to compile.
        if ($enclosure === '') {
            return $sample;
        }

        // Usually an enclosure character is escaped by doubling it, however, the escapeBy can be used
        $doubleEnclosure = $enclosure . $enclosure;
        $escapedEnclosure = empty($escapedBy) ? $doubleEnclosure : $escapedBy . $enclosure;
        $escapedEscape = empty($escapedBy) ? null : $escapedBy . $escapedBy;

        /*
         * Regexp examples:
         * enclosure: |"|, escapedBy: none, regexp: ~"(?>(?>"")|[^"])*"~
         * enclosure: |"|, escapedBy: |\|, regexp: ~"(?>(?>\\"|\\\\)|[^"])*"~
         */
        // @formatter:off
        $regexp =
            // regexp start
            self::REGEXP_DELIMITER .
            // enclosure start
            preg_quote($enclosure, self::REGEXP_DELIMITER) .
            /*
             * Once-only group => if there is a match, do not try other alternatives
             * See: https://www.php.net/manual/en/regexp.reference.onlyonce.php
             * Without the once-only group, |"abc\"| would be a false positive,
             * because |\| would be matched by the group and |"| by the end of the regexp.
             */
            // repeated once-only group start
            '(?>' .
                // once-only group start
                '(?>' .
                    // escaped enclosure
                    preg_quote($escapedEnclosure, self::REGEXP_DELIMITER) .
                    // OR escaped escape char
                    ($escapedEscape ? '|' . preg_quote($escapedEscape, self::REGEXP_DELIMITER) : '') .
                // group end
                ')' .
                // OR not enclosure
                '|[^' . preg_quote($enclosure, self::REGEXP_DELIMITER) . ']' .
            // group end
            ')*' .
            // enclosure end
            preg_quote($enclosure, self::REGEXP_DELIMITER) .
            // regexp end
            self::REGEXP_DELIMITER;
        // @formatter:on

        $cleared = preg_replace($regexp, $doubleEnclosure, $sample);

        // preg_replace returns null on a PCRE error; fall back to the raw sample
        // so that detection still works on the unmodified text.
        return $cleared === null ? $sample : $cleared;
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -137,6 +137,24 @@ public function testParseEscapedBy() | |
self::assertEquals($expected, iterator_to_array($csvFile)); | ||
} | ||
|
||
public function testParseMacLineEndsInField() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Najprv som do PR pridal test, ktory testuje tento edge-case, bez fixu to padalo: |
||
{ | ||
$csvFile = new CsvReader(__DIR__ . '/data/test-input.lineBreaks.csv', ",", '"', '\\'); | ||
|
||
$expected = [ | ||
[ | ||
'test', | ||
"some text\rwith\r\\r line breaks\rinside\rbut\rrows\rare\rusing \\n \\\"line\\\" break\r", | ||
], | ||
[ | ||
'name', 'data' | ||
] | ||
]; | ||
|
||
self::assertEquals($expected, iterator_to_array($csvFile)); | ||
} | ||
|
||
|
||
public function testEmptyHeader() | ||
{ | ||
$csvFile = new CsvReader(__DIR__ . '/data/test-input.empty.csv', ',', '"'); | ||
|
@@ -364,14 +382,6 @@ public function invalidSkipLinesProvider() | |
]; | ||
} | ||
|
||
public function testInvalidNewLines() | ||
{ | ||
self::expectException(Exception::class); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tento test teraz uz prechadza - nenastane exception, ... binarny subor ktory sa tam pouzival, obsahuje vela uvodzoviek ... a kedze sa teraz uz obsah medzi uvodzovkami ignoruje, ked sa detekuje zalomenie riadkov ... tak sa to nahodou teraz zdetekuje ako There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nevim jestli to neni dost brutalni zmena, co t bude delat kdyz to bude parsovat jpgcko nebo zip :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nemyslim si, ze ide o velku zmenu. Tento test prechadzal iba nahodou, ak by sa tam dal iny binarny subor, ktory by obsahoval skor kod znaku Toto repo CSVcka nijako nevaliduje, mozes mat napr rozny pocet stlpcov v jednotlivych riadkoch. Tu su pozície klucovych znakov zo suboru
Teda v starom kode sa ako prve naslo Mozem ten binarny subor upravit aby na pozicii There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. jojo, super! vubec me to vcera takhle nedoslo |
||
self::expectExceptionMessage('Invalid line break. Please use unix \n or win \r\n line breaks.'); | ||
new CsvReader(__DIR__ . DIRECTORY_SEPARATOR . 'data/binary'); | ||
} | ||
|
||
|
||
public function testValidWithoutRewind() | ||
{ | ||
$fileName = __DIR__ . '/data/simple.csv'; | ||
|
@@ -479,4 +489,54 @@ public function testInvalidFile() | |
self::expectExceptionMessage('Invalid file: array'); | ||
new CsvReader(['bad']); | ||
} | ||
|
||
/** | ||
* @dataProvider getPerformanceTestInputs | ||
* @param string $fileContent | ||
* @param int $expectedRows | ||
* @param float $maxDuration | ||
*/ | ||
public function testPerformance($fileContent, $expectedRows, $maxDuration) | ||
{ | ||
self::markTestSkipped( | ||
'Run this test only manually. Because the duration is very different in local CI environment.' | ||
); | ||
|
||
try { | ||
$fileName = sys_get_temp_dir() . DIRECTORY_SEPARATOR . uniqid('perf-test'); | ||
file_put_contents($fileName, $fileContent); | ||
$startTime = microtime(true); | ||
$reader = new CsvReader($fileName); | ||
$rows = 0; | ||
foreach ($reader as $line){ | ||
$rows++; | ||
} | ||
$duration = microtime(true) - $startTime; | ||
self::assertSame($expectedRows, $rows); | ||
self::assertLessThanOrEqual($maxDuration, $duration); | ||
} finally { | ||
@unlink($fileName); | ||
} | ||
} | ||
|
||
public function getPerformanceTestInputs() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pridal som perf testy. Podla nich na mojom PC trva spracovanie regularneho vyrazu Napr. pre prvy data set
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ja bych rek, ze dobry - melo by to byt zhorseni o konstantu, takze v pohode |
||
{ | ||
yield '1M-simple-rows' => [ | ||
str_repeat("abc,def,\"xyz\"\n", 1000000), | ||
1000000, | ||
8.0 | ||
]; | ||
|
||
yield '1M-empty-lines-n' => [ | ||
str_repeat("\n", 1000000), | ||
1000000, | ||
8.0 | ||
]; | ||
|
||
yield '1M-no-separators' => [ | ||
str_repeat(md5('abc') . "\n", 1000000), | ||
1000000, | ||
8.0 | ||
]; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,8 @@ | |
use Keboola\Csv\CsvWriter; | ||
use Keboola\Csv\Exception; | ||
use PHPUnit\Framework\TestCase; | ||
use PHPUnit_Framework_Constraint_Or; | ||
use PHPUnit_Framework_Constraint_StringContains; | ||
|
||
class CsvWriteTest extends TestCase | ||
{ | ||
|
@@ -87,9 +89,19 @@ public function testWriteInvalidObject() | |
]; | ||
|
||
$csvFile->writeRow($rows[0]); | ||
self::expectException(Exception::class); | ||
self::expectExceptionMessage("Cannot write data into column: stdClass::"); | ||
$csvFile->writeRow($rows[1]); | ||
|
||
try { | ||
$csvFile->writeRow($rows[1]); | ||
self::fail('Expected exception was not thrown.'); | ||
} catch (Exception $e) { | ||
// Exception message differs between PHP versions. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pridal som do CI novsie verzie PHP, v tomto teste sa mierne lisi chybova sprava, medzi verziami PHP. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. me nikdy nenapadlo se s tim takhle srat, ja bych zkratil tu message :D ale kazdopadne 👍 |
||
$or = new PHPUnit_Framework_Constraint_Or(); | ||
$or->setConstraints([ | ||
new PHPUnit_Framework_Constraint_StringContains("Cannot write data into column: stdClass::"), | ||
new PHPUnit_Framework_Constraint_StringContains("Cannot write data into column: (object) array(\n)") | ||
]); | ||
self::assertThat($e->getMessage(), $or); | ||
} | ||
} | ||
|
||
public function testWriteValidObject() | ||
|
@@ -182,9 +194,24 @@ public function testInvalidPointer() | |
$pointer = fopen($fileName, 'r'); | ||
$csvFile = new CsvWriter($pointer); | ||
$rows = [['col1', 'col2']]; | ||
self::expectException(Exception::class); | ||
self::expectExceptionMessage('Cannot write to CSV file Return: 0 To write: 14 Written: 0'); | ||
$csvFile->writeRow($rows[0]); | ||
|
||
try { | ||
$csvFile->writeRow($rows[0]); | ||
self::fail('Expected exception was not thrown.'); | ||
} catch (Exception $e) { | ||
// Exception message differs between PHP versions. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pridal som do CI novsie verzie PHP, v tomto teste sa tiez mierne lisi chybova sprava, medzi verziami PHP. |
||
$or = new PHPUnit_Framework_Constraint_Or(); | ||
$or->setConstraints([ | ||
new PHPUnit_Framework_Constraint_StringContains( | ||
'Cannot write to CSV file Return: 0 To write: 14 Written: 0' | ||
), | ||
new PHPUnit_Framework_Constraint_StringContains( | ||
'Cannot write to CSV file Error: fwrite(): ' . | ||
'write of 14 bytes failed with errno=9 Bad file descriptor Return: false To write: 14 Written: 0' | ||
) | ||
]); | ||
self::assertThat($e->getMessage(), $or); | ||
} | ||
} | ||
|
||
public function testInvalidPointer2() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Stare ani nove buildy uz s HHVM neprechadzaju, HHVM uz nepodporuje PHPcko:
https://hhvm.com/blog/2017/09/18/the-future-of-hhvm.html
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
jasnacka, ale pridej tam nove verze PHP
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok. vdaka pridane v a6d0826