-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #41 from keboola/webrouse-COM-350-fix-line-breaks-…
…detect Fix line breaks detection in edge-case
- Loading branch information
Showing
11 changed files
with
356 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,10 @@ language: php | |
php: | ||
- 5.6 | ||
- 7.0 | ||
- hhvm | ||
- 7.1 | ||
- 7.2 | ||
- 7.3 | ||
- 7.4 | ||
|
||
before_script: | ||
- composer install | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
<?php | ||
|
||
namespace Keboola\Csv; | ||
|
||
class LineBreaksHelper | ||
{ | ||
const REGEXP_DELIMITER = '~'; | ||
|
||
/** | ||
* Detect line-breaks style in CSV file | ||
* @param string $sample | ||
* @param string $enclosure | ||
* @param string $escapedBy | ||
* @return string | ||
*/ | ||
public static function detectLineBreaks($sample, $enclosure, $escapedBy) | ||
{ | ||
$cleared = self::clearCsvValues($sample, $enclosure, $escapedBy); | ||
|
||
$possibleLineBreaks = [ | ||
"\r\n", // win | ||
"\r", // mac | ||
"\n", // unix | ||
]; | ||
|
||
$lineBreaksPositions = []; | ||
foreach ($possibleLineBreaks as $lineBreak) { | ||
$position = strpos($cleared, $lineBreak); | ||
if ($position === false) { | ||
continue; | ||
} | ||
$lineBreaksPositions[$lineBreak] = $position; | ||
} | ||
|
||
|
||
asort($lineBreaksPositions); | ||
reset($lineBreaksPositions); | ||
|
||
return empty($lineBreaksPositions) ? "\n" : key($lineBreaksPositions); | ||
} | ||
|
||
/** | ||
* Clear enclosured values in CSV eg. "abc" to "", | ||
* because these values can contain line breaks eg, "abc\n\r\n\r\r\r\r", | ||
* and this makes it difficult to detect line breaks style in CSV, | ||
* if are another line breaks present in first line. | ||
* @internal Should be used only in detectLineBreaks, public for easier testing. | ||
* @param string $sample | ||
* @param string $enclosure | ||
* @param string $escapedBy eg. empty string or \ | ||
* @return string | ||
*/ | ||
public static function clearCsvValues($sample, $enclosure, $escapedBy) | ||
{ | ||
// Usually an enclosure character is escaped by doubling it, however, the escapeBy can be used | ||
$doubleEnclosure = $enclosure . $enclosure; | ||
$escapedEnclosure = empty($escapedBy) ? $doubleEnclosure : $escapedBy . $enclosure; | ||
$escapedEscape = empty($escapedBy) ? null : $escapedBy . $escapedBy; | ||
|
||
/* | ||
* Regexp examples: | ||
* enclosure: |"|, escapedBy: none, regexp: ~"(?>(?>"")|[^"])*"~ | ||
* enclosure: |"|, escapedBy: |\|, regexp: ~"(?>(?>\\"|\\\\)|[^"])*"~ | ||
*/ | ||
// @formatter:off | ||
$regexp = | ||
// regexp start | ||
self::REGEXP_DELIMITER . | ||
// enclosure start | ||
preg_quote($enclosure, self::REGEXP_DELIMITER) . | ||
/* | ||
* Once-only group => if there is a match, do not try other alternatives | ||
* See: https://www.php.net/manual/en/regexp.reference.onlyonce.php | ||
* Without once-only group will be |"abc\"| false positive, | ||
* because |\| is matched by group and |"| at the end of regexp. | ||
*/ | ||
// repeated once-only group start | ||
'(?>' . | ||
// once-only group start | ||
'(?>' . | ||
// escaped enclosure | ||
preg_quote($escapedEnclosure, self::REGEXP_DELIMITER) . | ||
// OR escaped escape char | ||
($escapedEscape ? '|' . preg_quote($escapedEscape, self::REGEXP_DELIMITER) : '') . | ||
// group end | ||
')' . | ||
// OR not enclosure | ||
'|[^' . preg_quote($enclosure, self::REGEXP_DELIMITER) . ']' . | ||
// group end | ||
')*' . | ||
// enclosure end | ||
preg_quote($enclosure, self::REGEXP_DELIMITER) . | ||
// regexp end | ||
self::REGEXP_DELIMITER; | ||
// @formatter:on | ||
|
||
return preg_replace($regexp, $doubleEnclosure, $sample); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.