-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Csv reader refactor infer delimiter (#1948)
* Refactor delimiter inference for CSV file reading into a separate class
- Loading branch information
Mark Baker
authored
Mar 24, 2021
1 parent
07ad800
commit b7f9375
Showing
2 changed files
with
150 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
<?php | ||
|
||
namespace PhpOffice\PhpSpreadsheet\Reader\Csv; | ||
|
||
/**
 * Infers the field delimiter of a CSV stream by sampling its leading lines.
 *
 * On construction the stream is read (up to a fixed number of logical lines) and,
 * for every candidate delimiter, the number of occurrences outside of enclosures
 * is recorded per line. infer() then picks the candidate whose per-line count is
 * the most consistent (smallest mean square deviation from the median).
 *
 * The stream position is advanced by the sampling; this class does not rewind it.
 */
class Delimiter
{
    /**
     * Candidate delimiters, in tie-break order. The first entry is the default.
     * (The constant name keeps its historical spelling so existing references still resolve.)
     */
    protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];

    /** Maximum number of logical lines sampled from the stream. */
    private const MAX_SAMPLED_LINES = 1000;

    /** @var resource Stream the lines are read from with fgets() */
    protected $fileHandle;

    /** @var string Character that escapes an enclosure character ('' for none) */
    protected $escapeCharacter;

    /** @var string Character that encloses field values */
    protected $enclosure;

    /** @var array<string, int[]> Per-delimiter list of occurrence counts, one entry per sampled line */
    protected $counts = [];

    /** @var int Number of logical lines that were sampled */
    protected $numberLines = 0;

    /** @var null|string Delimiter chosen by the most recent successful infer() */
    protected $delimiter;

    /**
     * Stores the stream settings and immediately samples the stream.
     *
     * @param resource $fileHandle Readable stream positioned where sampling should start
     * @param string $escapeCharacter Escape character for enclosures ('' for none)
     * @param string $enclosure Enclosure character
     */
    public function __construct($fileHandle, $escapeCharacter, $enclosure)
    {
        $this->fileHandle = $fileHandle;
        $this->escapeCharacter = $escapeCharacter;
        $this->enclosure = $enclosure;

        $this->countPotentialDelimiters();
    }

    /**
     * Returns the delimiter to fall back on: the first candidate in the list.
     */
    public function getDefaultDelimiter(): string
    {
        return self::POTENTIAL_DELIMETERS[0];
    }

    /**
     * Returns how many logical lines were sampled from the stream.
     */
    public function linesCounted(): int
    {
        return $this->numberLines;
    }

    /**
     * Reads up to MAX_SAMPLED_LINES logical lines and records the candidate counts of each.
     */
    protected function countPotentialDelimiters(): void
    {
        $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
        $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

        // Count how many times each of the potential delimiters appears in each line.
        // The limit is tested BEFORE reading, and the counter is bumped only for lines
        // that are actually counted, so $numberLines always equals the series length.
        $this->numberLines = 0;
        while ($this->numberLines < self::MAX_SAMPLED_LINES) {
            $line = $this->getNextLine();
            if ($line === false) {
                break;
            }

            ++$this->numberLines;
            $this->countDelimiterValues($line, $delimiterKeys);
        }
    }

    /**
     * Appends, for every candidate delimiter, its number of occurrences in $line.
     *
     * @param string $line Line content with enclosed sections already removed
     * @param array $delimiterKeys Candidates to count, as array keys; candidates
     *                             missing from this map are recorded as 0
     */
    protected function countDelimiterValues(string $line, array $delimiterKeys): void
    {
        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            // Every candidate is a single byte, so a byte-wise substring count is exact.
            $this->counts[$delimiter][] = isset($delimiterKeys[$delimiter])
                ? substr_count($line, $delimiter)
                : 0;
        }
    }

    /**
     * Picks the candidate delimiter whose per-line count is the most consistent.
     *
     * @return null|string The inferred delimiter; when no candidate qualifies, the
     *                     previously inferred value (null if there is none)
     */
    public function infer(): ?string
    {
        // Calculate the mean square deviations for each delimiter
        // (ignoring delimiters that haven't been found consistently)
        $meanSquareDeviations = [];

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $series = $this->counts[$delimiter] ?? [];
            $sampleSize = count($series);

            // Nothing was sampled (e.g. empty stream): there is no median to compute.
            if ($sampleSize === 0) {
                continue;
            }

            sort($series);

            // Lower-middle index; for an even sample size the median is the mean
            // of this element and the next one.
            $middleIdx = intdiv($sampleSize - 1, 2);
            $median = ($sampleSize % 2 === 1)
                ? $series[$middleIdx]
                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

            // A zero median means the candidate is absent from at least half of the lines.
            if ((float) $median === 0.0) {
                continue;
            }

            $squaredDeviationSum = array_reduce(
                $series,
                static function ($sum, $value) use ($median) {
                    return $sum + ($value - $median) ** 2;
                },
                0
            );

            $meanSquareDeviations[$delimiter] = $squaredDeviationSum / $sampleSize;
        }

        // ... and pick the delimiter with the smallest mean square deviation
        // (in case of ties, the order in POTENTIAL_DELIMETERS is respected,
        // because only a strictly smaller deviation replaces the current choice)
        $min = INF;
        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            if (!isset($meanSquareDeviations[$delimiter])) {
                continue;
            }

            if ($meanSquareDeviations[$delimiter] < $min) {
                $min = $meanSquareDeviations[$delimiter];
                $this->delimiter = $delimiter;
            }
        }

        return $this->delimiter;
    }

    /**
     * Get the next full line from the file.
     *
     * Physical lines are concatenated for as long as an enclosure is left open, and
     * every enclosed section is removed from the result, so that delimiters inside
     * enclosed values are not counted.
     *
     * @return false|string The line with enclosed sections removed, or false when the
     *                      stream ends (also when it ends inside an open enclosure)
     */
    public function getNextLine()
    {
        $enclosureCharacter = (string) $this->enclosure;

        // Without an enclosure character there is nothing to strip or to balance.
        // Building the patterns below from an empty enclosure would match every
        // line and make the loop consume the whole stream.
        if ($enclosureCharacter === '') {
            return fgets($this->fileHandle);
        }

        $escapeCharacter = (string) $this->escapeCharacter;

        // An enclosure character only counts when it is not preceded by the escape character
        $enclosure = ($escapeCharacter === '' ? ''
                : ('(?<!' . preg_quote($escapeCharacter, '/') . ')'))
            . preg_quote($enclosureCharacter, '/');

        $enclosedSectionPattern = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
        $openEnclosurePattern = '/(' . $enclosure . ')/';

        $line = '';

        do {
            // Get the next line in the file
            $newLine = fgets($this->fileHandle);

            // Return false if there is no next line
            if ($newLine === false) {
                return false;
            }

            // Add the new line to what has been accumulated so far
            $line .= $newLine;

            // Drop everything that is enclosed to avoid counting false positives in enclosures.
            // preg_replace() yields null on a PCRE error; keep the unstripped text in that case
            // so that a string is always returned.
            $stripped = preg_replace($enclosedSectionPattern, '', $line);
            if ($stripped !== null) {
                $line = $stripped;
            }

            // See if we have any enclosures left in the line
            // if we still have an enclosure then we need to read the next line as well
        } while (preg_match($openEnclosurePattern, $line) > 0);

        return $line;
    }
}