Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Csv reader refactor infer delimiter #1948

Merged
merged 4 commits into from
Mar 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 6 additions & 97 deletions src/PhpSpreadsheet/Reader/Csv.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use InvalidArgumentException;
use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
use PhpOffice\PhpSpreadsheet\Reader\Csv\Delimiter;
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
use PhpOffice\PhpSpreadsheet\Spreadsheet;

Expand Down Expand Up @@ -138,118 +139,26 @@ protected function inferSeparator(): void
return;
}

$potentialDelimiters = [',', ';', "\t", '|', ':', ' ', '~'];
$counts = [];
foreach ($potentialDelimiters as $delimiter) {
$counts[$delimiter] = [];
}

// Count how many times each of the potential delimiters appears in each line
$numberLines = 0;
while (($line = $this->getNextLine()) !== false && (++$numberLines < 1000)) {
$countLine = [];
for ($i = strlen($line) - 1; $i >= 0; --$i) {
$char = $line[$i];
if (isset($counts[$char])) {
if (!isset($countLine[$char])) {
$countLine[$char] = 0;
}
++$countLine[$char];
}
}
foreach ($potentialDelimiters as $delimiter) {
$counts[$delimiter][] = $countLine[$delimiter]
?? 0;
}
}
$inferenceEngine = new Delimiter($this->fileHandle, $this->escapeCharacter, $this->enclosure);

// If number of lines is 0, nothing to infer : fall back to the default
if ($numberLines === 0) {
$this->delimiter = reset($potentialDelimiters);
if ($inferenceEngine->linesCounted() === 0) {
$this->delimiter = $inferenceEngine->getDefaultDelimiter();
$this->skipBOM();

return;
}

// Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
$meanSquareDeviations = [];
$middleIdx = floor(($numberLines - 1) / 2);

foreach ($potentialDelimiters as $delimiter) {
$series = $counts[$delimiter];
sort($series);

$median = ($numberLines % 2)
? $series[$middleIdx]
: ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

if ($median === 0) {
continue;
}

$meanSquareDeviations[$delimiter] = array_reduce(
$series,
function ($sum, $value) use ($median) {
return $sum + ($value - $median) ** 2;
}
) / count($series);
}

// ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
$min = INF;
foreach ($potentialDelimiters as $delimiter) {
if (!isset($meanSquareDeviations[$delimiter])) {
continue;
}

if ($meanSquareDeviations[$delimiter] < $min) {
$min = $meanSquareDeviations[$delimiter];
$this->delimiter = $delimiter;
}
}
$this->delimiter = $inferenceEngine->infer();

// If no delimiter could be detected, fall back to the default
if ($this->delimiter === null) {
$this->delimiter = reset($potentialDelimiters);
$this->delimiter = $inferenceEngine->getDefaultDelimiter();
}

$this->skipBOM();
}

/**
* Get the next full line from the file.
*
* @return false|string
*/
private function getNextLine()
{
    // An enclosure only counts when it is not preceded by the escape character (if one is configured).
    $escapeGuard = '';
    if ($this->escapeCharacter !== '') {
        $escapeGuard = '(?<!' . preg_quote($this->escapeCharacter, '/') . ')';
    }
    $enclosure = $escapeGuard . preg_quote($this->enclosure, '/');

    // Both patterns are loop-invariant, so they are assembled once up front.
    $enclosedSpan = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
    $danglingEnclosure = '/(' . $enclosure . ')/';

    $line = '';
    while (true) {
        // Pull the next physical line; running out of input ends the read with false.
        $chunk = fgets($this->fileHandle);
        if ($chunk === false) {
            return false;
        }

        // Append the chunk, then drop every fully enclosed span so that
        // delimiters inside enclosures are not counted as false positives.
        $line = preg_replace($enclosedSpan, '', $line . $chunk);

        // A leftover enclosure means the enclosed value continues on the next
        // physical line, so keep reading.
        if (preg_match($danglingEnclosure, $line) > 0) {
            continue;
        }

        return $line;
    }
}

/**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
*
Expand Down
144 changes: 144 additions & 0 deletions src/PhpSpreadsheet/Reader/Csv/Delimiter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
<?php

namespace PhpOffice\PhpSpreadsheet\Reader\Csv;

/**
 * Infers the most likely field delimiter of a CSV stream.
 *
 * On construction the class reads up to MAX_LINES logical lines from the supplied
 * file handle and records, for every line, how often each candidate delimiter occurs
 * outside of enclosures. infer() then picks the candidate whose per-line count
 * deviates least from its median.
 */
class Delimiter
{
    /**
     * Candidate delimiters in order of preference; the first entry is the default.
     *
     * The misspelled name is kept on purpose: the constant is protected and may be
     * referenced by subclasses.
     */
    protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];

    /** Maximum number of logical lines sampled from the stream. */
    private const MAX_LINES = 1000;

    /** @var resource Stream the sample lines are read from */
    protected $fileHandle;

    /** @var string Character that escapes an enclosure ('' for none) */
    protected $escapeCharacter;

    /** @var string Character that encloses field values */
    protected $enclosure;

    /** @var array<string, int[]> Per-delimiter list of occurrence counts, one entry per sampled line */
    protected $counts = [];

    /** @var int Number of logical lines that were sampled */
    protected $numberLines = 0;

    /** @var null|string Delimiter selected by the last successful infer() call */
    protected $delimiter;

    /**
     * Store the parsing settings and immediately sample the stream.
     *
     * @param resource $fileHandle readable stream positioned where sampling should start
     * @param string $escapeCharacter escape character for enclosures ('' for none)
     * @param string $enclosure enclosure character
     */
    public function __construct($fileHandle, $escapeCharacter, $enclosure)
    {
        $this->fileHandle = $fileHandle;
        $this->escapeCharacter = $escapeCharacter;
        $this->enclosure = $enclosure;

        $this->countPotentialDelimiters();
    }

    /**
     * Delimiter to use when nothing can be inferred (first candidate in the list).
     */
    public function getDefaultDelimiter(): string
    {
        return self::POTENTIAL_DELIMETERS[0];
    }

    /**
     * Number of logical lines that were sampled from the stream.
     */
    public function linesCounted(): int
    {
        return $this->numberLines;
    }

    /**
     * Sample the stream and record the per-line occurrence count of every candidate.
     */
    protected function countPotentialDelimiters(): void
    {
        $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
        $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

        // The cap is tested BEFORE reading and the counter is advanced AFTER recording,
        // so numberLines always equals the number of entries in each series.
        $this->numberLines = 0;
        while ($this->numberLines < self::MAX_LINES && ($line = $this->getNextLine()) !== false) {
            $this->countDelimiterValues($line, $delimiterKeys);
            ++$this->numberLines;
        }
    }

    /**
     * Append one entry per candidate delimiter holding its number of occurrences in $line.
     *
     * @param string $line line with enclosed spans already removed
     * @param array<string, int> $delimiterKeys candidate delimiters as array keys
     */
    protected function countDelimiterValues(string $line, array $delimiterKeys): void
    {
        // An entry is always recorded (even for an empty line) so that every series
        // stays aligned with the line counter.
        $characters = ($line === '') ? [] : str_split($line, 1);
        $countLine = array_intersect_key(array_count_values($characters), $delimiterKeys);

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $this->counts[$delimiter][] = $countLine[$delimiter] ?? 0;
        }
    }

    /**
     * Pick the candidate with the smallest mean square deviation from its median count.
     *
     * Candidates whose median count is zero (not found consistently) are ignored.
     * On ties the order of POTENTIAL_DELIMETERS is respected.
     *
     * @return null|string the inferred delimiter, or null when no candidate qualifies
     */
    public function infer(): ?string
    {
        $min = INF;

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $series = $this->counts[$delimiter] ?? [];
            $sampleSize = count($series);
            if ($sampleSize === 0) {
                // Nothing was sampled: there is no median to compute.
                continue;
            }

            sort($series);
            $middleIdx = intdiv($sampleSize - 1, 2);
            $median = ($sampleSize % 2 === 1)
                ? $series[$middleIdx]
                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

            // The median is the integer 0 only when the delimiter is absent from
            // (at least) half of the lines; integer division of 0 stays an int in PHP.
            if ($median === 0) {
                continue;
            }

            $squaredDeviation = array_reduce(
                $series,
                static function ($sum, $value) use ($median) {
                    return $sum + ($value - $median) ** 2;
                },
                0
            );
            $meanSquareDeviation = $squaredDeviation / $sampleSize;

            // Strict comparison keeps the earlier candidate on ties.
            if ($meanSquareDeviation < $min) {
                $min = $meanSquareDeviation;
                $this->delimiter = $delimiter;
            }
        }

        return $this->delimiter;
    }

    /**
     * Get the next full (logical) line from the file, with enclosed spans removed.
     *
     * Physical lines are concatenated for as long as an enclosure is still open, so a
     * value containing line breaks is treated as part of a single logical line.
     *
     * @return false|string the stripped line, or false when the stream is exhausted
     */
    public function getNextLine()
    {
        $enclosureChar = (string) $this->enclosure;
        if ($enclosureChar === '') {
            // With no enclosure there is nothing to strip; the pattern would also match
            // the empty string everywhere and the loop below would never terminate
            // before the end of the stream.
            return fgets($this->fileHandle);
        }

        $escapeChar = (string) $this->escapeCharacter;
        $enclosure = ($escapeChar === '' ? '' : ('(?<!' . preg_quote($escapeChar, '/') . ')'))
            . preg_quote($enclosureChar, '/');
        $enclosedSpan = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
        $danglingEnclosure = '/(' . $enclosure . ')/';

        $line = '';
        do {
            // Get the next line in the file
            $newLine = fgets($this->fileHandle);

            // Return false if there is no next line
            if ($newLine === false) {
                return false;
            }

            $candidate = $line . $newLine;

            // Drop everything that is enclosed to avoid counting false positives in enclosures
            $stripped = preg_replace($enclosedSpan, '', $candidate);
            if ($stripped === null) {
                // PCRE failure (e.g. backtrack limit): fall back to the raw text
                // rather than handing a non-string to the caller.
                return $candidate;
            }
            $line = $stripped;

            // If an enclosure is still left in the line, the enclosed value continues
            // on the next physical line, so read that one as well
        } while (preg_match($danglingEnclosure, $line) > 0);

        return $line;
    }
}