diff --git a/src/PhpSpreadsheet/Reader/Csv.php b/src/PhpSpreadsheet/Reader/Csv.php index 92b0f6ac61..dc746735d2 100644 --- a/src/PhpSpreadsheet/Reader/Csv.php +++ b/src/PhpSpreadsheet/Reader/Csv.php @@ -4,6 +4,7 @@ use InvalidArgumentException; use PhpOffice\PhpSpreadsheet\Cell\Coordinate; +use PhpOffice\PhpSpreadsheet\Reader\Csv\Delimiter; use PhpOffice\PhpSpreadsheet\Shared\StringHelper; use PhpOffice\PhpSpreadsheet\Spreadsheet; @@ -138,118 +139,26 @@ protected function inferSeparator(): void return; } - $potentialDelimiters = [',', ';', "\t", '|', ':', ' ', '~']; - $counts = []; - foreach ($potentialDelimiters as $delimiter) { - $counts[$delimiter] = []; - } - - // Count how many times each of the potential delimiters appears in each line - $numberLines = 0; - while (($line = $this->getNextLine()) !== false && (++$numberLines < 1000)) { - $countLine = []; - for ($i = strlen($line) - 1; $i >= 0; --$i) { - $char = $line[$i]; - if (isset($counts[$char])) { - if (!isset($countLine[$char])) { - $countLine[$char] = 0; - } - ++$countLine[$char]; - } - } - foreach ($potentialDelimiters as $delimiter) { - $counts[$delimiter][] = $countLine[$delimiter] - ?? 0; - } - } + $inferenceEngine = new Delimiter($this->fileHandle, $this->escapeCharacter, $this->enclosure); // If number of lines is 0, nothing to infer : fall back to the default - if ($numberLines === 0) { - $this->delimiter = reset($potentialDelimiters); + if ($inferenceEngine->linesCounted() === 0) { + $this->delimiter = $inferenceEngine->getDefaultDelimiter(); $this->skipBOM(); return; } - // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently) - $meanSquareDeviations = []; - $middleIdx = floor(($numberLines - 1) / 2); - - foreach ($potentialDelimiters as $delimiter) { - $series = $counts[$delimiter]; - sort($series); - - $median = ($numberLines % 2) - ? 
$series[$middleIdx] - : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2; - - if ($median === 0) { - continue; - } - - $meanSquareDeviations[$delimiter] = array_reduce( - $series, - function ($sum, $value) use ($median) { - return $sum + ($value - $median) ** 2; - } - ) / count($series); - } - - // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected) - $min = INF; - foreach ($potentialDelimiters as $delimiter) { - if (!isset($meanSquareDeviations[$delimiter])) { - continue; - } - - if ($meanSquareDeviations[$delimiter] < $min) { - $min = $meanSquareDeviations[$delimiter]; - $this->delimiter = $delimiter; - } - } + $this->delimiter = $inferenceEngine->infer(); // If no delimiter could be detected, fall back to the default if ($this->delimiter === null) { - $this->delimiter = reset($potentialDelimiters); + $this->delimiter = $inferenceEngine->getDefaultDelimiter(); } $this->skipBOM(); } - /** - * Get the next full line from the file. - * - * @return false|string - */ - private function getNextLine() - { - $line = ''; - $enclosure = ($this->escapeCharacter === '' ? '' - : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')')) - . preg_quote($this->enclosure, '/'); - - do { - // Get the next line in the file - $newLine = fgets($this->fileHandle); - - // Return false if there is no next line - if ($newLine === false) { - return false; - } - - // Add the new line to the line passed in - $line = $line . $newLine; - - // Drop everything that is enclosed to avoid counting false positives in enclosures - $line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/Us', '', $line); - - // See if we have any enclosures left in the line - // if we still have an enclosure then we need to read the next line as well - } while (preg_match('/(' . $enclosure . ')/', $line) > 0); - - return $line; - } - /** * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns). 
* diff --git a/src/PhpSpreadsheet/Reader/Csv/Delimiter.php b/src/PhpSpreadsheet/Reader/Csv/Delimiter.php new file mode 100644 index 0000000000..eb62c9ac04 --- /dev/null +++ b/src/PhpSpreadsheet/Reader/Csv/Delimiter.php @@ -0,0 +1,144 @@ +fileHandle = $fileHandle; + $this->escapeCharacter = $escapeCharacter; + $this->enclosure = $enclosure; + + $this->countPotentialDelimiters(); + } + + public function getDefaultDelimiter(): string + { + return self::POTENTIAL_DELIMETERS[0]; + } + + public function linesCounted(): int + { + return $this->numberLines; + } + + protected function countPotentialDelimiters(): void + { + $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []); + $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS); + + // Count how many times each of the potential delimiters appears in each line + $this->numberLines = 0; + while (($line = $this->getNextLine()) !== false && (++$this->numberLines < 1000)) { + $this->countDelimiterValues($line, $delimiterKeys); + } + } + + protected function countDelimiterValues(string $line, array $delimiterKeys): void + { + $splitString = str_split($line, 1); + if (!is_array($splitString)) { + return; + } + + $distribution = array_count_values($splitString); + $countLine = array_intersect_key($distribution, $delimiterKeys); + + foreach (self::POTENTIAL_DELIMETERS as $delimiter) { + $this->counts[$delimiter][] = $countLine[$delimiter] ?? 0; + } + } + + public function infer(): ?string + { + // Calculate the mean square deviations for each delimiter + // (ignoring delimiters that haven't been found consistently) + $meanSquareDeviations = []; + $middleIdx = floor(($this->numberLines - 1) / 2); + + foreach (self::POTENTIAL_DELIMETERS as $delimiter) { + $series = $this->counts[$delimiter]; + sort($series); + + $median = ($this->numberLines % 2) + ? 
$series[$middleIdx] + : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2; + + if ($median === 0) { + continue; + } + + $meanSquareDeviations[$delimiter] = array_reduce( + $series, + function ($sum, $value) use ($median) { + return $sum + ($value - $median) ** 2; + } + ) / count($series); + } + + // ... and pick the delimiter with the smallest mean square deviation + // (in case of ties, the order in potentialDelimiters is respected) + $min = INF; + foreach (self::POTENTIAL_DELIMETERS as $delimiter) { + if (!isset($meanSquareDeviations[$delimiter])) { + continue; + } + + if ($meanSquareDeviations[$delimiter] < $min) { + $min = $meanSquareDeviations[$delimiter]; + $this->delimiter = $delimiter; + } + } + + return $this->delimiter; + } + + /** + * Get the next full line from the file. + * + * @return false|string + */ + public function getNextLine() + { + $line = ''; + $enclosure = ($this->escapeCharacter === '' ? '' + : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')')) + . preg_quote($this->enclosure, '/'); + + do { + // Get the next line in the file + $newLine = fgets($this->fileHandle); + + // Return false if there is no next line + if ($newLine === false) { + return false; + } + + // Add the new line to the line passed in + $line = $line . $newLine; + + // Drop everything that is enclosed to avoid counting false positives in enclosures + $line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/Us', '', $line); + + // See if we have any enclosures left in the line + // if we still have an enclosure then we need to read the next line as well + } while (preg_match('/(' . $enclosure . ')/', $line) > 0); + + return $line; + } +}