From a44e7ea065c5dba65d64b6d7290812ae3681ecf1 Mon Sep 17 00:00:00 2001 From: Michael Sinelnikov Date: Fri, 17 Nov 2023 21:55:58 +0300 Subject: [PATCH] Add separator validation --- src/core/algorithms/algo_factory.cpp | 5 +- src/core/parser/csv_parser/csv_parser.cpp | 106 ++++++++++++++++++++++ src/core/parser/csv_parser/csv_parser.h | 5 + src/core/util/separator_validator.cpp | 12 +++ src/core/util/separator_validator.h | 12 +++ src/python_bindings/py_util/py_to_any.cpp | 4 +- 6 files changed, 142 insertions(+), 2 deletions(-) create mode 100644 src/core/util/separator_validator.cpp create mode 100644 src/core/util/separator_validator.h diff --git a/src/core/algorithms/algo_factory.cpp b/src/core/algorithms/algo_factory.cpp index 64e0173be..0a0101a35 100644 --- a/src/core/algorithms/algo_factory.cpp +++ b/src/core/algorithms/algo_factory.cpp @@ -10,6 +10,7 @@ #include "algorithms/pipelines/typo_miner/typo_miner.h" #include "config/names.h" #include "tabular_data/input_tables_type.h" +#include "util/separator_validator.h" namespace algos { @@ -47,7 +48,9 @@ void LoadAlgorithm(Algorithm& algorithm, StdParamsMap const& options) { ConfigureFromFunction(algorithm, [&options](std::string_view option_name) { using namespace config::names; auto create_input_table = [](CSVConfig const& csv_config) -> config::InputTable { - return std::make_shared(csv_config); + auto csv_parser = std::make_shared(csv_config); + csv_parser->ValidateSeparator(); + return csv_parser; }; if (option_name == kTable && options.find(std::string{kTable}) == options.end()) { diff --git a/src/core/parser/csv_parser/csv_parser.cpp b/src/core/parser/csv_parser/csv_parser.cpp index ca5ec4d4f..e0bcf186c 100644 --- a/src/core/parser/csv_parser/csv_parser.cpp +++ b/src/core/parser/csv_parser/csv_parser.cpp @@ -5,11 +5,13 @@ #include #include #include +#include #include #include #include #include +#include inline std::string& CSVParser::rtrim(std::string& s) { boost::trim_right(s); @@ -180,3 +182,107 @@ 
std::vector CSVParser::GetNextRow() { return result; }
+
+// Guesses the separator of the table: a character that occurs the same non-zero number of
+// times in every line (header included); among such characters the most frequent one wins.
+// Returns std::nullopt if no character qualifies. The parser is Reset() before returning.
+std::optional<char> CSVParser::DeduceSeparator() {
+    // Calculate statistics including the header row
+    bool const has_header_copy = has_header_;
+    has_header_ = false;
+    Reset();
+    has_header_ = has_header_copy;
+
+    std::unordered_map<char, unsigned> letter_count;
+    if (has_next_) {
+        for (char c : next_line_) {
+            ++letter_count[c];
+        }
+    }
+
+    std::unordered_map<char, unsigned> next_letter_count;
+    while (has_next_) {
+        GetNextIfHas();
+        if (!has_next_) {
+            break;  // input exhausted: next_line_ is not a new row, so it is not compared
+        }
+        next_letter_count.clear();
+        for (char c : next_line_) {
+            ++next_letter_count[c];
+        }
+        for (auto& [letter, count] : letter_count) {
+            auto const it = next_letter_count.find(letter);
+            if (it == next_letter_count.end() || it->second != count) {
+                count = 0;  // frequency differs between rows: cannot be the separator
+            }
+        }
+    }
+    Reset();
+
+    // Stays empty unless some character kept a non-zero count, so nothing is read unset
+    std::optional<char> possible_separator;
+    unsigned max_separator_count = 0;
+    for (auto const& [letter, count] : letter_count) {
+        if (count > max_separator_count) {
+            max_separator_count = count;
+            possible_separator = letter;
+        }
+    }
+    return possible_separator;
+}
+
+// Returns true iff parsing every line (header included) with `sep` yields the same number
+// of fields. separator_ is restored and the parser is Reset() before returning.
+bool CSVParser::CheckSeparator(char sep) {
+    // Calculate statistics including the header row
+    bool const has_header_copy = has_header_;
+    has_header_ = false;
+    Reset();
+    has_header_ = has_header_copy;
+
+    char const separator_copy = separator_;
+    separator_ = sep;
+
+    bool consistent = true;
+    if (has_next_) {
+        // Keep the container's own size type instead of narrowing it to unsigned
+        auto const column_count = GetNextRow().size();
+        while (consistent && has_next_) {
+            consistent = GetNextRow().size() == column_count;
+        }
+    }
+
+    // Single exit path: position and separator are restored exactly once
+    Reset();
+    separator_ = separator_copy;
+    return consistent;
+}
+
+// Checks the configured separator_ against the one deduced from the data and logs a warning
+// when it looks wrong. Returns separator_ if it is accepted, the deduced separator if that
+// one fits instead, or std::nullopt when neither splits the table consistently.
+std::optional<char> CSVParser::ValidateSeparator() {
+    std::optional<char> const possible_separator = DeduceSeparator();
+
+    if (CheckSeparator(separator_)) {
+        // A consistent split is only suspicious when it yields a single column while a
+        // different, deduced separator also splits every line consistently.
+        if (!possible_separator.has_value() || separator_ == *possible_separator ||
+            GetNumberOfColumns() != 1 || !CheckSeparator(*possible_separator)) {
+            return separator_;
+        }
+
+        LOG(WARNING) << "Inserted separator for the table " << relation_name_
+                     << " seems to be wrong";
+        LOG(WARNING) << "Possible separator for the table is: '" << *possible_separator << "'";
+        return possible_separator;
+    }
+
+    LOG(WARNING) << "Inserted separator for the table " << relation_name_ << " seems to be wrong";
+    if (possible_separator.has_value() && CheckSeparator(*possible_separator)) {
+        LOG(WARNING) << "Possible separator for the table is: '" << *possible_separator << "'";
+        return possible_separator;
+    }
+
+    return std::nullopt;
+}
diff --git a/src/core/parser/csv_parser/csv_parser.h b/src/core/parser/csv_parser/csv_parser.h index 7b5a20d11..16a2cde70 100644 --- a/src/core/parser/csv_parser/csv_parser.h +++ b/src/core/parser/csv_parser/csv_parser.h @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -36,6 +37,8 @@ class CSVParser : public model::IDatasetStream { std::vector ParseString(std::string const& s) const; void GetNextIfHas(); void SkipLine(); + std::optional DeduceSeparator(); + bool CheckSeparator(char sep); inline static std::string& rtrim(std::string& s); @@ -49,6 +52,8 @@ class CSVParser : public model::IDatasetStream { std::string GetUnparsedLine(unsigned long long const line_index); std::vector ParseLine(unsigned long long const line_index); + std::optional ValidateSeparator(); + bool HasNextRow() const override { return has_next_; }
diff --git a/src/core/util/separator_validator.cpp b/src/core/util/separator_validator.cpp
new file mode 100644
index 000000000..0b605abaf
--- /dev/null
+++ b/src/core/util/separator_validator.cpp
@@ -0,0 +1,12 @@
+#include "separator_validator.h"
+
+#include <memory>
+
+namespace util {
+// Builds a CSVParser over `path` using `separator` and returns its ValidateSeparator() result.
+std::optional<char> ValidateSeparator(std::filesystem::path const& path, char separator) {
+    auto parser = std::make_unique<CSVParser>(path, separator, false);
+    return parser->ValidateSeparator();
+}
+
+}  // namespace util
diff --git a/src/core/util/separator_validator.h b/src/core/util/separator_validator.h new file
mode 100644 index 000000000..cd7b2dbd4 --- /dev/null +++ b/src/core/util/separator_validator.h @@ -0,0 +1,12 @@ +#pragma once + +#include +#include + +#include "parser/csv_parser/csv_parser.h" + +namespace util { + +std::optional ValidateSeparator(std::filesystem::path const& path, char separator); + +} // namespace util diff --git a/src/python_bindings/py_util/py_to_any.cpp b/src/python_bindings/py_util/py_to_any.cpp index f1d683f9f..3a076bc89 100644 --- a/src/python_bindings/py_util/py_to_any.cpp +++ b/src/python_bindings/py_util/py_to_any.cpp @@ -36,10 +36,12 @@ config::InputTable CreateCsvParser(std::string_view option_name, py::tuple const throw config::ConfigurationError("Cannot create a CSV parser from passed tuple."); } - return std::make_shared( + auto csv_parser = std::make_shared( CastAndReplaceCastError(option_name, arguments[0]), CastAndReplaceCastError(option_name, arguments[1]), CastAndReplaceCastError(option_name, arguments[2])); + csv_parser->ValidateSeparator(); + return csv_parser; } template