Skip to content

Commit

Permalink
Add separator validation
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelS239 committed Mar 3, 2024
1 parent cf77139 commit a44e7ea
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 2 deletions.
5 changes: 4 additions & 1 deletion src/core/algorithms/algo_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "algorithms/pipelines/typo_miner/typo_miner.h"
#include "config/names.h"
#include "tabular_data/input_tables_type.h"
#include "util/separator_validator.h"

namespace algos {

Expand Down Expand Up @@ -47,7 +48,9 @@ void LoadAlgorithm(Algorithm& algorithm, StdParamsMap const& options) {
ConfigureFromFunction(algorithm, [&options](std::string_view option_name) {
using namespace config::names;
auto create_input_table = [](CSVConfig const& csv_config) -> config::InputTable {
return std::make_shared<CSVParser>(csv_config);
auto csv_parser = std::make_shared<CSVParser>(csv_config);
csv_parser->ValidateSeparator();
return csv_parser;
};

if (option_name == kTable && options.find(std::string{kTable}) == options.end()) {
Expand Down
106 changes: 106 additions & 0 deletions src/core/parser/csv_parser/csv_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
#include <filesystem>
#include <fstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <boost/algorithm/string.hpp>
#include <boost/tokenizer.hpp>
#include <easylogging++.h>

inline std::string& CSVParser::rtrim(std::string& s) {
boost::trim_right(s);
Expand Down Expand Up @@ -180,3 +182,107 @@ std::vector<std::string> CSVParser::GetNextRow() {

return result;
}

/// Guesses the separator by counting characters in the raw text of each line.
///
/// A character stays a candidate only while it occurs the same number of times
/// in every line that is read. Among the surviving candidates the one with the
/// highest per-line count wins; when several candidates share that count, the
/// winner depends on the iteration order of std::unordered_map.
///
/// Reset() is called before returning, so the read position used for
/// the scan is not left behind.
///
/// @return the guessed separator, or std::nullopt when no character has a
///         consistent non-zero count (this includes input with no lines).
std::optional<char> CSVParser::DeduceSeparator() {
    // Calculate statistics including the header row: has_header_ is cleared
    // only for the duration of Reset() and restored right afterwards.
    bool const has_header_copy = has_header_;
    has_header_ = false;
    Reset();
    has_header_ = has_header_copy;

    // Per-character occurrence counts of the first line; an entry is set to 0
    // as soon as a later line disagrees with it.
    std::unordered_map<char, unsigned> letter_count;
    if (has_next_) {
        for (char const c : next_line_) {
            ++letter_count[c];
        }
    }

    std::unordered_map<char, unsigned> next_letter_count;
    while (has_next_) {
        GetNextIfHas();
        if (!has_next_) {
            // has_next_ is the flag this function already uses to decide whether
            // next_line_ may be read; once it drops, the buffer must not be
            // compared against the candidates, otherwise a stale or empty
            // buffer could discard every one of them.
            break;
        }

        next_letter_count.clear();
        for (char const c : next_line_) {
            ++next_letter_count[c];
        }

        for (auto& [letter, count] : letter_count) {
            // find() instead of operator[] so that the lookup does not insert
            // an entry for every candidate on every line.
            auto const found = next_letter_count.find(letter);
            unsigned const count_in_line = found == next_letter_count.end() ? 0U : found->second;
            if (count != count_in_line) {
                count = 0;
            }
        }
    }

    // Stays empty unless some candidate has a non-zero count.
    std::optional<char> possible_separator;
    unsigned max_separator_count = 0;

    for (auto const& [letter, count] : letter_count) {
        if (count > max_separator_count) {
            max_separator_count = count;
            possible_separator = letter;
        }
    }
    Reset();

    return possible_separator;
}

bool CSVParser::CheckSeparator(char sep) {
// Calculate statistics including the header row
bool has_header_copy = has_header_;
has_header_ = false;
Reset();
has_header_ = has_header_copy;

char separator_copy = separator_;
separator_ = sep;

unsigned sep_count = 0;
std::vector<std::string> next_parsed;
if (has_next_) {
next_parsed = GetNextRow();
sep_count = next_parsed.size();
}

while (has_next_) {
next_parsed = GetNextRow();
if (sep_count != next_parsed.size()) {
Reset();
separator_ = separator_copy;
return false;
}
}

Reset();
separator_ = separator_copy;

return true;
}

/// Compares the configured separator_ with the one found by DeduceSeparator().
///
/// Outcomes:
///  - separator_ is returned without any log output when it passes
///    CheckSeparator() and the deduced character is absent, equal to it,
///    fails CheckSeparator(), or GetNumberOfColumns() is not 1;
///  - otherwise a warning is logged, and the deduced character is returned
///    (with a second warning naming it) if it passes CheckSeparator();
///  - std::nullopt is returned when neither separator passes.
///
/// @return the separator considered valid, or std::nullopt if there is none.
std::optional<char> CSVParser::ValidateSeparator() {
    std::optional<char> const deduced = DeduceSeparator();
    bool const configured_passes = CheckSeparator(separator_);

    if (configured_passes) {
        // The deduced character replaces a passing separator_ only when all of
        // these hold; evaluation stops at the first one that does not.
        bool const deduced_preferred = deduced.has_value() && separator_ != *deduced &&
                                       GetNumberOfColumns() == 1 && CheckSeparator(*deduced);
        if (!deduced_preferred) {
            return separator_;
        }
    }

    LOG(WARNING) << "Inserted separator for the table " << relation_name_ << " seems to be wrong";

    // When separator_ passed, the deduced character has already been checked
    // above; otherwise it still has to be checked here.
    bool const deduced_usable =
            configured_passes || (deduced.has_value() && CheckSeparator(*deduced));
    if (!deduced_usable) {
        return std::nullopt;
    }

    LOG(WARNING) << "Possible separator for the table is: \'" << *deduced << "\'";
    return deduced;
}
5 changes: 5 additions & 0 deletions src/core/parser/csv_parser/csv_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <filesystem>
#include <fstream>
#include <optional>
#include <string>
#include <vector>

Expand Down Expand Up @@ -36,6 +37,8 @@ class CSVParser : public model::IDatasetStream {
std::vector<std::string> ParseString(std::string const& s) const;
void GetNextIfHas();
void SkipLine();
std::optional<char> DeduceSeparator();
bool CheckSeparator(char sep);

inline static std::string& rtrim(std::string& s);

Expand All @@ -49,6 +52,8 @@ class CSVParser : public model::IDatasetStream {
std::string GetUnparsedLine(unsigned long long const line_index);
std::vector<std::string> ParseLine(unsigned long long const line_index);

std::optional<char> ValidateSeparator();

bool HasNextRow() const override {
return has_next_;
}
Expand Down
12 changes: 12 additions & 0 deletions src/core/util/separator_validator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#include "separator_validator.h"

#include <easylogging++.h>

namespace util {

std::optional<char> ValidateSeparator(std::filesystem::path const& path, char separator) {
auto parser = std::make_unique<CSVParser>(path, separator, false);
return parser->ValidateSeparator();
}

} // namespace util
12 changes: 12 additions & 0 deletions src/core/util/separator_validator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once

#include <filesystem>
#include <optional>

#include "parser/csv_parser/csv_parser.h"

namespace util {

std::optional<char> ValidateSeparator(std::filesystem::path const& path, char separator);

} // namespace util
4 changes: 3 additions & 1 deletion src/python_bindings/py_util/py_to_any.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,12 @@ config::InputTable CreateCsvParser(std::string_view option_name, py::tuple const
throw config::ConfigurationError("Cannot create a CSV parser from passed tuple.");
}

return std::make_shared<CSVParser>(
auto csv_parser = std::make_shared<CSVParser>(
CastAndReplaceCastError<std::string>(option_name, arguments[0]),
CastAndReplaceCastError<char>(option_name, arguments[1]),
CastAndReplaceCastError<bool>(option_name, arguments[2]));
csv_parser->ValidateSeparator();
return csv_parser;
}

template <typename Type>
Expand Down

0 comments on commit a44e7ea

Please sign in to comment.