Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix double counting of selected columns in CSV reader #8508

Merged
merged 8 commits into from
Jul 6, 2021
12 changes: 9 additions & 3 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include <numeric>
#include <tuple>
#include <unordered_map>
#include <unordered_set>

using std::string;
using std::vector;
Expand Down Expand Up @@ -336,13 +337,18 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
for (const auto index : opts_.get_use_cols_indexes()) {
column_flags_[index] = column_parse::enabled;
}
num_active_cols_ = opts_.get_use_cols_indexes().size();
num_active_cols_ = std::unordered_set<int>(opts_.get_use_cols_indexes().begin(),
opts_.get_use_cols_indexes().end())
.size();

for (const auto &name : opts_.get_use_cols_names()) {
const auto it = std::find(col_names_.begin(), col_names_.end(), name);
if (it != col_names_.end()) {
column_flags_[it - col_names_.begin()] = column_parse::enabled;
num_active_cols_++;
auto curr_it = it - col_names_.begin();
if (column_flags_[curr_it] == column_parse::disabled) {
column_flags_[curr_it] = column_parse::enabled;
num_active_cols_++;
}
}
}
}
Expand Down
36 changes: 36 additions & 0 deletions cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,42 @@ TEST_F(CsvReaderTest, MultiColumn)
expect_column_data_equal(float64_values, view.column(14));
}

// Reads a four-column CSV while selecting columns both by index and by name,
// with duplicates in each selection list. The selections together cover
// columns A, B and D (C is never requested), so the resulting table is
// expected to hold exactly three columns, each appearing once.
TEST_F(CsvReaderTest, RepeatColumn)
{
  constexpr auto num_rows = 10;
  auto col_a              = random_values<int16_t>(num_rows);
  auto col_b              = random_values<int64_t>(num_rows);
  auto col_c              = random_values<uint64_t>(num_rows);
  auto col_d              = random_values<float>(num_rows);

  // Write the generated values as a header-less, comma-separated file.
  auto filepath = temp_env->get_temp_dir() + "RepeatColumn.csv";
  {
    std::ostringstream csv_rows;
    for (auto row = 0; row < num_rows; ++row) {
      csv_rows << col_a[row] << "," << col_b[row] << "," << col_c[row] << "," << col_d[row]
               << "\n";
    }
    std::ofstream csv_file(filepath, std::ios_base::out);
    csv_file << csv_rows.str();
  }

  // Index selection repeats column 0; name selection repeats "B", which is
  // also already selected through index 1. Column "C" is in neither list.
  const std::vector<int> selected_indexes{1, 0, 0};
  const std::vector<std::string> selected_names{"D", "B", "B"};

  cudf_io::csv_reader_options read_opts =
    cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath})
      .dtypes(std::vector<std::string>{"int16", "int64", "uint64", "float"})
      .names({"A", "B", "C", "D"})
      .use_cols_indexes(selected_indexes)
      .use_cols_names(selected_names)
      .header(-1);
  auto parsed = cudf_io::read_csv(read_opts);

  // Duplicated selections must not inflate the column count, and the
  // surviving columns are checked against A, B and D in that order.
  const auto tbl_view = parsed.tbl->view();
  EXPECT_EQ(3, tbl_view.num_columns());
  expect_column_data_equal(col_a, tbl_view.column(0));
  expect_column_data_equal(col_b, tbl_view.column(1));
  expect_column_data_equal(col_d, tbl_view.column(2));
}

TEST_F(CsvReaderTest, Booleans)
{
auto filepath = temp_env->get_temp_dir() + "Booleans.csv";
Expand Down