From eae4478dbf37559daab7ce036699fc0f9f424498 Mon Sep 17 00:00:00 2001 From: ochan1 Date: Sat, 12 Jun 2021 15:37:43 -0700 Subject: [PATCH 1/6] Fix column select bug with index and column_name "num_active_cols_" will double count when a particular column is selected twice by the user in an index + column_name situation and when the column_name is listed twice --- cpp/src/io/csv/reader_impl.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 325cb3356d9..685335ed3b5 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -317,18 +317,20 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) // User can specify which columns should be parsed if (!opts_.get_use_cols_indexes().empty() || !opts_.get_use_cols_names().empty()) { - std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled); + std::fill(h_column_flags_.begin(), h_column_flags_.end(), column_parse::disabled); for (const auto index : opts_.get_use_cols_indexes()) { - column_flags_[index] = column_parse::enabled; + h_column_flags_[index] = column_parse::enabled; } num_active_cols_ = opts_.get_use_cols_indexes().size(); for (const auto &name : opts_.get_use_cols_names()) { const auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] = column_parse::enabled; - num_active_cols_++; + if (h_column_flags_[it - col_names_.begin()] == column_parse::disabled) { + h_column_flags_[it - col_names_.begin()] = column_parse::enabled; + num_active_cols_++; + } } } } From dc2eded529bf8b450be05485ac0c4b02f7f46770 Mon Sep 17 00:00:00 2001 From: Oscar Chan Date: Sat, 12 Jun 2021 22:54:02 +0000 Subject: [PATCH 2/6] Fix column_flags_ to be correct variable name --- cpp/src/io/csv/reader_impl.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 685335ed3b5..b5866f68a78 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -317,18 +317,18 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) // User can specify which columns should be parsed if (!opts_.get_use_cols_indexes().empty() || !opts_.get_use_cols_names().empty()) { - std::fill(h_column_flags_.begin(), h_column_flags_.end(), column_parse::disabled); + std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled); for (const auto index : opts_.get_use_cols_indexes()) { - h_column_flags_[index] = column_parse::enabled; + column_flags_[index] = column_parse::enabled; } num_active_cols_ = opts_.get_use_cols_indexes().size(); for (const auto &name : opts_.get_use_cols_names()) { const auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { - if (h_column_flags_[it - col_names_.begin()] == column_parse::disabled) { - h_column_flags_[it - col_names_.begin()] = column_parse::enabled; + if (column_flags_[it - col_names_.begin()] == column_parse::disabled) { + column_flags_[it - col_names_.begin()] = column_parse::enabled; num_active_cols_++; } } From ec2492940680a0edcc1b2767819b9a49e4193c39 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 2 Jul 2021 16:10:21 +0530 Subject: [PATCH 3/6] fix double counting of columns in select indices --- cpp/src/io/csv/reader_impl.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 498e63cc863..182eaf44cc4 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -40,6 +40,7 @@ #include #include #include +#include using std::string; using std::vector; @@ -336,7 +337,9 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) for (const auto index : opts_.get_use_cols_indexes()) { column_flags_[index] = column_parse::enabled; } - num_active_cols_ = opts_.get_use_cols_indexes().size(); + num_active_cols_ = std::unordered_set(opts_.get_use_cols_indexes().begin(), + opts_.get_use_cols_indexes().end()) + .size(); for (const auto &name : opts_.get_use_cols_names()) { const auto it = std::find(col_names_.begin(), col_names_.end(), name); From d3a70b96944b8a68c4094d947e78f00d217cff77 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 2 Jul 2021 16:14:39 +0530 Subject: [PATCH 4/6] unit test for duplicates in select indices, select column names --- cpp/tests/io/csv_test.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 8996dd95e06..dec7e136446 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -527,6 +527,42 @@ TEST_F(CsvReaderTest, MultiColumn) expect_column_data_equal(float64_values, view.column(14)); } +TEST_F(CsvReaderTest, RepeatColumn) +{ + constexpr auto num_rows = 10; + auto int16_values = random_values(num_rows); + auto int64_values = random_values(num_rows); + auto uint64_values = random_values(num_rows); + auto float32_values = random_values(num_rows); + + auto filepath = temp_env->get_temp_dir() + "RepeatColumn.csv"; + { + std::ostringstream line; + for (int i = 0; i < num_rows; ++i) { + line << int16_values[i] << "," << int64_values[i] << "," << uint64_values[i] << "," + << float32_values[i] << "\n"; + } + std::ofstream outfile(filepath, std::ofstream::out); + outfile << line.str(); + } + + // repeats column in indexes, and names, misses 1 column. + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .dtypes(std::vector{"int16", "int64", "uint64", "float"}) + .names({"A", "B", "C", "D"}) + .use_cols_indexes({1, 0, 0}) + .use_cols_names({"D", "B", "B"}) + .header(-1); + auto result = cudf_io::read_csv(in_opts); + + const auto view = result.tbl->view(); + EXPECT_EQ(3, view.num_columns()); + expect_column_data_equal(int16_values, view.column(0)); + expect_column_data_equal(int64_values, view.column(1)); + expect_column_data_equal(float32_values, view.column(2)); +} + TEST_F(CsvReaderTest, Booleans) { auto filepath = temp_env->get_temp_dir() + "Booleans.csv"; From e50228e4a4f597f80865af4a1abf490f9abc0b4a Mon Sep 17 00:00:00 2001 From: Oscar Chan Date: Fri, 2 Jul 2021 14:30:19 +0000 Subject: [PATCH 5/6] Add variable for iterator difference --- cpp/src/io/csv/reader_impl.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index b5866f68a78..6106185d82c 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -327,8 +327,9 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) for (const auto &name : opts_.get_use_cols_names()) { const auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { - if (column_flags_[it - col_names_.begin()] == column_parse::disabled) { - column_flags_[it - col_names_.begin()] = column_parse::enabled; + auto curr_it = it - col_names_.begin(); + if (column_flags_[curr_it] == column_parse::disabled) { + column_flags_[curr_it] = column_parse::enabled; num_active_cols_++; } } From 85c81602fb50fd9c40e043c4cb0f51e711e41d35 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 5 Jul 2021 11:57:09 +0530 Subject: [PATCH 6/6] comment --- cpp/tests/io/csv_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index dec7e136446..9c3a9a1b015 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -546,7 +546,7 @@ TEST_F(CsvReaderTest, RepeatColumn) outfile << line.str(); } - // repeats column in indexes, and names, misses 1 column. + // repeats column in indexes and names, misses 1 column. cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .dtypes(std::vector{"int16", "int64", "uint64", "float"})