From f3a6c44423a4235a9e57f32e79c41b87aea806b9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 12 Sep 2019 10:29:06 -0400 Subject: [PATCH 01/54] cudf strings column classes --- cpp/CMakeLists.txt | 4 +- .../cudf/column/column_device_view.cuh | 4 +- cpp/include/cudf/strings/string_view.cuh | 323 +++++++++++ cpp/include/cudf/strings/string_view.inl | 544 ++++++++++++++++++ .../cudf/strings/strings_column_factories.hpp | 48 ++ .../cudf/strings/strings_column_handler.hpp | 65 +++ .../{string => strings}/nvcategory_util.cpp | 0 cpp/src/strings/strings_column_factories.cu | 124 ++++ cpp/src/strings/strings_column_handler.cu | 168 ++++++ 9 files changed, 1278 insertions(+), 2 deletions(-) create mode 100644 cpp/include/cudf/strings/string_view.cuh create mode 100644 cpp/include/cudf/strings/string_view.inl create mode 100644 cpp/include/cudf/strings/strings_column_factories.hpp create mode 100644 cpp/include/cudf/strings/strings_column_handler.hpp rename cpp/src/{string => strings}/nvcategory_util.cpp (100%) create mode 100644 cpp/src/strings/strings_column_factories.cu create mode 100644 cpp/src/strings/strings_column_handler.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6b659140376..b3347cb83e7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -324,7 +324,7 @@ add_library(cudf src/column/legacy/column.cpp src/column/legacy/context.cpp src/table/legacy/table.cpp - src/string/nvcategory_util.cpp + src/strings/nvcategory_util.cpp src/join/joining.cu src/orderby/orderby.cu src/predicates/is_sorted.cu @@ -425,6 +425,8 @@ add_library(cudf src/table/table.cpp src/bitmask/null_mask.cpp src/sort/sort.cu + src/strings/strings_column_factories.cu + src/strings/strings_column_handler.cu src/column/legacy/interop.cpp) # Rename installation to proper names for later finding diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index b66ccacfcd4..1322c6bceb1 100644 --- 
a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -226,7 +226,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { *`source_view` available in device memory. *---------------------------------------------------------------------------**/ static auto create(column_view source_view, cudaStream_t stream = 0); - + /**---------------------------------------------------------------------------* * @brief Returns the specified child * @@ -244,6 +244,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { ///< may contain additional data size_type _num_children{}; ///< The number of child columns +public: /**---------------------------------------------------------------------------* * @brief Construct's a `column_device_view` from a `column_view` populating * all but the children. @@ -254,6 +255,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { *---------------------------------------------------------------------------**/ column_device_view(column_view source); +protected: /**---------------------------------------------------------------------------* * @brief Destroy the `device_column_view` object. * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh new file mode 100644 index 00000000000..b6bb6ba8d81 --- /dev/null +++ b/cpp/include/cudf/strings/string_view.cuh @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+
+namespace cudf
+{
+
+// A single UTF-8 character is 1-4 bytes; all of its bytes are packed into one unsigned int
+typedef unsigned int Char;
+
+/**---------------------------------------------------------------------------*
+ * @brief A non-owning, immutable view of device data that is a variable length
+ * character array representing a UTF-8 string. The caller must maintain the
+ * device memory for the lifetime of this instance.
+ *
+ * It provides a simple wrapper and string operations for an individual char array
+ * within a strings column. This is likely created dynamically and temporarily.
+ * It is not recommended to be allocated directly on the global memory heap.
+ *---------------------------------------------------------------------------**/
+class string_view
+{
+ public:
+  string_view() = default;
+  /**---------------------------------------------------------------------------*
+   * @brief Create instance from existing device char array.
+   *
+   * @param data Device char array encoded in UTF8.
+   * @param bytes Number of bytes in data array.
+   *---------------------------------------------------------------------------**/
+  __device__ string_view(const char* data, size_type bytes);
+  /**---------------------------------------------------------------------------*
+   * @brief Create instance from existing device char array. The array must
+   * include a null-terminator ('\0'); the byte count is found by scanning for it.
+   *
+   * @param data Device char array encoded in UTF8.
+   *---------------------------------------------------------------------------**/
+  __device__ string_view(const char* data);
+  string_view(const string_view&) = default;
+  string_view(string_view&&) = default;
+  ~string_view() = default;
+  string_view& operator=(const string_view&) = default;
+  string_view& operator=(string_view&&) = default;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Return the number of bytes in this string
+   *---------------------------------------------------------------------------**/
+  __device__ size_type size() const;
+  /**---------------------------------------------------------------------------*
+   * @brief Return the number of bytes in this string (same value as size())
+   *---------------------------------------------------------------------------**/
+  __device__ size_type length() const;
+  /**---------------------------------------------------------------------------*
+   * @brief Return the number of characters (UTF-8) in this string
+   *---------------------------------------------------------------------------**/
+  __device__ size_type characters() const;
+  /**---------------------------------------------------------------------------*
+   * @brief Return a pointer to the internal device array
+   *---------------------------------------------------------------------------**/
+  __device__ const char* data() const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Return true if string has no characters (its byte count is zero).
+   * The is_null() member declared next instead reports whether the data
+   * pointer itself is nullptr.
+   *---------------------------------------------------------------------------**/
+  __device__ bool empty() const;
+  __device__ bool is_null() const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Handy iterator for navigating through encoded characters.
+   * It tracks both the character position and the matching byte offset, and
+   * dereferencing it yields the encoded character as a Char value.
+   *---------------------------------------------------------------------------**/
+  class iterator
+  {
+    public:
+      __device__ iterator(const string_view& str, size_type pos);
+      iterator(const iterator& mit) = default;
+      iterator(iterator&& mit) = default;
+      __device__ iterator& operator++();
+      __device__ iterator operator++(int);
+      __device__ bool operator==(const iterator& rhs) const;
+      __device__ bool operator!=(const iterator& rhs) const;
+      __device__ Char operator*() const;
+      __device__ size_type position() const;
+      __device__ size_type byte_offset() const;
+    private:
+      const char* p{};
+      size_type cpos{}, offset{};
+  };
+
+  /**---------------------------------------------------------------------------*
+   * @brief Return new iterator pointing to the beginning of this string
+   *---------------------------------------------------------------------------**/
+  __device__ iterator begin() const;
+  /**---------------------------------------------------------------------------*
+   * @brief Return new iterator pointing past the end of this string
+   *---------------------------------------------------------------------------**/
+  __device__ iterator end() const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Return single UTF-8 character at the given character position.
+   * operator[] is equivalent to at().
+   *
+   * @param pos Character position
+   * @return The encoded character, or 0 if pos is past the end of the string.
+   *---------------------------------------------------------------------------**/
+  __device__ Char at(size_type pos) const;
+  __device__ Char operator[](size_type pos) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Return the byte offset from data() for a given character position
+   *
+   * @param pos Character position
+   *---------------------------------------------------------------------------**/
+  __device__ size_type byte_offset_for(size_type pos) const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Compares the target string with this string. The strings are compared
+   * byte by byte using unsigned byte values.
+   *
+   * @param str Target string to compare with this string.
+   * @return 0 If they compare equal.
+   *         <0 Either the first byte of this string that does not match is
+   *         lower than the corresponding byte of the arg string, or all compared
+   *         bytes match but this string is shorter.
+   *         >0 Either the first byte of this string that does not match is
+   *         greater than the corresponding byte of the arg string, or all compared
+   *         bytes match but this string is longer.
+   *---------------------------------------------------------------------------**/
+  __device__ int compare(const string_view& str) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Compares the target char array with this string. The strings are
+   * compared byte by byte using unsigned byte values.
+   *
+   * @param data Target char array to compare with this string.
+   * @param bytes Number of bytes in data.
+   * @return 0 If they compare equal.
+   *         <0 Either the first byte of this string that does not match is
+   *         lower than the corresponding byte of the arg array, or all compared
+   *         bytes match but this string is shorter.
+   *         >0 Either the first byte of this string that does not match is
+   *         greater than the corresponding byte of the arg array, or all compared
+   *         bytes match but this string is longer.
+   *---------------------------------------------------------------------------**/
+  __device__ int compare(const char* data, size_type bytes) const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Returns true if arg string matches this string exactly.
+   *---------------------------------------------------------------------------**/
+  __device__ bool operator==(const string_view& rhs) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Returns true if arg string does not match this string.
+   *---------------------------------------------------------------------------**/
+  __device__ bool operator!=(const string_view& rhs) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Returns true if this string sorts before the arg string.
+   *---------------------------------------------------------------------------**/
+  __device__ bool operator<(const string_view& rhs) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Returns true if this string sorts after the arg string.
+   *---------------------------------------------------------------------------**/
+  __device__ bool operator>(const string_view& rhs) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Returns true if this string sorts before or matches the arg string.
+   *---------------------------------------------------------------------------**/
+  __device__ bool operator<=(const string_view& rhs) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Returns true if this string sorts after or matches the arg string.
+   *---------------------------------------------------------------------------**/
+  __device__ bool operator>=(const string_view& rhs) const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Returns first character position if arg string is contained in this string.
+   *
+   * @param str Target string to search for within this string.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type find( const string_view& str, size_type pos=0, size_type count=-1 ) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Returns first character position if arg array is contained in this string.
+   *
+   * @param str Target char array to search for within this string.
+   * @param bytes Number of bytes in str.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg array is not found in this string.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type find( const char* str, size_type bytes, size_type pos=0, size_type count=-1 ) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Returns first character position if arg character is contained in this string.
+   *
+   * @param chr Single encoded character.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg character is not found in this string.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type find( Char chr, size_type pos=0, size_type count=-1 ) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Same as find() but searches from the end of this string.
+   *
+   * @param str Target string to search for within this string.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg string is not found in this string.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type rfind( const string_view& str, size_type pos=0, size_type count=-1 ) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Same as find() but searches from the end of this string.
+   *
+   * @param str Target char array to search for within this string.
+   * @param bytes Number of bytes in str.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg array is not found in this string.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type rfind( const char* str, size_type bytes, size_type pos=0, size_type count=-1 ) const;
+  /**---------------------------------------------------------------------------*
+   * @brief Same as find() but searches from the end of this string.
+   *
+   * @param chr Single encoded character.
+   * @param pos Character position to start search within this string.
+   * @param count Number of characters from pos to include in the search.
+   *              Specify -1 to indicate to the end of the string.
+   * @return -1 if arg character is not found in this string.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type rfind( Char chr, size_type pos=0, size_type count=-1 ) const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Return a sub-string of this string. The original string and its device
+   * memory must still be maintained for the lifetime of the returned instance.
+   *
+   * @param start Character position to start the sub-string.
+   * @param length Number of characters from start to include in the sub-string.
+   * @return New instance pointing to a subset of the characters within this instance.
+   *---------------------------------------------------------------------------**/
+  __device__ string_view substr( size_type start, size_type length ) const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Tokenizes this string around the given delimiter up to count times.
+   *
+   * @param delim Null-terminated UTF-8 string to use for separating tokens.
+   * @param count Maximum number of tokens to return.
+   *              Specify -1 to indicate all tokens.
+   * @param[out] strs Array to hold output tokens.
+   *              Specify nullptr here to return just the token count.
+   * @return Number of tokens.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type split( const char* delim, size_type count, string_view* strs ) const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Same as split() but starts tokenizing from the end of the string.
+   *
+   * @param delim Null-terminated UTF-8 string to use for separating tokens.
+   * @param count Maximum number of tokens to return.
+   *              Specify -1 to indicate all tokens.
+   * @param[out] strs Array to hold output tokens.
+   *              Specify nullptr here to return just the token count.
+   * @return Number of tokens.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type rsplit( const char* delim, size_type count, string_view* strs ) const;
+
+  /**---------------------------------------------------------------------------*
+   * @brief Returns the number of bytes in the specified character.
+   *---------------------------------------------------------------------------**/
+  __host__ __device__ static size_type bytes_in_char( Char chr );
+  /**---------------------------------------------------------------------------*
+   * @brief Convert a char array into a Char value.
+   *
+   * @param str String containing encoded char bytes.
+   * @param[out] chr Single Char value.
+   * @return The number of bytes in the character
+   *---------------------------------------------------------------------------**/
+  __host__ __device__ static size_type char_to_Char( const char* str, Char& chr );
+  /**---------------------------------------------------------------------------*
+   * @brief Place a Char value into a char array.
+   *
+   * @param chr Single character
+   * @param[out] str Allocated char array with enough space to hold the encoded character.
+   * @return The number of bytes in the character
+   *---------------------------------------------------------------------------**/
+  __host__ __device__ static size_type Char_to_char( Char chr, char* str );
+  /**---------------------------------------------------------------------------*
+   * @brief Return the number of characters in the provided char array.
+   *
+   * @param str String with encoded char bytes.
+   * @param bytes Number of bytes in str.
+   * @return The number of characters in the array.
+   *---------------------------------------------------------------------------**/
+  __host__ __device__ static size_type chars_in_string( const char* str, size_type bytes );
+
+private:
+  const char* _data{}; ///< Pointer to device memory containing the char array for this string
+  size_type _bytes{}; ///< Number of bytes in _data for this string
+
+  /**---------------------------------------------------------------------------*
+   * @brief Return the character position of the given byte offset.
+   *
+   * @param bytepos Byte position from start of _data.
+   * @return The character position for the specified byte.
+   *---------------------------------------------------------------------------**/
+  __device__ size_type char_offset(size_type bytepos) const;
+};
+
+}
+
+#include "./string_view.inl"
diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl
new file mode 100644
index 00000000000..56a64e5a330
--- /dev/null
+++ b/cpp/include/cudf/strings/string_view.inl
@@ -0,0 +1,544 @@
+/*
+*/
+
+#include
+
+namespace cudf
+{
+
+typedef unsigned char BYTE;
+
+/**---------------------------------------------------------------------------*
+ * @brief Returns the number of bytes in the character that starts with the
+ * provided byte. This could be 0 to 4 bytes. 0 is returned for intermediate
+ * bytes within a single character. For example, for the two-byte 0xC3A8 single
+ * character, the first byte would return 2 and the second byte would return 0.
+ *
+ * @param byte Byte from an encoded character.
+ * @return Number of bytes.
+ *---------------------------------------------------------------------------**/
+__host__ __device__ inline static size_type bytes_in_char_byte(BYTE byte)
+{
+    size_type count = 1;
+    // no if-statements means no divergence
+    count += (int)((byte & 0xF0) == 0xF0);
+    count += (int)((byte & 0xE0) == 0xE0);
+    count += (int)((byte & 0xC0) == 0xC0);
+    count -= (int)((byte & 0xC0) == 0x80);
+    return count;
+}
+
+
+/**---------------------------------------------------------------------------*
+ * @brief Returns the number of bytes used in the provided char array by
+ * searching for a null-terminator ('\0') byte.
+ *
+ * @param str Null-terminated array of chars.
+ * @return Number of bytes.
+ *---------------------------------------------------------------------------**/
+__device__ inline static size_type string_length( const char* str )
+{
+    size_type nbytes = 0;
+    // a null pointer is treated as a zero-length string
+    if( str != nullptr )
+    {
+        for( const char* cur = str; *cur != '\0'; ++cur )
+            ++nbytes;
+    }
+    return nbytes;
+}
+
+
+// wraps an existing device char array of known byte length
+__device__ inline string_view::string_view(const char* data, size_type bytes)
+    : _data(data), _bytes(bytes)
+{}
+
+// wraps a null-terminated device char array; the byte length is found by scanning
+__device__ inline string_view::string_view(const char* data)
+    : _data(data), _bytes(string_length(data))
+{}
+
+// size() and length() both report the byte count
+__device__ inline size_type string_view::size() const
+{
+    return _bytes;
+}
+
+__device__ inline size_type string_view::length() const
+{
+    return size();
+}
+
+// number of UTF-8 characters, which can be fewer than the number of bytes
+__device__ inline size_type string_view::characters() const
+{
+    return chars_in_string(data(), size());
+}
+
+__device__ inline const char* string_view::data() const
+{
+    return _data;
+}
+
+// true when the view holds zero bytes
+__device__ inline bool string_view::empty() const
+{
+    return size() == 0;
+}
+
+// true when the view does not point at any char array
+__device__ inline bool string_view::is_null() const
+{
+    return data() == nullptr;
+}
+
+// the custom iterator knows about UTF8 encoding:
+// it keeps the character position and the matching byte offset side by side
+__device__ inline string_view::iterator::iterator(const string_view& str, size_type pos)
+    : p(str.data()), cpos(pos), offset(str.byte_offset_for(pos))
+{}
+
+// prefix increment: step over all the bytes of the current character
+__device__ inline string_view::iterator& string_view::iterator::operator++()
+{
+    const BYTE lead = (BYTE)p[offset];
+    ++cpos;
+    offset += bytes_in_char_byte(lead);
+    return *this;
+}
+
+// the int parameter is an unused dummy that marks this overload as the postfix form (it++)
+__device__ inline string_view::iterator string_view::iterator::operator++(int) +{ + iterator tmp(*this); + operator++(); + return tmp; +} + +__device__ inline bool string_view::iterator::operator==(const string_view::iterator& rhs) const +{ + return (p == rhs.p) && (cpos == rhs.cpos); +} + +__device__ inline bool string_view::iterator::operator!=(const string_view::iterator& rhs) const +{ + return (p != rhs.p) || (cpos != rhs.cpos); +} + +// unsigned int can hold 1-4 bytes for the UTF8 char +__device__ inline Char string_view::iterator::operator*() const +{ + Char chr = 0; + char_to_Char(p + offset, chr); + return chr; +} + +__device__ inline size_type string_view::iterator::position() const +{ + return cpos; +} + +__device__ inline size_type string_view::iterator::byte_offset() const +{ + return offset; +} + +__device__ inline string_view::iterator string_view::begin() const +{ + return iterator(*this, 0); +} + +__device__ inline string_view::iterator string_view::end() const +{ + return iterator(*this, characters()); +} + +__device__ inline Char string_view::at(size_type pos) const +{ + unsigned int offset = byte_offset_for(pos); + if(offset >= _bytes) + return 0; + Char chr = 0; + char_to_Char(data() + offset, chr); + return chr; +} + +__device__ inline Char string_view::operator[](size_type pos) const +{ + return at(pos); +} + +__device__ inline size_type string_view::byte_offset_for(size_type pos) const +{ + size_type offset = 0; + const char* sptr = _data; + const char* eptr = sptr + _bytes; + while( (pos > 0) && (sptr < eptr) ) + { + size_type charbytes = bytes_in_char_byte((BYTE)*sptr++); + if( charbytes ) + --pos; + offset += charbytes; + } + return offset; +} + +__device__ inline int string_view::compare(const string_view& in) const +{ + return compare(in.data(), in.size()); +} + +__device__ inline int string_view::compare(const char* data, size_type bytes) const +{ + const unsigned char* ptr1 = reinterpret_cast(this->data()); + if(!ptr1) + return -1; + 
const unsigned char* ptr2 = reinterpret_cast(data); + if(!ptr2) + return 1; + size_type len1 = size(); + size_type idx = 0; + for(; (idx < len1) && (idx < bytes); ++idx) + { + if(*ptr1 != *ptr2) + return (int)*ptr1 - (int)*ptr2; + ++ptr1; + ++ptr2; + } + if(idx < len1) + return 1; + if(idx < bytes) + return -1; + return 0; +} + +__device__ inline bool string_view::operator==(const string_view& rhs) const +{ + return compare(rhs) == 0; +} + +__device__ inline bool string_view::operator!=(const string_view& rhs) const +{ + return compare(rhs) != 0; +} + +__device__ inline bool string_view::operator<(const string_view& rhs) const +{ + return compare(rhs) < 0; +} + +__device__ inline bool string_view::operator>(const string_view& rhs) const +{ + return compare(rhs) > 0; +} + +__device__ inline bool string_view::operator<=(const string_view& rhs) const +{ + int rc = compare(rhs); + return (rc == 0) || (rc < 0); +} + +__device__ inline bool string_view::operator>=(const string_view& rhs) const +{ + int rc = compare(rhs); + return (rc == 0) || (rc > 0); +} + +__device__ inline size_type string_view::find(const string_view& str, size_type pos, int count) const +{ + return find(str.data(), str.size(), pos, count); +} + +__device__ inline size_type string_view::find(const char* str, size_type bytes, size_type pos, int count) const +{ + const char* sptr = data(); + if(!str || !bytes) + return -1; + size_type nchars = characters(); + if(count < 0) + count = nchars; + size_type end = pos + count; + if(end < 0 || end > nchars) + end = nchars; + size_type spos = byte_offset_for(pos); + size_type epos = byte_offset_for(end); + + size_type len2 = bytes; + size_type len1 = (epos - spos) - len2 + 1; + + const char* ptr1 = sptr + spos; + const char* ptr2 = str; + for(size_type idx=0; idx < len1; ++idx) + { + bool match = true; + for( size_type jdx=0; match && (jdx < len2); ++jdx ) + match = (ptr1[jdx] == ptr2[jdx]); + if( match ) + return char_offset(idx+spos); + ptr1++; + } + return 
-1; +} + +// maybe get rid of this one +__device__ inline size_type string_view::find(Char chr, size_type pos, int count) const +{ + size_type sz = size(); + size_type nchars = characters(); + if(count < 0) + count = nchars; + size_type end = pos + count; + if(end < 0 || end > nchars) + end = nchars; + if(pos > end || chr == 0 || sz == 0) + return -1; + size_type spos = byte_offset_for(pos); + size_type epos = byte_offset_for(end); + // + size_type chsz = bytes_in_char(chr); + const char* sptr = data(); + const char* ptr = sptr + spos; + size_type len = (epos - spos) - chsz; + for(size_type idx = 0; idx <= len; ++idx) + { + Char ch = 0; + char_to_Char(ptr++, ch); + if(chr == ch) + return chars_in_string(sptr, idx + spos); + } + return -1; +} + +__device__ inline size_type string_view::rfind(const string_view& str, size_type pos, int count) const +{ + return rfind(str.data(), str.size(), pos, count); +} + +__device__ inline size_type string_view::rfind(const char* str, size_type bytes, size_type pos, int count) const +{ + const char* sptr = data(); + if(!str || !bytes) + return -1; + size_type sz = size(); + size_type nchars = characters(); + size_type end = pos + count; + if(end < 0 || end > nchars) + end = nchars; + size_type spos = byte_offset_for(pos); + size_type epos = byte_offset_for(end); + + size_type len2 = bytes; + size_type len1 = (epos - spos) - len2 + 1; + + const char* ptr1 = sptr + epos - len2; + const char* ptr2 = str; + for(int idx=0; idx < len1; ++idx) + { + bool match = true; + for(size_type jdx=0; match && (jdx < len2); ++jdx) + match = (ptr1[jdx] == ptr2[jdx]); + if(match) + return char_offset(epos - len2 - idx); + ptr1--; // go backwards + } + return -1; +} + +__device__ inline size_type string_view::rfind(Char chr, size_type pos, int count) const +{ + size_type sz = size(); + size_type nchars = characters(); + if(count < 0) + count = nchars; + size_type end = pos + count; + if(end < 0 || end > nchars) + end = nchars; + if(pos > end || chr == 
0 || sz == 0) + return -1; + size_type spos = byte_offset_for(pos); + size_type epos = byte_offset_for(end); + + size_type chsz = bytes_in_char(chr); + const char* sptr = data(); + const char* ptr = sptr + epos - 1; + size_type len = (epos - spos) - chsz; + for(size_type idx = 0; idx < len; ++idx) + { + Char ch = 0; + char_to_Char(ptr--, ch); + if(chr == ch) + return chars_in_string(sptr, epos - idx - 1); + } + return -1; +} + + +// parameters are character position values +__device__ inline string_view string_view::substr(size_type pos, size_type length) const +{ + size_type spos = byte_offset_for(pos); + size_type epos = byte_offset_for(pos + length); + if( epos > size() ) + epos = size(); + if(spos >= epos) + return string_view("",0); + length = epos - spos; // converts length to bytes + return string_view(data()+spos,length); +} + +__device__ inline size_type string_view::split(const char* delim, int count, string_view* strs) const +{ + const char* sptr = data(); + size_type sz = size(); + if(sz == 0) + { + if(strs && count) + strs[0] = *this; + return 1; + } + + size_type bytes = string_length(delim); + size_type delimCount = 0; + size_type pos = find(delim, bytes); + while(pos >= 0) + { + ++delimCount; + pos = find(delim, bytes, pos + bytes); + } + + size_type strsCount = delimCount + 1; + size_type rtn = strsCount; + if((count > 0) && (rtn > count)) + rtn = count; + if(!strs) + return rtn; + // + if(strsCount < count) + count = strsCount; + // + size_type dchars = (bytes ? 
chars_in_string(delim,bytes) : 1); + size_type nchars = characters(); + size_type spos = 0, sidx = 0; + size_type epos = find(delim, bytes); + while(epos >= 0) + { + if(sidx >= (count - 1)) // add this to the while clause + break; + strs[sidx++] = substr(spos, epos - spos); + spos = epos + dchars; + epos = find(delim, bytes, spos); + } + if((spos <= nchars) && (sidx < count)) + strs[sidx] = substr(spos, nchars - spos); + // + return rtn; +} + + +__device__ inline size_type string_view::rsplit(const char* delim, int count, string_view* strs) const +{ + const char* sptr = data(); + size_type sz = size(); + if(sz == 0) + { + if(strs && count) + strs[0] = *this; + return 1; + } + + size_type bytes = string_length(delim); + size_type delimCount = 0; + size_type pos = find(delim, bytes); + while(pos >= 0) + { + ++delimCount; + pos = find(delim, bytes, (unsigned int)pos + bytes); + } + + unsigned int strsCount = delimCount + 1; + unsigned int rtn = strsCount; + if((count > 0) && (rtn > count)) + rtn = count; + if(!strs) + return rtn; + // + if(strsCount < count) + count = strsCount; + // + unsigned int dchars = (bytes ? 
chars_in_string(delim,bytes) : 1); + int epos = (int)characters(); // end pos is not inclusive + int sidx = count - 1; // index for strs array + int spos = rfind(delim, bytes); + while(spos >= 0) + { + if(sidx <= 0) + break; + //int spos = pos + (int)bytes; + int len = epos - spos - dchars; + strs[sidx--] = substr((unsigned int)spos+dchars, (unsigned int)len); + epos = spos; + spos = rfind(delim, bytes, 0, (unsigned int)epos); + } + if(epos >= 0) + strs[0] = substr(0, epos); + // + return rtn; +} + + +__host__ __device__ inline size_type string_view::bytes_in_char(Char chr) +{ + size_type count = 1; + count += (int)((chr & (unsigned)0x0000FF00) > 0); + count += (int)((chr & (unsigned)0x00FF0000) > 0); + count += (int)((chr & (unsigned)0xFF000000) > 0); + return count; +} + +__host__ __device__ inline size_type string_view::char_to_Char(const char* pSrc, Char &chr) +{ + size_type chwidth = bytes_in_char_byte((BYTE)*pSrc); + chr = (Char)(*pSrc++) & 0xFF; + if(chwidth > 1) + { + chr = chr << 8; + chr |= ((Char)(*pSrc++) & 0xFF); // << 8; + if(chwidth > 2) + { + chr = chr << 8; + chr |= ((Char)(*pSrc++) & 0xFF); // << 16; + if(chwidth > 3) + { + chr = chr << 8; + chr |= ((Char)(*pSrc++) & 0xFF); // << 24; + } + } + } + return chwidth; +} + +__host__ __device__ inline size_type string_view::Char_to_char(Char chr, char* dst) +{ + size_type chwidth = bytes_in_char(chr); + for(size_type idx = 0; idx < chwidth; ++idx) + { + dst[chwidth - idx - 1] = (char)chr & 0xFF; + chr = chr >> 8; + } + return chwidth; +} + +// counts the number of characters in the given char array +__host__ __device__ inline size_type string_view::chars_in_string(const char* str, size_type bytes) +{ + if( (str==0) || (bytes==0) ) + return 0; + // + unsigned int nchars = 0; + for(size_type idx = 0; idx < bytes; ++idx) + nchars += (unsigned int)(((BYTE)str[idx] & 0xC0) != 0x80); + return (size_type)nchars; +} + +__device__ inline size_type string_view::char_offset(size_type bytepos) const +{ + return 
chars_in_string(data(), bytepos); +} + +} \ No newline at end of file diff --git a/cpp/include/cudf/strings/strings_column_factories.hpp b/cpp/include/cudf/strings/strings_column_factories.hpp new file mode 100644 index 00000000000..faf772b26d5 --- /dev/null +++ b/cpp/include/cudf/strings/strings_column_factories.hpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include "../column/column.hpp" + +namespace cudf { +/**---------------------------------------------------------------------------* + * @brief Construct strings column given an array of pointer/size pairs. + * Use the strings_column_handler class to perform strings operations on + * this type of column. + * + * @note `null_count()` and `null_bitmask` are derived from the pairs: a pair with + * a null pointer is a null entry; a non-null pointer with size 0 is an empty string, not null. + * + * @throws std::bad_alloc if device memory allocation fails + * @throws cudf::logic_error if pointers are invalid + * + * @param[in] strs The pointer/size pair arrays. + * Each non-null pointer must be a valid device memory address. + * The size must be the number of bytes. + * @param[in] count The number of elements in the strs array. 
+ * @param[in] stream Optional stream on which to issue all memory allocation and device + * kernels + * @param[in] mr Optional resource to use for device memory + * allocation of the column's `data` and `null_mask`. + *---------------------------------------------------------------------------**/ +std::unique_ptr make_strings_column( + std::pair* strs, size_type count, + cudaStream_t stream = 0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +} // namespace cudf diff --git a/cpp/include/cudf/strings/strings_column_handler.hpp b/cpp/include/cudf/strings/strings_column_handler.hpp new file mode 100644 index 00000000000..bb2b0bb3abb --- /dev/null +++ b/cpp/include/cudf/strings/strings_column_handler.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +namespace cudf { + +class strings_column_handler +{ + public: + ~strings_column_handler() = default; + + strings_column_handler( const column_view& strings_column ); + //strings_column_handler( const column_view&& strings_column ); + + size_type count() const; + + const char* chars_data() const; + const int32_t* offsets_data() const; + + size_type chars_column_size() const; + + const bitmask_type* null_mask() const; + size_type null_count() const; + + enum sort_type { + none=0, ///< no sorting + length=1, ///< sort by string length + name=2 ///< sort by characters code-points + }; + + // print strings to stdout + void print( size_type start=0, size_type end=-1, + size_type max_width=-1, const char* delimiter = "\n" ) const; + + // new strings column from subset of given strings column + std::unique_ptr sublist( size_type start, size_type end, size_type step ); + + // return sorted version of the given strings column + std::unique_ptr sort( sort_type stype, bool ascending=true, bool nullfirst=true ); + + // return sorted indexes only -- returns integer column + std::unique_ptr order( sort_type stype, bool ascending, bool nullfirst=true ); + +private: + const column_view _parent; +}; + +} diff --git a/cpp/src/string/nvcategory_util.cpp b/cpp/src/strings/nvcategory_util.cpp similarity index 100% rename from cpp/src/string/nvcategory_util.cpp rename to cpp/src/strings/nvcategory_util.cpp diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu new file mode 100644 index 00000000000..d8e2b3a31b9 --- /dev/null +++ b/cpp/src/strings/strings_column_factories.cu @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf { + +// Create a strings-type column. +// A strings-column has children columns to manage the variable-length +// encoded character array. +// Use the strings_column_handler class to perform strings operations +// on this type of column. +std::unique_ptr make_strings_column( + std::pair* strings, size_type count, cudaStream_t stream, + rmm::mr::device_memory_resource* mr) +{ + // maybe a separate factory for creating null strings-column + CUDF_EXPECTS(count > 0, "must have at least one pair"); + + auto execpol = rmm::exec_policy(stream); + auto strs = thrust::device_pointer_cast(reinterpret_cast*>(strings)); + auto d_strs = strs.get(); + + // build offsets column + auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::ALL_VALID, stream, mr ); + auto offsets_view = offsets_column->mutable_view(); + thrust::transform_inclusive_scan( execpol->on(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), + offsets_view.data(), + [d_strs] __device__ (size_type idx) { + thrust::pair item = d_strs[idx]; + return ( item.first ? 
(int32_t)item.second : 0 ); + }, + thrust::plus() ); + + // get number of bytes (last offset value) + auto offsets_data = thrust::device_pointer_cast(offsets_view.data()); + size_type bytes = offsets_data[count-1]; // this copy may not be stream friendly + + // count nulls + size_type null_count = thrust::transform_reduce( execpol->on(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), + [d_strs] __device__ (size_type idx) { return (size_type)(d_strs[idx].first==nullptr); }, + 0, thrust::plus() ); + + // build null_mask + mask_state state = mask_state::UNINITIALIZED; + if( null_count==0 ) + state = mask_state::ALL_VALID; + else if( null_count==count ) + state = mask_state::ALL_NULL; + auto null_mask = create_null_mask(count, state, stream, mr); + if( (null_count > 0) && (null_count < count) ) + { + uint8_t* d_null_mask = static_cast(null_mask.data()); + CUDA_TRY(cudaMemsetAsync(d_null_mask, 0, null_mask.size(), stream)); + thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, + [d_strs, count, d_null_mask] __device__(size_type byte_idx) { + unsigned char byte = 0; // set one byte per thread -- init to all nulls + for( size_type i=0; i < 8; ++i ) + { + size_type idx = i + (byte_idx*8); // compute d_strs index + byte = byte >> 1; // shift until we are done + if( idx < count ) // check boundary + { + if( d_strs[idx].first ) + byte |= 128; // string is not null, set high bit + } + } + d_null_mask[byte_idx] = byte; + }); + } + + // build chars column + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::ALL_VALID, stream, mr ); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + auto d_offsets = offsets_view.data(); + thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, + [d_strs, d_offsets, d_chars] __device__(size_type idx){ + // place individual strings + auto item = d_strs[idx]; + if( item.first ) + { + size_type offset = (idx 
? d_offsets[idx-1] : 0); + memcpy(d_chars + offset, item.first, item.second ); + } + }); + + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + return std::make_unique( + data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + null_mask, null_count, + std::move(children)); +} + +} // namespace cudf diff --git a/cpp/src/strings/strings_column_handler.cu b/cpp/src/strings/strings_column_handler.cu new file mode 100644 index 00000000000..7432ffc3761 --- /dev/null +++ b/cpp/src/strings/strings_column_handler.cu @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf { + +#define STR_OFFSETS_CHILD_INDEX 0 +#define STR_CHARS_CHILD_INDEX 1 + +// +strings_column_handler::strings_column_handler( const column_view& strings_column ) + : _parent(strings_column) +{ + CUDF_EXPECTS( _parent.type().id()==STRING, "string_column_view only support strings"); + CUDF_EXPECTS( _parent.num_children()>0, "string column must have children"); +} + +size_type strings_column_handler::count() const +{ + return _parent.child(STR_OFFSETS_CHILD_INDEX).size(); +} + +const char* strings_column_handler::chars_data() const +{ + return _parent.child(STR_CHARS_CHILD_INDEX).data(); +} + +const int32_t* strings_column_handler::offsets_data() const +{ + return _parent.child(STR_OFFSETS_CHILD_INDEX).data(); +} + +size_type strings_column_handler::chars_column_size() const +{ + return _parent.child(STR_CHARS_CHILD_INDEX).size(); // size of the chars child, not the offsets child (that one is count()) +} + +const bitmask_type* strings_column_handler::null_mask() const +{ + return _parent.null_mask(); +} + +size_type strings_column_handler::null_count() const +{ + return _parent.null_count(); +} + +// print strings to stdout +void strings_column_handler::print( size_type start, size_type end, + size_type max_width, const char* delimiter ) const +{ + size_type count = this->count(); + if( end < 0 || end > count ) + end = count; + if( start < 0 ) + start = 0; + if( start >= end ) + return; + count = end - start; + + // stick with the default stream for this odd/rare stdout function + auto execpol = rmm::exec_policy(0); + auto strings_column = column_device_view(_parent); + auto d_offsets = offsets_data(); + auto d_strings = chars_data(); + + // get individual strings sizes + rmm::device_vector output_offsets(count,0); + thrust::transform( execpol->on(0), + thrust::make_counting_iterator(start), thrust::make_counting_iterator(end), + output_offsets.begin(), + [strings_column, d_strings, max_width, d_offsets] __device__ (size_type idx) { + if( 
strings_column.nullable() && strings_column.is_null(idx) ) + return 0; + size_type offset = idx ? d_offsets[idx-1] : 0; // this logic will be a template + size_type bytes = d_offsets[idx] - offset; // specialization on element() + string_view dstr( d_strings + offset, bytes ); // method of column_device_view + if( (max_width > 0) && (dstr.characters() > max_width) ) + bytes = dstr.byte_offset_for(max_width); + return bytes+1; // allow for null-terminator on non-null strings + }); + // convert to offsets + thrust::inclusive_scan( execpol->on(0), output_offsets.begin(), output_offsets.end(), output_offsets.begin() ); + // build output buffer + size_t buffer_size = output_offsets[count-1]; + if( buffer_size == 0 ) + { + printf("all %d strings are null\n", count); + return; + } + rmm::device_vector buffer(buffer_size,0); // allocate and pre-null-terminate + char* d_buffer = buffer.data().get(); + // copy strings into output buffer + size_t* d_output_offsets = output_offsets.data().get(); + thrust::for_each_n(execpol->on(0), + thrust::make_counting_iterator(0), (end-start), + [d_strings, start, d_offsets, d_output_offsets, d_buffer] __device__(size_type idx) { + size_t output_offset = (idx ? d_output_offsets[idx-1] : 0); + size_t length = d_output_offsets[idx] - output_offset; // bytes + if( length ) // this is only 0 for nulls + { + idx += start; + size_type offset = (idx ? d_offsets[idx-1]:0); + memcpy(d_buffer + output_offset, d_strings + offset, length-1 ); + } + }); + + // copy output buffer to host + std::vector h_offsets(count); + cudaMemcpyAsync( h_offsets.data(), d_output_offsets, count*sizeof(size_t), cudaMemcpyDeviceToHost); + std::vector h_buffer(buffer_size); + cudaMemcpyAsync( h_buffer.data(), d_buffer, buffer_size, cudaMemcpyDeviceToHost ); + cudaStreamSynchronize(0); + + // print out the strings to stdout + for( size_type idx=0; idx < count; ++idx ) + { + size_t offset = (idx ? 
h_offsets[idx-1]:0); + size_t length = h_offsets[idx] - offset; + printf("%d:",idx); + if( length ) + printf("[%s]", h_buffer.data()+offset); + else + printf(""); + printf("%s",delimiter); + } +} + +// new strings column from subset of given strings column +std::unique_ptr strings_column_handler::sublist( size_type start, size_type end, size_type step ) +{ + return make_strings_column(nullptr, 0); +} + +// return sorted version of the given strings column +std::unique_ptr strings_column_handler::sort( sort_type stype, bool ascending, bool nullfirst ) +{ + return make_strings_column(nullptr, 0); +} + +// return sorted indexes only -- returns integer column +std::unique_ptr strings_column_handler::order( sort_type stype, bool ascending, bool nullfirst ) +{ + return make_strings_column(nullptr, 0); +} + +} // namespace cudf \ No newline at end of file From 52f774dbf692ba1ca5862d9219ae1cbe63ff794a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 16 Sep 2019 15:10:40 -0400 Subject: [PATCH 02/54] cudf strings column --- cpp/custrings/text/NVText.cu | 42 ++++ .../cudf/column/column_device_view.cuh | 89 ++++---- .../cudf/strings/strings_column_factories.hpp | 4 +- .../cudf/strings/strings_column_handler.hpp | 82 ++++++- cpp/include/nvstrings/NVText.h | 1 + cpp/src/column/column_device_view.cu | 28 ++- cpp/src/strings/strings_column_factories.cu | 156 +++++++------- cpp/src/strings/strings_column_handler.cu | 202 ++++++++++++++---- python/nvstrings/cpp/pytext.cpp | 23 ++ python/nvstrings/nvtext.py | 6 + 10 files changed, 451 insertions(+), 182 deletions(-) diff --git a/cpp/custrings/text/NVText.cu b/cpp/custrings/text/NVText.cu index 671c318e75e..4de66bb1a5f 100644 --- a/cpp/custrings/text/NVText.cu +++ b/cpp/custrings/text/NVText.cu @@ -170,3 +170,45 @@ NVStrings* NVText::scatter_count( NVStrings& strs, unsigned int* counts, bool bd // build strings object from elements return NVStrings::create_from_index((std::pair*)d_results,total_count); } + +// +unsigned int 
NVText::code_points( NVStrings& strs, unsigned int* results ) +{ + unsigned int count = strs.size(); + if( count==0 || results==nullptr ) + return 0; + + // + auto execpol = rmm::exec_policy(0); + rmm::device_vector strings(count,nullptr); + custring_view** d_strings = strings.data().get(); + strs.create_custring_index(d_strings); + + // get all the lengths to build the offsets + // offsets point to each individual range + rmm::device_vector offsets(count,0); + size_t* d_offsets = offsets.data().get(); + thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, + [d_strings, d_offsets] __device__(unsigned int idx){ + custring_view* dstr = d_strings[idx]; + if( dstr ) + d_offsets[idx] = dstr->chars_count(); + }); + thrust::inclusive_scan( execpol->on(0), offsets.begin(), offsets.end(), offsets.begin() ); + + // now set the ranges + auto d_results = results; + thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, + [d_strings, d_offsets, d_results] __device__(unsigned int idx){ + custring_view* dstr = d_strings[idx]; + if( !dstr ) + return; + auto offset = (idx ? d_offsets[idx-1] : 0); + auto result = d_results + offset; + for( auto itr = dstr->begin(); itr != dstr->end(); ++itr ) + *result++ = (unsigned int)*itr; + }); + // + unsigned int rtn = offsets[count-1]; + return rtn; +} diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index ea4280a45d8..8a511552b1a 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -15,9 +15,11 @@ */ #pragma once +#include #include #include #include +#include namespace cudf { @@ -70,34 +72,6 @@ class alignas(16) column_device_view_base { return head() + _offset; } - /**---------------------------------------------------------------------------* - * @brief Returns reference to element at the specified index. - * - * This function accounts for the offset. 
- * - * @tparam T The element type - * @param element_index Position of the desired element - *---------------------------------------------------------------------------**/ - template - __device__ T const& element(size_type element_index) const noexcept { - return data()[element_index]; - } - - /**---------------------------------------------------------------------------* - * @brief Returns `string_view` to the string element at the specified index. - * - * This function accounts for the offset. - * - * @param element_index Position of the desired string - *---------------------------------------------------------------------------**/ - /* - template <> - __device__ string_view const& element( - size_type element_index) const noexcept { - // Fill this in - } - */ - /**---------------------------------------------------------------------------* * @brief Returns the number of elements in the column *---------------------------------------------------------------------------**/ @@ -253,8 +227,21 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return A `unique_ptr` to a `column_device_view` that makes the data from *`source_view` available in device memory. *---------------------------------------------------------------------------**/ - static auto create(column_view source_view, cudaStream_t stream = 0); + static std::unique_ptr> create(column_view source_view, cudaStream_t stream = 0); + /**---------------------------------------------------------------------------* + * @brief Returns reference to element at the specified index. + * + * This function accounts for the offset. 
+ * + * @tparam T The element type + * @param element_index Position of the desired element + *---------------------------------------------------------------------------**/ + template + __device__ T const element(size_type element_index) const noexcept { + return data()[element_index]; + } + /**---------------------------------------------------------------------------* * @brief Returns the specified child * @@ -272,7 +259,6 @@ class alignas(16) column_device_view : public detail::column_device_view_base { ///< may contain additional data size_type _num_children{}; ///< The number of child columns -public: /**---------------------------------------------------------------------------* * @brief Construct's a `column_device_view` from a `column_view` populating * all but the children. @@ -283,7 +269,6 @@ public: *---------------------------------------------------------------------------**/ column_device_view(column_view source); -protected: /**---------------------------------------------------------------------------* * @brief Destroy the `device_column_view` object. * @@ -372,25 +357,10 @@ class alignas(16) mutable_column_device_view * @param element_index Position of the desired element *---------------------------------------------------------------------------**/ template - __device__ T& element(size_type element_index) noexcept { + __device__ T element(size_type element_index) noexcept { return data()[element_index]; } - /**---------------------------------------------------------------------------* - * @brief Returns `string_view` to the string element at the specified index. - * - * This function accounts for the offset. 
- * - * @param element_index Position of the desired string - *---------------------------------------------------------------------------**/ - /* - template <> - __device__ string_view& element( - size_type element_index) noexcept { - // Fill this in - } - */ - /**---------------------------------------------------------------------------* * @brief Returns raw pointer to the underlying bitmask allocation. * @@ -481,4 +451,29 @@ class alignas(16) mutable_column_device_view void destroy(); }; + /**---------------------------------------------------------------------------* + * @brief Returns `string_view` to the string element at the specified index. + * + * This function accounts for the offset. + * + * @param element_index Position of the desired string + *---------------------------------------------------------------------------**/ + + template <> + __device__ inline string_view const column_device_view::element( + size_type element_index) const noexcept { + size_type index = element_index + _offset; // account for this view's _offset + const int32_t* d_offsets = d_children[0].data(); + const char* d_strings = d_children[1].data(); + size_type offset = index ? 
d_offsets[index-1] : 0; + return string_view{d_strings + offset, d_offsets[index] - offset}; + } + + //template <> + //__device__ inline string_view mutable_column_device_view::element( + // size_type element_index) noexcept { + // return string_view{}; + //} + + } // namespace cudf \ No newline at end of file diff --git a/cpp/include/cudf/strings/strings_column_factories.hpp b/cpp/include/cudf/strings/strings_column_factories.hpp index faf772b26d5..5bd89745346 100644 --- a/cpp/include/cudf/strings/strings_column_factories.hpp +++ b/cpp/include/cudf/strings/strings_column_factories.hpp @@ -17,7 +17,9 @@ #include #include -#include "../column/column.hpp" + +#include +#include namespace cudf { /**---------------------------------------------------------------------------* diff --git a/cpp/include/cudf/strings/strings_column_handler.hpp b/cpp/include/cudf/strings/strings_column_handler.hpp index bb2b0bb3abb..606dd84a2d9 100644 --- a/cpp/include/cudf/strings/strings_column_handler.hpp +++ b/cpp/include/cudf/strings/strings_column_handler.hpp @@ -15,28 +15,52 @@ */ #pragma once -#include - #include +#include +#include +#include + namespace cudf { +/**---------------------------------------------------------------------------* + * @brief Given a column-view of strings type, an instance of this class + * provides the strings operations on the column. 
+ *---------------------------------------------------------------------------**/ class strings_column_handler { public: ~strings_column_handler() = default; - strings_column_handler( const column_view& strings_column ); + strings_column_handler( const column_view& strings_column, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); //strings_column_handler( const column_view&& strings_column ); + /**---------------------------------------------------------------------------* + * @brief Returns the number of strings in the column + *---------------------------------------------------------------------------**/ size_type count() const; + /**---------------------------------------------------------------------------* + * @brief Returns a pointer to the internal char data array + *---------------------------------------------------------------------------**/ const char* chars_data() const; + /**---------------------------------------------------------------------------* + * @brief Returns a pointer to the internal offsets array + *---------------------------------------------------------------------------**/ const int32_t* offsets_data() const; + /**---------------------------------------------------------------------------* + * @brief Returns the size of the char data array in bytes + *---------------------------------------------------------------------------**/ size_type chars_column_size() const; + /**---------------------------------------------------------------------------* + * @brief Returns a pointer to the internal null mask memory + *---------------------------------------------------------------------------**/ const bitmask_type* null_mask() const; + /**---------------------------------------------------------------------------* + * @brief Returns the number of nulls in this column + *---------------------------------------------------------------------------**/ size_type null_count() const; enum sort_type { @@ -45,21 +69,59 @@ class 
strings_column_handler name=2 ///< sort by characters code-points }; - // print strings to stdout + /**---------------------------------------------------------------------------* + * @brief Prints the strings to stdout. + * + * @param start Index of first string to print. + * @param end Index of last string to print. Specify -1 for all strings. + * @param max_width Maximum number of characters to print per string. + * Specify -1 to print all characters. + * @param delimiter The chars to print between each string. + * Default is new-line character. + *---------------------------------------------------------------------------**/ void print( size_type start=0, size_type end=-1, size_type max_width=-1, const char* delimiter = "\n" ) const; - // new strings column from subset of given strings column - std::unique_ptr sublist( size_type start, size_type end, size_type step ); + /**---------------------------------------------------------------------------* + * @brief Returns a new strings column created from a subset + * of this instance's strings column. + * + * @param start Index of first string to use. + * @param end Index of last string to use. + * @param step Increment value between indexes. + * @param stream CUDA stream to use for kernels in this method. + * @return New strings column of size (end-start)/step. + *---------------------------------------------------------------------------**/ + std::unique_ptr sublist( size_type start, size_type end, size_type step, cudaStream_t stream=0 ); + + /**---------------------------------------------------------------------------* + * @brief Returns a new strings column created from this strings instance using + * the specified indices to select the strings. + * + * @param indices The indices with which to select strings for the new column. + * Values must be within [0,count()) range. + * @param stream CUDA stream to use for kernels in this method. 
+ * @return New strings column of size indices.size() + *---------------------------------------------------------------------------**/ + std::unique_ptr gather( const column_view& indices, cudaStream_t stream=0 ); // return sorted version of the given strings column - std::unique_ptr sort( sort_type stype, bool ascending=true, bool nullfirst=true ); - - // return sorted indexes only -- returns integer column - std::unique_ptr order( sort_type stype, bool ascending, bool nullfirst=true ); + /**---------------------------------------------------------------------------* + * @brief Returns a new strings column that is a sorted version of the + * strings in this instance. + * + * @param stype Specify what attribute of the string to sort on. + * @param ascending Sort strings in ascending or descending order. + * @param nullfirst Sort nulls to the beginning or the end of the new column. + * @param stream CUDA stream to use for kernels in this method. + * @return New strings column with sorted elements of this instance. + *---------------------------------------------------------------------------**/ + std::unique_ptr sort( sort_type stype, bool ascending=true, bool nullfirst=true, cudaStream_t stream=0 ); private: const column_view _parent; + rmm::mr::device_memory_resource* _mr; + }; } diff --git a/cpp/include/nvstrings/NVText.h b/cpp/include/nvstrings/NVText.h index 2fdff32c61d..3fddf4777c3 100644 --- a/cpp/include/nvstrings/NVText.h +++ b/cpp/include/nvstrings/NVText.h @@ -171,4 +171,5 @@ class NVText * @return New strings instance with appropriate scattered elements. 
*/ static NVStrings* scatter_count( NVStrings& strs, unsigned int* counts, bool devmem=true ); + static unsigned int code_points( NVStrings& strs, unsigned int* results ); }; diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 18bcf44aefb..1c458635b68 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -32,21 +32,33 @@ column_device_view::column_device_view(column_view source) // Free device memory allocated for children void column_device_view::destroy() { - // TODO Implement once support for children is added + // TODO Needs to handle grand-children + if( d_children ) + RMM_FREE(d_children,0); + delete this; } // Construct a unique_ptr that invokes `destroy()` as it's deleter -auto column_device_view::create(column_view source, cudaStream_t stream) { +std::unique_ptr> column_device_view::create(column_view source, cudaStream_t stream) { size_type num_descendants{count_descendants(source)}; - if (num_descendants > 0) { - CUDF_FAIL("Columns with children are not currently supported."); - } - + //if( num_descendants > 0 ) { + // CUDF_FAIL("Columns with children are not currently supported."); + // } auto deleter = [](column_device_view* v) { v->destroy(); }; - std::unique_ptr p{ new column_device_view(source), deleter}; - + if( num_descendants > 0 ) + { + // ignore grand-children right now + RMM_ALLOC(&p->d_children, sizeof(column_device_view)*num_descendants, stream); + for( size_type idx=0; idx < num_descendants; ++idx ) + { + column_device_view child(source.child(idx)); + cudaMemcpyAsync(p->d_children+idx, &child, sizeof(column_device_view), cudaMemcpyHostToDevice, stream); + } + p->_num_children = num_descendants; + cudaStreamSynchronize(stream); + } return p; } diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index d8e2b3a31b9..0747e283318 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ 
b/cpp/src/strings/strings_column_factories.cu @@ -15,10 +15,9 @@ */ #include -#include -#include #include -#include +#include +#include #include #include @@ -30,95 +29,94 @@ namespace cudf { // Create a strings-type column. // A strings-column has children columns to manage the variable-length -// encoded character array. +// encoded character array. // Use the strings_column_handler class to perform strings operations // on this type of column. std::unique_ptr make_strings_column( std::pair* strings, size_type count, cudaStream_t stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* mr) { - // maybe a separate factory for creating null strings-column - CUDF_EXPECTS(count > 0, "must have at least one pair"); + // maybe a separate factory for creating null strings-column + CUDF_EXPECTS(count > 0, "must have at least one pair"); - auto execpol = rmm::exec_policy(stream); - auto strs = thrust::device_pointer_cast(reinterpret_cast*>(strings)); - auto d_strs = strs.get(); + auto execpol = rmm::exec_policy(stream); + auto strs = thrust::device_pointer_cast(reinterpret_cast*>(strings)); + auto d_strs = strs.get(); - // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::ALL_VALID, stream, mr ); - auto offsets_view = offsets_column->mutable_view(); - thrust::transform_inclusive_scan( execpol->on(stream), - thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), - offsets_view.data(), - [d_strs] __device__ (size_type idx) { - thrust::pair item = d_strs[idx]; - return ( item.first ? 
(int32_t)item.second : 0 ); - }, - thrust::plus() ); + // build offsets column + auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, mr ); + auto offsets_view = offsets_column->mutable_view(); + thrust::transform_inclusive_scan( execpol->on(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), + offsets_view.data(), + [d_strs] __device__ (size_type idx) { + thrust::pair item = d_strs[idx]; + return ( item.first ? (int32_t)item.second : 0 ); + }, + thrust::plus() ); - // get number of bytes (last offset value) - auto offsets_data = thrust::device_pointer_cast(offsets_view.data()); - size_type bytes = offsets_data[count-1]; // this copy may not be stream friendly + // get number of bytes (last offset value) + size_type bytes = thrust::device_pointer_cast(offsets_view.data())[count-1]; - // count nulls - size_type null_count = thrust::transform_reduce( execpol->on(stream), - thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), - [d_strs] __device__ (size_type idx) { return (size_type)(d_strs[idx].first==nullptr); }, - 0, thrust::plus() ); - - // build null_mask - mask_state state = mask_state::UNINITIALIZED; - if( null_count==0 ) - state = mask_state::ALL_VALID; - else if( null_count==count ) - state = mask_state::ALL_NULL; - auto null_mask = create_null_mask(count, state, stream, mr); - if( (null_count > 0) && (null_count < count) ) - { - uint8_t* d_null_mask = static_cast(null_mask.data()); - CUDA_TRY(cudaMemsetAsync(d_null_mask, 0, null_mask.size(), stream)); - thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, - [d_strs, count, d_null_mask] __device__(size_type byte_idx) { - unsigned char byte = 0; // set one byte per thread -- init to all nulls - for( size_type i=0; i < 8; ++i ) - { - size_type idx = i + (byte_idx*8); // compute d_strs index - byte = byte >> 1; // shift until we are done - if( idx < count ) // check boundary + // count 
nulls + size_type null_count = thrust::transform_reduce( execpol->on(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), + [d_strs] __device__ (size_type idx) { return (size_type)(d_strs[idx].first==nullptr); }, + 0, thrust::plus() ); + + // build null_mask + mask_state state = mask_state::UNINITIALIZED; + if( null_count==0 ) + state = mask_state::UNALLOCATED; + else if( null_count==count ) + state = mask_state::ALL_NULL; + auto null_mask = create_null_mask(count, state, stream, mr); + if( (null_count > 0) && (null_count < count) ) + { + uint8_t* d_null_mask = static_cast(null_mask.data()); + CUDA_TRY(cudaMemsetAsync(d_null_mask, 0, null_mask.size(), stream)); + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), (count/8), + [d_strs, count, d_null_mask] __device__(size_type byte_idx) { + unsigned char byte = 0; // set one byte per thread -- init to all nulls + for( size_type i=0; i < 8; ++i ) + { + size_type idx = i + (byte_idx*8); // compute d_strs index + byte = byte >> 1; // shift until we are done + if( idx < count ) // check boundary + { + if( d_strs[idx].first ) + byte |= 128; // string is not null, set high bit + } + } + d_null_mask[byte_idx] = byte; + }); + } + + // build chars column + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + auto d_offsets = offsets_view.data(); + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, + [d_strs, d_offsets, d_chars] __device__(size_type idx){ + // place individual strings + auto item = d_strs[idx]; + if( item.first ) { - if( d_strs[idx].first ) - byte |= 128; // string is not null, set high bit + size_type offset = (idx ? 
d_offsets[idx-1] : 0); + memcpy(d_chars + offset, item.first, item.second ); } - } - d_null_mask[byte_idx] = byte; - }); - } - - // build chars column - auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::ALL_VALID, stream, mr ); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - auto d_offsets = offsets_view.data(); - thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, - [d_strs, d_offsets, d_chars] __device__(size_type idx){ - // place individual strings - auto item = d_strs[idx]; - if( item.first ) - { - size_type offset = (idx ? d_offsets[idx-1] : 0); - memcpy(d_chars + offset, item.first, item.second ); - } - }); + }); - // build children vector - std::vector> children; - children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); - return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, - null_mask, null_count, - std::move(children)); + return std::make_unique( + data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + null_mask, null_count, + std::move(children)); } } // namespace cudf diff --git a/cpp/src/strings/strings_column_handler.cu b/cpp/src/strings/strings_column_handler.cu index 7432ffc3761..8c04ee8c6f7 100644 --- a/cpp/src/strings/strings_column_handler.cu +++ b/cpp/src/strings/strings_column_handler.cu @@ -14,46 +14,46 @@ * limitations under the License. 
*/ -#include +#include +#include #include +#include #include -#include #include -#include #include +#include +#include +#include namespace cudf { -#define STR_OFFSETS_CHILD_INDEX 0 -#define STR_CHARS_CHILD_INDEX 1 - // -strings_column_handler::strings_column_handler( const column_view& strings_column ) - : _parent(strings_column) +strings_column_handler::strings_column_handler( const column_view& strings_column, rmm::mr::device_memory_resource* mr ) + : _parent(strings_column), _mr(mr) { - CUDF_EXPECTS( _parent.type().id()==STRING, "string_column_view only support strings"); - CUDF_EXPECTS( _parent.num_children()>0, "string column must have children"); + CUDF_EXPECTS( _parent.type().id()==STRING, "string_column_handler only support strings"); + CUDF_EXPECTS( _parent.num_children()>0, "string column must have children"); // revisit this (all nulls column?) } size_type strings_column_handler::count() const { - return _parent.child(STR_OFFSETS_CHILD_INDEX).size(); + return _parent.child(0).size(); } const char* strings_column_handler::chars_data() const { - return _parent.child(STR_CHARS_CHILD_INDEX).data(); + return _parent.child(1).data(); } const int32_t* strings_column_handler::offsets_data() const { - return _parent.child(STR_OFFSETS_CHILD_INDEX).data(); + return _parent.child(0).data(); } size_type strings_column_handler::chars_column_size() const { - return _parent.child(STR_OFFSETS_CHILD_INDEX).size(); + return _parent.child(1).size(); } const bitmask_type* strings_column_handler::null_mask() const @@ -76,22 +76,23 @@ void strings_column_handler::print( size_type start, size_type end, if( start < 0 ) start = 0; if( start >= end ) - return; + throw std::invalid_argument("invalid parameter value"); count = end - start; // stick with the default stream for this odd/rare stdout function auto execpol = rmm::exec_policy(0); - auto strings_column = column_device_view(_parent); + auto strings_column = column_device_view::create(_parent); + auto d_column = 
*strings_column; auto d_offsets = offsets_data(); auto d_strings = chars_data(); - // get individual strings sizes + // create output strings offsets rmm::device_vector output_offsets(count,0); - thrust::transform( execpol->on(0), + thrust::transform_inclusive_scan( execpol->on(0), thrust::make_counting_iterator(start), thrust::make_counting_iterator(end), output_offsets.begin(), - [strings_column, d_strings, max_width, d_offsets] __device__ (size_type idx) { - if( strings_column.nullable() && strings_column.is_null(idx) ) + [d_column, d_strings, max_width, d_offsets] __device__ (size_type idx) { + if( d_column.nullable() && d_column.is_null(idx) ) return 0; size_type offset = idx ? d_offsets[idx-1] : 0; // this logic will be a template size_type bytes = d_offsets[idx] - offset; // specialization on element() @@ -99,11 +100,10 @@ void strings_column_handler::print( size_type start, size_type end, if( (max_width > 0) && (dstr.characters() > max_width) ) bytes = dstr.byte_offset_for(max_width); return bytes+1; // allow for null-terminator on non-null strings - }); - // convert to offsets - thrust::inclusive_scan( execpol->on(0), output_offsets.begin(), output_offsets.end(), output_offsets.begin() ); + }, + thrust::plus()); // build output buffer - size_t buffer_size = output_offsets[count-1]; + size_t buffer_size = output_offsets.back(); // last element has total size if( buffer_size == 0 ) { printf("all %d strings are null\n", count); @@ -114,7 +114,7 @@ void strings_column_handler::print( size_type start, size_type end, // copy strings into output buffer size_t* d_output_offsets = output_offsets.data().get(); thrust::for_each_n(execpol->on(0), - thrust::make_counting_iterator(0), (end-start), + thrust::make_counting_iterator(0), count, [d_strings, start, d_offsets, d_output_offsets, d_buffer] __device__(size_type idx) { size_t output_offset = (idx ? 
d_output_offsets[idx-1] : 0); size_t length = d_output_offsets[idx] - output_offset; // bytes @@ -128,10 +128,9 @@ void strings_column_handler::print( size_type start, size_type end, // copy output buffer to host std::vector h_offsets(count); - cudaMemcpyAsync( h_offsets.data(), d_output_offsets, count*sizeof(size_t), cudaMemcpyDeviceToHost); + cudaMemcpy( h_offsets.data(), d_output_offsets, count*sizeof(size_t), cudaMemcpyDeviceToHost); std::vector h_buffer(buffer_size); - cudaMemcpyAsync( h_buffer.data(), d_buffer, buffer_size, cudaMemcpyDeviceToHost ); - cudaStreamSynchronize(0); + cudaMemcpy( h_buffer.data(), d_buffer, buffer_size, cudaMemcpyDeviceToHost ); // print out the strings to stdout for( size_type idx=0; idx < count; ++idx ) @@ -147,22 +146,151 @@ void strings_column_handler::print( size_type start, size_type end, } } -// new strings column from subset of given strings column -std::unique_ptr strings_column_handler::sublist( size_type start, size_type end, size_type step ) +// new strings column from subset of this strings instance +std::unique_ptr strings_column_handler::sublist( size_type start, size_type end, + size_type step, cudaStream_t stream ) { - return make_strings_column(nullptr, 0); + if( step <= 0 ) + step = 1; + size_type count = this->count(); + if( end < 0 || end > count ) + end = count; + if( start < 0 || start > end ) + throw std::invalid_argument("invalid start parameter"); + count = (end - start)/step +1; + // + auto execpol = rmm::exec_policy(stream); + auto strings_column = column_device_view::create(_parent, stream); + auto d_column = *strings_column; + + // build indices + thrust::device_vector indices(count); + thrust::sequence( execpol->on(stream), indices.begin(), indices.end(), start, step ); + // should have a way to create a column_view with an existing memory buffer + auto indices_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, _mr ); + auto indices_view = 
indices_column->mutable_view(); + cudaMemcpyAsync( indices_view.data(), indices.data().get(), count*sizeof(int32_t), cudaMemcpyDeviceToDevice, stream); + + return gather(indices_view); } -// return sorted version of the given strings column -std::unique_ptr strings_column_handler::sort( sort_type stype, bool ascending, bool nullfirst ) +// return new strings column with strings from this instance as specified by the indices +std::unique_ptr strings_column_handler::gather( const column_view& indices, cudaStream_t stream ) { - return make_strings_column(nullptr, 0); + size_type count = indices.size(); + auto d_indices = indices.data(); + + auto execpol = rmm::exec_policy(stream); + auto strings_column = column_device_view::create(_parent,stream); + auto d_column = *strings_column; + auto d_offsets = offsets_data(); + + // build offsets column + auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, _mr ); + auto offsets_view = offsets_column->mutable_view(); + auto d_new_offsets = offsets_view.data(); + // create new offsets array + thrust::transform_inclusive_scan( execpol->on(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), + d_new_offsets, + [d_column, d_offsets, d_indices] __device__ (size_type idx) { + size_type index = d_indices[idx]; + if( d_column.nullable() && d_column.is_null(index) ) + return 0; + size_type offset = index ? 
d_offsets[index-1] : 0; + return d_offsets[index] - offset; + }, + thrust::plus()); + // build null mask + size_type null_count = this->null_count(); + mask_state state = mask_state::UNINITIALIZED; + if( null_count==0 ) + state = mask_state::UNALLOCATED; + else if( null_count==count ) + state = mask_state::ALL_NULL; + auto null_mask = create_null_mask(count, state, stream, _mr); + if( (null_count > 0) && (null_count < count) ) + { + uint8_t* d_null_mask = static_cast(null_mask.data()); + CUDA_TRY(cudaMemsetAsync(d_null_mask, 0, null_mask.size(), stream)); + thrust::transform(execpol->on(stream), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(count/8), + d_null_mask, + [d_column, count] __device__(size_type byte_idx) { + unsigned char byte = 0; // set one byte per thread -- init to all nulls + for( size_type i=0; i < 8; ++i ) + { + size_type idx = i + (byte_idx*8); // compute d_strs index + byte = byte >> 1; // shift until we are done + if( idx < count ) // check boundary + { + if( d_column.is_null(idx) ) + byte |= 128; // string is not null, set high bit + } + } + return byte; //d_null_mask[byte_idx] = byte; + }); + } + + // build chars column + size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count-1]; // this may not be stream friendly + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, _mr ); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, + [d_column, d_indices, d_new_offsets, d_chars] __device__(size_type idx){ + // place individual strings + if( d_column.nullable() && d_column.is_null(idx) ) + return; + string_view dstr = d_column.element(d_indices[idx]); + size_type offset = (idx ? 
d_new_offsets[idx-1] : 0); + memcpy(d_chars + offset, dstr.data(), dstr.size() ); + }); + + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + return std::make_unique( + data_type{STRING}, 0, rmm::device_buffer{0,stream,_mr}, + null_mask, null_count, + std::move(children)); } -// return sorted indexes only -- returns integer column -std::unique_ptr strings_column_handler::order( sort_type stype, bool ascending, bool nullfirst ) +// return sorted version of the given strings column +std::unique_ptr strings_column_handler::sort( sort_type stype, bool ascending, bool nullfirst, cudaStream_t stream ) { - return make_strings_column(nullptr, 0); + auto execpol = rmm::exec_policy(stream); + auto strings_column = column_device_view::create(_parent, stream); + auto d_column = *strings_column; + + // lets sort indices + size_type count = this->count(); + thrust::device_vector indices(count); + thrust::sequence( execpol->on(stream), indices.begin(), indices.end() ); + thrust::sort( execpol->on(stream), indices.begin(), indices.end(), + [d_column, stype, ascending, nullfirst] __device__ (size_type lhs, size_type rhs) { + bool lhs_null{d_column.nullable() && d_column.is_null(lhs)}; + bool rhs_null{d_column.nullable() && d_column.is_null(rhs)}; + if( lhs_null || rhs_null ) + return (nullfirst ? !rhs_null : !lhs_null); + string_view lhs_str = d_column.element(lhs); + string_view rhs_str = d_column.element(rhs); + int cmp = lhs_str.compare(rhs_str); + return (ascending ? 
(cmp<0) : (cmp>0)); + }); + + // should have a way to create a column_view with an existing memory buffer + auto d_indices = indices.data().get(); + // we will create an empty one and pass in this data for now + auto indices_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, _mr ); + auto indices_view = indices_column->mutable_view(); + cudaMemcpyAsync( indices_view.data(), d_indices, count*sizeof(int32_t), cudaMemcpyDeviceToDevice, stream); + + // now build a new strings column from the indices + return gather( indices_view ); } + } // namespace cudf \ No newline at end of file diff --git a/python/nvstrings/cpp/pytext.cpp b/python/nvstrings/cpp/pytext.cpp index 423ed1d7185..95231eb06ee 100644 --- a/python/nvstrings/cpp/pytext.cpp +++ b/python/nvstrings/cpp/pytext.cpp @@ -649,6 +649,28 @@ static PyObject* n_scatter_count( PyObject* self, PyObject* args ) return PyLong_FromVoidPtr((void*)strs); } +static PyObject* n_code_points( PyObject* self, PyObject* args ) +{ + PyObject* pystrs = PyTuple_GetItem(args,0); + NVStrings* strs = strings_from_object(pystrs); + if( strs==0 ) + Py_RETURN_NONE; + + PyObject* pyresults = PyTuple_GetItem(args,1); + std::string name = pyresults->ob_type->tp_name; + if( name.compare("int")!=0 ) + { + printf("results must be device pointer\n"); + Py_RETURN_NONE; + } + unsigned int* results = (unsigned int*)PyLong_AsVoidPtr(pyresults); + + Py_BEGIN_ALLOW_THREADS + NVText::code_points(*strs,results); + Py_END_ALLOW_THREADS + Py_RETURN_NONE; +} + // static PyMethodDef s_Methods[] = { { "n_tokenize", n_tokenize, METH_VARARGS, "" }, @@ -663,6 +685,7 @@ static PyMethodDef s_Methods[] = { { "n_edit_distance", n_edit_distance, METH_VARARGS, "" }, { "n_create_ngrams", n_create_ngrams, METH_VARARGS, "" }, { "n_scatter_count", n_scatter_count, METH_VARARGS, "" }, + { "n_code_points", n_code_points, METH_VARARGS, "" }, { NULL, NULL, 0, NULL } }; diff --git a/python/nvstrings/nvtext.py b/python/nvstrings/nvtext.py 
index 8420ba48a58..40c98929ada 100644 --- a/python/nvstrings/nvtext.py +++ b/python/nvstrings/nvtext.py @@ -346,3 +346,9 @@ def scatter_count(strs, counts): if rtn is not None: rtn = nvs.nvstrings(rtn) return rtn + +def code_points(strs, results): + if results is None: + raise ValueError("results must be device pointer") + pyniNVText.n_code_points(strs, results) + From 559b5c841279916e06243adc196cca55bb05226e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 17 Sep 2019 14:17:54 -0400 Subject: [PATCH 03/54] use valid_if for creating null bitmask --- .../cudf/strings/strings_column_factories.hpp | 35 +++-- .../cudf/strings/strings_column_handler.hpp | 113 ++++++++------ cpp/src/column/column_device_view.cu | 1 + cpp/src/strings/strings_column_factories.cu | 78 +++++----- cpp/src/strings/strings_column_handler.cu | 143 ++++++++---------- 5 files changed, 181 insertions(+), 189 deletions(-) diff --git a/cpp/include/cudf/strings/strings_column_factories.hpp b/cpp/include/cudf/strings/strings_column_factories.hpp index 5bd89745346..74970393638 100644 --- a/cpp/include/cudf/strings/strings_column_factories.hpp +++ b/cpp/include/cudf/strings/strings_column_factories.hpp @@ -23,28 +23,33 @@ namespace cudf { /**---------------------------------------------------------------------------* - * @brief Construct strings column given an array of pointer/size pairs. - * Use the strings_column_handler class to perform strings operations on - * this type of column. - * + * @brief Construct STRING type column given an array of pointer/size pairs. + * The total number of char bytes must not exceed the maximum size of size_type. + * This column contains 2 or more children to manage its variable width data + * elements. Use the strings_column_handler class to perform strings operations + * on this type of column. The string characters are expected to be UTF-8 + * encoded sequence of char bytes. 
+ * * @note `null_count()` and `null_bitmask` are determined if a pair contains - * a null pointer. Otherwise, it is considered an empty string and not null. + * a null string. That is, for each pair, if `.first` is null, that string + * is considered null. Likewise, a string is considered empty (not null) + * if `.first` is not null and `.second` is 0. Otherwise the `.first` member + * must be a valid device address pointing to `.second` consecutive bytes. * * @throws std::bad_alloc if device memory allocation fails - * @throws cudf::logic_error if pointers are invalid + * @throws cudf::logic_error if pointers or sizes are invalid * - * @param[in] strs The pointer/size pair arrays. - * Each pointer must be valid device memory address. - * The size must be the number of bytes. - * @param[in] count The number of elements in the strs array. - * @param[in] stream Optional stream on which to issue all memory allocation and device - * kernels + * @param[in] strings The pointer/size pair arrays. + * Each pointer must be a valid device memory address. + * The size must be the number of bytes. + * @param[in] stream Optional stream for use with all memory allocation + * and device kernels * @param[in] mr Optional resource to use for device memory - * allocation of the column's `data` and `null_mask`. + * allocation of the column's `null_mask` and children. 
*---------------------------------------------------------------------------**/ std::unique_ptr make_strings_column( - std::pair* strs, size_type count, + const rmm::device_vector>& strings, cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); - + } // namespace cudf diff --git a/cpp/include/cudf/strings/strings_column_handler.hpp b/cpp/include/cudf/strings/strings_column_handler.hpp index 606dd84a2d9..75f4e937195 100644 --- a/cpp/include/cudf/strings/strings_column_handler.hpp +++ b/cpp/include/cudf/strings/strings_column_handler.hpp @@ -32,46 +32,48 @@ class strings_column_handler public: ~strings_column_handler() = default; - strings_column_handler( const column_view& strings_column, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + strings_column_handler( column_view strings_column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); //strings_column_handler( const column_view&& strings_column ); /**---------------------------------------------------------------------------* * @brief Returns the number of strings in the column *---------------------------------------------------------------------------**/ - size_type count() const; + size_type size() const; /**---------------------------------------------------------------------------* - * @brief Returns a pointer to the internal char data array + * @brief Returns the internal parent string column *---------------------------------------------------------------------------**/ - const char* chars_data() const; + column_view parent_column() const; + /**---------------------------------------------------------------------------* - * @brief Returns a pointer to the internal offsets array + * @brief Returns the internal column of offsets *---------------------------------------------------------------------------**/ - const int32_t* offsets_data() const; + column_view offsets_column() const; 
/**---------------------------------------------------------------------------* - * @brief Returns the size of the char data array in bytes + * @brief Returns the internal column of chars *---------------------------------------------------------------------------**/ - size_type chars_column_size() const; + column_view chars_column() const; /**---------------------------------------------------------------------------* * @brief Returns a pointer to the internal null mask memory *---------------------------------------------------------------------------**/ const bitmask_type* null_mask() const; + /**---------------------------------------------------------------------------* * @brief Returns the number of nulls in this column *---------------------------------------------------------------------------**/ size_type null_count() const; - enum sort_type { - none=0, ///< no sorting - length=1, ///< sort by string length - name=2 ///< sort by characters code-points - }; + /**---------------------------------------------------------------------------* + * @brief Returns the registered memory resource + *---------------------------------------------------------------------------**/ + rmm::mr::device_memory_resource* memory_resource() const; /**---------------------------------------------------------------------------* * @brief Prints the strings to stdout. - * + * * @param start Index of first string to print. * @param end Index of last string to print. Specify -1 for all strings. * @param max_width Maximum number of characters to print per string. @@ -82,46 +84,57 @@ class strings_column_handler void print( size_type start=0, size_type end=-1, size_type max_width=-1, const char* delimiter = "\n" ) const; - /**---------------------------------------------------------------------------* - * @brief Returns a new strings column created from a subset of - * of this instance's strings column. - * - * @param start Index of first string to use. 
- * @param end Index of last string to use. - * @param step Increment value between indexes. - * @param stream CUDA stream to use kernels in this method. - * @return New strings column of size (end-start)/step. - *---------------------------------------------------------------------------**/ - std::unique_ptr sublist( size_type start, size_type end, size_type step, cudaStream_t stream=0 ); - - /**---------------------------------------------------------------------------* - * @brief Returns a new strings column created this strings instance using - * the specified indices to select the strings. - * - * @param indices The indices with which to select strings for the new column. - * Values must be within [0,count()) range. - * @param stream CUDA stream to use kernels in this method. - * @return New strings column of size indices.size() - *---------------------------------------------------------------------------**/ - std::unique_ptr gather( const column_view& indices, cudaStream_t stream=0 ); - - // return sorted version of the given strings column - /**---------------------------------------------------------------------------* - * @brief Returns a new strings column that is a sorted version of the - * strings in this instance. - * - * @param stype Specify what attribute of the string to sort on. - * @param ascending Sort strings in ascending or descending order. - * @param nullfirst Sort nulls to the beginning or the end of the new column. - * @param stream CUDA stream to use kernels in this method. - * @return New strings column with sorted elements of this instance. 
- *---------------------------------------------------------------------------**/ - std::unique_ptr sort( sort_type stype, bool ascending=true, bool nullfirst=true, cudaStream_t stream=0 ); + // sort types can be combined + enum sort_type { + none=0, ///< no sorting + length=1, ///< sort by string length + name=2 ///< sort by characters code-points + }; private: const column_view _parent; rmm::mr::device_memory_resource* _mr; - + }; +/**---------------------------------------------------------------------------* + * @brief Returns a new strings column created from a subset of + * of this instance's strings column. + * + * @param start Index of first string to use. + * @param end Index of last string to use. + * @param step Increment value between indexes. + * @param stream CUDA stream to use kernels in this method. + * @return New strings column of size (end-start)/step. + *---------------------------------------------------------------------------**/ +std::unique_ptr sublist( strings_column_handler handler, + size_type start, size_type end, + size_type step, cudaStream_t stream=0 ); + +/**---------------------------------------------------------------------------* + * @brief Returns a new strings column created this strings instance using + * the specified indices to select the strings. + * + * @param indices The indices with which to select strings for the new column. + * Values must be within [0,count()) range. + * @param stream CUDA stream to use kernels in this method. + * @return New strings column of size indices.size() + *---------------------------------------------------------------------------**/ +std::unique_ptr gather( strings_column_handler handler, + column_view gather_map, cudaStream_t stream=0 ); + +/**---------------------------------------------------------------------------* + * @brief Returns a new strings column that is a sorted version of the + * strings in this instance. + * + * @param stype Specify what attribute of the string to sort on. 
+ * @param ascending Sort strings in ascending or descending order. + * @param nullfirst Sort nulls to the beginning or the end of the new column. + * @param stream CUDA stream to use kernels in this method. + * @return New strings column with sorted elements of this instance. + *---------------------------------------------------------------------------**/ +std::unique_ptr sort( strings_column_handler handler, + strings_column_handler::sort_type stype, bool ascending=true, + bool nullfirst=true, cudaStream_t stream=0 ); + } diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 1c458635b68..15c33abaa68 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -54,6 +54,7 @@ std::unique_ptr> co for( size_type idx=0; idx < num_descendants; ++idx ) { column_device_view child(source.child(idx)); + CUDF_EXPECTS( child._num_children==0, "column grand-children not currently supported"); cudaMemcpyAsync(p->d_children+idx, &child, sizeof(column_device_view), cudaMemcpyHostToDevice, stream); } p->_num_children = num_descendants; diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 0747e283318..1b1fb555161 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -27,21 +28,35 @@ namespace cudf { + // Create a strings-type column. // A strings-column has children columns to manage the variable-length // encoded character array. // Use the strings_column_handler class to perform strings operations // on this type of column. 
std::unique_ptr make_strings_column( - std::pair* strings, size_type count, cudaStream_t stream, + const rmm::device_vector>& strings, + cudaStream_t stream, rmm::mr::device_memory_resource* mr) { + size_type count = (size_type)strings.size(); // maybe a separate factory for creating null strings-column - CUDF_EXPECTS(count > 0, "must have at least one pair"); + CUDF_EXPECTS(count > 0, "must specify at least one pair"); auto execpol = rmm::exec_policy(stream); - auto strs = thrust::device_pointer_cast(reinterpret_cast*>(strings)); - auto d_strs = strs.get(); + auto d_strings = strings.data().get(); + + // check total size is not too large for cudf column + size_t bytes = thrust::transform_reduce( execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), + [d_strings] __device__ (size_t idx) { + auto item = d_strings[idx]; + return item.first ? item.second : (size_t)0; + }, + (size_t)0, + thrust::plus()); + CUDF_EXPECTS( bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column" ); // build offsets column auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, mr ); @@ -49,48 +64,24 @@ std::unique_ptr make_strings_column( thrust::transform_inclusive_scan( execpol->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), offsets_view.data(), - [d_strs] __device__ (size_type idx) { - thrust::pair item = d_strs[idx]; + [d_strings] __device__ (size_type idx) { + thrust::pair item = d_strings[idx]; return ( item.first ? 
(int32_t)item.second : 0 ); }, thrust::plus() ); - // get number of bytes (last offset value) - size_type bytes = thrust::device_pointer_cast(offsets_view.data())[count-1]; - - // count nulls - size_type null_count = thrust::transform_reduce( execpol->on(stream), - thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), - [d_strs] __device__ (size_type idx) { return (size_type)(d_strs[idx].first==nullptr); }, - 0, thrust::plus() ); + // create null mask + auto null_mask = valid_if( static_cast(nullptr), + [d_strings] __device__ (size_type idx) { return d_strings[idx].first!=nullptr; }, + count, stream ); // build null_mask - mask_state state = mask_state::UNINITIALIZED; - if( null_count==0 ) - state = mask_state::UNALLOCATED; - else if( null_count==count ) - state = mask_state::ALL_NULL; - auto null_mask = create_null_mask(count, state, stream, mr); - if( (null_count > 0) && (null_count < count) ) - { - uint8_t* d_null_mask = static_cast(null_mask.data()); - CUDA_TRY(cudaMemsetAsync(d_null_mask, 0, null_mask.size(), stream)); - thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), (count/8), - [d_strs, count, d_null_mask] __device__(size_type byte_idx) { - unsigned char byte = 0; // set one byte per thread -- init to all nulls - for( size_type i=0; i < 8; ++i ) - { - size_type idx = i + (byte_idx*8); // compute d_strs index - byte = byte >> 1; // shift until we are done - if( idx < count ) // check boundary - { - if( d_strs[idx].first ) - byte |= 128; // string is not null, set high bit - } - } - d_null_mask[byte_idx] = byte; - }); - } + //mask_state state = mask_state::UNINITIALIZED; + //if( null_count==0 ) + // state = mask_state::UNALLOCATED; + //else if( null_count==count ) + // state = mask_state::ALL_NULL; + //auto null_mask = create_null_mask(count, state, stream, mr); // build chars column auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); @@ -98,9 +89,9 @@ 
std::unique_ptr make_strings_column( auto d_chars = chars_view.data(); auto d_offsets = offsets_view.data(); thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, - [d_strs, d_offsets, d_chars] __device__(size_type idx){ + [d_strings, d_offsets, d_chars] __device__(size_type idx){ // place individual strings - auto item = d_strs[idx]; + auto item = d_strings[idx]; if( item.first ) { size_type offset = (idx ? d_offsets[idx-1] : 0); @@ -113,9 +104,10 @@ std::unique_ptr make_strings_column( children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(chars_column)); + // see column_view.cpp(45) to see why size must be 0 here return std::make_unique( data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, - null_mask, null_count, + rmm::device_buffer(null_mask.first,(size_type)null_mask.second), null_mask.second, std::move(children)); } diff --git a/cpp/src/strings/strings_column_handler.cu b/cpp/src/strings/strings_column_handler.cu index 8c04ee8c6f7..481747f903d 100644 --- a/cpp/src/strings/strings_column_handler.cu +++ b/cpp/src/strings/strings_column_handler.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -29,31 +30,32 @@ namespace cudf { // -strings_column_handler::strings_column_handler( const column_view& strings_column, rmm::mr::device_memory_resource* mr ) +strings_column_handler::strings_column_handler( column_view strings_column, + rmm::mr::device_memory_resource* mr ) : _parent(strings_column), _mr(mr) { - CUDF_EXPECTS( _parent.type().id()==STRING, "string_column_handler only support strings"); - CUDF_EXPECTS( _parent.num_children()>0, "string column must have children"); // revisit this (all nulls column?) + CUDF_EXPECTS( _parent.type().id()==STRING, "strings_column_handler only supports strings"); + CUDF_EXPECTS( _parent.num_children()>0, "strings column must have children"); // revisit this (all nulls column?) 
} -size_type strings_column_handler::count() const +size_type strings_column_handler::size() const { return _parent.child(0).size(); } -const char* strings_column_handler::chars_data() const +column_view strings_column_handler::parent_column() const { - return _parent.child(1).data(); + return _parent; } -const int32_t* strings_column_handler::offsets_data() const +column_view strings_column_handler::offsets_column() const { - return _parent.child(0).data(); + return _parent.child(0); } -size_type strings_column_handler::chars_column_size() const +column_view strings_column_handler::chars_column() const { - return _parent.child(1).size(); + return _parent.child(1); } const bitmask_type* strings_column_handler::null_mask() const @@ -66,11 +68,16 @@ size_type strings_column_handler::null_count() const return _parent.null_count(); } +rmm::mr::device_memory_resource* strings_column_handler::memory_resource() const +{ + return _mr; +} + // print strings to stdout void strings_column_handler::print( size_type start, size_type end, size_type max_width, const char* delimiter ) const { - size_type count = this->count(); + size_type count = size(); if( end < 0 || end > count ) end = count; if( start < 0 ) @@ -83,8 +90,8 @@ void strings_column_handler::print( size_type start, size_type end, auto execpol = rmm::exec_policy(0); auto strings_column = column_device_view::create(_parent); auto d_column = *strings_column; - auto d_offsets = offsets_data(); - auto d_strings = chars_data(); + auto d_offsets = offsets_column().data(); + auto d_strings = chars_column().data(); // create output strings offsets rmm::device_vector output_offsets(count,0); @@ -96,12 +103,13 @@ void strings_column_handler::print( size_type start, size_type end, return 0; size_type offset = idx ? 
d_offsets[idx-1] : 0; // this logic will be a template size_type bytes = d_offsets[idx] - offset; // specialization on element() - string_view dstr( d_strings + offset, bytes ); // method of column_device_view - if( (max_width > 0) && (dstr.characters() > max_width) ) - bytes = dstr.byte_offset_for(max_width); + string_view d_str( d_strings + offset, bytes ); // method of column_device_view + if( (max_width > 0) && (d_str.characters() > max_width) ) + bytes = d_str.byte_offset_for(max_width); return bytes+1; // allow for null-terminator on non-null strings }, thrust::plus()); + // build output buffer size_t buffer_size = output_offsets.back(); // last element has total size if( buffer_size == 0 ) @@ -147,12 +155,13 @@ void strings_column_handler::print( size_type start, size_type end, } // new strings column from subset of this strings instance -std::unique_ptr strings_column_handler::sublist( size_type start, size_type end, - size_type step, cudaStream_t stream ) +std::unique_ptr sublist( strings_column_handler handler, + size_type start, size_type end, + size_type step, cudaStream_t stream ) { if( step <= 0 ) step = 1; - size_type count = this->count(); + size_type count = handler.size(); if( end < 0 || end > count ) end = count; if( start < 0 || start > end ) @@ -160,38 +169,36 @@ std::unique_ptr strings_column_handler::sublist( size_type start, count = (end - start)/step +1; // auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(_parent, stream); - auto d_column = *strings_column; - // build indices thrust::device_vector indices(count); thrust::sequence( execpol->on(stream), indices.begin(), indices.end(), start, step ); - // should have a way to create a column_view with an existing memory buffer - auto indices_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, _mr ); - auto indices_view = indices_column->mutable_view(); - cudaMemcpyAsync( indices_view.data(), indices.data().get(), 
count*sizeof(int32_t), cudaMemcpyDeviceToDevice, stream); - - return gather(indices_view); + // create a column_view as a wrapper of these indices + column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); + // build a new strings column from the indices + return gather(handler, indices_view); } // return new strings column with strings from this instance as specified by the indices -std::unique_ptr strings_column_handler::gather( const column_view& indices, cudaStream_t stream ) +std::unique_ptr gather( strings_column_handler handler, + column_view gather_map, cudaStream_t stream ) { - size_type count = indices.size(); - auto d_indices = indices.data(); + size_type count = gather_map.size(); + auto d_indices = gather_map.data(); auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(_parent,stream); + auto strings_column = column_device_view::create(handler.parent_column(),stream); auto d_column = *strings_column; - auto d_offsets = offsets_data(); + auto d_offsets = handler.offsets_column().data(); // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, _mr ); + auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, + stream, handler.memory_resource() ); auto offsets_view = offsets_column->mutable_view(); auto d_new_offsets = offsets_view.data(); // create new offsets array thrust::transform_inclusive_scan( execpol->on(stream), - thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), d_new_offsets, [d_column, d_offsets, d_indices] __device__ (size_type idx) { size_type index = d_indices[idx]; @@ -201,42 +208,20 @@ std::unique_ptr strings_column_handler::gather( const column_view& return d_offsets[index] - offset; }, thrust::plus()); + // build null mask - size_type null_count = 
this->null_count(); - mask_state state = mask_state::UNINITIALIZED; - if( null_count==0 ) - state = mask_state::UNALLOCATED; - else if( null_count==count ) - state = mask_state::ALL_NULL; - auto null_mask = create_null_mask(count, state, stream, _mr); - if( (null_count > 0) && (null_count < count) ) - { - uint8_t* d_null_mask = static_cast(null_mask.data()); - CUDA_TRY(cudaMemsetAsync(d_null_mask, 0, null_mask.size(), stream)); - thrust::transform(execpol->on(stream), - thrust::make_counting_iterator(0), thrust::make_counting_iterator(count/8), - d_null_mask, - [d_column, count] __device__(size_type byte_idx) { - unsigned char byte = 0; // set one byte per thread -- init to all nulls - for( size_type i=0; i < 8; ++i ) - { - size_type idx = i + (byte_idx*8); // compute d_strs index - byte = byte >> 1; // shift until we are done - if( idx < count ) // check boundary - { - if( d_column.is_null(idx) ) - byte |= 128; // string is not null, set high bit - } - } - return byte; //d_null_mask[byte_idx] = byte; - }); - } + auto null_mask = valid_if( static_cast(nullptr), + [d_column, d_indices] __device__ (size_type idx) { + return !d_column.nullable() || !d_column.is_null(d_indices[idx]); + }, + count, stream ); // build chars column size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count-1]; // this may not be stream friendly - auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, _mr ); + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, + stream, handler.memory_resource() ); auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + auto d_chars = chars_view.data(); thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, [d_column, d_indices, d_new_offsets, d_chars] __device__(size_type idx){ // place individual strings @@ -253,20 +238,22 @@ std::unique_ptr strings_column_handler::gather( const column_view& 
children.emplace_back(std::move(chars_column)); return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,_mr}, - null_mask, null_count, - std::move(children)); + data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, + rmm::device_buffer(null_mask.first,(size_type)null_mask.second), null_mask.second, + std::move(children)); } // return sorted version of the given strings column -std::unique_ptr strings_column_handler::sort( sort_type stype, bool ascending, bool nullfirst, cudaStream_t stream ) +std::unique_ptr sort( strings_column_handler handler, + strings_column_handler::sort_type stype, + bool ascending, bool nullfirst, cudaStream_t stream ) { auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(_parent, stream); + auto strings_column = column_device_view::create(handler.parent_column(), stream); auto d_column = *strings_column; // lets sort indices - size_type count = this->count(); + size_type count = handler.size(); thrust::device_vector indices(count); thrust::sequence( execpol->on(stream), indices.begin(), indices.end() ); thrust::sort( execpol->on(stream), indices.begin(), indices.end(), @@ -281,16 +268,10 @@ std::unique_ptr strings_column_handler::sort( sort_type stype, boo return (ascending ? 
(cmp<0) : (cmp>0)); }); - // should have a way to create a column_view with an existing memory buffer - auto d_indices = indices.data().get(); - // we will create an empty one and pass in this data for now - auto indices_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, _mr ); - auto indices_view = indices_column->mutable_view(); - cudaMemcpyAsync( indices_view.data(), d_indices, count*sizeof(int32_t), cudaMemcpyDeviceToDevice, stream); - + // create a column_view as a wrapper of these indices + column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); // now build a new strings column from the indices - return gather( indices_view ); + return gather( handler, indices_view ); } - } // namespace cudf \ No newline at end of file From 96b260fa85c6290a410b2485c2818a3e7e58bbbe Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 17 Sep 2019 15:46:35 -0400 Subject: [PATCH 04/54] fix memory leak --- .../cudf/strings/strings_column_handler.hpp | 6 +++++- cpp/src/strings/strings_column_factories.cu | 16 ++++++---------- cpp/src/strings/strings_column_handler.cu | 18 +++++++++++++----- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/cpp/include/cudf/strings/strings_column_handler.hpp b/cpp/include/cudf/strings/strings_column_handler.hpp index 75f4e937195..67cb4e70231 100644 --- a/cpp/include/cudf/strings/strings_column_handler.hpp +++ b/cpp/include/cudf/strings/strings_column_handler.hpp @@ -97,6 +97,9 @@ class strings_column_handler }; +namespace strings +{ + /**---------------------------------------------------------------------------* * @brief Returns a new strings column created from a subset of * of this instance's strings column. 
@@ -137,4 +140,5 @@ std::unique_ptr sort( strings_column_handler handler, strings_column_handler::sort_type stype, bool ascending=true, bool nullfirst=true, cudaStream_t stream=0 ); -} +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 1b1fb555161..8a408819c60 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -71,17 +72,12 @@ std::unique_ptr make_strings_column( thrust::plus() ); // create null mask - auto null_mask = valid_if( static_cast(nullptr), + auto valid_mask = valid_if( static_cast(nullptr), [d_strings] __device__ (size_type idx) { return d_strings[idx].first!=nullptr; }, count, stream ); - - // build null_mask - //mask_state state = mask_state::UNINITIALIZED; - //if( null_count==0 ) - // state = mask_state::UNALLOCATED; - //else if( null_count==count ) - // state = mask_state::ALL_NULL; - //auto null_mask = create_null_mask(count, state, stream, mr); + auto null_count = valid_mask.second; + rmm::device_buffer null_mask(valid_mask.first,gdf_valid_allocation_size(count)); // does deep copy + RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future // build chars column auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); @@ -107,7 +103,7 @@ std::unique_ptr make_strings_column( // see column_view.cpp(45) to see why size must be 0 here return std::make_unique( data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, - rmm::device_buffer(null_mask.first,(size_type)null_mask.second), null_mask.second, + null_mask, null_count, std::move(children)); } diff --git a/cpp/src/strings/strings_column_handler.cu b/cpp/src/strings/strings_column_handler.cu index 481747f903d..7fa0fa7dedd 100644 --- a/cpp/src/strings/strings_column_handler.cu 
+++ b/cpp/src/strings/strings_column_handler.cu @@ -154,6 +154,9 @@ void strings_column_handler::print( size_type start, size_type end, } } +namespace strings +{ + // new strings column from subset of this strings instance std::unique_ptr sublist( strings_column_handler handler, size_type start, size_type end, @@ -210,11 +213,15 @@ std::unique_ptr gather( strings_column_handler handler, thrust::plus()); // build null mask - auto null_mask = valid_if( static_cast(nullptr), + auto valid_mask = valid_if( static_cast(nullptr), [d_column, d_indices] __device__ (size_type idx) { return !d_column.nullable() || !d_column.is_null(d_indices[idx]); }, count, stream ); + auto null_count = valid_mask.second; + auto null_size = gdf_valid_allocation_size(count); + rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy + RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future // build chars column size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count-1]; // this may not be stream friendly @@ -227,9 +234,9 @@ std::unique_ptr gather( strings_column_handler handler, // place individual strings if( d_column.nullable() && d_column.is_null(idx) ) return; - string_view dstr = d_column.element(d_indices[idx]); + string_view d_str = d_column.element(d_indices[idx]); size_type offset = (idx ? 
d_new_offsets[idx-1] : 0); - memcpy(d_chars + offset, dstr.data(), dstr.size() ); + memcpy(d_chars + offset, d_str.data(), d_str.size() ); }); // build children vector @@ -239,7 +246,7 @@ std::unique_ptr gather( strings_column_handler handler, return std::make_unique( data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, - rmm::device_buffer(null_mask.first,(size_type)null_mask.second), null_mask.second, + null_mask, null_count, std::move(children)); } @@ -274,4 +281,5 @@ std::unique_ptr sort( strings_column_handler handler, return gather( handler, indices_view ); } -} // namespace cudf \ No newline at end of file +} // namespace strings +} // namespace cudf \ No newline at end of file From 991889f7d298884ce1db1bf25ea1d50c7fceecc3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 17 Sep 2019 16:10:23 -0400 Subject: [PATCH 05/54] remove unintentional nvtext changes --- cpp/custrings/text/NVText.cu | 42 --------------------------------- cpp/include/nvstrings/NVText.h | 1 - python/nvstrings/cpp/pytext.cpp | 23 ------------------ python/nvstrings/nvtext.py | 6 ----- 4 files changed, 72 deletions(-) diff --git a/cpp/custrings/text/NVText.cu b/cpp/custrings/text/NVText.cu index 4de66bb1a5f..671c318e75e 100644 --- a/cpp/custrings/text/NVText.cu +++ b/cpp/custrings/text/NVText.cu @@ -170,45 +170,3 @@ NVStrings* NVText::scatter_count( NVStrings& strs, unsigned int* counts, bool bd // build strings object from elements return NVStrings::create_from_index((std::pair*)d_results,total_count); } - -// -unsigned int NVText::code_points( NVStrings& strs, unsigned int* results ) -{ - unsigned int count = strs.size(); - if( count==0 || results==nullptr ) - return 0; - - // - auto execpol = rmm::exec_policy(0); - rmm::device_vector strings(count,nullptr); - custring_view** d_strings = strings.data().get(); - strs.create_custring_index(d_strings); - - // get all the lengths to build the offsets - // offsets point to each individual range - 
rmm::device_vector offsets(count,0); - size_t* d_offsets = offsets.data().get(); - thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, - [d_strings, d_offsets] __device__(unsigned int idx){ - custring_view* dstr = d_strings[idx]; - if( dstr ) - d_offsets[idx] = dstr->chars_count(); - }); - thrust::inclusive_scan( execpol->on(0), offsets.begin(), offsets.end(), offsets.begin() ); - - // now set the ranges - auto d_results = results; - thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, - [d_strings, d_offsets, d_results] __device__(unsigned int idx){ - custring_view* dstr = d_strings[idx]; - if( !dstr ) - return; - auto offset = (idx ? d_offsets[idx-1] : 0); - auto result = d_results + offset; - for( auto itr = dstr->begin(); itr != dstr->end(); ++itr ) - *result++ = (unsigned int)*itr; - }); - // - unsigned int rtn = offsets[count-1]; - return rtn; -} diff --git a/cpp/include/nvstrings/NVText.h b/cpp/include/nvstrings/NVText.h index 3fddf4777c3..2fdff32c61d 100644 --- a/cpp/include/nvstrings/NVText.h +++ b/cpp/include/nvstrings/NVText.h @@ -171,5 +171,4 @@ class NVText * @return New strings instance with appropriate scattered elements. 
*/ static NVStrings* scatter_count( NVStrings& strs, unsigned int* counts, bool devmem=true ); - static unsigned int code_points( NVStrings& strs, unsigned int* results ); }; diff --git a/python/nvstrings/cpp/pytext.cpp b/python/nvstrings/cpp/pytext.cpp index 95231eb06ee..423ed1d7185 100644 --- a/python/nvstrings/cpp/pytext.cpp +++ b/python/nvstrings/cpp/pytext.cpp @@ -649,28 +649,6 @@ static PyObject* n_scatter_count( PyObject* self, PyObject* args ) return PyLong_FromVoidPtr((void*)strs); } -static PyObject* n_code_points( PyObject* self, PyObject* args ) -{ - PyObject* pystrs = PyTuple_GetItem(args,0); - NVStrings* strs = strings_from_object(pystrs); - if( strs==0 ) - Py_RETURN_NONE; - - PyObject* pyresults = PyTuple_GetItem(args,1); - std::string name = pyresults->ob_type->tp_name; - if( name.compare("int")!=0 ) - { - printf("results must be device pointer\n"); - Py_RETURN_NONE; - } - unsigned int* results = (unsigned int*)PyLong_AsVoidPtr(pyresults); - - Py_BEGIN_ALLOW_THREADS - NVText::code_points(*strs,results); - Py_END_ALLOW_THREADS - Py_RETURN_NONE; -} - // static PyMethodDef s_Methods[] = { { "n_tokenize", n_tokenize, METH_VARARGS, "" }, @@ -685,7 +663,6 @@ static PyMethodDef s_Methods[] = { { "n_edit_distance", n_edit_distance, METH_VARARGS, "" }, { "n_create_ngrams", n_create_ngrams, METH_VARARGS, "" }, { "n_scatter_count", n_scatter_count, METH_VARARGS, "" }, - { "n_code_points", n_code_points, METH_VARARGS, "" }, { NULL, NULL, 0, NULL } }; diff --git a/python/nvstrings/nvtext.py b/python/nvstrings/nvtext.py index 40c98929ada..8420ba48a58 100644 --- a/python/nvstrings/nvtext.py +++ b/python/nvstrings/nvtext.py @@ -346,9 +346,3 @@ def scatter_count(strs, counts): if rtn is not None: rtn = nvs.nvstrings(rtn) return rtn - -def code_points(strs, results): - if results is None: - raise ValueError("results must be device pointer") - pyniNVText.n_code_points(strs, results) - From 6482affba72ff065c1f23577882faf28fb9a82dc Mon Sep 17 00:00:00 2001 From: David 
Wendt Date: Wed, 18 Sep 2019 14:06:22 -0400 Subject: [PATCH 06/54] added strings array.cu and attributes.cu source --- cpp/CMakeLists.txt | 3 + cpp/include/cudf/strings/string_view.cuh | 2 +- .../cudf/strings/strings_column_handler.hpp | 110 ++++++- cpp/src/strings/array.cu | 272 ++++++++++++++++++ cpp/src/strings/attributes.cu | 86 ++++++ cpp/src/strings/strings_column_handler.cu | 126 -------- cpp/src/strings/utilities.cu | 79 +++++ cpp/src/strings/utilities.h | 48 ++++ 8 files changed, 588 insertions(+), 138 deletions(-) create mode 100644 cpp/src/strings/array.cu create mode 100644 cpp/src/strings/attributes.cu create mode 100644 cpp/src/strings/utilities.cu create mode 100644 cpp/src/strings/utilities.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7b888c9f2b1..35b496ce391 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -428,6 +428,9 @@ add_library(cudf src/sort/sort.cu src/strings/strings_column_factories.cu src/strings/strings_column_handler.cu + src/strings/array.cu + src/strings/attributes.cu + src/strings/utilities.cu src/column/legacy/interop.cpp) # Rename installation to proper names for later finding diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index b6bb6ba8d81..ef4d2470fa1 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -43,7 +43,7 @@ class string_view * @param data Device char array encoded in UTF8. * @param bytes Number of bytes in data array. *---------------------------------------------------------------------------**/ - __device__ string_view(const char* data, size_type bytes); + __host__ __device__ string_view(const char* data, size_type bytes); /**---------------------------------------------------------------------------* * @brief Create instance from existing device char array. The array must * include a null-terminator ('\0). 
diff --git a/cpp/include/cudf/strings/strings_column_handler.hpp b/cpp/include/cudf/strings/strings_column_handler.hpp index 67cb4e70231..5b5c9ff36c5 100644 --- a/cpp/include/cudf/strings/strings_column_handler.hpp +++ b/cpp/include/cudf/strings/strings_column_handler.hpp @@ -15,8 +15,6 @@ */ #pragma once -#include - #include #include #include @@ -77,9 +75,9 @@ class strings_column_handler * @param start Index of first string to print. * @param end Index of last string to print. Specify -1 for all strings. * @param max_width Maximum number of characters to print per string. - * Specify -1 to print all characters. + * Specify -1 to print all characters. * @param delimiter The chars to print between each string. - * Default is new-line character. + * Default is new-line character. *---------------------------------------------------------------------------**/ void print( size_type start=0, size_type end=-1, size_type max_width=-1, const char* delimiter = "\n" ) const; @@ -100,31 +98,49 @@ class strings_column_handler namespace strings { +// array.cu /**---------------------------------------------------------------------------* * @brief Returns a new strings column created from a subset of * of this instance's strings column. * + * @code + * s1 = ["a", "b", "c", "d", "e", "f"] + * s2 = sublist( s1, 2 ) + * s2 is ["c", "d", "e", "f"] + * @endcode + * * @param start Index of first string to use. * @param end Index of last string to use. + * Default -1 indicates the last element. * @param step Increment value between indexes. + * Default step is 1. * @param stream CUDA stream to use kernels in this method. * @return New strings column of size (end-start)/step. 
*---------------------------------------------------------------------------**/ std::unique_ptr sublist( strings_column_handler handler, - size_type start, size_type end, - size_type step, cudaStream_t stream=0 ); + size_type start, size_type end=-1, + size_type step=1, + cudaStream_t stream=(cudaStream_t)0 ); /**---------------------------------------------------------------------------* * @brief Returns a new strings column created this strings instance using * the specified indices to select the strings. + * + * @code + * s1 = ["a", "b", "c", "d", "e", "f"] + * map = [0, 2] + * s2 = gather( s1, map ) + * s2 is ["a", "c"] + * @endcode * - * @param indices The indices with which to select strings for the new column. - * Values must be within [0,count()) range. + * @param gather_map The indices with which to select strings for the new column. + * Values must be within [0,size()) range. * @param stream CUDA stream to use kernels in this method. * @return New strings column of size indices.size() *---------------------------------------------------------------------------**/ std::unique_ptr gather( strings_column_handler handler, - column_view gather_map, cudaStream_t stream=0 ); + cudf::column_view gather_map, + cudaStream_t stream=(cudaStream_t)0 ); /**---------------------------------------------------------------------------* * @brief Returns a new strings column that is a sorted version of the @@ -137,8 +153,80 @@ std::unique_ptr gather( strings_column_handler handler, * @return New strings column with sorted elements of this instance. 
*---------------------------------------------------------------------------**/ std::unique_ptr sort( strings_column_handler handler, - strings_column_handler::sort_type stype, bool ascending=true, - bool nullfirst=true, cudaStream_t stream=0 ); + strings_column_handler::sort_type stype, + bool ascending=true, + bool nullfirst=true, + cudaStream_t stream=(cudaStream_t)0 ); + +/** + * @brief Returns new instance using the provided map values and strings. + * The map values specify the location in the new strings instance. + * Missing values pass through from the handler instance into those positions. + * + * @code + * s1 = ["a", "b", "c", "d"] + * s2 = ["e", "f"] + * map = [1, 3] + * s3 = scatter( s1, s2, m1 ) + * s3 is ["a", "e", "c", "f"] + * @endcode + * + * @param[in] strings The instance for which to retrieve the values + * specified in map column. + * @param[in] scatter_map The 0-based index values to retrieve from the + * strings parameter. Number of values must equal the number + * of elements in strings pararameter (strings.size()). + * @param stream CUDA stream to use kernels in this method. + * @return New instance with the specified strings. + */ +std::unique_ptr scatter( strings_column_handler handler, + strings_column_handler strings, + cudf::column_view scatter_map, + cudaStream_t stream=(cudaStream_t)0 ); +/** + * @brief Returns new instance using the provided index values and a + * single string. The map values specify where to place the string + * in the new strings instance. Missing values pass through from + * the handler instance at those positions. + * + * @code + * s1 = ["a", "b", "c", "d"] + * map = [1, 3] + * s2 = scatter( s1, "e", m1 ) + * s2 is ["a", "e", "c", "e"] + * @endcode + * + * @param[in] string The string to place in according to the scatter_map. + * @param[in] scatter_map The 0-based index values to place the given string. + * @return New instance with the specified strings. 
+ */ +std::unique_ptr scatter( strings_column_handler handler, + const char* string, + cudf::column_view scatter_map, + cudaStream_t stream=(cudaStream_t)0 ); + +// attributes.cu +/**---------------------------------------------------------------------------* + * @brief Returns the number of bytes for each string in a strings column. + * Null strings will have a byte count of 0. + * + * @param stream CUDA stream to use kernels in this method. + * @return Numeric column of type int32. + *---------------------------------------------------------------------------**/ +std::unique_ptr bytes_counts( strings_column_handler handler, + cudaStream_t stream=(cudaStream_t)0 ); + +/**---------------------------------------------------------------------------* + * @brief Returns the number of characters for each string in a strings column. + * Null strings will have a count of 0. The number of characters is not the + * same as the number of bytes if multi-byte encoded characters make up a + * string. + * + * @param stream CUDA stream to use kernels in this method. + * @return Numeric column of type int32. + *---------------------------------------------------------------------------**/ +std::unique_ptr characters_counts( strings_column_handler handler, + cudaStream_t stream=(cudaStream_t)0 ); } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu new file mode 100644 index 00000000000..97cb52c0490 --- /dev/null +++ b/cpp/src/strings/array.cu @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include "./utilities.h" + +#include +#include +#include +#include + +namespace cudf +{ +namespace strings +{ + +// new strings column from subset of this strings instance +std::unique_ptr sublist( strings_column_handler handler, + size_type start, size_type end, + size_type step, cudaStream_t stream ) +{ + if( step <= 0 ) + step = 1; + size_type count = handler.size(); + if( end < 0 || end > count ) + end = count; + if( start < 0 || start > end ) + throw std::invalid_argument("invalid start parameter"); + count = (end - start)/step +1; + // + auto execpol = rmm::exec_policy(stream); + // build indices + thrust::device_vector indices(count); + thrust::sequence( execpol->on(stream), indices.begin(), indices.end(), start, step ); + // create a column_view as a wrapper of these indices + column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); + // build a new strings column from the indices + return gather(handler, indices_view); +} + +// return new strings column with strings from this instance as specified by the indices +std::unique_ptr gather( strings_column_handler handler, + column_view gather_map, cudaStream_t stream ) +{ + size_type count = gather_map.size(); + auto d_indices = gather_map.data(); + + auto execpol = rmm::exec_policy(stream); + auto strings_column = column_device_view::create(handler.parent_column(),stream); + auto d_column = *strings_column; + auto d_offsets = handler.offsets_column().data(); + + // build offsets column + auto 
offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, + stream, handler.memory_resource() ); + auto offsets_view = offsets_column->mutable_view(); + auto d_new_offsets = offsets_view.data(); + // create new offsets array + thrust::transform_inclusive_scan( execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), + d_new_offsets, + [d_column, d_offsets, d_indices] __device__ (size_type idx) { + size_type index = d_indices[idx]; + if( d_column.nullable() && d_column.is_null(index) ) + return 0; + size_type offset = index ? d_offsets[index-1] : 0; + return d_offsets[index] - offset; + }, + thrust::plus()); + + // build null mask + auto valid_mask = valid_if( static_cast(nullptr), + [d_column, d_indices] __device__ (size_type idx) { + return !d_column.nullable() || !d_column.is_null(d_indices[idx]); + }, + count, stream ); + auto null_count = valid_mask.second; + auto null_size = gdf_valid_allocation_size(count); + rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy + RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future + + // build chars column + size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count-1]; // this may not be stream friendly + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, + stream, handler.memory_resource() ); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, + [d_column, d_indices, d_new_offsets, d_chars] __device__(size_type idx){ + // place individual strings + if( d_column.nullable() && d_column.is_null(idx) ) + return; + string_view d_str = d_column.element(d_indices[idx]); + size_type offset = (idx ? 
d_new_offsets[idx-1] : 0); + memcpy(d_chars + offset, d_str.data(), d_str.size() ); + }); + + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + return std::make_unique( + data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, + null_mask, null_count, + std::move(children)); +} + +// return sorted version of the given strings column +std::unique_ptr sort( strings_column_handler handler, + strings_column_handler::sort_type stype, + bool ascending, bool nullfirst, cudaStream_t stream ) +{ + auto execpol = rmm::exec_policy(stream); + auto strings_column = column_device_view::create(handler.parent_column(), stream); + auto d_column = *strings_column; + + // lets sort indices + size_type count = handler.size(); + thrust::device_vector indices(count); + thrust::sequence( execpol->on(stream), indices.begin(), indices.end() ); + thrust::sort( execpol->on(stream), indices.begin(), indices.end(), + [d_column, stype, ascending, nullfirst] __device__ (size_type lhs, size_type rhs) { + bool lhs_null{d_column.nullable() && d_column.is_null(lhs)}; + bool rhs_null{d_column.nullable() && d_column.is_null(rhs)}; + if( lhs_null || rhs_null ) + return (nullfirst ? !rhs_null : !lhs_null); + string_view lhs_str = d_column.element(lhs); + string_view rhs_str = d_column.element(rhs); + int cmp = lhs_str.compare(rhs_str); + return (ascending ? 
(cmp<0) : (cmp>0)); + }); + + // create a column_view as a wrapper of these indices + column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); + // now build a new strings column from the indices + return gather( handler, indices_view ); +} + +// +// s1 = ['a','b,'c','d'] +// s2 = ['e','f'] +// pos = [1,3] -- must be the same length as s2 +// s3 = s1.scatter(s2,pos) +// ['a','e','c','f'] +// +std::unique_ptr scatter( strings_column_handler handler, + strings_column_handler strings, + cudf::column_view scatter_map, + cudaStream_t stream ) +{ + size_type elements = strings.size(); + CUDF_EXPECTS( elements==scatter_map.size(), "number of strings must match map size" ); + size_type count = handler.size(); + auto d_indices = scatter_map.data(); + auto execpol = rmm::exec_policy(stream); + + // + rmm::device_buffer buffer = create_string_array_from_column(handler,stream); + cudf::string_view* d_strings = reinterpret_cast(buffer.data()); + rmm::device_buffer map_buffer = create_string_array_from_column(strings,stream); + cudf::string_view* d_map_strings = reinterpret_cast(map_buffer.data()); + thrust::scatter( execpol->on(stream), d_map_strings, d_map_strings+elements, d_indices, d_strings ); + + // build offsets column + auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, + stream, handler.memory_resource() ); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.data(); + // create new offsets array + thrust::transform_inclusive_scan( execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), + d_offsets, + [d_strings, d_offsets] __device__ (size_type idx) { + return d_strings[idx].size(); + }, + thrust::plus()); + + // build null mask + auto valid_mask = valid_if( static_cast(nullptr), + [d_strings] __device__ (size_type idx) { return !d_strings[idx].is_null(); }, + count, stream ); + auto null_count = valid_mask.second; + auto 
null_size = gdf_valid_allocation_size(count); + rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy + RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future + + // build chars column + size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; // this may not be stream friendly + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, + stream, handler.memory_resource() ); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, + [d_strings, d_offsets, d_chars] __device__(size_type idx){ + cudf::string_view d_str = d_strings[idx]; + if( d_str.is_null() ) + return; + size_type offset = (idx ? d_offsets[idx-1] : 0); + memcpy(d_chars + offset, d_str.data(), d_str.size() ); + }); + + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + return std::make_unique( + data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, + null_mask, null_count, + std::move(children)); +} + +// +// s1 = ['a','b,'c','d'] +// pos = [1,3] +// s3 = s1.scatter('e',pos,2) +// ['a','e','c','e'] +// +std::unique_ptr scatter( strings_column_handler handler, + const char* string, + cudf::column_view scatter_map, + cudaStream_t stream ) +{ +// size_type count = size(); +// size_type elements = scatter_map.size(); +// auto execpol = rmm::exec_policy(0); +// // copy string to device +// auto d_string = string_from_host(string); +// cudf::string_view* d_replace = *d_string; +// // create result output array +// rmm::device_vector results(count,nullptr); +// auto d_results = results.data().get(); +// custring_view_array d_strings = pImpl->getStringsPtr(); +// thrust::copy( execpol->on(0), d_strings, d_strings+count, d_results ); +// 
thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), elements, +// [d_pos, count, d_repl, d_results] __device__ (unsigned int idx) { +// int pos = d_pos[idx]; +// if( (pos >= 0) && (pos < count) ) +// d_results[pos] = d_repl; +// }); +// // build resulting instance +// NVStrings* rtn = new NVStrings(count); +// NVStrings_init_from_custrings(rtn->pImpl, d_results, count); +// if( !bdevmem ) +// RMM_FREE((void*)d_pos,0); +// RMM_FREE((void*)d_repl,0); + return nullptr; +} + + +} // namespace strings +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu new file mode 100644 index 00000000000..6a29013716f --- /dev/null +++ b/cpp/src/strings/attributes.cu @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +namespace cudf +{ +namespace strings +{ + +std::unique_ptr characters_counts( strings_column_handler handler, + cudaStream_t stream ) +{ + size_type count = handler.size(); + auto execpol = rmm::exec_policy(stream); + auto strings_column = column_device_view::create(handler.parent_column(),stream); + auto d_column = *strings_column; + // create output column + auto result = std::make_unique( data_type{INT32}, count, + rmm::device_buffer(count * sizeof(int32_t), stream, handler.memory_resource()), + rmm::device_buffer(d_column.null_mask(), gdf_valid_allocation_size(count), + stream, handler.memory_resource()), + d_column.null_count()); + auto results_view = result->mutable_view(); + auto d_lengths = results_view.data(); + // set lengths + thrust::transform( execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), + d_lengths, + [d_column] __device__ (int32_t idx) { + if( d_column.nullable() && d_column.is_null(idx) ) + return 0; + return d_column.element(idx).characters(); + }); + return result; +} + +std::unique_ptr bytes_counts( strings_column_handler handler, + cudaStream_t stream ) +{ + size_type count = handler.size(); + auto execpol = rmm::exec_policy(stream); + auto strings_column = column_device_view::create(handler.parent_column(),stream); + auto d_column = *strings_column; + // create output column + auto result = std::make_unique( data_type{INT32}, count, + rmm::device_buffer(count * sizeof(int32_t), stream, handler.memory_resource()), + rmm::device_buffer(d_column.null_mask(), gdf_valid_allocation_size(count), + stream, handler.memory_resource()), + d_column.null_count()); + auto results_view = result->mutable_view(); + auto d_lengths = results_view.data(); + // set sizes + thrust::transform( execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), + d_lengths, + [d_column] __device__ (int32_t idx) { + if( 
d_column.nullable() && d_column.is_null(idx) ) + return 0; + return d_column.element(idx).size(); + }); + return result; +} + + +} // namespace strings +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/strings_column_handler.cu b/cpp/src/strings/strings_column_handler.cu index 7fa0fa7dedd..e17bb1327c7 100644 --- a/cpp/src/strings/strings_column_handler.cu +++ b/cpp/src/strings/strings_column_handler.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -23,8 +22,6 @@ #include #include -#include -#include #include namespace cudf { @@ -157,129 +154,6 @@ void strings_column_handler::print( size_type start, size_type end, namespace strings { -// new strings column from subset of this strings instance -std::unique_ptr sublist( strings_column_handler handler, - size_type start, size_type end, - size_type step, cudaStream_t stream ) -{ - if( step <= 0 ) - step = 1; - size_type count = handler.size(); - if( end < 0 || end > count ) - end = count; - if( start < 0 || start > end ) - throw std::invalid_argument("invalid start parameter"); - count = (end - start)/step +1; - // - auto execpol = rmm::exec_policy(stream); - // build indices - thrust::device_vector indices(count); - thrust::sequence( execpol->on(stream), indices.begin(), indices.end(), start, step ); - // create a column_view as a wrapper of these indices - column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); - // build a new strings column from the indices - return gather(handler, indices_view); -} - -// return new strings column with strings from this instance as specified by the indices -std::unique_ptr gather( strings_column_handler handler, - column_view gather_map, cudaStream_t stream ) -{ - size_type count = gather_map.size(); - auto d_indices = gather_map.data(); - - auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent_column(),stream); - auto 
d_column = *strings_column; - auto d_offsets = handler.offsets_column().data(); - - // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, - stream, handler.memory_resource() ); - auto offsets_view = offsets_column->mutable_view(); - auto d_new_offsets = offsets_view.data(); - // create new offsets array - thrust::transform_inclusive_scan( execpol->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), - d_new_offsets, - [d_column, d_offsets, d_indices] __device__ (size_type idx) { - size_type index = d_indices[idx]; - if( d_column.nullable() && d_column.is_null(index) ) - return 0; - size_type offset = index ? d_offsets[index-1] : 0; - return d_offsets[index] - offset; - }, - thrust::plus()); - - // build null mask - auto valid_mask = valid_if( static_cast(nullptr), - [d_column, d_indices] __device__ (size_type idx) { - return !d_column.nullable() || !d_column.is_null(d_indices[idx]); - }, - count, stream ); - auto null_count = valid_mask.second; - auto null_size = gdf_valid_allocation_size(count); - rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy - RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future - - // build chars column - size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count-1]; // this may not be stream friendly - auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, - stream, handler.memory_resource() ); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, - [d_column, d_indices, d_new_offsets, d_chars] __device__(size_type idx){ - // place individual strings - if( d_column.nullable() && d_column.is_null(idx) ) - return; - string_view d_str = d_column.element(d_indices[idx]); - size_type offset = (idx ? 
d_new_offsets[idx-1] : 0); - memcpy(d_chars + offset, d_str.data(), d_str.size() ); - }); - - // build children vector - std::vector> children; - children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); - - return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, - null_mask, null_count, - std::move(children)); -} - -// return sorted version of the given strings column -std::unique_ptr sort( strings_column_handler handler, - strings_column_handler::sort_type stype, - bool ascending, bool nullfirst, cudaStream_t stream ) -{ - auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent_column(), stream); - auto d_column = *strings_column; - - // lets sort indices - size_type count = handler.size(); - thrust::device_vector indices(count); - thrust::sequence( execpol->on(stream), indices.begin(), indices.end() ); - thrust::sort( execpol->on(stream), indices.begin(), indices.end(), - [d_column, stype, ascending, nullfirst] __device__ (size_type lhs, size_type rhs) { - bool lhs_null{d_column.nullable() && d_column.is_null(lhs)}; - bool rhs_null{d_column.nullable() && d_column.is_null(rhs)}; - if( lhs_null || rhs_null ) - return (nullfirst ? !rhs_null : !lhs_null); - string_view lhs_str = d_column.element(lhs); - string_view rhs_str = d_column.element(rhs); - int cmp = lhs_str.compare(rhs_str); - return (ascending ? 
(cmp<0) : (cmp>0)); - }); - - // create a column_view as a wrapper of these indices - column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); - // now build a new strings column from the indices - return gather( handler, indices_view ); -} } // namespace strings } // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu new file mode 100644 index 00000000000..47364009df9 --- /dev/null +++ b/cpp/src/strings/utilities.cu @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include "./utilities.h" + +#include + +namespace cudf +{ +namespace strings +{ + +// Used to build a temporary string_view object from a single host string. +// It will create a single piece of device memory that includes +// the string_view instance and the string data. 
+std::unique_ptr> + string_from_host( const char* str, cudaStream_t stream ) +{ + if( !str ) + return nullptr; + size_type length = (size_type)std::strlen(str); + size_type bytes = sizeof(cudf::string_view) + length; + + char* d_data; + RMM_TRY(RMM_ALLOC( &d_data, bytes, stream )); + char* d_str = d_data + sizeof(cudf::string_view); + cudf::string_view tmp{d_str,length}; + std::vector h_data(bytes); + memcpy( h_data.data(), &tmp, sizeof(cudf::string_view) ); + memcpy( h_data.data() + sizeof(cudf::string_view), str, length ); + CUDA_TRY(cudaMemcpyAsync( d_data, h_data.data(), bytes, + cudaMemcpyHostToDevice, stream )); + CUDA_TRY(cudaStreamSynchronize(stream)); + auto deleter = [](cudf::string_view* sv) { RMM_FREE(sv,0); }; + return std::unique_ptr{reinterpret_cast(d_data),deleter}; +} + +rmm::device_buffer create_string_array_from_column( + strings_column_handler handler, + cudaStream_t stream ) +{ + auto execpol = rmm::exec_policy(stream); + auto strings_column = column_device_view::create(handler.parent_column(),stream); + auto d_column = *strings_column; + + auto count = handler.size(); + rmm::device_buffer buffer( count*sizeof(cudf::string_view), stream, handler.memory_resource() ); + cudf::string_view* d_strings = reinterpret_cast(buffer.data()); + thrust::for_each_n( execpol->on(stream), + thrust::make_counting_iterator(0), count, + [d_column, d_strings] __device__ (size_type idx) { + if( d_column.nullable() && d_column.is_null(idx) ) + d_strings[idx] = cudf::string_view(nullptr,0); + else + d_strings[idx] = d_column.element(idx); + }); + + return buffer; +} + +} // namespace strings +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/utilities.h b/cpp/src/strings/utilities.h new file mode 100644 index 00000000000..7483e3074a1 --- /dev/null +++ b/cpp/src/strings/utilities.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +namespace cudf +{ +namespace strings +{ + +/** + * @brief Creates a temporary string_view object from a host string. + * The host string is copied into device memory and the object + * pointer can be used in device code. + * + * @param[in] str Null-terminated, encoded string in CPU memory. + * @param[in] stream Stream to execute any device code against. + * @return Device object pointer. 
+ */ +std::unique_ptr> + string_from_host( const char* str, cudaStream_t stream=0 ); + +/** + * + */ +rmm::device_buffer create_string_array_from_column( + strings_column_handler strings, + cudaStream_t stream = (cudaStream_t)0 ); + +} // namespace strings +} // namespace cudf \ No newline at end of file From 8e881d961e7d97681c49d4e8192d9632541f2bda Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 19 Sep 2019 14:48:23 -0400 Subject: [PATCH 07/54] add child column create utility --- cpp/include/cudf/strings/string_view.cuh | 6 +- cpp/src/strings/array.cu | 70 +++++++---------- cpp/src/strings/strings_column_handler.cu | 7 -- cpp/src/strings/utilities.cu | 92 ++++++++++++++++++----- cpp/src/strings/utilities.h | 24 +++++- 5 files changed, 127 insertions(+), 72 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index ef4d2470fa1..68d02237fd5 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -60,11 +60,11 @@ class string_view /**---------------------------------------------------------------------------* * @brief Return the number of bytes in this string *---------------------------------------------------------------------------**/ - __device__ size_type size() const; + __host__ __device__ size_type size() const; /**---------------------------------------------------------------------------* * @brief Return the number of bytes in this string *---------------------------------------------------------------------------**/ - __device__ size_type length() const; + __host__ __device__ size_type length() const; /**---------------------------------------------------------------------------* * @brief Return the number of characters (UTF-8) in this string *---------------------------------------------------------------------------**/ @@ -72,7 +72,7 @@ class string_view /**---------------------------------------------------------------------------* * @brief 
Return a pointer to the internal device array *---------------------------------------------------------------------------**/ - __device__ const char* data() const; + __host__ __device__ const char* data() const; /**---------------------------------------------------------------------------* * @brief Return true if string has no characters diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 97cb52c0490..b62b9d423b5 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -113,12 +113,12 @@ std::unique_ptr gather( strings_column_handler handler, memcpy(d_chars + offset, d_str.data(), d_str.size() ); }); - // build children vector - std::vector> children; - children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); - return std::make_unique( + return std::make_unique( data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, null_mask, null_count, std::move(children)); @@ -163,37 +163,32 @@ std::unique_ptr sort( strings_column_handler handler, // ['a','e','c','f'] // std::unique_ptr scatter( strings_column_handler handler, - strings_column_handler strings, + strings_column_handler map_strings, cudf::column_view scatter_map, cudaStream_t stream ) { - size_type elements = strings.size(); + size_type elements = map_strings.size(); CUDF_EXPECTS( elements==scatter_map.size(), "number of strings must match map size" ); size_type count = handler.size(); auto d_indices = scatter_map.data(); auto execpol = rmm::exec_policy(stream); - // - rmm::device_buffer buffer = create_string_array_from_column(handler,stream); - cudf::string_view* d_strings = reinterpret_cast(buffer.data()); - rmm::device_buffer map_buffer = create_string_array_from_column(strings,stream); - cudf::string_view* d_map_strings = 
reinterpret_cast(map_buffer.data()); - thrust::scatter( execpol->on(stream), d_map_strings, d_map_strings+elements, d_indices, d_strings ); + // create strings arrays + rmm::device_vector strings = + detail::create_string_array_from_column(handler,stream); + cudf::string_view* d_strings = strings.data().get(); + rmm::device_vector values = + detail::create_string_array_from_column(map_strings,stream); + cudf::string_view* d_values = values.data().get(); + // do the scatter + thrust::scatter( execpol->on(stream), + d_values, d_values+elements, + d_indices, d_strings ); // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, - stream, handler.memory_resource() ); - auto offsets_view = offsets_column->mutable_view(); + auto offsets_column = detail::offsets_column_from_string_array(strings,stream,handler.memory_resource()); + auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); - // create new offsets array - thrust::transform_inclusive_scan( execpol->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), - d_offsets, - [d_strings, d_offsets] __device__ (size_type idx) { - return d_strings[idx].size(); - }, - thrust::plus()); // build null mask auto valid_mask = valid_if( static_cast(nullptr), @@ -206,25 +201,16 @@ std::unique_ptr scatter( strings_column_handler handler, // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; // this may not be stream friendly - auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, - stream, handler.memory_resource() ); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, - [d_strings, d_offsets, d_chars] __device__(size_type idx){ - cudf::string_view d_str = d_strings[idx]; - if( d_str.is_null() ) - return; - size_type offset = 
(idx ? d_offsets[idx-1] : 0); - memcpy(d_chars + offset, d_str.data(), d_str.size() ); - }); + auto chars_column = + detail::chars_column_from_string_array(strings, d_offsets, + stream, handler.memory_resource()); - // build children vector - std::vector> children; - children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); - return std::make_unique( + return std::make_unique( data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, null_mask, null_count, std::move(children)); diff --git a/cpp/src/strings/strings_column_handler.cu b/cpp/src/strings/strings_column_handler.cu index e17bb1327c7..bd53b674f93 100644 --- a/cpp/src/strings/strings_column_handler.cu +++ b/cpp/src/strings/strings_column_handler.cu @@ -14,9 +14,7 @@ * limitations under the License. */ -#include #include -#include #include #include #include @@ -151,9 +149,4 @@ void strings_column_handler::print( size_type start, size_type end, } } -namespace strings -{ - - -} // namespace strings } // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 47364009df9..7aaf6c39a08 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -15,45 +15,44 @@ */ #include +#include #include #include #include "./utilities.h" #include +#include +#include namespace cudf { namespace strings { +namespace detail +{ // Used to build a temporary string_view object from a single host string. -// It will create a single piece of device memory that includes -// the string_view instance and the string data. 
std::unique_ptr> string_from_host( const char* str, cudaStream_t stream ) { if( !str ) return nullptr; size_type length = (size_type)std::strlen(str); - size_type bytes = sizeof(cudf::string_view) + length; - char* d_data; - RMM_TRY(RMM_ALLOC( &d_data, bytes, stream )); - char* d_str = d_data + sizeof(cudf::string_view); - cudf::string_view tmp{d_str,length}; - std::vector h_data(bytes); - memcpy( h_data.data(), &tmp, sizeof(cudf::string_view) ); - memcpy( h_data.data() + sizeof(cudf::string_view), str, length ); - CUDA_TRY(cudaMemcpyAsync( d_data, h_data.data(), bytes, + char* d_str; + RMM_TRY(RMM_ALLOC( &d_str, length, stream )); + CUDA_TRY(cudaMemcpyAsync( d_str, str, length, cudaMemcpyHostToDevice, stream )); CUDA_TRY(cudaStreamSynchronize(stream)); - auto deleter = [](cudf::string_view* sv) { RMM_FREE(sv,0); }; + + auto deleter = [](cudf::string_view* sv) { RMM_FREE(const_cast(sv->data()),0); }; return std::unique_ptr{reinterpret_cast(d_data),deleter}; + decltype(deleter)>{ new cudf::string_view(d_str,length), deleter}; } -rmm::device_buffer create_string_array_from_column( - strings_column_handler handler, +// build an array of string_view objects from a strings column +rmm::device_vector create_string_array_from_column( + cudf::strings_column_handler handler, cudaStream_t stream ) { auto execpol = rmm::exec_policy(stream); @@ -61,8 +60,8 @@ rmm::device_buffer create_string_array_from_column( auto d_column = *strings_column; auto count = handler.size(); - rmm::device_buffer buffer( count*sizeof(cudf::string_view), stream, handler.memory_resource() ); - cudf::string_view* d_strings = reinterpret_cast(buffer.data()); + rmm::device_vector strings(count); + cudf::string_view* d_strings = strings.data().get(); thrust::for_each_n( execpol->on(stream), thrust::make_counting_iterator(0), count, [d_column, d_strings] __device__ (size_type idx) { @@ -72,8 +71,65 @@ rmm::device_buffer create_string_array_from_column( d_strings[idx] = d_column.element(idx); }); - return 
buffer; + return strings; +} + +// build a strings offsets column from an array of string_views +std::unique_ptr offsets_column_from_string_array( + const rmm::device_vector& strings, + cudaStream_t stream, rmm::mr::device_memory_resource* mr ) +{ + size_type count = strings.size(); + auto d_strings = strings.data().get(); + auto execpol = rmm::exec_policy(stream); + auto offsets_column = make_numeric_column( data_type{INT32}, count, + mask_state::UNALLOCATED, + stream, mr ); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.data(); + // create new offsets array + thrust::transform_inclusive_scan( execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), + d_offsets, + [d_strings] __device__ (size_type idx) { + return d_strings[idx].size(); + }, + thrust::plus()); + + return offsets_column; +} + +// build a strings chars column from an array of string_views +std::unique_ptr chars_column_from_string_array( + const rmm::device_vector& strings, + const int32_t* d_offsets, + cudaStream_t stream, rmm::mr::device_memory_resource* mr ) +{ + size_type count = strings.size(); + auto d_strings = strings.data().get(); + auto execpol = rmm::exec_policy(stream); + auto size_fn = [d_strings] __device__ (size_type idx) { return d_strings[idx].size(); }; + size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; + // create column + auto chars_column = make_numeric_column( data_type{INT8}, bytes, + mask_state::UNALLOCATED, + stream, mr ); + // get it's view + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, + [d_strings, d_offsets, d_chars] __device__(size_type idx){ + string_view d_str = d_strings[idx]; + if( d_str.is_null() ) + return; + size_type offset = (idx ? 
d_offsets[idx-1] : 0); + memcpy(d_chars + offset, d_str.data(), d_str.size() ); + }); + + return chars_column; } +} // namespace detail } // namespace strings } // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/utilities.h b/cpp/src/strings/utilities.h index 7483e3074a1..3739845acaa 100644 --- a/cpp/src/strings/utilities.h +++ b/cpp/src/strings/utilities.h @@ -24,6 +24,8 @@ namespace cudf { namespace strings { +namespace detail +{ /** * @brief Creates a temporary string_view object from a host string. @@ -40,9 +42,27 @@ std::unique_ptr> /** * */ -rmm::device_buffer create_string_array_from_column( - strings_column_handler strings, +rmm::device_vector create_string_array_from_column( + cudf::strings_column_handler strings, cudaStream_t stream = (cudaStream_t)0 ); +/** + * + */ +std::unique_ptr offsets_column_from_string_array( + const rmm::device_vector& strings, + cudaStream_t stream = (cudaStream_t)0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +/** + * + */ +std::unique_ptr chars_column_from_string_array( + const rmm::device_vector& strings, + const int32_t* d_offsets, + cudaStream_t stream = (cudaStream_t)0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +} // namespace detail } // namespace strings } // namespace cudf \ No newline at end of file From 425c81f177c832dad3fc09ed10029714e80c36b7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 19 Sep 2019 17:48:14 -0400 Subject: [PATCH 08/54] strings_column_handler to strings_column_view --- cpp/CMakeLists.txt | 2 +- ...mn_handler.hpp => strings_column_view.hpp} | 131 ++++++++------- cpp/src/strings/array.cu | 154 +++++++++++------- cpp/src/strings/attributes.cu | 28 ++-- ...lumn_handler.cu => strings_column_view.cu} | 42 ++--- cpp/src/strings/utilities.cu | 17 +- cpp/src/strings/utilities.h | 41 +++-- 7 files changed, 237 insertions(+), 178 deletions(-) rename cpp/include/cudf/strings/{strings_column_handler.hpp => 
strings_column_view.hpp} (64%) rename cpp/src/strings/{strings_column_handler.cu => strings_column_view.cu} (77%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35b496ce391..bc9dc67c5b5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -427,7 +427,7 @@ add_library(cudf src/bitmask/null_mask.cpp src/sort/sort.cu src/strings/strings_column_factories.cu - src/strings/strings_column_handler.cu + src/strings/strings_column_view.cu src/strings/array.cu src/strings/attributes.cu src/strings/utilities.cu diff --git a/cpp/include/cudf/strings/strings_column_handler.hpp b/cpp/include/cudf/strings/strings_column_view.hpp similarity index 64% rename from cpp/include/cudf/strings/strings_column_handler.hpp rename to cpp/include/cudf/strings/strings_column_view.hpp index 5b5c9ff36c5..9d6d653a5fa 100644 --- a/cpp/include/cudf/strings/strings_column_handler.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -23,16 +23,15 @@ namespace cudf { /**---------------------------------------------------------------------------* * @brief Given a column-view of strings type, an instance of this class - * provides the strings operations on the column. + * provides a wrapper on the column for strings operations. 
*---------------------------------------------------------------------------**/ -class strings_column_handler +class strings_column_view : private column_view { public: - ~strings_column_handler() = default; - - strings_column_handler( column_view strings_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - //strings_column_handler( const column_view&& strings_column ); + strings_column_view( column_view strings_column ); + strings_column_view( strings_column_view&& strings_view ) = default; + strings_column_view( const strings_column_view& strings_view ) = default; + ~strings_column_view() = default; /**---------------------------------------------------------------------------* * @brief Returns the number of strings in the column @@ -42,17 +41,17 @@ class strings_column_handler /**---------------------------------------------------------------------------* * @brief Returns the internal parent string column *---------------------------------------------------------------------------**/ - column_view parent_column() const; + column_view parent() const; /**---------------------------------------------------------------------------* * @brief Returns the internal column of offsets *---------------------------------------------------------------------------**/ - column_view offsets_column() const; + column_view offsets() const; /**---------------------------------------------------------------------------* * @brief Returns the internal column of chars *---------------------------------------------------------------------------**/ - column_view chars_column() const; + column_view chars() const; /**---------------------------------------------------------------------------* * @brief Returns a pointer to the internal null mask memory @@ -64,40 +63,30 @@ class strings_column_handler *---------------------------------------------------------------------------**/ size_type null_count() const; - 
/**---------------------------------------------------------------------------* - * @brief Returns the registered memory resource - *---------------------------------------------------------------------------**/ - rmm::mr::device_memory_resource* memory_resource() const; - - /**---------------------------------------------------------------------------* - * @brief Prints the strings to stdout. - * - * @param start Index of first string to print. - * @param end Index of last string to print. Specify -1 for all strings. - * @param max_width Maximum number of characters to print per string. - * Specify -1 to print all characters. - * @param delimiter The chars to print between each string. - * Default is new-line character. - *---------------------------------------------------------------------------**/ - void print( size_type start=0, size_type end=-1, - size_type max_width=-1, const char* delimiter = "\n" ) const; - - // sort types can be combined - enum sort_type { - none=0, ///< no sorting - length=1, ///< sort by string length - name=2 ///< sort by characters code-points - }; - private: const column_view _parent; - rmm::mr::device_memory_resource* _mr; }; namespace strings { +/**---------------------------------------------------------------------------* + * @brief Prints the strings to stdout. + * + * @param strings Strings instance for this operation. + * @param start Index of first string to print. + * @param end Index of last string to print. Specify -1 for all strings. + * @param max_width Maximum number of characters to print per string. + * Specify -1 to print all characters. + * @param delimiter The chars to print between each string. + * Default is new-line character. 
+ *---------------------------------------------------------------------------**/ +void print( strings_column_view strings, + size_type start=0, size_type end=-1, + size_type max_width=-1, const char* delimiter = "\n" ); + + // array.cu /**---------------------------------------------------------------------------* * @brief Returns a new strings column created from a subset of @@ -109,6 +98,7 @@ namespace strings * s2 is ["c", "d", "e", "f"] * @endcode * + * @param strings Strings instance for this operation. * @param start Index of first string to use. * @param end Index of last string to use. * Default -1 indicates the last element. @@ -117,10 +107,11 @@ namespace strings * @param stream CUDA stream to use kernels in this method. * @return New strings column of size (end-start)/step. *---------------------------------------------------------------------------**/ -std::unique_ptr sublist( strings_column_handler handler, +std::unique_ptr sublist( strings_column_view strings, size_type start, size_type end=-1, size_type step=1, - cudaStream_t stream=(cudaStream_t)0 ); + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /**---------------------------------------------------------------------------* * @brief Returns a new strings column created this strings instance using @@ -133,30 +124,41 @@ std::unique_ptr sublist( strings_column_handler handler, * s2 is ["a", "c"] * @endcode * + * @param strings Strings instance for this operation. * @param gather_map The indices with which to select strings for the new column. * Values must be within [0,size()) range. * @param stream CUDA stream to use kernels in this method. 
* @return New strings column of size indices.size() *---------------------------------------------------------------------------**/ -std::unique_ptr gather( strings_column_handler handler, +std::unique_ptr gather( strings_column_view strings, cudf::column_view gather_map, - cudaStream_t stream=(cudaStream_t)0 ); + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +// sort types can be combined +enum sort_type { + none=0, ///< no sorting + length=1, ///< sort by string length + name=2 ///< sort by characters code-points +}; /**---------------------------------------------------------------------------* * @brief Returns a new strings column that is a sorted version of the * strings in this instance. * + * @param strings Strings instance for this operation. * @param stype Specify what attribute of the string to sort on. * @param ascending Sort strings in ascending or descending order. * @param nullfirst Sort nulls to the beginning or the end of the new column. * @param stream CUDA stream to use kernels in this method. * @return New strings column with sorted elements of this instance. *---------------------------------------------------------------------------**/ -std::unique_ptr sort( strings_column_handler handler, - strings_column_handler::sort_type stype, +std::unique_ptr sort( strings_column_view strings, + sort_type stype, bool ascending=true, bool nullfirst=true, - cudaStream_t stream=(cudaStream_t)0 ); + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /** * @brief Returns new instance using the provided map values and strings. @@ -171,18 +173,20 @@ std::unique_ptr sort( strings_column_handler handler, * s3 is ["a", "e", "c", "f"] * @endcode * - * @param[in] strings The instance for which to retrieve the values - * specified in map column. - * @param[in] scatter_map The 0-based index values to retrieve from the - * strings parameter. 
Number of values must equal the number - * of elements in strings pararameter (strings.size()). + * @param strings Strings instance for this operation. + * @param values The instance for which to retrieve the strings + * specified in map column. + * @param scatter_map The 0-based index values to retrieve from the + * strings parameter. Number of values must equal the number + * of elements in strings pararameter (strings.size()). * @param stream CUDA stream to use kernels in this method. * @return New instance with the specified strings. */ -std::unique_ptr scatter( strings_column_handler handler, - strings_column_handler strings, +std::unique_ptr scatter( strings_column_view strings, + strings_column_view values, cudf::column_view scatter_map, - cudaStream_t stream=(cudaStream_t)0 ); + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /** * @brief Returns new instance using the provided index values and a * single string. The map values specify where to place the string @@ -196,25 +200,30 @@ std::unique_ptr scatter( strings_column_handler handler, * s2 is ["a", "e", "c", "e"] * @endcode * - * @param[in] string The string to place in according to the scatter_map. - * @param[in] scatter_map The 0-based index values to place the given string. + * @param strings Strings instance for this operation. + * @param value Null-terminated encoded string in host memory to use with + * the scatter_map. + * @param scatter_map The 0-based index values to place the given string. * @return New instance with the specified strings. 
*/ -std::unique_ptr scatter( strings_column_handler handler, - const char* string, +std::unique_ptr scatter( strings_column_view strings, + const char* value, cudf::column_view scatter_map, - cudaStream_t stream=(cudaStream_t)0 ); + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); // attributes.cu /**---------------------------------------------------------------------------* * @brief Returns the number of bytes for each string in a strings column. * Null strings will have a byte count of 0. * + * @param strings Strings instance for this operation. * @param stream CUDA stream to use kernels in this method. * @return Numeric column of type int32. *---------------------------------------------------------------------------**/ -std::unique_ptr bytes_counts( strings_column_handler handler, - cudaStream_t stream=(cudaStream_t)0 ); +std::unique_ptr bytes_counts( strings_column_view strings, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /**---------------------------------------------------------------------------* * @brief Returns the number of characters for each string in a strings column. @@ -222,11 +231,13 @@ std::unique_ptr bytes_counts( strings_column_handler handler, * same as the number of bytes if multi-byte encoded characters make up a * string. * + * @param strings Strings instance for this operation. * @param stream CUDA stream to use kernels in this method. * @return Numeric column of type int32. 
*---------------------------------------------------------------------------**/ -std::unique_ptr characters_counts( strings_column_handler handler, - cudaStream_t stream=(cudaStream_t)0 ); +std::unique_ptr characters_counts( strings_column_view strings, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index b62b9d423b5..4a70f47909f 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include "./utilities.h" @@ -32,9 +32,10 @@ namespace strings { // new strings column from subset of this strings instance -std::unique_ptr sublist( strings_column_handler handler, +std::unique_ptr sublist( strings_column_view handler, size_type start, size_type end, - size_type step, cudaStream_t stream ) + size_type step, cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) { if( step <= 0 ) step = 1; @@ -52,24 +53,25 @@ std::unique_ptr sublist( strings_column_handler handler, // create a column_view as a wrapper of these indices column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); // build a new strings column from the indices - return gather(handler, indices_view); + return gather(handler, indices_view, stream, mr); } // return new strings column with strings from this instance as specified by the indices -std::unique_ptr gather( strings_column_handler handler, - column_view gather_map, cudaStream_t stream ) +std::unique_ptr gather( strings_column_view handler, + column_view gather_map, cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) { size_type count = gather_map.size(); auto d_indices = gather_map.data(); auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent_column(),stream); + auto strings_column = 
column_device_view::create(handler.parent(),stream); auto d_column = *strings_column; - auto d_offsets = handler.offsets_column().data(); + auto d_offsets = handler.offsets().data(); // build offsets column auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, - stream, handler.memory_resource() ); + stream, mr ); auto offsets_view = offsets_column->mutable_view(); auto d_new_offsets = offsets_view.data(); // create new offsets array @@ -100,7 +102,7 @@ std::unique_ptr gather( strings_column_handler handler, // build chars column size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count-1]; // this may not be stream friendly auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, - stream, handler.memory_resource() ); + stream, mr ); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, @@ -119,18 +121,20 @@ std::unique_ptr gather( strings_column_handler handler, children.emplace_back(std::move(chars_column)); return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, + data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } // return sorted version of the given strings column -std::unique_ptr sort( strings_column_handler handler, - strings_column_handler::sort_type stype, - bool ascending, bool nullfirst, cudaStream_t stream ) +std::unique_ptr sort( strings_column_view handler, + sort_type stype, + bool ascending, bool nullfirst, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) { auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent_column(), stream); + auto strings_column = column_device_view::create(handler.parent(), stream); auto d_column = *strings_column; // lets sort indices @@ -152,7 +156,7 @@ std::unique_ptr 
sort( strings_column_handler handler, // create a column_view as a wrapper of these indices column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); // now build a new strings column from the indices - return gather( handler, indices_view ); + return gather( handler, indices_view, stream, mr ); } // @@ -162,34 +166,30 @@ std::unique_ptr sort( strings_column_handler handler, // s3 = s1.scatter(s2,pos) // ['a','e','c','f'] // -std::unique_ptr scatter( strings_column_handler handler, - strings_column_handler map_strings, +std::unique_ptr scatter( strings_column_view strings, + strings_column_view values, cudf::column_view scatter_map, - cudaStream_t stream ) + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) { - size_type elements = map_strings.size(); + size_type elements = values.size(); CUDF_EXPECTS( elements==scatter_map.size(), "number of strings must match map size" ); - size_type count = handler.size(); + size_type count = strings.size(); auto d_indices = scatter_map.data(); auto execpol = rmm::exec_policy(stream); // create strings arrays - rmm::device_vector strings = - detail::create_string_array_from_column(handler,stream); - cudf::string_view* d_strings = strings.data().get(); - rmm::device_vector values = - detail::create_string_array_from_column(map_strings,stream); - cudf::string_view* d_values = values.data().get(); + rmm::device_vector strings_array = + detail::create_string_array_from_column(strings,stream); + cudf::string_view* d_strings = strings_array.data().get(); + rmm::device_vector values_array = + detail::create_string_array_from_column(values,stream); + cudf::string_view* d_values = values_array.data().get(); // do the scatter thrust::scatter( execpol->on(stream), d_values, d_values+elements, d_indices, d_strings ); - // build offsets column - auto offsets_column = detail::offsets_column_from_string_array(strings,stream,handler.memory_resource()); - auto offsets_view = offsets_column->view(); - auto 
d_offsets = offsets_view.data(); - // build null mask auto valid_mask = valid_if( static_cast(nullptr), [d_strings] __device__ (size_type idx) { return !d_strings[idx].is_null(); }, @@ -199,19 +199,24 @@ std::unique_ptr scatter( strings_column_handler handler, rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future + // build offsets column + auto offsets_column = detail::offsets_from_string_array(strings_array,stream,mr); + auto offsets_view = offsets_column->view(); + auto d_offsets = offsets_view.data(); + // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; // this may not be stream friendly - auto chars_column = - detail::chars_column_from_string_array(strings, d_offsets, - stream, handler.memory_resource()); + auto chars_column = detail::chars_from_string_array(strings_array, d_offsets, + stream, mr); // build children vector std::vector> children; children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(chars_column)); + // return new strings column return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,handler.memory_resource()}, + data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } @@ -222,35 +227,60 @@ std::unique_ptr scatter( strings_column_handler handler, // s3 = s1.scatter('e',pos,2) // ['a','e','c','e'] // -std::unique_ptr scatter( strings_column_handler handler, +std::unique_ptr scatter( strings_column_view handler, const char* string, cudf::column_view scatter_map, - cudaStream_t stream ) + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) { -// size_type count = size(); -// size_type elements = scatter_map.size(); -// auto execpol = rmm::exec_policy(0); -// // copy string to device -// auto d_string = string_from_host(string); -// cudf::string_view* d_replace = *d_string; -// // create 
result output array -// rmm::device_vector results(count,nullptr); -// auto d_results = results.data().get(); -// custring_view_array d_strings = pImpl->getStringsPtr(); -// thrust::copy( execpol->on(0), d_strings, d_strings+count, d_results ); -// thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), elements, -// [d_pos, count, d_repl, d_results] __device__ (unsigned int idx) { -// int pos = d_pos[idx]; -// if( (pos >= 0) && (pos < count) ) -// d_results[pos] = d_repl; -// }); -// // build resulting instance -// NVStrings* rtn = new NVStrings(count); -// NVStrings_init_from_custrings(rtn->pImpl, d_results, count); -// if( !bdevmem ) -// RMM_FREE((void*)d_pos,0); -// RMM_FREE((void*)d_repl,0); - return nullptr; + size_type count = handler.size(); + size_type elements = scatter_map.size(); + auto execpol = rmm::exec_policy(0); + auto d_indices = scatter_map.data(); + // copy string to device + auto replace = detail::string_from_host(string, stream); + auto d_replace = *replace; + // create strings array + rmm::device_vector strings = + detail::create_string_array_from_column(handler, stream); + auto d_strings = strings.data().get(); + // replace specific elements + thrust::for_each_n(execpol->on(0), + thrust::make_counting_iterator(0), elements, + [d_indices, d_replace, d_strings] __device__ (unsigned int idx) { + d_strings[d_indices[idx]] = d_replace; + }); + + // create strings column + // build null mask + auto valid_mask = valid_if( static_cast(nullptr), + [d_strings] __device__ (size_type idx) { return !d_strings[idx].is_null(); }, + count, stream ); + auto null_count = valid_mask.second; + auto null_size = gdf_valid_allocation_size(count); + rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy + RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future + + // build offsets column + auto offsets_column = detail::offsets_from_string_array(strings,stream,mr); + auto offsets_view = 
offsets_column->view(); + auto d_offsets = offsets_view.data(); + + // build chars column + size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; // this may not be stream friendly + auto chars_column = detail::chars_from_string_array(strings, d_offsets, + stream, mr); + + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + // return new strings column + return std::make_unique( + data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + null_mask, null_count, + std::move(children)); } diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 6a29013716f..21ec2420d11 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include @@ -25,18 +25,19 @@ namespace cudf namespace strings { -std::unique_ptr characters_counts( strings_column_handler handler, - cudaStream_t stream ) +std::unique_ptr characters_counts( strings_column_view strings, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) { - size_type count = handler.size(); + size_type count = strings.size(); auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent_column(),stream); + auto strings_column = column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; // create output column auto result = std::make_unique( data_type{INT32}, count, - rmm::device_buffer(count * sizeof(int32_t), stream, handler.memory_resource()), + rmm::device_buffer(count * sizeof(int32_t), stream, mr), rmm::device_buffer(d_column.null_mask(), gdf_valid_allocation_size(count), - stream, handler.memory_resource()), + stream, mr), d_column.null_count()); auto results_view = result->mutable_view(); auto d_lengths = results_view.data(); @@ -53,18 +54,19 @@ std::unique_ptr characters_counts( strings_column_handler handler, return 
result; } -std::unique_ptr bytes_counts( strings_column_handler handler, - cudaStream_t stream ) +std::unique_ptr bytes_counts( strings_column_view strings, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) { - size_type count = handler.size(); + size_type count = strings.size(); auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent_column(),stream); + auto strings_column = column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; // create output column auto result = std::make_unique( data_type{INT32}, count, - rmm::device_buffer(count * sizeof(int32_t), stream, handler.memory_resource()), + rmm::device_buffer(count * sizeof(int32_t), stream, mr), rmm::device_buffer(d_column.null_mask(), gdf_valid_allocation_size(count), - stream, handler.memory_resource()), + stream, mr), d_column.null_count()); auto results_view = result->mutable_view(); auto d_lengths = results_view.data(); diff --git a/cpp/src/strings/strings_column_handler.cu b/cpp/src/strings/strings_column_view.cu similarity index 77% rename from cpp/src/strings/strings_column_handler.cu rename to cpp/src/strings/strings_column_view.cu index bd53b674f93..bd3306e6b4c 100644 --- a/cpp/src/strings/strings_column_handler.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include @@ -25,54 +25,52 @@ namespace cudf { // -strings_column_handler::strings_column_handler( column_view strings_column, - rmm::mr::device_memory_resource* mr ) - : _parent(strings_column), _mr(mr) +strings_column_view::strings_column_view( column_view strings_column ) + : _parent(strings_column) { - CUDF_EXPECTS( _parent.type().id()==STRING, "strings_column_handler only supports strings"); + CUDF_EXPECTS( _parent.type().id()==STRING, "strings_column_view only supports strings"); CUDF_EXPECTS( _parent.num_children()>0, "strings column must have children"); // revisit this (all nulls column?) 
} -size_type strings_column_handler::size() const +size_type strings_column_view::size() const { return _parent.child(0).size(); } -column_view strings_column_handler::parent_column() const +column_view strings_column_view::parent() const { return _parent; } -column_view strings_column_handler::offsets_column() const +column_view strings_column_view::offsets() const { return _parent.child(0); } -column_view strings_column_handler::chars_column() const +column_view strings_column_view::chars() const { return _parent.child(1); } -const bitmask_type* strings_column_handler::null_mask() const +const bitmask_type* strings_column_view::null_mask() const { return _parent.null_mask(); } -size_type strings_column_handler::null_count() const +size_type strings_column_view::null_count() const { return _parent.null_count(); } -rmm::mr::device_memory_resource* strings_column_handler::memory_resource() const +namespace strings { - return _mr; -} // print strings to stdout -void strings_column_handler::print( size_type start, size_type end, - size_type max_width, const char* delimiter ) const +void print( strings_column_view strings, + size_type start, size_type end, + size_type max_width, const char* delimiter ) { - size_type count = size(); + size_type count = strings.size(); if( end < 0 || end > count ) end = count; if( start < 0 ) @@ -83,15 +81,16 @@ void strings_column_handler::print( size_type start, size_type end, // stick with the default stream for this odd/rare stdout function auto execpol = rmm::exec_policy(0); - auto strings_column = column_device_view::create(_parent); + auto strings_column = column_device_view::create(strings.parent()); auto d_column = *strings_column; - auto d_offsets = offsets_column().data(); - auto d_strings = chars_column().data(); + auto d_offsets = strings.offsets().data(); + auto d_strings = strings.chars().data(); // create output strings offsets rmm::device_vector output_offsets(count,0); thrust::transform_inclusive_scan( execpol->on(0), - 
thrust::make_counting_iterator(start), thrust::make_counting_iterator(end), + thrust::make_counting_iterator(start), + thrust::make_counting_iterator(end), output_offsets.begin(), [d_column, d_strings, max_width, d_offsets] __device__ (size_type idx) { if( d_column.nullable() && d_column.is_null(idx) ) @@ -149,4 +148,5 @@ void strings_column_handler::print( size_type start, size_type end, } } +} // namespace strings } // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 7aaf6c39a08..172478f0252 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -52,16 +52,16 @@ std::unique_ptr> // build an array of string_view objects from a strings column rmm::device_vector create_string_array_from_column( - cudf::strings_column_handler handler, + cudf::strings_column_view strings, cudaStream_t stream ) { auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent_column(),stream); + auto strings_column = column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; - auto count = handler.size(); - rmm::device_vector strings(count); - cudf::string_view* d_strings = strings.data().get(); + auto count = strings.size(); + rmm::device_vector strings_array(count); + cudf::string_view* d_strings = strings_array.data().get(); thrust::for_each_n( execpol->on(stream), thrust::make_counting_iterator(0), count, [d_column, d_strings] __device__ (size_type idx) { @@ -70,12 +70,11 @@ rmm::device_vector create_string_array_from_column( else d_strings[idx] = d_column.element(idx); }); - - return strings; + return strings_array; } // build a strings offsets column from an array of string_views -std::unique_ptr offsets_column_from_string_array( +std::unique_ptr offsets_from_string_array( const rmm::device_vector& strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { @@ -101,7 +100,7 @@ std::unique_ptr 
offsets_column_from_string_array( } // build a strings chars column from an array of string_views -std::unique_ptr chars_column_from_string_array( +std::unique_ptr chars_from_string_array( const rmm::device_vector& strings, const int32_t* d_offsets, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) diff --git a/cpp/src/strings/utilities.h b/cpp/src/strings/utilities.h index 3739845acaa..b90a044d654 100644 --- a/cpp/src/strings/utilities.h +++ b/cpp/src/strings/utilities.h @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include @@ -29,8 +29,6 @@ namespace detail /** * @brief Creates a temporary string_view object from a host string. - * The host string is copied into device memory and the object - * pointer can be used in device code. * * @param[in] str Null-terminated, encoded string in CPU memory. * @param[in] stream Stream to execute any device code against. @@ -40,27 +38,46 @@ std::unique_ptr> string_from_host( const char* str, cudaStream_t stream=0 ); /** - * + * @brief Creates a strings array from a strings column. + * This is useful for doing some intermediate array operations. + * + * @param strings Strings instance. + * @param stream Stream to execute any device code against. + * @return Strings array */ rmm::device_vector create_string_array_from_column( - cudf::strings_column_handler strings, - cudaStream_t stream = (cudaStream_t)0 ); + cudf::strings_column_view strings, + cudaStream_t stream=0 ); /** - * + * @brief Creates an offsets column from a strings array. + * This can be used to recreate the offsets child of a new + * strings column from an intermediate strings array. + * + * @param strings Strings array + * @param stream Stream to execute any device code against. + * @param mr Memory resource to use. 
+ * @return Offsets column */ -std::unique_ptr offsets_column_from_string_array( +std::unique_ptr offsets_from_string_array( const rmm::device_vector& strings, - cudaStream_t stream = (cudaStream_t)0, + cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /** - * + * @brief Creates a chars column from a strings array. + * This can be used to recreate the chars child of a new + * strings column from an intermediate strings array. + * + * @param strings Strings array + * @param stream Stream to execute any device code against. + * @param mr Memory resource to use. + * @return chars column */ -std::unique_ptr chars_column_from_string_array( +std::unique_ptr chars_from_string_array( const rmm::device_vector& strings, const int32_t* d_offsets, - cudaStream_t stream = (cudaStream_t)0, + cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); } // namespace detail From ed450f91b9fe96662f2df92ea47bb67088a67431 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 20 Sep 2019 13:54:03 -0400 Subject: [PATCH 09/54] string_view iterator and code-points fn --- .../cudf/column/column_device_view.cuh | 4 +- cpp/include/cudf/strings/string_view.cuh | 8 +++ .../cudf/strings/strings_column_view.hpp | 24 +++++++- cpp/src/strings/array.cu | 12 ++-- cpp/src/strings/attributes.cu | 58 ++++++++++++++++++- cpp/src/strings/strings_column_view.cu | 1 + cpp/src/strings/utilities.cu | 4 +- 7 files changed, 96 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 8a511552b1a..f40381d7892 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -454,11 +454,11 @@ class alignas(16) mutable_column_device_view /**---------------------------------------------------------------------------* * @brief Returns `string_view` to the string element at the specified index. 
* - * This function accounts for the offset. + * This function accounts for the offset. Do not call this for a null element. * * @param element_index Position of the desired string + * @return string_view instance representing this element at this index *---------------------------------------------------------------------------**/ - template <> __device__ inline string_view const column_device_view::element( size_type element_index) const noexcept { diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 68d02237fd5..4f9c22156cc 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -86,9 +87,16 @@ class string_view class iterator { public: + typedef ptrdiff_t difference_type; + typedef Char value_type; + typedef Char& reference; + typedef Char* pointer; + typedef std::input_iterator_tag iterator_category; // do not allow going backwards __device__ iterator(const string_view& str, size_type pos); iterator(const iterator& mit) = default; iterator(iterator&& mit) = default; + iterator& operator=(const iterator&) = default; + iterator& operator=(iterator&&) = default; __device__ iterator& operator++(); __device__ iterator operator++(int); __device__ bool operator==(const iterator& rhs) const; diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 9d6d653a5fa..c3ac1600e0b 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include @@ -23,7 +22,7 @@ namespace cudf { /**---------------------------------------------------------------------------* * @brief Given a column-view of strings type, an instance of this class - * provides a wrapper on the column for strings operations. 
+ * provides a wrapper on this compound column for strings operations. *---------------------------------------------------------------------------**/ class strings_column_view : private column_view { @@ -239,5 +238,26 @@ std::unique_ptr characters_counts( strings_column_view strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); +/**---------------------------------------------------------------------------* + * @brief Creates a column with code point values (integers) for each string. + * A code point is the integer value representation of a character. + * For example, in UTF-8 the code point value for the character 'A' is 65. + * The column is an array of variable-length integer arrays each with length + * as returned by characters_counts(). + * + * @code + * s = ["a","xyz", "éee"] + * v = code_points(s) + * v is [97, 120, 121, 122, 50089, 101, 101] + * @endcode + * + * @param strings Strings instance for this operation. + * @param stream CUDA stream to use kernels in this method. + * @return Numeric column of type int32. 
TODO: need uint32 here + *---------------------------------------------------------------------------**/ +std::unique_ptr code_points( strings_column_view strings, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 4a70f47909f..c0d385ac45a 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -21,6 +21,7 @@ #include #include "./utilities.h" +#include #include #include #include @@ -107,10 +108,10 @@ std::unique_ptr gather( strings_column_view handler, auto d_chars = chars_view.data(); thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, [d_column, d_indices, d_new_offsets, d_chars] __device__(size_type idx){ - // place individual strings - if( d_column.nullable() && d_column.is_null(idx) ) + size_type index = d_indices[idx]; + if( d_column.nullable() && d_column.is_null(index) ) return; - string_view d_str = d_column.element(d_indices[idx]); + string_view d_str = d_column.element(index); size_type offset = (idx ? 
d_new_offsets[idx-1] : 0); memcpy(d_chars + offset, d_str.data(), d_str.size() ); }); @@ -267,7 +268,7 @@ std::unique_ptr scatter( strings_column_view handler, auto d_offsets = offsets_view.data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; // this may not be stream friendly + size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; auto chars_column = detail::chars_from_string_array(strings, d_offsets, stream, mr); @@ -283,6 +284,5 @@ std::unique_ptr scatter( strings_column_view handler, std::move(children)); } - } // namespace strings -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 21ec2420d11..2893d514dfd 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -14,11 +14,14 @@ * limitations under the License. */ +#include #include #include #include +#include #include +#include namespace cudf { @@ -29,7 +32,7 @@ std::unique_ptr characters_counts( strings_column_view strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { - size_type count = strings.size(); + auto count = strings.size(); auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; @@ -58,7 +61,7 @@ std::unique_ptr bytes_counts( strings_column_view strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { - size_type count = strings.size(); + auto count = strings.size(); auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; @@ -83,6 +86,55 @@ std::unique_ptr bytes_counts( strings_column_view strings, return result; } +// Builds one flat INT32 column holding the value of every character of every non-null string; null entries contribute no values. +// The execution policy is created on `stream`, matching characters_counts() and bytes_counts() above. +std::unique_ptr code_points( strings_column_view strings, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) +{ + auto count = strings.size(); + auto execpol = 
rmm::exec_policy(stream); + auto strings_column = column_device_view::create(strings.parent(),stream); + auto d_column = *strings_column; + + // offsets point to each individual integer range + rmm::device_vector offsets(count); + size_type* d_offsets = offsets.data().get(); + thrust::transform_inclusive_scan(execpol->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), + d_offsets, + [d_column] __device__(size_type idx){ + if( d_column.nullable() && d_column.is_null(idx) ) + return 0; + return d_column.element(idx).characters(); + }, + thrust::plus()); + + // need the total size to build the column + // the size is the last element from an inclusive-scan + size_type size = offsets.back(); + // create output column + auto result = make_numeric_column( data_type{INT32}, size, + mask_state::UNALLOCATED, + stream, mr ); + auto results_view = result->mutable_view(); + auto d_results = results_view.data(); + // now set the ranges from each strings' character values + thrust::for_each_n(execpol->on(stream), + thrust::make_counting_iterator(0), count, + [d_column, d_offsets, d_results] __device__(unsigned int idx){ + if( d_column.nullable() && d_column.is_null(idx) ) + return; + auto d_str = d_column.element(idx); + auto result = d_results + (idx ? 
d_offsets[idx-1] :0); + thrust::copy( thrust::seq, d_str.begin(), d_str.end(), result); + //for( auto itr = d_str.begin(); itr != d_str.end(); ++itr ) + // *result++ = (unsigned int)*itr; + }); + // + return result; +} } // namespace strings -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index bd3306e6b4c..7e3b80cf1c7 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 172478f0252..708d5a3d679 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -21,6 +21,7 @@ #include "./utilities.h" #include +#include #include #include @@ -108,7 +109,6 @@ std::unique_ptr chars_from_string_array( size_type count = strings.size(); auto d_strings = strings.data().get(); auto execpol = rmm::exec_policy(stream); - auto size_fn = [d_strings] __device__ (size_type idx) { return d_strings[idx].size(); }; size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; // create column auto chars_column = make_numeric_column( data_type{INT8}, bytes, @@ -131,4 +131,4 @@ std::unique_ptr chars_from_string_array( } // namespace detail } // namespace strings -} // namespace cudf \ No newline at end of file +} // namespace cudf From c4af4993b481543aee871a6245f8189a0c8402dc Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 20 Sep 2019 16:59:16 -0400 Subject: [PATCH 10/54] added is_ and split api declarations --- cpp/include/cudf/strings/string_view.cuh | 38 +-- .../cudf/strings/strings_column_view.hpp | 256 +++++++++++++++--- 2 files changed, 240 insertions(+), 54 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 4f9c22156cc..c941ee9238c 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ 
b/cpp/include/cudf/strings/string_view.cuh @@ -120,14 +120,14 @@ class string_view /**---------------------------------------------------------------------------* * @brief Return single UTF-8 character at the given character position - * + * * @param pos Character position *---------------------------------------------------------------------------**/ __device__ Char at(size_type pos) const; __device__ Char operator[](size_type pos) const; /**---------------------------------------------------------------------------* * @brief Return the byte offset from data() for a given character position - * + * * @param pos Character position *---------------------------------------------------------------------------**/ __device__ size_type byte_offset_for(size_type pos) const; @@ -135,7 +135,7 @@ class string_view /**---------------------------------------------------------------------------* * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. - * + * * @param str Target string to compare with this string. * @return 0 If they compare equal. * <0 Either the value of the first character of this string that does @@ -149,7 +149,7 @@ class string_view /**---------------------------------------------------------------------------* * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. - * + * * @param str Target string to compare with this string. * @param bytes Number of bytes in str. * @return 0 If they compare equal. @@ -189,7 +189,7 @@ class string_view /**---------------------------------------------------------------------------* * @brief Returns first character position if arg string is contained in this string. - * + * * @param str Target string to compare with this string. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. 
@@ -199,7 +199,7 @@ class string_view __device__ size_type find( const string_view& str, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* * @brief Returns first character position if arg array is contained in this string. - * + * * @param str Target string to compare with this string. * @param bytes Number of bytes in str. * @param pos Character position to start search within this string. @@ -210,7 +210,7 @@ class string_view __device__ size_type find( const char* str, size_type bytes, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* * @brief Returns first character position if arg character is contained in this string. - * + * * @param chr Single encoded character. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. @@ -220,7 +220,7 @@ class string_view __device__ size_type find( Char chr, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* * @brief Same as find() but searches from the end of this string. - * + * * @param str Target string to compare with this string. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. @@ -230,7 +230,7 @@ class string_view __device__ size_type rfind( const string_view& str, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* * @brief Same as find() but searches from the end of this string. - * + * * @param str Target string to compare with this string. * @param bytes Number of bytes in str. * @param pos Character position to start search within this string. 
@@ -241,7 +241,7 @@ class string_view __device__ size_type rfind( const char* str, size_type bytes, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* * @brief Same as find() but searches from the end of this string. - * + * * @param chr Single encoded character. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. @@ -253,7 +253,7 @@ class string_view /**---------------------------------------------------------------------------* * @brief Return a sub-string of this string. The original string and device * memory but must still be maintained for the lifetime of the instance. - * + * * @param start Character position to start the sub-string. * @param length Number of characters from start to include in the sub-string. * @return New instance pointing to a subset of the characters within this instance. @@ -262,11 +262,11 @@ class string_view /**---------------------------------------------------------------------------* * @brief Tokenizes this string around the given delimiter up to count time. - * + * * @param delim Character to use for separating tokens. * @param count Maximum number of tokens to return. * Specify -1 to indicate all tokens. - * @param[out] Array to hold output tokens. + * @param[out] Array to hold output tokens. * Specify nullptr here to return just the token count. * @return Number of tokens. *---------------------------------------------------------------------------**/ @@ -274,11 +274,11 @@ class string_view /**---------------------------------------------------------------------------* * @brief Same as split() but starts tokenizing from the end of the string. - * + * * @param delim Character to use for separating tokens. * @param count Maximum number of tokens to return. * Specify -1 to indicate all tokens. - * @param[out] Array to hold output tokens. 
+ * @param[out] Array to hold output tokens. * Specify nullptr here to return just the token count. * @return Number of tokens. *---------------------------------------------------------------------------**/ @@ -290,7 +290,7 @@ class string_view __host__ __device__ static size_type bytes_in_char( Char chr ); /**---------------------------------------------------------------------------* * @brief Convert a char array into a Char value. - * + * * @param str String containing encoded char bytes. * @param[out] chr Single Char value. * @return The number of bytes in the character @@ -298,7 +298,7 @@ class string_view __host__ __device__ static size_type char_to_Char( const char* str, Char& chr ); /**---------------------------------------------------------------------------* * @brief Place a Char value into a char array. - * + * * @param chr Single character * @param[out] str Allocated char array with enough space to hold the encoded characer. * @return The number of bytes in the character @@ -306,7 +306,7 @@ class string_view __host__ __device__ static size_type Char_to_char( Char chr, char* str ); /**---------------------------------------------------------------------------* * @brief Return the number of characters in this provided char array. - * + * * @param str String with encoded char bytes. * @param bytes Number of bytes in str. * @return The number of characters in the array. @@ -319,7 +319,7 @@ private: /**---------------------------------------------------------------------------* * @brief Return the character position of the given byte offset. - * + * * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. 
*---------------------------------------------------------------------------**/ diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index c3ac1600e0b..4a3f68d250c 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -26,45 +26,44 @@ namespace cudf { *---------------------------------------------------------------------------**/ class strings_column_view : private column_view { - public: - strings_column_view( column_view strings_column ); - strings_column_view( strings_column_view&& strings_view ) = default; - strings_column_view( const strings_column_view& strings_view ) = default; - ~strings_column_view() = default; +public: + strings_column_view( column_view strings_column ); + strings_column_view( strings_column_view&& strings_view ) = default; + strings_column_view( const strings_column_view& strings_view ) = default; + ~strings_column_view() = default; - /**---------------------------------------------------------------------------* - * @brief Returns the number of strings in the column - *---------------------------------------------------------------------------**/ - size_type size() const; + /**---------------------------------------------------------------------------* + * @brief Returns the number of strings in the column + *---------------------------------------------------------------------------**/ + size_type size() const; - /**---------------------------------------------------------------------------* - * @brief Returns the internal parent string column - *---------------------------------------------------------------------------**/ - column_view parent() const; + /**---------------------------------------------------------------------------* + * @brief Returns the internal parent string column + *---------------------------------------------------------------------------**/ + column_view parent() const; - 
/**---------------------------------------------------------------------------* - * @brief Returns the internal column of offsets - *---------------------------------------------------------------------------**/ - column_view offsets() const; + /**---------------------------------------------------------------------------* + * @brief Returns the internal column of offsets + *---------------------------------------------------------------------------**/ + column_view offsets() const; - /**---------------------------------------------------------------------------* - * @brief Returns the internal column of chars - *---------------------------------------------------------------------------**/ - column_view chars() const; + /**---------------------------------------------------------------------------* + * @brief Returns the internal column of chars + *---------------------------------------------------------------------------**/ + column_view chars() const; - /**---------------------------------------------------------------------------* - * @brief Returns a pointer to the internal null mask memory - *---------------------------------------------------------------------------**/ - const bitmask_type* null_mask() const; + /**---------------------------------------------------------------------------* + * @brief Returns a pointer to the internal null mask memory + *---------------------------------------------------------------------------**/ + const bitmask_type* null_mask() const; - /**---------------------------------------------------------------------------* - * @brief Returns the number of nulls in this column - *---------------------------------------------------------------------------**/ - size_type null_count() const; + /**---------------------------------------------------------------------------* + * @brief Returns the number of nulls in this column + *---------------------------------------------------------------------------**/ + size_type null_count() 
const; private: - const column_view _parent; - + const column_view _parent; }; namespace strings @@ -96,7 +95,7 @@ void print( strings_column_view strings, * s2 = sublist( s1, 2 ) * s2 is ["c", "d", "e", "f"] * @endcode - * + * * @param strings Strings instance for this operation. * @param start Index of first string to use. * @param end Index of last string to use. @@ -104,6 +103,7 @@ void print( strings_column_view strings, * @param step Increment value between indexes. * Default step is 1. * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. * @return New strings column of size (end-start)/step. *---------------------------------------------------------------------------**/ std::unique_ptr sublist( strings_column_view strings, @@ -115,7 +115,7 @@ std::unique_ptr sublist( strings_column_view strings, /**---------------------------------------------------------------------------* * @brief Returns a new strings column created this strings instance using * the specified indices to select the strings. - * + * * @code * s1 = ["a", "b", "c", "d", "e", "f"] * map = [0, 2] @@ -127,6 +127,7 @@ std::unique_ptr sublist( strings_column_view strings, * @param gather_map The indices with which to select strings for the new column. * Values must be within [0,size()) range. * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. * @return New strings column of size indices.size() *---------------------------------------------------------------------------**/ std::unique_ptr gather( strings_column_view strings, @@ -150,6 +151,7 @@ enum sort_type { * @param ascending Sort strings in ascending or descending order. * @param nullfirst Sort nulls to the beginning or the end of the new column. * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. * @return New strings column with sorted elements of this instance. 
*---------------------------------------------------------------------------**/ std::unique_ptr sort( strings_column_view strings, @@ -179,6 +181,7 @@ std::unique_ptr sort( strings_column_view strings, * strings parameter. Number of values must equal the number * of elements in strings pararameter (strings.size()). * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. * @return New instance with the specified strings. */ std::unique_ptr scatter( strings_column_view strings, @@ -198,11 +201,13 @@ std::unique_ptr scatter( strings_column_view strings, * s2 = scatter( s1, "e", m1 ) * s2 is ["a", "e", "c", "e"] * @endcode - * + * * @param strings Strings instance for this operation. * @param value Null-terminated encoded string in host memory to use with * the scatter_map. * @param scatter_map The 0-based index values to place the given string. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. * @return New instance with the specified strings. */ std::unique_ptr scatter( strings_column_view strings, @@ -218,6 +223,7 @@ std::unique_ptr scatter( strings_column_view strings, * * @param strings Strings instance for this operation. * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. * @return Numeric column of type int32. *---------------------------------------------------------------------------**/ std::unique_ptr bytes_counts( strings_column_view strings, @@ -232,6 +238,7 @@ std::unique_ptr bytes_counts( strings_column_view strings, * * @param strings Strings instance for this operation. * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. * @return Numeric column of type int32. 
*---------------------------------------------------------------------------**/ std::unique_ptr characters_counts( strings_column_view strings, @@ -244,7 +251,7 @@ std::unique_ptr characters_counts( strings_column_view strings, * For example, in UTF-8 the code point value for the character 'A' is 65. * The column is an array of variable-length integer arrays each with length * as returned by characters_counts(). - * + * * @code * s = ["a","xyz", "éee"] * v = code_points(s) @@ -253,11 +260,190 @@ std::unique_ptr characters_counts( strings_column_view strings, * * @param strings Strings instance for this operation. * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. * @return Numeric column of type int32. TODO: need uint32 here *---------------------------------------------------------------------------**/ std::unique_ptr code_points( strings_column_view strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); +enum character_attribute { + DECIMAL=0, + NUMERIC=1, + DIGIT=2, + ALPHA=3, + SPACE=4, + UPPER=5, + LOWER=6, + ALPHANUM=7, + EMPTY=8 +}; +/**---------------------------------------------------------------------------* + * @brief Returns true for strings that have only characters of the specified + * type. + * @param strings Strings instance for this operation. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return Column of type bool. + *---------------------------------------------------------------------------**/ +std::unique_ptr is_of_type( strings_column_view strings, + character_attribute ca_type, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +// combine.cu +/**---------------------------------------------------------------------------* + * @brief Row-wise concatenates two columns of strings into a new a column. 
+ * The number of strings in both columns must match. + * @param strings 1st string column. + * @param others 2nd string column. + * @param separator Null-terminated CPU string that should appear between each element. + * @param narep Null-terminated CPU string that should represent any null strings found. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return New instance with the concatenated strings. + *---------------------------------------------------------------------------**/ +std::unique_ptr concatenate( strings_column_view strings, + strings_column_view others, + const char* separator, const char* narep=nullptr, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +/**---------------------------------------------------------------------------* + * @brief Row-wise concatenates the given list of strings columns with the first column. + * @param strings 1st string column. + * @param others List of string columns to concatenate. + * @param separator Null-terminated CPU string that should appear between each instance. + * @param narep Null-terminated CPU string that should represent any null strings found. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return New instance with the concatenated strings. + *---------------------------------------------------------------------------**/ +std::unique_ptr concatenate( strings_column_view strings, + std::vector& others, + const char* separator, const char* narep=nullptr, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +/**---------------------------------------------------------------------------* + * @brief Concatenates all strings in the column into one new string. + * This provides the Pandas strings equivalent of join(). + * @param strings Strings for this operation. 
+ * @param separator Null-terminated CPU string that should appear between each string. + * @param narep Null-terminated CPU string that should represent any null strings found. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return Resulting instance with one string. + *---------------------------------------------------------------------------**/ +std::unique_ptr build_single_string( strings_column_view strings, + const char* separator="", + const char* narep=nullptr, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +// split.cu +/**---------------------------------------------------------------------------* + * @brief Split strings vertically creating new columns of strings. + * The number of columns will be equal to the string with the most splits. + * + * @param delimiter Null-terminated CPU string identifying the split points within each string. + * Default of null splits on whitespace. + * @param maxsplit Maximum number of splits to perform searching from the beginning. + * Default -1 indicates all delimiters are processed. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return List of strings columns. + *---------------------------------------------------------------------------**/ +std::vector> split( strings_column_view strings, + const char* delimiter=nullptr, + int maxsplit=-1, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); +/**---------------------------------------------------------------------------* + * @brief Split strings vertically creating new columns of NVStrings instances. + * The number of columns will be equal to the string with the most splits. + * + * @param delimiter Null-terminated CPU string identifying the split points within each string. + * Default of null splits on whitespace. 
+ * @param maxsplit Maximum number of splits to perform searching right to left. + * Default -1 indicates all delimiters are processed. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return List of strings columns. + *---------------------------------------------------------------------------**/ +std::vector> rsplit( strings_column_view strings, + const char* delimiter=nullptr, + int maxsplit=-1, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); +/**---------------------------------------------------------------------------* + * @brief Each string is split into a list of new strings. + * The delimiter is searched from the beginning of each string. + * Each string results in a new strings column. + * + * @param strings Strings for this operation. + * @param delimiter Null-terminated CPU string identifying the split points within each string. + * Default of null splits on whitespace. + * @param maxsplit Maximum number of splits to perform searching from the beginning. + * Default -1 indicates all delimiters are processed. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return List of columns for each string. + *---------------------------------------------------------------------------**/ +std::vector> split_record( strings_column_view strings, + const char* delimiter=nullptr, + int maxsplit=-1, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); +/**---------------------------------------------------------------------------* + * @brief Each string is split into a list of new strings. + * The delimiter is searched from the end of each string. + * Each string results in a new strings column. + * + * @param strings Strings for this operation. 
+ * @param delimiter Null-terminated CPU string identifying the split points within each string. + * Default of null splits on whitespace. + * @param maxsplit Maximum number of splits to perform searching from the end. + * Default -1 indicates all delimiters are processed. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return List of columns for each string. + *---------------------------------------------------------------------------**/ +std::vector> rsplit_record( strings_column_view strings, + const char* delimiter=nullptr, + int maxsplit=-1, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); +/**---------------------------------------------------------------------------* + * @brief Each string is split into two strings on the first delimiter found. + * Three strings are always created for each string: left-half, delimiter itself, right-half. + * The result is 3 strings columns representing the 3 partitions. + * + * @param delimiter Null-terminated CPU string identifying the split points within each string. + * @param results The list of instances for each string. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return List of columns for each partition. + *---------------------------------------------------------------------------**/ +std::vector> partition( strings_column_view strings, + const char* delimiter, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); +/**---------------------------------------------------------------------------* + * @brief Each string is split into two strings on the last delimiter found. + * Three strings are always created for each string: left-half, delimiter itself, right-half. + * The result is 3 strings columns representing the 3 partitions. 
+ * + * @param delimiter Null-terminated CPU string identifying the split points within each string. + * @param results The list of instances for each string. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return List of columns for each partition. + *---------------------------------------------------------------------------**/ +std::vector> rpartition( strings_column_view strings, + const char* delimiter, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + } // namespace strings } // namespace cudf From 85b3150c3703747b74f8e8f33d5c1bffeb15dc7d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 23 Sep 2019 14:01:36 -0400 Subject: [PATCH 11/54] added a concatenate function --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/strings/string_view.cuh | 10 +- .../cudf/strings/strings_column_view.hpp | 15 +- cpp/src/strings/combine.cu | 160 ++++++++++++++++++ cpp/src/strings/utilities.cu | 4 +- cpp/src/strings/utilities.h | 18 +- 6 files changed, 198 insertions(+), 10 deletions(-) create mode 100644 cpp/src/strings/combine.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bc9dc67c5b5..cd9ee416b73 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -430,6 +430,7 @@ add_library(cudf src/strings/strings_column_view.cu src/strings/array.cu src/strings/attributes.cu + src/strings/combine.cu src/strings/utilities.cu src/column/legacy/interop.cpp) diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index c941ee9238c..69c55b773fe 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -87,11 +87,11 @@ class string_view class iterator { public: - typedef ptrdiff_t difference_type; - typedef Char value_type; - typedef Char& reference; - typedef Char* pointer; - typedef std::input_iterator_tag iterator_category; // do not allow going backwards + using 
difference_type = ptrdiff_t; + using value_type = Char; + using reference = Char&; + using pointer = Char*; + using iterator_category = std::input_iterator_tag; // do not allow going backwards __device__ iterator(const string_view& str, size_type pos); iterator(const iterator& mit) = default; iterator(iterator&& mit) = default; diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 4a3f68d250c..19a8752407d 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -305,23 +305,34 @@ std::unique_ptr is_of_type( strings_column_view strings, *---------------------------------------------------------------------------**/ std::unique_ptr concatenate( strings_column_view strings, strings_column_view others, - const char* separator, const char* narep=nullptr, + const char* separator="", const char* narep=nullptr, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /**---------------------------------------------------------------------------* * @brief Row-wise oncatenates the given list of strings columns with the first column. + * + * @code + * s1 = ['aa', null, '', 'aa'] + * s2 = ['', 'bb', 'bb', null] + * r = concatenate(s1,s2) + * r is ['aa', null, 'bb', null] + * @endcode + * * @param strings 1st string column. * @param others List of string columns to concatenate. * @param separator Null-terminated CPU string that should appear between each instance. + * Default is empty string. * @param narep Null-terminated CPU string that should represent any null strings found. + * Default of null means any null operand produces a null result. * @param stream CUDA stream to use kernels in this method. * @param mr Resource for allocating device memory. 
* @return New instance with the concatenated *---------------------------------------------------------------------------**/ std::unique_ptr concatenate( strings_column_view strings, std::vector& others, - const char* separator, const char* narep=nullptr, + const char* separator="", + const char* narep=nullptr, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu new file mode 100644 index 00000000000..81c25c4ca03 --- /dev/null +++ b/cpp/src/strings/combine.cu @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "./utilities.h" + +#include +#include + +namespace cudf +{ +namespace strings +{ + +std::unique_ptr concatenate( strings_column_view strings, + strings_column_view others, + const char* separator, + const char* narep, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) +{ + CUDF_EXPECTS( strings.size()==others.size(), "columns must be the same size"); + + auto execpol = rmm::exec_policy(stream); + size_type count = strings.size(); + + if( !separator ) + separator = ""; + auto separator_ptr = detail::string_from_host(separator, stream); + auto d_separator = *separator_ptr; + auto narep_ptr = detail::string_from_host(narep, stream); + cudf::string_view d_narep(nullptr,0); + if( narep_ptr ) + d_narep = *narep_ptr; + + // create strings arrays + auto strings_column_ptr = column_device_view::create(strings.parent(),stream); + auto d_strings = *strings_column_ptr; + auto others_column_ptr = column_device_view::create(others.parent(),stream); + auto d_others = *others_column_ptr; + + // create resulting null mask + auto valid_mask = valid_if( static_cast(nullptr), + [d_strings, d_others, d_narep] __device__ (size_type idx) { + return !(((d_strings.nullable() && d_strings.is_null(idx)) || + (d_others.nullable() && d_others.is_null(idx))) && + d_narep.is_null()); + }, + count, stream ); + auto null_count = valid_mask.second; + auto null_size = gdf_valid_allocation_size(count); + rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy + RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future + + // build offsets column + auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, + stream, mr ); + auto offsets_view = offsets_column->mutable_view(); + auto d_results_offsets = offsets_view.data(); + // compute offsets + thrust::transform_inclusive_scan( execpol->on(stream), + 
thrust::make_counting_iterator(0), + thrust::make_counting_iterator(count), + d_results_offsets, + [d_strings, d_others, d_separator, d_narep] __device__ (size_type idx) { + cudf::string_view d_str1; + if( d_strings.nullable() && d_strings.is_null(idx) ) + d_str1 = cudf::string_view(nullptr,0); + else + d_str1 = d_strings.element(idx); + cudf::string_view d_str2; + if( d_others.nullable() && d_others.is_null(idx) ) + d_str2 = cudf::string_view(nullptr,0); + else + d_str2 = d_others.element(idx); + if( (d_str1.is_null() || d_str2.is_null()) && d_narep.is_null() ) + return 0; // null output case + size_type bytes = 0; + // left-side + if( !d_str1.is_null() ) + bytes = d_str1.size(); + else if( !d_narep.is_null() ) + bytes = d_narep.size(); + // separator + bytes += d_separator.size(); + if( !d_str2.is_null() ) + bytes += d_str2.size(); + else if( !d_narep.is_null() ) + bytes += d_narep.size(); + return bytes; + }, + thrust::plus() ); + + // build chars column + size_type bytes = thrust::device_pointer_cast(d_results_offsets)[count-1]; // this may not be stream friendly + auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, + stream, mr ); + auto chars_view = chars_column->mutable_view(); + auto d_results_chars = chars_view.data(); + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, + [d_strings, d_others, d_separator, d_narep, d_results_offsets, d_results_chars] __device__(size_type idx){ + cudf::string_view d_str1; + if( d_strings.nullable() && d_strings.is_null(idx) ) + d_str1 = cudf::string_view(nullptr,0); + else + d_str1 = d_strings.element(idx); + cudf::string_view d_str2; + if( d_others.nullable() && d_others.is_null(idx) ) + d_str2 = cudf::string_view(nullptr,0); + else + d_str2 = d_others.element(idx); + if( (d_str1.is_null() || d_str2.is_null()) && d_narep.is_null() ) + return; // null -- nothing to do + // concat the two strings with appropriate separator and narep + size_type offset = 
(idx ? d_results_offsets[idx-1] : 0); + char* d_buffer = d_results_chars + offset; + if( !d_str1.is_null() ) + d_buffer = detail::copy_string(d_buffer, d_str1); + else if( !d_narep.is_null() ) + d_buffer = detail::copy_string(d_buffer, d_narep); + if( !d_separator.is_null() ) + d_buffer = detail::copy_string(d_buffer, d_separator); + if( !d_str2.is_null() ) + d_buffer = detail::copy_string(d_buffer, d_str2); + else if( !d_narep.is_null() ) + d_buffer = detail::copy_string(d_buffer, d_narep); + }); + + // build children vector + std::vector> children; + children.emplace_back(std::move(offsets_column)); + children.emplace_back(std::move(chars_column)); + + return std::make_unique( + data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + null_mask, null_count, + std::move(children)); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 708d5a3d679..a329c566ab9 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -25,7 +25,7 @@ #include #include -namespace cudf +namespace cudf { namespace strings { @@ -63,7 +63,7 @@ rmm::device_vector create_string_array_from_column( auto count = strings.size(); rmm::device_vector strings_array(count); cudf::string_view* d_strings = strings_array.data().get(); - thrust::for_each_n( execpol->on(stream), + thrust::for_each_n( execpol->on(stream), thrust::make_counting_iterator(0), count, [d_column, d_strings] __device__ (size_type idx) { if( d_column.nullable() && d_column.is_null(idx) ) diff --git a/cpp/src/strings/utilities.h b/cpp/src/strings/utilities.h index b90a044d654..d895c4d5378 100644 --- a/cpp/src/strings/utilities.h +++ b/cpp/src/strings/utilities.h @@ -20,7 +20,7 @@ #include -namespace cudf +namespace cudf { namespace strings { @@ -80,6 +80,22 @@ std::unique_ptr chars_from_string_array( cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + +/** + * @brief This utility will copy 
the argument string's data into + * the provided buffer. + * + * @param buffer Device buffer to copy to. + * @param d_string String to copy. + * @return Points to the end of the buffer after the copy. + */ +__device__ inline char* copy_string( char* buffer, const cudf::string_view& d_string ) +{ + memcpy( buffer, d_string.data(), d_string.size() ); + return buffer + d_string.size(); +} + + } // namespace detail } // namespace strings } // namespace cudf \ No newline at end of file From 83ee6bef31a1ee6cfb9654b82220cdeb7be9f10c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 24 Sep 2019 13:44:33 -0400 Subject: [PATCH 12/54] fixed missing stream, mr parms --- cpp/src/strings/array.cu | 4 ++-- cpp/src/strings/combine.cu | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 19f3005da4e..1af4ece63fb 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -97,7 +97,7 @@ std::unique_ptr gather( strings_column_view handler, count, stream ); auto null_count = valid_mask.second; auto null_size = gdf_valid_allocation_size(count); - rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy + rmm::device_buffer null_mask(valid_mask.first,null_size,stream,mr); // does deep copy RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future // build chars column @@ -259,7 +259,7 @@ std::unique_ptr scatter( strings_column_view handler, count, stream ); auto null_count = valid_mask.second; auto null_size = gdf_valid_allocation_size(count); - rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy + rmm::device_buffer null_mask(valid_mask.first,null_size,stream,mr); RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future // build offsets column diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 81c25c4ca03..bdc88c65f23 100644 --- 
a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -68,7 +68,7 @@ std::unique_ptr concatenate( strings_column_view strings, count, stream ); auto null_count = valid_mask.second; auto null_size = gdf_valid_allocation_size(count); - rmm::device_buffer null_mask(valid_mask.first,null_size); // does deep copy + rmm::device_buffer null_mask(valid_mask.first,null_size,stream,mr); // does deep copy RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future // build offsets column From 0a3ca5d0fa7fd57d0960ed9b7bb892fa2e2a32e5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 24 Sep 2019 14:48:57 -0400 Subject: [PATCH 13/54] create_offsets method --- .../cudf/strings/strings_column_view.hpp | 16 +++++++++++++ cpp/src/strings/array.cu | 6 ++--- cpp/src/strings/strings_column_view.cu | 24 +++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 19a8752407d..2522bcbfc6f 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { /**---------------------------------------------------------------------------* @@ -85,6 +87,20 @@ void print( strings_column_view strings, size_type max_width=-1, const char* delimiter = "\n" ); +/**---------------------------------------------------------------------------* + * @brief Create output pair per Arrow format of strings. + * The return pair is a the array of chars and an array of offsets. + * + * @param strings Strings instance for this operation. + * @param stream CUDA stream to use kernels in this method. + * @param mr Resource for allocating device memory. + * @return Contiguous array of chars and an array of offsets. 
+ *---------------------------------------------------------------------------**/ +std::pair, rmm::device_vector> + create_offsets( strings_column_view strings, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); + // array.cu /**---------------------------------------------------------------------------* * @brief Returns a new strings column created from a subset of diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 1af4ece63fb..fe4d09e45e4 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -27,7 +27,7 @@ #include #include -namespace cudf +namespace cudf { namespace strings { @@ -180,10 +180,10 @@ std::unique_ptr scatter( strings_column_view strings, auto execpol = rmm::exec_policy(stream); // create strings arrays - rmm::device_vector strings_array = + rmm::device_vector strings_array = detail::create_string_array_from_column(strings,stream); cudf::string_view* d_strings = strings_array.data().get(); - rmm::device_vector values_array = + rmm::device_vector values_array = detail::create_string_array_from_column(values,stream); cudf::string_view* d_values = values_array.data().get(); // do the scatter diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index 7e3b80cf1c7..45a124d045a 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -149,5 +149,29 @@ void print( strings_column_view strings, } } +std::pair, rmm::device_vector> + create_offsets( strings_column_view strings, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) +{ + std::pair, rmm::device_vector> results; + + size_type count = strings.size(); + auto d_offsets = strings.offsets().data(); + size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; + results.second = rmm::device_vector(count+1); + results.second[0] = 0; + cudaMemcpyAsync( results.second.data().get()+1, d_offsets, count*sizeof(size_type), + 
cudaMemcpyDeviceToHost, stream); + + auto d_chars = strings.chars().data(); + results.first = rmm::device_vector(bytes); + cudaMemcpyAsync( results.first.data().get(), d_chars, bytes, + cudaMemcpyDeviceToHost, stream); + + return results; +} + + } // namespace strings } // namespace cudf \ No newline at end of file From da263207452086b0ffb0e2b2d4ecdd0551170d6f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 25 Sep 2019 11:20:02 -0400 Subject: [PATCH 14/54] first strings columns gtests --- cpp/src/strings/attributes.cu | 15 ++- cpp/tests/CMakeLists.txt | 10 ++ cpp/tests/strings/attributes_tests.cu | 66 ++++++++++++ cpp/tests/strings/factories_test.cu | 140 ++++++++++++++++++++++++++ cpp/tests/strings/utilities.cu | 55 ++++++++++ cpp/tests/strings/utilities.h | 19 ++++ 6 files changed, 302 insertions(+), 3 deletions(-) create mode 100644 cpp/tests/strings/attributes_tests.cu create mode 100644 cpp/tests/strings/factories_test.cu create mode 100644 cpp/tests/strings/utilities.cu create mode 100644 cpp/tests/strings/utilities.h diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 2893d514dfd..5a53ee7728a 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -36,6 +36,7 @@ std::unique_ptr characters_counts( strings_column_view strings, auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; + cudf::size_type null_count = d_column.null_count(); // create output column auto result = std::make_unique( data_type{INT32}, count, rmm::device_buffer(count * sizeof(int32_t), stream, mr), @@ -54,6 +55,7 @@ std::unique_ptr characters_counts( strings_column_view strings, return 0; return d_column.element(idx).characters(); }); + results_view.set_null_count(null_count); return result; } @@ -65,12 +67,16 @@ std::unique_ptr bytes_counts( strings_column_view strings, auto execpol = rmm::exec_policy(stream); auto strings_column = 
column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; + rmm::device_buffer null_mask; + cudf::size_type null_count = d_column.null_count(); + if( d_column.nullable() ) + null_mask = rmm::device_buffer( d_column.null_mask(), + gdf_valid_allocation_size(count), + stream, mr); // create output column auto result = std::make_unique( data_type{INT32}, count, rmm::device_buffer(count * sizeof(int32_t), stream, mr), - rmm::device_buffer(d_column.null_mask(), gdf_valid_allocation_size(count), - stream, mr), - d_column.null_count()); + null_mask, null_count); auto results_view = result->mutable_view(); auto d_lengths = results_view.data(); // set sizes @@ -83,6 +89,9 @@ std::unique_ptr bytes_counts( strings_column_view strings, return 0; return d_column.element(idx).size(); }); + // reset the null count + results_view.set_null_count(null_count); + printf("size=%d, null_count=%d\n", count, null_count); return result; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ee4c351366b..1d85d6a41f6 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -476,6 +476,16 @@ set(DISPATCHER_TEST_SRC ConfigureTest(DISPATCHER_TEST "${DISPATCHER_TEST_SRC}") +################################################################################################### +# - strings test -------------------------------------------------------------------------------------- + +set(STRINGS_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/strings/factories_test.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/strings/attributes_tests.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/strings/utilities.cu") + +ConfigureTest(STRINGS_TEST "${STRINGS_TEST_SRC}") + ################################################################################################### ### enable testing ################################################################################ ################################################################################################### diff --git 
a/cpp/tests/strings/attributes_tests.cu b/cpp/tests/strings/attributes_tests.cu new file mode 100644 index 00000000000..749c52060c0 --- /dev/null +++ b/cpp/tests/strings/attributes_tests.cu @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include "./utilities.h" + +#include +#include + + +struct AttrsTest : public GdfTest {}; + + +TEST_F(AttrsTest, BytesCounts) +{ + std::vector h_test_strings{ "xyz", "", "aé", nullptr, "bbb", "éé" }; + std::vector h_bytes{ 3, 0, 3, 0, 3, 4 }; + + auto strings = cudf::test::create_strings_column(h_test_strings); + auto strings_view = cudf::strings_column_view(strings->view()); + cudf::size_type count = strings_view.size(); + cudf::strings::print(strings_view); + + auto column = cudf::strings::bytes_counts(strings_view); + rmm::device_vector d_expected(h_bytes); + cudf::column_view column_expected( cudf::data_type{cudf::INT32}, count, + d_expected.data().get(), nullptr, 0 ); + cudf::test::expect_columns_equal(column->view(), column_expected); +} + +TEST_F(AttrsTest, CharactersCounts) +{ + std::vector h_test_strings{ "xyz", "", "aé", nullptr, "bbb", "éé" }; + std::vector h_characters{ 3, 0, 2, 0, 3, 2 }; + + auto strings = cudf::test::create_strings_column(h_test_strings); + auto strings_view = cudf::strings_column_view(strings->view()); + cudf::size_type count = strings_view.size(); 
+ + auto column = cudf::strings::characters_counts(strings_view); + rmm::device_vector d_expected(h_characters); + cudf::column_view column_expected( cudf::data_type{cudf::INT32}, count, + d_expected.data().get(), nullptr, 0 ); + cudf::test::expect_columns_equal(column->view(), column_expected); +} + diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu new file mode 100644 index 00000000000..855e4db7ba7 --- /dev/null +++ b/cpp/tests/strings/factories_test.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include +#include + + +struct FactoriesTest : public GdfTest {}; + +TEST_F(FactoriesTest, CreateColumnFromArray) +{ + std::vector h_test_strings{ "the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", nullptr, "absent stop words" }; + + cudf::size_type memsize = 0; + for( auto itr=h_test_strings.begin(); itr!=h_test_strings.end(); ++itr ) + memsize += *itr ? 
(cudf::size_type)strlen(*itr) : 0; + cudf::size_type count = (cudf::size_type)h_test_strings.size(); + thrust::host_vector h_buffer(memsize); + thrust::device_vector d_buffer(memsize); + thrust::host_vector > strings(count); + thrust::host_vector h_offsets(count+1); + cudf::size_type offset = 0; + cudf::size_type nulls = 0; + h_offsets[0] = 0; + for( cudf::size_type idx=0; idx < count; ++idx ) + { + const char* str = h_test_strings[idx]; + if( !str ) + { + strings[idx] = thrust::pair{nullptr,0}; + nulls++; + } + else + { + cudf::size_type length = (cudf::size_type)strlen(str); + memcpy( h_buffer.data() + offset, str, length ); + strings[idx] = thrust::pair{d_buffer.data().get()+offset,(size_t)length}; + offset += length; + } + h_offsets[idx+1] = offset; + } + rmm::device_vector> d_strings(strings); + cudaMemcpy( d_buffer.data().get(), h_buffer.data(), memsize, cudaMemcpyHostToDevice ); + auto column = cudf::make_strings_column( d_strings ); + EXPECT_EQ(column->type(), cudf::data_type{cudf::STRING}); + EXPECT_EQ(column->null_count(), nulls); + if( nulls ) + { + EXPECT_TRUE(column->nullable()); + EXPECT_TRUE(column->has_nulls()); + } + EXPECT_EQ(2, column->num_children()); + + cudf::strings_column_view strings_view(column->view()); + EXPECT_EQ( strings_view.size(), count); + EXPECT_EQ( strings_view.offsets().size(), count ); + EXPECT_EQ( strings_view.chars().size(), memsize ); + + // check string data + auto strings_data = cudf::strings::create_offsets(strings_view); + thrust::host_vector h_chars_data(strings_data.first); + thrust::host_vector h_offsets_data(strings_data.second); + EXPECT_EQ( memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0 ); + EXPECT_EQ( memcmp(h_offsets.data(), h_offsets_data.data(), h_offsets.size()*sizeof(cudf::size_type)), 0); +} + +TEST_F(FactoriesTest, CreateColumnFromOffsets) +{ + std::vector h_test_strings{ "the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving 
turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "absent stop words" }; + + cudf::size_type memsize = 0; + for( auto itr=h_test_strings.begin(); itr!=h_test_strings.end(); ++itr ) + memsize += *itr ? (cudf::size_type)strlen(*itr) : 0; + cudf::size_type count = (cudf::size_type)h_test_strings.size(); + thrust::host_vector h_buffer(memsize); + thrust::host_vector h_offsets(count+1); + cudf::size_type offset = 0; + h_offsets[0] = 0; + for( cudf::size_type idx=0; idx < count; ++idx ) + { + const char* str = h_test_strings[idx]; + if( str ) + { + cudf::size_type length = (cudf::size_type)strlen(str); + memcpy( h_buffer.data() + offset, str, length ); + offset += length; + } + h_offsets[idx+1] = offset; + } + rmm::device_vector d_buffer(h_buffer); + rmm::device_vector d_offsets(h_offsets); + rmm::device_vector d_nulls; + auto column = cudf::make_strings_column( d_buffer, d_offsets, d_nulls, 0 ); + EXPECT_EQ(column->type(), cudf::data_type{cudf::STRING}); + EXPECT_EQ(column->null_count(), 0); + EXPECT_EQ(2, column->num_children()); + + cudf::strings_column_view strings_view(column->view()); + EXPECT_EQ( strings_view.size(), count); + EXPECT_EQ( strings_view.offsets().size(), count ); + EXPECT_EQ( strings_view.chars().size(), memsize ); + + // check string data + auto strings_data = cudf::strings::create_offsets(strings_view); + thrust::host_vector h_chars_data(strings_data.first); + thrust::host_vector h_offsets_data(strings_data.second); + EXPECT_EQ( memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0 ); + EXPECT_EQ( memcmp(h_offsets.data(), h_offsets_data.data(), h_offsets.size()*sizeof(cudf::size_type)), 0); +} diff --git a/cpp/tests/strings/utilities.cu b/cpp/tests/strings/utilities.cu new file mode 100644 index 00000000000..f1d384aa4d6 --- /dev/null +++ b/cpp/tests/strings/utilities.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "./utilities.h" + +#include + +namespace cudf { +namespace test { + +// +std::unique_ptr create_strings_column( const std::vector& h_strings ) +{ + cudf::size_type memsize = 0; + for( auto itr=h_strings.begin(); itr!=h_strings.end(); ++itr ) + memsize += *itr ? (cudf::size_type)strlen(*itr) : 0; + cudf::size_type count = (cudf::size_type)h_strings.size(); + thrust::host_vector h_buffer(memsize); + thrust::device_vector d_buffer(memsize); + thrust::host_vector > strings(count); + cudf::size_type offset = 0; + for( cudf::size_type idx=0; idx < count; ++idx ) + { + const char* str = h_strings[idx]; + if( !str ) + strings[idx] = thrust::pair{nullptr,0}; + else + { + cudf::size_type length = (cudf::size_type)strlen(str); + memcpy( h_buffer.data() + offset, str, length ); + strings[idx] = thrust::pair{d_buffer.data().get()+offset,(size_t)length}; + offset += length; + } + } + rmm::device_vector> d_strings(strings); + cudaMemcpy( d_buffer.data().get(), h_buffer.data(), memsize, cudaMemcpyHostToDevice ); + return cudf::make_strings_column( d_strings ); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/strings/utilities.h b/cpp/tests/strings/utilities.h new file mode 100644 index 00000000000..76f02f72312 --- /dev/null +++ b/cpp/tests/strings/utilities.h @@ -0,0 +1,19 @@ + + +#include + +#include + +namespace cudf { +namespace test { + 
+/**---------------------------------------------------------------------------* + * @brief Utility for creating a strings column from a vector of host strings + * + * @param h_strings Pointer to null-terminated, UTF-8 encode chars arrays. + * @return column instance of type STRING + *---------------------------------------------------------------------------**/ +std::unique_ptr create_strings_column( const std::vector& h_strings ); + +} // namespace test +} // namespace cudf \ No newline at end of file From 69e43bde7fccbdcf78dccd40d4b5e48718543d08 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 25 Sep 2019 11:47:04 -0400 Subject: [PATCH 15/54] gtests for strings column attributes --- cpp/src/strings/attributes.cu | 19 +++++++++---------- cpp/tests/strings/attributes_tests.cu | 9 ++++++--- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 5a53ee7728a..9290da1341d 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -38,12 +38,12 @@ std::unique_ptr characters_counts( strings_column_view strings, auto d_column = *strings_column; cudf::size_type null_count = d_column.null_count(); // create output column - auto result = std::make_unique( data_type{INT32}, count, + auto results = std::make_unique( data_type{INT32}, count, rmm::device_buffer(count * sizeof(int32_t), stream, mr), rmm::device_buffer(d_column.null_mask(), gdf_valid_allocation_size(count), stream, mr), d_column.null_count()); - auto results_view = result->mutable_view(); + auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); // set lengths thrust::transform( execpol->on(stream), @@ -55,8 +55,8 @@ std::unique_ptr characters_counts( strings_column_view strings, return 0; return d_column.element(idx).characters(); }); - results_view.set_null_count(null_count); - return result; + results->set_null_count(null_count); + return results; } std::unique_ptr bytes_counts( 
strings_column_view strings, @@ -74,10 +74,10 @@ std::unique_ptr bytes_counts( strings_column_view strings, gdf_valid_allocation_size(count), stream, mr); // create output column - auto result = std::make_unique( data_type{INT32}, count, + auto results = std::make_unique( data_type{INT32}, count, rmm::device_buffer(count * sizeof(int32_t), stream, mr), null_mask, null_count); - auto results_view = result->mutable_view(); + auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); // set sizes thrust::transform( execpol->on(stream), @@ -89,10 +89,9 @@ std::unique_ptr bytes_counts( strings_column_view strings, return 0; return d_column.element(idx).size(); }); - // reset the null count - results_view.set_null_count(null_count); - printf("size=%d, null_count=%d\n", count, null_count); - return result; + // reset null count must be done on the column and not the view + results->set_null_count(null_count); + return results; } // diff --git a/cpp/tests/strings/attributes_tests.cu b/cpp/tests/strings/attributes_tests.cu index 749c52060c0..af958bb86b9 100644 --- a/cpp/tests/strings/attributes_tests.cu +++ b/cpp/tests/strings/attributes_tests.cu @@ -35,16 +35,17 @@ TEST_F(AttrsTest, BytesCounts) { std::vector h_test_strings{ "xyz", "", "aé", nullptr, "bbb", "éé" }; std::vector h_bytes{ 3, 0, 3, 0, 3, 4 }; + std::vector h_nbits{ 0x0037 }; auto strings = cudf::test::create_strings_column(h_test_strings); auto strings_view = cudf::strings_column_view(strings->view()); cudf::size_type count = strings_view.size(); - cudf::strings::print(strings_view); auto column = cudf::strings::bytes_counts(strings_view); rmm::device_vector d_expected(h_bytes); + rmm::device_vector d_nbits(h_nbits); cudf::column_view column_expected( cudf::data_type{cudf::INT32}, count, - d_expected.data().get(), nullptr, 0 ); + d_expected.data().get(), d_nbits.data().get(), 1 ); cudf::test::expect_columns_equal(column->view(), column_expected); } @@ -52,6 +53,7 @@ TEST_F(AttrsTest, 
CharactersCounts) { std::vector h_test_strings{ "xyz", "", "aé", nullptr, "bbb", "éé" }; std::vector h_characters{ 3, 0, 2, 0, 3, 2 }; + std::vector h_nbits{ 0x0037 }; auto strings = cudf::test::create_strings_column(h_test_strings); auto strings_view = cudf::strings_column_view(strings->view()); @@ -59,8 +61,9 @@ TEST_F(AttrsTest, CharactersCounts) auto column = cudf::strings::characters_counts(strings_view); rmm::device_vector d_expected(h_characters); + rmm::device_vector d_nbits(h_nbits); cudf::column_view column_expected( cudf::data_type{cudf::INT32}, count, - d_expected.data().get(), nullptr, 0 ); + d_expected.data().get(), d_nbits.data().get(), 1 ); cudf::test::expect_columns_equal(column->view(), column_expected); } From 6600614d6173022ab595dc42187d8e8699b81f47 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 25 Sep 2019 11:56:49 -0400 Subject: [PATCH 16/54] finish gtests for attributes --- cpp/src/strings/attributes.cu | 20 ++++++++++++-------- cpp/tests/strings/attributes_tests.cu | 20 ++++++++++++++++---- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 9290da1341d..11df7f2dd51 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -36,13 +36,16 @@ std::unique_ptr characters_counts( strings_column_view strings, auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; + rmm::device_buffer null_mask; cudf::size_type null_count = d_column.null_count(); + if( d_column.nullable() ) + null_mask = rmm::device_buffer( d_column.null_mask(), + gdf_valid_allocation_size(count), + stream, mr); // create output column auto results = std::make_unique( data_type{INT32}, count, rmm::device_buffer(count * sizeof(int32_t), stream, mr), - rmm::device_buffer(d_column.null_mask(), gdf_valid_allocation_size(count), - stream, mr), - d_column.null_count()); + null_mask, 
null_count); auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); // set lengths @@ -123,10 +126,10 @@ std::unique_ptr code_points( strings_column_view strings, // the size is the last element from an inclusive-scan size_type size = offsets.back(); // create output column - auto result = make_numeric_column( data_type{INT32}, size, - mask_state::UNALLOCATED, - stream, mr ); - auto results_view = result->mutable_view(); + auto results = make_numeric_column( data_type{INT32}, size, + mask_state::UNALLOCATED, + stream, mr ); + auto results_view = results->mutable_view(); auto d_results = results_view.data(); // now set the ranges from each strings' character values thrust::for_each_n(execpol->on(stream), @@ -141,7 +144,8 @@ std::unique_ptr code_points( strings_column_view strings, // *result++ = (unsigned int)*itr; }); // - return result; + results->set_null_count(0); // no nulls here + return results; } } // namespace strings diff --git a/cpp/tests/strings/attributes_tests.cu b/cpp/tests/strings/attributes_tests.cu index af958bb86b9..b2261cbbd6a 100644 --- a/cpp/tests/strings/attributes_tests.cu +++ b/cpp/tests/strings/attributes_tests.cu @@ -39,12 +39,11 @@ TEST_F(AttrsTest, BytesCounts) auto strings = cudf::test::create_strings_column(h_test_strings); auto strings_view = cudf::strings_column_view(strings->view()); - cudf::size_type count = strings_view.size(); auto column = cudf::strings::bytes_counts(strings_view); rmm::device_vector d_expected(h_bytes); rmm::device_vector d_nbits(h_nbits); - cudf::column_view column_expected( cudf::data_type{cudf::INT32}, count, + cudf::column_view column_expected( cudf::data_type{cudf::INT32}, d_expected.size(), d_expected.data().get(), d_nbits.data().get(), 1 ); cudf::test::expect_columns_equal(column->view(), column_expected); } @@ -57,13 +56,26 @@ TEST_F(AttrsTest, CharactersCounts) auto strings = cudf::test::create_strings_column(h_test_strings); auto strings_view = 
cudf::strings_column_view(strings->view()); - cudf::size_type count = strings_view.size(); auto column = cudf::strings::characters_counts(strings_view); rmm::device_vector d_expected(h_characters); rmm::device_vector d_nbits(h_nbits); - cudf::column_view column_expected( cudf::data_type{cudf::INT32}, count, + cudf::column_view column_expected( cudf::data_type{cudf::INT32}, d_expected.size(), d_expected.data().get(), d_nbits.data().get(), 1 ); cudf::test::expect_columns_equal(column->view(), column_expected); } +TEST_F(AttrsTest, CodePoints) +{ + std::vector h_test_strings{ "xyz", "", "aé", nullptr, "bbb", "éé" }; + std::vector h_codepoints{ 120, 121, 122, 97, 50089, 98, 98, 98, 50089, 50089 }; + + auto strings = cudf::test::create_strings_column(h_test_strings); + auto strings_view = cudf::strings_column_view(strings->view()); + + auto column = cudf::strings::code_points(strings_view); + rmm::device_vector d_expected(h_codepoints); + cudf::column_view column_expected( cudf::data_type{cudf::INT32}, d_expected.size(), + d_expected.data().get(), nullptr, 0 ); + cudf::test::expect_columns_equal(column->view(), column_expected); +} From 80d0c1c09a771aca705aeed12242a8bef86f2522 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 25 Sep 2019 18:02:00 -0400 Subject: [PATCH 17/54] array gtests --- .../cudf/column/column_device_view.cuh | 69 +++++++---- cpp/src/column/column_device_view.cu | 59 ++++++++- cpp/src/strings/array.cu | 2 +- cpp/src/strings/attributes.cu | 8 +- cpp/src/table/table_device_view.cu | 41 +++++-- cpp/tests/CMakeLists.txt | 2 + cpp/tests/strings/array_tests.cu | 112 ++++++++++++++++++ cpp/tests/strings/attributes_tests.cu | 3 - cpp/tests/strings/combine_tests.cu | 49 ++++++++ cpp/tests/strings/utilities.cu | 50 +++++++- cpp/tests/strings/utilities.h | 29 ++++- 11 files changed, 376 insertions(+), 48 deletions(-) create mode 100644 cpp/tests/strings/array_tests.cu create mode 100644 cpp/tests/strings/combine_tests.cu diff --git 
a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index bcb3c3c7a1a..be1daeb4c00 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -209,6 +209,8 @@ class alignas(16) column_device_view : public detail::column_device_view_base { column_device_view& operator=(column_device_view const&) = default; column_device_view& operator=(column_device_view&&) = default; + // + column_device_view( column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr ); /**---------------------------------------------------------------------------* * @brief Returns reference to element at the specified index. @@ -254,6 +256,15 @@ class alignas(16) column_device_view : public detail::column_device_view_base { *---------------------------------------------------------------------------**/ void destroy(); + /**---------------------------------------------------------------------------* + * @brief Return the amount of memory needed to hold this instance in + * contiguous memory block. This accounts for the children as well as + * the object itself. + * + * @param source_view The `column_view` to use for this calculation. + *---------------------------------------------------------------------------**/ + static size_type extent(column_view source_view); + /**---------------------------------------------------------------------------* * @brief Returns the specified child * @@ -298,6 +309,8 @@ class alignas(16) mutable_column_device_view default; mutable_column_device_view& operator=(mutable_column_device_view&&) = default; + mutable_column_device_view( mutable_column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr ); + /**---------------------------------------------------------------------------* * @brief Factory to construct a column view that is usable in device memory. 
* @@ -429,6 +442,15 @@ class alignas(16) mutable_column_device_view null_mask()[element_index] = new_element; } + /**---------------------------------------------------------------------------* + * @brief Return the amount of memory needed to hold this instance in + * contiguous memory block. This accounts for the children as well as + * the object itself. + * + * @param source_view The `column_view` to use for this calculation. + *---------------------------------------------------------------------------**/ + static size_type extent(column_view source_view); + private: mutable_column_device_view* mutable_children{}; ///< Array of `mutable_column_device_view` @@ -454,31 +476,32 @@ class alignas(16) mutable_column_device_view * allocated to hold the child views. *---------------------------------------------------------------------------**/ void destroy(); -}; - /**---------------------------------------------------------------------------* - * @brief Returns `string_view` to the string element at the specified index. - * - * This function accounts for the offset. Do not call this for a null element. - * - * @param element_index Position of the desired string - * @return string_view instance representing this element at this index - *---------------------------------------------------------------------------**/ - template <> - __device__ inline string_view const column_device_view::element( - size_type element_index) const noexcept { - size_type index = element_index + _offset; // account for this view's _offset - const int32_t* d_offsets = d_children[0].data(); - const char* d_strings = d_children[1].data(); - size_type offset = index ? 
d_offsets[index-1] : 0; - return string_view{d_strings + offset, d_offsets[index] - offset}; - } +}; - //template <> - //__device__ inline string_view mutable_column_device_view::element( - // size_type element_index) noexcept { - // return string_view{}; - //} +/**---------------------------------------------------------------------------* + * @brief Returns `string_view` to the string element at the specified index. + * + * This function accounts for the offset. Do not call this for a null element. + * + * @param element_index Position of the desired string + * @return string_view instance representing this element at this index + *---------------------------------------------------------------------------**/ +template <> +__device__ inline string_view const column_device_view::element( + size_type element_index) const noexcept { + size_type index = element_index + _offset; // account for this view's _offset + const int32_t* d_offsets = d_children[0].data(); + const char* d_strings = d_children[1].data(); + size_type offset = index ? 
d_offsets[index-1] : 0; + return string_view{d_strings + offset, d_offsets[index] - offset}; +} + +//template <> +//__device__ inline string_view mutable_column_device_view::element( +// size_type element_index) noexcept { +// return string_view{}; +//} } // namespace cudf diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 15c33abaa68..87c8e78b9a8 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -38,29 +38,76 @@ void column_device_view::destroy() { delete this; } +// For use with inplace-new to pre-fill memory to be copied to device +column_device_view::column_device_view( column_view source, ptrdiff_t h_ptr, ptrdiff_t d_ptr ) + : detail::column_device_view_base{source.type(), source.size(), + source.head(), source.null_mask(), + source.null_count(), source.offset()}, + _num_children{source.num_children()} +{ + if( _num_children > 0 ) + { + column_device_view* h_column = reinterpret_cast(h_ptr); + column_device_view* d_column = reinterpret_cast(d_ptr); + d_children = d_column; + for( size_type idx=0; idx < _num_children; ++idx ) + { // inplace-new each child + column_view child = source.child(idx); + CUDF_EXPECTS( child.num_children()==0, "column grand-children not currently supported"); + new(h_column) column_device_view(child); + h_column++; + //d_column++; + } + } +} + +// For use with inplace-new to pre-fill memory to be copied to device +mutable_column_device_view::mutable_column_device_view( mutable_column_view source, ptrdiff_t h_ptr, ptrdiff_t d_ptr ) + : detail::column_device_view_base{source.type(), source.size(), + source.head(), source.null_mask(), + source.null_count(), source.offset()} +{} + // Construct a unique_ptr that invokes `destroy()` as it's deleter std::unique_ptr> column_device_view::create(column_view source, cudaStream_t stream) { - size_type num_descendants{count_descendants(source)}; + //size_type num_descendants{count_descendants(source)}; //if( 
num_descendants > 0 ) { // CUDF_FAIL("Columns with children are not currently supported."); // } auto deleter = [](column_device_view* v) { v->destroy(); }; std::unique_ptr p{ new column_device_view(source), deleter}; - if( num_descendants > 0 ) + size_type num_children = source.num_children(); + if( num_children > 0 ) { // ignore grand-children right now - RMM_ALLOC(&p->d_children, sizeof(column_device_view)*num_descendants, stream); - for( size_type idx=0; idx < num_descendants; ++idx ) + RMM_ALLOC(&p->d_children, sizeof(column_device_view)*num_children, stream); + for( size_type idx=0; idx < num_children; ++idx ) { column_device_view child(source.child(idx)); CUDF_EXPECTS( child._num_children==0, "column grand-children not currently supported"); - cudaMemcpyAsync(p->d_children+idx, &child, sizeof(column_device_view), cudaMemcpyHostToDevice, stream); + CUDA_TRY(cudaMemcpyAsync(p->d_children+idx, &child, sizeof(column_device_view), + cudaMemcpyHostToDevice, stream)); } - p->_num_children = num_descendants; + p->_num_children = num_children; cudaStreamSynchronize(stream); } return p; } +size_type column_device_view::extent(column_view source) { + size_type data_size = sizeof(column_device_view); + for( size_type idx=0; idx < source.num_children(); ++idx ) + data_size += extent(source.child(idx)); + return data_size; +} + +size_type mutable_column_device_view::extent(column_view source) { + size_type data_size = sizeof(column_device_view); + for( size_type idx=0; idx < source.num_children(); ++idx ) + data_size += extent(source.child(idx)); + return data_size; +} + + } // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index fe4d09e45e4..cf2d76bc49b 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -45,7 +45,7 @@ std::unique_ptr sublist( strings_column_view handler, end = count; if( start < 0 || start > end ) throw std::invalid_argument("invalid start parameter"); - count = (end - 
start)/step +1; + count = (end - start)/step; // auto execpol = rmm::exec_policy(stream); // build indices diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 11df7f2dd51..7861d779748 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -23,7 +23,7 @@ #include #include -namespace cudf +namespace cudf { namespace strings { @@ -49,7 +49,7 @@ std::unique_ptr characters_counts( strings_column_view strings, auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); // set lengths - thrust::transform( execpol->on(stream), + thrust::transform( execpol->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), d_lengths, @@ -83,7 +83,7 @@ std::unique_ptr bytes_counts( strings_column_view strings, auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); // set sizes - thrust::transform( execpol->on(stream), + thrust::transform( execpol->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), d_lengths, @@ -121,7 +121,7 @@ std::unique_ptr code_points( strings_column_view strings, return d_column.element(idx).characters(); }, thrust::plus()); - + // need the total size to build the column // the size is the last element from an inclusive-scan size_type size = offsets.back(); diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 30b284fc6e6..093b00dabb6 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -21,6 +21,7 @@ #include +#include #include #include @@ -40,18 +41,40 @@ table_device_view_base::table_device_view_base( _num_columns{source_view.num_columns()}, _stream{stream} { if (source_view.num_columns() > 0) { - size_type total_descendants = +// size_type total_descendants = +// std::accumulate(source_view.begin(), source_view.end(), 0, +// [](size_type init, column_view col) { +// return init + count_descendants(col); +// }); +// 
CUDF_EXPECTS(0 == total_descendants, +// "Columns with descendants are not yet supported."); +// auto views_size_bytes = +// source_view.num_columns() * sizeof(*source_view.begin()); + size_type views_size_bytes = std::accumulate(source_view.begin(), source_view.end(), 0, - [](size_type init, column_view col) { - return init + count_descendants(col); - }); - CUDF_EXPECTS(0 == total_descendants, - "Columns with descendants are not yet supported."); + [](size_type init, column_view col) { + return init + ColumnDeviceView::extent(col); + }); + + //CUDA_TRY(cudaMemcpyAsync(_columns, &(*source_view.begin()), + // views_size_bytes, cudaMemcpyDefault, stream)); - auto views_size_bytes = - source_view.num_columns() * sizeof(*source_view.begin()); + std::vector h_buffer(views_size_bytes); + ColumnDeviceView* h_column = reinterpret_cast(h_buffer.data()); + int8_t* h_end = (int8_t*)(h_column + _num_columns); RMM_TRY(RMM_ALLOC(&_columns, views_size_bytes, stream)); - CUDA_TRY(cudaMemcpyAsync(_columns, &(*source_view.begin()), + ColumnDeviceView* d_column = _columns; + int8_t* d_end = (int8_t*)(d_column + _num_columns); + for( size_type idx=0; idx < _num_columns; ++idx ) + { + auto col = source_view.column(idx); + new(h_column) ColumnDeviceView(col,(ptrdiff_t)h_end,(ptrdiff_t)d_end); + h_column++; + h_end += (ColumnDeviceView::extent(col)); + d_end += (ColumnDeviceView::extent(col)); + } + + CUDA_TRY(cudaMemcpyAsync(_columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream)); } } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6f4320ad682..1e030ea36a2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -481,7 +481,9 @@ ConfigureTest(DISPATCHER_TEST "${DISPATCHER_TEST_SRC}") set(STRINGS_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/strings/factories_test.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/strings/array_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/strings/attributes_tests.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/strings/combine_tests.cu" 
"${CMAKE_CURRENT_SOURCE_DIR}/strings/utilities.cu") ConfigureTest(STRINGS_TEST "${STRINGS_TEST_SRC}") diff --git a/cpp/tests/strings/array_tests.cu b/cpp/tests/strings/array_tests.cu new file mode 100644 index 00000000000..1e04744336e --- /dev/null +++ b/cpp/tests/strings/array_tests.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include "./utilities.h" + +#include + + +struct ArrayTest : public GdfTest {}; + +TEST_F(ArrayTest, Sort) +{ + std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; + std::vector h_expected{ nullptr, "", "aa", "bb", "bbb", "eee", "ééé" }; + + auto d_strings = cudf::test::create_strings_column(h_strings); + auto strings_view = cudf::strings_column_view(d_strings->view()); + + auto results = cudf::strings::sort(strings_view, cudf::strings::name); + auto results_view = cudf::strings_column_view(results->view()); + + auto d_expected = cudf::test::create_strings_column(h_expected); + auto expected_view = cudf::strings_column_view(d_expected->view()); + + cudf::test::expect_strings_columns_equal(results_view, expected_view); +} + +TEST_F(ArrayTest, Sublist) +{ + std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; + std::vector h_expected{ "", "aa", "bbb", "ééé" }; + + auto d_strings = cudf::test::create_strings_column(h_strings); + auto strings_view = 
cudf::strings_column_view(d_strings->view()); + + auto results = cudf::strings::sublist(strings_view,3); + auto results_view = cudf::strings_column_view(results->view()); + + auto d_expected = cudf::test::create_strings_column(h_expected); + auto expected_view = cudf::strings_column_view(d_expected->view()); + + cudf::test::expect_strings_columns_equal(results_view, expected_view); +} + +TEST_F(ArrayTest, Gather) +{ + std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; + std::vector h_expected{ "aa", "bb" }; + + auto d_strings = cudf::test::create_strings_column(h_strings); + auto strings_view = cudf::strings_column_view(d_strings->view()); + + rmm::device_vector gather_map(2,0); + gather_map[0] = 4; + gather_map[1] = 1; + cudf::column_view gather_map_view( cudf::data_type{cudf::INT32}, gather_map.size(), + gather_map.data().get(), nullptr, 0); + + auto results = cudf::strings::gather(strings_view,gather_map_view); + auto results_view = cudf::strings_column_view(results->view()); + + auto d_expected = cudf::test::create_strings_column(h_expected); + auto expected_view = cudf::strings_column_view(d_expected->view()); + + cudf::test::expect_strings_columns_equal(results_view, expected_view); +} + +TEST_F(ArrayTest, Scatter) +{ + std::vector h_strings1{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; + std::vector h_strings2{ "1", "22" }; + std::vector h_expected{ "eee", "22", nullptr, "", "1", "bbb", "ééé" }; + + auto d_strings1 = cudf::test::create_strings_column(h_strings1); + auto view1 = cudf::strings_column_view(d_strings1->view()); + auto d_strings2 = cudf::test::create_strings_column(h_strings2); + auto view2 = cudf::strings_column_view(d_strings2->view()); + + rmm::device_vector scatter_map(2,0); + scatter_map[0] = 4; + scatter_map[1] = 1; + cudf::column_view scatter_map_view( cudf::data_type{cudf::INT32}, scatter_map.size(), + scatter_map.data().get(), nullptr, 0); + + auto results = cudf::strings::scatter(view1,view2,scatter_map_view); + 
auto results_view = cudf::strings_column_view(results->view()); + + auto d_expected = cudf::test::create_strings_column(h_expected); + auto expected_view = cudf::strings_column_view(d_expected->view()); + + cudf::test::expect_strings_columns_equal(results_view, expected_view); +} diff --git a/cpp/tests/strings/attributes_tests.cu b/cpp/tests/strings/attributes_tests.cu index b2261cbbd6a..59f2a283c49 100644 --- a/cpp/tests/strings/attributes_tests.cu +++ b/cpp/tests/strings/attributes_tests.cu @@ -14,9 +14,7 @@ * limitations under the License. */ -#include #include -#include #include #include @@ -25,7 +23,6 @@ #include "./utilities.h" #include -#include struct AttrsTest : public GdfTest {}; diff --git a/cpp/tests/strings/combine_tests.cu b/cpp/tests/strings/combine_tests.cu new file mode 100644 index 00000000000..c7bff891245 --- /dev/null +++ b/cpp/tests/strings/combine_tests.cu @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include "./utilities.h" + +#include + + +struct CombineTest : public GdfTest {}; + +TEST_F(CombineTest, Concatenate) +{ + std::vector h_strings1{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; + std::vector h_strings2{ "xyz", "abc", "d", "éa", "", nullptr, "f" }; + std::vector h_expected{ "eeexyz", "bbabc", nullptr, "éa", "aa", nullptr, "éééf" }; + + auto d_strings1 = cudf::test::create_strings_column(h_strings1); + auto view1 = cudf::strings_column_view(d_strings1->view()); + auto d_strings2 = cudf::test::create_strings_column(h_strings2); + auto view2 = cudf::strings_column_view(d_strings2->view()); + + auto results = cudf::strings::concatenate(view1,view2); + auto results_view = cudf::strings_column_view(results->view()); + + auto d_expected = cudf::test::create_strings_column(h_expected); + auto expected_view = cudf::strings_column_view(d_expected->view()); + + cudf::test::expect_strings_columns_equal(results_view, expected_view); +} diff --git a/cpp/tests/strings/utilities.cu b/cpp/tests/strings/utilities.cu index f1d384aa4d6..723bc9e76e6 100644 --- a/cpp/tests/strings/utilities.cu +++ b/cpp/tests/strings/utilities.cu @@ -14,10 +14,17 @@ * limitations under the License. 
*/ -#include #include "./utilities.h" +#include +#include +#include + #include +#include +#include + +#include namespace cudf { namespace test { @@ -51,5 +58,46 @@ std::unique_ptr create_strings_column( const std::vector(lidx); + cudf::string_view rstr = d_rhs.element(ridx); + return lstr.compare(rstr)==0; + } + column_device_view d_lhs; + column_device_view d_rhs; +}; + +// +void expect_strings_columns_equal(cudf::strings_column_view lhs, cudf::strings_column_view rhs) +{ + EXPECT_EQ(lhs.size(), rhs.size()); + EXPECT_EQ(lhs.null_count(), rhs.null_count()); + + // this almost works + //auto d_lhs = cudf::table_device_view::create(table_view{{lhs.parent()}}); + //auto d_rhs = cudf::table_device_view::create(table_view{{rhs.parent()}}); + //EXPECT_TRUE( + // thrust::equal(thrust::device, thrust::make_counting_iterator(0), + // thrust::make_counting_iterator(lhs.size()), + // thrust::make_counting_iterator(0), + // cudf::exp::row_equality_comparator{*d_lhs, *d_rhs})); + //CUDA_TRY(cudaDeviceSynchronize()); + + auto col_lhs = column_device_view::create(lhs.parent()); + auto col_rhs = column_device_view::create(rhs.parent()); + + EXPECT_TRUE( + thrust::equal(thrust::device, thrust::make_counting_iterator(0), + thrust::make_counting_iterator((int)lhs.size()), + thrust::make_counting_iterator(0), + compare_strings_fn{*col_lhs,*col_rhs})); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/strings/utilities.h b/cpp/tests/strings/utilities.h index 76f02f72312..cadd40d73ad 100644 --- a/cpp/tests/strings/utilities.h +++ b/cpp/tests/strings/utilities.h @@ -1,6 +1,22 @@ - +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once #include +#include #include @@ -15,5 +31,16 @@ namespace test { *---------------------------------------------------------------------------**/ std::unique_ptr create_strings_column( const std::vector& h_strings ); +/**---------------------------------------------------------------------------* + * @brief Verifies the element-wise equality of two strings columns. + * + * Treats null elements as equivalent. + * Based on `expect_columns_equal()` in tests/utilities/column_utilities.cu + * + * @param lhs The first column + * @param rhs The second column + *---------------------------------------------------------------------------**/ +void expect_strings_columns_equal(cudf::strings_column_view lhs, cudf::strings_column_view rhs); + } // namespace test } // namespace cudf \ No newline at end of file From cfd9bbc4ce50cccf861e28f1de5f4faf66769ed3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 26 Sep 2019 15:40:31 -0400 Subject: [PATCH 18/54] finish gtests --- cpp/src/strings/array.cu | 6 +++ cpp/src/strings/combine.cu | 2 + cpp/src/strings/strings_column_factories.cu | 2 + cpp/src/table/table_device_view.cu | 10 +--- cpp/tests/strings/array_tests.cu | 53 +++++++++++++++++---- cpp/tests/strings/utilities.cu | 10 ++-- 6 files changed, 61 insertions(+), 22 deletions(-) diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index cf2d76bc49b..af728583d56 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -102,6 +102,8 @@ std::unique_ptr gather( strings_column_view handler, // build chars column 
size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count-1]; // this may not be stream friendly + if( (bytes==0) && (null_count < count) ) + bytes = 1; // all entries are empty strings auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); auto chars_view = chars_column->mutable_view(); @@ -207,6 +209,8 @@ std::unique_ptr scatter( strings_column_view strings, // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; // this may not be stream friendly + if( (bytes==0) && (null_count < count) ) + bytes = 1; // all entries are empty strings auto chars_column = detail::chars_from_string_array(strings_array, d_offsets, stream, mr); @@ -269,6 +273,8 @@ std::unique_ptr scatter( strings_column_view handler, // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; + if( (bytes==0) && (null_count < count) ) + bytes = 1; // all entries are empty strings auto chars_column = detail::chars_from_string_array(strings, d_offsets, stream, mr); diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index bdc88c65f23..78d30674c7e 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -112,6 +112,8 @@ std::unique_ptr concatenate( strings_column_view strings, // build chars column size_type bytes = thrust::device_pointer_cast(d_results_offsets)[count-1]; // this may not be stream friendly + if( (bytes==0) && (null_count < count) ) + bytes = 1; // all entries are empty strings auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); auto chars_view = chars_column->mutable_view(); diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 16702c7ef31..16a85492a39 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -75,6 +75,8 @@ std::unique_ptr make_strings_column( 
rmm::device_buffer null_mask(valid_mask.first, gdf_valid_allocation_size(count), stream, mr); RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future + if( (bytes==0) && (null_count < count) ) + bytes = 1; // all entries are empty strings // build chars column auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 093b00dabb6..eeae71e77d3 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -41,15 +41,7 @@ table_device_view_base::table_device_view_base( _num_columns{source_view.num_columns()}, _stream{stream} { if (source_view.num_columns() > 0) { -// size_type total_descendants = -// std::accumulate(source_view.begin(), source_view.end(), 0, -// [](size_type init, column_view col) { -// return init + count_descendants(col); -// }); -// CUDF_EXPECTS(0 == total_descendants, -// "Columns with descendants are not yet supported."); -// auto views_size_bytes = -// source_view.num_columns() * sizeof(*source_view.begin()); + // size_type views_size_bytes = std::accumulate(source_view.begin(), source_view.end(), 0, [](size_type init, column_view col) { diff --git a/cpp/tests/strings/array_tests.cu b/cpp/tests/strings/array_tests.cu index 1e04744336e..1c5993357b5 100644 --- a/cpp/tests/strings/array_tests.cu +++ b/cpp/tests/strings/array_tests.cu @@ -18,15 +18,14 @@ #include #include -#include -#include -#include +#include #include "./utilities.h" #include +#include -struct ArrayTest : public GdfTest {}; +struct ArrayTest : public cudf::test::BaseFixture {}; TEST_F(ArrayTest, Sort) { @@ -45,15 +44,25 @@ TEST_F(ArrayTest, Sort) cudf::test::expect_strings_columns_equal(results_view, expected_view); } -TEST_F(ArrayTest, Sublist) +class ArrayTestParms1 : public ArrayTest, + public testing::WithParamInterface {}; + +TEST_P(ArrayTestParms1, Sublist) { std::vector 
h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; - std::vector h_expected{ "", "aa", "bbb", "ééé" }; + cudf::size_type start = 3; + cudf::size_type end = GetParam(); + std::vector h_expected; + if( end > start ) + { + for( cudf::size_type idx=start; (idx < end) && (idx < (cudf::size_type)h_strings.size()); ++idx ) + h_expected.push_back( h_strings[idx] ); + } auto d_strings = cudf::test::create_strings_column(h_strings); auto strings_view = cudf::strings_column_view(d_strings->view()); - auto results = cudf::strings::sublist(strings_view,3); + auto results = cudf::strings::sublist(strings_view,start,end); auto results_view = cudf::strings_column_view(results->view()); auto d_expected = cudf::test::create_strings_column(h_expected); @@ -62,6 +71,9 @@ TEST_F(ArrayTest, Sublist) cudf::test::expect_strings_columns_equal(results_view, expected_view); } +INSTANTIATE_TEST_CASE_P(SublistParms, ArrayTestParms1, + testing::ValuesIn(std::array{5,6,7})); + TEST_F(ArrayTest, Gather) { std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; @@ -72,7 +84,7 @@ TEST_F(ArrayTest, Gather) rmm::device_vector gather_map(2,0); gather_map[0] = 4; - gather_map[1] = 1; + gather_map[1] = 1; cudf::column_view gather_map_view( cudf::data_type{cudf::INT32}, gather_map.size(), gather_map.data().get(), nullptr, 0); @@ -98,7 +110,7 @@ TEST_F(ArrayTest, Scatter) rmm::device_vector scatter_map(2,0); scatter_map[0] = 4; - scatter_map[1] = 1; + scatter_map[1] = 1; cudf::column_view scatter_map_view( cudf::data_type{cudf::INT32}, scatter_map.size(), scatter_map.data().get(), nullptr, 0); @@ -110,3 +122,26 @@ TEST_F(ArrayTest, Scatter) cudf::test::expect_strings_columns_equal(results_view, expected_view); } + +TEST_F(ArrayTest, ScatterScalar) +{ + std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; + std::vector h_expected{ "eee", "---", nullptr, "", "---", "bbb", "ééé" }; + + auto d_strings = cudf::test::create_strings_column(h_strings); + auto view = 
cudf::strings_column_view(d_strings->view()); + + rmm::device_vector scatter_map(2,0); + scatter_map[0] = 4; + scatter_map[1] = 1; + cudf::column_view scatter_map_view( cudf::data_type{cudf::INT32}, scatter_map.size(), + scatter_map.data().get(), nullptr, 0); + + auto results = cudf::strings::scatter(view,"---",scatter_map_view); + auto results_view = cudf::strings_column_view(results->view()); + + auto d_expected = cudf::test::create_strings_column(h_expected); + auto expected_view = cudf::strings_column_view(d_expected->view()); + + cudf::test::expect_strings_columns_equal(results_view, expected_view); +} diff --git a/cpp/tests/strings/utilities.cu b/cpp/tests/strings/utilities.cu index 723bc9e76e6..2466b2d8aea 100644 --- a/cpp/tests/strings/utilities.cu +++ b/cpp/tests/strings/utilities.cu @@ -35,6 +35,8 @@ std::unique_ptr create_strings_column( const std::vector h_buffer(memsize); thrust::device_vector d_buffer(memsize); @@ -62,9 +64,9 @@ struct compare_strings_fn { __device__ bool operator()(int lidx, int ridx) { - if( d_lhs.nullable() && d_lhs.is_null(lidx) && - d_rhs.nullable() && d_rhs.is_null(ridx) ) - return true; + if( (d_lhs.nullable() && d_lhs.is_null(lidx)) || + (d_rhs.nullable() && d_rhs.is_null(ridx)) ) + return d_lhs.is_null(lidx)==d_rhs.is_null(ridx); cudf::string_view lstr = d_lhs.element(lidx); cudf::string_view rstr = d_rhs.element(ridx); return lstr.compare(rstr)==0; @@ -74,7 +76,7 @@ struct compare_strings_fn }; // -void expect_strings_columns_equal(cudf::strings_column_view lhs, cudf::strings_column_view rhs) +void expect_strings_columns_equal(cudf::strings_column_view lhs, cudf::strings_column_view rhs) { EXPECT_EQ(lhs.size(), rhs.size()); EXPECT_EQ(lhs.null_count(), rhs.null_count()); From 02a00c830957704963874d9f3035ca68919d7bed Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 30 Sep 2019 09:04:40 -0400 Subject: [PATCH 19/54] improve gather scan call --- cpp/src/strings/array.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 
deletions(-) diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index af728583d56..62e6d6fff49 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -19,9 +19,11 @@ #include #include #include +#include #include "./utilities.h" #include +#include #include #include #include @@ -77,15 +79,14 @@ std::unique_ptr gather( strings_column_view handler, auto d_new_offsets = offsets_view.data(); // create new offsets array thrust::transform_inclusive_scan( execpol->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), + d_indices, + d_indices + count, d_new_offsets, [d_column, d_offsets, d_indices] __device__ (size_type idx) { - size_type index = d_indices[idx]; - if( d_column.nullable() && d_column.is_null(index) ) + if( d_column.nullable() && d_column.is_null(idx) ) return 0; - size_type offset = index ? d_offsets[index-1] : 0; - return d_offsets[index] - offset; + size_type offset = idx ? d_offsets[idx-1] : 0; + return d_offsets[idx] - offset; }, thrust::plus()); From c9fb410947a498835f3df2cbc7d917f92c57c78e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 30 Sep 2019 11:14:22 -0400 Subject: [PATCH 20/54] some minor fixes per review --- .../cudf/column/column_device_view.cuh | 4 +- cpp/include/cudf/strings/string_view.cuh | 152 ++++++++++-------- cpp/include/cudf/strings/string_view.inl | 121 +++++++------- .../cudf/strings/strings_column_view.hpp | 2 +- cpp/src/strings/array.cu | 10 +- cpp/src/strings/attributes.cu | 2 - cpp/src/strings/combine.cu | 27 ++-- cpp/src/strings/strings_column_factories.cu | 9 +- cpp/src/strings/strings_column_view.cu | 2 +- cpp/src/strings/utilities.cu | 22 +-- cpp/src/strings/utilities.cuh | 46 ++++++ cpp/src/strings/utilities.h | 24 +-- cpp/tests/strings/utilities.cu | 4 +- 13 files changed, 243 insertions(+), 182 deletions(-) create mode 100644 cpp/src/strings/utilities.cuh diff --git a/cpp/include/cudf/column/column_device_view.cuh 
b/cpp/include/cudf/column/column_device_view.cuh index ec451d164ab..ec61a863c72 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -486,13 +486,13 @@ class alignas(16) mutable_column_device_view * @return string_view instance representing this element at this index *---------------------------------------------------------------------------**/ template <> -__device__ inline string_view const column_device_view::element( +__device__ inline strings::string_view const column_device_view::element( size_type element_index) const noexcept { size_type index = element_index + _offset; // account for this view's _offset const int32_t* d_offsets = d_children[0].data(); const char* d_strings = d_children[1].data(); size_type offset = index ? d_offsets[index-1] : 0; - return string_view{d_strings + offset, d_offsets[index] - offset}; + return strings::string_view{d_strings + offset, d_offsets[index] - offset}; } //template <> diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 69c55b773fe..664b4ae16a8 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -21,9 +21,11 @@ namespace cudf { +namespace strings +{ -// utf8 characters are 1-4 bytes -typedef unsigned int Char; +// UTF-8 characters are 1-4 bytes +typedef unsigned int char_utf8; /**---------------------------------------------------------------------------* * @brief A non-owning, immutable view of device data that is variable length @@ -31,8 +33,7 @@ typedef unsigned int Char; * device memory for the lifetime of this instance. * * It provides a simple wrapper and string operations for individual char array - * within a strings column. This is likely created dynamically and temporarily. - * It is not recommended to be allocated directly on the global memory heap. + * within a strings column. 
*---------------------------------------------------------------------------**/ class string_view { @@ -79,6 +80,10 @@ class string_view * @brief Return true if string has no characters *---------------------------------------------------------------------------**/ __device__ bool empty() const; + /**---------------------------------------------------------------------------* + * @brief Return true if string pointer is null. + * That is, `data()==nullptr` for this instance. + *---------------------------------------------------------------------------**/ __device__ bool is_null() const; /**---------------------------------------------------------------------------* @@ -88,9 +93,9 @@ class string_view { public: using difference_type = ptrdiff_t; - using value_type = Char; - using reference = Char&; - using pointer = Char*; + using value_type = char_utf8; + using reference = char_utf8&; + using pointer = char_utf8*; using iterator_category = std::input_iterator_tag; // do not allow going backwards __device__ iterator(const string_view& str, size_type pos); iterator(const iterator& mit) = default; @@ -101,7 +106,7 @@ class string_view __device__ iterator operator++(int); __device__ bool operator==(const iterator& rhs) const; __device__ bool operator!=(const iterator& rhs) const; - __device__ Char operator*() const; + __device__ char_utf8 operator*() const; __device__ size_type position() const; __device__ size_type byte_offset() const; private: @@ -123,14 +128,19 @@ class string_view * * @param pos Character position *---------------------------------------------------------------------------**/ - __device__ Char at(size_type pos) const; - __device__ Char operator[](size_type pos) const; + __device__ char_utf8 at(size_type pos) const; + /**---------------------------------------------------------------------------* + * @brief Return single UTF-8 character at the given character position + * + * @param pos Character position + 
*---------------------------------------------------------------------------**/ + __device__ char_utf8 operator[](size_type pos) const; /**---------------------------------------------------------------------------* * @brief Return the byte offset from data() for a given character position * * @param pos Character position *---------------------------------------------------------------------------**/ - __device__ size_type byte_offset_for(size_type pos) const; + __device__ size_type byte_offset(size_type pos) const; /**---------------------------------------------------------------------------* * @brief Comparing target string with this string. Each character is compared @@ -188,19 +198,21 @@ class string_view __device__ bool operator>=(const string_view& rhs) const; /**---------------------------------------------------------------------------* - * @brief Returns first character position if arg string is contained in this string. + * @brief Returns the character position of the first occurrence where the + * argument str is found in this string within the character range [pos,pos+n). * - * @param str Target string to compare with this string. + * @param str Target string to search within this string. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. * Specify -1 to indicate to the end of the string. - * @return -1 if arg string is not found in this string. + * @return -1 if str is not found in this string. *---------------------------------------------------------------------------**/ __device__ size_type find( const string_view& str, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* - * @brief Returns first character position if arg array is contained in this string. 
+ * @brief Returns the character position of the first occurrence where the + * array str is found in this string within the character range [pos,pos+n). * - * @param str Target string to compare with this string. + * @param str Target array to search within this string. * @param bytes Number of bytes in str. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. @@ -209,19 +221,21 @@ class string_view *---------------------------------------------------------------------------**/ __device__ size_type find( const char* str, size_type bytes, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* - * @brief Returns first character position if arg character is contained in this string. + * @brief Returns the character position of the first occurrence where + * character is found in this string within the character range [pos,pos+n). * - * @param chr Single encoded character. + * @param character Single encoded character. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. *---------------------------------------------------------------------------**/ - __device__ size_type find( Char chr, size_type pos=0, size_type count=-1 ) const; + __device__ size_type find( char_utf8 character, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* - * @brief Same as find() but searches from the end of this string. + * @brief Returns the character position of the last occurrence where the + * argument str is found in this string within the character range [pos,pos+n). * - * @param str Target string to compare with this string. 
+ * @param str Target string to search within this string. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. * Specify -1 to indicate to the end of the string. @@ -229,9 +243,10 @@ class string_view *---------------------------------------------------------------------------**/ __device__ size_type rfind( const string_view& str, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* - * @brief Same as find() but searches from the end of this string. + * @brief Returns the character position of the last occurrence where the + * array str is found in this string within the character range [pos,pos+n). * - * @param str Target string to compare with this string. + * @param str Target string to search with this string. * @param bytes Number of bytes in str. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. @@ -240,15 +255,16 @@ class string_view *---------------------------------------------------------------------------**/ __device__ size_type rfind( const char* str, size_type bytes, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* - * @brief Same as find() but searches from the end of this string. + * @brief Returns the character position of the last occurrence where + * character is found in this string within the character range [pos,pos+n). * - * @param chr Single encoded character. + * @param character Single encoded character. * @param pos Character position to start search within this string. * @param count Number of characters from pos to include in the search. * Specify -1 to indicate to the end of the string. * @return -1 if arg string is not found in this string. 
*---------------------------------------------------------------------------**/ - __device__ size_type rfind( Char chr, size_type pos=0, size_type count=-1 ) const; + __device__ size_type rfind( char_utf8 character, size_type pos=0, size_type count=-1 ) const; /**---------------------------------------------------------------------------* * @brief Return a sub-string of this string. The original string and device @@ -261,57 +277,29 @@ class string_view __device__ string_view substr( size_type start, size_type length ) const; /**---------------------------------------------------------------------------* - * @brief Tokenizes this string around the given delimiter up to count time. + * @brief Tokenizes this string around the given delimiter up to count times. * - * @param delim Character to use for separating tokens. + * @param delimiter Character to use for separating tokens. * @param count Maximum number of tokens to return. * Specify -1 to indicate all tokens. * @param[out] Array to hold output tokens. * Specify nullptr here to return just the token count. * @return Number of tokens. *---------------------------------------------------------------------------**/ - __device__ size_type split( const char* delim, size_type count, string_view* strs ) const; + __device__ size_type split( const char* delimiter, size_type count, string_view* strs ) const; /**---------------------------------------------------------------------------* - * @brief Same as split() but starts tokenizing from the end of the string. + * @brief Tokenizes this string around the given delimiter up to count times + * starting at the end of the string. * - * @param delim Character to use for separating tokens. + * @param delimiter Character to use for separating tokens. * @param count Maximum number of tokens to return. * Specify -1 to indicate all tokens. * @param[out] Array to hold output tokens. * Specify nullptr here to return just the token count. * @return Number of tokens. 
*---------------------------------------------------------------------------**/ - __device__ size_type rsplit( const char* delim, size_type count, string_view* strs ) const; - - /**---------------------------------------------------------------------------* - * @brief Returns the number of bytes in the specified character. - *---------------------------------------------------------------------------**/ - __host__ __device__ static size_type bytes_in_char( Char chr ); - /**---------------------------------------------------------------------------* - * @brief Convert a char array into a Char value. - * - * @param str String containing encoded char bytes. - * @param[out] chr Single Char value. - * @return The number of bytes in the character - *---------------------------------------------------------------------------**/ - __host__ __device__ static size_type char_to_Char( const char* str, Char& chr ); - /**---------------------------------------------------------------------------* - * @brief Place a Char value into a char array. - * - * @param chr Single character - * @param[out] str Allocated char array with enough space to hold the encoded characer. - * @return The number of bytes in the character - *---------------------------------------------------------------------------**/ - __host__ __device__ static size_type Char_to_char( Char chr, char* str ); - /**---------------------------------------------------------------------------* - * @brief Return the number of characters in this provided char array. - * - * @param str String with encoded char bytes. - * @param bytes Number of bytes in str. - * @return The number of characters in the array. 
- *---------------------------------------------------------------------------**/ - __host__ __device__ static size_type chars_in_string( const char* str, size_type bytes ); + __device__ size_type rsplit( const char* delimiter, size_type count, string_view* strs ) const; private: const char* _data{}; ///< Pointer to device memory contain char array for this string @@ -323,9 +311,47 @@ private: * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. *---------------------------------------------------------------------------**/ - __device__ size_type char_offset(size_type bytepos) const; + __device__ size_type character_offset(size_type bytepos) const; }; -} +namespace detail +{ +/**---------------------------------------------------------------------------* + * @brief Returns the number of bytes in the specified character. + * + * @param chr Single character + *---------------------------------------------------------------------------**/ +__host__ __device__ size_type bytes_in_char_utf8( char_utf8 character ); + +/**---------------------------------------------------------------------------* + * @brief Convert a char array into a char_utf8 value. + * + * @param str String containing encoded char bytes. + * @param[out] chr Single char_utf8 value. + * @return The number of bytes in the character + *---------------------------------------------------------------------------**/ +__host__ __device__ size_type to_char_utf8( const char* str, char_utf8& character ); + +/**---------------------------------------------------------------------------* + * @brief Place a char_utf8 value into a char array. + * + * @param chr Single character + * @param[out] str Allocated char array with enough space to hold the encoded characer. 
+ * @return The number of bytes in the character + *---------------------------------------------------------------------------**/ +__host__ __device__ size_type from_char_utf8( char_utf8 character, char* str ); + +/**---------------------------------------------------------------------------* + * @brief Return the number of characters in this provided char array. + * + * @param str String with encoded char bytes. + * @param bytes Number of bytes in str. + * @return The number of characters in the array. + *---------------------------------------------------------------------------**/ +__host__ __device__ size_type characters_in_string( const char* str, size_type bytes ); + +} // namespace detail +} // namespace strings +} // namespace cudf #include "./string_view.inl" diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index 56a64e5a330..ff0def946b0 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -3,9 +3,8 @@ #include -namespace cudf +namespace { - typedef unsigned char BYTE; /**---------------------------------------------------------------------------* @@ -17,9 +16,9 @@ typedef unsigned char BYTE; * @param byte Byte from an encoded character. * @return Number of bytes. 
*---------------------------------------------------------------------------**/ -__host__ __device__ inline static size_type bytes_in_char_byte(BYTE byte) +__host__ __device__ inline static cudf::size_type bytes_in_utf8_byte(BYTE byte) { - size_type count = 1; + cudf::size_type count = 1; // no if-statements means no divergence count += (int)((byte & 0xF0) == 0xF0); count += (int)((byte & 0xE0) == 0xE0); @@ -27,7 +26,12 @@ __host__ __device__ inline static size_type bytes_in_char_byte(BYTE byte) count -= (int)((byte & 0xC0) == 0x80); return count; } +} // namespace +namespace cudf +{ +namespace strings +{ /**---------------------------------------------------------------------------* * @brief Returns the number of bytes used in the provided char array by @@ -70,7 +74,7 @@ __device__ inline size_type string_view::length() const __device__ inline size_type string_view::characters() const { - return chars_in_string(_data,_bytes); + return detail::characters_in_string(_data,_bytes); } __device__ inline const char* string_view::data() const @@ -93,12 +97,12 @@ __device__ inline string_view::iterator::iterator(const string_view& str, size_t : cpos(pos) { p = str.data(); - offset = str.byte_offset_for(cpos); + offset = str.byte_offset(cpos); } __device__ inline string_view::iterator& string_view::iterator::operator++() { - offset += bytes_in_char_byte((BYTE)p[offset]); + offset += bytes_in_utf8_byte((BYTE)p[offset]); ++cpos; return *this; } @@ -122,10 +126,10 @@ __device__ inline bool string_view::iterator::operator!=(const string_view::iter } // unsigned int can hold 1-4 bytes for the UTF8 char -__device__ inline Char string_view::iterator::operator*() const +__device__ inline char_utf8 string_view::iterator::operator*() const { - Char chr = 0; - char_to_Char(p + offset, chr); + char_utf8 chr = 0; + detail::to_char_utf8(p + offset, chr); return chr; } @@ -149,29 +153,29 @@ __device__ inline string_view::iterator string_view::end() const return iterator(*this, 
characters()); } -__device__ inline Char string_view::at(size_type pos) const +__device__ inline char_utf8 string_view::at(size_type pos) const { - unsigned int offset = byte_offset_for(pos); + unsigned int offset = byte_offset(pos); if(offset >= _bytes) return 0; - Char chr = 0; - char_to_Char(data() + offset, chr); + char_utf8 chr = 0; + detail::to_char_utf8(data() + offset, chr); return chr; } -__device__ inline Char string_view::operator[](size_type pos) const +__device__ inline char_utf8 string_view::operator[](size_type pos) const { return at(pos); } -__device__ inline size_type string_view::byte_offset_for(size_type pos) const +__device__ inline size_type string_view::byte_offset(size_type pos) const { size_type offset = 0; const char* sptr = _data; const char* eptr = sptr + _bytes; while( (pos > 0) && (sptr < eptr) ) { - size_type charbytes = bytes_in_char_byte((BYTE)*sptr++); + size_type charbytes = bytes_in_utf8_byte((BYTE)*sptr++); if( charbytes ) --pos; offset += charbytes; @@ -256,8 +260,8 @@ __device__ inline size_type string_view::find(const char* str, size_type bytes, size_type end = pos + count; if(end < 0 || end > nchars) end = nchars; - size_type spos = byte_offset_for(pos); - size_type epos = byte_offset_for(end); + size_type spos = byte_offset(pos); + size_type epos = byte_offset(end); size_type len2 = bytes; size_type len1 = (epos - spos) - len2 + 1; @@ -270,14 +274,14 @@ __device__ inline size_type string_view::find(const char* str, size_type bytes, for( size_type jdx=0; match && (jdx < len2); ++jdx ) match = (ptr1[jdx] == ptr2[jdx]); if( match ) - return char_offset(idx+spos); + return character_offset(idx+spos); ptr1++; } return -1; } // maybe get rid of this one -__device__ inline size_type string_view::find(Char chr, size_type pos, int count) const +__device__ inline size_type string_view::find(char_utf8 chr, size_type pos, int count) const { size_type sz = size(); size_type nchars = characters(); @@ -288,19 +292,19 @@ __device__ inline 
size_type string_view::find(Char chr, size_type pos, int count end = nchars; if(pos > end || chr == 0 || sz == 0) return -1; - size_type spos = byte_offset_for(pos); - size_type epos = byte_offset_for(end); + size_type spos = byte_offset(pos); + size_type epos = byte_offset(end); // - size_type chsz = bytes_in_char(chr); + size_type chsz = detail::bytes_in_char_utf8(chr); const char* sptr = data(); const char* ptr = sptr + spos; size_type len = (epos - spos) - chsz; for(size_type idx = 0; idx <= len; ++idx) { - Char ch = 0; - char_to_Char(ptr++, ch); + char_utf8 ch = 0; + detail::to_char_utf8(ptr++, ch); if(chr == ch) - return chars_in_string(sptr, idx + spos); + return detail::characters_in_string(sptr, idx + spos); } return -1; } @@ -320,8 +324,8 @@ __device__ inline size_type string_view::rfind(const char* str, size_type bytes, size_type end = pos + count; if(end < 0 || end > nchars) end = nchars; - size_type spos = byte_offset_for(pos); - size_type epos = byte_offset_for(end); + size_type spos = byte_offset(pos); + size_type epos = byte_offset(end); size_type len2 = bytes; size_type len1 = (epos - spos) - len2 + 1; @@ -334,13 +338,13 @@ __device__ inline size_type string_view::rfind(const char* str, size_type bytes, for(size_type jdx=0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]); if(match) - return char_offset(epos - len2 - idx); + return character_offset(epos - len2 - idx); ptr1--; // go backwards } return -1; } -__device__ inline size_type string_view::rfind(Char chr, size_type pos, int count) const +__device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, int count) const { size_type sz = size(); size_type nchars = characters(); @@ -351,19 +355,19 @@ __device__ inline size_type string_view::rfind(Char chr, size_type pos, int coun end = nchars; if(pos > end || chr == 0 || sz == 0) return -1; - size_type spos = byte_offset_for(pos); - size_type epos = byte_offset_for(end); + size_type spos = byte_offset(pos); + size_type 
epos = byte_offset(end); - size_type chsz = bytes_in_char(chr); + size_type chsz = detail::bytes_in_char_utf8(chr); const char* sptr = data(); const char* ptr = sptr + epos - 1; size_type len = (epos - spos) - chsz; for(size_type idx = 0; idx < len; ++idx) { - Char ch = 0; - char_to_Char(ptr--, ch); + char_utf8 ch = 0; + detail::to_char_utf8(ptr--, ch); if(chr == ch) - return chars_in_string(sptr, epos - idx - 1); + return detail::characters_in_string(sptr, epos - idx - 1); } return -1; } @@ -372,8 +376,8 @@ __device__ inline size_type string_view::rfind(Char chr, size_type pos, int coun // parameters are character position values __device__ inline string_view string_view::substr(size_type pos, size_type length) const { - size_type spos = byte_offset_for(pos); - size_type epos = byte_offset_for(pos + length); + size_type spos = byte_offset(pos); + size_type epos = byte_offset(pos + length); if( epos > size() ) epos = size(); if(spos >= epos) @@ -412,7 +416,7 @@ __device__ inline size_type string_view::split(const char* delim, int count, str if(strsCount < count) count = strsCount; // - size_type dchars = (bytes ? chars_in_string(delim,bytes) : 1); + size_type dchars = (bytes ? detail::characters_in_string(delim,bytes) : 1); size_type nchars = characters(); size_type spos = 0, sidx = 0; size_type epos = find(delim, bytes); @@ -461,7 +465,7 @@ __device__ inline size_type string_view::rsplit(const char* delim, int count, st if(strsCount < count) count = strsCount; // - unsigned int dchars = (bytes ? chars_in_string(delim,bytes) : 1); + unsigned int dchars = (bytes ? 
detail::characters_in_string(delim,bytes) : 1); int epos = (int)characters(); // end pos is not inclusive int sidx = count - 1; // index for strs array int spos = rfind(delim, bytes); @@ -481,8 +485,14 @@ __device__ inline size_type string_view::rsplit(const char* delim, int count, st return rtn; } +__device__ inline size_type string_view::character_offset(size_type bytepos) const +{ + return detail::characters_in_string(data(), bytepos); +} -__host__ __device__ inline size_type string_view::bytes_in_char(Char chr) +namespace detail +{ +__host__ __device__ inline size_type bytes_in_char_utf8(char_utf8 chr) { size_type count = 1; count += (int)((chr & (unsigned)0x0000FF00) > 0); @@ -491,31 +501,31 @@ __host__ __device__ inline size_type string_view::bytes_in_char(Char chr) return count; } -__host__ __device__ inline size_type string_view::char_to_Char(const char* pSrc, Char &chr) +__host__ __device__ inline size_type to_char_utf8(const char* pSrc, char_utf8 &chr) { - size_type chwidth = bytes_in_char_byte((BYTE)*pSrc); - chr = (Char)(*pSrc++) & 0xFF; + size_type chwidth = bytes_in_utf8_byte((BYTE)*pSrc); + chr = (char_utf8)(*pSrc++) & 0xFF; if(chwidth > 1) { chr = chr << 8; - chr |= ((Char)(*pSrc++) & 0xFF); // << 8; + chr |= ((char_utf8)(*pSrc++) & 0xFF); // << 8; if(chwidth > 2) { chr = chr << 8; - chr |= ((Char)(*pSrc++) & 0xFF); // << 16; + chr |= ((char_utf8)(*pSrc++) & 0xFF); // << 16; if(chwidth > 3) { chr = chr << 8; - chr |= ((Char)(*pSrc++) & 0xFF); // << 24; + chr |= ((char_utf8)(*pSrc++) & 0xFF); // << 24; } } } return chwidth; } -__host__ __device__ inline size_type string_view::Char_to_char(Char chr, char* dst) +__host__ __device__ inline size_type from_char_utf8(char_utf8 chr, char* dst) { - size_type chwidth = bytes_in_char(chr); + size_type chwidth = bytes_in_char_utf8(chr); for(size_type idx = 0; idx < chwidth; ++idx) { dst[chwidth - idx - 1] = (char)chr & 0xFF; @@ -525,7 +535,7 @@ __host__ __device__ inline size_type string_view::Char_to_char(Char 
chr, char* d } // counts the number of characters in the given char array -__host__ __device__ inline size_type string_view::chars_in_string(const char* str, size_type bytes) +__host__ __device__ inline size_type characters_in_string(const char* str, size_type bytes) { if( (str==0) || (bytes==0) ) return 0; @@ -536,9 +546,6 @@ __host__ __device__ inline size_type string_view::chars_in_string(const char* st return (size_type)nchars; } -__device__ inline size_type string_view::char_offset(size_type bytepos) const -{ - return chars_in_string(data(), bytepos); -} - -} \ No newline at end of file +} // namespace detail +} // namespace strings +} // namespace cudf \ No newline at end of file diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 2522bcbfc6f..a105fc0afa6 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -326,7 +326,7 @@ std::unique_ptr concatenate( strings_column_view strings, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /**---------------------------------------------------------------------------* - * @brief Row-wise oncatenates the given list of strings columns with the first column. + * @brief Row-wise concatenates the given list of strings columns with the first column. 
* * @code * s1 = ['aa', null, '', 'aa'] diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 62e6d6fff49..6dfeefa21d6 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -183,12 +183,12 @@ std::unique_ptr scatter( strings_column_view strings, auto execpol = rmm::exec_policy(stream); // create strings arrays - rmm::device_vector strings_array = + rmm::device_vector strings_array = detail::create_string_array_from_column(strings,stream); - cudf::string_view* d_strings = strings_array.data().get(); - rmm::device_vector values_array = + string_view* d_strings = strings_array.data().get(); + rmm::device_vector values_array = detail::create_string_array_from_column(values,stream); - cudf::string_view* d_values = values_array.data().get(); + string_view* d_values = values_array.data().get(); // do the scatter thrust::scatter( execpol->on(stream), d_values, d_values+elements, @@ -247,7 +247,7 @@ std::unique_ptr scatter( strings_column_view handler, auto replace = detail::string_from_host(string, stream); auto d_replace = *replace; // create strings array - rmm::device_vector strings = + rmm::device_vector strings = detail::create_string_array_from_column(handler, stream); auto d_strings = strings.data().get(); // replace specific elements diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 7861d779748..9a87c83f844 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -140,8 +140,6 @@ std::unique_ptr code_points( strings_column_view strings, auto d_str = d_column.element(idx); auto result = d_results + (idx ? 
d_offsets[idx-1] :0); thrust::copy( thrust::seq, d_str.begin(), d_str.end(), result); - //for( auto itr = d_str.begin(); itr != d_str.end(); ++itr ) - // *result++ = (unsigned int)*itr; }); // results->set_null_count(0); // no nulls here diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 78d30674c7e..57ac87c6639 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -22,6 +22,7 @@ #include #include #include "./utilities.h" +#include "./utilities.cuh" #include #include @@ -48,7 +49,7 @@ std::unique_ptr concatenate( strings_column_view strings, auto separator_ptr = detail::string_from_host(separator, stream); auto d_separator = *separator_ptr; auto narep_ptr = detail::string_from_host(narep, stream); - cudf::string_view d_narep(nullptr,0); + string_view d_narep(nullptr,0); if( narep_ptr ) d_narep = *narep_ptr; @@ -82,16 +83,16 @@ std::unique_ptr concatenate( strings_column_view strings, thrust::make_counting_iterator(count), d_results_offsets, [d_strings, d_others, d_separator, d_narep] __device__ (size_type idx) { - cudf::string_view d_str1; + string_view d_str1; if( d_strings.nullable() && d_strings.is_null(idx) ) - d_str1 = cudf::string_view(nullptr,0); + d_str1 = string_view(nullptr,0); else - d_str1 = d_strings.element(idx); - cudf::string_view d_str2; + d_str1 = d_strings.element(idx); + string_view d_str2; if( d_others.nullable() && d_others.is_null(idx) ) - d_str2 = cudf::string_view(nullptr,0); + d_str2 = string_view(nullptr,0); else - d_str2 = d_others.element(idx); + d_str2 = d_others.element(idx); if( (d_str1.is_null() || d_str2.is_null()) && d_narep.is_null() ) return 0; // null output case size_type bytes = 0; @@ -120,16 +121,16 @@ std::unique_ptr concatenate( strings_column_view strings, auto d_results_chars = chars_view.data(); thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, [d_strings, d_others, d_separator, d_narep, d_results_offsets, d_results_chars] __device__(size_type 
idx){ - cudf::string_view d_str1; + string_view d_str1; if( d_strings.nullable() && d_strings.is_null(idx) ) - d_str1 = cudf::string_view(nullptr,0); + d_str1 = string_view(nullptr,0); else - d_str1 = d_strings.element(idx); - cudf::string_view d_str2; + d_str1 = d_strings.element(idx); + string_view d_str2; if( d_others.nullable() && d_others.is_null(idx) ) - d_str2 = cudf::string_view(nullptr,0); + d_str2 = string_view(nullptr,0); else - d_str2 = d_others.element(idx); + d_str2 = d_others.element(idx); if( (d_str1.is_null() || d_str2.is_null()) && d_narep.is_null() ) return; // null -- nothing to do // concat the two strings with appropriate separator and narep diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 16a85492a39..d8ff5aff93e 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -36,7 +36,7 @@ std::unique_ptr make_strings_column( cudaStream_t stream, rmm::mr::device_memory_resource* mr) { - size_type count = (size_type)strings.size(); + size_type count = strings.size(); // maybe a separate factory for creating null strings-column CUDF_EXPECTS(count > 0, "must specify at least one pair"); @@ -49,10 +49,9 @@ std::unique_ptr make_strings_column( thrust::make_counting_iterator(count), [d_strings] __device__ (size_t idx) { auto item = d_strings[idx]; - return item.first ? item.second : (size_t)0; + return item.first ? item.second : 0; }, - (size_t)0, - thrust::plus()); + 0, thrust::plus()); CUDF_EXPECTS( bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column" ); // build offsets column @@ -63,7 +62,7 @@ std::unique_ptr make_strings_column( offsets_view.data(), [d_strings] __device__ (size_type idx) { thrust::pair item = d_strings[idx]; - return ( item.first ? (int32_t)item.second : 0 ); + return ( item.first ? 
static_cast(item.second) : 0 ); }, thrust::plus() ); diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index 45a124d045a..42f779600b7 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -100,7 +100,7 @@ void print( strings_column_view strings, size_type bytes = d_offsets[idx] - offset; // specialization on element() string_view d_str( d_strings + offset, bytes ); // method of column_device_view if( (max_width > 0) && (d_str.characters() > max_width) ) - bytes = d_str.byte_offset_for(max_width); + bytes = d_str.byte_offset(max_width); return bytes+1; // allow for null-terminator on non-null strings }, thrust::plus()); diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index a329c566ab9..23130824c68 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -33,7 +33,7 @@ namespace detail { // Used to build a temporary string_view object from a single host string. 
-std::unique_ptr> +std::unique_ptr> string_from_host( const char* str, cudaStream_t stream ) { if( !str ) @@ -46,13 +46,13 @@ std::unique_ptr> cudaMemcpyHostToDevice, stream )); CUDA_TRY(cudaStreamSynchronize(stream)); - auto deleter = [](cudf::string_view* sv) { RMM_FREE(const_cast(sv->data()),0); }; - return std::unique_ptr{ new cudf::string_view(d_str,length), deleter}; + auto deleter = [](string_view* sv) { RMM_FREE(const_cast(sv->data()),0); }; + return std::unique_ptr{ new string_view(d_str,length), deleter}; } // build an array of string_view objects from a strings column -rmm::device_vector create_string_array_from_column( +rmm::device_vector create_string_array_from_column( cudf::strings_column_view strings, cudaStream_t stream ) { @@ -61,22 +61,22 @@ rmm::device_vector create_string_array_from_column( auto d_column = *strings_column; auto count = strings.size(); - rmm::device_vector strings_array(count); - cudf::string_view* d_strings = strings_array.data().get(); + rmm::device_vector strings_array(count); + string_view* d_strings = strings_array.data().get(); thrust::for_each_n( execpol->on(stream), thrust::make_counting_iterator(0), count, [d_column, d_strings] __device__ (size_type idx) { if( d_column.nullable() && d_column.is_null(idx) ) - d_strings[idx] = cudf::string_view(nullptr,0); + d_strings[idx] = string_view(nullptr,0); else - d_strings[idx] = d_column.element(idx); + d_strings[idx] = d_column.element(idx); }); return strings_array; } // build a strings offsets column from an array of string_views std::unique_ptr offsets_from_string_array( - const rmm::device_vector& strings, + const rmm::device_vector& strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { size_type count = strings.size(); @@ -102,7 +102,7 @@ std::unique_ptr offsets_from_string_array( // build a strings chars column from an array of string_views std::unique_ptr chars_from_string_array( - const rmm::device_vector& strings, + const rmm::device_vector& strings, 
const int32_t* d_offsets, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh new file mode 100644 index 00000000000..afd51b9b821 --- /dev/null +++ b/cpp/src/strings/utilities.cuh @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +namespace cudf +{ +namespace strings +{ +namespace detail +{ + +/** + * @brief This utility will copy the argument string's data into + * the provided buffer. + * + * @param buffer Device buffer to copy to. + * @param d_string String to copy. + * @return Points to the end of the buffer after the copy. + */ +__device__ inline char* copy_string( char* buffer, const string_view& d_string ) +{ + memcpy( buffer, d_string.data(), d_string.size() ); + return buffer + d_string.size(); +} + + +} // namespace detail +} // namespace strings +} // namespace cudf \ No newline at end of file diff --git a/cpp/src/strings/utilities.h b/cpp/src/strings/utilities.h index d895c4d5378..9c4215a9938 100644 --- a/cpp/src/strings/utilities.h +++ b/cpp/src/strings/utilities.h @@ -34,7 +34,7 @@ namespace detail * @param[in] stream Stream to execute any device code against. * @return Device object pointer. 
*/ -std::unique_ptr> +std::unique_ptr> string_from_host( const char* str, cudaStream_t stream=0 ); /** @@ -45,7 +45,7 @@ std::unique_ptr> * @param stream Stream to execute any device code against. * @return Strings array */ -rmm::device_vector create_string_array_from_column( +rmm::device_vector create_string_array_from_column( cudf::strings_column_view strings, cudaStream_t stream=0 ); @@ -60,7 +60,7 @@ rmm::device_vector create_string_array_from_column( * @return Offsets column */ std::unique_ptr offsets_from_string_array( - const rmm::device_vector& strings, + const rmm::device_vector& strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); @@ -75,27 +75,11 @@ std::unique_ptr offsets_from_string_array( * @return chars column */ std::unique_ptr chars_from_string_array( - const rmm::device_vector& strings, + const rmm::device_vector& strings, const int32_t* d_offsets, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/** - * @brief This utility will copy the argument string's data into - * the provided buffer. - * - * @param buffer Device buffer to copy to. - * @param d_string String to copy. - * @return Points to the end of the buffer after the copy. 
- */ -__device__ inline char* copy_string( char* buffer, const cudf::string_view& d_string ) -{ - memcpy( buffer, d_string.data(), d_string.size() ); - return buffer + d_string.size(); -} - - } // namespace detail } // namespace strings } // namespace cudf \ No newline at end of file diff --git a/cpp/tests/strings/utilities.cu b/cpp/tests/strings/utilities.cu index 2466b2d8aea..4c009b421d8 100644 --- a/cpp/tests/strings/utilities.cu +++ b/cpp/tests/strings/utilities.cu @@ -67,8 +67,8 @@ struct compare_strings_fn if( (d_lhs.nullable() && d_lhs.is_null(lidx)) || (d_rhs.nullable() && d_rhs.is_null(ridx)) ) return d_lhs.is_null(lidx)==d_rhs.is_null(ridx); - cudf::string_view lstr = d_lhs.element(lidx); - cudf::string_view rstr = d_rhs.element(ridx); + cudf::strings::string_view lstr = d_lhs.element(lidx); + cudf::strings::string_view rstr = d_rhs.element(ridx); return lstr.compare(rstr)==0; } column_device_view d_lhs; From 892d7a7ce8d53c0a4812e1e7e99782ee76125cce Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 30 Sep 2019 14:48:13 -0400 Subject: [PATCH 21/54] refactored counts methods; fixed typos per review --- cpp/include/cudf/strings/string_view.inl | 27 ++-- .../cudf/strings/strings_column_view.hpp | 124 +++++++++++++----- cpp/src/strings/attributes.cu | 72 +++++----- 3 files changed, 134 insertions(+), 89 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index ff0def946b0..057dbba8715 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -16,22 +16,15 @@ typedef unsigned char BYTE; * @param byte Byte from an encoded character. * @return Number of bytes. 
*---------------------------------------------------------------------------**/ -__host__ __device__ inline static cudf::size_type bytes_in_utf8_byte(BYTE byte) +__host__ __device__ inline cudf::size_type bytes_in_utf8_byte(BYTE byte) { cudf::size_type count = 1; - // no if-statements means no divergence - count += (int)((byte & 0xF0) == 0xF0); - count += (int)((byte & 0xE0) == 0xE0); - count += (int)((byte & 0xC0) == 0xC0); - count -= (int)((byte & 0xC0) == 0x80); + count += (int)((byte & 0xF0) == 0xF0); // 4-byte character prefix + count += (int)((byte & 0xE0) == 0xE0); // 3-byte character prefix + count += (int)((byte & 0xC0) == 0xC0); // 2-byte character prefix + count -= (int)((byte & 0xC0) == 0x80); // intermediate byte return count; } -} // namespace - -namespace cudf -{ -namespace strings -{ /**---------------------------------------------------------------------------* * @brief Returns the number of bytes used in the provided char array by @@ -40,16 +33,22 @@ namespace strings * @param str Null-terminated array of chars. * @return Number of bytes. 
*---------------------------------------------------------------------------**/ -__device__ inline static size_type string_length( const char* str ) +__device__ inline cudf::size_type string_length( const char* str ) { if( !str ) return 0; - size_type bytes = 0; + cudf::size_type bytes = 0; while(*str++) ++bytes; return bytes; } +} // namespace + +namespace cudf +{ +namespace strings +{ __device__ inline string_view::string_view(const char* data, size_type bytes) : _data(data), _bytes(bytes) diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index a105fc0afa6..a18f444d1a5 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -88,29 +88,28 @@ void print( strings_column_view strings, /**---------------------------------------------------------------------------* - * @brief Create output pair per Arrow format of strings. - * The return pair is a the array of chars and an array of offsets. + * @brief Create output per Arrow strings format. + * The return pair is the array of chars and the array of offsets. * * @param strings Strings instance for this operation. * @param stream CUDA stream to use kernels in this method. * @param mr Resource for allocating device memory. - * @return Contiguous array of chars and an array of offsets. + * @return Pair containing a contiguous array of chars and an array of offsets. *---------------------------------------------------------------------------**/ std::pair, rmm::device_vector> create_offsets( strings_column_view strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -// array.cu /**---------------------------------------------------------------------------* * @brief Returns a new strings column created from a subset of * of this instance's strings column. 
* - * @code + * ``` * s1 = ["a", "b", "c", "d", "e", "f"] * s2 = sublist( s1, 2 ) * s2 is ["c", "d", "e", "f"] - * @endcode + * ``` * * @param strings Strings instance for this operation. * @param start Index of first string to use. @@ -129,15 +128,15 @@ std::unique_ptr sublist( strings_column_view strings, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /**---------------------------------------------------------------------------* - * @brief Returns a new strings column created this strings instance using - * the specified indices to select the strings. + * @brief Returns a new strings column using the specified indices to select + * elements from the specified strings column. * - * @code + * ``` * s1 = ["a", "b", "c", "d", "e", "f"] * map = [0, 2] * s2 = gather( s1, map ) * s2 is ["a", "c"] - * @endcode + * ``` * * @param strings Strings instance for this operation. * @param gather_map The indices with which to select strings for the new column. @@ -151,7 +150,9 @@ std::unique_ptr gather( strings_column_view strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -// sort types can be combined +/**---------------------------------------------------------------------------* + * @brief Sort types for the sort method. + *---------------------------------------------------------------------------**/ enum sort_type { none=0, ///< no sorting length=1, ///< sort by string length @@ -182,13 +183,13 @@ std::unique_ptr sort( strings_column_view strings, * The map values specify the location in the new strings instance. * Missing values pass through from the handler instance into those positions. * - * @code + * ``` * s1 = ["a", "b", "c", "d"] * s2 = ["e", "f"] * map = [1, 3] * s3 = scatter( s1, s2, m1 ) * s3 is ["a", "e", "c", "f"] - * @endcode + * ``` * * @param strings Strings instance for this operation. 
* @param values The instance for which to retrieve the strings @@ -211,12 +212,12 @@ std::unique_ptr scatter( strings_column_view strings, * in the new strings instance. Missing values pass through from * the handler instance at those positions. * - * @code + * ``` * s1 = ["a", "b", "c", "d"] * map = [1, 3] * s2 = scatter( s1, "e", m1 ) * s2 is ["a", "e", "c", "e"] - * @endcode + * ``` * * @param strings Strings instance for this operation. * @param value Null-terminated encoded string in host memory to use with @@ -232,7 +233,6 @@ std::unique_ptr scatter( strings_column_view strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -// attributes.cu /**---------------------------------------------------------------------------* * @brief Returns the number of bytes for each string in a strings column. * Null strings will have a byte count of 0. @@ -268,11 +268,11 @@ std::unique_ptr characters_counts( strings_column_view strings, * The column is an array of variable-length integer arrays each with length * as returned by characters_counts(). * - * @code + * ``` * s = ["a","xyz", "éee"] * v = code_points(s) * v is [97, 120, 121, 122, 50089, 101, 101] - * @endcode + * ``` * * @param strings Strings instance for this operation. * @param stream CUDA stream to use kernels in this method. @@ -307,7 +307,6 @@ std::unique_ptr is_of_type( strings_column_view strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -// combine.cu /**---------------------------------------------------------------------------* * @brief Row-wise concatenates two columns of strings into a new a column. * The number of strings in both columns must match. @@ -317,7 +316,7 @@ std::unique_ptr is_of_type( strings_column_view strings, * @param narep Null-terminated CPU string that should represent any null strings found. * @param stream CUDA stream to use kernels in this method. 
* @param mr Resource for allocating device memory. - * @return New instance with the concatenated + * @return New column with concatenated results *---------------------------------------------------------------------------**/ std::unique_ptr concatenate( strings_column_view strings, strings_column_view others, @@ -327,14 +326,14 @@ std::unique_ptr concatenate( strings_column_view strings, /**---------------------------------------------------------------------------* * @brief Row-wise concatenates the given list of strings columns with the first column. - * + * * @code * s1 = ['aa', null, '', 'aa'] * s2 = ['', 'bb', 'bb', null] * r = concatenate(s1,s2) * r is ['aa', null, 'bb', null] * @endcode - * + * * @param strings 1st string column. * @param others List of string columns to concatenate. * @param separator Null-terminated CPU string that should appear between each instance. @@ -343,10 +342,9 @@ std::unique_ptr concatenate( strings_column_view strings, * Default of null means any null operand produces a null result. * @param stream CUDA stream to use kernels in this method. * @param mr Resource for allocating device memory. - * @return New instance with the concatenated + * @return New column with concatenated results *---------------------------------------------------------------------------**/ -std::unique_ptr concatenate( strings_column_view strings, - std::vector& others, +std::unique_ptr concatenate( std::vector& strings, const char* separator="", const char* narep=nullptr, cudaStream_t stream=0, @@ -360,18 +358,27 @@ std::unique_ptr concatenate( strings_column_view strings, * @param narep Null-terminated CPU string that should represent any null strings found. * @param stream CUDA stream to use kernels in this method. * @param mr Resource for allocating device memory. - * @return Resulting instance with one string. + * @return New column containing one string. 
*---------------------------------------------------------------------------**/ -std::unique_ptr build_single_string( strings_column_view strings, - const char* separator="", - const char* narep=nullptr, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); +std::unique_ptr join_strings( strings_column_view strings, + const char* separator="", + const char* narep=nullptr, + cudaStream_t stream=0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -// split.cu /**---------------------------------------------------------------------------* * @brief Split strings vertically creating new columns of strings. * The number of columns will be equal to the string with the most splits. + * The delimiter is searched starting from the beginning of each string. + * + * ``` + * s = ["a b c", "d e f", "g h"] + * r = split(s," ") + * r is vector of 3 columns: + * r[0] = ["a", "d", "g"] + * r[1] = ["b", "e", "h"] + * r[2] = ["c", "f", nullptr] + * ``` * * @param delimiter Null-terminated CPU string identifying the split points within each string. * Default of null splits on whitespace. @@ -387,8 +394,17 @@ std::vector> split( strings_column_view strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /**---------------------------------------------------------------------------* - * @brief Split strings vertically creating new columns of NVStrings instances. + * @brief Split strings vertically creating new columns of strings. * The number of columns will be equal to the string with the most splits. + * The delimiter is searched starting from the end of each string. + * + * ``` + * s = ["a b c", "d e f", "g h"] + * r = split(s," ",1) + * r is vector of 2 columns: + * r[0] = ["a b", "d e", "g h"] + * r[1] = ["c", "f", nullptr] + * ``` * * @param delimiter Null-terminated CPU string identifying the split points within each string. 
* Default of null splits on whitespace. @@ -404,10 +420,20 @@ std::vector> rsplit( strings_column_view strings, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); /**---------------------------------------------------------------------------* - * @brief Each string is split into a list of new strings. + * @brief Each string is split into a list of new column of strings. * The delimiter is searched from the beginning of each string. * Each string results in a new strings column. * + * ``` + * s = ["a b c", "d e f", "g h", "i j"] + * r = split_record(s," ") + * r is vector of 4 columns: + * r[0] = ["a", "b", "c"] + * r[1] = ["d", "e", "f"] + * r[2] = ["g", "h", nullptr] + * r[3] = ["i", "j", nullptr] + * ``` + * * @param strings Strings for this operation. * @param delimiter Null-terminated CPU string identifying the split points within each string. * Default of null splits on whitespace. @@ -427,6 +453,16 @@ std::vector> split_record( strings_column_view str * The delimiter is searched from the end of each string. * Each string results in a new strings column. * + * ``` + * s = ["a b c", "d e f", "g h", "i j"] + * r = rsplit_record(s," ",1) + * r is vector of 4 columns: + * r[0] = ["a b", "c"] + * r[1] = ["d e", "f"] + * r[2] = ["g", "h"] + * r[3] = ["i", "j"] + * ``` + * * @param strings Strings for this operation. * @param delimiter Null-terminated CPU string identifying the split points within each string. * Default of null splits on whitespace. @@ -446,6 +482,16 @@ std::vector> rsplit_record( strings_column_view st * Three strings are always created for each string: left-half, delimiter itself, right-half. * The result is 3 strings columns representing the 3 partitions. 
* + * ``` + * s = ["a:b:c", "d:e:f", "g:h", "i:j"] + * r = partition(s,":") + * r is vector of 4 columns: + * r[0] = ["a", ":", "b:c"] + * r[1] = ["d", ":", "e:f"] + * r[2] = ["g", ":", "h"] + * r[3] = ["i", ":", "j"] + * ``` + * * @param delimiter Null-terminated CPU string identifying the split points within each string. * @param results The list of instances for each string. * @param stream CUDA stream to use kernels in this method. @@ -461,6 +507,16 @@ std::vector> partition( strings_column_view string * Three strings are always created for each string: left-half, delimiter itself, right-half. * The result is 3 strings columns representing the 3 partitions. * + * ``` + * s = ["a:b:c", "d:e:f", "g:h", "i:j"] + * r = rpartition(s,":") + * r is vector of 4 columns: + * r[0] = ["a:b", ":", "c"] + * r[1] = ["d:e", ":", "f"] + * r[2] = ["g", ":", "h"] + * r[3] = ["i", ":", "j"] + * ``` + * * @param delimiter Null-terminated CPU string identifying the split points within each string. * @param results The list of instances for each string. * @param stream CUDA stream to use kernels in this method. diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 9a87c83f844..5f63dc899c0 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -23,18 +24,19 @@ #include #include -namespace cudf -{ -namespace strings +namespace { -std::unique_ptr characters_counts( strings_column_view strings, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) +// used by bytes_counts() and characters_counts() +template +std::unique_ptr counts( cudf::strings_column_view strings, + predicate& pfn, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) { auto count = strings.size(); auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(strings.parent(),stream); + auto strings_column = cudf::column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; rmm::device_buffer null_mask; cudf::size_type null_count = d_column.null_count(); @@ -43,58 +45,46 @@ std::unique_ptr characters_counts( strings_column_view strings, gdf_valid_allocation_size(count), stream, mr); // create output column - auto results = std::make_unique( data_type{INT32}, count, + auto results = std::make_unique( cudf::data_type{cudf::INT32}, count, rmm::device_buffer(count * sizeof(int32_t), stream, mr), null_mask, null_count); auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); - // set lengths + // set the counts thrust::transform( execpol->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), d_lengths, - [d_column] __device__ (int32_t idx) { + [d_column, pfn] __device__ (int32_t idx) { if( d_column.nullable() && d_column.is_null(idx) ) return 0; - return d_column.element(idx).characters(); + return pfn(d_column.element(idx)); }); results->set_null_count(null_count); return results; } +} // namespace + +namespace cudf +{ +namespace strings +{ + +std::unique_ptr characters_counts( strings_column_view strings, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr ) +{ + auto pfn = [] __device__ (const cudf::strings::string_view& d_str) { return d_str.characters(); }; 
+ return counts(strings,pfn,stream,mr); +} + std::unique_ptr bytes_counts( strings_column_view strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { - auto count = strings.size(); - auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(strings.parent(),stream); - auto d_column = *strings_column; - rmm::device_buffer null_mask; - cudf::size_type null_count = d_column.null_count(); - if( d_column.nullable() ) - null_mask = rmm::device_buffer( d_column.null_mask(), - gdf_valid_allocation_size(count), - stream, mr); - // create output column - auto results = std::make_unique( data_type{INT32}, count, - rmm::device_buffer(count * sizeof(int32_t), stream, mr), - null_mask, null_count); - auto results_view = results->mutable_view(); - auto d_lengths = results_view.data(); - // set sizes - thrust::transform( execpol->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), - d_lengths, - [d_column] __device__ (int32_t idx) { - if( d_column.nullable() && d_column.is_null(idx) ) - return 0; - return d_column.element(idx).size(); - }); - // reset null count must be done on the column and not the view - results->set_null_count(null_count); - return results; + auto pfn = [] __device__ (const cudf::strings::string_view& d_str) { return d_str.size(); }; + return counts(strings,pfn,stream,mr); } // @@ -109,7 +99,7 @@ std::unique_ptr code_points( strings_column_view strings, auto d_column = *strings_column; // offsets point to each individual integer range - rmm::device_vector offsets(count); + rmm::device_vector offsets(count); size_type* d_offsets = offsets.data().get(); thrust::transform_inclusive_scan(execpol->on(stream), thrust::make_counting_iterator(0), From 05ca768cb6b18309e1918b72ab0c6e032dc65732 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 30 Sep 2019 17:39:25 -0400 Subject: [PATCH 22/54] remove unused var from lambda --- cpp/src/strings/array.cu | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 6dfeefa21d6..9ad0caf1bf8 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -82,7 +82,7 @@ std::unique_ptr gather( strings_column_view handler, d_indices, d_indices + count, d_new_offsets, - [d_column, d_offsets, d_indices] __device__ (size_type idx) { + [d_column, d_offsets] __device__ (size_type idx) { if( d_column.nullable() && d_column.is_null(idx) ) return 0; size_type offset = idx ? d_offsets[idx-1] : 0; From 5043c7bcf073e7407cc0f2cb22bad75689438062 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Oct 2019 08:11:08 -0400 Subject: [PATCH 23/54] rename utilities.h to utilities.hpp --- cpp/src/strings/array.cu | 2 +- cpp/src/strings/combine.cu | 2 +- cpp/src/strings/utilities.cu | 2 +- cpp/src/strings/{utilities.h => utilities.hpp} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename cpp/src/strings/{utilities.h => utilities.hpp} (100%) diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 9ad0caf1bf8..113dc3f8002 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -20,7 +20,7 @@ #include #include #include -#include "./utilities.h" +#include "./utilities.hpp" #include #include diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 57ac87c6639..5613dcfca7e 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -21,7 +21,7 @@ #include #include #include -#include "./utilities.h" +#include "./utilities.hpp" #include "./utilities.cuh" #include diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 23130824c68..922326d0a29 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -18,7 +18,7 @@ #include #include #include -#include "./utilities.h" +#include "./utilities.hpp" #include #include diff --git a/cpp/src/strings/utilities.h b/cpp/src/strings/utilities.hpp similarity index 100% rename from 
cpp/src/strings/utilities.h rename to cpp/src/strings/utilities.hpp From ce9d417f790431309e282e1465b42a07ff5a1f6e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Oct 2019 08:12:50 -0400 Subject: [PATCH 24/54] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a7e2807c178..22ee7773dec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,7 @@ - PR #2838 CSV Reader: Support ARROW_RANDOM_FILE input - PR #2655 CuPy-based Series and Dataframe .values property - PR #2803 Added `edit_distance_matrix()` function to calculate pairwise edit distance for each string on a given nvstrings object. - +- PR #2811 Start of cudf strings column work based on 2207 ## Improvements From 60bda842aa847d65942cd67a10d67089218e1baf Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Oct 2019 11:29:31 -0400 Subject: [PATCH 25/54] offsets +1 --- .../cudf/column/column_device_view.cuh | 4 +-- cpp/src/column/column_view.cpp | 4 +-- cpp/src/strings/array.cu | 36 +++++++++---------- cpp/src/strings/combine.cu | 12 ++++--- cpp/src/strings/strings_column_factories.cu | 29 ++++++++------- cpp/src/strings/strings_column_view.cu | 36 +++++++++---------- cpp/src/strings/utilities.cu | 31 ++++++++-------- cpp/src/strings/utilities.hpp | 4 ++- cpp/tests/strings/factories_test.cu | 4 +-- 9 files changed, 81 insertions(+), 79 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index ec61a863c72..05574d34a16 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -491,8 +491,8 @@ __device__ inline strings::string_view const column_device_view::element(); const char* d_strings = d_children[1].data(); - size_type offset = index ? 
d_offsets[index-1] : 0; - return strings::string_view{d_strings + offset, d_offsets[index] - offset}; + size_type offset = d_offsets[index]; + return strings::string_view{d_strings + offset, d_offsets[index+1] - offset}; } //template <> diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 562bcd11d6c..949a2d62105 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -42,8 +42,8 @@ column_view_base::column_view_base(data_type type, size_type size, CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); - } else if (size > 0) { - CUDF_EXPECTS(nullptr != data, "Null data pointer."); + //} else if (size > 0) { + // CUDF_EXPECTS(nullptr != data, "Null data pointer."); } CUDF_EXPECTS(offset >= 0, "Invalid offset."); diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 113dc3f8002..1c3ce56125b 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -73,22 +73,21 @@ std::unique_ptr gather( strings_column_view handler, auto d_offsets = handler.offsets().data(); // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, + auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); auto d_new_offsets = offsets_view.data(); - // create new offsets array + // fill new offsets array -- last entry includes the total size thrust::transform_inclusive_scan( execpol->on(stream), - d_indices, - d_indices + count, - d_new_offsets, + d_indices, d_indices + count, d_new_offsets+1, [d_column, d_offsets] __device__ (size_type idx) { if( d_column.nullable() && d_column.is_null(idx) ) return 0; - size_type offset = idx ? 
d_offsets[idx-1] : 0; - return d_offsets[idx] - offset; + return d_offsets[idx+1] - d_offsets[idx]; }, thrust::plus()); + int32_t offset_zero = 0; + cudaMemcpyAsync( d_new_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); // build null mask auto valid_mask = valid_if( static_cast(nullptr), @@ -102,7 +101,7 @@ std::unique_ptr gather( strings_column_view handler, RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future // build chars column - size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count-1]; // this may not be stream friendly + size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count]; // this may not be stream friendly if( (bytes==0) && (null_count < count) ) bytes = 1; // all entries are empty strings auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, @@ -115,8 +114,7 @@ std::unique_ptr gather( strings_column_view handler, if( d_column.nullable() && d_column.is_null(index) ) return; string_view d_str = d_column.element(index); - size_type offset = (idx ? 
d_new_offsets[idx-1] : 0); - memcpy(d_chars + offset, d_str.data(), d_str.size() ); + memcpy(d_chars + d_new_offsets[idx], d_str.data(), d_str.size() ); }); // build children vector @@ -125,7 +123,7 @@ std::unique_ptr gather( strings_column_view handler, children.emplace_back(std::move(chars_column)); return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } @@ -141,7 +139,7 @@ std::unique_ptr sort( strings_column_view handler, auto strings_column = column_device_view::create(handler.parent(), stream); auto d_column = *strings_column; - // lets sort indices + // sort the indices of the strings size_type count = handler.size(); thrust::device_vector indices(count); thrust::sequence( execpol->on(stream), indices.begin(), indices.end() ); @@ -209,11 +207,10 @@ std::unique_ptr scatter( strings_column_view strings, auto d_offsets = offsets_view.data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; // this may not be stream friendly + size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; // this may not be stream friendly if( (bytes==0) && (null_count < count) ) bytes = 1; // all entries are empty strings - auto chars_column = detail::chars_from_string_array(strings_array, d_offsets, - stream, mr); + auto chars_column = detail::chars_from_string_array(strings_array,d_offsets,null_count,stream,mr); // build children vector std::vector> children; @@ -222,7 +219,7 @@ std::unique_ptr scatter( strings_column_view strings, // return new strings column return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } @@ -273,11 +270,10 @@ std::unique_ptr scatter( strings_column_view handler, auto d_offsets = offsets_view.data(); // build chars column - size_type bytes = 
thrust::device_pointer_cast(d_offsets)[count-1]; + size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; if( (bytes==0) && (null_count < count) ) bytes = 1; // all entries are empty strings - auto chars_column = detail::chars_from_string_array(strings, d_offsets, - stream, mr); + auto chars_column = detail::chars_from_string_array(strings,d_offsets,null_count,stream,mr); // build children vector std::vector> children; @@ -286,7 +282,7 @@ std::unique_ptr scatter( strings_column_view handler, // return new strings column return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 5613dcfca7e..4310623ffdc 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -73,7 +73,7 @@ std::unique_ptr concatenate( strings_column_view strings, RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, + auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); auto d_results_offsets = offsets_view.data(); @@ -81,7 +81,7 @@ std::unique_ptr concatenate( strings_column_view strings, thrust::transform_inclusive_scan( execpol->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), - d_results_offsets, + d_results_offsets+1, [d_strings, d_others, d_separator, d_narep] __device__ (size_type idx) { string_view d_str1; if( d_strings.nullable() && d_strings.is_null(idx) ) @@ -110,9 +110,11 @@ std::unique_ptr concatenate( strings_column_view strings, return bytes; }, thrust::plus() ); + int32_t offset_zero = 0; + cudaMemcpyAsync( d_results_offsets, &offset_zero, 
sizeof(int32_t), cudaMemcpyHostToDevice, stream); // build chars column - size_type bytes = thrust::device_pointer_cast(d_results_offsets)[count-1]; // this may not be stream friendly + size_type bytes = thrust::device_pointer_cast(d_results_offsets)[count]; if( (bytes==0) && (null_count < count) ) bytes = 1; // all entries are empty strings auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, @@ -134,7 +136,7 @@ std::unique_ptr concatenate( strings_column_view strings, if( (d_str1.is_null() || d_str2.is_null()) && d_narep.is_null() ) return; // null -- nothing to do // concat the two strings with appropriate separator and narep - size_type offset = (idx ? d_results_offsets[idx-1] : 0); + size_type offset = d_results_offsets[idx]; char* d_buffer = d_results_chars + offset; if( !d_str1.is_null() ) d_buffer = detail::copy_string(d_buffer, d_str1); @@ -154,7 +156,7 @@ std::unique_ptr concatenate( strings_column_view strings, children.emplace_back(std::move(chars_column)); return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index d8ff5aff93e..689e4ad97c8 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -54,17 +54,20 @@ std::unique_ptr make_strings_column( 0, thrust::plus()); CUDF_EXPECTS( bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column" ); - // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, mr ); + // build offsets column -- last entry is the total size + auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); + auto 
d_offsets = offsets_view.data(); thrust::transform_inclusive_scan( execpol->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), - offsets_view.data(), + d_offsets+1, [d_strings] __device__ (size_type idx) { thrust::pair item = d_strings[idx]; return ( item.first ? static_cast(item.second) : 0 ); }, thrust::plus() ); + int32_t offset_zero = 0; + cudaMemcpyAsync( d_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); // create null mask auto valid_mask = valid_if( static_cast(nullptr), @@ -81,16 +84,12 @@ std::unique_ptr make_strings_column( auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); - auto d_offsets = offsets_view.data(); thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, [d_strings, d_offsets, d_chars] __device__(size_type idx){ // place individual strings auto item = d_strings[idx]; if( item.first ) - { - size_type offset = (idx ? 
d_offsets[idx-1] : 0); - memcpy(d_chars + offset, item.first, item.second ); - } + memcpy(d_chars + d_offsets[idx], item.first, item.second ); }); // build children vector @@ -98,9 +97,9 @@ std::unique_ptr make_strings_column( children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(chars_column)); - // see column_view.cpp(45) to see why size must be 0 here + // no data-ptr with count elements plus children return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } @@ -122,11 +121,11 @@ std::unique_ptr make_strings_column( size_type bytes = offsets.back() - offsets[0]; CUDF_EXPECTS( bytes >=0, "invalid offsets vector"); - // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count, mask_state::UNALLOCATED, stream, mr ); + // build offsets column -- this is the number of strings + 1 + auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); - cudaMemcpyAsync( offsets_view.data(), offsets.data().get()+1, - count*sizeof(int32_t), + cudaMemcpyAsync( offsets_view.data(), offsets.data().get(), + (count+1)*sizeof(int32_t), cudaMemcpyDeviceToHost, stream ); // build null bitmask @@ -149,7 +148,7 @@ std::unique_ptr make_strings_column( // return std::make_unique( - data_type{STRING}, 0, rmm::device_buffer{0,stream,mr}, + data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index 42f779600b7..55b0f7a4267 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -35,7 +35,7 @@ strings_column_view::strings_column_view( column_view strings_column ) size_type strings_column_view::size() const { - return 
_parent.child(0).size(); + return _parent.size(); } column_view strings_column_view::parent() const @@ -88,22 +88,24 @@ void print( strings_column_view strings, auto d_strings = strings.chars().data(); // create output strings offsets - rmm::device_vector output_offsets(count,0); + rmm::device_vector output_offsets(count+1,0); + size_t* d_output_offsets = output_offsets.data().get(); thrust::transform_inclusive_scan( execpol->on(0), thrust::make_counting_iterator(start), thrust::make_counting_iterator(end), - output_offsets.begin(), - [d_column, d_strings, max_width, d_offsets] __device__ (size_type idx) { + d_output_offsets+1, + [d_column, max_width] __device__ (size_type idx) { if( d_column.nullable() && d_column.is_null(idx) ) return 0; - size_type offset = idx ? d_offsets[idx-1] : 0; // this logic will be a template - size_type bytes = d_offsets[idx] - offset; // specialization on element() - string_view d_str( d_strings + offset, bytes ); // method of column_device_view + string_view d_str = d_column.element(idx); + size_type bytes = d_str.size(); if( (max_width > 0) && (d_str.characters() > max_width) ) bytes = d_str.byte_offset(max_width); return bytes+1; // allow for null-terminator on non-null strings }, thrust::plus()); + int32_t offset_zero = 0; + cudaMemcpy( d_output_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice); // build output buffer size_t buffer_size = output_offsets.back(); // last element has total size @@ -115,31 +117,30 @@ void print( strings_column_view strings, rmm::device_vector buffer(buffer_size,0); // allocate and pre-null-terminate char* d_buffer = buffer.data().get(); // copy strings into output buffer - size_t* d_output_offsets = output_offsets.data().get(); thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), count, [d_strings, start, d_offsets, d_output_offsets, d_buffer] __device__(size_type idx) { - size_t output_offset = (idx ? 
d_output_offsets[idx-1] : 0); - size_t length = d_output_offsets[idx] - output_offset; // bytes + size_t output_offset = d_output_offsets[idx]; + size_t length = d_output_offsets[idx+1] - output_offset; // bytes if( length ) // this is only 0 for nulls { idx += start; - size_type offset = (idx ? d_offsets[idx-1]:0); + size_type offset = d_offsets[idx]; memcpy(d_buffer + output_offset, d_strings + offset, length-1 ); } }); // copy output buffer to host - std::vector h_offsets(count); - cudaMemcpy( h_offsets.data(), d_output_offsets, count*sizeof(size_t), cudaMemcpyDeviceToHost); + std::vector h_offsets(count+1); + cudaMemcpy( h_offsets.data(), d_output_offsets, (count+1)*sizeof(size_t), cudaMemcpyDeviceToHost); std::vector h_buffer(buffer_size); cudaMemcpy( h_buffer.data(), d_buffer, buffer_size, cudaMemcpyDeviceToHost ); // print out the strings to stdout for( size_type idx=0; idx < count; ++idx ) { - size_t offset = (idx ? h_offsets[idx-1]:0); - size_t length = h_offsets[idx] - offset; + size_t offset = h_offsets[idx]; + size_t length = h_offsets[idx+1] - offset; printf("%d:",idx); if( length ) printf("[%s]", h_buffer.data()+offset); @@ -158,12 +159,11 @@ std::pair, rmm::device_vector> size_type count = strings.size(); auto d_offsets = strings.offsets().data(); - size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; results.second = rmm::device_vector(count+1); - results.second[0] = 0; - cudaMemcpyAsync( results.second.data().get()+1, d_offsets, count*sizeof(size_type), + cudaMemcpyAsync( results.second.data().get(), d_offsets, (count+1)*sizeof(size_type), cudaMemcpyDeviceToHost, stream); + size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; auto d_chars = strings.chars().data(); results.first = rmm::device_vector(bytes); cudaMemcpyAsync( results.first.data().get(), d_chars, bytes, diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 922326d0a29..1f8c9e1d5e1 100644 --- a/cpp/src/strings/utilities.cu +++ 
b/cpp/src/strings/utilities.cu @@ -82,34 +82,38 @@ std::unique_ptr offsets_from_string_array( size_type count = strings.size(); auto d_strings = strings.data().get(); auto execpol = rmm::exec_policy(stream); - auto offsets_column = make_numeric_column( data_type{INT32}, count, + // offsets elements is the number of strings + 1 + auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); auto d_offsets = offsets_view.data(); - // create new offsets array + // create new offsets array -- last entry includes the total size thrust::transform_inclusive_scan( execpol->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), - d_offsets, - [d_strings] __device__ (size_type idx) { - return d_strings[idx].size(); - }, + d_offsets+1, + [d_strings] __device__ (size_type idx) { return d_strings[idx].size(); }, thrust::plus()); - + int32_t offset_zero = 0; + cudaMemcpyAsync( d_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); + // return offsets_column; } // build a strings chars column from an array of string_views std::unique_ptr chars_from_string_array( const rmm::device_vector& strings, - const int32_t* d_offsets, + const int32_t* d_offsets, cudf::size_type null_count, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { size_type count = strings.size(); auto d_strings = strings.data().get(); auto execpol = rmm::exec_policy(stream); - size_type bytes = thrust::device_pointer_cast(d_offsets)[count-1]; + size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; + if( (bytes==0) && (null_count < count) ) + bytes = 1; // all entries are empty strings + // create column auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, @@ -117,13 +121,12 @@ std::unique_ptr chars_from_string_array( // get it's view auto chars_view = chars_column->mutable_view(); auto d_chars = 
chars_view.data(); - thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, + thrust::for_each_n(execpol->on(stream), + thrust::make_counting_iterator(0), count, [d_strings, d_offsets, d_chars] __device__(size_type idx){ string_view d_str = d_strings[idx]; - if( d_str.is_null() ) - return; - size_type offset = (idx ? d_offsets[idx-1] : 0); - memcpy(d_chars + offset, d_str.data(), d_str.size() ); + if( !d_str.is_null() ) + memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size() ); }); return chars_column; diff --git a/cpp/src/strings/utilities.hpp b/cpp/src/strings/utilities.hpp index 9c4215a9938..36dc1a0814e 100644 --- a/cpp/src/strings/utilities.hpp +++ b/cpp/src/strings/utilities.hpp @@ -70,13 +70,15 @@ std::unique_ptr offsets_from_string_array( * strings column from an intermediate strings array. * * @param strings Strings array + * @param d_offsets Offsets array for placing strings into column's memory. + * @param null_count Number of null strings. * @param stream Stream to execute any device code against. * @param mr Memory resource to use. 
* @return chars column */ std::unique_ptr chars_from_string_array( const rmm::device_vector& strings, - const int32_t* d_offsets, + const int32_t* d_offsets, cudf::size_type null_count, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 855e4db7ba7..b4440e5c7a8 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -79,7 +79,7 @@ TEST_F(FactoriesTest, CreateColumnFromArray) cudf::strings_column_view strings_view(column->view()); EXPECT_EQ( strings_view.size(), count); - EXPECT_EQ( strings_view.offsets().size(), count ); + EXPECT_EQ( strings_view.offsets().size(), count+1 ); EXPECT_EQ( strings_view.chars().size(), memsize ); // check string data @@ -128,7 +128,7 @@ TEST_F(FactoriesTest, CreateColumnFromOffsets) cudf::strings_column_view strings_view(column->view()); EXPECT_EQ( strings_view.size(), count); - EXPECT_EQ( strings_view.offsets().size(), count ); + EXPECT_EQ( strings_view.offsets().size(), count+1 ); EXPECT_EQ( strings_view.chars().size(), memsize ); // check string data From 8e2beec245f15a2df60c8069917329e8609f207e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Oct 2019 13:16:48 -0400 Subject: [PATCH 26/54] add comments, fix variable names --- .../cudf/strings/strings_column_view.hpp | 4 +- cpp/src/strings/array.cu | 43 ++++++++++--------- cpp/src/strings/strings_column_factories.cu | 10 +++-- cpp/src/strings/utilities.cuh | 2 +- 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index a18f444d1a5..bcc42bd99a0 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -181,7 +181,7 @@ std::unique_ptr sort( strings_column_view strings, /** * @brief Returns new instance using the provided map values and 
strings. * The map values specify the location in the new strings instance. - * Missing values pass through from the handler instance into those positions. + * Missing values pass through from the column at those positions. * * ``` * s1 = ["a", "b", "c", "d"] @@ -210,7 +210,7 @@ std::unique_ptr scatter( strings_column_view strings, * @brief Returns new instance using the provided index values and a * single string. The map values specify where to place the string * in the new strings instance. Missing values pass through from - * the handler instance at those positions. + * the column at those positions. * * ``` * s1 = ["a", "b", "c", "d"] diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 1c3ce56125b..5e8b5a274bc 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -21,6 +21,7 @@ #include #include #include "./utilities.hpp" +#include "./utilities.cuh" #include #include @@ -35,14 +36,14 @@ namespace strings { // new strings column from subset of this strings instance -std::unique_ptr sublist( strings_column_view handler, +std::unique_ptr sublist( strings_column_view strings, size_type start, size_type end, size_type step, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { if( step <= 0 ) step = 1; - size_type count = handler.size(); + size_type count = strings.size(); if( end < 0 || end > count ) end = count; if( start < 0 || start > end ) @@ -56,11 +57,11 @@ std::unique_ptr sublist( strings_column_view handler, // create a column_view as a wrapper of these indices column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); // build a new strings column from the indices - return gather(handler, indices_view, stream, mr); + return gather(strings, indices_view, stream, mr); } // return new strings column with strings from this instance as specified by the indices -std::unique_ptr gather( strings_column_view handler, +std::unique_ptr gather( strings_column_view strings, column_view gather_map, 
cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { @@ -68,25 +69,27 @@ std::unique_ptr gather( strings_column_view handler, auto d_indices = gather_map.data(); auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent(),stream); + auto strings_column = column_device_view::create(strings.parent(),stream); auto d_column = *strings_column; - auto d_offsets = handler.offsets().data(); + auto d_offsets = strings.offsets().data(); // build offsets column auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); auto d_new_offsets = offsets_view.data(); - // fill new offsets array -- last entry includes the total size + // fill new offsets array + // using inclusive-scan to compute last entry which is the total size thrust::transform_inclusive_scan( execpol->on(stream), - d_indices, d_indices + count, d_new_offsets+1, + d_indices, d_indices + count, + d_new_offsets+1, // fills in entries [1,count] [d_column, d_offsets] __device__ (size_type idx) { if( d_column.nullable() && d_column.is_null(idx) ) return 0; return d_offsets[idx+1] - d_offsets[idx]; }, thrust::plus()); - int32_t offset_zero = 0; + int32_t offset_zero = 0; // need to set the first entry to 0 cudaMemcpyAsync( d_new_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); // build null mask @@ -129,18 +132,18 @@ std::unique_ptr gather( strings_column_view handler, } // return sorted version of the given strings column -std::unique_ptr sort( strings_column_view handler, +std::unique_ptr sort( strings_column_view strings, sort_type stype, bool ascending, bool nullfirst, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(handler.parent(), stream); + auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column 
= *strings_column; // sort the indices of the strings - size_type count = handler.size(); + size_type count = strings.size(); thrust::device_vector indices(count); thrust::sequence( execpol->on(stream), indices.begin(), indices.end() ); thrust::sort( execpol->on(stream), indices.begin(), indices.end(), @@ -158,7 +161,7 @@ std::unique_ptr sort( strings_column_view handler, // create a column_view as a wrapper of these indices column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); // now build a new strings column from the indices - return gather( handler, indices_view, stream, mr ); + return gather( strings, indices_view, stream, mr ); } // @@ -230,13 +233,13 @@ std::unique_ptr scatter( strings_column_view strings, // s3 = s1.scatter('e',pos,2) // ['a','e','c','e'] // -std::unique_ptr scatter( strings_column_view handler, +std::unique_ptr scatter( strings_column_view strings, const char* string, cudf::column_view scatter_map, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { - size_type count = handler.size(); + size_type count = strings.size(); size_type elements = scatter_map.size(); auto execpol = rmm::exec_policy(0); auto d_indices = scatter_map.data(); @@ -244,9 +247,9 @@ std::unique_ptr scatter( strings_column_view handler, auto replace = detail::string_from_host(string, stream); auto d_replace = *replace; // create strings array - rmm::device_vector strings = - detail::create_string_array_from_column(handler, stream); - auto d_strings = strings.data().get(); + rmm::device_vector strings_vector = + detail::create_string_array_from_column(strings, stream); + auto d_strings = strings_vector.data().get(); // replace specific elements thrust::for_each_n(execpol->on(0), thrust::make_counting_iterator(0), elements, @@ -265,7 +268,7 @@ std::unique_ptr scatter( strings_column_view handler, RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future // build offsets column - auto 
offsets_column = detail::offsets_from_string_array(strings,stream,mr); + auto offsets_column = detail::offsets_from_string_array(strings_vector,stream,mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); @@ -273,7 +276,7 @@ std::unique_ptr scatter( strings_column_view handler, size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; if( (bytes==0) && (null_count < count) ) bytes = 1; // all entries are empty strings - auto chars_column = detail::chars_from_string_array(strings,d_offsets,null_count,stream,mr); + auto chars_column = detail::chars_from_string_array(strings_vector,d_offsets,null_count,stream,mr); // build children vector std::vector> children; diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 689e4ad97c8..94675fc456f 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -58,15 +58,20 @@ std::unique_ptr make_strings_column( auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); auto d_offsets = offsets_view.data(); + // Using inclusive-scan to compute last entry which is the total size. + // Exclusive-scan is possible but will not compute that last entry. + // Rather than manually computing the final offset using values in device memory, + // we use inclusive-scan on a shifted output (d_offsets+1) and then set the first + // zero offset manually. thrust::transform_inclusive_scan( execpol->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), - d_offsets+1, + d_offsets+1, // fills in offsets entries [1,count] [d_strings] __device__ (size_type idx) { thrust::pair item = d_strings[idx]; return ( item.first ? 
static_cast(item.second) : 0 ); }, thrust::plus() ); - int32_t offset_zero = 0; + int32_t offset_zero = 0; // set the first offset to 0 cudaMemcpyAsync( d_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); // create null mask @@ -153,5 +158,4 @@ std::unique_ptr make_strings_column( std::move(children)); } - } // namespace cudf diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh index afd51b9b821..da9ded23882 100644 --- a/cpp/src/strings/utilities.cuh +++ b/cpp/src/strings/utilities.cuh @@ -16,6 +16,7 @@ #include #include +#include #include @@ -40,7 +41,6 @@ __device__ inline char* copy_string( char* buffer, const string_view& d_string ) return buffer + d_string.size(); } - } // namespace detail } // namespace strings } // namespace cudf \ No newline at end of file From e8424e49de538ec071ae389536dcc212c8123696 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Oct 2019 14:12:48 -0400 Subject: [PATCH 27/54] characters() -> length() --- cpp/include/cudf/strings/string_view.cuh | 12 +++---- cpp/include/cudf/strings/string_view.inl | 31 ++++++++----------- .../cudf/strings/strings_column_view.hpp | 4 +-- cpp/src/strings/attributes.cu | 4 +-- cpp/src/strings/strings_column_view.cu | 2 +- 5 files changed, 22 insertions(+), 31 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 664b4ae16a8..708f4394159 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -64,13 +64,9 @@ class string_view *---------------------------------------------------------------------------**/ __host__ __device__ size_type size() const; /**---------------------------------------------------------------------------* - * @brief Return the number of bytes in this string - *---------------------------------------------------------------------------**/ - __host__ __device__ size_type length() const; - 
/**---------------------------------------------------------------------------* - * @brief Return the number of characters (UTF-8) in this string + * @brief Return the number of characters in this string *---------------------------------------------------------------------------**/ - __device__ size_type characters() const; + __device__ size_type length() const; /**---------------------------------------------------------------------------* * @brief Return a pointer to the internal device array *---------------------------------------------------------------------------**/ @@ -79,12 +75,12 @@ class string_view /**---------------------------------------------------------------------------* * @brief Return true if string has no characters *---------------------------------------------------------------------------**/ - __device__ bool empty() const; + __host__ __device__ bool empty() const; /**---------------------------------------------------------------------------* * @brief Return true if string pointer is null. * That is, `data()==nullptr` for this instance. *---------------------------------------------------------------------------**/ - __device__ bool is_null() const; + __host__ __device__ bool is_null() const; /**---------------------------------------------------------------------------* * @brief Handy iterator for navigating through encoded characters. 
diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index 057dbba8715..4c9a9833927 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -50,7 +50,7 @@ namespace cudf namespace strings { -__device__ inline string_view::string_view(const char* data, size_type bytes) +__host__ __device__ inline string_view::string_view(const char* data, size_type bytes) : _data(data), _bytes(bytes) {} @@ -61,32 +61,27 @@ __device__ inline string_view::string_view(const char* data) } // -__device__ inline size_type string_view::size() const +__host__ __device__ inline size_type string_view::size() const { return _bytes; } __device__ inline size_type string_view::length() const -{ - return _bytes; -} - -__device__ inline size_type string_view::characters() const { return detail::characters_in_string(_data,_bytes); } -__device__ inline const char* string_view::data() const +__host__ __device__ inline const char* string_view::data() const { return _data; } -__device__ inline bool string_view::empty() const +__host__ __device__ inline bool string_view::empty() const { return _bytes == 0; } -__device__ inline bool string_view::is_null() const +__host__ __device__ inline bool string_view::is_null() const { return _data == nullptr; } @@ -149,7 +144,7 @@ __device__ inline string_view::iterator string_view::begin() const __device__ inline string_view::iterator string_view::end() const { - return iterator(*this, characters()); + return iterator(*this, length()); } __device__ inline char_utf8 string_view::at(size_type pos) const @@ -253,7 +248,7 @@ __device__ inline size_type string_view::find(const char* str, size_type bytes, const char* sptr = data(); if(!str || !bytes) return -1; - size_type nchars = characters(); + size_type nchars = length(); if(count < 0) count = nchars; size_type end = pos + count; @@ -283,7 +278,7 @@ __device__ inline size_type string_view::find(const char* str, size_type 
bytes, __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, int count) const { size_type sz = size(); - size_type nchars = characters(); + size_type nchars = length(); if(count < 0) count = nchars; size_type end = pos + count; @@ -319,7 +314,7 @@ __device__ inline size_type string_view::rfind(const char* str, size_type bytes, if(!str || !bytes) return -1; size_type sz = size(); - size_type nchars = characters(); + size_type nchars = length(); size_type end = pos + count; if(end < 0 || end > nchars) end = nchars; @@ -346,7 +341,7 @@ __device__ inline size_type string_view::rfind(const char* str, size_type bytes, __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, int count) const { size_type sz = size(); - size_type nchars = characters(); + size_type nchars = length(); if(count < 0) count = nchars; size_type end = pos + count; @@ -416,7 +411,7 @@ __device__ inline size_type string_view::split(const char* delim, int count, str count = strsCount; // size_type dchars = (bytes ? detail::characters_in_string(delim,bytes) : 1); - size_type nchars = characters(); + size_type nchars = length(); size_type spos = 0, sidx = 0; size_type epos = find(delim, bytes); while(epos >= 0) @@ -465,8 +460,8 @@ __device__ inline size_type string_view::rsplit(const char* delim, int count, st count = strsCount; // unsigned int dchars = (bytes ? 
detail::characters_in_string(delim,bytes) : 1); - int epos = (int)characters(); // end pos is not inclusive - int sidx = count - 1; // index for strs array + int epos = (int)length(); // end pos is not inclusive + int sidx = count - 1; // index for strs array int spos = rfind(delim, bytes); while(spos >= 0) { diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index bcc42bd99a0..7ad56e2eadf 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -327,12 +327,12 @@ std::unique_ptr concatenate( strings_column_view strings, /**---------------------------------------------------------------------------* * @brief Row-wise concatenates the given list of strings columns with the first column. * - * @code + * ``` * s1 = ['aa', null, '', 'aa'] * s2 = ['', 'bb', 'bb', null] * r = concatenate(s1,s2) * r is ['aa', null, 'bb', null] - * @endcode + * ``` * * @param strings 1st string column. * @param others List of string columns to concatenate. 
diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 5f63dc899c0..be9d9453fa8 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -75,7 +75,7 @@ std::unique_ptr characters_counts( strings_column_view strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { - auto pfn = [] __device__ (const cudf::strings::string_view& d_str) { return d_str.characters(); }; + auto pfn = [] __device__ (const cudf::strings::string_view& d_str) { return d_str.length(); }; return counts(strings,pfn,stream,mr); } @@ -108,7 +108,7 @@ std::unique_ptr code_points( strings_column_view strings, [d_column] __device__(size_type idx){ if( d_column.nullable() && d_column.is_null(idx) ) return 0; - return d_column.element(idx).characters(); + return d_column.element(idx).length(); }, thrust::plus()); diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index 55b0f7a4267..d415cd8daa0 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -99,7 +99,7 @@ void print( strings_column_view strings, return 0; string_view d_str = d_column.element(idx); size_type bytes = d_str.size(); - if( (max_width > 0) && (d_str.characters() > max_width) ) + if( (max_width > 0) && (d_str.length() > max_width) ) bytes = d_str.byte_offset(max_width); return bytes+1; // allow for null-terminator on non-null strings }, From 7b245a16646b322abfcfb6a0657412d12af90bdd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Oct 2019 16:25:17 -0400 Subject: [PATCH 28/54] missed copyright comment --- cpp/include/cudf/strings/string_view.inl | 18 ++++++++++++++++-- cpp/src/column/column_device_view.cu | 1 - cpp/src/strings/strings_column_view.cu | 3 +-- cpp/src/strings/utilities.cuh | 2 +- cpp/src/strings/utilities.hpp | 2 +- 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index 
4c9a9833927..6afbb9041d2 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -1,5 +1,19 @@ /* -*/ + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include @@ -542,4 +556,4 @@ __host__ __device__ inline size_type characters_in_string(const char* str, size_ } // namespace detail } // namespace strings -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 87c8e78b9a8..2bc86b3da65 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -56,7 +56,6 @@ column_device_view::column_device_view( column_view source, ptrdiff_t h_ptr, ptr CUDF_EXPECTS( child.num_children()==0, "column grand-children not currently supported"); new(h_column) column_device_view(child); h_column++; - //d_column++; } } } diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index d415cd8daa0..db08c156f76 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -172,6 +172,5 @@ std::pair, rmm::device_vector> return results; } - } // namespace strings -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh index da9ded23882..6186d4e5ceb 100644 --- 
a/cpp/src/strings/utilities.cuh +++ b/cpp/src/strings/utilities.cuh @@ -43,4 +43,4 @@ __device__ inline char* copy_string( char* buffer, const string_view& d_string ) } // namespace detail } // namespace strings -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/src/strings/utilities.hpp b/cpp/src/strings/utilities.hpp index 36dc1a0814e..6c59c7089ef 100644 --- a/cpp/src/strings/utilities.hpp +++ b/cpp/src/strings/utilities.hpp @@ -84,4 +84,4 @@ std::unique_ptr chars_from_string_array( } // namespace detail } // namespace strings -} // namespace cudf \ No newline at end of file +} // namespace cudf From 704970426f6a6ffa59dc18b76817a72a6580d404 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 2 Oct 2019 08:53:35 -0400 Subject: [PATCH 29/54] size() to size_bytes() --- cpp/include/cudf/strings/string_view.cuh | 2 +- cpp/include/cudf/strings/string_view.inl | 24 ++++++++++++------------ cpp/src/strings/array.cu | 2 +- cpp/src/strings/attributes.cu | 2 +- cpp/src/strings/combine.cu | 10 +++++----- cpp/src/strings/strings_column_view.cu | 2 +- cpp/src/strings/utilities.cu | 4 ++-- cpp/src/strings/utilities.cuh | 4 ++-- 8 files changed, 25 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 708f4394159..eed46598aa3 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -62,7 +62,7 @@ class string_view /**---------------------------------------------------------------------------* * @brief Return the number of bytes in this string *---------------------------------------------------------------------------**/ - __host__ __device__ size_type size() const; + __host__ __device__ size_type size_bytes() const; /**---------------------------------------------------------------------------* * @brief Return the number of characters in this string 
*---------------------------------------------------------------------------**/ diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index 6afbb9041d2..085d7acf9b5 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -75,7 +75,7 @@ __device__ inline string_view::string_view(const char* data) } // -__host__ __device__ inline size_type string_view::size() const +__host__ __device__ inline size_type string_view::size_bytes() const { return _bytes; } @@ -193,7 +193,7 @@ __device__ inline size_type string_view::byte_offset(size_type pos) const __device__ inline int string_view::compare(const string_view& in) const { - return compare(in.data(), in.size()); + return compare(in.data(), in.size_bytes()); } __device__ inline int string_view::compare(const char* data, size_type bytes) const @@ -204,7 +204,7 @@ __device__ inline int string_view::compare(const char* data, size_type bytes) co const unsigned char* ptr2 = reinterpret_cast(data); if(!ptr2) return 1; - size_type len1 = size(); + size_type len1 = size_bytes(); size_type idx = 0; for(; (idx < len1) && (idx < bytes); ++idx) { @@ -254,7 +254,7 @@ __device__ inline bool string_view::operator>=(const string_view& rhs) const __device__ inline size_type string_view::find(const string_view& str, size_type pos, int count) const { - return find(str.data(), str.size(), pos, count); + return find(str.data(), str.size_bytes(), pos, count); } __device__ inline size_type string_view::find(const char* str, size_type bytes, size_type pos, int count) const @@ -291,7 +291,7 @@ __device__ inline size_type string_view::find(const char* str, size_type bytes, // maybe get rid of this one __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, int count) const { - size_type sz = size(); + size_type sz = size_bytes(); size_type nchars = length(); if(count < 0) count = nchars; @@ -319,7 +319,7 @@ __device__ inline size_type 
string_view::find(char_utf8 chr, size_type pos, int __device__ inline size_type string_view::rfind(const string_view& str, size_type pos, int count) const { - return rfind(str.data(), str.size(), pos, count); + return rfind(str.data(), str.size_bytes(), pos, count); } __device__ inline size_type string_view::rfind(const char* str, size_type bytes, size_type pos, int count) const @@ -327,7 +327,7 @@ __device__ inline size_type string_view::rfind(const char* str, size_type bytes, const char* sptr = data(); if(!str || !bytes) return -1; - size_type sz = size(); + size_type sz = size_bytes(); size_type nchars = length(); size_type end = pos + count; if(end < 0 || end > nchars) @@ -354,7 +354,7 @@ __device__ inline size_type string_view::rfind(const char* str, size_type bytes, __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, int count) const { - size_type sz = size(); + size_type sz = size_bytes(); size_type nchars = length(); if(count < 0) count = nchars; @@ -386,8 +386,8 @@ __device__ inline string_view string_view::substr(size_type pos, size_type lengt { size_type spos = byte_offset(pos); size_type epos = byte_offset(pos + length); - if( epos > size() ) - epos = size(); + if( epos > size_bytes() ) + epos = size_bytes(); if(spos >= epos) return string_view("",0); length = epos - spos; // converts length to bytes @@ -397,7 +397,7 @@ __device__ inline string_view string_view::substr(size_type pos, size_type lengt __device__ inline size_type string_view::split(const char* delim, int count, string_view* strs) const { const char* sptr = data(); - size_type sz = size(); + size_type sz = size_bytes(); if(sz == 0) { if(strs && count) @@ -446,7 +446,7 @@ __device__ inline size_type string_view::split(const char* delim, int count, str __device__ inline size_type string_view::rsplit(const char* delim, int count, string_view* strs) const { const char* sptr = data(); - size_type sz = size(); + size_type sz = size_bytes(); if(sz == 0) { if(strs && count) 
diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 5e8b5a274bc..114de4f14b5 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -117,7 +117,7 @@ std::unique_ptr gather( strings_column_view strings, if( d_column.nullable() && d_column.is_null(index) ) return; string_view d_str = d_column.element(index); - memcpy(d_chars + d_new_offsets[idx], d_str.data(), d_str.size() ); + memcpy(d_chars + d_new_offsets[idx], d_str.data(), d_str.size_bytes() ); }); // build children vector diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index be9d9453fa8..ff34023d792 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -83,7 +83,7 @@ std::unique_ptr bytes_counts( strings_column_view strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { - auto pfn = [] __device__ (const cudf::strings::string_view& d_str) { return d_str.size(); }; + auto pfn = [] __device__ (const cudf::strings::string_view& d_str) { return d_str.size_bytes(); }; return counts(strings,pfn,stream,mr); } diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 4310623ffdc..e2dfd2cf770 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -98,15 +98,15 @@ std::unique_ptr concatenate( strings_column_view strings, size_type bytes = 0; // left-side if( !d_str1.is_null() ) - bytes = d_str1.size(); + bytes = d_str1.size_bytes(); else if( !d_narep.is_null() ) - bytes = d_narep.size(); + bytes = d_narep.size_bytes(); // separator - bytes += d_separator.size(); + bytes += d_separator.size_bytes(); if( !d_str2.is_null() ) - bytes += d_str2.size(); + bytes += d_str2.size_bytes(); else if( !d_narep.is_null() ) - bytes += d_narep.size(); + bytes += d_narep.size_bytes(); return bytes; }, thrust::plus() ); diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index db08c156f76..358f805d3cb 100644 --- a/cpp/src/strings/strings_column_view.cu +++ 
b/cpp/src/strings/strings_column_view.cu @@ -98,7 +98,7 @@ void print( strings_column_view strings, if( d_column.nullable() && d_column.is_null(idx) ) return 0; string_view d_str = d_column.element(idx); - size_type bytes = d_str.size(); + size_type bytes = d_str.size_bytes(); if( (max_width > 0) && (d_str.length() > max_width) ) bytes = d_str.byte_offset(max_width); return bytes+1; // allow for null-terminator on non-null strings diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 1f8c9e1d5e1..c6e098c8401 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -93,7 +93,7 @@ std::unique_ptr offsets_from_string_array( thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), d_offsets+1, - [d_strings] __device__ (size_type idx) { return d_strings[idx].size(); }, + [d_strings] __device__ (size_type idx) { return d_strings[idx].size_bytes(); }, thrust::plus()); int32_t offset_zero = 0; cudaMemcpyAsync( d_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); @@ -126,7 +126,7 @@ std::unique_ptr chars_from_string_array( [d_strings, d_offsets, d_chars] __device__(size_type idx){ string_view d_str = d_strings[idx]; if( !d_str.is_null() ) - memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size() ); + memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes() ); }); return chars_column; diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh index 6186d4e5ceb..b456e36ffc5 100644 --- a/cpp/src/strings/utilities.cuh +++ b/cpp/src/strings/utilities.cuh @@ -37,8 +37,8 @@ namespace detail */ __device__ inline char* copy_string( char* buffer, const string_view& d_string ) { - memcpy( buffer, d_string.data(), d_string.size() ); - return buffer + d_string.size(); + memcpy( buffer, d_string.data(), d_string.size_bytes() ); + return buffer + d_string.size_bytes(); } } // namespace detail From f601bef4a3448d23364f893aa2d68aa52c0017b5 Mon Sep 17 00:00:00 2001 From: David 
Wendt Date: Wed, 2 Oct 2019 09:21:49 -0400 Subject: [PATCH 30/54] use cudaMemset --- cpp/src/column/column_device_view.cu | 11 +++++------ cpp/src/column/column_view.cpp | 2 -- cpp/src/strings/array.cu | 4 ++-- cpp/src/strings/combine.cu | 3 +-- cpp/src/strings/strings_column_factories.cu | 4 ++-- cpp/src/strings/strings_column_view.cu | 3 +-- cpp/src/strings/utilities.cu | 3 +-- cpp/src/table/table_device_view.cu | 3 --- cpp/tests/CMakeLists.txt | 4 ++-- cpp/tests/strings/utilities.h | 2 +- 10 files changed, 15 insertions(+), 24 deletions(-) diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 2bc86b3da65..f05ebeca92c 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -69,14 +69,14 @@ mutable_column_device_view::mutable_column_device_view( mutable_column_view sour // Construct a unique_ptr that invokes `destroy()` as it's deleter std::unique_ptr> column_device_view::create(column_view source, cudaStream_t stream) { - //size_type num_descendants{count_descendants(source)}; - //if( num_descendants > 0 ) { - // CUDF_FAIL("Columns with children are not currently supported."); - // } + size_type num_children = source.num_children(); + if( count_descendants(source) > num_children ) { + CUDF_FAIL("Columns with grand-children are not currently supported."); + } auto deleter = [](column_device_view* v) { v->destroy(); }; std::unique_ptr p{ new column_device_view(source), deleter}; - size_type num_children = source.num_children(); + if( num_children > 0 ) { // ignore grand-children right now @@ -84,7 +84,6 @@ std::unique_ptr> co for( size_type idx=0; idx < num_children; ++idx ) { column_device_view child(source.child(idx)); - CUDF_EXPECTS( child._num_children==0, "column grand-children not currently supported"); CUDA_TRY(cudaMemcpyAsync(p->d_children+idx, &child, sizeof(column_device_view), cudaMemcpyHostToDevice, stream)); } diff --git a/cpp/src/column/column_view.cpp 
b/cpp/src/column/column_view.cpp index 949a2d62105..1f7814b41e1 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -42,8 +42,6 @@ column_view_base::column_view_base(data_type type, size_type size, CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); - //} else if (size > 0) { - // CUDF_EXPECTS(nullptr != data, "Null data pointer."); } CUDF_EXPECTS(offset >= 0, "Invalid offset."); diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 114de4f14b5..9c2d9a1585d 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -89,8 +89,8 @@ std::unique_ptr gather( strings_column_view strings, return d_offsets[idx+1] - d_offsets[idx]; }, thrust::plus()); - int32_t offset_zero = 0; // need to set the first entry to 0 - cudaMemcpyAsync( d_new_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); + // need to set the first entry to 0 + cudaMemsetAsync( d_new_offsets, 0, sizeof(*d_new_offsets), stream); // build null mask auto valid_mask = valid_if( static_cast(nullptr), diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index e2dfd2cf770..6acafb981e4 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -110,8 +110,7 @@ std::unique_ptr concatenate( strings_column_view strings, return bytes; }, thrust::plus() ); - int32_t offset_zero = 0; - cudaMemcpyAsync( d_results_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); + cudaMemsetAsync( d_results_offsets, 0, sizeof(*d_results_offsets), stream); // build chars column size_type bytes = thrust::device_pointer_cast(d_results_offsets)[count]; diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 94675fc456f..507f75a0cac 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -71,8 +71,8 @@ 
std::unique_ptr make_strings_column( return ( item.first ? static_cast(item.second) : 0 ); }, thrust::plus() ); - int32_t offset_zero = 0; // set the first offset to 0 - cudaMemcpyAsync( d_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); + // set the first offset to 0 + cudaMemsetAsync( d_offsets, 0, sizeof(*d_offsets), stream); // create null mask auto valid_mask = valid_if( static_cast(nullptr), diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index 358f805d3cb..f0f5cfda4ae 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -104,8 +104,7 @@ void print( strings_column_view strings, return bytes+1; // allow for null-terminator on non-null strings }, thrust::plus()); - int32_t offset_zero = 0; - cudaMemcpy( d_output_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice); + cudaMemset( d_output_offsets, 0, sizeof(*d_output_offsets)); // build output buffer size_t buffer_size = output_offsets.back(); // last element has total size diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index c6e098c8401..512175d3261 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -95,8 +95,7 @@ std::unique_ptr offsets_from_string_array( d_offsets+1, [d_strings] __device__ (size_type idx) { return d_strings[idx].size_bytes(); }, thrust::plus()); - int32_t offset_zero = 0; - cudaMemcpyAsync( d_offsets, &offset_zero, sizeof(int32_t), cudaMemcpyHostToDevice, stream); + cudaMemsetAsync( d_offsets, 0, sizeof(*d_offsets), stream); // return offsets_column; } diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index eeae71e77d3..9910448cf50 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -48,9 +48,6 @@ table_device_view_base::table_device_view_base( return init + ColumnDeviceView::extent(col); }); - //CUDA_TRY(cudaMemcpyAsync(_columns, 
&(*source_view.begin()), - // views_size_bytes, cudaMemcpyDefault, stream)); - std::vector h_buffer(views_size_bytes); ColumnDeviceView* h_column = reinterpret_cast(h_buffer.data()); int8_t* h_end = (int8_t*)(h_column + _num_columns); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index bed491b0f2a..7cc51906709 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -465,7 +465,7 @@ ConfigureTest(SEARCH_TEST "${SEARCH_TEST_SRC}") set(TRAITS_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/types/traits_test.cpp") -#ConfigureTest(TRAITS_TEST "${TRAITS_TEST_SRC}") +ConfigureTest(TRAITS_TEST "${TRAITS_TEST_SRC}") ################################################################################################### # - factories test ----------------------------------------------------------------------------------- @@ -473,7 +473,7 @@ set(TRAITS_TEST_SRC set(FACTORIES_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/column/factories_test.cpp") -#ConfigureTest(FACTORIES_TEST "${FACTORIES_TEST_SRC}") +ConfigureTest(FACTORIES_TEST "${FACTORIES_TEST_SRC}") ################################################################################################### # - dispatcher test ----------------------------------------------------------------------------------- diff --git a/cpp/tests/strings/utilities.h b/cpp/tests/strings/utilities.h index cadd40d73ad..349be5ee04a 100644 --- a/cpp/tests/strings/utilities.h +++ b/cpp/tests/strings/utilities.h @@ -43,4 +43,4 @@ std::unique_ptr create_strings_column( const std::vector Date: Wed, 2 Oct 2019 09:47:13 -0400 Subject: [PATCH 31/54] use cudf sort enums for sort() --- cpp/include/cudf/strings/strings_column_view.hpp | 8 ++++---- cpp/src/strings/array.cu | 9 +++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 7ad56e2eadf..6db660cd889 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ 
b/cpp/include/cudf/strings/strings_column_view.hpp @@ -165,16 +165,16 @@ enum sort_type { * * @param strings Strings instance for this operation. * @param stype Specify what attribute of the string to sort on. - * @param ascending Sort strings in ascending or descending order. - * @param nullfirst Sort nulls to the beginning or the end of the new column. + * @param order Sort strings in ascending or descending order. + * @param null_order Sort nulls to the beginning or the end of the new column. * @param stream CUDA stream to use kernels in this method. * @param mr Resource for allocating device memory. * @return New strings column with sorted elements of this instance. *---------------------------------------------------------------------------**/ std::unique_ptr sort( strings_column_view strings, sort_type stype, - bool ascending=true, - bool nullfirst=true, + cudf::order order=cudf::order::ASCENDING, + cudf::null_order null_order=cudf::null_order::BEFORE, cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu index 9c2d9a1585d..023b9a302b2 100644 --- a/cpp/src/strings/array.cu +++ b/cpp/src/strings/array.cu @@ -134,7 +134,8 @@ std::unique_ptr gather( strings_column_view strings, // return sorted version of the given strings column std::unique_ptr sort( strings_column_view strings, sort_type stype, - bool ascending, bool nullfirst, + cudf::order order, + cudf::null_order null_order, cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { @@ -147,15 +148,15 @@ std::unique_ptr sort( strings_column_view strings, thrust::device_vector indices(count); thrust::sequence( execpol->on(stream), indices.begin(), indices.end() ); thrust::sort( execpol->on(stream), indices.begin(), indices.end(), - [d_column, stype, ascending, nullfirst] __device__ (size_type lhs, size_type rhs) { + [d_column, stype, order, null_order] __device__ (size_type lhs, size_type rhs) { bool 
lhs_null{d_column.nullable() && d_column.is_null(lhs)}; bool rhs_null{d_column.nullable() && d_column.is_null(rhs)}; if( lhs_null || rhs_null ) - return (nullfirst ? !rhs_null : !lhs_null); + return (null_order==cudf::null_order::BEFORE ? !rhs_null : !lhs_null); string_view lhs_str = d_column.element(lhs); string_view rhs_str = d_column.element(rhs); int cmp = lhs_str.compare(rhs_str); - return (ascending ? (cmp<0) : (cmp>0)); + return (order==cudf::order::ASCENDING ? (cmp<0) : (cmp>0)); }); // create a column_view as a wrapper of these indices From eeafe063872a845face5e4b9e5affcaf470efec6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 2 Oct 2019 11:34:36 -0400 Subject: [PATCH 32/54] too many files to review --- cpp/CMakeLists.txt | 4 - cpp/tests/CMakeLists.txt | 6 +- cpp/tests/strings/array_tests.cu | 147 -------------------------- cpp/tests/strings/attributes_tests.cu | 78 -------------- cpp/tests/strings/combine_tests.cu | 49 --------- cpp/tests/strings/utilities.cu | 105 ------------------ cpp/tests/strings/utilities.h | 46 -------- 7 files changed, 1 insertion(+), 434 deletions(-) delete mode 100644 cpp/tests/strings/array_tests.cu delete mode 100644 cpp/tests/strings/attributes_tests.cu delete mode 100644 cpp/tests/strings/combine_tests.cu delete mode 100644 cpp/tests/strings/utilities.cu delete mode 100644 cpp/tests/strings/utilities.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 87f6cb08d38..130ed69e3e1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -433,10 +433,6 @@ add_library(cudf src/sort/sort.cu src/strings/strings_column_factories.cu src/strings/strings_column_view.cu - src/strings/array.cu - src/strings/attributes.cu - src/strings/combine.cu - src/strings/utilities.cu src/column/legacy/interop.cpp) # Rename installation to proper names for later finding diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7cc51906709..91d89ce7277 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt 
@@ -487,11 +487,7 @@ ConfigureTest(DISPATCHER_TEST "${DISPATCHER_TEST_SRC}") # - strings test -------------------------------------------------------------------------------------- set(STRINGS_TEST_SRC - "${CMAKE_CURRENT_SOURCE_DIR}/strings/factories_test.cu" - "${CMAKE_CURRENT_SOURCE_DIR}/strings/array_tests.cu" - "${CMAKE_CURRENT_SOURCE_DIR}/strings/attributes_tests.cu" - "${CMAKE_CURRENT_SOURCE_DIR}/strings/combine_tests.cu" - "${CMAKE_CURRENT_SOURCE_DIR}/strings/utilities.cu") + "${CMAKE_CURRENT_SOURCE_DIR}/strings/factories_test.cu") ConfigureTest(STRINGS_TEST "${STRINGS_TEST_SRC}") # - bitmask tests --------------------------------------------------------------------------------- diff --git a/cpp/tests/strings/array_tests.cu b/cpp/tests/strings/array_tests.cu deleted file mode 100644 index 1c5993357b5..00000000000 --- a/cpp/tests/strings/array_tests.cu +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -#include -#include "./utilities.h" - -#include -#include - - -struct ArrayTest : public cudf::test::BaseFixture {}; - -TEST_F(ArrayTest, Sort) -{ - std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; - std::vector h_expected{ nullptr, "", "aa", "bb", "bbb", "eee", "ééé" }; - - auto d_strings = cudf::test::create_strings_column(h_strings); - auto strings_view = cudf::strings_column_view(d_strings->view()); - - auto results = cudf::strings::sort(strings_view, cudf::strings::name); - auto results_view = cudf::strings_column_view(results->view()); - - auto d_expected = cudf::test::create_strings_column(h_expected); - auto expected_view = cudf::strings_column_view(d_expected->view()); - - cudf::test::expect_strings_columns_equal(results_view, expected_view); -} - -class ArrayTestParms1 : public ArrayTest, - public testing::WithParamInterface {}; - -TEST_P(ArrayTestParms1, Sublist) -{ - std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; - cudf::size_type start = 3; - cudf::size_type end = GetParam(); - std::vector h_expected; - if( end > start ) - { - for( cudf::size_type idx=start; (idx < end) && (idx < (cudf::size_type)h_strings.size()); ++idx ) - h_expected.push_back( h_strings[idx] ); - } - - auto d_strings = cudf::test::create_strings_column(h_strings); - auto strings_view = cudf::strings_column_view(d_strings->view()); - - auto results = cudf::strings::sublist(strings_view,start,end); - auto results_view = cudf::strings_column_view(results->view()); - - auto d_expected = cudf::test::create_strings_column(h_expected); - auto expected_view = cudf::strings_column_view(d_expected->view()); - - cudf::test::expect_strings_columns_equal(results_view, expected_view); -} - -INSTANTIATE_TEST_CASE_P(SublistParms, ArrayTestParms1, - testing::ValuesIn(std::array{5,6,7})); - -TEST_F(ArrayTest, Gather) -{ - std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; - std::vector h_expected{ 
"aa", "bb" }; - - auto d_strings = cudf::test::create_strings_column(h_strings); - auto strings_view = cudf::strings_column_view(d_strings->view()); - - rmm::device_vector gather_map(2,0); - gather_map[0] = 4; - gather_map[1] = 1; - cudf::column_view gather_map_view( cudf::data_type{cudf::INT32}, gather_map.size(), - gather_map.data().get(), nullptr, 0); - - auto results = cudf::strings::gather(strings_view,gather_map_view); - auto results_view = cudf::strings_column_view(results->view()); - - auto d_expected = cudf::test::create_strings_column(h_expected); - auto expected_view = cudf::strings_column_view(d_expected->view()); - - cudf::test::expect_strings_columns_equal(results_view, expected_view); -} - -TEST_F(ArrayTest, Scatter) -{ - std::vector h_strings1{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; - std::vector h_strings2{ "1", "22" }; - std::vector h_expected{ "eee", "22", nullptr, "", "1", "bbb", "ééé" }; - - auto d_strings1 = cudf::test::create_strings_column(h_strings1); - auto view1 = cudf::strings_column_view(d_strings1->view()); - auto d_strings2 = cudf::test::create_strings_column(h_strings2); - auto view2 = cudf::strings_column_view(d_strings2->view()); - - rmm::device_vector scatter_map(2,0); - scatter_map[0] = 4; - scatter_map[1] = 1; - cudf::column_view scatter_map_view( cudf::data_type{cudf::INT32}, scatter_map.size(), - scatter_map.data().get(), nullptr, 0); - - auto results = cudf::strings::scatter(view1,view2,scatter_map_view); - auto results_view = cudf::strings_column_view(results->view()); - - auto d_expected = cudf::test::create_strings_column(h_expected); - auto expected_view = cudf::strings_column_view(d_expected->view()); - - cudf::test::expect_strings_columns_equal(results_view, expected_view); -} - -TEST_F(ArrayTest, ScatterScalar) -{ - std::vector h_strings{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; - std::vector h_expected{ "eee", "---", nullptr, "", "---", "bbb", "ééé" }; - - auto d_strings = 
cudf::test::create_strings_column(h_strings); - auto view = cudf::strings_column_view(d_strings->view()); - - rmm::device_vector scatter_map(2,0); - scatter_map[0] = 4; - scatter_map[1] = 1; - cudf::column_view scatter_map_view( cudf::data_type{cudf::INT32}, scatter_map.size(), - scatter_map.data().get(), nullptr, 0); - - auto results = cudf::strings::scatter(view,"---",scatter_map_view); - auto results_view = cudf::strings_column_view(results->view()); - - auto d_expected = cudf::test::create_strings_column(h_expected); - auto expected_view = cudf::strings_column_view(d_expected->view()); - - cudf::test::expect_strings_columns_equal(results_view, expected_view); -} diff --git a/cpp/tests/strings/attributes_tests.cu b/cpp/tests/strings/attributes_tests.cu deleted file mode 100644 index 59f2a283c49..00000000000 --- a/cpp/tests/strings/attributes_tests.cu +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include "./utilities.h" - -#include - - -struct AttrsTest : public GdfTest {}; - - -TEST_F(AttrsTest, BytesCounts) -{ - std::vector h_test_strings{ "xyz", "", "aé", nullptr, "bbb", "éé" }; - std::vector h_bytes{ 3, 0, 3, 0, 3, 4 }; - std::vector h_nbits{ 0x0037 }; - - auto strings = cudf::test::create_strings_column(h_test_strings); - auto strings_view = cudf::strings_column_view(strings->view()); - - auto column = cudf::strings::bytes_counts(strings_view); - rmm::device_vector d_expected(h_bytes); - rmm::device_vector d_nbits(h_nbits); - cudf::column_view column_expected( cudf::data_type{cudf::INT32}, d_expected.size(), - d_expected.data().get(), d_nbits.data().get(), 1 ); - cudf::test::expect_columns_equal(column->view(), column_expected); -} - -TEST_F(AttrsTest, CharactersCounts) -{ - std::vector h_test_strings{ "xyz", "", "aé", nullptr, "bbb", "éé" }; - std::vector h_characters{ 3, 0, 2, 0, 3, 2 }; - std::vector h_nbits{ 0x0037 }; - - auto strings = cudf::test::create_strings_column(h_test_strings); - auto strings_view = cudf::strings_column_view(strings->view()); - - auto column = cudf::strings::characters_counts(strings_view); - rmm::device_vector d_expected(h_characters); - rmm::device_vector d_nbits(h_nbits); - cudf::column_view column_expected( cudf::data_type{cudf::INT32}, d_expected.size(), - d_expected.data().get(), d_nbits.data().get(), 1 ); - cudf::test::expect_columns_equal(column->view(), column_expected); -} - -TEST_F(AttrsTest, CodePoints) -{ - std::vector h_test_strings{ "xyz", "", "aé", nullptr, "bbb", "éé" }; - std::vector h_codepoints{ 120, 121, 122, 97, 50089, 98, 98, 98, 50089, 50089 }; - - auto strings = cudf::test::create_strings_column(h_test_strings); - auto strings_view = cudf::strings_column_view(strings->view()); - - auto column = cudf::strings::code_points(strings_view); - rmm::device_vector d_expected(h_codepoints); - cudf::column_view column_expected( 
cudf::data_type{cudf::INT32}, d_expected.size(), - d_expected.data().get(), nullptr, 0 ); - cudf::test::expect_columns_equal(column->view(), column_expected); -} diff --git a/cpp/tests/strings/combine_tests.cu b/cpp/tests/strings/combine_tests.cu deleted file mode 100644 index c7bff891245..00000000000 --- a/cpp/tests/strings/combine_tests.cu +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include -#include -#include "./utilities.h" - -#include - - -struct CombineTest : public GdfTest {}; - -TEST_F(CombineTest, Concatenate) -{ - std::vector h_strings1{ "eee", "bb", nullptr, "", "aa", "bbb", "ééé" }; - std::vector h_strings2{ "xyz", "abc", "d", "éa", "", nullptr, "f" }; - std::vector h_expected{ "eeexyz", "bbabc", nullptr, "éa", "aa", nullptr, "éééf" }; - - auto d_strings1 = cudf::test::create_strings_column(h_strings1); - auto view1 = cudf::strings_column_view(d_strings1->view()); - auto d_strings2 = cudf::test::create_strings_column(h_strings2); - auto view2 = cudf::strings_column_view(d_strings2->view()); - - auto results = cudf::strings::concatenate(view1,view2); - auto results_view = cudf::strings_column_view(results->view()); - - auto d_expected = cudf::test::create_strings_column(h_expected); - auto expected_view = cudf::strings_column_view(d_expected->view()); - - cudf::test::expect_strings_columns_equal(results_view, 
expected_view); -} diff --git a/cpp/tests/strings/utilities.cu b/cpp/tests/strings/utilities.cu deleted file mode 100644 index 4c009b421d8..00000000000 --- a/cpp/tests/strings/utilities.cu +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "./utilities.h" - -#include -#include -#include - -#include -#include -#include - -#include - -namespace cudf { -namespace test { - -// -std::unique_ptr create_strings_column( const std::vector& h_strings ) -{ - cudf::size_type memsize = 0; - for( auto itr=h_strings.begin(); itr!=h_strings.end(); ++itr ) - memsize += *itr ? 
(cudf::size_type)strlen(*itr) : 0; - if( memsize==0 && h_strings.size() ) - memsize = 1; // prevent vectors from being null in all empty-string case - cudf::size_type count = (cudf::size_type)h_strings.size(); - thrust::host_vector h_buffer(memsize); - thrust::device_vector d_buffer(memsize); - thrust::host_vector > strings(count); - cudf::size_type offset = 0; - for( cudf::size_type idx=0; idx < count; ++idx ) - { - const char* str = h_strings[idx]; - if( !str ) - strings[idx] = thrust::pair{nullptr,0}; - else - { - cudf::size_type length = (cudf::size_type)strlen(str); - memcpy( h_buffer.data() + offset, str, length ); - strings[idx] = thrust::pair{d_buffer.data().get()+offset,(size_t)length}; - offset += length; - } - } - rmm::device_vector> d_strings(strings); - cudaMemcpy( d_buffer.data().get(), h_buffer.data(), memsize, cudaMemcpyHostToDevice ); - return cudf::make_strings_column( d_strings ); -} - -struct compare_strings_fn -{ - __device__ bool operator()(int lidx, int ridx) - { - if( (d_lhs.nullable() && d_lhs.is_null(lidx)) || - (d_rhs.nullable() && d_rhs.is_null(ridx)) ) - return d_lhs.is_null(lidx)==d_rhs.is_null(ridx); - cudf::strings::string_view lstr = d_lhs.element(lidx); - cudf::strings::string_view rstr = d_rhs.element(ridx); - return lstr.compare(rstr)==0; - } - column_device_view d_lhs; - column_device_view d_rhs; -}; - -// -void expect_strings_columns_equal(cudf::strings_column_view lhs, cudf::strings_column_view rhs) -{ - EXPECT_EQ(lhs.size(), rhs.size()); - EXPECT_EQ(lhs.null_count(), rhs.null_count()); - - // this almost works - //auto d_lhs = cudf::table_device_view::create(table_view{{lhs.parent()}}); - //auto d_rhs = cudf::table_device_view::create(table_view{{rhs.parent()}}); - //EXPECT_TRUE( - // thrust::equal(thrust::device, thrust::make_counting_iterator(0), - // thrust::make_counting_iterator(lhs.size()), - // thrust::make_counting_iterator(0), - // cudf::exp::row_equality_comparator{*d_lhs, *d_rhs})); - 
//CUDA_TRY(cudaDeviceSynchronize()); - - auto col_lhs = column_device_view::create(lhs.parent()); - auto col_rhs = column_device_view::create(rhs.parent()); - - EXPECT_TRUE( - thrust::equal(thrust::device, thrust::make_counting_iterator(0), - thrust::make_counting_iterator((int)lhs.size()), - thrust::make_counting_iterator(0), - compare_strings_fn{*col_lhs,*col_rhs})); -} - -} // namespace test -} // namespace cudf diff --git a/cpp/tests/strings/utilities.h b/cpp/tests/strings/utilities.h deleted file mode 100644 index 349be5ee04a..00000000000 --- a/cpp/tests/strings/utilities.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -namespace cudf { -namespace test { - -/**---------------------------------------------------------------------------* - * @brief Utility for creating a strings column from a vector of host strings - * - * @param h_strings Pointer to null-terminated, UTF-8 encode chars arrays. - * @return column instance of type STRING - *---------------------------------------------------------------------------**/ -std::unique_ptr create_strings_column( const std::vector& h_strings ); - -/**---------------------------------------------------------------------------* - * @brief Verifies the element-wise equality of two strings columns. - * - * Treats null elements as equivalent. 
- * Based on `expect_columns_equal()` in tests/utilities/column_utilities.cu - * - * @param lhs The first column - * @param rhs The second column - *---------------------------------------------------------------------------**/ -void expect_strings_columns_equal(cudf::strings_column_view lhs, cudf::strings_column_view rhs); - -} // namespace test -} // namespace cudf From 0d0c66b9883aca0d275036be9384085e3635ebd2 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 2 Oct 2019 12:40:15 -0400 Subject: [PATCH 33/54] remove more files --- cpp/src/strings/array.cu | 295 ---------------------------------- cpp/src/strings/attributes.cu | 140 ---------------- cpp/src/strings/combine.cu | 164 ------------------- cpp/src/strings/utilities.cu | 136 ---------------- cpp/src/strings/utilities.cuh | 46 ------ cpp/src/strings/utilities.hpp | 87 ---------- 6 files changed, 868 deletions(-) delete mode 100644 cpp/src/strings/array.cu delete mode 100644 cpp/src/strings/attributes.cu delete mode 100644 cpp/src/strings/combine.cu delete mode 100644 cpp/src/strings/utilities.cu delete mode 100644 cpp/src/strings/utilities.cuh delete mode 100644 cpp/src/strings/utilities.hpp diff --git a/cpp/src/strings/array.cu b/cpp/src/strings/array.cu deleted file mode 100644 index 023b9a302b2..00000000000 --- a/cpp/src/strings/array.cu +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include -#include "./utilities.hpp" -#include "./utilities.cuh" - -#include -#include -#include -#include -#include -#include - -namespace cudf -{ -namespace strings -{ - -// new strings column from subset of this strings instance -std::unique_ptr sublist( strings_column_view strings, - size_type start, size_type end, - size_type step, cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - if( step <= 0 ) - step = 1; - size_type count = strings.size(); - if( end < 0 || end > count ) - end = count; - if( start < 0 || start > end ) - throw std::invalid_argument("invalid start parameter"); - count = (end - start)/step; - // - auto execpol = rmm::exec_policy(stream); - // build indices - thrust::device_vector indices(count); - thrust::sequence( execpol->on(stream), indices.begin(), indices.end(), start, step ); - // create a column_view as a wrapper of these indices - column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); - // build a new strings column from the indices - return gather(strings, indices_view, stream, mr); -} - -// return new strings column with strings from this instance as specified by the indices -std::unique_ptr gather( strings_column_view strings, - column_view gather_map, cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - size_type count = gather_map.size(); - auto d_indices = gather_map.data(); - - auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(strings.parent(),stream); - auto d_column = *strings_column; - auto d_offsets = strings.offsets().data(); - - // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, - stream, mr ); - auto offsets_view = offsets_column->mutable_view(); - auto d_new_offsets = offsets_view.data(); - // fill new offsets array - // using inclusive-scan to compute last entry which is the total size - 
thrust::transform_inclusive_scan( execpol->on(stream), - d_indices, d_indices + count, - d_new_offsets+1, // fills in entries [1,count] - [d_column, d_offsets] __device__ (size_type idx) { - if( d_column.nullable() && d_column.is_null(idx) ) - return 0; - return d_offsets[idx+1] - d_offsets[idx]; - }, - thrust::plus()); - // need to set the first entry to 0 - cudaMemsetAsync( d_new_offsets, 0, sizeof(*d_new_offsets), stream); - - // build null mask - auto valid_mask = valid_if( static_cast(nullptr), - [d_column, d_indices] __device__ (size_type idx) { - return !d_column.nullable() || !d_column.is_null(d_indices[idx]); - }, - count, stream ); - auto null_count = valid_mask.second; - auto null_size = gdf_valid_allocation_size(count); - rmm::device_buffer null_mask(valid_mask.first,null_size,stream,mr); // does deep copy - RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future - - // build chars column - size_type bytes = thrust::device_pointer_cast(d_new_offsets)[count]; // this may not be stream friendly - if( (bytes==0) && (null_count < count) ) - bytes = 1; // all entries are empty strings - auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, - stream, mr ); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, - [d_column, d_indices, d_new_offsets, d_chars] __device__(size_type idx){ - size_type index = d_indices[idx]; - if( d_column.nullable() && d_column.is_null(index) ) - return; - string_view d_str = d_column.element(index); - memcpy(d_chars + d_new_offsets[idx], d_str.data(), d_str.size_bytes() ); - }); - - // build children vector - std::vector> children; - children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); - - return std::make_unique( - data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, - null_mask, 
null_count, - std::move(children)); -} - -// return sorted version of the given strings column -std::unique_ptr sort( strings_column_view strings, - sort_type stype, - cudf::order order, - cudf::null_order null_order, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - - // sort the indices of the strings - size_type count = strings.size(); - thrust::device_vector indices(count); - thrust::sequence( execpol->on(stream), indices.begin(), indices.end() ); - thrust::sort( execpol->on(stream), indices.begin(), indices.end(), - [d_column, stype, order, null_order] __device__ (size_type lhs, size_type rhs) { - bool lhs_null{d_column.nullable() && d_column.is_null(lhs)}; - bool rhs_null{d_column.nullable() && d_column.is_null(rhs)}; - if( lhs_null || rhs_null ) - return (null_order==cudf::null_order::BEFORE ? !rhs_null : !lhs_null); - string_view lhs_str = d_column.element(lhs); - string_view rhs_str = d_column.element(rhs); - int cmp = lhs_str.compare(rhs_str); - return (order==cudf::order::ASCENDING ? 
(cmp<0) : (cmp>0)); - }); - - // create a column_view as a wrapper of these indices - column_view indices_view( data_type{INT32}, count, indices.data().get(), nullptr, 0 ); - // now build a new strings column from the indices - return gather( strings, indices_view, stream, mr ); -} - -// -// s1 = ['a','b,'c','d'] -// s2 = ['e','f'] -// pos = [1,3] -- must be the same length as s2 -// s3 = s1.scatter(s2,pos) -// ['a','e','c','f'] -// -std::unique_ptr scatter( strings_column_view strings, - strings_column_view values, - cudf::column_view scatter_map, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - size_type elements = values.size(); - CUDF_EXPECTS( elements==scatter_map.size(), "number of strings must match map size" ); - size_type count = strings.size(); - auto d_indices = scatter_map.data(); - auto execpol = rmm::exec_policy(stream); - - // create strings arrays - rmm::device_vector strings_array = - detail::create_string_array_from_column(strings,stream); - string_view* d_strings = strings_array.data().get(); - rmm::device_vector values_array = - detail::create_string_array_from_column(values,stream); - string_view* d_values = values_array.data().get(); - // do the scatter - thrust::scatter( execpol->on(stream), - d_values, d_values+elements, - d_indices, d_strings ); - - // build null mask - auto valid_mask = valid_if( static_cast(nullptr), - [d_strings] __device__ (size_type idx) { return !d_strings[idx].is_null(); }, - count, stream ); - auto null_count = valid_mask.second; - auto null_size = gdf_valid_allocation_size(count); - rmm::device_buffer null_mask(valid_mask.first,null_size,stream,mr); // does deep copy - RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future - - // build offsets column - auto offsets_column = detail::offsets_from_string_array(strings_array,stream,mr); - auto offsets_view = offsets_column->view(); - auto d_offsets = offsets_view.data(); - - // build chars column - size_type 
bytes = thrust::device_pointer_cast(d_offsets)[count]; // this may not be stream friendly - if( (bytes==0) && (null_count < count) ) - bytes = 1; // all entries are empty strings - auto chars_column = detail::chars_from_string_array(strings_array,d_offsets,null_count,stream,mr); - - // build children vector - std::vector> children; - children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); - - // return new strings column - return std::make_unique( - data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, - null_mask, null_count, - std::move(children)); -} - -// -// s1 = ['a','b,'c','d'] -// pos = [1,3] -// s3 = s1.scatter('e',pos,2) -// ['a','e','c','e'] -// -std::unique_ptr scatter( strings_column_view strings, - const char* string, - cudf::column_view scatter_map, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - size_type count = strings.size(); - size_type elements = scatter_map.size(); - auto execpol = rmm::exec_policy(0); - auto d_indices = scatter_map.data(); - // copy string to device - auto replace = detail::string_from_host(string, stream); - auto d_replace = *replace; - // create strings array - rmm::device_vector strings_vector = - detail::create_string_array_from_column(strings, stream); - auto d_strings = strings_vector.data().get(); - // replace specific elements - thrust::for_each_n(execpol->on(0), - thrust::make_counting_iterator(0), elements, - [d_indices, d_replace, d_strings] __device__ (unsigned int idx) { - d_strings[d_indices[idx]] = d_replace; - }); - - // create strings column - // build null mask - auto valid_mask = valid_if( static_cast(nullptr), - [d_strings] __device__ (size_type idx) { return !d_strings[idx].is_null(); }, - count, stream ); - auto null_count = valid_mask.second; - auto null_size = gdf_valid_allocation_size(count); - rmm::device_buffer null_mask(valid_mask.first,null_size,stream,mr); - RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to 
return device_buffer in future - - // build offsets column - auto offsets_column = detail::offsets_from_string_array(strings_vector,stream,mr); - auto offsets_view = offsets_column->view(); - auto d_offsets = offsets_view.data(); - - // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; - if( (bytes==0) && (null_count < count) ) - bytes = 1; // all entries are empty strings - auto chars_column = detail::chars_from_string_array(strings_vector,d_offsets,null_count,stream,mr); - - // build children vector - std::vector> children; - children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); - - // return new strings column - return std::make_unique( - data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, - null_mask, null_count, - std::move(children)); -} - -} // namespace strings -} // namespace cudf diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu deleted file mode 100644 index ff34023d792..00000000000 --- a/cpp/src/strings/attributes.cu +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace -{ - -// used by bytes_counts() and characters_counts() -template -std::unique_ptr counts( cudf::strings_column_view strings, - predicate& pfn, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - auto count = strings.size(); - auto execpol = rmm::exec_policy(stream); - auto strings_column = cudf::column_device_view::create(strings.parent(),stream); - auto d_column = *strings_column; - rmm::device_buffer null_mask; - cudf::size_type null_count = d_column.null_count(); - if( d_column.nullable() ) - null_mask = rmm::device_buffer( d_column.null_mask(), - gdf_valid_allocation_size(count), - stream, mr); - // create output column - auto results = std::make_unique( cudf::data_type{cudf::INT32}, count, - rmm::device_buffer(count * sizeof(int32_t), stream, mr), - null_mask, null_count); - auto results_view = results->mutable_view(); - auto d_lengths = results_view.data(); - // set the counts - thrust::transform( execpol->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), - d_lengths, - [d_column, pfn] __device__ (int32_t idx) { - if( d_column.nullable() && d_column.is_null(idx) ) - return 0; - return pfn(d_column.element(idx)); - }); - results->set_null_count(null_count); - return results; -} - -} // namespace - -namespace cudf -{ -namespace strings -{ - -std::unique_ptr characters_counts( strings_column_view strings, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - auto pfn = [] __device__ (const cudf::strings::string_view& d_str) { return d_str.length(); }; - return counts(strings,pfn,stream,mr); -} - -std::unique_ptr bytes_counts( strings_column_view strings, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - auto pfn = [] __device__ (const cudf::strings::string_view& d_str) { return d_str.size_bytes(); }; - return counts(strings,pfn,stream,mr); -} - -// -// -std::unique_ptr 
code_points( strings_column_view strings, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - auto count = strings.size(); - auto execpol = rmm::exec_policy(0); - auto strings_column = column_device_view::create(strings.parent(),stream); - auto d_column = *strings_column; - - // offsets point to each individual integer range - rmm::device_vector offsets(count); - size_type* d_offsets = offsets.data().get(); - thrust::transform_inclusive_scan(execpol->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), - d_offsets, - [d_column] __device__(size_type idx){ - if( d_column.nullable() && d_column.is_null(idx) ) - return 0; - return d_column.element(idx).length(); - }, - thrust::plus()); - - // need the total size to build the column - // the size is the last element from an inclusive-scan - size_type size = offsets.back(); - // create output column - auto results = make_numeric_column( data_type{INT32}, size, - mask_state::UNALLOCATED, - stream, mr ); - auto results_view = results->mutable_view(); - auto d_results = results_view.data(); - // now set the ranges from each strings' character values - thrust::for_each_n(execpol->on(stream), - thrust::make_counting_iterator(0), count, - [d_column, d_offsets, d_results] __device__(unsigned int idx){ - if( d_column.nullable() && d_column.is_null(idx) ) - return; - auto d_str = d_column.element(idx); - auto result = d_results + (idx ? d_offsets[idx-1] :0); - thrust::copy( thrust::seq, d_str.begin(), d_str.end(), result); - }); - // - results->set_null_count(0); // no nulls here - return results; -} - -} // namespace strings -} // namespace cudf diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu deleted file mode 100644 index 6acafb981e4..00000000000 --- a/cpp/src/strings/combine.cu +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "./utilities.hpp" -#include "./utilities.cuh" - -#include -#include - -namespace cudf -{ -namespace strings -{ - -std::unique_ptr concatenate( strings_column_view strings, - strings_column_view others, - const char* separator, - const char* narep, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr ) -{ - CUDF_EXPECTS( strings.size()==others.size(), "columns must be the same size"); - - auto execpol = rmm::exec_policy(stream); - size_type count = strings.size(); - - if( !separator ) - separator = ""; - auto separator_ptr = detail::string_from_host(separator, stream); - auto d_separator = *separator_ptr; - auto narep_ptr = detail::string_from_host(narep, stream); - string_view d_narep(nullptr,0); - if( narep_ptr ) - d_narep = *narep_ptr; - - // create strings arrays - auto strings_column_ptr = column_device_view::create(strings.parent(),stream); - auto d_strings = *strings_column_ptr; - auto others_column_ptr = column_device_view::create(others.parent(),stream); - auto d_others = *others_column_ptr; - - // create resulting null mask - auto valid_mask = valid_if( static_cast(nullptr), - [d_strings, d_others, d_narep] __device__ (size_type idx) { - return !(((d_strings.nullable() && d_strings.is_null(idx)) || - (d_others.nullable() && d_others.is_null(idx))) && - d_narep.is_null()); - }, - count, stream ); - auto 
null_count = valid_mask.second; - auto null_size = gdf_valid_allocation_size(count); - rmm::device_buffer null_mask(valid_mask.first,null_size,stream,mr); // does deep copy - RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future - - // build offsets column - auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, - stream, mr ); - auto offsets_view = offsets_column->mutable_view(); - auto d_results_offsets = offsets_view.data(); - // compute offsets - thrust::transform_inclusive_scan( execpol->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), - d_results_offsets+1, - [d_strings, d_others, d_separator, d_narep] __device__ (size_type idx) { - string_view d_str1; - if( d_strings.nullable() && d_strings.is_null(idx) ) - d_str1 = string_view(nullptr,0); - else - d_str1 = d_strings.element(idx); - string_view d_str2; - if( d_others.nullable() && d_others.is_null(idx) ) - d_str2 = string_view(nullptr,0); - else - d_str2 = d_others.element(idx); - if( (d_str1.is_null() || d_str2.is_null()) && d_narep.is_null() ) - return 0; // null output case - size_type bytes = 0; - // left-side - if( !d_str1.is_null() ) - bytes = d_str1.size_bytes(); - else if( !d_narep.is_null() ) - bytes = d_narep.size_bytes(); - // separator - bytes += d_separator.size_bytes(); - if( !d_str2.is_null() ) - bytes += d_str2.size_bytes(); - else if( !d_narep.is_null() ) - bytes += d_narep.size_bytes(); - return bytes; - }, - thrust::plus() ); - cudaMemsetAsync( d_results_offsets, 0, sizeof(*d_results_offsets), stream); - - // build chars column - size_type bytes = thrust::device_pointer_cast(d_results_offsets)[count]; - if( (bytes==0) && (null_count < count) ) - bytes = 1; // all entries are empty strings - auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, - stream, mr ); - auto chars_view = chars_column->mutable_view(); - auto d_results_chars = 
chars_view.data(); - thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, - [d_strings, d_others, d_separator, d_narep, d_results_offsets, d_results_chars] __device__(size_type idx){ - string_view d_str1; - if( d_strings.nullable() && d_strings.is_null(idx) ) - d_str1 = string_view(nullptr,0); - else - d_str1 = d_strings.element(idx); - string_view d_str2; - if( d_others.nullable() && d_others.is_null(idx) ) - d_str2 = string_view(nullptr,0); - else - d_str2 = d_others.element(idx); - if( (d_str1.is_null() || d_str2.is_null()) && d_narep.is_null() ) - return; // null -- nothing to do - // concat the two strings with appropriate separator and narep - size_type offset = d_results_offsets[idx]; - char* d_buffer = d_results_chars + offset; - if( !d_str1.is_null() ) - d_buffer = detail::copy_string(d_buffer, d_str1); - else if( !d_narep.is_null() ) - d_buffer = detail::copy_string(d_buffer, d_narep); - if( !d_separator.is_null() ) - d_buffer = detail::copy_string(d_buffer, d_separator); - if( !d_str2.is_null() ) - d_buffer = detail::copy_string(d_buffer, d_str2); - else if( !d_narep.is_null() ) - d_buffer = detail::copy_string(d_buffer, d_narep); - }); - - // build children vector - std::vector> children; - children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); - - return std::make_unique( - data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, - null_mask, null_count, - std::move(children)); -} - -} // namespace strings -} // namespace cudf diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu deleted file mode 100644 index 512175d3261..00000000000 --- a/cpp/src/strings/utilities.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include "./utilities.hpp" - -#include -#include -#include -#include - -namespace cudf -{ -namespace strings -{ -namespace detail -{ - -// Used to build a temporary string_view object from a single host string. -std::unique_ptr> - string_from_host( const char* str, cudaStream_t stream ) -{ - if( !str ) - return nullptr; - size_type length = (size_type)std::strlen(str); - - char* d_str; - RMM_TRY(RMM_ALLOC( &d_str, length, stream )); - CUDA_TRY(cudaMemcpyAsync( d_str, str, length, - cudaMemcpyHostToDevice, stream )); - CUDA_TRY(cudaStreamSynchronize(stream)); - - auto deleter = [](string_view* sv) { RMM_FREE(const_cast(sv->data()),0); }; - return std::unique_ptr{ new string_view(d_str,length), deleter}; -} - -// build an array of string_view objects from a strings column -rmm::device_vector create_string_array_from_column( - cudf::strings_column_view strings, - cudaStream_t stream ) -{ - auto execpol = rmm::exec_policy(stream); - auto strings_column = column_device_view::create(strings.parent(),stream); - auto d_column = *strings_column; - - auto count = strings.size(); - rmm::device_vector strings_array(count); - string_view* d_strings = strings_array.data().get(); - thrust::for_each_n( execpol->on(stream), - thrust::make_counting_iterator(0), count, - [d_column, d_strings] __device__ (size_type idx) { - if( d_column.nullable() && d_column.is_null(idx) ) - d_strings[idx] = string_view(nullptr,0); - else - d_strings[idx] = d_column.element(idx); - }); - return strings_array; -} - -// build a strings 
offsets column from an array of string_views -std::unique_ptr offsets_from_string_array( - const rmm::device_vector& strings, - cudaStream_t stream, rmm::mr::device_memory_resource* mr ) -{ - size_type count = strings.size(); - auto d_strings = strings.data().get(); - auto execpol = rmm::exec_policy(stream); - // offsets elements is the number of strings + 1 - auto offsets_column = make_numeric_column( data_type{INT32}, count+1, - mask_state::UNALLOCATED, - stream, mr ); - auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.data(); - // create new offsets array -- last entry includes the total size - thrust::transform_inclusive_scan( execpol->on(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), - d_offsets+1, - [d_strings] __device__ (size_type idx) { return d_strings[idx].size_bytes(); }, - thrust::plus()); - cudaMemsetAsync( d_offsets, 0, sizeof(*d_offsets), stream); - // - return offsets_column; -} - -// build a strings chars column from an array of string_views -std::unique_ptr chars_from_string_array( - const rmm::device_vector& strings, - const int32_t* d_offsets, cudf::size_type null_count, - cudaStream_t stream, rmm::mr::device_memory_resource* mr ) -{ - size_type count = strings.size(); - auto d_strings = strings.data().get(); - auto execpol = rmm::exec_policy(stream); - size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; - if( (bytes==0) && (null_count < count) ) - bytes = 1; // all entries are empty strings - - // create column - auto chars_column = make_numeric_column( data_type{INT8}, bytes, - mask_state::UNALLOCATED, - stream, mr ); - // get it's view - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), - thrust::make_counting_iterator(0), count, - [d_strings, d_offsets, d_chars] __device__(size_type idx){ - string_view d_str = d_strings[idx]; - if( !d_str.is_null() ) - memcpy(d_chars + 
d_offsets[idx], d_str.data(), d_str.size_bytes() ); - }); - - return chars_column; -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh deleted file mode 100644 index b456e36ffc5..00000000000 --- a/cpp/src/strings/utilities.cuh +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include - -namespace cudf -{ -namespace strings -{ -namespace detail -{ - -/** - * @brief This utility will copy the argument string's data into - * the provided buffer. - * - * @param buffer Device buffer to copy to. - * @param d_string String to copy. - * @return Points to the end of the buffer after the copy. - */ -__device__ inline char* copy_string( char* buffer, const string_view& d_string ) -{ - memcpy( buffer, d_string.data(), d_string.size_bytes() ); - return buffer + d_string.size_bytes(); -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/src/strings/utilities.hpp b/cpp/src/strings/utilities.hpp deleted file mode 100644 index 6c59c7089ef..00000000000 --- a/cpp/src/strings/utilities.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include - -namespace cudf -{ -namespace strings -{ -namespace detail -{ - -/** - * @brief Creates a temporary string_view object from a host string. - * - * @param[in] str Null-terminated, encoded string in CPU memory. - * @param[in] stream Stream to execute any device code against. - * @return Device object pointer. - */ -std::unique_ptr> - string_from_host( const char* str, cudaStream_t stream=0 ); - -/** - * @brief Creates a strings array from a strings column. - * This is useful for doing some intermediate array operations. - * - * @param strings Strings instance. - * @param stream Stream to execute any device code against. - * @return Strings array - */ -rmm::device_vector create_string_array_from_column( - cudf::strings_column_view strings, - cudaStream_t stream=0 ); - -/** - * @brief Creates an offsets column from a strings array. - * This can be used to recreate the offsets child of a new - * strings column from an intermediate strings array. - * - * @param strings Strings array - * @param stream Stream to execute any device code against. - * @param mr Memory resource to use. - * @return Offsets column - */ -std::unique_ptr offsets_from_string_array( - const rmm::device_vector& strings, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/** - * @brief Creates a chars column from a strings array. - * This can be used to recreate the chars child of a new - * strings column from an intermediate strings array. 
- * - * @param strings Strings array - * @param d_offsets Offsets array for placing strings into column's memory. - * @param null_count Number of null strings. - * @param stream Stream to execute any device code against. - * @param mr Memory resource to use. - * @return chars column - */ -std::unique_ptr chars_from_string_array( - const rmm::device_vector& strings, - const int32_t* d_offsets, cudf::size_type null_count, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -} // namespace detail -} // namespace strings -} // namespace cudf From bae74aad4f98ae38f9597a0df0096eeeb0949ec5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 2 Oct 2019 13:16:12 -0400 Subject: [PATCH 34/54] add missing mutable_column_device_view methods --- .../cudf/column/column_device_view.cuh | 3 +- cpp/src/column/column_device_view.cu | 50 +++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 05574d34a16..da84afad909 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -328,7 +328,8 @@ class alignas(16) mutable_column_device_view * @return A `unique_ptr` to a `mutable_column_device_view` that makes the *data from `source_view` available in device memory. 
*---------------------------------------------------------------------------**/ - static auto create(mutable_column_view source_view, cudaStream_t stream = 0); + static std::unique_ptr> + create(mutable_column_view source_view, cudaStream_t stream = 0); /**---------------------------------------------------------------------------* * @brief Returns pointer to the base device memory allocation casted to diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index f05ebeca92c..b6376eacabd 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -45,6 +45,9 @@ column_device_view::column_device_view( column_view source, ptrdiff_t h_ptr, ptr source.null_count(), source.offset()}, _num_children{source.num_children()} { + if( count_descendants(source) > _num_children ) { + CUDF_FAIL("Columns with grand-children are not currently supported."); + } if( _num_children > 0 ) { column_device_view* h_column = reinterpret_cast(h_ptr); @@ -53,20 +56,12 @@ column_device_view::column_device_view( column_view source, ptrdiff_t h_ptr, ptr for( size_type idx=0; idx < _num_children; ++idx ) { // inplace-new each child column_view child = source.child(idx); - CUDF_EXPECTS( child.num_children()==0, "column grand-children not currently supported"); new(h_column) column_device_view(child); h_column++; } } } -// For use with inplace-new to pre-fill memory to be copied to device -mutable_column_device_view::mutable_column_device_view( mutable_column_view source, ptrdiff_t h_ptr, ptrdiff_t d_ptr ) - : detail::column_device_view_base{source.type(), source.size(), - source.head(), source.null_mask(), - source.null_count(), source.offset()} -{} - // Construct a unique_ptr that invokes `destroy()` as it's deleter std::unique_ptr> column_device_view::create(column_view source, cudaStream_t stream) { size_type num_children = source.num_children(); @@ -100,6 +95,43 @@ size_type column_device_view::extent(column_view source) { 
return data_size; } +// For use with inplace-new to pre-fill memory to be copied to device +mutable_column_device_view::mutable_column_device_view( mutable_column_view source ) + : detail::column_device_view_base{source.type(), source.size(), + source.head(), source.null_mask(), + source.null_count(), source.offset()} +{ + // TODO children may not be actually possible for mutable columns + CUDF_EXPECTS(source.num_children()>0, "Mutable columns with children are not currently supported."); +} + +mutable_column_device_view::mutable_column_device_view( mutable_column_view source, ptrdiff_t h_ptr, ptrdiff_t d_ptr ) + : detail::column_device_view_base{source.type(), source.size(), + source.head(), source.null_mask(), + source.null_count(), source.offset()} +{ + // TODO children may not be actually possible for mutable columns + CUDF_EXPECTS(source.num_children()>0, "Mutable columns with children are not currently supported."); +} + +// Handle freeing children +void mutable_column_device_view::destroy() { + if( mutable_children ) + RMM_FREE(mutable_children,0); + delete this; +} + +// Construct a unique_ptr that invokes `destroy()` as it's deleter +std::unique_ptr> + mutable_column_device_view::create(mutable_column_view source, cudaStream_t stream) { + // TODO children may not be actually possible for mutable columns + CUDF_EXPECTS(source.num_children()>0, "Mutable columns with children are not currently supported."); + auto deleter = [](mutable_column_device_view* v) { v->destroy(); }; + std::unique_ptr p{ + new mutable_column_device_view(source), deleter}; + return p; +} + size_type mutable_column_device_view::extent(column_view source) { size_type data_size = sizeof(column_device_view); for( size_type idx=0; idx < source.num_children(); ++idx ) @@ -108,4 +140,4 @@ size_type mutable_column_device_view::extent(column_view source) { } -} // namespace cudf \ No newline at end of file +} // namespace cudf From ec9e6eac80084a22a96183948b6bb20332727323 Mon Sep 17 00:00:00 
2001 From: David Wendt Date: Wed, 2 Oct 2019 17:50:19 -0400 Subject: [PATCH 35/54] updates per PR review --- .../cudf/column/column_device_view.cuh | 52 +- cpp/include/cudf/strings/string_view.cuh | 54 +-- cpp/include/cudf/strings/string_view.inl | 105 +--- .../cudf/strings/strings_column_factories.hpp | 7 +- .../cudf/strings/strings_column_view.hpp | 457 +----------------- cpp/src/column/column_device_view.cu | 18 +- cpp/src/column/column_view.cpp | 6 + cpp/src/strings/strings_column_factories.cu | 61 +-- cpp/src/strings/strings_column_view.cu | 44 +- cpp/src/table/table_device_view.cu | 37 +- cpp/tests/strings/factories_test.cu | 8 +- 11 files changed, 186 insertions(+), 663 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index da84afad909..3b4be27de4b 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -15,11 +15,11 @@ */ #pragma once -#include #include #include #include #include +#include namespace cudf { @@ -208,11 +208,21 @@ class alignas(16) column_device_view : public detail::column_device_view_base { column_device_view& operator=(column_device_view const&) = default; column_device_view& operator=(column_device_view&&) = default; - // + /**---------------------------------------------------------------------------* + * @brief Creates an instance of this class in the specified host memory + * using the device memory pointer as a base for child pointers. + * + * @param column Column view from which to create this instance. + * @param h_ptr Host memory pointer on which to place this instance. + * @param d_ptr Device memory pointer on which to base any child pointers. 
+ *---------------------------------------------------------------------------**/ column_device_view( column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr ); /**---------------------------------------------------------------------------* * @brief Returns reference to element at the specified index. + * + * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`, + * then any attempt to use the result will lead to undefined behavior. * * This function accounts for the offset. * @@ -256,9 +266,8 @@ class alignas(16) column_device_view : public detail::column_device_view_base { void destroy(); /**---------------------------------------------------------------------------* - * @brief Return the amount of memory needed to hold this instance in - * contiguous memory block. This accounts for the children as well as - * the object itself. + * @brief Return the size in bytes of the amount of memory needed to hold a + * device view of the specified column and it's children. * * @param source_view The `column_view` to use for this calculation. *---------------------------------------------------------------------------**/ @@ -307,6 +316,14 @@ class alignas(16) mutable_column_device_view default; mutable_column_device_view& operator=(mutable_column_device_view&&) = default; + /**---------------------------------------------------------------------------* + * @brief Creates an instance of this class in the specified host memory + * using the device memory pointer as a base for child pointers. + * + * @param column Column view from which to create this instance. + * @param h_ptr Host memory pointer on which to place this instance. + * @param d_ptr Device memory pointer on which to base any child pointers. 
+ *---------------------------------------------------------------------------**/ mutable_column_device_view( mutable_column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr ); /**---------------------------------------------------------------------------* @@ -374,7 +391,7 @@ class alignas(16) mutable_column_device_view * @param element_index Position of the desired element *---------------------------------------------------------------------------**/ template - __device__ T element(size_type element_index) noexcept { + __device__ T& element(size_type element_index) noexcept { return data()[element_index]; } @@ -442,13 +459,12 @@ class alignas(16) mutable_column_device_view } /**---------------------------------------------------------------------------* - * @brief Return the amount of memory needed to hold this instance in - * contiguous memory block. This accounts for the children as well as - * the object itself. + * @brief Return the size in bytes of the amount of memory needed to hold a + * device view of the specified column and it's children. * * @param source_view The `column_view` to use for this calculation. *---------------------------------------------------------------------------**/ - static size_type extent(column_view source_view); + static size_type extent(mutable_column_view source_view); private: mutable_column_device_view* @@ -481,7 +497,8 @@ class alignas(16) mutable_column_device_view /**---------------------------------------------------------------------------* * @brief Returns `string_view` to the string element at the specified index. * - * This function accounts for the offset. Do not call this for a null element. + * This function accounts for the offset. + * Calling this on a null element will result in undefined behavior. 
* * @param element_index Position of the desired string * @return string_view instance representing this element at this index @@ -489,18 +506,11 @@ class alignas(16) mutable_column_device_view template <> __device__ inline strings::string_view const column_device_view::element( size_type element_index) const noexcept { - size_type index = element_index + _offset; // account for this view's _offset - const int32_t* d_offsets = d_children[0].data(); - const char* d_strings = d_children[1].data(); + size_type index = element_index + offset(); // account for this view's _offset + const int32_t* d_offsets = d_children[strings_column_view::offsets_column_index].data(); + const char* d_strings = d_children[strings_column_view::chars_column_index].data(); size_type offset = d_offsets[index]; return strings::string_view{d_strings + offset, d_offsets[index+1] - offset}; } -//template <> -//__device__ inline string_view mutable_column_device_view::element( -// size_type element_index) noexcept { -// return string_view{}; -//} - - } // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index eed46598aa3..3bdfee38197 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -25,15 +25,15 @@ namespace strings { // UTF-8 characters are 1-4 bytes -typedef unsigned int char_utf8; +using char_utf8 = uint32_t; /**---------------------------------------------------------------------------* * @brief A non-owning, immutable view of device data that is variable length * character array representing a UTF-8 string. The caller must maintain the * device memory for the lifetime of this instance. * - * It provides a simple wrapper and string operations for individual char array - * within a strings column. + * It provides a simple wrapper and string operations for an individual string + * within a column of strings.
*---------------------------------------------------------------------------**/ class string_view { @@ -77,7 +77,7 @@ class string_view *---------------------------------------------------------------------------**/ __host__ __device__ bool empty() const; /**---------------------------------------------------------------------------* - * @brief Return true if string pointer is null. + * @brief Return true if string is NULL. * That is, `data()==nullptr` for this instance. *---------------------------------------------------------------------------**/ __host__ __device__ bool is_null() const; @@ -85,7 +85,7 @@ class string_view /**---------------------------------------------------------------------------* * @brief Handy iterator for navigating through encoded characters. *---------------------------------------------------------------------------**/ - class iterator + class const_iterator { public: using difference_type = ptrdiff_t; @@ -93,15 +93,15 @@ class string_view using reference = char_utf8&; using pointer = char_utf8*; using iterator_category = std::input_iterator_tag; // do not allow going backwards - __device__ iterator(const string_view& str, size_type pos); - iterator(const iterator& mit) = default; - iterator(iterator&& mit) = default; - iterator& operator=(const iterator&) = default; - iterator& operator=(iterator&&) = default; - __device__ iterator& operator++(); - __device__ iterator operator++(int); - __device__ bool operator==(const iterator& rhs) const; - __device__ bool operator!=(const iterator& rhs) const; + __device__ const_iterator(const string_view& str, size_type pos); + const_iterator(const const_iterator& mit) = default; + const_iterator(const_iterator&& mit) = default; + const_iterator& operator=(const const_iterator&) = default; + const_iterator& operator=(const_iterator&&) = default; + __device__ const_iterator& operator++(); + __device__ const_iterator operator++(int); + __device__ bool operator==(const const_iterator& rhs) const; + 
__device__ bool operator!=(const const_iterator& rhs) const; __device__ char_utf8 operator*() const; __device__ size_type position() const; __device__ size_type byte_offset() const; @@ -113,18 +113,12 @@ class string_view /**---------------------------------------------------------------------------* * @brief Return new iterator pointing to the beginning of this string *---------------------------------------------------------------------------**/ - __device__ iterator begin() const; + __device__ const_iterator begin() const; /**---------------------------------------------------------------------------* * @brief Return new iterator pointing past the end of this string *---------------------------------------------------------------------------**/ - __device__ iterator end() const; + __device__ const_iterator end() const; - /**---------------------------------------------------------------------------* - * @brief Return single UTF-8 character at the given character position - * - * @param pos Character position - *---------------------------------------------------------------------------**/ - __device__ char_utf8 at(size_type pos) const; /**---------------------------------------------------------------------------* * @brief Return single UTF-8 character at the given character position * @@ -166,30 +160,30 @@ class string_view * not match is greater in the arg string, or all compared characters * match but the arg string is longer. *---------------------------------------------------------------------------**/ - __device__ int compare(const char* data, size_type bytes) const; + __device__ int compare(const char* str, size_type bytes) const; /**---------------------------------------------------------------------------* - * @brief Returns true if arg string matches this string exactly. + * @brief Returns true if rhs matches this string exactly. 
*---------------------------------------------------------------------------**/ __device__ bool operator==(const string_view& rhs) const; /**---------------------------------------------------------------------------* - * @brief Returns true if arg string does not match this string. + * @brief Returns true if rhs does not match this string. *---------------------------------------------------------------------------**/ __device__ bool operator!=(const string_view& rhs) const; /**---------------------------------------------------------------------------* - * @brief Returns true if arg string sorts ascending to this string. + * @brief Returns true if this string is ordered before rhs. *---------------------------------------------------------------------------**/ __device__ bool operator<(const string_view& rhs) const; /**---------------------------------------------------------------------------* - * @brief Returns true if arg string sorts descending to this string. + * @brief Returns true if rhs is ordered before this string. *---------------------------------------------------------------------------**/ __device__ bool operator>(const string_view& rhs) const; /**---------------------------------------------------------------------------* - * @brief Returns true if arg string sorts ascending or matches this string. + * @brief Returns true if this string matches or is ordered before rhs. *---------------------------------------------------------------------------**/ __device__ bool operator<=(const string_view& rhs) const; /**---------------------------------------------------------------------------* - * @brief Returns true if arg string sorts descending or matches this string. + * @brief Returns true if rhs matches or is ordered before this string. 
*---------------------------------------------------------------------------**/ __device__ bool operator>=(const string_view& rhs) const; @@ -338,7 +332,7 @@ __host__ __device__ size_type to_char_utf8( const char* str, char_utf8& characte __host__ __device__ size_type from_char_utf8( char_utf8 character, char* str ); /**---------------------------------------------------------------------------* - * @brief Return the number of characters in this provided char array. + * @brief Return the number of UTF-8 characters in this provided char array. * * @param str String with encoded char bytes. * @param bytes Number of bytes in str. diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index 085d7acf9b5..65925e084b3 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -19,7 +19,7 @@ namespace { -typedef unsigned char BYTE; +using BYTE = uint8_t; /**---------------------------------------------------------------------------* * @brief Returns the number of bytes used to represent the provided byte. 
@@ -69,10 +69,8 @@ __host__ __device__ inline string_view::string_view(const char* data, size_type {} __device__ inline string_view::string_view(const char* data) - : _data(data) -{ - _bytes = string_length(data); -} + : _data{data}, _bytes{string_length(data)} +{} // __host__ __device__ inline size_type string_view::size_bytes() const @@ -101,67 +99,63 @@ __host__ __device__ inline bool string_view::is_null() const } // the custom iterator knows about UTF8 encoding -__device__ inline string_view::iterator::iterator(const string_view& str, size_type pos) - : cpos(pos) -{ - p = str.data(); - offset = str.byte_offset(cpos); -} +__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos) + : cpos{pos}, p{str.data()}, offset{str.byte_offset(pos)} +{} -__device__ inline string_view::iterator& string_view::iterator::operator++() +__device__ inline string_view::const_iterator& string_view::const_iterator::operator++() { offset += bytes_in_utf8_byte((BYTE)p[offset]); ++cpos; return *this; } -// what is the int parm for? 
-__device__ inline string_view::iterator string_view::iterator::operator++(int) +__device__ inline string_view::const_iterator string_view::const_iterator::operator++(int) { - iterator tmp(*this); + const_iterator tmp(*this); operator++(); return tmp; } -__device__ inline bool string_view::iterator::operator==(const string_view::iterator& rhs) const +__device__ inline bool string_view::const_iterator::operator==(const string_view::const_iterator& rhs) const { return (p == rhs.p) && (cpos == rhs.cpos); } -__device__ inline bool string_view::iterator::operator!=(const string_view::iterator& rhs) const +__device__ inline bool string_view::const_iterator::operator!=(const string_view::const_iterator& rhs) const { return (p != rhs.p) || (cpos != rhs.cpos); } // unsigned int can hold 1-4 bytes for the UTF8 char -__device__ inline char_utf8 string_view::iterator::operator*() const +__device__ inline char_utf8 string_view::const_iterator::operator*() const { char_utf8 chr = 0; detail::to_char_utf8(p + offset, chr); return chr; } -__device__ inline size_type string_view::iterator::position() const +__device__ inline size_type string_view::const_iterator::position() const { return cpos; } -__device__ inline size_type string_view::iterator::byte_offset() const +__device__ inline size_type string_view::const_iterator::byte_offset() const { return offset; } -__device__ inline string_view::iterator string_view::begin() const +__device__ inline string_view::const_iterator string_view::begin() const { - return iterator(*this, 0); + return const_iterator(*this, 0); } -__device__ inline string_view::iterator string_view::end() const +__device__ inline string_view::const_iterator string_view::end() const { - return iterator(*this, length()); + return const_iterator(*this, length()); } -__device__ inline char_utf8 string_view::at(size_type pos) const +__device__ inline char_utf8 string_view::operator[](size_type pos) const { unsigned int offset = byte_offset(pos); if(offset >= _bytes) 
@@ -171,11 +165,6 @@ __device__ inline char_utf8 string_view::at(size_type pos) const return chr; } -__device__ inline char_utf8 string_view::operator[](size_type pos) const -{ - return at(pos); -} - __device__ inline size_type string_view::byte_offset(size_type pos) const { size_type offset = 0; @@ -288,33 +277,11 @@ __device__ inline size_type string_view::find(const char* str, size_type bytes, return -1; } -// maybe get rid of this one __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, int count) const { - size_type sz = size_bytes(); - size_type nchars = length(); - if(count < 0) - count = nchars; - size_type end = pos + count; - if(end < 0 || end > nchars) - end = nchars; - if(pos > end || chr == 0 || sz == 0) - return -1; - size_type spos = byte_offset(pos); - size_type epos = byte_offset(end); - // - size_type chsz = detail::bytes_in_char_utf8(chr); - const char* sptr = data(); - const char* ptr = sptr + spos; - size_type len = (epos - spos) - chsz; - for(size_type idx = 0; idx <= len; ++idx) - { - char_utf8 ch = 0; - detail::to_char_utf8(ptr++, ch); - if(chr == ch) - return detail::characters_in_string(sptr, idx + spos); - } - return -1; + char str[sizeof(char_utf8)]; + size_type chwidth = detail::from_char_utf8(chr,str); + return find(str,chwidth,pos,count); } __device__ inline size_type string_view::rfind(const string_view& str, size_type pos, int count) const @@ -354,33 +321,11 @@ __device__ inline size_type string_view::rfind(const char* str, size_type bytes, __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, int count) const { - size_type sz = size_bytes(); - size_type nchars = length(); - if(count < 0) - count = nchars; - size_type end = pos + count; - if(end < 0 || end > nchars) - end = nchars; - if(pos > end || chr == 0 || sz == 0) - return -1; - size_type spos = byte_offset(pos); - size_type epos = byte_offset(end); - - size_type chsz = detail::bytes_in_char_utf8(chr); - const char* sptr = data(); 
- const char* ptr = sptr + epos - 1; - size_type len = (epos - spos) - chsz; - for(size_type idx = 0; idx < len; ++idx) - { - char_utf8 ch = 0; - detail::to_char_utf8(ptr--, ch); - if(chr == ch) - return detail::characters_in_string(sptr, epos - idx - 1); - } - return -1; + char str[sizeof(char_utf8)]; + size_type chwidth = detail::from_char_utf8(chr,str); + return rfind(str,chwidth,pos,count); } - // parameters are character position values __device__ inline string_view string_view::substr(size_type pos, size_type length) const { diff --git a/cpp/include/cudf/strings/strings_column_factories.hpp b/cpp/include/cudf/strings/strings_column_factories.hpp index b0c039417ac..035d22207c3 100644 --- a/cpp/include/cudf/strings/strings_column_factories.hpp +++ b/cpp/include/cudf/strings/strings_column_factories.hpp @@ -37,10 +37,9 @@ namespace cudf { * must be a valid device address pointing to `.second` consecutive bytes. * * @throws std::bad_alloc if device memory allocation fails - * @throws cudf::logic_error if pointers or sizes are invalid * * @param strings The pointer/size pair arrays. - * Each pointer must be a valid device memory address. + * Each pointer must be a device memory address or `nullptr` (indicating a null string). * The size must be the number of bytes. * @param stream Optional stream for use with all memory allocation * and device kernels @@ -48,7 +47,7 @@ namespace cudf { * allocation of the column's `null_mask` and children. *---------------------------------------------------------------------------**/ std::unique_ptr make_strings_column( - const rmm::device_vector>& strings, + const rmm::device_vector>& strings, cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); @@ -59,6 +58,8 @@ std::unique_ptr make_strings_column( * The total number of char bytes must not exceed the maximum size of size_type. * Use the strings_column_view class to perform strings operations on this type * of column. 
+ * This function makes a deep copy of the strings, offsets, null_mask to create + * a new column. * * @throws std::bad_alloc if device memory allocation fails * diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 6db660cd889..653613504e0 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -26,23 +26,22 @@ namespace cudf { * @brief Given a column-view of strings type, an instance of this class * provides a wrapper on this compound column for strings operations. *---------------------------------------------------------------------------**/ -class strings_column_view : private column_view +class strings_column_view : public column_view { public: strings_column_view( column_view strings_column ); strings_column_view( strings_column_view&& strings_view ) = default; strings_column_view( const strings_column_view& strings_view ) = default; ~strings_column_view() = default; + strings_column_view& operator=(strings_column_view const&) = default; + strings_column_view& operator=(strings_column_view&&) = default; - /**---------------------------------------------------------------------------* - * @brief Returns the number of strings in the column - *---------------------------------------------------------------------------**/ - size_type size() const; + static constexpr size_type offsets_column_index{0}; + static constexpr size_type chars_column_index{1}; - /**---------------------------------------------------------------------------* - * @brief Returns the internal parent string column - *---------------------------------------------------------------------------**/ - column_view parent() const; + using column_view::size; + using column_view::null_mask; + using column_view::null_count; /**---------------------------------------------------------------------------* * @brief Returns the internal column of offsets @@ -54,18 +53,6 @@ class 
strings_column_view : private column_view *---------------------------------------------------------------------------**/ column_view chars() const; - /**---------------------------------------------------------------------------* - * @brief Returns a pointer to the internal null mask memory - *---------------------------------------------------------------------------**/ - const bitmask_type* null_mask() const; - - /**---------------------------------------------------------------------------* - * @brief Returns the number of nulls in this column - *---------------------------------------------------------------------------**/ - size_type null_count() const; - -private: - const column_view _parent; }; namespace strings @@ -86,7 +73,6 @@ void print( strings_column_view strings, size_type start=0, size_type end=-1, size_type max_width=-1, const char* delimiter = "\n" ); - /**---------------------------------------------------------------------------* * @brief Create output per Arrow strings format. * The return pair is the array of chars and the array of offsets. @@ -101,432 +87,5 @@ std::pair, rmm::device_vector> cudaStream_t stream=0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -/**---------------------------------------------------------------------------* - * @brief Returns a new strings column created from a subset of - * of this instance's strings column. - * - * ``` - * s1 = ["a", "b", "c", "d", "e", "f"] - * s2 = sublist( s1, 2 ) - * s2 is ["c", "d", "e", "f"] - * ``` - * - * @param strings Strings instance for this operation. - * @param start Index of first string to use. - * @param end Index of last string to use. - * Default -1 indicates the last element. - * @param step Increment value between indexes. - * Default step is 1. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return New strings column of size (end-start)/step. 
- *---------------------------------------------------------------------------**/ -std::unique_ptr sublist( strings_column_view strings, - size_type start, size_type end=-1, - size_type step=1, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Returns a new strings column using the specified indices to select - * elements from the specified strings column. - * - * ``` - * s1 = ["a", "b", "c", "d", "e", "f"] - * map = [0, 2] - * s2 = gather( s1, map ) - * s2 is ["a", "c"] - * ``` - * - * @param strings Strings instance for this operation. - * @param gather_map The indices with which to select strings for the new column. - * Values must be within [0,size()) range. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return New strings column of size indices.size() - *---------------------------------------------------------------------------**/ -std::unique_ptr gather( strings_column_view strings, - cudf::column_view gather_map, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Sort types for the sort method. - *---------------------------------------------------------------------------**/ -enum sort_type { - none=0, ///< no sorting - length=1, ///< sort by string length - name=2 ///< sort by characters code-points -}; - -/**---------------------------------------------------------------------------* - * @brief Returns a new strings column that is a sorted version of the - * strings in this instance. - * - * @param strings Strings instance for this operation. - * @param stype Specify what attribute of the string to sort on. - * @param order Sort strings in ascending or descending order. 
- * @param null_order Sort nulls to the beginning or the end of the new column. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return New strings column with sorted elements of this instance. - *---------------------------------------------------------------------------**/ -std::unique_ptr sort( strings_column_view strings, - sort_type stype, - cudf::order order=cudf::order::ASCENDING, - cudf::null_order null_order=cudf::null_order::BEFORE, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/** - * @brief Returns new instance using the provided map values and strings. - * The map values specify the location in the new strings instance. - * Missing values pass through from the column at those positions. - * - * ``` - * s1 = ["a", "b", "c", "d"] - * s2 = ["e", "f"] - * map = [1, 3] - * s3 = scatter( s1, s2, m1 ) - * s3 is ["a", "e", "c", "f"] - * ``` - * - * @param strings Strings instance for this operation. - * @param values The instance for which to retrieve the strings - * specified in map column. - * @param scatter_map The 0-based index values to retrieve from the - * strings parameter. Number of values must equal the number - * of elements in strings pararameter (strings.size()). - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return New instance with the specified strings. - */ -std::unique_ptr scatter( strings_column_view strings, - strings_column_view values, - cudf::column_view scatter_map, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -/** - * @brief Returns new instance using the provided index values and a - * single string. The map values specify where to place the string - * in the new strings instance. Missing values pass through from - * the column at those positions. 
- * - * ``` - * s1 = ["a", "b", "c", "d"] - * map = [1, 3] - * s2 = scatter( s1, "e", m1 ) - * s2 is ["a", "e", "c", "e"] - * ``` - * - * @param strings Strings instance for this operation. - * @param value Null-terminated encoded string in host memory to use with - * the scatter_map. - * @param scatter_map The 0-based index values to place the given string. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return New instance with the specified strings. - */ -std::unique_ptr scatter( strings_column_view strings, - const char* value, - cudf::column_view scatter_map, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Returns the number of bytes for each string in a strings column. - * Null strings will have a byte count of 0. - * - * @param strings Strings instance for this operation. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return Numeric column of type int32. - *---------------------------------------------------------------------------**/ -std::unique_ptr bytes_counts( strings_column_view strings, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Returns the number of characters for each string in a strings column. - * Null strings will have a count of 0. The number of characters is not the - * same as the number of bytes if multi-byte encoded characters make up a - * string. - * - * @param strings Strings instance for this operation. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return Numeric column of type int32. 
- *---------------------------------------------------------------------------**/ -std::unique_ptr characters_counts( strings_column_view strings, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Creates a column with code point values (integers) for each string. - * A code point is the integer value representation of a character. - * For example, in UTF-8 the code point value for the character 'A' is 65. - * The column is an array of variable-length integer arrays each with length - * as returned by characters_counts(). - * - * ``` - * s = ["a","xyz", "éee"] - * v = code_points(s) - * v is [97, 120, 121, 122, 50089, 101, 101] - * ``` - * - * @param strings Strings instance for this operation. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return Numeric column of type int32. TODO: need uint32 here - *---------------------------------------------------------------------------**/ -std::unique_ptr code_points( strings_column_view strings, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -enum character_attribute { - DECIMAL=0, - NUMERIC=1, - DIGIT=2, - ALPHA=3, - SPACE=4, - UPPER=5, - LOWER=6, - ALPHANUM=7, - EMPTY=8 -}; -/**---------------------------------------------------------------------------* - * @brief Returns true for strings that have only characters of the specified - * type. - * @param strings Strings instance for this operation. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return Column of type bool. 
- *---------------------------------------------------------------------------**/ -std::unique_ptr is_of_type( strings_column_view strings, - character_attribute ca_type, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Row-wise concatenates two columns of strings into a new a column. - * The number of strings in both columns must match. - * @param strings 1st string column. - * @param others 2nd string column. - * @param separator Null-terminated CPU string that should appear between each element. - * @param narep Null-terminated CPU string that should represent any null strings found. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return New column with concatenated results - *---------------------------------------------------------------------------**/ -std::unique_ptr concatenate( strings_column_view strings, - strings_column_view others, - const char* separator="", const char* narep=nullptr, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Row-wise concatenates the given list of strings columns with the first column. - * - * ``` - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r = concatenate(s1,s2) - * r is ['aa', null, 'bb', null] - * ``` - * - * @param strings 1st string column. - * @param others List of string columns to concatenate. - * @param separator Null-terminated CPU string that should appear between each instance. - * Default is empty string. - * @param narep Null-terminated CPU string that should represent any null strings found. - * Default of null means any null operand produces a null result. - * @param stream CUDA stream to use kernels in this method. 
- * @param mr Resource for allocating device memory. - * @return New column with concatenated results - *---------------------------------------------------------------------------**/ -std::unique_ptr concatenate( std::vector& strings, - const char* separator="", - const char* narep=nullptr, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Concatenates all strings in the column into one new string. - * This provides the Pandas strings equivalent of join(). - * @param strings Strings for this operation. - * @param separator Null-terminated CPU string that should appear between each string. - * @param narep Null-terminated CPU string that should represent any null strings found. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return New column containing one string. - *---------------------------------------------------------------------------**/ -std::unique_ptr join_strings( strings_column_view strings, - const char* separator="", - const char* narep=nullptr, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - -/**---------------------------------------------------------------------------* - * @brief Split strings vertically creating new columns of strings. - * The number of columns will be equal to the string with the most splits. - * The delimiter is searched starting from the beginning of each string. - * - * ``` - * s = ["a b c", "d e f", "g h"] - * r = split(s," ") - * r is vector of 3 columns: - * r[0] = ["a", "d", "g"] - * r[1] = ["b", "e", "h"] - * r[2] = ["c", "f", nullptr] - * ``` - * - * @param delimiter Null-terminated CPU string identifying the split points within each string. - * Default of null splits on whitespace. 
- * @param maxsplit Maximum number of splits to perform searching from the beginning. - * Default -1 indicates all delimiters are processed. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return List of strings columns. - *---------------------------------------------------------------------------**/ -std::vector> split( strings_column_view strings, - const char* delimiter=nullptr, - int maxsplit=-1, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -/**---------------------------------------------------------------------------* - * @brief Split strings vertically creating new columns of strings. - * The number of columns will be equal to the string with the most splits. - * The delimiter is searched starting from the end of each string. - * - * ``` - * s = ["a b c", "d e f", "g h"] - * r = split(s," ",1) - * r is vector of 2 columns: - * r[0] = ["a b", "d e", "g h"] - * r[1] = ["c", "f", nullptr] - * ``` - * - * @param delimiter Null-terminated CPU string identifying the split points within each string. - * Default of null splits on whitespace. - * @param maxsplit Maximum number of splits to perform searching right to left. - * Default -1 indicates all delimiters are processed. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return List of strings columns. - *---------------------------------------------------------------------------**/ -std::vector> rsplit( strings_column_view strings, - const char* delimiter=nullptr, - int maxsplit=-1, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -/**---------------------------------------------------------------------------* - * @brief Each string is split into a list of new column of strings. - * The delimiter is searched from the beginning of each string. 
- * Each string results in a new strings column. - * - * ``` - * s = ["a b c", "d e f", "g h", "i j"] - * r = split_record(s," ") - * r is vector of 4 columns: - * r[0] = ["a", "b", "c"] - * r[1] = ["d", "e", "f"] - * r[2] = ["g", "h", nullptr] - * r[3] = ["i", "j", nullptr] - * ``` - * - * @param strings Strings for this operation. - * @param delimiter Null-terminated CPU string identifying the split points within each string. - * Default of null splits on whitespace. - * @param maxsplit Maximum number of splits to perform searching from the beginning. - * Default -1 indicates all delimiters are processed. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return List of columns for each string. - *---------------------------------------------------------------------------**/ -std::vector> split_record( strings_column_view strings, - const char* delimiter=nullptr, - int maxsplit=-1, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -/**---------------------------------------------------------------------------* - * @brief Each string is split into a list of new strings. - * The delimiter is searched from the end of each string. - * Each string results in a new strings column. - * - * ``` - * s = ["a b c", "d e f", "g h", "i j"] - * r = rsplit_record(s," ",1) - * r is vector of 4 columns: - * r[0] = ["a b", "c"] - * r[1] = ["d e", "f"] - * r[2] = ["g", "h"] - * r[3] = ["i", "j"] - * ``` - * - * @param strings Strings for this operation. - * @param delimiter Null-terminated CPU string identifying the split points within each string. - * Default of null splits on whitespace. - * @param maxsplit Maximum number of splits to perform searching from the end. - * Default -1 indicates all delimiters are processed. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. 
- * @return List of columns for each string. - *---------------------------------------------------------------------------**/ -std::vector> rsplit_record( strings_column_view strings, - const char* delimiter=nullptr, - int maxsplit=-1, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -/**---------------------------------------------------------------------------* - * @brief Each string is split into two strings on the first delimiter found. - * Three strings are always created for each string: left-half, delimiter itself, right-half. - * The result is 3 strings columns representing the 3 partitions. - * - * ``` - * s = ["a:b:c", "d:e:f", "g:h", "i:j"] - * r = partition(s,":") - * r is vector of 4 columns: - * r[0] = ["a", ":", "b:c"] - * r[1] = ["d", ":", "e:f"] - * r[2] = ["g", ":", "h"] - * r[3] = ["i", ":", "j"] - * ``` - * - * @param delimiter Null-terminated CPU string identifying the split points within each string. - * @param results The list of instances for each string. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return List of columns for each partition. - *---------------------------------------------------------------------------**/ -std::vector> partition( strings_column_view strings, - const char* delimiter, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); -/**---------------------------------------------------------------------------* - * @brief Each string is split into two strings on the last delimiter found. - * Three strings are always created for each string: left-half, delimiter itself, right-half. - * The result is 3 strings columns representing the 3 partitions. 
- * - * ``` - * s = ["a:b:c", "d:e:f", "g:h", "i:j"] - * r = rpartition(s,":") - * r is vector of 4 columns: - * r[0] = ["a:b", ":", "c"] - * r[1] = ["d:e", ":", "f"] - * r[2] = ["g", ":", "h"] - * r[3] = ["i", ":", "j"] - * ``` - * - * @param delimiter Null-terminated CPU string identifying the split points within each string. - * @param results The list of instances for each string. - * @param stream CUDA stream to use kernels in this method. - * @param mr Resource for allocating device memory. - * @return List of columns for each partition. - *---------------------------------------------------------------------------**/ -std::vector> rpartition( strings_column_view strings, - const char* delimiter, - cudaStream_t stream=0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource() ); - } // namespace strings } // namespace cudf diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index b6376eacabd..88ec29da87a 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -33,8 +33,7 @@ column_device_view::column_device_view(column_view source) // Free device memory allocated for children void column_device_view::destroy() { // TODO Needs to handle grand-children - if( d_children ) - RMM_FREE(d_children,0); + RMM_FREE(d_children,0); delete this; } @@ -75,13 +74,17 @@ std::unique_ptr> co if( num_children > 0 ) { // ignore grand-children right now - RMM_ALLOC(&p->d_children, sizeof(column_device_view)*num_children, stream); + RMM_TRY(RMM_ALLOC(&p->d_children, sizeof(column_device_view)*num_children, stream)); + std::vector buffer(sizeof(column_device_view)*num_children); + auto h_ptr = buffer.data(); for( size_type idx=0; idx < num_children; ++idx ) { column_device_view child(source.child(idx)); - CUDA_TRY(cudaMemcpyAsync(p->d_children+idx, &child, sizeof(column_device_view), - cudaMemcpyHostToDevice, stream)); + memcpy(h_ptr, &child, sizeof(column_device_view)); + h_ptr += 
sizeof(column_device_view); } + CUDA_TRY(cudaMemcpyAsync(p->d_children, buffer.data(), num_children*sizeof(column_device_view), + cudaMemcpyHostToDevice, stream)); p->_num_children = num_children; cudaStreamSynchronize(stream); } @@ -116,8 +119,7 @@ mutable_column_device_view::mutable_column_device_view( mutable_column_view sour // Handle freeing children void mutable_column_device_view::destroy() { - if( mutable_children ) - RMM_FREE(mutable_children,0); + RMM_FREE(mutable_children,0); delete this; } @@ -132,7 +134,7 @@ std::unique_ptr #include #include +#include #include #include @@ -43,6 +44,11 @@ column_view_base::column_view_base(data_type type, size_type size, CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); } + else if ( type.id() == STRING ){ // TODO change to is_compound(type) once type-dispatcher supports STRING + CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); + } else if( size > 0){ + CUDF_EXPECTS(nullptr != data, "Null data pointer."); + } CUDF_EXPECTS(offset >= 0, "Invalid offset."); diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 507f75a0cac..bbbd01c0218 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -32,13 +32,13 @@ namespace cudf { // Create a strings-type column from array of pointer/size pairs std::unique_ptr make_strings_column( - const rmm::device_vector>& strings, + const rmm::device_vector>& strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr) { - size_type count = strings.size(); + size_type num_strings = strings.size(); // maybe a separate factory for creating null strings-column - CUDF_EXPECTS(count > 0, "must specify at least one pair"); + CUDF_EXPECTS(num_strings > 0, "must specify at least one pair"); auto execpol = rmm::exec_policy(stream); auto d_strings = strings.data().get(); @@ -46,16 +46,16 @@ std::unique_ptr make_strings_column( // check 
total size is not too large for cudf column size_t bytes = thrust::transform_reduce( execpol->on(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(count), + thrust::make_counting_iterator(num_strings), [d_strings] __device__ (size_t idx) { auto item = d_strings[idx]; - return item.first ? item.second : 0; + return (item.first!=nullptr) ? item.second : 0; }, 0, thrust::plus()); CUDF_EXPECTS( bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column" ); // build offsets column -- last entry is the total size - auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); + auto offsets_column = make_numeric_column( data_type{INT32}, num_strings+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); auto d_offsets = offsets_view.data(); // Using inclusive-scan to compute last entry which is the total size. @@ -64,36 +64,39 @@ std::unique_ptr make_strings_column( // we use inclusive-scan on a shifted output (d_offsets+1) and then set the first // zero offset manually. thrust::transform_inclusive_scan( execpol->on(stream), - thrust::make_counting_iterator(0), thrust::make_counting_iterator(count), - d_offsets+1, // fills in offsets entries [1,count] + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_strings), + d_offsets+1, // fills in offsets entries [1,num_strings] [d_strings] __device__ (size_type idx) { - thrust::pair item = d_strings[idx]; - return ( item.first ? static_cast(item.second) : 0 ); + thrust::pair item = d_strings[idx]; + return ( item.first!=nullptr ? 
static_cast(item.second) : 0 ); }, thrust::plus() ); // set the first offset to 0 - cudaMemsetAsync( d_offsets, 0, sizeof(*d_offsets), stream); + CUDA_TRY(cudaMemsetAsync( d_offsets, 0, sizeof(*d_offsets), stream)); // create null mask auto valid_mask = valid_if( static_cast(nullptr), [d_strings] __device__ (size_type idx) { return d_strings[idx].first!=nullptr; }, - count, stream ); + num_strings, stream ); auto null_count = valid_mask.second; - rmm::device_buffer null_mask(valid_mask.first, gdf_valid_allocation_size(count), + rmm::device_buffer null_mask(valid_mask.first, gdf_valid_allocation_size(num_strings), stream, mr); RMM_TRY( RMM_FREE(valid_mask.first,stream) ); // TODO valid_if to return device_buffer in future - if( (bytes==0) && (null_count < count) ) + // if we have all nulls, a null chars column is allowed + // if all non-null strings are empty strings, we need a non-null chars column + // - in this case we set the bytes to 1 to create a minimal one-byte chars column + if( (bytes==0) && (null_count < num_strings) ) bytes = 1; // all entries are empty strings // build chars column auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, + thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), num_strings, [d_strings, d_offsets, d_chars] __device__(size_type idx){ // place individual strings auto item = d_strings[idx]; - if( item.first ) + if( item.first!=nullptr ) memcpy(d_chars + d_offsets[idx], item.first, item.second ); }); @@ -102,9 +105,9 @@ std::unique_ptr make_strings_column( children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(chars_column)); - // no data-ptr with count elements plus children + // no data-ptr with num_strings elements plus children return std::make_unique( - 
data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, + data_type{STRING}, num_strings, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } @@ -118,33 +121,33 @@ std::unique_ptr make_strings_column( cudaStream_t stream, rmm::mr::device_memory_resource* mr ) { - size_type count = offsets.size()-1; - CUDF_EXPECTS( count > 0, "strings count must be greater than 0"); - CUDF_EXPECTS( null_count < count, "null strings column not yet supported"); + size_type num_strings = offsets.size()-1; + CUDF_EXPECTS( num_strings > 0, "strings count must be greater than 0"); + CUDF_EXPECTS( null_count < num_strings, "null strings column not yet supported"); auto execpol = rmm::exec_policy(stream); size_type bytes = offsets.back() - offsets[0]; CUDF_EXPECTS( bytes >=0, "invalid offsets vector"); // build offsets column -- this is the number of strings + 1 - auto offsets_column = make_numeric_column( data_type{INT32}, count+1, mask_state::UNALLOCATED, stream, mr ); + auto offsets_column = make_numeric_column( data_type{INT32}, num_strings+1, mask_state::UNALLOCATED, stream, mr ); auto offsets_view = offsets_column->mutable_view(); - cudaMemcpyAsync( offsets_view.data(), offsets.data().get(), - (count+1)*sizeof(int32_t), - cudaMemcpyDeviceToHost, stream ); + CUDA_TRY(cudaMemcpyAsync( offsets_view.data(), offsets.data().get(), + (num_strings+1)*sizeof(int32_t), + cudaMemcpyDeviceToDevice, stream )); // build null bitmask rmm::device_buffer null_mask; if( null_count ) null_mask = rmm::device_buffer(valid_mask.data().get(), - gdf_valid_allocation_size(count), + gdf_valid_allocation_size(num_strings), stream, mr); // build chars column auto chars_column = make_numeric_column( data_type{INT8}, bytes, mask_state::UNALLOCATED, stream, mr ); auto chars_view = chars_column->mutable_view(); - cudaMemcpyAsync( chars_view.data(), strings.data().get(), bytes, - cudaMemcpyDeviceToHost, stream ); + CUDA_TRY(cudaMemcpyAsync( chars_view.data(), strings.data().get(), 
bytes, + cudaMemcpyDeviceToDevice, stream )); // build children vector std::vector> children; @@ -153,7 +156,7 @@ std::unique_ptr make_strings_column( // return std::make_unique( - data_type{STRING}, count, rmm::device_buffer{0,stream,mr}, + data_type{STRING}, num_strings, rmm::device_buffer{0,stream,mr}, null_mask, null_count, std::move(children)); } diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index f0f5cfda4ae..dd74da2f87f 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -27,41 +27,21 @@ namespace cudf { // strings_column_view::strings_column_view( column_view strings_column ) - : _parent(strings_column) + : column_view(strings_column) { - CUDF_EXPECTS( _parent.type().id()==STRING, "strings_column_view only supports strings"); - CUDF_EXPECTS( _parent.num_children()>0, "strings column must have children"); // revisit this (all nulls column?) -} - -size_type strings_column_view::size() const -{ - return _parent.size(); -} - -column_view strings_column_view::parent() const -{ - return _parent; + CUDF_EXPECTS( type().id()==STRING, "strings_column_view only supports strings"); } column_view strings_column_view::offsets() const { - return _parent.child(0); + return child(offsets_column_index); } column_view strings_column_view::chars() const { - return _parent.child(1); + return child(chars_column_index); } -const bitmask_type* strings_column_view::null_mask() const -{ - return _parent.null_mask(); -} - -size_type strings_column_view::null_count() const -{ - return _parent.null_count(); -} namespace strings { @@ -82,7 +62,7 @@ void print( strings_column_view strings, // stick with the default stream for this odd/rare stdout function auto execpol = rmm::exec_policy(0); - auto strings_column = column_device_view::create(strings.parent()); + auto strings_column = column_device_view::create(strings); auto d_column = *strings_column; auto d_offsets = strings.offsets().data(); 
auto d_strings = strings.chars().data(); @@ -104,7 +84,7 @@ void print( strings_column_view strings, return bytes+1; // allow for null-terminator on non-null strings }, thrust::plus()); - cudaMemset( d_output_offsets, 0, sizeof(*d_output_offsets)); + CUDA_TRY(cudaMemset( d_output_offsets, 0, sizeof(*d_output_offsets))); // build output buffer size_t buffer_size = output_offsets.back(); // last element has total size @@ -131,9 +111,9 @@ void print( strings_column_view strings, // copy output buffer to host std::vector h_offsets(count+1); - cudaMemcpy( h_offsets.data(), d_output_offsets, (count+1)*sizeof(size_t), cudaMemcpyDeviceToHost); + CUDA_TRY(cudaMemcpy( h_offsets.data(), d_output_offsets, (count+1)*sizeof(size_t), cudaMemcpyDeviceToHost)); std::vector h_buffer(buffer_size); - cudaMemcpy( h_buffer.data(), d_buffer, buffer_size, cudaMemcpyDeviceToHost ); + CUDA_TRY(cudaMemcpy( h_buffer.data(), d_buffer, buffer_size, cudaMemcpyDeviceToHost )); // print out the strings to stdout for( size_type idx=0; idx < count; ++idx ) @@ -159,14 +139,14 @@ std::pair, rmm::device_vector> size_type count = strings.size(); auto d_offsets = strings.offsets().data(); results.second = rmm::device_vector(count+1); - cudaMemcpyAsync( results.second.data().get(), d_offsets, (count+1)*sizeof(size_type), - cudaMemcpyDeviceToHost, stream); + CUDA_TRY(cudaMemcpyAsync( results.second.data().get(), d_offsets, (count+1)*sizeof(size_type), + cudaMemcpyDeviceToHost, stream)); size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; auto d_chars = strings.chars().data(); results.first = rmm::device_vector(bytes); - cudaMemcpyAsync( results.first.data().get(), d_chars, bytes, - cudaMemcpyDeviceToHost, stream); + CUDA_TRY(cudaMemcpyAsync( results.first.data().get(), d_chars, bytes, + cudaMemcpyDeviceToHost, stream)); return results; } diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 9910448cf50..81fb38618f3 100644 --- 
a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -25,6 +25,7 @@ #include #include + namespace cudf { namespace detail { @@ -40,27 +41,49 @@ table_device_view_base::table_device_view_base( : _num_rows{source_view.num_rows()}, _num_columns{source_view.num_columns()}, _stream{stream} { + + // The table's columns must be converted to column_device + // objects and copied into device memory for the table_device's + // device columns. if (source_view.num_columns() > 0) { // + // First calculate the size of memory needed to hold the + // array of ColumnDeviceView's. This is done by calling extent() + // for each of the ColumnView's in the table's columns vector. size_type views_size_bytes = std::accumulate(source_view.begin(), source_view.end(), 0, - [](size_type init, column_view col) { + [](size_type init, auto col) { return init + ColumnDeviceView::extent(col); }); - + // A buffer of CPU memory is created to hold the ColumnDeviceView + // objects and then copied to device memory at the _columns member pointer. + // But each instance may have child objects which require setting an + // internal device pointer before being copied from CPU to device. std::vector h_buffer(views_size_bytes); ColumnDeviceView* h_column = reinterpret_cast(h_buffer.data()); - int8_t* h_end = (int8_t*)(h_column + _num_columns); + // Create the device pointer to be used in the result. + // We need to pass this down to the columns so they can be resolved + // to point to any child objects. RMM_TRY(RMM_ALLOC(&_columns, views_size_bytes, stream)); ColumnDeviceView* d_column = _columns; + // The beginning of the memory must be the fixed-sized ColumnDeviceView + // objects to be used as an array. Therefore, the child data is assigned + // to the end of the array. + int8_t* h_end = (int8_t*)(h_column + _num_columns); int8_t* d_end = (int8_t*)(d_column + _num_columns); + // Create the ColumnDeviceView from each column within the CPU memory + // array. 
Any column child data should be copied into h_end and any + // internal pointers should be set using d_end. for( size_type idx=0; idx < _num_columns; ++idx ) { - auto col = source_view.column(idx); + auto col = source_view.column(idx); // get the column_view + // convert the ColumnView into ColumnDeviceView new(h_column) ColumnDeviceView(col,(ptrdiff_t)h_end,(ptrdiff_t)d_end); - h_column++; - h_end += (ColumnDeviceView::extent(col)); - d_end += (ColumnDeviceView::extent(col)); + h_column++; // next element in array + // point to the next chunk of memory for use of the children of the next column + auto col_child_data_size = (ColumnDeviceView::extent(col) - sizeof(ColumnDeviceView)); + h_end += col_child_data_size; + d_end += col_child_data_size; } CUDA_TRY(cudaMemcpyAsync(_columns, h_buffer.data(), diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index b4440e5c7a8..5944fb336b4 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -43,7 +43,7 @@ TEST_F(FactoriesTest, CreateColumnFromArray) cudf::size_type count = (cudf::size_type)h_test_strings.size(); thrust::host_vector h_buffer(memsize); thrust::device_vector d_buffer(memsize); - thrust::host_vector > strings(count); + thrust::host_vector > strings(count); thrust::host_vector h_offsets(count+1); cudf::size_type offset = 0; cudf::size_type nulls = 0; @@ -53,19 +53,19 @@ TEST_F(FactoriesTest, CreateColumnFromArray) const char* str = h_test_strings[idx]; if( !str ) { - strings[idx] = thrust::pair{nullptr,0}; + strings[idx] = thrust::pair{nullptr,0}; nulls++; } else { cudf::size_type length = (cudf::size_type)strlen(str); memcpy( h_buffer.data() + offset, str, length ); - strings[idx] = thrust::pair{d_buffer.data().get()+offset,(size_t)length}; + strings[idx] = thrust::pair{d_buffer.data().get()+offset,length}; offset += length; } h_offsets[idx+1] = offset; } - rmm::device_vector> d_strings(strings); + rmm::device_vector> 
d_strings(strings); cudaMemcpy( d_buffer.data().get(), h_buffer.data(), memsize, cudaMemcpyHostToDevice ); auto column = cudf::make_strings_column( d_strings ); EXPECT_EQ(column->type(), cudf::data_type{cudf::STRING}); From aafa9abce447c6618215cfd3a7f667be15380fca Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 2 Oct 2019 18:18:51 -0400 Subject: [PATCH 36/54] fixed some documentation --- cpp/include/cudf/column/column_device_view.cuh | 4 ++-- cpp/src/column/column_device_view.cu | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 3b4be27de4b..3ddaae62148 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -213,7 +213,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * using the device memory pointer as a base for child pointers. * * @param column Column view from which to create this instance. - * @param h_ptr Host memory pointer on which to place this instance. + * @param h_ptr Host memory pointer on which to place any child data. * @param d_ptr Device memory pointer on which to base any child pointers. *---------------------------------------------------------------------------**/ column_device_view( column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr ); @@ -321,7 +321,7 @@ class alignas(16) mutable_column_device_view * using the device memory pointer as a base for child pointers. * * @param column Column view from which to create this instance. - * @param h_ptr Host memory pointer on which to place this instance. + * @param h_ptr Host memory pointer on which to place any child data. * @param d_ptr Device memory pointer on which to base any child pointers. 
*---------------------------------------------------------------------------**/ mutable_column_device_view( mutable_column_view column, ptrdiff_t h_ptr, ptrdiff_t d_ptr ); diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 88ec29da87a..811048abe11 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -73,16 +73,21 @@ std::unique_ptr> co if( num_children > 0 ) { - // ignore grand-children right now + // create device memory for the children RMM_TRY(RMM_ALLOC(&p->d_children, sizeof(column_device_view)*num_children, stream)); + // build the children into CPU memory first std::vector buffer(sizeof(column_device_view)*num_children); auto h_ptr = buffer.data(); for( size_type idx=0; idx < num_children; ++idx ) { + // create device-view from view column_device_view child(source.child(idx)); + // copy child into buffer memcpy(h_ptr, &child, sizeof(column_device_view)); + // point to the next array slot h_ptr += sizeof(column_device_view); } + // copy the CPU memory with the children into device memory CUDA_TRY(cudaMemcpyAsync(p->d_children, buffer.data(), num_children*sizeof(column_device_view), cudaMemcpyHostToDevice, stream)); p->_num_children = num_children; @@ -135,7 +140,7 @@ std::unique_ptr Date: Thu, 3 Oct 2019 09:06:36 -0400 Subject: [PATCH 37/54] moved strings factories header to column_factories.hpp --- cpp/include/cudf/column/column_factories.hpp | 66 ++++++++++++++ .../cudf/strings/strings_column_factories.hpp | 89 ------------------- cpp/src/strings/strings_column_factories.cu | 1 - cpp/tests/strings/factories_test.cu | 2 +- 4 files changed, 67 insertions(+), 91 deletions(-) delete mode 100644 cpp/include/cudf/strings/strings_column_factories.hpp diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 9dbd24f67e4..884567eadf5 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ 
b/cpp/include/cudf/column/column_factories.hpp @@ -19,6 +19,8 @@ #include #include "column.hpp" +#include + namespace cudf { /**---------------------------------------------------------------------------* * @brief Construct column with sufficient uninitialized storage @@ -43,4 +45,68 @@ std::unique_ptr make_numeric_column( data_type type, size_type size, mask_state state = UNALLOCATED, cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/**---------------------------------------------------------------------------* + * @brief Construct STRING type column given an array of pointer/size pairs. + * The total number of char bytes must not exceed the maximum size of size_type. + * The string characters are expected to be UTF-8 encoded sequence of char bytes. + * Use the strings_column_view class to perform strings operations on this type + * of column. + * + * @note `null_count()` and `null_bitmask` are determined if a pair contains + * a null string. That is, for each pair, if `.first` is null, that string + * is considered null. Likewise, a string is considered empty (not null) + * if `.first` is not null and `.second` is 0. Otherwise the `.first` member + * must be a valid device address pointing to `.second` consecutive bytes. + * + * @throws std::bad_alloc if device memory allocation fails + * + * @param strings The pointer/size pair arrays. + * Each pointer must be a device memory address or `nullptr` (indicating a null string). + * The size must be the number of bytes. + * @param stream Optional stream for use with all memory allocation + * and device kernels + * @param mr Optional resource to use for device memory + * allocation of the column's `null_mask` and children. 
+ *---------------------------------------------------------------------------**/ +std::unique_ptr make_strings_column( + const rmm::device_vector>& strings, + cudaStream_t stream = 0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/**---------------------------------------------------------------------------* + * @brief Construct STRING type column given an contiguous array of chars + * encoded as UTF-8, an array of byte offsets identifying individual strings + * within the char array, and a null bitmask. + * The total number of char bytes must not exceed the maximum size of size_type. + * Use the strings_column_view class to perform strings operations on this type + * of column. + * This function makes a deep copy of the strings, offsets, null_mask to create + * a new column. + * + * @throws std::bad_alloc if device memory allocation fails + * + * @param strings The contiguous array of chars in device memory. + * This char array is expected to be UTF-8 encoded characters. + * @param offsets The array of byte offsets in device memory. + * The number of elements is one more than the total number + * of strings so the offset[last] - offset[0] is the total + * number of bytes in the strings array. + * @param null_mask The array of bits specifying the null strings. + * This array must be in device memory. + * Arrow format for nulls is used for interpeting this bitmask. + * @param null_count The number of null string entries. + * @param stream Optional stream for use with all memory allocation + * and device kernels + * @param mr Optional resource to use for device memory + * allocation of the column's `null_mask` and children. 
+ *---------------------------------------------------------------------------**/ +std::unique_ptr make_strings_column( + const rmm::device_vector& strings, + const rmm::device_vector& offsets, + const rmm::device_vector& null_mask, + size_type null_count, + cudaStream_t stream = 0, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + } // namespace cudf diff --git a/cpp/include/cudf/strings/strings_column_factories.hpp b/cpp/include/cudf/strings/strings_column_factories.hpp deleted file mode 100644 index 035d22207c3..00000000000 --- a/cpp/include/cudf/strings/strings_column_factories.hpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include - -namespace cudf { - -/**---------------------------------------------------------------------------* - * @brief Construct STRING type column given an array of pointer/size pairs. - * The total number of char bytes must not exceed the maximum size of size_type. - * The string characters are expected to be UTF-8 encoded sequence of char bytes. - * Use the strings_column_view class to perform strings operations on this type - * of column. - * - * @note `null_count()` and `null_bitmask` are determined if a pair contains - * a null string. That is, for each pair, if `.first` is null, that string - * is considered null. 
Likewise, a string is considered empty (not null) - * if `.first` is not null and `.second` is 0. Otherwise the `.first` member - * must be a valid device address pointing to `.second` consecutive bytes. - * - * @throws std::bad_alloc if device memory allocation fails - * - * @param strings The pointer/size pair arrays. - * Each pointer must be a device memory address or `nullptr` (indicating a null string). - * The size must be the number of bytes. - * @param stream Optional stream for use with all memory allocation - * and device kernels - * @param mr Optional resource to use for device memory - * allocation of the column's `null_mask` and children. - *---------------------------------------------------------------------------**/ -std::unique_ptr make_strings_column( - const rmm::device_vector>& strings, - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); - -/**---------------------------------------------------------------------------* - * @brief Construct STRING type column given an contiguous array of chars - * encoded as UTF-8, an array of byte offsets identifying individual strings - * within the char array, and a null bitmask. - * The total number of char bytes must not exceed the maximum size of size_type. - * Use the strings_column_view class to perform strings operations on this type - * of column. - * This function makes a deep copy of the strings, offsets, null_mask to create - * a new column. - * - * @throws std::bad_alloc if device memory allocation fails - * - * @param strings The contiguous array of chars in device memory. - * This char array is expected to be UTF-8 encoded characters. - * @param offsets The array of byte offsets in device memory. - * The number of elements is one more than the total number - * of strings so the offset[last] - offset[0] is the total - * number of bytes in the strings array. - * @param null_mask The array of bits specifying the null strings. 
- * This array must be in device memory. - * Arrow format for nulls is used for interpeting this bitmask. - * @param null_count The number of null string entries. - * @param stream Optional stream for use with all memory allocation - * and device kernels - * @param mr Optional resource to use for device memory - * allocation of the column's `null_mask` and children. - *---------------------------------------------------------------------------**/ -std::unique_ptr make_strings_column( - const rmm::device_vector& strings, - const rmm::device_vector& offsets, - const rmm::device_vector& null_mask, - size_type null_count, - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); - -} // namespace cudf diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index bbbd01c0218..bd60fdaffaf 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 5944fb336b4..d2fafda5e3e 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include From 4296ecb19ae495a62e4fc75525d96e8c0e1dea65 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Oct 2019 09:57:06 -0400 Subject: [PATCH 38/54] strings_column_view private inherit from column_view --- cpp/include/cudf/column/column_device_view.cuh | 8 +++++--- cpp/include/cudf/strings/strings_column_view.hpp | 7 ++++++- cpp/src/strings/strings_column_view.cu | 7 ++++++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 3ddaae62148..c2e0493f251 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -497,10 +497,12 @@ class alignas(16) mutable_column_device_view /**---------------------------------------------------------------------------* * @brief Returns `string_view` to the string element at the specified index. * + * If the element at the specified index is NULL, i.e., `is_null(element_index) == true`, + * then any attempt to use the result will lead to undefined behavior. + * * This function accounts for the offset. - * Calling this on a null element will result in undefined behavior. 
* - * @param element_index Position of the desired string + * @param element_index Position of the desired string element * @return string_view instance representing this element at this index *---------------------------------------------------------------------------**/ template <> @@ -508,7 +510,7 @@ __device__ inline strings::string_view const column_device_view::element(); - const char* d_strings = d_children[strings_column_view::offsets_column_index].data(); + const char* d_strings = d_children[strings_column_view::chars_column_index].data(); size_type offset = d_offsets[index]; return strings::string_view{d_strings + offset, d_offsets[index+1] - offset}; } diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 653613504e0..1aa55e71c0b 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -26,7 +26,7 @@ namespace cudf { * @brief Given a column-view of strings type, an instance of this class * provides a wrapper on this compound column for strings operations. *---------------------------------------------------------------------------**/ -class strings_column_view : public column_view +class strings_column_view : private column_view { public: strings_column_view( column_view strings_column ); @@ -43,6 +43,11 @@ class strings_column_view : public column_view using column_view::null_mask; using column_view::null_count; + /**---------------------------------------------------------------------------* + * @brief Returns the parent column. 
+ *---------------------------------------------------------------------------**/ + column_view parent() const; + /**---------------------------------------------------------------------------* * @brief Returns the internal column of offsets *---------------------------------------------------------------------------**/ diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index dd74da2f87f..e9148c95dac 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -32,6 +32,11 @@ strings_column_view::strings_column_view( column_view strings_column ) CUDF_EXPECTS( type().id()==STRING, "strings_column_view only supports strings"); } +column_view strings_column_view::parent() const +{ + return static_cast(*this); +} + column_view strings_column_view::offsets() const { return child(offsets_column_index); @@ -62,7 +67,7 @@ void print( strings_column_view strings, // stick with the default stream for this odd/rare stdout function auto execpol = rmm::exec_policy(0); - auto strings_column = column_device_view::create(strings); + auto strings_column = column_device_view::create(strings.parent()); auto d_column = *strings_column; auto d_offsets = strings.offsets().data(); auto d_strings = strings.chars().data(); From 9be11c575488f7857414ab470c7bdc97ff538fce Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Oct 2019 10:32:11 -0400 Subject: [PATCH 39/54] move string_view to cudf namespace --- .../cudf/column/column_device_view.cuh | 4 ++-- cpp/include/cudf/strings/string_view.cuh | 4 ++-- cpp/include/cudf/strings/string_view.inl | 20 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index c2e0493f251..fcae36e4ddb 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -506,13 +506,13 @@ class alignas(16) 
mutable_column_device_view * @return string_view instance representing this element at this index *---------------------------------------------------------------------------**/ template <> -__device__ inline strings::string_view const column_device_view::element( +__device__ inline string_view const column_device_view::element( size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset const int32_t* d_offsets = d_children[strings_column_view::offsets_column_index].data(); const char* d_strings = d_children[strings_column_view::chars_column_index].data(); size_type offset = d_offsets[index]; - return strings::string_view{d_strings + offset, d_offsets[index+1] - offset}; + return string_view{d_strings + offset, d_offsets[index+1] - offset}; } } // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 3bdfee38197..befbef2b890 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -21,8 +21,6 @@ namespace cudf { -namespace strings -{ // UTF-8 characters are 1-4 bytes using char_utf8 = uint32_t; @@ -304,6 +302,8 @@ private: __device__ size_type character_offset(size_type bytepos) const; }; +namespace strings +{ namespace detail { /**---------------------------------------------------------------------------* diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index 65925e084b3..499661668b9 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -61,8 +61,6 @@ __device__ inline cudf::size_type string_length( const char* str ) namespace cudf { -namespace strings -{ __host__ __device__ inline string_view::string_view(const char* data, size_type bytes) : _data(data), _bytes(bytes) @@ -80,7 +78,7 @@ __host__ __device__ inline size_type string_view::size_bytes() const __device__ inline size_type 
string_view::length() const { - return detail::characters_in_string(_data,_bytes); + return strings::detail::characters_in_string(_data,_bytes); } __host__ __device__ inline const char* string_view::data() const @@ -131,7 +129,7 @@ __device__ inline bool string_view::const_iterator::operator!=(const string_view __device__ inline char_utf8 string_view::const_iterator::operator*() const { char_utf8 chr = 0; - detail::to_char_utf8(p + offset, chr); + strings::detail::to_char_utf8(p + offset, chr); return chr; } @@ -161,7 +159,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const if(offset >= _bytes) return 0; char_utf8 chr = 0; - detail::to_char_utf8(data() + offset, chr); + strings::detail::to_char_utf8(data() + offset, chr); return chr; } @@ -280,7 +278,7 @@ __device__ inline size_type string_view::find(const char* str, size_type bytes, __device__ inline size_type string_view::find(char_utf8 chr, size_type pos, int count) const { char str[sizeof(char_utf8)]; - size_type chwidth = detail::from_char_utf8(chr,str); + size_type chwidth = strings::detail::from_char_utf8(chr,str); return find(str,chwidth,pos,count); } @@ -322,7 +320,7 @@ __device__ inline size_type string_view::rfind(const char* str, size_type bytes, __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, int count) const { char str[sizeof(char_utf8)]; - size_type chwidth = detail::from_char_utf8(chr,str); + size_type chwidth = strings::detail::from_char_utf8(chr,str); return rfind(str,chwidth,pos,count); } @@ -369,7 +367,7 @@ __device__ inline size_type string_view::split(const char* delim, int count, str if(strsCount < count) count = strsCount; // - size_type dchars = (bytes ? detail::characters_in_string(delim,bytes) : 1); + size_type dchars = (bytes ? 
strings::detail::characters_in_string(delim,bytes) : 1); size_type nchars = length(); size_type spos = 0, sidx = 0; size_type epos = find(delim, bytes); @@ -418,7 +416,7 @@ __device__ inline size_type string_view::rsplit(const char* delim, int count, st if(strsCount < count) count = strsCount; // - unsigned int dchars = (bytes ? detail::characters_in_string(delim,bytes) : 1); + unsigned int dchars = (bytes ? strings::detail::characters_in_string(delim,bytes) : 1); int epos = (int)length(); // end pos is not inclusive int sidx = count - 1; // index for strs array int spos = rfind(delim, bytes); @@ -440,9 +438,11 @@ __device__ inline size_type string_view::rsplit(const char* delim, int count, st __device__ inline size_type string_view::character_offset(size_type bytepos) const { - return detail::characters_in_string(data(), bytepos); + return strings::detail::characters_in_string(data(), bytepos); } +namespace strings +{ namespace detail { __host__ __device__ inline size_type bytes_in_char_utf8(char_utf8 chr) From f8c08ac4ddf4d398f6f7a7c535f9cd48bae54dda Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Oct 2019 13:11:55 -0400 Subject: [PATCH 40/54] improve comments for table_device_view --- cpp/src/table/table_device_view.cu | 33 ++++++++++++++++-------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 81fb38618f3..4a3ac130d0c 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -49,38 +49,41 @@ table_device_view_base::table_device_view_base( // // First calculate the size of memory needed to hold the // array of ColumnDeviceView's. This is done by calling extent() - // for each of the ColumnView's in the table's columns vector. + // for each of the ColumnView's in the table's columns. 
size_type views_size_bytes = std::accumulate(source_view.begin(), source_view.end(), 0, [](size_type init, auto col) { return init + ColumnDeviceView::extent(col); }); // A buffer of CPU memory is created to hold the ColumnDeviceView - // objects and then copied to device memory at the _columns member pointer. - // But each instance may have child objects which require setting an - // internal device pointer before being copied from CPU to device. + // objects. Once created, the CPU memory is copied to device memory + // at the _columns member pointer. + // But each ColumnDeviceView instance may have child objects which + // require setting an internal device pointer before being copied + // from CPU to device. std::vector h_buffer(views_size_bytes); ColumnDeviceView* h_column = reinterpret_cast(h_buffer.data()); - // Create the device pointer to be used in the result. - // We need to pass this down to the columns so they can be resolved - // to point to any child objects. + // Allocate the device memory to be used in the result. + // We need this pointer in order to pass it down when creating the + // ColumnDeviceViews so the column can fix the pointer(s) for any + // of its child objects. RMM_TRY(RMM_ALLOC(&_columns, views_size_bytes, stream)); ColumnDeviceView* d_column = _columns; // The beginning of the memory must be the fixed-sized ColumnDeviceView - // objects to be used as an array. Therefore, the child data is assigned - // to the end of the array. - int8_t* h_end = (int8_t*)(h_column + _num_columns); - int8_t* d_end = (int8_t*)(d_column + _num_columns); + // objects in order for _columns to be used as an array. Therefore, + // any child data is assigned to the end of this array. + int8_t* h_end = (int8_t*)(h_column + source_view.num_columns()); + int8_t* d_end = (int8_t*)(d_column + source_view.num_columns()); // Create the ColumnDeviceView from each column within the CPU memory // array. 
Any column child data should be copied into h_end and any // internal pointers should be set using d_end. - for( size_type idx=0; idx < _num_columns; ++idx ) + for( auto itr=source_view.begin(); itr!=source_view.end(); ++itr ) { - auto col = source_view.column(idx); // get the column_view + auto col = *itr; // convert the ColumnView into ColumnDeviceView new(h_column) ColumnDeviceView(col,(ptrdiff_t)h_end,(ptrdiff_t)d_end); - h_column++; // next element in array - // point to the next chunk of memory for use of the children of the next column + h_column++; // point to memory slot for the next ColumnDeviceView + // update the pointers for holding ColumnDeviceView's child data auto col_child_data_size = (ColumnDeviceView::extent(col) - sizeof(ColumnDeviceView)); h_end += col_child_data_size; d_end += col_child_data_size; From 78e31ac016be0ee8e510028cbe3e5bbd66fd1aea Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Oct 2019 13:19:55 -0400 Subject: [PATCH 41/54] update comments for table_device_view --- cpp/src/table/table_device_view.cu | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 4a3ac130d0c..ca6128bd431 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -42,30 +42,30 @@ table_device_view_base::table_device_view_base( _num_columns{source_view.num_columns()}, _stream{stream} { - // The table's columns must be converted to column_device - // objects and copied into device memory for the table_device's - // device columns. + // The table's columns must be converted to ColumnDeviceView + // objects and copied into device memory for the table_device_view's + // _columns member. if (source_view.num_columns() > 0) { // // First calculate the size of memory needed to hold the - // array of ColumnDeviceView's. This is done by calling extent() - // for each of the ColumnView's in the table's columns. 
+ // array of ColumnDeviceViews. This is done by calling extent() + // for each of the ColumnViews in the table_view's columns. size_type views_size_bytes = std::accumulate(source_view.begin(), source_view.end(), 0, [](size_type init, auto col) { return init + ColumnDeviceView::extent(col); }); - // A buffer of CPU memory is created to hold the ColumnDeviceView - // objects. Once created, the CPU memory is copied to device memory + // A buffer of CPU memory is allocated to hold the ColumnDeviceView + // objects. Once filled, the CPU memory is then copied to device memory // at the _columns member pointer. - // But each ColumnDeviceView instance may have child objects which - // require setting an internal device pointer before being copied - // from CPU to device. std::vector h_buffer(views_size_bytes); ColumnDeviceView* h_column = reinterpret_cast(h_buffer.data()); + // Each ColumnDeviceView instance may have child objects which may + // require setting some internal device pointers before being copied + // from CPU to device. // Allocate the device memory to be used in the result. // We need this pointer in order to pass it down when creating the - // ColumnDeviceViews so the column can fix the pointer(s) for any + // ColumnDeviceViews so the column can set the pointer(s) for any // of its child objects. RMM_TRY(RMM_ALLOC(&_columns, views_size_bytes, stream)); ColumnDeviceView* d_column = _columns; @@ -75,7 +75,7 @@ table_device_view_base::table_device_view_base( int8_t* h_end = (int8_t*)(h_column + source_view.num_columns()); int8_t* d_end = (int8_t*)(d_column + source_view.num_columns()); // Create the ColumnDeviceView from each column within the CPU memory - // array. Any column child data should be copied into h_end and any + // Any column child data should be copied into h_end and any // internal pointers should be set using d_end. 
for( auto itr=source_view.begin(); itr!=source_view.end(); ++itr ) { From 55b3c6d2e9d81a6e642a6d216eed37b62b7beab1 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 3 Oct 2019 12:43:46 -0500 Subject: [PATCH 42/54] Add forward decl for string_view. --- cpp/include/cudf/types.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 8131825d2ae..e25cd8a0282 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -53,6 +53,7 @@ struct table; class column; class column_view; class mutable_column_view; +class string_view; namespace exp { class table; From 663ad068792c3218ee37f795d7cadc8b6f8d4d6d Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 3 Oct 2019 12:43:58 -0500 Subject: [PATCH 43/54] Add string_view to STRING mapping. --- cpp/include/cudf/utilities/type_dispatcher.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index cc582b2df0c..23b43584386 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -92,6 +92,7 @@ CUDF_TYPE_MAPPING(int32_t, type_id::INT32); CUDF_TYPE_MAPPING(int64_t, type_id::INT64); CUDF_TYPE_MAPPING(float, type_id::FLOAT32); CUDF_TYPE_MAPPING(double, type_id::FLOAT64); +CUDF_TYPE_MAPPING(cudf::string_view, type_id::STRING); /**---------------------------------------------------------------------------* * @brief Invokes an `operator()` template with the type instantiation based on @@ -211,6 +212,9 @@ CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) type_dispatcher( case FLOAT64: return f.template operator()::type>( std::forward(args)...); + case STRING: + return f.template operator()::type>( + std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported type_id."); From 04bc439e5dc97a15cce7322fbe1c0ee190a2bf7e Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 3 Oct 2019 
12:44:10 -0500 Subject: [PATCH 44/54] Specialize size_of for fixed-width only. --- cpp/src/column/column_factories.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index a3f5ca1d377..4d17c10290b 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -23,7 +23,14 @@ namespace cudf { namespace { struct size_of_helper { template - constexpr int operator()() const noexcept { + constexpr std::enable_if_t(), int> operator()() const + noexcept { + CUDF_FAIL("Invalid, non fixed-width element type."); + } + + template + constexpr std::enable_if_t(), int> operator()() const + noexcept { return sizeof(T); } }; From dc4bc7aba29df0200946cd945a6663f990f9af92 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 3 Oct 2019 12:44:25 -0500 Subject: [PATCH 45/54] Update traits to use named function objects. --- cpp/include/cudf/utilities/traits.hpp | 70 ++++++++++++++++++--------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 741a022ac4d..a93cd3efba0 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -69,6 +69,13 @@ constexpr inline bool is_numeric() { return std::is_integral::value or std::is_floating_point::value; } +struct is_numeric_impl { + template + bool operator()() { + return is_numeric(); + } +}; + /**---------------------------------------------------------------------------* * @brief Indicates whether `type` is a numeric `data_type`. 
* @@ -81,8 +88,7 @@ constexpr inline bool is_numeric() { * @return false `type` is not numeric *---------------------------------------------------------------------------**/ constexpr inline bool is_numeric(data_type type) { - return cudf::exp::type_dispatcher( - type, [](auto dummy) { return is_numeric(); }, 0); + return cudf::exp::type_dispatcher(type, is_numeric_impl{}); } /**---------------------------------------------------------------------------* @@ -101,6 +107,13 @@ constexpr inline bool is_fixed_width() { return cudf::is_numeric(); } +struct is_fixed_width_impl { + template + bool operator()() { + return is_fixed_width(); + } +}; + /**---------------------------------------------------------------------------* * @brief Indicates whether elements of `type` are fixed-width. * @@ -111,12 +124,11 @@ constexpr inline bool is_fixed_width() { * @return false `type` is variable-width *---------------------------------------------------------------------------**/ constexpr inline bool is_fixed_width(data_type type) { - return cudf::exp::type_dispatcher( - type, [](auto dummy) { return is_fixed_width(); }, 0); + return cudf::exp::type_dispatcher(type, is_fixed_width_impl{}); } /**---------------------------------------------------------------------------* - * @brief Indictates whether the type `T` is a compound type. + * @brief Indicates whether the type `T` is a compound type. * * `column`s with "compound" elements are logically a single column of elements, * but may be concretely implemented with two or more `column`s. 
For example, a @@ -129,24 +141,15 @@ constexpr inline bool is_fixed_width(data_type type) { *---------------------------------------------------------------------------**/ template constexpr inline bool is_compound() { - // TODO Implement with checks for the compound wrapper types - return false; + return std::is_same::value; } -/**---------------------------------------------------------------------------* - * @brief Indicates whether the type `T` is a simple type. - * - * "Simple" element types are implemented with only a single column, i.e., - * `num_children() == 0` for columns of "simple" elements - * - * @tparam T The type to verify - * @return true `T` corresponds to a simple type - * @return false `T` corresponds to a compound type - *---------------------------------------------------------------------------**/ -template -constexpr inline bool is_simple() { - return not is_compound(); -} +struct is_compound_impl { + template + bool operator()() { + return is_compound(); + } +}; /**---------------------------------------------------------------------------* * @brief Indicates whether elements of `type` are compound. @@ -161,10 +164,31 @@ constexpr inline bool is_simple() { * @return false `type` is a simple type *---------------------------------------------------------------------------**/ constexpr inline bool is_compound(data_type type) { - return cudf::exp::type_dispatcher( - type, [](auto dummy) { return is_compound(); }, 0); + return cudf::exp::type_dispatcher(type, is_compound_impl{}); } +/**---------------------------------------------------------------------------* + * @brief Indicates whether the type `T` is a simple type. 
+ * + * "Simple" element types are implemented with only a single column, i.e., + * `num_children() == 0` for columns of "simple" elements + * + * @tparam T The type to verify + * @return true `T` corresponds to a simple type + * @return false `T` corresponds to a compound type + *---------------------------------------------------------------------------**/ +template +constexpr inline bool is_simple() { + return not is_compound(); +} + +struct is_simple_impl { + template + bool operator()() { + return is_simple(); + } +}; + /**---------------------------------------------------------------------------* * @brief Indicates whether elements of `type` are simple. * From e5f1c82ce3ed97de50e3fed5953deccbaf400aad Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 3 Oct 2019 12:57:55 -0500 Subject: [PATCH 46/54] Add string_view to traits. --- cpp/include/cudf/utilities/traits.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index a93cd3efba0..a8751d8471a 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -18,6 +18,7 @@ #include #include +#include #include From caaadeaee039843ae18cd483a6eeb0b0c9bf40a3 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 3 Oct 2019 13:18:02 -0500 Subject: [PATCH 47/54] Update column_view constructor to use is_compound. 
--- cpp/src/column/column_view.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 9b81a93c27a..701f0e3a7b8 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -44,7 +45,7 @@ column_view_base::column_view_base(data_type type, size_type size, CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); } - else if ( type.id() == STRING ){ // TODO change to is_compound(type) once type-dispatcher supports STRING + else if ( is_compound(type) ){ // TODO change to is_compound(type) once type-dispatcher supports STRING CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); } else if( size > 0){ CUDF_EXPECTS(nullptr != data, "Null data pointer."); From 825099cdb9236714703f0948fe24ecd0ae17bf8a Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 3 Oct 2019 13:19:57 -0500 Subject: [PATCH 48/54] Remove uneccessary include. 
--- cpp/src/column/column_view.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 701f0e3a7b8..0056a0151b3 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include From b80153fa6977a6c4bfd7e75bbc61e61000b76357 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Oct 2019 14:35:35 -0400 Subject: [PATCH 49/54] use is_compound() in column_view_base --- cpp/src/column/column_view.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 9b81a93c27a..4b84b1f5d18 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -44,7 +44,7 @@ column_view_base::column_view_base(data_type type, size_type size, CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); } - else if ( type.id() == STRING ){ // TODO change to is_compound(type) once type-dispatcher supports STRING + else if ( is_compound(type) ) { CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); } else if( size > 0){ CUDF_EXPECTS(nullptr != data, "Null data pointer."); From 6b3adb1fe75e0b3c67883b3d516d0049c0114dde Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Thu, 3 Oct 2019 14:12:15 -0500 Subject: [PATCH 50/54] Remove noexcept for size_of_helper degerenate case. 
--- cpp/src/column/column_factories.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 4d17c10290b..1d042d761ba 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -23,8 +23,7 @@ namespace cudf { namespace { struct size_of_helper { template - constexpr std::enable_if_t(), int> operator()() const - noexcept { + constexpr std::enable_if_t(), int> operator()() const { CUDF_FAIL("Invalid, non fixed-width element type."); } From d64110816e450970d4f05faf1801161b14c69bfe Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 3 Oct 2019 15:20:21 -0400 Subject: [PATCH 51/54] remove noexcept --- cpp/src/column/column_factories.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 4d17c10290b..1d042d761ba 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -23,8 +23,7 @@ namespace cudf { namespace { struct size_of_helper { template - constexpr std::enable_if_t(), int> operator()() const - noexcept { + constexpr std::enable_if_t(), int> operator()() const { CUDF_FAIL("Invalid, non fixed-width element type."); } From abd26a3fb2f6cb99401a451f59b72a77fd93802c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 4 Oct 2019 13:07:01 -0400 Subject: [PATCH 52/54] check for valid null mask --- cpp/src/strings/strings_column_factories.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index bd60fdaffaf..6ebbf716464 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -123,6 +123,9 @@ std::unique_ptr make_strings_column( size_type num_strings = offsets.size()-1; CUDF_EXPECTS( num_strings > 0, "strings count must be greater than 0"); CUDF_EXPECTS( null_count < 
num_strings, "null strings column not yet supported"); + if( null_count > 0 ) { + CUDF_EXPECTS( !valid_mask.empty(), "Cannot have null elements without a null mask." ); + } auto execpol = rmm::exec_policy(stream); size_type bytes = offsets.back() - offsets[0]; From 2d19ea65f191b70c89aefc2d25736febc24052ac Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 4 Oct 2019 15:57:10 -0400 Subject: [PATCH 53/54] fixed more typos per review --- cpp/include/cudf/column/column_device_view.cuh | 10 ++++++---- cpp/include/cudf/strings/string_view.cuh | 4 ++-- cpp/include/cudf/strings/string_view.inl | 2 +- cpp/src/column/column_device_view.cu | 5 +++-- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index fcae36e4ddb..18b46303c0c 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -209,8 +209,9 @@ class alignas(16) column_device_view : public detail::column_device_view_base { column_device_view& operator=(column_device_view&&) = default; /**---------------------------------------------------------------------------* - * @brief Creates an instance of this class in the specified host memory - * using the device memory pointer as a base for child pointers. + * @brief Creates an instance of this class using the specified host memory + * pointer (h_ptr) to store child objects and the device memory pointer (d_ptr) + * as a base for any child object pointers. * * @param column Column view from which to create this instance. * @param h_ptr Host memory pointer on which to place any child data. 
@@ -317,8 +318,9 @@ class alignas(16) mutable_column_device_view mutable_column_device_view& operator=(mutable_column_device_view&&) = default; /**---------------------------------------------------------------------------* - * @brief Creates an instance of this class in the specified host memory - * using the device memory pointer as a base for child pointers. + * @brief Creates an instance of this class using the specified host memory + * pointer (h_ptr) to store child objects and the device memory pointer (d_ptr) + * as a base for any child object pointers. * * @param column Column view from which to create this instance. * @param h_ptr Host memory pointer on which to place any child data. diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index befbef2b890..eb5113df517 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -26,8 +26,8 @@ namespace cudf using char_utf8 = uint32_t; /**---------------------------------------------------------------------------* - * @brief A non-owning, immutable view of device data that is variable length - * character array representing a UTF-8 string. The caller must maintain the + * @brief A non-owning, immutable view of device data that is a variable length + * char array representing a UTF-8 string. The caller must maintain the * device memory for the lifetime of this instance. * * It provides a simple wrapper and string operations for an individual string diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index 499661668b9..afbd9d962ae 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -23,7 +23,7 @@ using BYTE = uint8_t; /**---------------------------------------------------------------------------* * @brief Returns the number of bytes used to represent the provided byte. - * This could 0 to 4 bytes. 
0 is returned for intermediate bytes within a + * This could be 0 to 4 bytes. 0 is returned for intermediate bytes within a * single character. For example, for the two-byte 0xC3A8 single character, * the first byte would return 2 and the second byte would return 0. * diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 811048abe11..cef67f03b94 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -37,7 +37,8 @@ void column_device_view::destroy() { delete this; } -// For use with inplace-new to pre-fill memory to be copied to device +// Place any child objects in host memory (h_ptr) and use the device +// memory ptr (d_ptr) to set any child object pointers. column_device_view::column_device_view( column_view source, ptrdiff_t h_ptr, ptrdiff_t d_ptr ) : detail::column_device_view_base{source.type(), source.size(), source.head(), source.null_mask(), @@ -70,7 +71,7 @@ std::unique_ptr> co auto deleter = [](column_device_view* v) { v->destroy(); }; std::unique_ptr p{ new column_device_view(source), deleter}; - + if( num_children > 0 ) { // create device memory for the children From cda8b994e55ce7f1388c962827c42778fad5dfae Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 4 Oct 2019 16:12:09 -0400 Subject: [PATCH 54/54] use cudf::test::BaseFixture instead of GdfTest --- cpp/tests/strings/factories_test.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index d2fafda5e3e..a2866dbd445 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -17,16 +17,15 @@ #include #include #include +#include -#include #include -#include #include #include -struct FactoriesTest : public GdfTest {}; +struct FactoriesTest : public cudf::test::BaseFixture {}; TEST_F(FactoriesTest, CreateColumnFromArray) {