Spell check fixes #9682

Merged (3 commits) on Nov 18, 2021
Changes from 2 commits
4 changes: 2 additions & 2 deletions cpp/src/binaryop/compiled/binary_ops.cuh
@@ -117,7 +117,7 @@ struct ops_wrapper {
} else {
return BinaryOperator{}.template operator()<TypeCommon, TypeCommon>(x, y);
}
- // To supress nvcc warning
+ // To suppress nvcc warning
return std::invoke_result_t<BinaryOperator, TypeCommon, TypeCommon>{};
}();
if constexpr (is_bool_result<BinaryOperator, TypeCommon, TypeCommon>())
@@ -164,7 +164,7 @@ struct ops2_wrapper {
} else {
return BinaryOperator{}.template operator()<TypeLhs, TypeRhs>(x, y);
}
- // To supress nvcc warning
+ // To suppress nvcc warning
return std::invoke_result_t<BinaryOperator, TypeLhs, TypeRhs>{};
}();
if constexpr (is_bool_result<BinaryOperator, TypeLhs, TypeRhs>())
4 changes: 2 additions & 2 deletions cpp/src/groupby/sort/aggregate.cpp
@@ -559,7 +559,7 @@ auto column_view_with_common_nulls(column_view const& column_0, column_view cons
}

/**
- * @brief Perform covariance betweeen two child columns of non-nullable struct column.
+ * @brief Perform covariance between two child columns of non-nullable struct column.
*
*/
template <>
@@ -602,7 +602,7 @@ void aggregate_result_functor::operator()<aggregation::COVARIANCE>(aggregation c
};

/**
- * @brief Perform correlation betweeen two child columns of non-nullable struct column.
+ * @brief Perform correlation between two child columns of non-nullable struct column.
*
*/
template <>
2 changes: 1 addition & 1 deletion cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -79,7 +79,7 @@ void add_nested_columns(std::map<size_type, std::vector<size_type>>& selected_co
* @brief Adds the column with the given id to the mapping
*
* All nested columns and direct ancestors of column `id` are included.
- * Columns that are not on the direct path are excluded, which may result in prunning.
+ * Columns that are not on the direct path are excluded, which may result in pruning.
*/
void add_column_to_mapping(std::map<size_type, std::vector<size_type>>& selected_columns,
metadata const& metadata,
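The pruning comment fixed above describes path-based selection: a requested column keeps all of its descendants and its direct ancestors, while siblings off that path are dropped from the read. A minimal Python sketch of that rule (hypothetical column names and tree shape, not cuDF's actual metadata structures):

```python
# Keep `target`, its dotted-prefix ancestors, and everything below it;
# anything else is pruned from the read.
def select_columns(children, target):
    parts = target.split(".")
    keep = {".".join(parts[:i]) for i in range(1, len(parts) + 1)}  # ancestors + target
    stack = [target]
    while stack:  # walk down to collect all nested columns
        node = stack.pop()
        keep.add(node)
        stack.extend(children.get(node, []))
    return keep

children = {"a": ["a.b", "a.c"], "a.b": ["a.b.d"], "a.c": [], "a.b.d": []}
print(sorted(select_columns(children, "a.b")))  # ['a', 'a.b', 'a.b.d'] -- 'a.c' pruned
```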
2 changes: 1 addition & 1 deletion cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -119,7 +119,7 @@ class aggregate_orc_metadata {
* @brief Filters ORC file to a selection of columns, based on their paths in the file.
*
* Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is
- * ommited to match the cuDF table hierarchy.
+ * omitted to match the cuDF table hierarchy.
*
* @param column_paths List of full column names (i.e. paths) to select from the ORC file
* @return Columns hierarchy - lists of children columns and sorted columns in each nesting level
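For context on the path format this docstring fixes: the ORC schema has a synthetic struct column at the root that wraps the whole table, and cuDF's table hierarchy has no such node, so user-facing paths drop it. A tiny illustration (the `<root>` label is a stand-in, not an actual ORC identifier):

```python
# ORC's schema nests every column under a synthetic root struct;
# cuDF paths omit that root so the two hierarchies line up.
orc_schema_path = ["<root>", "grandparent_col", "parent_col", "child_col"]
cudf_path = ".".join(orc_schema_path[1:])  # root omitted
print(cudf_path)  # grandparent_col.parent_col.child_col
```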
4 changes: 2 additions & 2 deletions cpp/src/io/orc/stripe_enc.cu
@@ -654,7 +654,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s,
auto const mask_byte = get_mask_byte(column.null_mask(), column.offset());
auto dst_offset = offset + s->nnz;
auto vbuf_bit_idx = [](int row) {
- // valid_buf is a circular buffer with validitiy of 8 rows in each element
+ // valid_buf is a circular buffer with validity of 8 rows in each element
return row % (encode_block_size * 8);
};
if (dst_offset % 8 == 0 and pd_set_cnt == 8) {
@@ -690,7 +690,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s,
ByteRLE<CI_PRESENT, 0x1ff>(s, s->valid_buf, s->present_out / 8, nbytes_out, flush, t) * 8;

if (!t) {
- // Number of rows enocoded so far
+ // Number of rows encoded so far
s->present_out += nrows_encoded;
s->numvals -= min(s->numvals, nrows_encoded);
}
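The corrected comment refers to `valid_buf`, a circular buffer in which each element packs the validity of 8 rows. A minimal Python model of the indexing arithmetic — the `encode_block_size` of 512 below is an assumed stand-in, not a value taken from this diff:

```python
encode_block_size = 512  # assumed stand-in for the kernel's block size

def vbuf_bit_idx(row):
    # One validity bit per row; the buffer holds encode_block_size
    # elements of 8 rows each, then wraps around.
    return row % (encode_block_size * 8)

for row in (0, 7, 4095, 4096):
    bit = vbuf_bit_idx(row)
    print(row, "-> element", bit // 8, "bit", bit % 8)
# row 4096 wraps back to element 0, bit 0
```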
2 changes: 1 addition & 1 deletion cpp/src/io/orc/writer_impl.cu
@@ -1421,7 +1421,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table,
}
}
if (col.orc_kind() == LIST or col.orc_kind() == MAP) {
- // Need a new pushdown mask unless both the parent and current colmn are not nullable
+ // Need a new pushdown mask unless both the parent and current column are not nullable
auto const child_col = orc_table.column(col.child_begin()[0]);
// pushdown mask applies to child column(s); use the child column size
pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream);
4 changes: 2 additions & 2 deletions cpp/src/io/parquet/parquet_gpu.hpp
@@ -307,7 +307,7 @@ struct EncColumnChunk {
statistics_chunk const* stats; //!< Fragment statistics
uint32_t bfr_size; //!< Uncompressed buffer size
uint32_t compressed_size; //!< Compressed buffer size
- uint32_t max_page_data_size; //!< Max data size (excuding header) of any page in this chunk
+ uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk
uint32_t page_headers_size; //!< Sum of size of all page headers
uint32_t start_row; //!< First row of chunk
uint32_t num_rows; //!< Number of rows in chunk
@@ -489,7 +489,7 @@ void InitFragmentStatistics(cudf::detail::device_2dspan<statistics_group> groups
/**
* @brief Initialize per-chunk hash maps used for dictionary with sentinel values
*
- * @param chunks Flat span of chunks to intialize hash maps for
+ * @param chunks Flat span of chunks to initialize hash maps for
* @param stream CUDA stream to use
*/
void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream);
2 changes: 1 addition & 1 deletion cpp/src/io/text/multibyte_split.cu
@@ -260,7 +260,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour

// Seeding the tile state with an identity value allows the 0th tile to follow the same logic as
// the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block
- // would have to follow seperate logic.
+ // would have to follow separate logic.
multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>( //
tile_multistates,
tile_offsets,
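The comment above describes a standard trick in tiled scan kernels: seed the tile-state slot before tile 0 with the operation's identity, so every tile, including the first, can read its predecessor's inclusive prefix through the same code path. A toy sequential model of that handoff (plain Python, not the CUDA implementation):

```python
# Each "tile" reduces its chunk and publishes an inclusive prefix;
# tile N combines its local result with tile N-1's published prefix.
# Seeding slot -1 with the identity (0 for +) removes the special case
# that tile 0 would otherwise need.
def scan_tiles(chunks):
    inclusive = {-1: 0}  # the seeded identity value
    for tile_id, chunk in enumerate(chunks):
        inclusive[tile_id] = inclusive[tile_id - 1] + sum(chunk)
    return [inclusive[i] for i in range(len(chunks))]

print(scan_tiles([[1, 2], [3], [4, 5]]))  # [3, 6, 15]
```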
4 changes: 2 additions & 2 deletions cpp/src/lists/drop_list_duplicates.cu
@@ -67,7 +67,7 @@ struct has_negative_nans_fn {
* @brief A structure to be used along with type_dispatcher to check if a column has any
* negative NaN value.
*
- * This functor is neccessary because when calling to segmented sort on the list entries, the
+ * This functor is necessary because when calling to segmented sort on the list entries, the
* negative NaN and positive NaN values (if both exist) are separated to the two ends of the output
* lists. We want to move all NaN values close together in order to call unique_copy later on.
*/
@@ -563,7 +563,7 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> drop_list_duplicates
values ? cudf::empty_like(values.value().parent()) : nullptr};
}

- // The child column conotaining list entries.
+ // The child column containing list entries.
auto const keys_child = keys.get_sliced_child(stream);

// Generate a mapping from list entries to their 1-based list indices for the keys column.
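Background for the functor described above: NaN carries a sign bit, so a sort that keys on raw bit patterns can place negative NaNs and positive NaNs at opposite ends of each list even though they compare as unordered. A small Python demonstration that the sign bit survives on NaN (illustrative only; cuDF's segmented sort is its own code path):

```python
import math

neg_nan, pos_nan = float("-nan"), float("nan")
assert math.isnan(neg_nan) and math.isnan(pos_nan)

# copysign exposes the sign bit, which bitwise comparisons would key on:
print(math.copysign(1.0, neg_nan))  # -1.0
print(math.copysign(1.0, pos_nan))  # 1.0

# Normalizing NaN signs first keeps all NaNs in one contiguous run, so a
# later unique_copy-style pass can collapse them together:
def normalize(x):
    return math.copysign(x, 1.0) if math.isnan(x) else x
```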
2 changes: 1 addition & 1 deletion cpp/src/rolling/rolling_detail.cuh
@@ -722,7 +722,7 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre
}

// STD aggregations depends on VARIANCE aggregation. Each element is applied
- // with sqaured-root in the finalize() step.
+ // with square-root in the finalize() step.
std::vector<std::unique_ptr<aggregation>> visit(data_type,
cudf::detail::std_aggregation const& agg) override
{
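The dependency this comment records — STD runs the VARIANCE aggregation and applies a square root in `finalize()` — is the textbook relation, sketched here in plain Python:

```python
import math

xs = [1.0, 2.0, 4.0]
mean = sum(xs) / len(xs)
var = sum((x - mean) ** 2 for x in xs) / (len(xs) - 1)  # sample variance (ddof=1)
std = math.sqrt(var)  # the finalize() step
print(round(std, 3))  # 1.528
```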
2 changes: 1 addition & 1 deletion cpp/tests/column/column_view_shallow_test.cpp
@@ -84,7 +84,7 @@ TYPED_TEST_SUITE(ColumnViewShallowTests, AllTypes);
// Test for fixed_width, dict, string, list, struct
// column_view, column_view = same hash.
// column_view, make a copy = same hash.
- // new column_view from colmn = same hash
+ // new column_view from column = same hash
// column_view, copy column = diff hash
// column_view, diff column = diff hash.
//
4 changes: 2 additions & 2 deletions cpp/tests/datetime/datetime_ops_test.cpp
@@ -742,7 +742,7 @@ TEST_F(BasicDatetimeOpsTest, TestIsLeapYear)
707904541L, // 1992-06-07 08:09:01 GMT - leap year
-2181005247L, // 1900-11-20 09:12:33 GMT - non leap year
0L, // UNIX EPOCH 1970-01-01 00:00:00 GMT - non leap year
- -12212553600L, // First full year of Gregorian Calandar 1583-01-01 00:00:00 - non-leap-year
+ -12212553600L, // First full year of Gregorian Calendar 1583-01-01 00:00:00 - non-leap-year
0L, // null
13591632822L, // 2400-09-13 13:33:42 GMT - leap year
4539564243L, // 2113-11-08 06:04:03 GMT - non leap year
@@ -811,7 +811,7 @@ TEST_F(BasicDatetimeOpsTest, TestQuarter)
707904541L, // 1992-06-07 08:09:01 GMT
-2181005247L, // 1900-11-20 09:12:33 GMT
0L, // UNIX EPOCH 1970-01-01 00:00:00 GMT
- -12212553600L, // First full year of Gregorian Calandar 1583-01-01 00:00:00
+ -12212553600L, // First full year of Gregorian Calendar 1583-01-01 00:00:00
0L, // null
13591632822L, // 2400-09-13 13:33:42 GMT
4539564243L, // 2113-11-08 06:04:03 GMT
6 changes: 3 additions & 3 deletions cpp/tests/transform/row_bit_count_test.cu
@@ -229,7 +229,7 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock)
// Tests that `row_bit_count()` can handle struct<list<int32_t>> with more
// than max_block_size (256) rows.
// With a large number of rows, computation spills to multiple thread-blocks,
- // thus exercising the branch-stack comptutation.
+ // thus exercising the branch-stack computation.
// The contents of the input column aren't as pertinent to this test as the
// column size. For what it's worth, it looks as follows:
// [ struct({0,1}), struct({2,3}), struct({4,5}), ... ]
@@ -363,7 +363,7 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> build_nested_and_exp
// Inner list column
// clang-format off
cudf::test::lists_column_wrapper<int> list{
-{1, 2, 3, 4, 5},
+{1, 2, 3, 4, 5},
{6, 7, 8},
{33, 34, 35, 36, 37, 38, 39},
{-1, -2},
@@ -409,7 +409,7 @@ std::unique_ptr<column> build_nested_column(std::vector<bool> const& struct_vali

// Inner list column
// clang-format off
-cudf::test::lists_column_wrapper<int> list{
+cudf::test::lists_column_wrapper<int> list{
{{1, 2, 3, 4, 5}, {2, 3}},
{{6, 7, 8}, {8, 9}},
{{1, 2}, {3, 4, 5}, {33, 34, 35, 36, 37, 38, 39}}};
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/column.py
@@ -337,7 +337,7 @@ def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray":
else:
return self.dropna(drop_nan=False).data_array_view

- # TODO: This method is decpreated and can be removed when the associated
+ # TODO: This method is deprecated and can be removed when the associated
# Frame methods are removed.
def to_array(self, fillna=None) -> np.ndarray:
"""Get a dense numpy array for the data.
@@ -1851,7 +1851,7 @@ def as_column(

arbitrary = np.asarray(arbitrary)

- # Handle case that `arbitary` elements are cupy arrays
+ # Handle case that `arbitrary` elements are cupy arrays
if (
shape
and shape[0]
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/datetime.py
@@ -540,7 +540,7 @@ def infer_format(element: str, **kwargs) -> str:
if len(second_parts) > 1:
# "Z" indicates Zulu time(widely used in aviation) - Which is
# UTC timezone that currently cudf only supports. Having any other
- # unsuppported timezone will let the code fail below
+ # unsupported timezone will let the code fail below
# with a ValueError.
second_parts.remove("Z")
second_part = "".join(second_parts[1:])
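On the "Z" handling described above: Zulu time is plain UTC, so stripping the suffix and attaching an explicit UTC offset loses nothing. A standard-library illustration (not cudf's parsing code):

```python
from datetime import datetime, timezone

ts = "2021-11-18T08:09:01Z"
# "Z" is equivalent to +00:00: drop it, then attach UTC explicitly.
parsed = datetime.strptime(ts.replace("Z", ""), "%Y-%m-%dT%H:%M:%S")
print(parsed.replace(tzinfo=timezone.utc).isoformat())
# 2021-11-18T08:09:01+00:00
```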
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/decimal.py
@@ -160,7 +160,7 @@ def binary_operator(self, op, other, reflect=False):
if reflect:
self, other = other, self

- # Binary Arithmatics between decimal columns. `Scale` and `precision`
+ # Binary Arithmetics between decimal columns. `Scale` and `precision`
# are computed outside of libcudf
if op in ("add", "sub", "mul", "div"):
scale = _binop_scale(self.dtype, other.dtype, op)
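The comment above notes that the result `scale` and `precision` of decimal binops are computed outside libcudf (see the `_binop_scale` call in this hunk). The usual fixed-point scale rules can be seen with Python's `decimal` module — an illustration of the convention, not of cudf's exact rules:

```python
from decimal import Decimal

a = Decimal("1.23")   # scale 2
b = Decimal("0.400")  # scale 3
print(a + b)  # 1.630   -> addition keeps the larger scale: max(2, 3)
print(a * b)  # 0.49200 -> multiplication adds the scales: 2 + 3
```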
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
@@ -6368,7 +6368,7 @@ def wrapper(self, other, axis="columns", level=None, fill_value=None):
# __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature
# string of a function by recursively delving into __wrapped__ until
# it hits the first function that has __signature__ attribute set. To make
- # the signature stirng of `wrapper` matches with its actual parameter list,
+ # the signature string of `wrapper` matches with its actual parameter list,
# we directly set the __signature__ attribute of `wrapper` below.

new_sig = inspect.signature(
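The corrected comment describes real CPython behavior: `inspect.signature` follows the `__wrapped__` chain (set by `functools.wraps`) until it reaches a function with `__signature__` set. A self-contained sketch of why the explicit `__signature__` assignment is needed:

```python
import functools
import inspect

def add(a, b, *, scale=1):
    return (a + b) * scale

@functools.wraps(add)  # copies metadata, including __wrapped__ = add
def wrapper(*args, **kwargs):
    return add(*args, **kwargs)

# signature() recurses through __wrapped__ and reports add's parameters:
print(inspect.signature(wrapper))  # (a, b, *, scale=1)

# Setting __signature__ stops the recursion, so the reported signature
# matches the wrapper's actual parameter list:
wrapper.__signature__ = inspect.signature(lambda *args, **kwargs: None)
print(inspect.signature(wrapper))  # (*args, **kwargs)
```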
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/groupby/groupby.py
@@ -18,7 +18,7 @@


# The three functions below return the quantiles [25%, 50%, 75%]
- # respectively, which are called in the describe() method to ouput
+ # respectively, which are called in the describe() method to output
# the summary stats of a GroupBy object
def _quantile_25(x):
return x.quantile(0.25)
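These helpers pin one quartile each because `describe()` reports fixed cut points. The pandas-equivalent call, for illustration:

```python
import pandas as pd

df = pd.DataFrame({"k": ["a", "a", "b"], "v": [1.0, 3.0, 5.0]})
print(df.groupby("k")["v"].quantile(0.25))
# k
# a    1.5
# b    5.0
```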
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/index.py
@@ -623,7 +623,7 @@ def _union(self, other, sort=None):
else:
return result

- # If all the above optimizations don't cater to the inpputs,
+ # If all the above optimizations don't cater to the inputs,
# we materialize RangeIndex's into `Int64Index` and
# then perform `union`.
return Int64Index(self._values)._union(other, sort=sort)
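The fallback described above, in miniature: when no range-arithmetic shortcut applies, both sides are materialized and merged like ordinary integer indexes (a toy model, not `Int64Index._union`):

```python
# Unioning two strided ranges rarely yields another range, so the
# general path materializes and merges them.
a, b = range(0, 10, 2), range(1, 8, 3)
print(sorted(set(a) | set(b)))  # [0, 1, 2, 4, 6, 7, 8]
```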
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/multiindex.py
@@ -970,7 +970,7 @@ def _concat(cls, objs):

source_data = [o.to_frame(index=False) for o in objs]

- # TODO: Verify if this is really necesary or if we can rely on
+ # TODO: Verify if this is really necessary or if we can rely on
# DataFrame._concat.
if len(source_data) > 1:
colnames = source_data[0].columns
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/series.py
@@ -2916,7 +2916,7 @@ def unique(self):

def nunique(self, method="sort", dropna=True):
"""Returns the number of unique values of the Series: approximate version,
- and exact version to be moved to libgdf
+ and exact version to be moved to libcudf

Excludes NA values by default.

@@ -2985,7 +2985,7 @@ def value_counts(

Returns
-------
- result : Series contanining counts of unique values.
+ result : Series containing counts of unique values.

See also
--------
@@ -3802,7 +3802,7 @@ def wrapper(self, other, level=None, fill_value=None, axis=0):
# __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature
# string of a function by recursively delving into __wrapped__ until
# it hits the first function that has __signature__ attribute set. To make
- # the signature stirng of `wrapper` matches with its actual parameter list,
+ # the signature string of `wrapper` matches with its actual parameter list,
# we directly set the __signature__ attribute of `wrapper` below.

new_sig = inspect.signature(
@@ -4989,7 +4989,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False):
def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
"""Returns a boolean array where two arrays are equal within a tolerance.

- Two values in ``a`` and ``b`` are considiered equal when the following
+ Two values in ``a`` and ``b`` are considered equal when the following
equation is satisfied.

.. math::
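The docstring's math block is cut off by the diff view; for reference, the numpy-style criterion this docstring presumably states is:

```latex
|a - b| \le \mathrm{atol} + \mathrm{rtol} \cdot |b|
```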
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/udf/pipeline.py
@@ -316,7 +316,7 @@ def compile_or_get(frame, func, args):
Return a compiled kernel in terms of MaskedTypes that launches a
kernel equivalent of `f` for the dtypes of `df`. The kernel uses
a thread for each row and calls `f` using that rows data / mask
- to produce an output value and output valdity for each row.
+ to produce an output value and output validity for each row.

If the UDF has already been compiled for this requested dtypes,
a cached version will be returned instead of running compilation.
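A toy model of the per-row contract this docstring describes — every row yields an output value and an output validity, and invalid inputs propagate (plain Python, not the generated CUDA kernel):

```python
def masked_add_one(value, valid):
    if not valid:
        return 0, False  # value is a don't-care when the row is invalid
    return value + 1, True

rows = [(1, True), (7, False), (3, True)]  # (data, mask) per row
print([masked_add_one(v, m) for v, m in rows])
# [(2, True), (0, False), (4, True)]
```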
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/udf/typing.py
@@ -67,7 +67,7 @@ def unify(self, context, other):
"""
Often within a UDF an instance arises where a variable could
be a `MaskedType`, an `NAType`, or a literal based off
- the data at runtime, for examplem the variable `ret` here:
+ the data at runtime, for example the variable `ret` here:

def f(x):
if x == 1:
@@ -185,7 +185,7 @@ class NAType(types.Type):
"""
A type for handling ops against nulls
Exists so we can:
- 1. Teach numba that all occurances of `cudf.NA` are
+ 1. Teach numba that all occurrences of `cudf.NA` are
to be read as instances of this type instead
2. Define ops like `if x is cudf.NA` where `x` is of
type `Masked` to mean `if x.valid is False`
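Completing the truncated docstring example with a hypothetical UDF of the shape it describes — `ret` is a plain value on one branch and `cudf.NA` on the other, so numba's type inference must unify `MaskedType` with `NAType`:

```python
import cudf

def f(x):
    if x == 1:
        ret = x + 1    # this branch makes ret a MaskedType
    else:
        ret = cudf.NA  # this branch makes ret an NAType
    return ret         # unify() resolves both to MaskedType
```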
2 changes: 1 addition & 1 deletion python/cudf/cudf/testing/testing.py
@@ -410,7 +410,7 @@ def assert_series_equal(
Whether to check the Index class, dtype and inferred_type
are identical.
check_series_type : bool, default True
- Whether to check the seires class, dtype and
+ Whether to check the series class, dtype and
inferred_type are identical. Currently it is idle,
and similar to pandas.
check_less_precise : bool or int, default False
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_binops.py
@@ -1173,7 +1173,7 @@ def make_scalar_product_data():
)
)

- # we can muliply any timedelta by any int, or bool
+ # we can multiply any timedelta by any int, or bool
valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | BOOL_TYPES))

# we can multiply a float by any int, float, or bool
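The rule this test data encodes also holds for numpy scalars (bools multiply as 0/1):

```python
import numpy as np

td = np.timedelta64(3, "s")
print(td * 4)     # 12 seconds: timedelta * int is well-defined
print(td * True)  # 3 seconds: bool acts as the integer 1
# Multiplying two timedelta64 values raises TypeError: the product of
# two durations has no meaningful unit.
```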
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_custom_accessor.py
@@ -44,7 +44,7 @@ def test_dataframe_accessor(gdf):
"gdf2", [gd.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})]
)
def test_dataframe_accessor_idendity(gdf1, gdf2):
-    """Test for accessor idendity
+    """Test for accessor identities
- An object should hold persistent reference to the same accessor
- Different objects should hold difference instances of the accessor
"""
2 changes: 1 addition & 1 deletion python/cudf/cudf/tests/test_datetime.py
@@ -171,7 +171,7 @@ def test_dt_ops(data):
assert_eq(pd_data > pd_data, gdf_data > gdf_data)


- # libgdf doesn't respect timezones
+ # libcudf doesn't respect timezones
@pytest.mark.parametrize("data", [data1()])
@pytest.mark.parametrize("field", fields)
def test_dt_series(data, field):
10 changes: 5 additions & 5 deletions python/cudf/cudf/tests/test_multiindex.py
@@ -738,9 +738,9 @@ def test_multiindex_copy_sem(data, levels, codes, names):
)
@pytest.mark.parametrize("deep", [True, False])
def test_multiindex_copy_deep(data, deep):
-    """Test memory idendity for deep copy
+    """Test memory identity for deep copy
Case1: Constructed from GroupBy, StringColumns
-    Case2: Constrcuted from MultiIndex, NumericColumns
+    Case2: Constructed from MultiIndex, NumericColumns
"""
same_ref = not deep

@@ -768,19 +768,19 @@ def test_multiindex_copy_deep(data, deep):
mi1 = data
mi2 = mi1.copy(deep=deep)

- # Assert ._levels idendity
+ # Assert ._levels identity
lptrs = [lv._data._data[None].base_data.ptr for lv in mi1._levels]
rptrs = [lv._data._data[None].base_data.ptr for lv in mi2._levels]

assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])

- # Assert ._codes idendity
+ # Assert ._codes identity
lptrs = [c.base_data.ptr for _, c in mi1._codes._data.items()]
rptrs = [c.base_data.ptr for _, c in mi2._codes._data.items()]

assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])

- # Assert ._data idendity
+ # Assert ._data identity
lptrs = [d.base_data.ptr for _, d in mi1._data.items()]
rptrs = [d.base_data.ptr for _, d in mi2._data.items()]

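The `same_ref = not deep` relation asserted above mirrors ordinary Python copy semantics — a shallow copy shares the underlying buffers, a deep copy does not:

```python
import copy

xs = [[1, 2], [3, 4]]
shallow, deep = copy.copy(xs), copy.deepcopy(xs)
print(shallow[0] is xs[0])  # True: same underlying objects
print(deep[0] is xs[0])     # False: new objects allocated
```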