diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index ebb7bd40694ec5..7c791ab8a1b004 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -239,4 +239,420 @@ def ffill_indexer(ndarray[int64_t] indexer): return result -include "join_helper.pxi" +# ---------------------------------------------------------------------- +# left_join_indexer, inner_join_indexer, outer_join_indexer +# ---------------------------------------------------------------------- + +ctypedef fused join_t: + float64_t + float32_t + object + int32_t + int64_t + uint64_t + + +# Joins on ordered, unique indices + +# right might contain non-unique values + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + join_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +left_join_indexer_unique_float64 = left_join_indexer_unique["float64_t"] +left_join_indexer_unique_float32 = left_join_indexer_unique["float32_t"] +left_join_indexer_unique_object = left_join_indexer_unique["object"] +left_join_indexer_unique_int32 = left_join_indexer_unique["int32_t"] +left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"] +left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"] + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + join_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[join_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=left.dtype) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +left_join_indexer_float64 = left_join_indexer["float64_t"] +left_join_indexer_float32 = left_join_indexer["float32_t"] +left_join_indexer_object = left_join_indexer["object"] +left_join_indexer_int32 = left_join_indexer["int32_t"] +left_join_indexer_int64 = left_join_indexer["int64_t"] +left_join_indexer_uint64 = left_join_indexer["uint64_t"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + join_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[join_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=left.dtype) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +inner_join_indexer_float64 = inner_join_indexer["float64_t"] +inner_join_indexer_float32 = inner_join_indexer["float32_t"] +inner_join_indexer_object = inner_join_indexer["object"] +inner_join_indexer_int32 = inner_join_indexer["int32_t"] +inner_join_indexer_int64 = inner_join_indexer["int64_t"] +inner_join_indexer_uint64 = inner_join_indexer["uint64_t"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + join_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[join_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=left.dtype) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + + +outer_join_indexer_float64 = outer_join_indexer["float64_t"] +outer_join_indexer_float32 = outer_join_indexer["float32_t"] +outer_join_indexer_object = outer_join_indexer["object"] +outer_join_indexer_int32 = outer_join_indexer["int32_t"] +outer_join_indexer_int64 = outer_join_indexer["int64_t"] +outer_join_indexer_uint64 = outer_join_indexer["uint64_t"] diff --git a/pandas/_libs/join_helper.pxi.in b/pandas/_libs/join_helper.pxi.in deleted file mode 100644 index 35dedf90f8ca4c..00000000000000 --- a/pandas/_libs/join_helper.pxi.in +++ /dev/null @@ -1,424 +0,0 @@ -""" -Template for each `dtype` helper function for join - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -# ---------------------------------------------------------------------- -# left_join_indexer, inner_join_indexer, outer_join_indexer -# ---------------------------------------------------------------------- - -ctypedef fused join_t: - float64_t - float32_t - object - int32_t - int64_t - uint64_t - - -# Joins on ordered, unique indices - -# right might contain non-unique values - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - join_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -left_join_indexer_unique_float64 = left_join_indexer_unique["float64_t"] -left_join_indexer_unique_float32 = left_join_indexer_unique["float32_t"] -left_join_indexer_unique_object = left_join_indexer_unique["object"] -left_join_indexer_unique_int32 = left_join_indexer_unique["int32_t"] -left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"] -left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"] - - -{{py: - -# name, c_type, dtype -dtypes = [('float64', 'float64_t', 'np.float64'), - ('float32', 'float32_t', 'np.float32'), - ('object', 'object', 'object'), - ('int32', 'int32_t', 'np.int32'), - ('int64', 'int64_t', 'np.int64'), - ('uint64', 'uint64_t', 'np.uint64')] - -def get_dispatch(dtypes): - - for name, c_type, dtype in dtypes: - yield name, c_type, dtype - -}} - -{{for name, c_type, dtype in get_dispatch(dtypes)}} - - -# @cython.wraparound(False) -# @cython.boundscheck(False) -def left_join_indexer_{{name}}(ndarray[{{c_type}}] left, - ndarray[{{c_type}}] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - {{c_type}} lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[{{c_type}}] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype={{dtype}}) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_{{name}}(ndarray[{{c_type}}] left, - ndarray[{{c_type}}] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - {{c_type}} lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[{{c_type}}] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype={{dtype}}) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_{{name}}(ndarray[{{c_type}}] left, - ndarray[{{c_type}}] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - {{c_type}} lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[{{c_type}}] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype={{dtype}}) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -{{endfor}} diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 51c84d6e28cb46..55b4201f41b2ac 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -222,11 +222,21 @@ class Index(IndexOpsMixin, PandasObject): # To hand over control to subclasses _join_precedence = 1 - # Cython methods - _left_indexer_unique = libjoin.left_join_indexer_unique_object - _left_indexer = libjoin.left_join_indexer_object - _inner_indexer = libjoin.inner_join_indexer_object - _outer_indexer = libjoin.outer_join_indexer_object + # Cython methods; see github.com/cython/cython/issues/2647 + # for why we need to wrap these instead of making them class attributes + # Moreover, cython will choose the appropriate-dtyped sub-function + # given the dtypes of the passed arguments + def _left_indexer_unique(self, left, right): + return libjoin.left_join_indexer_unique(left, right) + + def _left_indexer(self, left, right): + return libjoin.left_join_indexer(left, right) + + def _inner_indexer(self, left, right): + return libjoin.inner_join_indexer(left, right) + + def _outer_indexer(self, left, right): + return libjoin.outer_join_indexer(left, right) _typ = 'index' _data = None diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 7f64fb744c682c..eabbb43d155f69 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,6 +1,5 @@ import numpy as np -from pandas._libs import (index as libindex, - join as libjoin) +from pandas._libs import index as libindex from pandas.core.dtypes.common import ( is_dtype_equal, pandas_dtype, @@ -185,10 +184,6 @@ class Int64Index(IntegerIndex): __doc__ = _num_index_shared_docs['class_descr'] % _int64_descr_args _typ = 'int64index' - _left_indexer_unique = libjoin.left_join_indexer_unique_int64 - _left_indexer = libjoin.left_join_indexer_int64 - _inner_indexer = libjoin.inner_join_indexer_int64 - _outer_indexer = libjoin.outer_join_indexer_int64 _can_hold_na = False _engine_type = libindex.Int64Engine _default_dtype = np.int64 @@ -243,10 +238,6 @@ class UInt64Index(IntegerIndex): __doc__ = _num_index_shared_docs['class_descr'] % _uint64_descr_args _typ = 'uint64index' - _left_indexer_unique = libjoin.left_join_indexer_unique_uint64 - _left_indexer = libjoin.left_join_indexer_uint64 - _inner_indexer = libjoin.inner_join_indexer_uint64 - _outer_indexer = libjoin.outer_join_indexer_uint64 _can_hold_na = False _engine_type = libindex.UInt64Engine _default_dtype = np.uint64 @@ -321,11 +312,6 @@ class Float64Index(NumericIndex): _typ = 'float64index' _engine_type = libindex.Float64Engine - _left_indexer_unique = libjoin.left_join_indexer_unique_float64 - _left_indexer = libjoin.left_join_indexer_float64 - _inner_indexer = libjoin.inner_join_indexer_float64 - _outer_indexer = libjoin.outer_join_indexer_float64 - _default_dtype = np.float64 @property diff --git a/setup.py b/setup.py index f31aaa7e79a0d0..adffddc61cbac3 100755 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ def is_platform_windows(): '_libs/algos_take_helper.pxi.in', '_libs/algos_rank_helper.pxi.in'], 'groupby': ['_libs/groupby_helper.pxi.in'], - 'join': ['_libs/join_helper.pxi.in', '_libs/join_func_helper.pxi.in'], + 'join': ['_libs/join_func_helper.pxi.in'], 'hashtable': ['_libs/hashtable_class_helper.pxi.in', '_libs/hashtable_func_helper.pxi.in'], 'index': ['_libs/index_class_helper.pxi.in'],