
Improve performance of apriori (rasbt#619)
* minor performance improvements in apriori

old_combination is sorted, so its max() is its last element.
Since items_types_in_previous_step is a NumPy array, we can find all
valid elements with a single vectorized call, which makes the inner loop shorter.
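A minimal sketch of the idea, with made-up item ids (not the library's internals):

```
import numpy as np

# Item ids seen in the previous step, and one sorted combination.
items_types_in_previous_step = np.array([3, 5, 8, 11])
old_combination = np.array([3, 8])

# Before: a Python-level scan guarded by max().
slow = [item for item in items_types_in_previous_step
        if item > max(old_combination)]

# After: the maximum is simply the last element, and a single vectorized
# comparison selects every valid item at once.
fast = items_types_in_previous_step[items_types_in_previous_step
                                    > old_combination[-1]]

assert slow == list(fast)  # both give [11]
```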

* minor performance improvements in apriori

Let generate_new_combinations return ints instead of tuples,
and collect them with np.fromiter.

This is slower with low_memory=True; that will be fixed by the next commit.
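A sketch of the new collection pattern, with made-up candidate itemsets:

```
import numpy as np

def flat_candidates():
    # New style: yield plain ints rather than one tuple per candidate.
    for combo in ((15, 20, 22), (17, 19, 20)):
        yield from combo

# np.fromiter consumes the flat stream without first building a list of
# tuples; reshape then restores one candidate itemset per row.
combin = np.fromiter(flat_candidates(), dtype=int).reshape(-1, 3)
# combin -> [[15 20 22]
#            [17 19 20]]
```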

* improve performance of apriori with low_memory=True

Verbose output has to be modified, since we loop on valid combinations
only.

Performance is now equivalent to, or better than, the version with low_memory=False.

Adjust test_fpbase.py output.
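A sketch of why the message changed, with hypothetical support counts: the generator now applies the support threshold itself, so the caller sees only surviving combinations and prints one batch count per itemset size instead of a running iteration counter.

```
import numpy as np

supports = np.array([5, 2, 7, 1])  # hypothetical counts for four candidates
threshold = 0.5 * 8                # min_support * number of transactions

valid_indices = (supports >= threshold).nonzero()[0]
print('Processing %d combinations' % len(valid_indices))  # -> 2
```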

* fixes for flake8

* speed up valid_input_check for boolean dataframes

If all columns are boolean, there is nothing to check.

In apriori.py, call valid_input_check.
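The test itself is just a dtype check; a minimal sketch, with a hypothetical helper name:

```
import pandas as pd

def needs_elementwise_check(df):
    # All-bool columns can only hold True/False, so the expensive
    # elementwise scan of df.values is unnecessary.
    return not (df.dtypes == bool).all()

df_bool = pd.DataFrame({'Apple': [True, False], 'Beer': [True, True]})
df_int = pd.DataFrame({'Apple': [1, 0], 'Beer': [1, 1]})
print(needs_elementwise_check(df_bool))  # False -> fast path
print(needs_elementwise_check(df_int))   # True  -> full scan
```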

* add changelog entry

Replace 0/1 with False/True in the docstrings of apriori, fpgrowth and fpmax
to promote the use of boolean arrays.
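DataFrames built with TransactionEncoder already have boolean columns, so they take the fast validation path automatically:

```
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

dataset = [['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Milk', 'Beer', 'Rice']]

te = TransactionEncoder()
df = pd.DataFrame(te.fit(dataset).transform(dataset), columns=te.columns_)
print(df.dtypes.unique())  # [dtype('bool')]
```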
dbarbier authored and rasbt committed Nov 3, 2019
1 parent fe0e22a commit 2f928cb
Showing 6 changed files with 145 additions and 82 deletions.
2 changes: 1 addition & 1 deletion docs/sources/CHANGELOG.md
@@ -21,7 +21,7 @@ The CHANGELOG for the current development version is available at

##### Changes

- - -
+ - Improve the runtime performance for the `apriori` frequent itemset generating function when `low_memory=True`. Setting `low_memory=False` (default) is still faster for small itemsets, but `low_memory=True` can be much faster for large itemsets and requires less memory. Also, input validation for `apriori`, `fpgrowth` and `fpmax` takes a significant amount of time when the input pandas DataFrame is large; this is now dramatically reduced when the input contains boolean values (rather than 0s/1s), which is the case when using `TransactionEncoder`. ([#619](https://github.com/rasbt/mlxtend/pull/619) via [Denis Barbier](https://github.com/dbarbier))

##### Bug Fixes
- Fixes a bug in `mlxtend.plotting.plot_pca_correlation_graph` that caused the explained variances to not sum to 1. Also, improves the runtime performance of the correlation computation and adds a missing function argument for the explained variances (eigenvalues) if users provide their own principal components. ([#593](https://github.com/rasbt/mlxtend/issues/593) via [Gabriel Azevedo Ferreira](https://github.com/Gabriel-Azevedo-Ferreira))
171 changes: 116 additions & 55 deletions mlxtend/frequent_patterns/apriori.py
@@ -6,6 +6,7 @@

 import numpy as np
 import pandas as pd
+from ..frequent_patterns import fpcommon as fpc


def generate_new_combinations(old_combinations):
@@ -30,8 +31,74 @@ def generate_new_combinations(old_combinations):
Returns
-----------
     Generator of all combinations from the last step x items
-    from the previous step. Every combination is a tuple
-    of item type ids in the ascending order.
+    from the previous step.
Examples
-----------
For usage examples, please see
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/
"""

    items_types_in_previous_step = np.unique(old_combinations.flatten())
    for old_combination in old_combinations:
        max_combination = old_combination[-1]
        mask = items_types_in_previous_step > max_combination
        valid_items = items_types_in_previous_step[mask]
        old_tuple = tuple(old_combination)
        for item in valid_items:
            yield from old_tuple
            yield item
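A small usage sketch of the reworked generator, reusing the item ids from the docstring example above:

```
import numpy as np

old_combinations = np.array([[15, 20], [15, 22], [17, 19]])
flat = np.fromiter(generate_new_combinations(old_combinations), dtype=int)
candidates = flat.reshape(-1, 3)
# candidates -> [[15 20 22]
#                [17 19 20]
#                [17 19 22]]
```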


def generate_new_combinations_low_memory(old_combinations, X, min_support,
is_sparse):
"""
Generator of all combinations based on the last state of the Apriori algorithm
Parameters
-----------
old_combinations: np.array
All combinations with enough support in the last step
Combinations are represented by a matrix.
Number of columns is equal to the combination size
of the previous step.
Each row represents one combination
and contains item type ids in ascending order
```
0 1
0 15 20
1 15 22
2 17 19
```
X: np.array or scipy sparse matrix
The allowed values are either 0/1 or True/False.
For example,
```
0 True False True True False True
1 True False True False False True
2 True False True False False False
3 True True False False False False
4 False False True True True True
5 False False True False True True
6 False False True False True False
7 True True False False False False
```
min_support : float (default: 0.5)
A float between 0 and 1 for minimum support of the itemsets returned.
The support is computed as the fraction
`transactions_where_item(s)_occur / total_transactions`.
is_sparse : bool
    True if `X` is a sparse matrix
Returns
-----------
Generator of all combinations from the last step x items
from the previous step. Every combination contains the
number of transactions where this itemset occurs, followed
by item type ids in ascending order.
Combinations that are not generated have no chance of
reaching the minimum support.
@@ -43,12 +110,25 @@ def generate_new_combinations(old_combinations):
"""

     items_types_in_previous_step = np.unique(old_combinations.flatten())
+    rows_count = X.shape[0]
+    threshold = min_support * rows_count
     for old_combination in old_combinations:
-        max_combination = max(old_combination)
-        for item in items_types_in_previous_step:
-            if item > max_combination:
-                res = tuple(old_combination) + (item,)
-                yield res
+        max_combination = old_combination[-1]
+        mask = items_types_in_previous_step > max_combination
+        valid_items = items_types_in_previous_step[mask]
+        old_tuple = tuple(old_combination)
+        if is_sparse:
+            mask_rows = X[:, old_tuple].toarray().all(axis=1)
+            X_cols = X[:, valid_items].toarray()
+            supports = X_cols[mask_rows].sum(axis=0)
+        else:
+            mask_rows = X[:, old_tuple].all(axis=1)
+            supports = X[mask_rows][:, valid_items].sum(axis=0)
+        valid_indices = (supports >= threshold).nonzero()[0]
+        for index in valid_indices:
+            yield supports[index]
+            yield from old_tuple
+            yield valid_items[index]
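To make the stream layout concrete, a sketch with made-up support counts: each record is one support count followed by the item ids, so after reshaping, column 0 holds the counts.

```
import numpy as np

# (15, 20, 22) occurs in 5 transactions, (17, 19, 20) in 4.
flat = np.array([5, 15, 20, 22, 4, 17, 19, 20])
combin = flat.reshape(-1, 4)     # itemset size 3 plus one support column
supports = combin[:, 0] / 8.0    # counts divided by number of transactions
itemsets = combin[:, 1:]
```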


def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
Expand All @@ -63,15 +143,15 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
For example,
```
-    Apple  Bananas  Beer  Chicken  Milk  Rice
- 0      1        0     1        1     0     1
- 1      1        0     1        0     0     1
- 2      1        0     1        0     0     0
- 3      1        1     0        0     0     0
- 4      0        0     1        1     1     1
- 5      0        0     1        0     1     1
- 6      0        0     1        0     1     0
- 7      1        1     0        0     0     0
+     Apple  Bananas   Beer  Chicken   Milk   Rice
+ 0    True    False   True     True  False   True
+ 1    True    False   True    False  False   True
+ 2    True    False   True    False  False  False
+ 3    True     True  False    False  False  False
+ 4   False    False   True     True   True   True
+ 5   False    False   True    False   True   True
+ 6   False    False   True    False   True  False
+ 7    True     True  False    False  False  False
```
min_support : float (default: 0.5)
@@ -148,21 +228,10 @@ def _support(_x, _n_rows, _is_sparse):
'number within the interval `(0, 1]`. '
'Got %s.' % min_support)

-    idxs = np.where((df.values != 1) & (df.values != 0))
-    if len(idxs[0]) > 0:
-        val = df.values[idxs[0][0], idxs[1][0]]
-        s = ('The allowed values for a DataFrame'
-             ' are True, False, 0, 1. Found value %s' % (val))
-        raise ValueError(s)
+    fpc.valid_input_check(df)

     is_sparse = hasattr(df, "to_coo")
     if is_sparse:
-        if not isinstance(df.columns[0], str) and df.columns[0] != 0:
-            raise ValueError('Due to current limitations in Pandas, '
-                             'if the SparseDataFrame has integer column '
-                             'names, please make sure they either start '
-                             'with `0` or cast them as string column names: '
-                             '`df.columns = [str(i) for i in df.columns]`.')
         X = df.to_coo().tocsc()
     else:
         X = df.values
@@ -173,45 +242,37 @@ def _support(_x, _n_rows, _is_sparse):
     max_itemset = 1
     rows_count = float(X.shape[0])

-    iter_count = 0
-    all_ones = np.ones((int(rows_count), 1))

     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1
-        combin = generate_new_combinations(itemset_dict[max_itemset])

         # With exceptionally large datasets, the matrix operations can use a
         # substantial amount of memory. For low memory applications or large
         # datasets, set `low_memory=True` to use a slower but more memory-
         # efficient implementation.
         if low_memory:
-            frequent_items = []
-            frequent_items_support = []
-            if is_sparse:
-                all_ones = np.ones((X.shape[0], next_max_itemset))
-            for c in combin:
-                if verbose:
-                    iter_count += 1
-                    print('\rIteration: %d | Sampling itemset size %d' %
-                          (iter_count, next_max_itemset), end="")
-                if is_sparse:
-                    together = np.all(X[:, c] == all_ones, axis=1)
-                else:
-                    together = X[:, c].all(axis=1)
-                support = together.sum() / rows_count
-                if support >= min_support:
-                    frequent_items.append(c)
-                    frequent_items_support.append(support)

-            if frequent_items:
-                itemset_dict[next_max_itemset] = np.array(frequent_items)
-                support_dict[next_max_itemset] = \
-                    np.array(frequent_items_support)
-                max_itemset = next_max_itemset
+            combin = generate_new_combinations_low_memory(
+                itemset_dict[max_itemset], X, min_support, is_sparse)
+            # slightly faster than creating an array from a list of tuples
+            combin = np.fromiter(combin, dtype=int)
+            combin = combin.reshape(-1, next_max_itemset + 1)
+
+            if combin.size == 0:
+                break
+            if verbose:
+                print(
+                    '\rProcessing %d combinations | Sampling itemset size %d' %
+                    (combin.size, next_max_itemset), end="")
+
+            itemset_dict[next_max_itemset] = combin[:, 1:]
+            support_dict[next_max_itemset] = combin[:, 0].astype(float) \
+                / rows_count
+            max_itemset = next_max_itemset
         else:
-            combin = np.array(list(combin))
+            combin = generate_new_combinations(itemset_dict[max_itemset])
+            combin = np.fromiter(combin, dtype=int)
+            combin = combin.reshape(-1, next_max_itemset)

             if combin.size == 0:
                 break
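End to end, the improved path is exercised as below; the dataset mirrors the docstring example, and the support threshold is illustrative.

```
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

dataset = [['Apple', 'Beer', 'Chicken', 'Rice'],
           ['Apple', 'Beer', 'Rice'],
           ['Apple', 'Beer'],
           ['Apple', 'Bananas'],
           ['Beer', 'Chicken', 'Milk', 'Rice'],
           ['Beer', 'Milk', 'Rice'],
           ['Beer', 'Milk'],
           ['Apple', 'Bananas']]

te = TransactionEncoder()
df = pd.DataFrame(te.fit(dataset).transform(dataset), columns=te.columns_)

# Same results as low_memory=False, with a much smaller footprint when the
# collection of candidate itemsets is large.
frequent = apriori(df, min_support=0.6, use_colnames=True, low_memory=True)
```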
16 changes: 9 additions & 7 deletions mlxtend/frequent_patterns/fpcommon.py
@@ -48,13 +48,15 @@ def generate_itemsets(generator, num_itemsets, colname_map):


 def valid_input_check(df):
-    # Pandas is much slower than numpy, so use df.values instead of df here
-    idxs = np.where((df.values != 1) & (df.values != 0))
-    if len(idxs[0]) > 0:
-        val = df.values[idxs[0][0], idxs[1][0]]
-        s = ('The allowed values for a DataFrame'
-             ' are True, False, 0, 1. Found value %s' % (val))
-        raise ValueError(s)
+    # Fast path: if all columns are boolean, there is nothing to check
+    if not (df.dtypes == bool).all():
+        # Pandas is much slower than numpy, so use df.values instead of df here
+        idxs = np.where((df.values != 1) & (df.values != 0))
+        if len(idxs[0]) > 0:
+            val = df.values[idxs[0][0], idxs[1][0]]
+            s = ('The allowed values for a DataFrame'
+                 ' are True, False, 0, 1. Found value %s' % (val))
+            raise ValueError(s)

     is_sparse = hasattr(df, "to_coo")
     if is_sparse:
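Behavior sketch of the updated check:

```
import pandas as pd

df_bool = pd.DataFrame({'Apple': [True, False]})
valid_input_check(df_bool)    # all-bool dtypes: returns without scanning

df_bad = pd.DataFrame({'Apple': [1, 2]})
# valid_input_check(df_bad)   # raises ValueError: ... Found value 2
```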
18 changes: 9 additions & 9 deletions mlxtend/frequent_patterns/fpgrowth.py
@@ -19,15 +19,15 @@ def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
For example,
```
-    Apple  Bananas  Beer  Chicken  Milk  Rice
- 0      1        0     1        1     0     1
- 1      1        0     1        0     0     1
- 2      1        0     1        0     0     0
- 3      1        1     0        0     0     0
- 4      0        0     1        1     1     1
- 5      0        0     1        0     1     1
- 6      0        0     1        0     1     0
- 7      1        1     0        0     0     0
+     Apple  Bananas   Beer  Chicken   Milk   Rice
+ 0    True    False   True     True  False   True
+ 1    True    False   True    False  False   True
+ 2    True    False   True    False  False  False
+ 3    True     True  False    False  False  False
+ 4   False    False   True     True   True   True
+ 5   False    False   True    False   True   True
+ 6   False    False   True    False   True  False
+ 7    True     True  False    False  False  False
```
min_support : float (default: 0.5)
18 changes: 9 additions & 9 deletions mlxtend/frequent_patterns/fpmax.py
@@ -19,15 +19,15 @@ def fpmax(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0):
For example,
```
-    Apple  Bananas  Beer  Chicken  Milk  Rice
- 0      1        0     1        1     0     1
- 1      1        0     1        0     0     1
- 2      1        0     1        0     0     0
- 3      1        1     0        0     0     0
- 4      0        0     1        1     1     1
- 5      0        0     1        0     1     1
- 6      0        0     1        0     1     0
- 7      1        1     0        0     0     0
+     Apple  Bananas   Beer  Chicken   Milk   Rice
+ 0    True    False   True     True  False   True
+ 1    True    False   True    False  False   True
+ 2    True    False   True    False  False  False
+ 3    True     True  False    False  False  False
+ 4   False    False   True     True   True   True
+ 5   False    False   True    False   True   True
+ 6   False    False   True    False   True  False
+ 7    True     True  False    False  False  False
```
min_support : float (default: 0.5)
2 changes: 1 addition & 1 deletion mlxtend/frequent_patterns/tests/test_fpbase.py
@@ -187,7 +187,7 @@ def test_low_memory_flag(self):
_ = self.fpalgo(self.df, low_memory=True, verbose=1)

# Only get the last value of the stream to reduce test noise
-        expect = 'Iteration: 17 | Sampling itemset size 3\n'
+        expect = 'Processing 4 combinations | Sampling itemset size 3\n'
out = out.getvalue().split('\r')[-1]
assert out == expect
else:
