diff --git a/docs/sources/CHANGELOG.md b/docs/sources/CHANGELOG.md index a07f2889d..36f62f98f 100755 --- a/docs/sources/CHANGELOG.md +++ b/docs/sources/CHANGELOG.md @@ -21,7 +21,7 @@ The CHANGELOG for the current development version is available at ##### Changes -- - +- Improve the runtime performance for the `apriori` frequent itemset generating function when `low_memory=True`. Setting `low_memory=False` (default) is still faster for small itemsets, but `low_memory=True` can be much faster for large itemsets and requires less memory. Also, input validation for `apriori`, `fpgrowth` and `fpmax` takes a significant amount of time when the input pandas DataFrame is large; this is now dramatically reduced when the input contains boolean values (and not zeros/ones), which is the case when using `TransactionEncoder`. ([#619](https://github.com/rasbt/mlxtend/pull/619) via [Denis Barbier](https://github.com/dbarbier)) ##### Bug Fixes - Fixes a bug in `mlxtend.plotting.plot_pca_correlation_graph` that caused the explaind variances not summing up to 1. Also, improves the runtime performance of the correlation computation and adds a missing function argument for the explained variances (eigenvalues) if users provide their own principal components. ([#593](https://github.com/rasbt/mlxtend/issues/593) via [Gabriel Azevedo Ferreira](https://github.com/Gabriel-Azevedo-Ferreira)) diff --git a/mlxtend/frequent_patterns/apriori.py b/mlxtend/frequent_patterns/apriori.py index e0822c301..983ebd3aa 100644 --- a/mlxtend/frequent_patterns/apriori.py +++ b/mlxtend/frequent_patterns/apriori.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from ..frequent_patterns import fpcommon as fpc def generate_new_combinations(old_combinations): @@ -30,8 +31,74 @@ def generate_new_combinations(old_combinations): Returns ----------- Generator of all combinations from the last step x items - from the previous step. Every combination is a tuple - of item type ids in the ascending order. 
+ from the previous step. + + Examples + ----------- + For usage examples, please see + http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/generate_new_combinations/ + + """ + + items_types_in_previous_step = np.unique(old_combinations.flatten()) + for old_combination in old_combinations: + max_combination = old_combination[-1] + mask = items_types_in_previous_step > max_combination + valid_items = items_types_in_previous_step[mask] + old_tuple = tuple(old_combination) + for item in valid_items: + yield from old_tuple + yield item + + +def generate_new_combinations_low_memory(old_combinations, X, min_support, + is_sparse): + """ + Generator of all combinations based on the last state of Apriori algorithm + Parameters + ----------- + old_combinations: np.array + All combinations with enough support in the last step + Combinations are represented by a matrix. + Number of columns is equal to the combination size + of the previous step. + Each row represents one combination + and contains item type ids in the ascending order + ``` + 0 1 + 0 15 20 + 1 15 22 + 2 17 19 + ``` + + X: np.array or scipy sparse matrix + The allowed values are either 0/1 or True/False. + For example, + + ``` + 0 True False True True False True + 1 True False True False False True + 2 True False True False False False + 3 True True False False False False + 4 False False True True True True + 5 False False True False True True + 6 False False True False True False + 7 True True False False False False + ``` + + min_support : float + A float between 0 and 1 for minimum support of the itemsets returned. + The support is computed as the fraction + `transactions_where_item(s)_occur / total_transactions`. + + is_sparse : bool True if X is sparse + + Returns + ----------- + Generator of all combinations from the last step x items + from the previous step. 
Every combination contains the + number of transactions where this item occurs, followed + by item type ids in the ascending order. No combination other than generated do not have a chance to get enough support @@ -43,12 +110,25 @@ def generate_new_combinations(old_combinations): """ items_types_in_previous_step = np.unique(old_combinations.flatten()) + rows_count = X.shape[0] + threshold = min_support * rows_count for old_combination in old_combinations: - max_combination = max(old_combination) - for item in items_types_in_previous_step: - if item > max_combination: - res = tuple(old_combination) + (item,) - yield res + max_combination = old_combination[-1] + mask = items_types_in_previous_step > max_combination + valid_items = items_types_in_previous_step[mask] + old_tuple = tuple(old_combination) + if is_sparse: + mask_rows = X[:, old_tuple].toarray().all(axis=1) + X_cols = X[:, valid_items].toarray() + supports = X_cols[mask_rows].sum(axis=0) + else: + mask_rows = X[:, old_tuple].all(axis=1) + supports = X[mask_rows][:, valid_items].sum(axis=0) + valid_indices = (supports >= threshold).nonzero()[0] + for index in valid_indices: + yield supports[index] + yield from old_tuple + yield valid_items[index] def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, @@ -63,15 +143,15 @@ def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0, For example, ``` - Apple Bananas Beer Chicken Milk Rice - 0 1 0 1 1 0 1 - 1 1 0 1 0 0 1 - 2 1 0 1 0 0 0 - 3 1 1 0 0 0 0 - 4 0 0 1 1 1 1 - 5 0 0 1 0 1 1 - 6 0 0 1 0 1 0 - 7 1 1 0 0 0 0 + Apple Bananas Beer Chicken Milk Rice + 0 True False True True False True + 1 True False True False False True + 2 True False True False False False + 3 True True False False False False + 4 False False True True True True + 5 False False True False True True + 6 False False True False True False + 7 True True False False False False ``` min_support : float (default: 0.5) @@ -148,21 +228,10 @@ def _support(_x, 
_n_rows, _is_sparse): 'number within the interval `(0, 1]`. ' 'Got %s.' % min_support) - idxs = np.where((df.values != 1) & (df.values != 0)) - if len(idxs[0]) > 0: - val = df.values[idxs[0][0], idxs[1][0]] - s = ('The allowed values for a DataFrame' - ' are True, False, 0, 1. Found value %s' % (val)) - raise ValueError(s) + fpc.valid_input_check(df) is_sparse = hasattr(df, "to_coo") if is_sparse: - if not isinstance(df.columns[0], str) and df.columns[0] != 0: - raise ValueError('Due to current limitations in Pandas, ' - 'if the SparseDataFrame has integer column names,' - 'names, please make sure they either start ' - 'with `0` or cast them as string column names: ' - '`df.columns = [str(i) for i in df.columns`].') X = df.to_coo().tocsc() else: X = df.values @@ -173,45 +242,37 @@ def _support(_x, _n_rows, _is_sparse): max_itemset = 1 rows_count = float(X.shape[0]) - iter_count = 0 all_ones = np.ones((int(rows_count), 1)) while max_itemset and max_itemset < (max_len or float('inf')): next_max_itemset = max_itemset + 1 - combin = generate_new_combinations(itemset_dict[max_itemset]) # With exceptionally large datasets, the matrix operations can use a # substantial amount of memory. For low memory applications or large # datasets, set `low_memory=True` to use a slower but more memory- # efficient implementation. 
if low_memory: - frequent_items = [] - frequent_items_support = [] - if is_sparse: - all_ones = np.ones((X.shape[0], next_max_itemset)) - for c in combin: - if verbose: - iter_count += 1 - print('\rIteration: %d | Sampling itemset size %d' % - (iter_count, next_max_itemset), end="") - if is_sparse: - together = np.all(X[:, c] == all_ones, axis=1) - else: - together = X[:, c].all(axis=1) - support = together.sum() / rows_count - if support >= min_support: - frequent_items.append(c) - frequent_items_support.append(support) - - if frequent_items: - itemset_dict[next_max_itemset] = np.array(frequent_items) - support_dict[next_max_itemset] = \ - np.array(frequent_items_support) - max_itemset = next_max_itemset - else: + combin = generate_new_combinations_low_memory( + itemset_dict[max_itemset], X, min_support, is_sparse) + # slightly faster than creating an array from a list of tuples + combin = np.fromiter(combin, dtype=int) + combin = combin.reshape(-1, next_max_itemset + 1) + + if combin.size == 0: break + if verbose: + print( + '\rProcessing %d combinations | Sampling itemset size %d' % + (combin.size, next_max_itemset), end="") + + itemset_dict[next_max_itemset] = combin[:, 1:] + support_dict[next_max_itemset] = combin[:, 0].astype(float) \ + / rows_count + max_itemset = next_max_itemset else: - combin = np.array(list(combin)) + combin = generate_new_combinations(itemset_dict[max_itemset]) + combin = np.fromiter(combin, dtype=int) + combin = combin.reshape(-1, next_max_itemset) if combin.size == 0: break diff --git a/mlxtend/frequent_patterns/fpcommon.py b/mlxtend/frequent_patterns/fpcommon.py index b1bca1b97..d52f7b536 100644 --- a/mlxtend/frequent_patterns/fpcommon.py +++ b/mlxtend/frequent_patterns/fpcommon.py @@ -48,13 +48,15 @@ def generate_itemsets(generator, num_itemsets, colname_map): def valid_input_check(df): - # Pandas is much slower than numpy, so use df.values instead of df here - idxs = np.where((df.values != 1) & (df.values != 0)) - if len(idxs[0]) > 
0: - val = df.values[idxs[0][0], idxs[1][0]] - s = ('The allowed values for a DataFrame' - ' are True, False, 0, 1. Found value %s' % (val)) - raise ValueError(s) + # Fast path: if all columns are boolean, there is nothing to check + if not (df.dtypes == bool).all(): + # Pandas is much slower than numpy, so use df.values instead of df here + idxs = np.where((df.values != 1) & (df.values != 0)) + if len(idxs[0]) > 0: + val = df.values[idxs[0][0], idxs[1][0]] + s = ('The allowed values for a DataFrame' + ' are True, False, 0, 1. Found value %s' % (val)) + raise ValueError(s) is_sparse = hasattr(df, "to_coo") if is_sparse: diff --git a/mlxtend/frequent_patterns/fpgrowth.py b/mlxtend/frequent_patterns/fpgrowth.py index 2920b6384..5d72dea25 100644 --- a/mlxtend/frequent_patterns/fpgrowth.py +++ b/mlxtend/frequent_patterns/fpgrowth.py @@ -19,15 +19,15 @@ def fpgrowth(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): For example, ``` - Apple Bananas Beer Chicken Milk Rice - 0 1 0 1 1 0 1 - 1 1 0 1 0 0 1 - 2 1 0 1 0 0 0 - 3 1 1 0 0 0 0 - 4 0 0 1 1 1 1 - 5 0 0 1 0 1 1 - 6 0 0 1 0 1 0 - 7 1 1 0 0 0 0 + Apple Bananas Beer Chicken Milk Rice + 0 True False True True False True + 1 True False True False False True + 2 True False True False False False + 3 True True False False False False + 4 False False True True True True + 5 False False True False True True + 6 False False True False True False + 7 True True False False False False ``` min_support : float (default: 0.5) diff --git a/mlxtend/frequent_patterns/fpmax.py b/mlxtend/frequent_patterns/fpmax.py index e7140c913..22d0ccc10 100644 --- a/mlxtend/frequent_patterns/fpmax.py +++ b/mlxtend/frequent_patterns/fpmax.py @@ -19,15 +19,15 @@ def fpmax(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0): For example, ``` - Apple Bananas Beer Chicken Milk Rice - 0 1 0 1 1 0 1 - 1 1 0 1 0 0 1 - 2 1 0 1 0 0 0 - 3 1 1 0 0 0 0 - 4 0 0 1 1 1 1 - 5 0 0 1 0 1 1 - 6 0 0 1 0 1 0 - 7 1 1 0 0 0 0 + Apple Bananas 
Beer Chicken Milk Rice + 0 True False True True False True + 1 True False True False False True + 2 True False True False False False + 3 True True False False False False + 4 False False True True True True + 5 False False True False True True + 6 False False True False True False + 7 True True False False False False ``` min_support : float (default: 0.5) diff --git a/mlxtend/frequent_patterns/tests/test_fpbase.py b/mlxtend/frequent_patterns/tests/test_fpbase.py index c0500c775..896755bc5 100644 --- a/mlxtend/frequent_patterns/tests/test_fpbase.py +++ b/mlxtend/frequent_patterns/tests/test_fpbase.py @@ -187,7 +187,7 @@ def test_low_memory_flag(self): _ = self.fpalgo(self.df, low_memory=True, verbose=1) # Only get the last value of the stream to reduce test noise - expect = 'Iteration: 17 | Sampling itemset size 3\n' + expect = 'Processing 4 combinations | Sampling itemset size 3\n' out = out.getvalue().split('\r')[-1] assert out == expect else: