Merge pull request #26 from Webiks/log_scale_data
Create shared function to calculate the log scale
shacharmo authored May 19, 2021
2 parents 1a3c5bc + 26e3739 commit 0417e48
Showing 3 changed files with 39 additions and 50 deletions.
12 changes: 12 additions & 0 deletions auxiliaries/pipeline_auxiliaries.py
@@ -3,6 +3,7 @@
from subprocess import call, run, Popen, PIPE
from time import time, sleep
import global_params
import numpy as np
import logging

logger = logging.getLogger('main')
@@ -365,3 +366,14 @@ def count_memes(path):
count = int(output) if p_status == 0 else 0
print(f'Found {count} memes in {path}')
return count


def log_scale(df, rank_method):
    # Ranking method options: hits, shuffles (controlled shuffles), pval, tfidf
    if rank_method == 'hits':
        df = np.log2(df + 1)  # pseudo count keeps zeros defined
    if rank_method == 'pval':
        df = 1 - df
    if rank_method == 'tfidf':
        df = -np.log2(df + 0.0001)  # avoid log(0)
    return df
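A minimal usage sketch of the new helper (the toy DataFrame and values below are illustrative, not pipeline data):

import pandas as pd
from auxiliaries.pipeline_auxiliaries import log_scale

# hypothetical motif-by-sample matrix
df = pd.DataFrame({'motif_a': [0, 3, 7], 'motif_b': [1, 0, 15]})

log_scale(df, 'hits')      # log2(x + 1): compresses large hit counts, zeros stay 0
log_scale(df, 'pval')      # 1 - x: flips p-values so smaller p-values score higher
log_scale(df, 'tfidf')     # -log2(x + 0.0001): the epsilon avoids log(0)
log_scale(df, 'shuffles')  # returned unchanged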
14 changes: 6 additions & 8 deletions model_fitting/module_wraper.py
@@ -157,11 +157,9 @@ def build_classifier(first_phase_output_path, motif_inference_output_path,
aggregated_hits_path = os.path.join(classification_output_path, bc, f'{bc}_hits.csv')
hits_done_path = os.path.join(logs_dir, f'{bc}_hits_done_fitting.txt')

value_cmd = [aggregated_values_path, pvalues_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}']
hits_cmd = [aggregated_hits_path, hits_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}']
if rank_method == 'tfidf' or rank_method == 'shuffles':
value_cmd.append('--tfidf')
hits_cmd.append('--tfidf')
value_cmd = [aggregated_values_path, pvalues_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}', f'--rank_method {rank_method}']
hits_cmd = [aggregated_hits_path, hits_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}', '--rank_method hits']


if not os.path.exists(pvalues_done_path):
all_cmds_params.append(value_cmd)
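With this change, the ranking method is forwarded to random_forest.py via --rank_method instead of the old --tfidf switch. A sketch of what an assembled value_cmd might look like (the path placeholders and parameter values are illustrative, assuming the default argument values shown below):

value_cmd = ['<aggregated_values_path>',
             '<pvalues_done_path>',
             '--num_of_configurations_to_sample 100',
             '--cv_num_of_splits 2',
             '--seed 42',
             '--rank_method tfidf']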
@@ -218,15 +216,15 @@ def get_faa_file_name_from_path(path, use_mapitope):
parser.add_argument('number_of_random_pssms', default=100, type=int, help='Number of pssm permutations')
parser.add_argument('done_file_path', help='A path to a file that signals that the module finished running successfully.')

parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='pval', help='Motifs ranking method')
parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='shuffles', help='Motifs ranking method')
parser.add_argument('--tfidf_method', choices=['boolean', 'terms', 'log', 'augmented'], default='boolean', help='TF-IDF method')
parser.add_argument('--tfidf_factor', type=float, default=0.5, help='TF-IDF augmented method factor (0-1)')
parser.add_argument('--shuffles', default=5, type=int, help='Number of controlled shuffles permutations')
parser.add_argument('--shuffles_percent', default=0.2, type=float, help='Percent from shuffle with greatest number of hits (0-1)')
parser.add_argument('--shuffles_digits', default=2, type=int, help='Number of digits after the point to print in scanning files.')
parser.add_argument('--num_of_random_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
parser.add_argument('--cv_num_of_splits', default=2, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed_random_forest', default=42, help='Seed number for reconstructing experiments')
parser.add_argument('--cv_num_of_splits', default=2, type=int, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed_random_forest', default=42, type=int, help='Seed number for reconstructing experiments')
parser.add_argument('--error_path', type=str, help='a file in which errors will be written to')
parser.add_argument('-q', '--queue', default='pupkoweb', type=str, help='a queue to which the jobs will be submitted')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
63 changes: 21 additions & 42 deletions model_fitting/random_forest.py
@@ -10,6 +10,15 @@
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import plot_roc_curve

if os.path.exists('/groups/pupko/orenavr2/'):
    src_dir = '/groups/pupko/orenavr2/igomeProfilingPipeline/src'
elif os.path.exists('/Users/Oren/Dropbox/Projects/'):
    src_dir = '/Users/Oren/Dropbox/Projects/gershoni/src'
else:
    src_dir = '.'
sys.path.insert(0, src_dir)

from auxiliaries.pipeline_auxiliaries import log_scale
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')
@@ -87,18 +96,9 @@ def sample_configurations(hyperparameters_grid, num_of_configurations_to_sample,
return configurations


def generate_heat_map(df, number_of_features, hits_data, number_of_samples, use_tfidf, output_path):
# plt.figure(dpi=1000)
# transform the data for better contrast in the visualization
def generate_heat_map(df, number_of_features, rank_method, number_of_samples, output_path):

train_data = np.log2(df+1) if hits_data else df

#if hits_data: # hits data
# train_data = np.log2(df+1) # pseudo counts
#elif use_tfidf: # tfidf data
# train_data = -np.log2(df+0.0001) # avoid 0
#else: # p-values data
# train_data = -np.log2(df)
train_data = log_scale(df, rank_method)

cm = sns.clustermap(train_data, cmap="Blues", col_cluster=False, yticklabels=True)
plt.setp(cm.ax_heatmap.yaxis.get_majorticklabels(), fontsize=150/number_of_samples)
@@ -139,7 +139,7 @@ def generate_roc_curve(X, y, classifier, number_of_features, output_path):
plt.close()


def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_path, cv_num_of_splits):
def train(rf, X, y, feature_names, sample_names, rank_method, output_path, cv_num_of_splits):
original_feature_names = feature_names[:]
original_X = X[:]
logger.debug('\n'+'#'*100 + f'\nTrue labels:\n{y.tolist()}\n' + '#'*100 + '\n')
@@ -177,10 +177,6 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
# save previous cv_avg_error_rate to make sure the performances do not deteriorate
previous_cv_avg_error_rate = cv_avg_error_rate

# predictions = model.predict(X)
# logger.info(f'Full model error rate is {1 - (predictions == y).mean()}')
# logger.info(f'Current model\'s predictions\n{predictions.tolist()}')

# compute current model accuracy for each fold of the cross validation
cv_score = cross_val_score(model, X, y, cv=StratifiedKFold(cv_num_of_splits))

@@ -197,7 +193,7 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
# save current model (unsorted) features to a csv file
df = save_model_features(original_X, features_indexes_to_keep, feature_names, sample_names, f'{output_path}/Top_{number_of_features}_features')

generate_heat_map(df, number_of_features, hits_data, number_of_samples, use_tfidf, f'{output_path}/{number_of_features}')
generate_heat_map(df, number_of_features, rank_method, number_of_samples, f'{output_path}/{number_of_features}')

generate_roc_curve(X, y, rf, number_of_features, output_path)

@@ -229,7 +225,7 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
return cv_avg_error_rates, number_of_features_per_model


def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_tfidf, cv_num_of_splits, seed, argv):
def train_models(csv_file_path, done_path, num_of_configurations_to_sample, rank_method, cv_num_of_splits, seed, argv):
logging.info('Preparing output path...')
csv_folder, csv_file_name = os.path.split(csv_file_path)
csv_file_prefix = os.path.splitext(csv_file_name)[0] # without extension
@@ -242,7 +238,6 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_
feature_selection_summary_path = f'{output_path}/feature_selection_summary.txt'

logging.info('Parsing data...')
is_hits_data = 'hits' in os.path.split(csv_file_path)[-1] # Does the file name contain "hits"?

X_train, y_train, X_test, y_test, feature_names, sample_names_train, sample_names_test = parse_data(csv_file_path)

@@ -251,21 +246,12 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_
perfect_feature_names, perfect_feature_indexes = measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits)
if perfect_feature_names:
df = save_model_features(X_train, perfect_feature_indexes, perfect_feature_names, sample_names_train, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], is_hits_data, df.shape[0], use_tfidf, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], rank_method, df.shape[0], f'{output_path}/perfect_feature_names')
else:
# touch a file so we can see that there were no perfect features
with open(f'{output_path}/perfect_feature_names', 'w') as f:
pass

# sanity check
# perfect_feature_names =['PMMILDLDHSQV', 'YASTIVVDLDHT', 'CDAALHVIAALD', 'CVSASMDLDHLE', 'CSVELCNISGYG', 'SSQIMFHVYN', 'CPVSPQSLDAPC', 'MSWIFYALDPYE', 'SSHISLLMSV', 'ATLKHYLIQDGI', 'CTWGGPGFDLFC', 'CPFDSAGFDPYC', 'CTAAFISGHYPD', 'SSFFYMQAYV', 'TPIETISDSVYK', 'LDSLHIHYLVSF', 'CHISHVQDLPYL', 'WLWYLASDGI', 'SINHGLLAYI', 'CIALLKEGEPFS', 'CSADNLNLYC', 'ACDPYDCRHLYS', 'WLAVLYADMVFY', 'WHWSLLQDTMYH', 'GTSSFPYSHLWA', 'CALGWLLDLDHC', 'CQAFIDPYEC', 'SLSQNYSLLMII', 'NGYDYFGLRF', 'GDPYEILVLSRC', 'IESGDPYEARFS', 'GYDLLRDRYI', 'PINDLDADGI', 'SAHIWLVTSI', 'CLRLHLASLTQC', 'SRMDFRYLLE', 'CPHSLIMLLGDL', 'WPVHLEHTNI', 'GPYELHYEDK', 'CRFGAPGFDVQC', 'LLSLVLMDSMYF', 'APTCFRGTDC', 'TWRSGNASGSP', 'DGYFAHFWSVLQ', 'CAVLLDPYEC', 'CIARKDMQATQN', 'CMVWLDNVLSLC', 'PFEALELMRT']
# for feature in perfect_feature_names:
# rf = RandomForestClassifier(**{'n_estimators': 2000, 'max_features': 'auto', 'max_depth': 70, 'min_samples_split': 5, 'min_samples_leaf': 1, 'bootstrap': True, 'n_jobs': -1}).fit(df[feature].as_matrix().reshape(-1, 1), y_train)
# rf = RandomForestClassifier(**configuration).fit(df[feature].as_matrix().reshape(-1, 1), y_train)
# print(','.join(rf.predict(df[feature].as_matrix().reshape(-1, 1)).tolist()))
# print(','.join(y_train.as_matrix().tolist()))

# feature selection analysis
logging.info('\nApplying feature selection analysis...')
if cv_num_of_splits < 2:
logging.info('Number of CV folds is less than 2. '
@@ -292,7 +278,7 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_

logging.info(f'Configuration #{i} hyper-parameters are:\n{configuration}')
rf = RandomForestClassifier(**configuration)
errors, features = train(rf, X_train, y_train, feature_names, sample_names_train, is_hits_data, use_tfidf, output_path_i, cv_num_of_splits)
errors, features = train(rf, X_train, y_train, feature_names, sample_names_train, rank_method, output_path_i, cv_num_of_splits)
# model.predict(X_test, y_test)

plot_error_rate(errors, features, cv_num_of_splits, output_path_i)
@@ -331,22 +317,15 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_


def measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits):
feature_to_avg_accuracy = {}
# df = pd.read_csv(f'{output_path}/Top_149_features.csv', index_col='sample_name')
feature_to_avg_accuracy = {}
rf = RandomForestClassifier(random_state=np.random.seed(seed))

for i, feature in enumerate(feature_names):
# if i % 10 == 0:
logger.info(f'Checking feature {feature} number {i}')
# assert df.columns[i] == feature
cv_score = cross_val_score(rf, X_train[:, i].reshape(-1, 1), y_train, cv=StratifiedKFold(cv_num_of_splits, shuffle=True)).mean()
if cv_score == 1:
logger.info('-' * 10 + f'{feature} has 100% accuracy!' + '-' * 10)
# print(X_train[:, i].reshape(-1, 1).tolist()[:8] + X_train[:, i].reshape(-1, 1).tolist()[12:])
# print(f'min of other class: {min(X_train[:, i].reshape(-1, 1).tolist()[:8] + X_train[:, i].reshape(-1, 1).tolist()[12:])}')
# print(X_train[:, i].reshape(-1, 1).tolist()[8:12])
# # print(y_train.tolist())
# print('Accuracy is 1')
feature_to_avg_accuracy[feature] = cv_score

with open(f'{output_path}/single_feature_accuracy.txt', 'w') as f:
@@ -384,9 +363,9 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
parser.add_argument('done_file_path', help='A path to a file that signals that the script finished running successfully.')
# parser.add_argument('n_splits', type=int, default=2, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--num_of_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
parser.add_argument('--tfidf', action='store_true', help="Are inputs from TF-IDF (avoid log(0))")
parser.add_argument('--cv_num_of_splits', default=2, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed', default=42, help='Seed number for reconstructing experiments')
parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles', 'hits'], default='hits', help='Motifs ranking method')
parser.add_argument('--cv_num_of_splits', default=2, type=int, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed', default=42, type=int, help='Seed number for reconstructing experiments')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
args = parser.parse_args()

@@ -396,4 +375,4 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')

train_models(args.data_path, args.done_file_path, args.num_of_configurations_to_sample, args.tfidf, args.cv_num_of_splits, args.seed, argv=sys.argv)
train_models(args.data_path, args.done_file_path, args.num_of_configurations_to_sample, args.rank_method, args.cv_num_of_splits, args.seed, argv=sys.argv)
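For reference, an illustrative direct invocation of the script with the new flag (paths and values are placeholders; the data CSV positional is defined above the shown hunk and is assumed to come before the done-file path):

python model_fitting/random_forest.py <aggregated_hits_csv> <done_file> --rank_method hits --num_of_configurations_to_sample 100 --cv_num_of_splits 2 --seed 42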
