Commit
Merge branch 'reads_filtration_change_seq_length' into RF_parallel
yael1994 authored May 19, 2021
2 parents 63112a8 + 0417e48 commit 9525e37
Showing 4 changed files with 50 additions and 23 deletions.
12 changes: 12 additions & 0 deletions auxiliaries/pipeline_auxiliaries.py
@@ -3,6 +3,7 @@
from subprocess import call, run, Popen, PIPE
from time import time, sleep
import global_params
import numpy as np
import logging

logger = logging.getLogger('main')
@@ -365,3 +366,14 @@ def count_memes(path):
count = int(output) if p_status == 0 else 0
print(f'Found {count} memes in {path}')
return count


def log_scale(df, rank_method):
    # Rank method options: hits, controlled_shuffles, pval, tfidf.
    if rank_method == 'hits':
        df = np.log2(df + 1)
    if rank_method == 'pval':
        df = 1 - df
    if rank_method == 'tfidf':
        df = -np.log2(df + 0.0001)  # small offset avoids log2(0)
    # 'controlled_shuffles' data is returned unchanged.
    return df
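
For context, a small sketch of what the new log_scale helper does for each rank method, assuming pandas is available and the repository root is on PYTHONPATH; the DataFrame and its values are invented for illustration:

```python
import pandas as pd
from auxiliaries.pipeline_auxiliaries import log_scale

# Toy matrix standing in for the motif-significance table (values invented).
df = pd.DataFrame({'motif_1': [0.0, 3.0], 'motif_2': [1.0, 0.5]})

hits_scaled = log_scale(df, 'hits')      # log2(x + 1) compresses raw hit counts
pval_scaled = log_scale(df, 'pval')      # 1 - p, so larger means more significant
tfidf_scaled = log_scale(df, 'tfidf')    # -log2(x + 0.0001); the offset avoids log2(0)
shuffled = log_scale(df, 'controlled_shuffles')  # no branch matches: returned unchanged
```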
19 changes: 15 additions & 4 deletions hits_cpp/shufflesGenerator.hpp
@@ -1,13 +1,13 @@
#pragma once

#include "meme.hpp"

#include <algorithm>
using namespace std;

class MemeShuffler {
public:
MemeShuffler(Meme& source, int maxShuffles, ShufflePatterns& patterns) :
_source(source), _patterns(patterns), _currentShuffle(-1) {
_source(source), _patterns(patterns), _currentShuffle(-1), _lenPatterns(patterns.size()), _numPattern(-1) {
auto len = patterns.size();
if (len < maxShuffles) {
maxShuffles = len;
@@ -17,20 +17,29 @@ class MemeShuffler {

    bool next() {
        _currentShuffle++;
        _numPattern++;
        return hasNext();
    }

    Meme generate() {
        auto meme = Meme(this->_source);
        meme.getRows().clear();

        auto pattern = this->_patterns[this->_currentShuffle];
        auto pattern = this->_patterns[this->_numPattern];
        auto iter = pattern.begin();
        auto end = pattern.end();
        while (iter != end) {
            meme.getRows().push_back(this->_source.getRows()[*iter]);
            iter++;
        }
        // Reject a shuffle that leaves any row's consensus (argmax position)
        // unchanged, as long as unused patterns remain to try instead:
        for (int i = 0; i < this->_source.getRows().size(); i++) {
            auto max_source = max_element(this->_source.getRows()[i].begin(), this->_source.getRows()[i].end()) - this->_source.getRows()[i].begin();
            auto max_shuffle = max_element(meme.getRows()[i].begin(), meme.getRows()[i].end()) - meme.getRows()[i].begin();
            if (max_source == max_shuffle && (this->_lenPatterns - this->_numPattern) > (this->_maxShuffles - this->_currentShuffle)) {
                _numPattern++;
                return this->generate();
            }
        }
        return meme;
    }

@@ -46,6 +55,8 @@ class MemeShuffler {
ShufflePatterns _patterns; //TODO pointer?
int _maxShuffles;
int _currentShuffle;
int _lenPatterns;
int _numPattern;
};

MemeShuffler getShuffler(Meme& meme, int maxPatterns);
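
The new guard in MemeShuffler::generate() computes, for every row, the index of the maximal element (the consensus position) in both the source meme and the shuffled meme; if any row keeps its consensus position and enough unused patterns remain (_lenPatterns - _numPattern > _maxShuffles - _currentShuffle), the shuffle is discarded and the next pattern is tried recursively. A rough Python sketch of the rejection test, assuming rows are plain lists of column weights (keeps_any_consensus is a hypothetical helper, not part of the commit):

```python
import numpy as np

def keeps_any_consensus(source_rows, shuffled_rows):
    """True if any row index keeps the same argmax (consensus position)
    after shuffling; mirrors the rejection test in MemeShuffler::generate()."""
    return any(int(np.argmax(src)) == int(np.argmax(shuf))
               for src, shuf in zip(source_rows, shuffled_rows))
```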
16 changes: 7 additions & 9 deletions model_fitting/module_wraper.py
@@ -157,13 +157,13 @@ def build_classifier(first_phase_output_path, motif_inference_output_path,
pvalues_done_path = os.path.join(logs_dir, f'{bc}_values_done_fitting.txt')
aggregated_hits_path = os.path.join(classification_output_path, bc, f'{bc}_hits.csv')
hits_done_path = os.path.join(logs_dir, f'{bc}_hits_done_fitting.txt')

value_cmd = [aggregated_values_path, pvalues_done_path, logs_dir, error_path, '--num_of_configurations_to_sample', num_of_random_configurations_to_sample, f'--cv_num_of_splits {cv_num_of_splits}',
'--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', num_of_random_configurations_to_sample, '--random_forest_seed', random_forest_seed,
f'--random_forest_seed {random_forest_seed_configurations}']
'--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', min_value_error_random_forest, '--seed', random_forest_seed,
f'--random_forest_seed {random_forest_seed_configurations}', f'--rank_method {rank_method}']
hits_cmd = [aggregated_hits_path, hits_done_path, logs_dir, error_path, '--num_of_configurations_to_sample', num_of_configurations_to_sample, f'--cv_num_of_splits {cv_num_of_splits}',
'--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', min_value_error_random_forest, '--random_forest_seed', random_forest_seed,
f'--random_forest_seed {random_forest_seed_configurations}']
'--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', min_value_error_random_forest, '--seed', random_forest_seed,
f'--random_forest_seed {random_forest_seed_configurations}', '--rank_method hits']
if rank_method == 'tfidf' or rank_method == 'shuffles':
value_cmd.append('--tfidf')
hits_cmd.append('--tfidf')
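
A pitfall worth knowing when maintaining these long literal argument lists: adjacent Python string literals are concatenated implicitly, so a missing comma silently fuses two flags into one malformed token instead of raising a SyntaxError. A minimal, self-contained illustration (not from the repository):

```python
# Without the comma, the two literals merge into a single list element.
cmd = ['--cv_num_of_splits 2'
       '--number_parallel_random_forest']
assert cmd == ['--cv_num_of_splits 2--number_parallel_random_forest']
```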
@@ -190,8 +190,6 @@

wait_for_results(script_name, logs_dir, num_of_expected_results, example_cmd=cmd,
error_file_path=error_path, suffix='_done_fitting.txt')
else:
logger.info(f'Skipping fitting, all found')
else:
print('stop before')
logger.info(f'Stop before random forest')
@@ -234,8 +232,8 @@ def get_faa_file_name_from_path(path, use_mapitope):
parser.add_argument('--shuffles', default=5, type=int, help='Number of controlled shuffles permutations')
parser.add_argument('--shuffles_percent', default=0.2, type=float, help='Percent from shuffle with greatest number of hits (0-1)')
parser.add_argument('--shuffles_digits', default=2, type=int, help='Number of digits after the point to print in scanning files.')
parser.add_argument('--cv_num_of_splits', default=2, help='How folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed_random_forest', default=42, help='Seed number for reconstructing experiments')
parser.add_argument('--cv_num_of_splits', default=2, type=int, help='How many folds should the cross-validation use? (0 for leave-one-out)')
parser.add_argument('--seed_random_forest', default=42, type=int, help='Seed for reproducing experiments')
parser.add_argument('--random_forest_seed_configurations', default=123 , type=int, help='Random seed value for generating random forest configurations')
parser.add_argument('--error_path', type=str, help='a file in which errors will be written to')
parser.add_argument('-q', '--queue', default='pupkoweb', type=str, help='a queue to which the jobs will be submitted')
26 changes: 16 additions & 10 deletions model_fitting/random_forest.py
@@ -10,9 +10,11 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import plot_roc_curve

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')

if os.path.exists('/groups/pupko/orenavr2/'):
src_dir = '/groups/pupko/orenavr2/igomeProfilingPipeline/src'
elif os.path.exists('/Users/Oren/Dropbox/Projects/'):
@@ -22,6 +24,7 @@
sys.path.insert(0, src_dir)

from auxiliaries.pipeline_auxiliaries import submit_pipeline_step, wait_for_results
from auxiliaries.pipeline_auxiliaries import log_scale


def parse_data(file_path):
@@ -95,8 +98,10 @@ def sample_configurations(hyperparameters_grid, num_of_configurations_to_sample,
return configurations


def generate_heat_map(df, number_of_features, hits_data, number_of_samples, use_tfidf, output_path):
train_data = np.log2(df+1) if hits_data else df

def generate_heat_map(df, number_of_features, rank_method, number_of_samples, output_path):

train_data = log_scale(df, rank_method)
cm = sns.clustermap(train_data, cmap="Blues", col_cluster=False, yticklabels=True)
plt.setp(cm.ax_heatmap.yaxis.get_majorticklabels(), fontsize=150/number_of_samples)
cm.ax_heatmap.set_title(f"A heat-map of the significance of the top {number_of_features} discriminatory motifs")
@@ -114,6 +119,7 @@ def save_model_features(X, feature_indexes, feature_names, sample_names, output_
return df



def write_results_feature_selection_summary(feature_selection_summary_path, path_dir):
#feature_selection_summary_f = open(feature_selection_summary_path, 'a')
models = sorted([x[0] for x in os.walk(path_dir)])
@@ -128,8 +134,7 @@ def write_results_feature_selection_summary(feature_selection_summary_path, path
os.remove(path_file)



def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configurations_to_sample, number_parallel_random_forest, min_value_error,use_tfidf, cv_num_of_splits, seed, random_forest_seed, argv):
def train_models(csv_file_path, done_path, logs_dir, error_path, num_of_configurations_to_sample, number_parallel_random_forest, min_value_error, rank_method, cv_num_of_splits, seed, random_forest_seed, argv):
logging.info('Preparing output path...')
csv_folder, csv_file_name = os.path.split(csv_file_path)
csv_file_prefix = os.path.splitext(csv_file_name)[0] # without extension
@@ -141,7 +146,6 @@ def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configura
feature_selection_summary_path = f'{output_path}/feature_selection_summary.txt'

logging.info('Parsing data...')
is_hits_data = 'hits' in os.path.split(csv_file_path)[-1] # Does the file name contain "hits"?

X_train, y_train, X_test, y_test, feature_names, sample_names_train, sample_names_test = parse_data(csv_file_path)

@@ -150,7 +154,7 @@ def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configura
perfect_feature_names, perfect_feature_indexes = measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits)
if perfect_feature_names:
df = save_model_features(X_train, perfect_feature_indexes, perfect_feature_names, sample_names_train, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], is_hits_data, df.shape[0], use_tfidf, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], rank_method, df.shape[0], f'{output_path}/perfect_feature_names')
else:
# touch a file so we can see that there were no perfect features
with open(f'{output_path}/perfect_feature_names', 'w') as f:
@@ -219,7 +223,7 @@ def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configura
if stop:
break
else:
logger.info(f'Skipping random forest train, all found')
logger.info(f'Skipping random forest train, all found')

feature_selection_summary_f.close()

@@ -248,7 +252,8 @@ def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configura


def measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits):
feature_to_avg_accuracy = {}

feature_to_avg_accuracy = {}
rf = RandomForestClassifier(random_state=np.random.seed(seed))

for i, feature in enumerate(feature_names):
@@ -294,13 +299,14 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
parser.add_argument('done_file_path', help='A path to a file that signals that the script finished running successfully.')
parser.add_argument('logs_dir', help='A path for the log dir')
parser.add_argument('error_path', help='Path for error file')

parser.add_argument('--num_of_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
parser.add_argument('--number_parallel_random_forest', default=20, type=int, help='How many random forest configurations to run in parallel')
parser.add_argument('--min_value_error_random_forest', default=0, type=float, help='A random forest model error value for convergence allowing to stop early')
parser.add_argument('--tfidf', action='store_true', help="Are inputs from TF-IDF (avoid log(0))")
parser.add_argument('--cv_num_of_splits', default=2, help='How many folds should the cross-validation use? (0 for leave-one-out)')
parser.add_argument('--seed', default=42, help='Seed for reproducing experiments')
parser.add_argument('--random_forest_seed', default=123 , type=int, help='Random seed value for generating random forest configurations')
parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles', 'hits'], default='hits', help='Motifs ranking method')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
args = parser.parse_args()
import logging
@@ -311,4 +317,4 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
logger = logging.getLogger('main')

train_models(args.data_path, args.done_file_path, args.logs_dir, args.error_path, args.num_of_configurations_to_sample, args.number_parallel_random_forest,
args.min_value_error_random_forest, args.tfidf, args.cv_num_of_splits, args.seed, args.random_forest_seed, argv=sys.argv)
args.min_value_error_random_forest, args.rank_method, args.cv_num_of_splits, args.seed, args.random_forest_seed, argv=sys.argv)
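
Given the argparse interface above, a run of random_forest.py might be launched as sketched below; a hypothetical example only, with placeholder file names, and the first positional argument inferred from args.data_path:

```python
# Hypothetical invocation mirroring the argparse interface above.
import subprocess

subprocess.run([
    'python', 'model_fitting/random_forest.py',
    'exp_values.csv',   # data_path (inferred from args.data_path)
    'exp_done.txt',     # done_file_path
    'logs/',            # logs_dir
    'logs/error.txt',   # error_path
    '--num_of_configurations_to_sample', '50',
    '--cv_num_of_splits', '5',
    '--seed', '42',
    '--random_forest_seed', '123',
    '--rank_method', 'tfidf',
], check=True)
```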
