Webiks · shacharmo · May 19, 2021 · Nov 26, 2020 · Nov 29, 2020 · Dec 16, 2020
diff --git a/IgOmeProfiling_pipeline.py b/IgOmeProfiling_pipeline.py
@@ -10,13 +10,15 @@
 sys.path.insert(0, src_dir)
 
 from auxiliaries.pipeline_auxiliaries import *
+
 
 def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondition_path, analysis_dir, logs_dir,
                  left_construct, right_construct, max_mismatches_allowed, min_sequencing_quality, minimal_length_required, gz,
                  max_msas_per_sample, max_msas_per_bc,
                  max_number_of_cluster_members_per_sample, max_number_of_cluster_members_per_bc,
-                 allowed_gap_frequency, concurrent_cutoffs, meme_split_size, use_mapitope, number_of_random_pssms,
-                 rank_method, tfidf_method, tfidf_factor, shuffles, shuffles_percent, shuffles_digits,
+                 allowed_gap_frequency, concurrent_cutoffs, meme_split_size, use_mapitope, stop_random_forest, number_of_random_pssms,
+                 number_parallel_random_forest, min_value_error_random_forest,
+                 rank_method, tfidf_method, tfidf_factor, shuffles, shuffles_percent, shuffles_digits, seed_random_forest_classifier,
                  run_summary_path, error_path, queue, verbose, argv):
 
     os.makedirs(os.path.split(run_summary_path)[0], exist_ok=True)
@@ -86,8 +88,11 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
 
         module_parameters = [first_phase_output_path, second_phase_output_path, third_phase_output_path,
                              third_phase_logs_path, samplename2biologicalcondition_path, number_of_random_pssms,
-                             third_phase_done_path, f'--shuffles_percent {shuffles_percent}', f'--shuffles_digits {shuffles_digits}' ,f'--rank_method {rank_method}', f'--error_path {error_path}', 
-                             '-v' if verbose else '', f'-q {queue}','-m' if use_mapitope else '']
+                             third_phase_done_path, '--stop_random_forest' if stop_random_forest else '',  # flag is emitted only when set
+                             f'--number_parallel_random_forest {number_parallel_random_forest}', f'--min_value_error_random_forest {min_value_error_random_forest}',
+                             f'--shuffles_percent {shuffles_percent}', f'--shuffles_digits {shuffles_digits}',  # trailing comma: avoid implicit concatenation with the next literal
+                             f'--rank_method {rank_method}', f'--seed_random_forest_classifier {seed_random_forest_classifier}', f'--error_path {error_path}', '-v' if verbose else '',
+                             f'-q {queue}', '-m' if use_mapitope else '']
         if rank_method == 'tfidf':
             if tfidf_method:
                 module_parameters += ['--tfidf_method', tfidf_method]
@@ -161,14 +166,18 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
     parser.add_argument('-m', '--mapitope', action='store_true', help='use mapitope encoding')
 
     # optional parameters for the modelling step
+    parser.add_argument('--stop_random_forest', action='store_true', help='Skip the random forest fitting step')
     parser.add_argument('--number_of_random_pssms', default=100, type=int, help='Number of pssm permutations')
+    parser.add_argument('--number_parallel_random_forest', default=20, type=int, help='How many random forest to run in parallel')
+    parser.add_argument('--min_value_error_random_forest', default=0, type=float, help='A min value for error that the run can stop in the random forest')
     parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='pval', help='Motifs ranking method')
     parser.add_argument('--tfidf_method', choices=['boolean', 'terms', 'log', 'augmented'], default='boolean', help='TF-IDF method')
     parser.add_argument('--tfidf_factor', type=float, default=0.5, help='TF-IDF augmented method factor (0-1)')
     parser.add_argument('--shuffles', default=5, type=int, help='Number of controlled shuffles permutations')
     parser.add_argument('--shuffles_percent', default=0.2, type=float, help='Percent from shuffle with greatest number of hits (0-1)')
     parser.add_argument('--shuffles_digits', default=2, type=int, help='Number of digits after the point to print in scanning files.')
-
+    parser.add_argument('--seed_random_forest_classifier', default=123, type=int, help='A number for create the random forest stable when run the same configuration')  # keyword must be spelled `default`
+
     # general optional parameters
     parser.add_argument('--run_summary_path', type=str,
                         help='a file in which the running configuration and timing will be written to')
@@ -195,6 +204,7 @@ def run_pipeline(fastq_path, barcode2samplename_path, samplename2biologicalcondi
                  args.left_construct, args.right_construct, args.max_mismatches_allowed, args.min_sequencing_quality, args.minimal_length_required, True if args.gz else False,
                  args.max_msas_per_sample, args.max_msas_per_bc,
                  args.max_number_of_cluster_members_per_sample, args.max_number_of_cluster_members_per_bc,
-                 args.allowed_gap_frequency, concurrent_cutoffs, args.meme_split_size, args.mapitope, args.number_of_random_pssms,
+                 args.allowed_gap_frequency, concurrent_cutoffs, args.meme_split_size, args.mapitope, args.stop_random_forest,  # store_true flag is already a bool
+                 args.number_of_random_pssms, args.number_parallel_random_forest, args.min_value_error_random_forest,
                  args.rank_method, args.tfidf_method, args.tfidf_factor, args.shuffles, args.shuffles_percent, args.shuffles_digits,
-                 run_summary_path, error_path, args.queue, True if args.verbose else False, sys.argv)
+                 args.seed_random_forest_classifier, run_summary_path, error_path, args.queue, True if args.verbose else False, sys.argv)
diff --git a/model_fitting/module_wraper.py b/model_fitting/module_wraper.py
@@ -21,9 +21,11 @@ def repeat_items(list):
 
 
 def build_classifier(first_phase_output_path, motif_inference_output_path,
-                     classification_output_path, logs_dir, samplename2biologicalcondition_path,
-                     fitting_done_path, number_of_random_pssms, rank_method, tfidf_method, tfidf_factor,
-                     shuffles, shuffles_percent, shuffles_digits, queue_name, verbose, error_path, use_mapitope, argv):
+                     classification_output_path, logs_dir, samplename2biologicalcondition_path, number_of_random_pssms,
+                     fitting_done_path, stop_random_forest, num_of_configurations_to_sample,
+                     number_parallel_random_forest, min_value_error_random_forest, rank_method, tfidf_method, tfidf_factor,
+                     shuffles, shuffles_percent, shuffles_digits, seed_random_forest_classifier, queue_name, verbose, error_path, use_mapitope, argv):
+    """Run the model fitting module; the random forest step is skipped when stop_random_forest is set."""
     is_pval = rank_method == 'pval'
     os.makedirs(classification_output_path, exist_ok=True)
     os.makedirs(logs_dir, exist_ok=True)
@@ -42,7 +44,6 @@ def build_classifier(first_phase_output_path, motif_inference_output_path,
         scanning_dir_path = os.path.join(bc_dir_path, 'scanning')
         os.makedirs(scanning_dir_path, exist_ok=True)
 
-
     # compute scanning scores (hits and values)
     logger.info('_'*100)
     logger.info(f'{datetime.datetime.now()}: scanning peptides vs motifs (hits and values)')
@@ -145,49 +146,54 @@ def build_classifier(first_phase_output_path, motif_inference_output_path,
 
 
     # fitting a random forest model (hits and values)
-    logger.info('_'*100)
-    logger.info(f'{datetime.datetime.now()}: fitting model')
-    script_name = 'random_forest.py'
-    num_of_expected_results = 0
-    all_cmds_params = []  # a list of lists. Each sublist contain different parameters set for the same script to reduce the total number of jobs
-    for bc in biological_conditions:
-        aggregated_values_path = os.path.join(classification_output_path, bc, f'{bc}_values.csv')
-        pvalues_done_path = os.path.join(logs_dir, f'{bc}_values_done_fitting.txt')
-        aggregated_hits_path = os.path.join(classification_output_path, bc, f'{bc}_hits.csv')
-        hits_done_path = os.path.join(logs_dir, f'{bc}_hits_done_fitting.txt')
-
-        value_cmd = [aggregated_values_path, pvalues_done_path]
-        hits_cmd = [aggregated_hits_path, hits_done_path]
-        if rank_method == 'tfidf' or rank_method == 'shuffles':
-            value_cmd.append('--tfidf')
-            hits_cmd.append('--tfidf')
-
-        if not os.path.exists(pvalues_done_path):
-            all_cmds_params.append(value_cmd)
-        else:
-            logger.debug(f'Skipping fitting as {pvalues_done_path} found')
-            num_of_expected_results += 1
-
-        if not os.path.exists(hits_done_path):
-            all_cmds_params.append(hits_cmd)
+    if not stop_random_forest:
+        # fit one model per aggregated table (values and hits) of every biological condition
+        logger.info('_'*100)
+        logger.info(f'{datetime.datetime.now()}: fitting model')
+        script_name = 'random_forest.py'
+        num_of_expected_results = 0
+        all_cmds_params = []  # a list of lists. Each sublist contain different parameters set for the same script to reduce the total number of jobs
+        for bc in biological_conditions:
+            aggregated_values_path = os.path.join(classification_output_path, bc, f'{bc}_values.csv')
+            pvalues_done_path = os.path.join(logs_dir, f'{bc}_values_done_fitting.txt')
+            aggregated_hits_path = os.path.join(classification_output_path, bc, f'{bc}_hits.csv')
+            hits_done_path = os.path.join(logs_dir, f'{bc}_hits_done_fitting.txt')
+
+            value_cmd = [aggregated_values_path, pvalues_done_path, logs_dir, error_path, '--num_of_configurations_to_sample', num_of_configurations_to_sample,
+                    '--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', min_value_error_random_forest, '--seed_random_forest_classifier', seed_random_forest_classifier]
+            hits_cmd = [aggregated_hits_path, hits_done_path, logs_dir, error_path, '--num_of_configurations_to_sample', num_of_configurations_to_sample,
+                    '--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', min_value_error_random_forest, '--seed_random_forest_classifier', seed_random_forest_classifier]
+            if rank_method == 'tfidf' or rank_method == 'shuffles':
+                value_cmd.append('--tfidf')
+                hits_cmd.append('--tfidf')
+            if not os.path.exists(pvalues_done_path):
+                all_cmds_params.append(value_cmd)
+            else:
+                logger.debug(f'Skipping fitting as {pvalues_done_path} found')
+                num_of_expected_results += 1
+
+            if not os.path.exists(hits_done_path):
+                all_cmds_params.append(hits_cmd)
+            else:
+                logger.debug(f'Skipping fitting as {hits_done_path} found')
+                num_of_expected_results += 1
+
+        if len(all_cmds_params) > 0:
+            doubled_bc = repeat_items(biological_conditions)
+            for cmds_params, bc in zip(all_cmds_params, doubled_bc):
+                cmd = submit_pipeline_step(f'{src_dir}/model_fitting/{script_name}',
+                                    [cmds_params],
+                                    logs_dir, f'{bc}_model',
+                                    queue_name, verbose)
+                num_of_expected_results += 1  # one job per submitted command (values or hits table)
+
+            wait_for_results(script_name, logs_dir, num_of_expected_results, example_cmd=cmd,
+                            error_file_path=error_path, suffix='_done_fitting.txt')
         else:
-            logger.debug(f'Skipping fitting as {hits_done_path} found')
-            num_of_expected_results += 1
-
-    if len(all_cmds_params) > 0:
-        doubled_bc = repeat_items(biological_conditions)
-        for cmds_params, bc in zip(all_cmds_params, doubled_bc):
-            cmd = submit_pipeline_step(f'{src_dir}/model_fitting/{script_name}',
-                                [cmds_params],
-                                logs_dir, f'{bc}_model',
-                                queue_name, verbose)
-            num_of_expected_results += 1  # a single job for each biological condition
-
-        wait_for_results(script_name, logs_dir, num_of_expected_results, example_cmd=cmd,
-                        error_file_path=error_path, suffix='_done_fitting.txt')
+            logger.info('Skipping fitting, all found')
     else:
-        logger.info(f'Skipping fitting, all found')
-
+        # stop_random_forest is set: skip model fitting altogether
+        logger.info('Stop before random forest')
 
     # TODO: fix this bug with a GENERAL WRAPPER done_path
     # wait_for_results(script_name, num_of_expected_results)
@@ -217,12 +223,17 @@ def get_faa_file_name_from_path(path, use_mapitope):
     parser.add_argument('number_of_random_pssms', default=100, type=int, help='Number of pssm permutations')
     parser.add_argument('done_file_path', help='A path to a file that signals that the module finished running successfully.')
 
+    parser.add_argument('--stop_random_forest', action='store_true', help='Skip the random forest fitting step')
+    parser.add_argument('--num_of_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
+    parser.add_argument('--number_parallel_random_forest', default=20, type=int, help='How many random forest to run in parallel')
+    parser.add_argument('--min_value_error_random_forest', default=0, type=float, help='A min value for error that the run can stop in the random forest')
     parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='pval', help='Motifs ranking method')
     parser.add_argument('--tfidf_method', choices=['boolean', 'terms', 'log', 'augmented'], default='boolean', help='TF-IDF method')
     parser.add_argument('--tfidf_factor', type=float, default=0.5, help='TF-IDF augmented method factor (0-1)')
     parser.add_argument('--shuffles', default=5, type=int, help='Number of controlled shuffles permutations')
     parser.add_argument('--shuffles_percent', default=0.2, type=float, help='Percent from shuffle with greatest number of hits (0-1)')
     parser.add_argument('--shuffles_digits', default=2, type=int, help='Number of digits after the point to print in scanning files.')
+    parser.add_argument('--seed_random_forest_classifier', default=123, type=int, help='A number for create the random forest stable when run the same configuration')  # keyword must be spelled `default`
     parser.add_argument('--error_path', type=str, help='a file in which errors will be written to')
     parser.add_argument('-q', '--queue', default='pupkoweb', type=str, help='a queue to which the jobs will be submitted')
     parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
@@ -239,6 +250,8 @@ def get_faa_file_name_from_path(path, use_mapitope):
     error_path = args.error_path if args.error_path else os.path.join(args.parsed_fastq_results, 'error.txt')
 
     build_classifier(args.parsed_fastq_results, args.motif_inference_results, args.classification_output_path,
-                     args.logs_dir, args.samplename2biologicalcondition_path, args.done_file_path,
-                     args.number_of_random_pssms, args.rank_method, args.tfidf_method, args.tfidf_factor, 
-                     args.shuffles, args.shuffles_percent, args.shuffles_digits, args.queue, True if args.verbose else False, error_path, args.mapitope, sys.argv)
+                     args.logs_dir, args.samplename2biologicalcondition_path, args.number_of_random_pssms, args.done_file_path,
+                     args.stop_random_forest, args.num_of_configurations_to_sample,  # store_true flags are already bool
+                     args.number_parallel_random_forest, args.min_value_error_random_forest, args.rank_method,
+                     args.tfidf_method, args.tfidf_factor, args.shuffles, args.shuffles_percent, args.shuffles_digits, args.seed_random_forest_classifier,
+                     args.queue, args.verbose, error_path, args.mapitope, sys.argv)