Commit
Merge branch 'reads_filtration_change_seq_length' into RF_parallel
yael1994 authored May 19, 2021
2 parents 63112a8 + 0417e48 commit 9525e37
Showing 4 changed files with 50 additions and 23 deletions.
12 changes: 12 additions & 0 deletions auxiliaries/pipeline_auxiliaries.py
@@ -3,6 +3,7 @@
from subprocess import call, run, Popen, PIPE
from time import time, sleep
import global_params
import numpy as np
import logging

logger = logging.getLogger('main')
@@ -365,3 +366,14 @@ def count_memes(path):
count = int(output) if p_status == 0 else 0
print(f'Found {count} memes in {path}')
return count


def log_scale(df, rank_method):
    # Rank method options: hits, controlled_shuffles, pval, tfidf.
    if rank_method == 'hits':
        df = np.log2(df + 1)
    if rank_method == 'pval':
        df = 1 - df
    if rank_method == 'tfidf':
        df = -np.log2(df + 0.0001)  # small offset avoids log2(0)
    # 'controlled_shuffles' data is returned unchanged.
    return df
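
For context, a small sketch of what the new log_scale helper does for each rank method, assuming pandas is available and the repository root is on PYTHONPATH; the DataFrame and its values are invented for illustration:

```python
import pandas as pd
from auxiliaries.pipeline_auxiliaries import log_scale

# Toy matrix standing in for the motif-significance table (values invented).
df = pd.DataFrame({'motif_1': [0.0, 3.0], 'motif_2': [1.0, 0.5]})

hits_scaled = log_scale(df, 'hits')      # log2(x + 1) compresses raw hit counts
pval_scaled = log_scale(df, 'pval')      # 1 - p, so larger means more significant
tfidf_scaled = log_scale(df, 'tfidf')    # -log2(x + 0.0001); the offset avoids log2(0)
shuffled = log_scale(df, 'controlled_shuffles')  # no branch matches: returned unchanged
```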
19 changes: 15 additions & 4 deletions hits_cpp/shufflesGenerator.hpp
@@ -1,13 +1,13 @@
#pragma once

#include "meme.hpp"

#include <algorithm>
using namespace std;

class MemeShuffler {
public:
MemeShuffler(Meme& source, int maxShuffles, ShufflePatterns& patterns) :
_source(source), _patterns(patterns), _currentShuffle(-1) {
_source(source), _patterns(patterns), _currentShuffle(-1), _lenPatterns(patterns.size()), _numPattern(-1) {
auto len = patterns.size();
if (len < maxShuffles) {
maxShuffles = len;
@@ -17,20 +17,29 @@ class MemeShuffler {

    bool next() {
        _currentShuffle++;
        _numPattern++;
        return hasNext();
    }

    Meme generate() {
        auto meme = Meme(this->_source);
        meme.getRows().clear();

        auto pattern = this->_patterns[this->_currentShuffle];
        auto pattern = this->_patterns[this->_numPattern];
        auto iter = pattern.begin();
        auto end = pattern.end();
        while (iter != end) {
            meme.getRows().push_back(this->_source.getRows()[*iter]);
            iter++;
        }
        // Reject a shuffle that leaves any row's consensus (argmax position)
        // unchanged, as long as unused patterns remain to try instead:
        for (int i = 0; i < this->_source.getRows().size(); i++) {
            auto max_source = max_element(this->_source.getRows()[i].begin(), this->_source.getRows()[i].end()) - this->_source.getRows()[i].begin();
            auto max_shuffle = max_element(meme.getRows()[i].begin(), meme.getRows()[i].end()) - meme.getRows()[i].begin();
            if (max_source == max_shuffle && (this->_lenPatterns - this->_numPattern) > (this->_maxShuffles - this->_currentShuffle)) {
                _numPattern++;
                return this->generate();
            }
        }
        return meme;
    }

@@ -46,6 +55,8 @@ class MemeShuffler {
ShufflePatterns _patterns; //TODO pointer?
int _maxShuffles;
int _currentShuffle;
int _lenPatterns;
int _numPattern;
};

MemeShuffler getShuffler(Meme& meme, int maxPatterns);
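
The new guard in MemeShuffler::generate() computes, for every row, the index of the maximal element (the consensus position) in both the source meme and the shuffled meme; if any row keeps its consensus position and enough unused patterns remain (_lenPatterns - _numPattern > _maxShuffles - _currentShuffle), the shuffle is discarded and the next pattern is tried recursively. A rough Python sketch of the rejection test, assuming rows are plain lists of column weights (keeps_any_consensus is a hypothetical helper, not part of the commit):

```python
import numpy as np

def keeps_any_consensus(source_rows, shuffled_rows):
    """True if any row index keeps the same argmax (consensus position)
    after shuffling; mirrors the rejection test in MemeShuffler::generate()."""
    return any(int(np.argmax(src)) == int(np.argmax(shuf))
               for src, shuf in zip(source_rows, shuffled_rows))
```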
16 changes: 7 additions & 9 deletions model_fitting/module_wraper.py
@@ -157,13 +157,13 @@ def build_classifier(first_phase_output_path, motif_inference_output_path,
pvalues_done_path = os.path.join(logs_dir, f'{bc}_values_done_fitting.txt')
aggregated_hits_path = os.path.join(classification_output_path, bc, f'{bc}_hits.csv')
hits_done_path = os.path.join(logs_dir, f'{bc}_hits_done_fitting.txt')

value_cmd = [aggregated_values_path, pvalues_done_path, logs_dir, error_path, '--num_of_configurations_to_sample', num_of_random_configurations_to_sample, f'--cv_num_of_splits {cv_num_of_splits}',
'--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', num_of_random_configurations_to_sample, '--random_forest_seed', random_forest_seed,
f'--random_forest_seed {random_forest_seed_configurations}']
'--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', min_value_error_random_forest, '--seed', random_forest_seed,
f'--random_forest_seed {random_forest_seed_configurations}', f'--rank_method {rank_method}']
hits_cmd = [aggregated_hits_path, hits_done_path, logs_dir, error_path, '--num_of_configurations_to_sample', num_of_configurations_to_sample, f'--cv_num_of_splits {cv_num_of_splits}',
'--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', min_value_error_random_forest, '--random_forest_seed', random_forest_seed,
f'--random_forest_seed {random_forest_seed_configurations}']
'--number_parallel_random_forest', number_parallel_random_forest, '--min_value_error_random_forest', min_value_error_random_forest, '--seed', random_forest_seed,
f'--random_forest_seed {random_forest_seed_configurations}', '--rank_method hits']
if rank_method == 'tfidf' or rank_method == 'shuffles':
value_cmd.append('--tfidf')
hits_cmd.append('--tfidf')
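
A pitfall worth knowing when maintaining these long literal argument lists: adjacent Python string literals are concatenated implicitly, so a missing comma silently fuses two flags into one malformed token instead of raising a SyntaxError. A minimal, self-contained illustration (not from the repository):

```python
# Without the comma, the two literals merge into a single list element.
cmd = ['--cv_num_of_splits 2'
       '--number_parallel_random_forest']
assert cmd == ['--cv_num_of_splits 2--number_parallel_random_forest']
```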
@@ -190,8 +190,6 @@

wait_for_results(script_name, logs_dir, num_of_expected_results, example_cmd=cmd,
error_file_path=error_path, suffix='_done_fitting.txt')
else:
logger.info(f'Skipping fitting, all found')
else:
print('stop before')
logger.info(f'Stop before random forest')
@@ -234,8 +232,8 @@ def get_faa_file_name_from_path(path, use_mapitope):
parser.add_argument('--shuffles', default=5, type=int, help='Number of controlled shuffles permutations')
parser.add_argument('--shuffles_percent', default=0.2, type=float, help='Percent from shuffle with greatest number of hits (0-1)')
parser.add_argument('--shuffles_digits', default=2, type=int, help='Number of digits after the point to print in scanning files.')
parser.add_argument('--cv_num_of_splits', default=2, help='How folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed_random_forest', default=42, help='Seed number for reconstructing experiments')
parser.add_argument('--cv_num_of_splits', default=2, type=int, help='How many folds should the cross-validation use? (0 for leave-one-out)')
parser.add_argument('--seed_random_forest', default=42, type=int, help='Seed for reproducing experiments')
parser.add_argument('--random_forest_seed_configurations', default=123 , type=int, help='Random seed value for generating random forest configurations')
parser.add_argument('--error_path', type=str, help='a file in which errors will be written to')
parser.add_argument('-q', '--queue', default='pupkoweb', type=str, help='a queue to which the jobs will be submitted')
26 changes: 16 additions & 10 deletions model_fitting/random_forest.py
@@ -10,9 +10,11 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import plot_roc_curve

import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')

if os.path.exists('/groups/pupko/orenavr2/'):
src_dir = '/groups/pupko/orenavr2/igomeProfilingPipeline/src'
elif os.path.exists('/Users/Oren/Dropbox/Projects/'):
@@ -22,6 +24,7 @@
sys.path.insert(0, src_dir)

from auxiliaries.pipeline_auxiliaries import submit_pipeline_step, wait_for_results
from auxiliaries.pipeline_auxiliaries import log_scale


def parse_data(file_path):
@@ -95,8 +98,10 @@ def sample_configurations(hyperparameters_grid, num_of_configurations_to_sample,
return configurations


def generate_heat_map(df, number_of_features, hits_data, number_of_samples, use_tfidf, output_path):
train_data = np.log2(df+1) if hits_data else df

def generate_heat_map(df, number_of_features, rank_method, number_of_samples, output_path):

train_data = log_scale(df, rank_method)
cm = sns.clustermap(train_data, cmap="Blues", col_cluster=False, yticklabels=True)
plt.setp(cm.ax_heatmap.yaxis.get_majorticklabels(), fontsize=150/number_of_samples)
cm.ax_heatmap.set_title(f"A heat-map of the significance of the top {number_of_features} discriminatory motifs")
@@ -114,6 +119,7 @@ def save_model_features(X, feature_indexes, feature_names, sample_names, output_
return df



def write_results_feature_selection_summary(feature_selection_summary_path, path_dir):
#feature_selection_summary_f = open(feature_selection_summary_path, 'a')
models = sorted([x[0] for x in os.walk(path_dir)])
@@ -128,8 +134,7 @@ def write_results_feature_selection_summary(feature_selection_summary_path, path
os.remove(path_file)



def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configurations_to_sample, number_parallel_random_forest, min_value_error,use_tfidf, cv_num_of_splits, seed, random_forest_seed, argv):
def train_models(csv_file_path, done_path, logs_dir, error_path, num_of_configurations_to_sample, number_parallel_random_forest, min_value_error, rank_method, cv_num_of_splits, seed, random_forest_seed, argv):
logging.info('Preparing output path...')
csv_folder, csv_file_name = os.path.split(csv_file_path)
csv_file_prefix = os.path.splitext(csv_file_name)[0] # without extension
@@ -141,7 +146,6 @@ def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configura
feature_selection_summary_path = f'{output_path}/feature_selection_summary.txt'

logging.info('Parsing data...')
is_hits_data = 'hits' in os.path.split(csv_file_path)[-1] # Does the file name contain "hits"?

X_train, y_train, X_test, y_test, feature_names, sample_names_train, sample_names_test = parse_data(csv_file_path)

@@ -150,7 +154,7 @@ def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configura
perfect_feature_names, perfect_feature_indexes = measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits)
if perfect_feature_names:
df = save_model_features(X_train, perfect_feature_indexes, perfect_feature_names, sample_names_train, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], is_hits_data, df.shape[0], use_tfidf, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], rank_method, df.shape[0], f'{output_path}/perfect_feature_names')
else:
# touch a file so we can see that there were no perfect features
with open(f'{output_path}/perfect_feature_names', 'w') as f:
@@ -219,7 +223,7 @@ def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configura
if stop:
break
else:
logger.info(f'Skipping random forest train, all found')
logger.info(f'Skipping random forest train, all found')

feature_selection_summary_f.close()

@@ -248,7 +252,8 @@ def train_models(csv_file_path, done_path, logs_dir,error_path, num_of_configura


def measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits):
feature_to_avg_accuracy = {}

feature_to_avg_accuracy = {}
rf = RandomForestClassifier(random_state=np.random.seed(seed))

for i, feature in enumerate(feature_names):
@@ -294,13 +299,14 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
parser.add_argument('done_file_path', help='A path to a file that signals that the script finished running successfully.')
parser.add_argument('logs_dir', help='A path for the log dir')
parser.add_argument('error_path', help='Path for error file')

parser.add_argument('--num_of_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
parser.add_argument('--number_parallel_random_forest', default=20, type=int, help='How many random forest configurations to run in parallel')
parser.add_argument('--min_value_error_random_forest', default=0, type=float, help='A random forest model error value for convergence allowing to stop early')
parser.add_argument('--tfidf', action='store_true', help="Are inputs from TF-IDF (avoid log(0))")
parser.add_argument('--cv_num_of_splits', default=2, help='How many folds should the cross-validation use? (0 for leave-one-out)')
parser.add_argument('--seed', default=42, help='Seed for reproducing experiments')
parser.add_argument('--random_forest_seed', default=123 , type=int, help='Random seed value for generating random forest configurations')
parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles', 'hits'], default='hits', help='Motifs ranking method')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
args = parser.parse_args()
import logging
@@ -311,4 +317,4 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
logger = logging.getLogger('main')

train_models(args.data_path, args.done_file_path, args.logs_dir, args.error_path, args.num_of_configurations_to_sample, args.number_parallel_random_forest,
args.min_value_error_random_forest, args.tfidf, args.cv_num_of_splits, args.seed, args.random_forest_seed, argv=sys.argv)
args.min_value_error_random_forest, args.rank_method, args.cv_num_of_splits, args.seed, args.random_forest_seed, argv=sys.argv)
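
Given the argparse interface above, a run of random_forest.py might be launched as sketched below; a hypothetical example only, with placeholder file names, and the first positional argument inferred from args.data_path:

```python
# Hypothetical invocation mirroring the argparse interface above.
import subprocess

subprocess.run([
    'python', 'model_fitting/random_forest.py',
    'exp_values.csv',   # data_path (inferred from args.data_path)
    'exp_done.txt',     # done_file_path
    'logs/',            # logs_dir
    'logs/error.txt',   # error_path
    '--num_of_configurations_to_sample', '50',
    '--cv_num_of_splits', '5',
    '--seed', '42',
    '--random_forest_seed', '123',
    '--rank_method', 'tfidf',
], check=True)
```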
