Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

create share function for calculate the log scale #26

Merged
merged 2 commits into from
May 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions auxiliaries/pipeline_auxiliaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from subprocess import call, run, Popen, PIPE
from time import time, sleep
import global_params
import numpy as np
import logging

logger = logging.getLogger('main')
Expand Down Expand Up @@ -365,3 +366,14 @@ def count_memes(path):
count = int(output) if p_status == 0 else 0
print(f'Found {count} memes in {path}')
return count


def log_scale(df, rank_method):
# The options of ranking method are - hits, controlled_shuffles, pval, tfidf
if rank_method == 'hits':
df = np.log2(df + 1)
if rank_method == 'pval':
df = 1-df
if rank_method == 'tfidf':
df = -np.log2(df + 0.0001) # avoid 0
return df
14 changes: 6 additions & 8 deletions model_fitting/module_wraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,9 @@ def build_classifier(first_phase_output_path, motif_inference_output_path,
aggregated_hits_path = os.path.join(classification_output_path, bc, f'{bc}_hits.csv')
hits_done_path = os.path.join(logs_dir, f'{bc}_hits_done_fitting.txt')

value_cmd = [aggregated_values_path, pvalues_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}']
hits_cmd = [aggregated_hits_path, hits_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}']
if rank_method == 'tfidf' or rank_method == 'shuffles':
value_cmd.append('--tfidf')
hits_cmd.append('--tfidf')
value_cmd = [aggregated_values_path, pvalues_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}', f'--rank_method {rank_method}']
hits_cmd = [aggregated_hits_path, hits_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}', '--rank_method hits']


if not os.path.exists(pvalues_done_path):
all_cmds_params.append(value_cmd)
Expand Down Expand Up @@ -218,15 +216,15 @@ def get_faa_file_name_from_path(path, use_mapitope):
parser.add_argument('number_of_random_pssms', default=100, type=int, help='Number of pssm permutations')
parser.add_argument('done_file_path', help='A path to a file that signals that the module finished running successfully.')

parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='pval', help='Motifs ranking method')
parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='shuffles', help='Motifs ranking method')
parser.add_argument('--tfidf_method', choices=['boolean', 'terms', 'log', 'augmented'], default='boolean', help='TF-IDF method')
parser.add_argument('--tfidf_factor', type=float, default=0.5, help='TF-IDF augmented method factor (0-1)')
parser.add_argument('--shuffles', default=5, type=int, help='Number of controlled shuffles permutations')
parser.add_argument('--shuffles_percent', default=0.2, type=float, help='Percent from shuffle with greatest number of hits (0-1)')
parser.add_argument('--shuffles_digits', default=2, type=int, help='Number of digits after the point to print in scanning files.')
parser.add_argument('--num_of_random_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
parser.add_argument('--cv_num_of_splits', default=2, help='How folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed_random_forest', default=42, help='Seed number for reconstructing experiments')
parser.add_argument('--cv_num_of_splits', default=2, type=int, help='How folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed_random_forest', default=42, type=int, help='Seed number for reconstructing experiments')
parser.add_argument('--error_path', type=str, help='a file in which errors will be written to')
parser.add_argument('-q', '--queue', default='pupkoweb', type=str, help='a queue to which the jobs will be submitted')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
Expand Down
63 changes: 21 additions & 42 deletions model_fitting/random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import plot_roc_curve

if os.path.exists('/groups/pupko/orenavr2/'):
src_dir = '/groups/pupko/orenavr2/igomeProfilingPipeline/src'
elif os.path.exists('/Users/Oren/Dropbox/Projects/'):
src_dir = '/Users/Oren/Dropbox/Projects/gershoni/src'
else:
src_dir = '.'
sys.path.insert(0, src_dir)

from auxiliaries.pipeline_auxiliaries import log_scale
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')
Expand Down Expand Up @@ -87,18 +96,9 @@ def sample_configurations(hyperparameters_grid, num_of_configurations_to_sample,
return configurations


def generate_heat_map(df, number_of_features, hits_data, number_of_samples, use_tfidf, output_path):
# plt.figure(dpi=1000)
# transform the data for better contrast in the visualization
def generate_heat_map(df, number_of_features, rank_method, number_of_samples, output_path):

train_data = np.log2(df+1) if hits_data else df

#if hits_data: # hits data
# train_data = np.log2(df+1) # pseudo counts
#elif use_tfidf: # tfidf data
# train_data = -np.log2(df+0.0001) # avoid 0
#else: # p-values data
# train_data = -np.log2(df)
train_data = log_scale(df, rank_method)

cm = sns.clustermap(train_data, cmap="Blues", col_cluster=False, yticklabels=True)
plt.setp(cm.ax_heatmap.yaxis.get_majorticklabels(), fontsize=150/number_of_samples)
Expand Down Expand Up @@ -139,7 +139,7 @@ def generate_roc_curve(X, y, classifier, number_of_features, output_path):
plt.close()


def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_path, cv_num_of_splits):
def train(rf, X, y, feature_names, sample_names, rank_method, output_path, cv_num_of_splits):
original_feature_names = feature_names[:]
original_X = X[:]
logger.debug('\n'+'#'*100 + f'\nTrue labels:\n{y.tolist()}\n' + '#'*100 + '\n')
Expand Down Expand Up @@ -177,10 +177,6 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
# save previous cv_avg_error_rate to make sure the performances do not deteriorate
previous_cv_avg_error_rate = cv_avg_error_rate

# predictions = model.predict(X)
# logger.info(f'Full model error rate is {1 - (predictions == y).mean()}')
# logger.info(f'Current model\'s predictions\n{predictions.tolist()}')

# compute current model accuracy for each fold of the cross validation
cv_score = cross_val_score(model, X, y, cv=StratifiedKFold(cv_num_of_splits))

Expand All @@ -197,7 +193,7 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
# save current model (unsorted) features to a csv file
df = save_model_features(original_X, features_indexes_to_keep, feature_names, sample_names, f'{output_path}/Top_{number_of_features}_features')

generate_heat_map(df, number_of_features, hits_data, number_of_samples, use_tfidf, f'{output_path}/{number_of_features}')
generate_heat_map(df, number_of_features, rank_method, number_of_samples, f'{output_path}/{number_of_features}')

generate_roc_curve(X, y, rf, number_of_features, output_path)

Expand Down Expand Up @@ -229,7 +225,7 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
return cv_avg_error_rates, number_of_features_per_model


def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_tfidf, cv_num_of_splits, seed, argv):
def train_models(csv_file_path, done_path, num_of_configurations_to_sample, rank_method, cv_num_of_splits, seed, argv):
logging.info('Preparing output path...')
csv_folder, csv_file_name = os.path.split(csv_file_path)
csv_file_prefix = os.path.splitext(csv_file_name)[0] # without extension
Expand All @@ -242,7 +238,6 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_
feature_selection_summary_path = f'{output_path}/feature_selection_summary.txt'

logging.info('Parsing data...')
is_hits_data = 'hits' in os.path.split(csv_file_path)[-1] # Does the file name contain "hits"?

X_train, y_train, X_test, y_test, feature_names, sample_names_train, sample_names_test = parse_data(csv_file_path)

Expand All @@ -251,21 +246,12 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_
perfect_feature_names, perfect_feature_indexes = measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits)
if perfect_feature_names:
df = save_model_features(X_train, perfect_feature_indexes, perfect_feature_names, sample_names_train, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], is_hits_data, df.shape[0], use_tfidf, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], rank_method, df.shape[0], f'{output_path}/perfect_feature_names')
else:
# touch a file so we can see that there were no perfect features
with open(f'{output_path}/perfect_feature_names', 'w') as f:
pass

# sanity check
# perfect_feature_names =['PMMILDLDHSQV', 'YASTIVVDLDHT', 'CDAALHVIAALD', 'CVSASMDLDHLE', 'CSVELCNISGYG', 'SSQIMFHVYN', 'CPVSPQSLDAPC', 'MSWIFYALDPYE', 'SSHISLLMSV', 'ATLKHYLIQDGI', 'CTWGGPGFDLFC', 'CPFDSAGFDPYC', 'CTAAFISGHYPD', 'SSFFYMQAYV', 'TPIETISDSVYK', 'LDSLHIHYLVSF', 'CHISHVQDLPYL', 'WLWYLASDGI', 'SINHGLLAYI', 'CIALLKEGEPFS', 'CSADNLNLYC', 'ACDPYDCRHLYS', 'WLAVLYADMVFY', 'WHWSLLQDTMYH', 'GTSSFPYSHLWA', 'CALGWLLDLDHC', 'CQAFIDPYEC', 'SLSQNYSLLMII', 'NGYDYFGLRF', 'GDPYEILVLSRC', 'IESGDPYEARFS', 'GYDLLRDRYI', 'PINDLDADGI', 'SAHIWLVTSI', 'CLRLHLASLTQC', 'SRMDFRYLLE', 'CPHSLIMLLGDL', 'WPVHLEHTNI', 'GPYELHYEDK', 'CRFGAPGFDVQC', 'LLSLVLMDSMYF', 'APTCFRGTDC', 'TWRSGNASGSP', 'DGYFAHFWSVLQ', 'CAVLLDPYEC', 'CIARKDMQATQN', 'CMVWLDNVLSLC', 'PFEALELMRT']
# for feature in perfect_feature_names:
# rf = RandomForestClassifier(**{'n_estimators': 2000, 'max_features': 'auto', 'max_depth': 70, 'min_samples_split': 5, 'min_samples_leaf': 1, 'bootstrap': True, 'n_jobs': -1}).fit(df[feature].as_matrix().reshape(-1, 1), y_train)
# rf = RandomForestClassifier(**configuration).fit(df[feature].as_matrix().reshape(-1, 1), y_train)
# print(','.join(rf.predict(df[feature].as_matrix().reshape(-1, 1)).tolist()))
# print(','.join(y_train.as_matrix().tolist()))

# feature selection analysis
logging.info('\nApplying feature selection analysis...')
if cv_num_of_splits < 2:
logging.info('Number of CV folds is less than 2. '
Expand All @@ -292,7 +278,7 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_

logging.info(f'Configuration #{i} hyper-parameters are:\n{configuration}')
rf = RandomForestClassifier(**configuration)
errors, features = train(rf, X_train, y_train, feature_names, sample_names_train, is_hits_data, use_tfidf, output_path_i, cv_num_of_splits)
errors, features = train(rf, X_train, y_train, feature_names, sample_names_train, rank_method, output_path_i, cv_num_of_splits)
# model.predict(X_test, y_test)

plot_error_rate(errors, features, cv_num_of_splits, output_path_i)
Expand Down Expand Up @@ -331,22 +317,15 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_


def measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits):
feature_to_avg_accuracy = {}
# df = pd.read_csv(f'{output_path}/Top_149_features.csv', index_col='sample_name')
feature_to_avg_accuracy = {}
rf = RandomForestClassifier(random_state=np.random.seed(seed))

for i, feature in enumerate(feature_names):
# if i % 10 == 0:
logger.info(f'Checking feature {feature} number {i}')
# assert df.columns[i] == feature
cv_score = cross_val_score(rf, X_train[:, i].reshape(-1, 1), y_train, cv=StratifiedKFold(cv_num_of_splits, shuffle=True)).mean()
if cv_score == 1:
logger.info('-' * 10 + f'{feature} has 100% accuracy!' + '-' * 10)
# print(X_train[:, i].reshape(-1, 1).tolist()[:8] + X_train[:, i].reshape(-1, 1).tolist()[12:])
# print(f'min of other class: {min(X_train[:, i].reshape(-1, 1).tolist()[:8] + X_train[:, i].reshape(-1, 1).tolist()[12:])}')
# print(X_train[:, i].reshape(-1, 1).tolist()[8:12])
# # print(y_train.tolist())
# print('Accuracy is 1')
feature_to_avg_accuracy[feature] = cv_score

with open(f'{output_path}/single_feature_accuracy.txt', 'w') as f:
Expand Down Expand Up @@ -384,9 +363,9 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
parser.add_argument('done_file_path', help='A path to a file that signals that the script finished running successfully.')
# parser.add_argument('n_splits', type=int, default=2, help='How manyfolds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--num_of_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
parser.add_argument('--tfidf', action='store_true', help="Are inputs from TF-IDF (avoid log(0))")
parser.add_argument('--cv_num_of_splits', default=2, help='How folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed', default=42, help='Seed number for reconstructing experiments')
parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles', 'hits'], default='hits', help='Motifs ranking method')
parser.add_argument('--cv_num_of_splits', default=2, type=int, help='How folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed', default=42, type=int, help='Seed number for reconstructing experiments')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
args = parser.parse_args()

Expand All @@ -396,4 +375,4 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')

train_models(args.data_path, args.done_file_path, args.num_of_configurations_to_sample, args.tfidf, args.cv_num_of_splits, args.seed, argv=sys.argv)
train_models(args.data_path, args.done_file_path, args.num_of_configurations_to_sample, args.rank_method, args.cv_num_of_splits, args.seed, argv=sys.argv)