Merge pull request #26 from Webiks/log_scale_data
Create shared function to calculate the log scale
shacharmo authored May 19, 2021
2 parents 1a3c5bc + 26e3739 commit 0417e48
Showing 3 changed files with 39 additions and 50 deletions.
12 changes: 12 additions & 0 deletions auxiliaries/pipeline_auxiliaries.py
@@ -3,6 +3,7 @@
from subprocess import call, run, Popen, PIPE
from time import time, sleep
import global_params
import numpy as np
import logging

logger = logging.getLogger('main')
@@ -365,3 +366,14 @@ def count_memes(path):
count = int(output) if p_status == 0 else 0
print(f'Found {count} memes in {path}')
return count


def log_scale(df, rank_method):
    # Ranking method options: hits, shuffles (controlled shuffles), pval, tfidf
    if rank_method == 'hits':
        df = np.log2(df + 1)  # pseudo count keeps zeros defined
    if rank_method == 'pval':
        df = 1 - df
    if rank_method == 'tfidf':
        df = -np.log2(df + 0.0001)  # avoid log(0)
    return df
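A minimal usage sketch of the new helper (the toy DataFrame and values below are illustrative, not pipeline data):

import pandas as pd
from auxiliaries.pipeline_auxiliaries import log_scale

# hypothetical motif-by-sample matrix
df = pd.DataFrame({'motif_a': [0, 3, 7], 'motif_b': [1, 0, 15]})

log_scale(df, 'hits')      # log2(x + 1): compresses large hit counts, zeros stay 0
log_scale(df, 'pval')      # 1 - x: flips p-values so smaller p-values score higher
log_scale(df, 'tfidf')     # -log2(x + 0.0001): the epsilon avoids log(0)
log_scale(df, 'shuffles')  # returned unchanged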
14 changes: 6 additions & 8 deletions model_fitting/module_wraper.py
@@ -157,11 +157,9 @@ def build_classifier(first_phase_output_path, motif_inference_output_path,
aggregated_hits_path = os.path.join(classification_output_path, bc, f'{bc}_hits.csv')
hits_done_path = os.path.join(logs_dir, f'{bc}_hits_done_fitting.txt')

value_cmd = [aggregated_values_path, pvalues_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}']
hits_cmd = [aggregated_hits_path, hits_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}']
if rank_method == 'tfidf' or rank_method == 'shuffles':
value_cmd.append('--tfidf')
hits_cmd.append('--tfidf')
value_cmd = [aggregated_values_path, pvalues_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}', f'--rank_method {rank_method}']
hits_cmd = [aggregated_hits_path, hits_done_path, f'--num_of_configurations_to_sample {num_of_random_configurations_to_sample}', f'--cv_num_of_splits {cv_num_of_splits}', f'--seed {seed_random_forest}', '--rank_method hits']


if not os.path.exists(pvalues_done_path):
all_cmds_params.append(value_cmd)
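With this change, the ranking method is forwarded to random_forest.py via --rank_method instead of the old --tfidf switch. A sketch of what an assembled value_cmd might look like (the path placeholders and parameter values are illustrative, assuming the default argument values shown below):

value_cmd = ['<aggregated_values_path>',
             '<pvalues_done_path>',
             '--num_of_configurations_to_sample 100',
             '--cv_num_of_splits 2',
             '--seed 42',
             '--rank_method tfidf']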
@@ -218,15 +216,15 @@ def get_faa_file_name_from_path(path, use_mapitope):
parser.add_argument('number_of_random_pssms', default=100, type=int, help='Number of pssm permutations')
parser.add_argument('done_file_path', help='A path to a file that signals that the module finished running successfully.')

parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='pval', help='Motifs ranking method')
parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles'], default='shuffles', help='Motifs ranking method')
parser.add_argument('--tfidf_method', choices=['boolean', 'terms', 'log', 'augmented'], default='boolean', help='TF-IDF method')
parser.add_argument('--tfidf_factor', type=float, default=0.5, help='TF-IDF augmented method factor (0-1)')
parser.add_argument('--shuffles', default=5, type=int, help='Number of controlled shuffles permutations')
parser.add_argument('--shuffles_percent', default=0.2, type=float, help='Percent from shuffle with greatest number of hits (0-1)')
parser.add_argument('--shuffles_digits', default=2, type=int, help='Number of digits after the point to print in scanning files.')
parser.add_argument('--num_of_random_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
parser.add_argument('--cv_num_of_splits', default=2, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed_random_forest', default=42, help='Seed number for reconstructing experiments')
parser.add_argument('--cv_num_of_splits', default=2, type=int, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed_random_forest', default=42, type=int, help='Seed number for reconstructing experiments')
parser.add_argument('--error_path', type=str, help='a file in which errors will be written to')
parser.add_argument('-q', '--queue', default='pupkoweb', type=str, help='a queue to which the jobs will be submitted')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
63 changes: 21 additions & 42 deletions model_fitting/random_forest.py
@@ -10,6 +10,15 @@
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate
from sklearn.metrics import plot_roc_curve

if os.path.exists('/groups/pupko/orenavr2/'):
    src_dir = '/groups/pupko/orenavr2/igomeProfilingPipeline/src'
elif os.path.exists('/Users/Oren/Dropbox/Projects/'):
    src_dir = '/Users/Oren/Dropbox/Projects/gershoni/src'
else:
    src_dir = '.'
sys.path.insert(0, src_dir)

from auxiliaries.pipeline_auxiliaries import log_scale
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')
@@ -87,18 +96,9 @@ def sample_configurations(hyperparameters_grid, num_of_configurations_to_sample,
return configurations


def generate_heat_map(df, number_of_features, hits_data, number_of_samples, use_tfidf, output_path):
# plt.figure(dpi=1000)
# transform the data for better contrast in the visualization
def generate_heat_map(df, number_of_features, rank_method, number_of_samples, output_path):

train_data = np.log2(df+1) if hits_data else df

#if hits_data: # hits data
# train_data = np.log2(df+1) # pseudo counts
#elif use_tfidf: # tfidf data
# train_data = -np.log2(df+0.0001) # avoid 0
#else: # p-values data
# train_data = -np.log2(df)
train_data = log_scale(df, rank_method)

cm = sns.clustermap(train_data, cmap="Blues", col_cluster=False, yticklabels=True)
plt.setp(cm.ax_heatmap.yaxis.get_majorticklabels(), fontsize=150/number_of_samples)
@@ -139,7 +139,7 @@ def generate_roc_curve(X, y, classifier, number_of_features, output_path):
plt.close()


def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_path, cv_num_of_splits):
def train(rf, X, y, feature_names, sample_names, rank_method, output_path, cv_num_of_splits):
original_feature_names = feature_names[:]
original_X = X[:]
logger.debug('\n'+'#'*100 + f'\nTrue labels:\n{y.tolist()}\n' + '#'*100 + '\n')
@@ -177,10 +177,6 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
# save previous cv_avg_error_rate to make sure the performances do not deteriorate
previous_cv_avg_error_rate = cv_avg_error_rate

# predictions = model.predict(X)
# logger.info(f'Full model error rate is {1 - (predictions == y).mean()}')
# logger.info(f'Current model\'s predictions\n{predictions.tolist()}')

# compute current model accuracy for each fold of the cross validation
cv_score = cross_val_score(model, X, y, cv=StratifiedKFold(cv_num_of_splits))

@@ -197,7 +193,7 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
# save current model (unsorted) features to a csv file
df = save_model_features(original_X, features_indexes_to_keep, feature_names, sample_names, f'{output_path}/Top_{number_of_features}_features')

generate_heat_map(df, number_of_features, hits_data, number_of_samples, use_tfidf, f'{output_path}/{number_of_features}')
generate_heat_map(df, number_of_features, rank_method, number_of_samples, f'{output_path}/{number_of_features}')

generate_roc_curve(X, y, rf, number_of_features, output_path)

@@ -229,7 +225,7 @@ def train(rf, X, y, feature_names, sample_names, hits_data, use_tfidf, output_pa
return cv_avg_error_rates, number_of_features_per_model


def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_tfidf, cv_num_of_splits, seed, argv):
def train_models(csv_file_path, done_path, num_of_configurations_to_sample, rank_method, cv_num_of_splits, seed, argv):
logging.info('Preparing output path...')
csv_folder, csv_file_name = os.path.split(csv_file_path)
csv_file_prefix = os.path.splitext(csv_file_name)[0] # without extension
@@ -242,7 +238,6 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_
feature_selection_summary_path = f'{output_path}/feature_selection_summary.txt'

logging.info('Parsing data...')
is_hits_data = 'hits' in os.path.split(csv_file_path)[-1] # Does the file name contain "hits"?

X_train, y_train, X_test, y_test, feature_names, sample_names_train, sample_names_test = parse_data(csv_file_path)

@@ -251,21 +246,12 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_
perfect_feature_names, perfect_feature_indexes = measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits)
if perfect_feature_names:
df = save_model_features(X_train, perfect_feature_indexes, perfect_feature_names, sample_names_train, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], is_hits_data, df.shape[0], use_tfidf, f'{output_path}/perfect_feature_names')
generate_heat_map(df, df.shape[1], rank_method, df.shape[0], f'{output_path}/perfect_feature_names')
else:
# touch a file so we can see that there were no perfect features
with open(f'{output_path}/perfect_feature_names', 'w') as f:
pass

# sanity check
# perfect_feature_names =['PMMILDLDHSQV', 'YASTIVVDLDHT', 'CDAALHVIAALD', 'CVSASMDLDHLE', 'CSVELCNISGYG', 'SSQIMFHVYN', 'CPVSPQSLDAPC', 'MSWIFYALDPYE', 'SSHISLLMSV', 'ATLKHYLIQDGI', 'CTWGGPGFDLFC', 'CPFDSAGFDPYC', 'CTAAFISGHYPD', 'SSFFYMQAYV', 'TPIETISDSVYK', 'LDSLHIHYLVSF', 'CHISHVQDLPYL', 'WLWYLASDGI', 'SINHGLLAYI', 'CIALLKEGEPFS', 'CSADNLNLYC', 'ACDPYDCRHLYS', 'WLAVLYADMVFY', 'WHWSLLQDTMYH', 'GTSSFPYSHLWA', 'CALGWLLDLDHC', 'CQAFIDPYEC', 'SLSQNYSLLMII', 'NGYDYFGLRF', 'GDPYEILVLSRC', 'IESGDPYEARFS', 'GYDLLRDRYI', 'PINDLDADGI', 'SAHIWLVTSI', 'CLRLHLASLTQC', 'SRMDFRYLLE', 'CPHSLIMLLGDL', 'WPVHLEHTNI', 'GPYELHYEDK', 'CRFGAPGFDVQC', 'LLSLVLMDSMYF', 'APTCFRGTDC', 'TWRSGNASGSP', 'DGYFAHFWSVLQ', 'CAVLLDPYEC', 'CIARKDMQATQN', 'CMVWLDNVLSLC', 'PFEALELMRT']
# for feature in perfect_feature_names:
# rf = RandomForestClassifier(**{'n_estimators': 2000, 'max_features': 'auto', 'max_depth': 70, 'min_samples_split': 5, 'min_samples_leaf': 1, 'bootstrap': True, 'n_jobs': -1}).fit(df[feature].as_matrix().reshape(-1, 1), y_train)
# rf = RandomForestClassifier(**configuration).fit(df[feature].as_matrix().reshape(-1, 1), y_train)
# print(','.join(rf.predict(df[feature].as_matrix().reshape(-1, 1)).tolist()))
# print(','.join(y_train.as_matrix().tolist()))

# feature selection analysis
logging.info('\nApplying feature selection analysis...')
if cv_num_of_splits < 2:
logging.info('Number of CV folds is less than 2. '
@@ -292,7 +278,7 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_

logging.info(f'Configuration #{i} hyper-parameters are:\n{configuration}')
rf = RandomForestClassifier(**configuration)
errors, features = train(rf, X_train, y_train, feature_names, sample_names_train, is_hits_data, use_tfidf, output_path_i, cv_num_of_splits)
errors, features = train(rf, X_train, y_train, feature_names, sample_names_train, rank_method, output_path_i, cv_num_of_splits)
# model.predict(X_test, y_test)

plot_error_rate(errors, features, cv_num_of_splits, output_path_i)
@@ -331,22 +317,15 @@ def train_models(csv_file_path, done_path, num_of_configurations_to_sample, use_


def measure_each_feature_accuracy(X_train, y_train, feature_names, output_path, seed, cv_num_of_splits):
feature_to_avg_accuracy = {}
# df = pd.read_csv(f'{output_path}/Top_149_features.csv', index_col='sample_name')
feature_to_avg_accuracy = {}
rf = RandomForestClassifier(random_state=np.random.seed(seed))

for i, feature in enumerate(feature_names):
# if i % 10 == 0:
logger.info(f'Checking feature {feature} number {i}')
# assert df.columns[i] == feature
cv_score = cross_val_score(rf, X_train[:, i].reshape(-1, 1), y_train, cv=StratifiedKFold(cv_num_of_splits, shuffle=True)).mean()
if cv_score == 1:
logger.info('-' * 10 + f'{feature} has 100% accuracy!' + '-' * 10)
# print(X_train[:, i].reshape(-1, 1).tolist()[:8] + X_train[:, i].reshape(-1, 1).tolist()[12:])
# print(f'min of other class: {min(X_train[:, i].reshape(-1, 1).tolist()[:8] + X_train[:, i].reshape(-1, 1).tolist()[12:])}')
# print(X_train[:, i].reshape(-1, 1).tolist()[8:12])
# # print(y_train.tolist())
# print('Accuracy is 1')
feature_to_avg_accuracy[feature] = cv_score

with open(f'{output_path}/single_feature_accuracy.txt', 'w') as f:
@@ -384,9 +363,9 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
parser.add_argument('done_file_path', help='A path to a file that signals that the script finished running successfully.')
# parser.add_argument('n_splits', type=int, default=2, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--num_of_configurations_to_sample', default=100, type=int, help='How many random configurations of hyperparameters should be sampled?')
parser.add_argument('--tfidf', action='store_true', help="Are inputs from TF-IDF (avoid log(0))")
parser.add_argument('--cv_num_of_splits', default=2, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed', default=42, help='Seed number for reconstructing experiments')
parser.add_argument('--rank_method', choices=['pval', 'tfidf', 'shuffles', 'hits'], default='hits', help='Motifs ranking method')
parser.add_argument('--cv_num_of_splits', default=2, type=int, help='How many folds should be in the cross validation process? (use 0 for leave one out)')
parser.add_argument('--seed', default=42, type=int, help='Seed number for reconstructing experiments')
parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
args = parser.parse_args()

@@ -396,4 +375,4 @@ def save_configuration_to_txt_file(sampled_configuration, output_path_i):
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('main')

train_models(args.data_path, args.done_file_path, args.num_of_configurations_to_sample, args.tfidf, args.cv_num_of_splits, args.seed, argv=sys.argv)
train_models(args.data_path, args.done_file_path, args.num_of_configurations_to_sample, args.rank_method, args.cv_num_of_splits, args.seed, argv=sys.argv)
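For reference, an illustrative direct invocation of the script with the new flag (paths and values are placeholders; the data CSV positional is defined above the shown hunk and is assumed to come before the done-file path):

python model_fitting/random_forest.py <aggregated_hits_csv> <done_file> --rank_method hits --num_of_configurations_to_sample 100 --cv_num_of_splits 2 --seed 42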
