-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Started to port jupyter notebook code to covigator package
- Loading branch information
1 parent
65d9410
commit 3f53cc0
Showing
2 changed files
with
203 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
GENES_DICT = { | ||
'orf10': 'ORF10', | ||
's': 'S', | ||
'orf3a': 'ORF3a', | ||
'e': 'E', | ||
'm': 'M', | ||
'orf6': 'ORF6', | ||
'orf7a': 'ORF7a', | ||
'orf8': 'ORF8', | ||
'orf7b': 'ORF7b', | ||
'nc': 'N', # tricky! | ||
# polyprotein... | ||
'nsp1': 'ORF1ab', | ||
'nsp10': 'ORF1ab', | ||
'nsp15': 'ORF1ab', | ||
'nsp2': 'ORF1ab', | ||
'nsp3': 'ORF1ab', | ||
'nsp4': 'ORF1ab', | ||
'nsp6': 'ORF1ab', | ||
'nsp7': 'ORF1ab', | ||
'nsp8': 'ORF1ab', | ||
'nsp9': 'ORF1ab'} | ||
|
||
# transform protein coordinates in the polyprotein ORF1ab | ||
# these values have been extracted from the CoVigator domains table derived from Pfam domains | ||
# domains = pd.read_csv('../data/domain.csv') | ||
POLYPROTEIN_OFFSET = { | ||
'nsp1': 9, | ||
'nsp10': 4262, | ||
'nsp15': 5929, | ||
'nsp2': 183, | ||
'nsp3': 880, | ||
'nsp4': 2788, | ||
'nsp6': 3598, | ||
'nsp7': 3860, | ||
'nsp8': 3943, | ||
'nsp9': 4141} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,170 @@ | ||
from logzero import logger | ||
from pathlib import Path | ||
import pandas as pd | ||
import numpy as np | ||
from covigator.precomputations import GENES_DICT, POLYPROTEIN_OFFSET | ||
from sklearn.linear_model import LinearRegression | ||
import joblib | ||
|
||
class FeatureLoader: | ||
DATA = Path(__file__) / "model_data" | ||
|
||
@staticmethod | ||
def _load_sift_features(intrahost_observations: pd.DataFrame) -> pd.DataFrame: | ||
""" | ||
Load SIFT pre-computed annotation scores and annotate intrahost_observations | ||
""" | ||
sift = pd.read_csv(FeatureLoader.DATA / 'Sars_cov_2.all_variants_SIFTannotations.tsv', | ||
sep='\t', | ||
usecols=['GENE_NAME', 'AMINO_POS', 'REF_AMINO', 'ALT_AMINO', 'SIFT_SCORE']) | ||
|
||
columns = ['variant_id', 'gene_name', 'position_amino_acid', 'reference_amino_acid', 'alternate_amino_acid'] | ||
assert all([x in columns for x in intrahost_observations.columns]), "Missing columns in intrahost table" | ||
#intrahost_observations = pd.read_parquet( | ||
# os.path.join(get_covigator_data_folder(), "subclonal_variant_observation_samples.parquet"), | ||
# columns=columns) | ||
intrahost_observations_sift = intrahost_observations.join( | ||
sift.set_index(['GENE_NAME', 'AMINO_POS', 'REF_AMINO', 'ALT_AMINO']), \ | ||
on=['gene_name', 'position_amino_acid', 'reference_amino_acid', 'alternate_amino_acid']) | ||
intrahost_observations_sift.rename(columns={'SIFT_SCORE': 'sift_score'}, inplace=True) | ||
|
||
sift_feature = intrahost_observations_sift.groupby('variant_id').first() | ||
sift_feature = sift_feature[~sift_feature.position_amino_acid.isna()] | ||
|
||
logger.info("Number of variants {} of which {} have a missing SIFT score".format( | ||
sift_feature.shape[0], sift_feature[sift_feature["sift_score"].isna()].shape[0])) | ||
|
||
logger.info(sift_feature.sift_score.describe()) | ||
|
||
return sift_feature | ||
|
||
@staticmethod | ||
def _load_mutfunc_features(intrahost_observations: pd.DataFrame) -> pd.DataFrame: | ||
""" | ||
Load mutfunc resoource and annotate intrahost_observations | ||
""" | ||
mutfunc = pd.read_csv(FeatureLoader.DATA / 'mutfunc_sars/summary.tsv', sep='\t') | ||
|
||
# translate gene names | ||
mutfunc['gene_name'] = mutfunc['name'].transform(lambda x: GENES_DICT.get(x)) | ||
|
||
mutfunc['gene_position'] = mutfunc[['name', 'position']].apply( | ||
lambda x: POLYPROTEIN_OFFSET.get(x[0], 0) + x[1], axis=1) | ||
|
||
# maps the functional annotations in protein coordinates to our variant ids | ||
# WARNING: we lose the annotations from all ORF1ab as these are annotated with protein domain coordinates... who would have thought? | ||
columns = ['variant_id', 'gene_name', 'position_amino_acid', 'reference_amino_acid', 'alternate_amino_acid'] | ||
assert all([x in columns for x in intrahost_observations.columns]), "Missing columns in intrahost table" | ||
#intrahost_observations = pd.read_parquet( | ||
# os.path.join(get_covigator_data_folder(), "subclonal_variant_observation_samples.parquet"), | ||
# columns=columns) | ||
intrahost_observations_mutfunc = intrahost_observations.join( | ||
mutfunc.set_index(['gene_name', 'gene_position', 'wt', 'mut']), \ | ||
on=['gene_name', 'position_amino_acid', 'reference_amino_acid', 'alternate_amino_acid']) | ||
|
||
mutfunc_features = intrahost_observations_mutfunc.groupby('variant_id').first() | ||
mutfunc_features = mutfunc_features[~mutfunc_features.position_amino_acid.isna()] | ||
|
||
logger.info("Number of variants {}".format(mutfunc_features.shape[0])) | ||
logger.info(mutfunc_features.describe()) | ||
|
||
return mutfunc_features | ||
|
||
@staticmethod | ||
def _build_features_table( | ||
variants: pd.DataFrame, | ||
features_vaf_regression: pd.DataFrame, | ||
features_conservation: pd.DataFrame, | ||
features_mutfunc: pd.DataFrame, | ||
feature_sift: pd.DataFrame, | ||
features_dnds: pd.DataFrame, | ||
feature_iedb: pd.DataFrame = None, | ||
fill_na=None): | ||
""" | ||
variants is a DataFrame containing a column "variant_id" | ||
""" | ||
|
||
features_table = variants[['variant_id']].set_index('variant_id') \ | ||
.join(features_vaf_regression, how='left') \ | ||
.join(features_conservation, how='left') \ | ||
.join(features_mutfunc[['mut_escape_mean', 'mut_escape_max']], how='left') \ | ||
.join(feature_sift[['sift_score']], how='left') | ||
|
||
if feature_iedb is not None: | ||
# NOTE: we decided to exclude IEDB features in most models | ||
features_table = features_table \ | ||
.join(feature_iedb, how='left') | ||
|
||
features_table = features_table \ | ||
.join(features_dnds[['clonal_dnds', 'subclonal_dnds']], how='left') | ||
|
||
# sorts table by variant_id | ||
features_table.sort_index(inplace=True) | ||
|
||
if fill_na is not None: | ||
# fill NA values with whatever value was provided, this is necessary at least for RF models | ||
features_table.fillna(fill_na, inplace=True) | ||
|
||
return features_table | ||
|
||
@staticmethod | ||
def _linear_regression(x, y, model): | ||
# TODO: do we want to add the intersection as an additional feature? | ||
# that would represent how high is the starting point in terms of VAFs | ||
try: | ||
fit = model.fit(np.array(x).reshape((-1, 1)), np.array(y)) | ||
return fit.coef_[0] | ||
except ValueError: | ||
return None | ||
except TypeError: | ||
return None | ||
|
||
@staticmethod | ||
def _calculate_vaf_regression(intrahost_observations_samples: pd.DataFrame) -> pd.DataFrame: | ||
""" | ||
Input DF requires the columns "variant_id", "vaf" and "collection_date" | ||
Output DF is grouped by "variant_id" and has an additional column "vaf_regression_coefficient" | ||
""" | ||
model = LinearRegression() | ||
intrahost_observations_samples['collection_date_timestamp'] = \ | ||
intrahost_observations_samples[ | ||
~intrahost_observations_samples.collection_date.isna()]['collection_date'].apply( | ||
lambda x: x.timestamp()) | ||
intrahost_regression_vafs = \ | ||
intrahost_observations_samples[['variant_id', 'vaf', 'collection_date_timestamp']] \ | ||
.groupby(['variant_id']) \ | ||
.agg({'vaf': list, 'collection_date_timestamp': list}) | ||
|
||
intrahost_regression_vafs['vaf_regression_coefficient'] = intrahost_regression_vafs.apply( | ||
lambda x: FeatureLoader._linear_regression(x[1], x[0], model), axis=1) | ||
|
||
return intrahost_regression_vafs[["vaf_regression_coefficient"]] | ||
|
||
@staticmethod | ||
def _extract_conservation_features(intrahost_observations_samples): | ||
""" | ||
Input DF requires the columns 'variant_id', 'cons_hmm_sars_cov_2', 'cons_hmm_sarbecovirus' | ||
and 'cons_hmm_vertebrate_cov' | ||
Output DF is grouped by "variant_id" and contains the above input columns | ||
""" | ||
conservation_features = intrahost_observations_samples[ | ||
['variant_id', 'cons_hmm_sars_cov_2', 'cons_hmm_sarbecovirus', 'cons_hmm_vertebrate_cov']] \ | ||
.groupby('variant_id').first() | ||
return conservation_features | ||
|
||
|
||
class IntrahostPredictionLoader: | ||
pass | ||
DATA = Path(__file__) / "model_data" | ||
def __init__(self): | ||
self.model_file = IntrahostPredictionLoader.DATA / "05_model7_xgboost.joblib" | ||
self.model = joblib.load(self.model_file) | ||
def get_intrahost_mutations(self): | ||
""" | ||
Implement query to database to obtain intrahost mutations | ||
- Merge with ENA sample metadata, required? | ||
Calculate dn/ds stuff | ||
- Requires porting Rangas R-code to python function | ||
""" | ||
pass | ||
def _build_feature_table(self): | ||
pass |