Skip to content

Commit

Permalink
Created script for finding peptides that have a disproportionately hi…
Browse files Browse the repository at this point in the history
…gh amount of e's and k's
  • Loading branch information
SeanGolez committed Oct 7, 2024
1 parent b0db6f1 commit d0c4e4e
Showing 1 changed file with 69 additions and 0 deletions.
69 changes: 69 additions & 0 deletions extensions/e_k_bias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3

import pandas as pd
import argparse
import fastatools as ft
import numpy as np

def main():

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument('-i', '--fasta-file', help='Directory with enriched petide files for input', required=True)
parser.add_argument('-o', '--output-file', default="e_k_bias_out.tsv", help='Name of .tsv to output file with AA bias data')

args = parser.parse_args()

# get proportion of e's and k's for each peptide
e_k_props = get_e_k_props(args.fasta_file)

# get percentiles
percentile_dict = get_percentiles(e_k_props)

# create output df
out_data = [(name, round(prop, 3), round(percentile_dict[name], 2)) for name, prop in e_k_props.items()]

pd.DataFrame(out_data, columns=["CodeName", "e_k_Prop", "e_k_Percentile"]).to_csv(args.output_file, index=False, sep='\t')


# get proportion of e's and k's for each peptide
def get_e_k_props(fasta_file)->dict:
e_k_props = dict()

# get props for peptide file
fasta_dict = ft.read_fasta_dict(fasta_file)

# iterate through each sequence
for name, seq in fasta_dict.items():
e_k_count = 0

# loop through each AA, get count of e and k
for aa in seq:
if aa.lower() == 'e' or aa.lower() =='k':
e_k_count += 1

# add proportion to dict
e_k_props[name] = (e_k_count) / len(seq)

return e_k_props

# get percentile of each peptide using its e and k proportion
def get_percentiles(e_k_props)->dict:
# Calculate percentile of each peptide
names = list(e_k_props.keys())
all_props = np.array(list(e_k_props.values()))

# Get unique values and array for mapping each original value to its corresponding index in the unique array
unique_props, inverse_indices = np.unique(all_props, return_inverse=True)

# Calculate percentiles based on the unique props
percentile_ranks = np.linspace(0, 100, len(unique_props))

# Map sorted values back to the original corresponding name
percentile_dict = {names[i]: percentile_ranks[inverse_indices[i]] for i in range(len(names))}

return percentile_dict


if __name__ == "__main__":
main()

0 comments on commit d0c4e4e

Please sign in to comment.