Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add compartment information to genes.tsv #437

Merged
merged 9 commits into from
Dec 5, 2022
141 changes: 141 additions & 0 deletions code/modelCuration/getCompFromUniprotCellAtlas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# collect subcellular localization for existing enzymes from Uniprot/Swissprot and Cell Atlas
# compartment info from both sources were combined and integrated into genes.tsv

import pandas as pd
import csv

# import subcellular location from SwissProt annotation
SwissProt_tsv = pd.read_table("~/Downloads/SwissProt_20221115.tsv")
SwissProt_proteins = SwissProt_tsv['Entry'].to_list()
SwissProt_subCellularlocation = SwissProt_tsv['Subcellular location [CC]'].to_list()


# build a dict for key word mapping of compartment assignment
swissprot_keywords = {
#'Extracellular': ['',''], # e
'Peroxisome': ['peroxisome'], # x
'Mitochondria': ['mitochondrion {','mitochondrion matrix'], # m
'Cytosol': ['cytoplasm','cytosol','cytoskeleton'], # c
'Lysosome': ['lysosome','lysosome lumen','lysosome membrane'], # l
'Endoplasmic reticulum': ['endoplasmic reticulum','Endoplasmic reticulum lumen','endoplasmic reticulum membrane'], # r
'Golgi apparatus': ['golgi apparatus','golgi apparatus lumen','golgi apparatus membrane'], # g
'Nucleus': ['nucleus','nucleolus'], # n
'Inner mitochondria': ['mitochondrion inner membrane','mitochondrion intermembrane space'] # i
}


# store swissport compartment info as a dict with a key of uniprot id
comps_from_swissprot = {}
for i, text in enumerate(SwissProt_subCellularlocation):
out_list = [] # output compartmetns as a list
comps_from_swissprot[SwissProt_proteins[i]] = out_list
if not pd.isna(text):
for key in swissprot_keywords:
# check if cellular location text contains any keyword from a compartment
if any(word in text.lower() for word in swissprot_keywords[key]):
out_list.append(key)
comps_from_swissprot[SwissProt_proteins[i]] = out_list


# load Human-GEM gene annotation file
HumanGenes_tsv = pd.read_table("../../model/genes.tsv")
Human_genes = HumanGenes_tsv['genes'].to_list()
Human_proteins = HumanGenes_tsv['geneUniProtID'].to_list()


# save SwissProt compartment information to tsv file
swissprot_compList = ['']*len(Human_genes)
for i in range(len(Human_genes)):
if not pd.isna(Human_proteins[i]):
if Human_proteins[i] in SwissProt_proteins:
swissprot_compList[i] = ';'.join(comps_from_swissprot[Human_proteins[i]])
haowang-bioinfo marked this conversation as resolved.
Show resolved Hide resolved
swissprot_compartments = pd.DataFrame()
swissprot_compartments['genes'] = Human_genes
swissprot_compartments['geneUniProtID'] = Human_proteins
swissprot_compartments['compartments'] = swissprot_compList
swissprot_compartments.to_csv('../../data/modelCuration/Swissprot_compartments.tsv', sep="\t", index=False)


# import Cell Atlas compartment info
cell_atlas_compartments = pd.read_table("../../data/modelCuration/CellAtlasCompartments_science_2017.tsv")
cellAtlas_comps = cell_atlas_compartments['Subcellular location'].to_list()
cellAtlas_ensembl_id = cell_atlas_compartments['Ensembl'].to_list()
cellAtlas_uniprot_id = cell_atlas_compartments['Uniprot'].to_list()

# investigate key words
compartment_names = []
for comps in cellAtlas_comps:
compartment_names.extend(comps.split(','))
unique_compartments = set(compartment_names)
unique_compartments
# Cell Atlas compartments were sorted and the following ones were discarded:
# 'Microtubules', 'Midbody ring', 'Midbody', 'Cytokinetic bridge', 'Microtubule ends', 'Mitotic spindle'
# 'Intermediate filaments', 'Actin filaments', 'Focal adhesion sites', 'Cleavage furrow',
# 'Lipid droplets', 'Vesicles', 'Endosomes', 'Plasma membrane', 'Cell Junctions'
# 'Centrosome', 'Centriolar satellite'

# construct a dict for key word mapping to Cell Atlas compartments
cellatlas_keywords = {
#'Extracellular': [''], # e
'Peroxisome': ['Peroxisomes'], # x
'Mitochondria': ['Mitochondria'], # m
'Cytosol': ['Cytosol', 'Cytoplasmic bodies', 'Rods & Rings', 'Aggresome'], # c
'Lysosome': ['Lysosomes'], # l
'Endoplasmic reticulum': ['Endoplasmic reticulum'], # r
'Golgi apparatus': ['Golgi apparatus'], # g
'Nucleus': ['Nuclear membrane', 'Nucleoli rim', 'Nucleoli', 'Nuclear bodies', 'Nucleoli fibrillar center', 'Nucleoplasm', 'Kinetochore', 'Mitotic chromosome', 'Nuclear speckles'], # n
#'Inner mitochondria': [''] # i, Cell Atlas does provide such location info
}


# store CellAtlas compartment info to a dict with keys as ensembl ids
geneComps_from_cell_atlas = {}
for gene in Human_genes:
out_list = [] # store compartments as a list
if gene not in cellAtlas_ensembl_id:
geneComps_from_cell_atlas[gene] = out_list
else:
gene_ind = cellAtlas_ensembl_id.index(gene)
for key in cellatlas_keywords:
# check if cellular location text contains any keyword from a compartment
if any(word in cellAtlas_comps[gene_ind] for word in cellatlas_keywords[key]):
out_list.append(key)
geneComps_from_cell_atlas[gene] = out_list


# integrate compartment info from cellAtlas and SwissProt with following rules:
# 1. output two columns: "compartments" and "compDataSource"
# 2. use one source if another has no assigned compartments
# 3. union compartments if both source are provided
swissprot_comps = pd.read_table("../../data/modelCuration/Swissprot_compartments.tsv")
geneComps_from_swissprot = swissprot_comps['compartments'].to_list()

# combine compartment info from two data sources
haowang-bioinfo marked this conversation as resolved.
Show resolved Hide resolved
geneComps_combined = []
source = ['']*len(Human_genes)
for i, gene in enumerate(Human_genes):
out_list = [] # save compartmetns as a list
if pd.isna(geneComps_from_swissprot[i]) and geneComps_from_cell_atlas[gene] != []:
out_list = geneComps_from_cell_atlas[gene]
source[i] = 'CellAtlas'
elif not pd.isna(geneComps_from_swissprot[i]) and geneComps_from_cell_atlas[gene] == []:
out_list = geneComps_from_swissprot[i].split(';')
source[i] = 'SwissProt'
elif not pd.isna(geneComps_from_swissprot[i]) and geneComps_from_cell_atlas[gene] != []:
union = set(geneComps_from_swissprot[i].split(';')+ geneComps_from_cell_atlas[gene])
out_list = list(union)
source[i] = 'SwissProt;CellAtlas'
if 'Mitochondria' not in geneComps_from_swissprot[i].split(';') and 'Inner mitochondria' in geneComps_from_swissprot[i].split(';'):
if 'Mitochondria' in geneComps_from_cell_atlas[gene]:
out_list.remove('Mitochondria')
# manual inspection of integration
print(gene+': '+geneComps_from_swissprot[i]+'\t'+';'.join(geneComps_from_cell_atlas[gene])+'\t'+';'.join(out_list))
geneComps_combined.append(';'.join(out_list))


# append columns to genes.tsv
HumanGenes_tsv['compartments'] = geneComps_combined
HumanGenes_tsv['compDataSource'] = source
HumanGenes_tsv[:0].to_csv('../../model/genes.tsv', sep="\t", index=False)
HumanGenes_tsv.to_csv('../../model/genes.tsv', sep="\t", index=False, quoting=csv.QUOTE_NONNUMERIC, header=False, mode="a")

Loading