Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use isoform lengths median to determine gene feature_length + write feature_length for 'spike-in' features in var #1005

Merged
merged 5 commits into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
7 changes: 2 additions & 5 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,11 +220,8 @@ def _get_mapping_dict_feature_length(self, ids: List[str]) -> Dict[str, int]:
mapping_dict = {}

for i in ids:
if i.startswith("ENS"):
organism = gencode.get_organism_from_feature_id(i)
mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)
else:
mapping_dict[i] = 0
organism = gencode.get_organism_from_feature_id(i)
mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)

return mapping_dict

Expand Down
55 changes: 23 additions & 32 deletions cellxgene_schema_cli/scripts/gene_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import gzip
import hashlib
import os
import statistics
import sys
import urllib.request
from collections import defaultdict
from typing import Dict

import gtf_tools
Expand Down Expand Up @@ -129,56 +131,45 @@ def _parse_gtf(self, gtf_path: str, gene_info_description: str):
def _get_gene_lengths_from_gtf(self, gtf_path: str) -> Dict[str, int]:
"""
Parses a GTF file and calculates gene lengths, which are calculated as follows for each gene:
1. Get all different isoforms
2. Merge exons to create a set of non-overlapping "meta" exons
3. Sum the lengths of these "meta" exons

Code inspired from http://www.genemine.org/gtftools.php
1. Get lengths for all different isoforms
2. Get the median of the lengths of these isoforms

:param str gtf_path: path to gzipped gtf file

:rtype Dict[str]
:return A dictionary with keys being gene ids and values the corresponding length in base pairs
"""

gene_to_isoforms_map = defaultdict(set)
isoform_to_length_map = defaultdict(int)
with gzip.open(gtf_path, "rb") as gtf:
# Dictionary of list of tuples that will store exon in bed format-like. Elements of the tuples will be the
# equivalent fields from the bed format: chromosome, start, end, strand). Each list of tuples will correspond
# to one gene.

exons_in_bed = {} # type: ignore

for byte_line in gtf:
line = byte_line.decode("utf-8")

if line[0] == "#":
continue

line = line.rstrip().split("\t") # type: ignore
# See https://www.gencodegenes.org/pages/data_format.html for GTF metadata schema
gene_metadata = line.rstrip().split("\t") # type: ignore

if line[2] != "exon":
if gene_metadata[2] != "exon":
continue

# Convert line to bed-like format: (chromosome, start, end, strand)
exon_bed = (line[0], int(line[3]) - 1, int(line[4]), line[6])

current_features = gtf_tools._get_features(line) # type: ignore
# Calculate exon length using genomic end location and genomic start location
exon_length = int(gene_metadata[4]) - int(gene_metadata[3]) + 1
current_features = gtf_tools._get_features(gene_metadata) # type: ignore
gene_id = current_features["gene_id"]
if gene_id in exons_in_bed:
exons_in_bed[gene_id].append(exon_bed)
else:
exons_in_bed[gene_id] = [exon_bed]
transcript_id = current_features["transcript_id"]

# Merge exons from the same gene to create non-overlapping "meta" genes
# Then calculate gene length
gene_lengths = {}
for gene in exons_in_bed:
meta_exons = gtf_tools.merge_bed_ranges(exons_in_bed[gene]) # type: ignore
gene_to_isoforms_map[gene_id].add(transcript_id)
isoform_to_length_map[transcript_id] += exon_length

# get length for this gene, i.e. sum of lengths of "meta" exons
gene_lengths[gene] = 0
for exon in meta_exons:
gene_lengths[gene] += exon[2] - exon[1]
gene_lengths = {}
for gene_id in gene_to_isoforms_map:
isoforms = gene_to_isoforms_map[gene_id]
isoform_lengths = []
for isoform in isoforms:
isoform_lengths.append(isoform_to_length_map[isoform])
# GTFTools established standard is to convert to int
gene_lengths[gene_id] = int(statistics.median(isoform_lengths))

return gene_lengths

Expand Down
61 changes: 0 additions & 61 deletions cellxgene_schema_cli/scripts/gtf_tools.py
Original file line number Diff line number Diff line change
@@ -1,64 +1,3 @@
from typing import List, Tuple


def merge_bed_ranges(ranges: List[Tuple]) -> List[Tuple]:
"""
Merges bed-like ranges and returns non-overlapping ranges

code adapted from http://www.genemine.org/gtftools.php

:param ranges List[Tuple]: List of ranges, each range is a tuple of the form [chromosome, start, end, strand]

:rtype List[Tuple]
:return A list of non-overlapping ranges
"""

ranges.sort(key=lambda x: (x[0], x[1]))
merged = []
n_range = len(ranges)
if n_range == 1:
merged = ranges
elif n_range == 2:
imerge = _neighbor_merge_ranges(ranges[0], ranges[1])
for each in imerge:
merged.append(each)
else:
i = 2
imerge = _neighbor_merge_ranges(ranges[0], ranges[1])
n_imerge = len(imerge)
while n_imerge > 0:
if n_imerge == 2:
merged.append(imerge[0])

imerge = _neighbor_merge_ranges(imerge[n_imerge - 1], ranges[i])
n_imerge = len(imerge)
if i == n_range - 1:
for each in imerge:
merged.append(each)
n_imerge = -1
i += 1

return merged


def _neighbor_merge_ranges(range1: Tuple, range2: Tuple) -> List[Tuple]:
"""
Merges two neighboring ranges

:param range1 Tuple: a tuple of the form [chromosome, start, end, strand]
:param range2 Tuple: a tuple of the form [chromosome, start, end, strand]

:rtype List[Tuple]
:return A list of non-overlapping ranges
"""

if range2[1] <= range1[2]:
merged = [(range1[0], range1[1], max(range1[2], range2[2]), range1[3])]
else:
merged = [range1, range2]
return merged


def _get_features(gtf_line: str) -> dict:
"""
Parses the features found in column 8 of GTF, returns a dict with keys as feature names and values as the feature
Expand Down
8 changes: 4 additions & 4 deletions cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
invalid_species = ["Caenorhabditis elegans"]

valid_genes = {
gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 6836)},
gencode.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 4045)},
gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 2404)},
gencode.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 1797)},
}

valid_genes_same_name_diff_species = {
gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000166278": ("C2_ENSG00000166278", 7151)},
gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000166278": ("C2_ENSG00000166278", 1876)},
gencode.SupportedOrganisms.MUS_MUSCULUS: {
"ENSMUSG00000024371": ("C2_ENSMUSG00000024371", 3872),
"ENSMUSG00000024371": ("C2_ENSMUSG00000024371", 694),
},
}

Expand Down
6 changes: 3 additions & 3 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,9 +313,9 @@
# these columns are defined in the schema
var_expected = pd.DataFrame(
[
["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 0],
["gene", False, "MACF1", "NCBITaxon:9606", 70573],
["gene", False, "Trp53", "NCBITaxon:10090", 4045],
["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 1061],
["gene", False, "MACF1", "NCBITaxon:9606", 2821],
["gene", False, "Trp53", "NCBITaxon:10090", 1797],
["gene", False, "S", "NCBITaxon:2697049", 3822],
],
index=["ERCC-00002", "ENSG00000127603", "ENSMUSG00000059552", "ENSSASG00005000004"],
Expand Down
6 changes: 3 additions & 3 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,9 @@ def test_get_dictionary_mapping_feature_length(self, label_writer):
]
# values derived from csv
gene_lengths = [
0, # non-gene feature, so set to 0 regardless of csv value
70573,
4045,
1061,
2821,
1797,
3822,
]
expected_dict = dict(zip(ids, gene_lengths))
Expand Down
Loading