chanzuckerberg · nayib-jose-gloria · Sep 12, 2024 · Sep 4, 2024 · Sep 4, 2024 · Sep 5, 2024
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_homo_sapiens.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_homo_sapiens.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_mus_musculus.csv.gz b/cellxgene_schema_cli/cellxgene_schema/gencode_files/genes_mus_musculus.csv.gz
diff --git a/cellxgene_schema_cli/cellxgene_schema/write_labels.py b/cellxgene_schema_cli/cellxgene_schema/write_labels.py
@@ -220,11 +220,8 @@ def _get_mapping_dict_feature_length(self, ids: List[str]) -> Dict[str, int]:
         mapping_dict = {}
 
         for i in ids:
-            if i.startswith("ENS"):
-                organism = gencode.get_organism_from_feature_id(i)
-                mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)
-            else:
-                mapping_dict[i] = 0
+            organism = gencode.get_organism_from_feature_id(i)
+            mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)
 
         return mapping_dict
 

diff --git a/cellxgene_schema_cli/scripts/gene_processing.py b/cellxgene_schema_cli/scripts/gene_processing.py
@@ -2,8 +2,10 @@
 import gzip
 import hashlib
 import os
+import statistics
 import sys
 import urllib.request
+from collections import defaultdict
 from typing import Dict
 
 import gtf_tools
@@ -129,56 +131,45 @@ def _parse_gtf(self, gtf_path: str, gene_info_description: str):
     def _get_gene_lengths_from_gtf(self, gtf_path: str) -> Dict[str, int]:
         """
         Parses a GTF file and calculates gene lengths, which are calculated as follows for each gene:
-        1. Get all different isoforms
-        2. Merge exons to create a set of non-overlapping "meta" exons
-        3. Sum the lengths of these "meta" exons
-
-        Code inspired from http://www.genemine.org/gtftools.php
+        1. Get lengths for all different isoforms
+        2. Get the median of the lengths of these isoforms
 
         :param str gtf_path: path to gzipped gtf file
 
         :rtype  Dict[str]
         :return A dictionary with keys being gene ids and values the corresponding length in base pairs
         """
-
+        gene_to_isoforms_map = defaultdict(set)
+        isoform_to_length_map = defaultdict(int)
         with gzip.open(gtf_path, "rb") as gtf:
-            # Dictionary of list of tuples that will store exon in bed format-like. Elements of the tuples will be the
-            # equivalent  fields from the bed format: chromosome, start, end, strand). Each list of tuples will correspond
-            # to one gene.
-
-            exons_in_bed = {}  # type: ignore
-
             for byte_line in gtf:
                 line = byte_line.decode("utf-8")
-
                 if line[0] == "#":
                     continue
 
-                line = line.rstrip().split("\t")  # type: ignore
+                # See https://www.gencodegenes.org/pages/data_format.html for GTF metadata schema
+                gene_metadata = line.rstrip().split("\t")  # type: ignore
 
-                if line[2] != "exon":
+                if gene_metadata[2] != "exon":
                     continue
 
-                # Convert line to bed-like format: (chromosome, start, end, strand)
-                exon_bed = (line[0], int(line[3]) - 1, int(line[4]), line[6])
-
-                current_features = gtf_tools._get_features(line)  # type: ignore
+                # Calculate exon length using genomic end location and genomic start location
+                exon_length = int(gene_metadata[4]) - int(gene_metadata[3]) + 1
+                current_features = gtf_tools._get_features(gene_metadata)  # type: ignore
                 gene_id = current_features["gene_id"]
-                if gene_id in exons_in_bed:
-                    exons_in_bed[gene_id].append(exon_bed)
-                else:
-                    exons_in_bed[gene_id] = [exon_bed]
+                transcript_id = current_features["transcript_id"]
 
-        # Merge exons from the same gene to create non-overlapping "meta" genes
-        # Then calculate gene length
-        gene_lengths = {}
-        for gene in exons_in_bed:
-            meta_exons = gtf_tools.merge_bed_ranges(exons_in_bed[gene])  # type: ignore
+                gene_to_isoforms_map[gene_id].add(transcript_id)
+                isoform_to_length_map[transcript_id] += exon_length
 
-            # get length for this gene, i.e. sum of lengths of "meta" exons
-            gene_lengths[gene] = 0
-            for exon in meta_exons:
-                gene_lengths[gene] += exon[2] - exon[1]
+        gene_lengths = {}
+        for gene_id in gene_to_isoforms_map:
+            isoforms = gene_to_isoforms_map[gene_id]
+            isoform_lengths = []
+            for isoform in isoforms:
+                isoform_lengths.append(isoform_to_length_map[isoform])
+            # GTFTools established standard is to convert to int
+            gene_lengths[gene_id] = int(statistics.median(isoform_lengths))
 
         return gene_lengths
 

diff --git a/cellxgene_schema_cli/scripts/gtf_tools.py b/cellxgene_schema_cli/scripts/gtf_tools.py
@@ -1,64 +1,3 @@
-from typing import List, Tuple
-
-
-def merge_bed_ranges(ranges: List[Tuple]) -> List[Tuple]:
-    """
-    Merges bed-like ranges and returns non-overlapping ranges
-
-    code adapted from http://www.genemine.org/gtftools.php
-
-    :param ranges List[Tuple]: List of ranges, each range is a tuple of the form [chromosome, start, end, strand]
-
-    :rtype List[Tuple]
-    :return A list of non-overlapping ranges
-    """
-
-    ranges.sort(key=lambda x: (x[0], x[1]))
-    merged = []
-    n_range = len(ranges)
-    if n_range == 1:
-        merged = ranges
-    elif n_range == 2:
-        imerge = _neighbor_merge_ranges(ranges[0], ranges[1])
-        for each in imerge:
-            merged.append(each)
-    else:
-        i = 2
-        imerge = _neighbor_merge_ranges(ranges[0], ranges[1])
-        n_imerge = len(imerge)
-        while n_imerge > 0:
-            if n_imerge == 2:
-                merged.append(imerge[0])
-
-            imerge = _neighbor_merge_ranges(imerge[n_imerge - 1], ranges[i])
-            n_imerge = len(imerge)
-            if i == n_range - 1:
-                for each in imerge:
-                    merged.append(each)
-                n_imerge = -1
-            i += 1
-
-    return merged
-
-
-def _neighbor_merge_ranges(range1: Tuple, range2: Tuple) -> List[Tuple]:
-    """
-    Merges two neighboring ranges
-
-    :param range1 Tuple: a tuple of the form [chromosome, start, end, strand]
-    :param range2 Tuple: a tuple of the form [chromosome, start, end, strand]
-
-    :rtype List[Tuple]
-    :return A list of non-overlapping ranges
-    """
-
-    if range2[1] <= range1[2]:
-        merged = [(range1[0], range1[1], max(range1[2], range2[2]), range1[3])]
-    else:
-        merged = [range1, range2]
-    return merged
-
-
 def _get_features(gtf_line: str) -> dict:
     """
     Parses the features found in column 8 of GTF, returns a dict with keys as feature names and values as the feature

diff --git a/cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py b/cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py
@@ -4,14 +4,14 @@
 invalid_species = ["Caenorhabditis elegans"]
 
 valid_genes = {
-    gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 6836)},
-    gencode.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 4045)},
+    gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 2404)},
+    gencode.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 1797)},
 }
 
 valid_genes_same_name_diff_species = {
-    gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000166278": ("C2_ENSG00000166278", 7151)},
+    gencode.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000166278": ("C2_ENSG00000166278", 1876)},
     gencode.SupportedOrganisms.MUS_MUSCULUS: {
-        "ENSMUSG00000024371": ("C2_ENSMUSG00000024371", 3872),
+        "ENSMUSG00000024371": ("C2_ENSMUSG00000024371", 694),
     },
 }
 

diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py
@@ -313,9 +313,9 @@
 # these columns are defined in the schema
 var_expected = pd.DataFrame(
     [
-        ["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 0],
-        ["gene", False, "MACF1", "NCBITaxon:9606", 70573],
-        ["gene", False, "Trp53", "NCBITaxon:10090", 4045],
+        ["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 1061],
+        ["gene", False, "MACF1", "NCBITaxon:9606", 2821],
+        ["gene", False, "Trp53", "NCBITaxon:10090", 1797],
         ["gene", False, "S", "NCBITaxon:2697049", 3822],
     ],
     index=["ERCC-00002", "ENSG00000127603", "ENSMUSG00000059552", "ENSSASG00005000004"],

diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
@@ -181,9 +181,9 @@ def test_get_dictionary_mapping_feature_length(self, label_writer):
         ]
         # values derived from csv
         gene_lengths = [
-            0,  # non-gene feature, so set to 0 regardless of csv value
-            70573,
-            4045,
+            1061,
+            2821,
+            1797,
             3822,
         ]
         expected_dict = dict(zip(ids, gene_lengths))