Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve multple non-informative assignmets #239

Merged
merged 4 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 7 additions & 12 deletions src/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,13 @@
import os
import re
import subprocess
import threading
import math
from collections import defaultdict
from enum import Enum

logger = logging.getLogger('IsoQuant')


class AtomicIDDistributor(object):
def __init__(self):
self.value = 0
self._lock = threading.Lock()

def increment(self):
with self._lock:
self.value += 1
return self.value


class CigarEvent(Enum):
match = 0
insertion = 1
Expand All @@ -48,6 +36,13 @@ def get_ins_del_match_events(cls):
return {cls.match, cls.insertion, cls.deletion, cls.seq_match, cls.seq_mismatch}


class TranscriptNaming:
transcript_prefix = "transcript"
novel_gene_prefix = "novel_gene_"
nic_transcript_suffix = ".nic"
nnic_transcript_suffix = ".nnic"


# key, value
def get_first_best_from_sorted(sorted_list_of_pairs):
if not sorted_list_of_pairs:
Expand Down
2 changes: 1 addition & 1 deletion src/dataset_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,7 @@ def resolve_multimappers(self, chr_ids, sample, multimapped_reads):

for assignment_list in multimapped_reads.values():
if len(assignment_list) > 1:
multimap_resolver.resolve(assignment_list)
assignment_list = multimap_resolver.resolve(assignment_list)
resolved_lists = defaultdict(list)
for a in assignment_list:
resolved_lists[a.chr_id].append(a)
Expand Down
7 changes: 3 additions & 4 deletions src/gene_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@
equal_ranges,
get_intron_strand,
intervals_total_length,
is_subprofile,
overlaps,
junctions_from_blocks,
AtomicIDDistributor
junctions_from_blocks
)
from .id_policy import SimpleIDDistributor

logger = logging.getLogger('IsoQuant')

Expand Down Expand Up @@ -121,7 +120,7 @@ def print_debug(self):

# exon/intron info
class FeatureInfo:
feature_id_counter = AtomicIDDistributor()
feature_id_counter = SimpleIDDistributor()
def __init__(self, chr_id, start, end, strand, type, gene_ids):
self.id = FeatureInfo.feature_id_counter.increment()
self.chr_id = chr_id
Expand Down
19 changes: 8 additions & 11 deletions src/graph_based_model_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from enum import unique, Enum

from .common import (
TranscriptNaming,
cmp,
get_exons,
intersection_len,
Expand Down Expand Up @@ -46,10 +47,6 @@ class StrandnessReportingLevel(Enum):


class GraphBasedModelConstructor:
transcript_prefix = "transcript"
novel_gene_prefix = "novel_gene_"
nic_transcript_suffix = ".nic"
nnic_transcript_suffix = ".nnic"
detected_known_isoforms = set()
extended_transcript_ids = set()

Expand Down Expand Up @@ -411,7 +408,7 @@ def construct_fl_isoforms(self):
transcript_range = (path[0][1], path[-1][1])
novel_exons = get_exons(transcript_range, list(intron_path))
count = self.path_storage.paths[path]
new_transcript_id = self.transcript_prefix + str(self.get_transcript_id())
new_transcript_id = TranscriptNaming.transcript_prefix + str(self.get_transcript_id())
# logger.debug("uuu %s: %s" % (new_transcript_id, str(novel_exons)))

reference_isoform = None
Expand Down Expand Up @@ -482,17 +479,17 @@ def construct_fl_isoforms(self):

transcript_gene = self.select_reference_gene(intron_path, transcript_range, transcript_strand)
if transcript_gene is None:
transcript_gene = (GraphBasedModelConstructor.novel_gene_prefix + self.gene_info.chr_id +
transcript_gene = (TranscriptNaming.novel_gene_prefix + self.gene_info.chr_id +
"_" + str(self.get_transcript_id()))
elif transcript_strand == '.':
transcript_strand = self.gene_info.gene_strands[transcript_gene]

if all(intron in self.known_introns for intron in intron_path):
transcript_type = TranscriptModelType.novel_in_catalog
id_suffix = self.nic_transcript_suffix
id_suffix = TranscriptNaming.nic_transcript_suffix
else:
transcript_type = TranscriptModelType.novel_not_in_catalog
id_suffix = self.nnic_transcript_suffix
id_suffix = TranscriptNaming.nnic_transcript_suffix

new_model = TranscriptModel(self.gene_info.chr_id, transcript_strand,
new_transcript_id + ".%s" % self.gene_info.chr_id + id_suffix,
Expand Down Expand Up @@ -644,11 +641,11 @@ def generate_monoexon_from_clustered(self, clustered_reads, forward=True):

strand = '+' if forward else '-'
coordinates = (five_prime_pos, three_prime_pos) if forward else (three_prime_pos, five_prime_pos)
new_transcript_id = self.transcript_prefix + str(self.get_transcript_id())
transcript_gene = (GraphBasedModelConstructor.novel_gene_prefix + self.gene_info.chr_id +
new_transcript_id = TranscriptNaming.transcript_prefix + str(self.get_transcript_id())
transcript_gene = (TranscriptNaming.novel_gene_prefix + self.gene_info.chr_id +
"_" + str(self.get_transcript_id()))
transcript_type = TranscriptModelType.novel_not_in_catalog
id_suffix = self.nnic_transcript_suffix
id_suffix = TranscriptNaming.nnic_transcript_suffix

is_valid = True
half_len = interval_len(coordinates) / 2
Expand Down
21 changes: 16 additions & 5 deletions src/id_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
# # All Rights Reserved
# See file LICENSE for details.
############################################################################
import threading


from .graph_based_model_construction import GraphBasedModelConstructor
from src.common import TranscriptNaming


class SimpleIDDistributor(object):
Expand All @@ -25,7 +25,7 @@ def __init__(self, genedb, chr_id):
return

for g in genedb.region(seqid=chr_id, start=1, featuretype="gene"):
if g.id.startswith(GraphBasedModelConstructor.novel_gene_prefix):
if g.id.startswith(TranscriptNaming.novel_gene_prefix):
try:
gene_num = int(g.id.split("_")[-1])
self.forbidden_ids.add(gene_num)
Expand All @@ -34,9 +34,9 @@ def __init__(self, genedb, chr_id):
except ValueError:
pass

transcript_num_start_pos = len(GraphBasedModelConstructor.transcript_prefix)
transcript_num_start_pos = len(TranscriptNaming.transcript_prefix)
for t in genedb.region(seqid=chr_id, start=1, featuretype=("transcript", "mRNA")):
if t.id.startswith(GraphBasedModelConstructor.transcript_prefix):
if t.id.startswith(TranscriptNaming.transcript_prefix):
try:
transcript_num = int(t.id.split(".")[0][transcript_num_start_pos:])
self.forbidden_ids.add(transcript_num)
Expand Down Expand Up @@ -78,3 +78,14 @@ def get_id(self, chr_id, feature, strand):
feature_id = self.id_dict[feature_tuple]

return feature_id


class AtomicIDDistributor(object):
def __init__(self):
self.value = 0
self._lock = threading.Lock()

def increment(self):
with self._lock:
self.value += 1
return self.value
5 changes: 3 additions & 2 deletions src/isoform_assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@

import logging
from enum import Enum, unique
from src.common import AtomicIDDistributor, junctions_from_blocks
from src.common import junctions_from_blocks
from src.id_policy import SimpleIDDistributor
from src.serialization import *
from src.polya_finder import PolyAInfo

Expand Down Expand Up @@ -619,7 +620,7 @@ def serialize(self, outfile):


class ReadAssignment:
assignment_id_generator = AtomicIDDistributor()
assignment_id_generator = SimpleIDDistributor()

def __init__(self, read_id, assignment_type, match=None):
self.assignment_id = ReadAssignment.assignment_id_generator.increment()
Expand Down
37 changes: 35 additions & 2 deletions src/multimap_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
############################################################################

import logging
import math
from enum import Enum
from collections import defaultdict

from .isoform_assignment import ReadAssignmentType
from .common import intersection_len


logger = logging.getLogger('IsoQuant')

Expand Down Expand Up @@ -51,6 +53,7 @@ def select_best_assignment(self, assignment_list):
consistent_assignments = []
inconsistent_assignments = []
primary_inconsistent = []
noninformative = []

for i, a in enumerate(assignment_list):
if a.assignment_type.is_inconsistent():
Expand All @@ -61,6 +64,8 @@ def select_best_assignment(self, assignment_list):
consistent_assignments.append(i)
if not a.multimapper and not a.assignment_type == ReadAssignmentType.ambiguous:
primary_unique.append(i)
else:
noninformative.append(i)

if primary_unique:
return self.filter_assignments(assignment_list, primary_unique)
Expand All @@ -74,7 +79,11 @@ def select_best_assignment(self, assignment_list):
if inconsistent_assignments:
return self.select_best_inconsistent(assignment_list, inconsistent_assignments)

return assignment_list
if noninformative:
return self.select_noninformative(assignment_list, noninformative)

logger.warning("Unexpected assignments in multimap resolution %s" % assignment_list[0].read_id)
return assignment_list[0:1]

@staticmethod
def select_best_inconsistent(assignment_list, inconsistent_assignments):
Expand Down Expand Up @@ -184,3 +193,27 @@ def find_duplicates(assignment_list, assignment_indices):
% (example_assignment.read_id, example_assignment.chr_id, example_assignment.start))

return selected_assignments

@staticmethod
def select_noninformative(assignment_list, assignment_indices):
# triplets (overlap_length, genomic_region_start, index)
overlap_index_list = []
max_overlap_len = 0

for i in assignment_indices:
assignment = assignment_list[i]
overlap_len = intersection_len(assignment.genomic_region, (assignment.start, assignment.end))
max_overlap_len = max(overlap_len, max_overlap_len)
overlap_index_list.append((overlap_len, assignment.genomic_region[0], i))

# select assignment with the best overlap with genic region and lowest region start (for reproducibility)
min_region_start = math.inf
best_assignment = -1
for info in overlap_index_list:
if info[0] == max_overlap_len and info[1] < min_region_start:
min_region_start = info[1]
best_assignment = info[2]

assert best_assignment != -1
return MultimapResolver.filter_assignments(assignment_list, {best_assignment})

Loading