diff --git a/README.md b/README.md
index 5793368e..b4eb3100 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,9 @@ IsoQuant support all kinds of long RNA data:
Reads must be provided in FASTQ or FASTA format (can be gzipped). If you have already aligned your reads to the reference genome, simply provide sorted and indexed BAM files.
+IsoQuant expects reads to contain polyA tails. For more reliable transcript model construction, do not trim polyA tails.
+
+
## Supported reference data
@@ -519,6 +522,26 @@ The main explanation that some aligner report a lot of false unspliced alignment
for ONT reads.
+`--report_canonical`
+ Strategy for reporting novel transcripts based on canonical splice sites, should be one of:
+
+* `auto` - automatic selection based on the data type and model construction strategy (default);
+* `only_canonical` - report novel transcripts that contain only canonical splice sites;
+* `only_stranded` - report novel transcripts for which the strand can be unambiguously derived using splice sites and
+the presence of a polyA tail, allowing some splice sites to be non-canonical;
+* `all` - report all transcript models regardless of their splice sites.
+
+
+`--polya_requirement`
+ Strategy for using polyA tails during transcript model construction, should be one of:
+
+* `auto` - default behaviour: polyA tails are required if at least 70% of the reads have a polyA tail;
+polyA tails are always required for 1/2-exon transcripts when using ONT data (this is caused by an elevated number of false 1/2-exonic alignments reported by minimap2);
+* `never` - polyA tails are never required; use this option **at your own risk** as it may noticeably increase the false discovery rate, especially for ONT data;
+* `always` - reported transcripts are always required to have polyA support in the reads.
+
+Note that polyA tails are always required for reporting novel unspliced isoforms.
+
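+For example, a hypothetical invocation combining both options might look as follows
+(file names are placeholders; the input options are described in the sections above):
+
+```bash
+isoquant.py --reference genome.fa --genedb annotation.gtf --fastq reads.fastq.gz \
+    --data_type nanopore -o output_dir \
+    --report_canonical only_canonical --polya_requirement never
+```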
+
### Hidden options
@@ -528,9 +551,6 @@ We recommend _not_ to modify these options unless you are clearly aware of their
`--no_secondary`
Ignore secondary alignments.
-`--report_unstranded`
- Report transcripts for which the strand cannot be detected using canonical splice sites.
-
`--aligner`
Force to use this alignment method, can be `starlong` or `minimap2`; `minimap2` is currently used as default. Make sure the specified aligner is in the `$PATH` variable.
@@ -548,11 +568,9 @@ We recommend _not_ to modify these options unless you are clearly aware of their
for storing the annotation database, because SQLite database cannot be created on a shared disk.
The folder will be created automatically.
-`--low_memory`
- Deprecated, default behaviour since 3.2.
-
`--high_memory`
- Cache read alignments instead for making several passes over a BAM file, noticeably increases RAM usage.
+ Cache read alignments instead of making several passes over a BAM file; noticeably increases RAM usage,
+but may improve running time when disk I/O is relatively slow.
`--min_mapq`
Filters out all alignments with MAPQ less than this value (will also filter all secondary alignments, as they typically have MAPQ = 0).
diff --git a/isoquant.py b/isoquant.py
index 0d89a940..dd8bbd27 100755
--- a/isoquant.py
+++ b/isoquant.py
@@ -29,7 +29,8 @@
NANOPORE_DATA,
DataSetReadMapper
)
-from src.dataset_processor import DatasetProcessor
+from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies
+from src.graph_based_model_construction import StrandnessReportingLevel
from src.long_read_assigner import AmbiguityResolvingMethod
from src.long_read_counter import COUNTING_STRATEGIES
from src.input_data_storage import InputDataStorage
@@ -48,10 +49,10 @@ def bool_str(s):
def parse_args(cmd_args=None, namespace=None):
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
- ref_args_group = parser.add_argument_group('Reference data:')
- input_args_group = parser.add_argument_group('Input data:')
- output_args_group = parser.add_argument_group('Output naming:')
- pipeline_args_group = parser.add_argument_group('Pipeline options:')
+ ref_args_group = parser.add_argument_group('Reference data')
+ input_args_group = parser.add_argument_group('Input data')
+ output_args_group = parser.add_argument_group('Output naming')
+ pipeline_args_group = parser.add_argument_group('Pipeline options')
- other_options = parser.add_argument_group("Additional options:")
+ other_options = parser.add_argument_group("Additional options")
show_full_help = '--full_help' in cmd_args
@@ -150,16 +151,20 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
help="gene quantification strategy", type=str, default="with_inconsistent")
add_additional_option("--report_novel_unspliced", "-u", type=bool_str,
help="report novel monoexonic transcripts (true/false), "
- "default: False for ONT, True for other data types")
+ "default: false for ONT, true for other data types")
+ add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel],
+ help="reporting level for novel transcripts based on canonical splice sites;"
+ " default: " + StrandnessReportingLevel.auto.name,
+ default=StrandnessReportingLevel.auto.name)
+ add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies],
+ help="require polyA tails to be present when reporting transcripts; "
+ "default: auto (requires polyA only when polyA percentage is >= 70%%)",
+ default=PolyAUsageStrategies.auto.name)
# OUTPUT PROPERTIES
pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int,
default="16")
pipeline_args_group.add_argument('--check_canonical', action='store_true', default=False,
help="report whether splice junctions are canonical")
- add_additional_option_to_group(pipeline_args_group, "--report_unstranded",
- help="report transcripts for which the strand cannot be detected "
- "using canonical splice sites",
- action='store_true', default=False)
pipeline_args_group.add_argument("--sqanti_output", help="produce SQANTI-like TSV output",
action='store_true', default=False)
pipeline_args_group.add_argument("--count_exons", help="perform exon and intron counting",
@@ -180,8 +185,6 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
help="align reads to reference without running further analysis")
# ADDITIONAL
- add_additional_option("--low_memory", help="decrease RAM consumption (deprecated, set by default)",
- action='store_true', default=True)
add_additional_option("--high_memory", help="increase RAM consumption (store alignment and the genome in RAM)",
action='store_true', default=False)
add_additional_option("--no_junc_bed", action="store_true", default=False,
@@ -234,8 +237,12 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
type=str, required=True)
resume_parser.add_argument('--debug', action='store_true', default=argparse.SUPPRESS,
help='Debug log output.')
- resume_parser.add_argument("--threads", "-t", help="number of threads to use", type=int, default=argparse.SUPPRESS)
- resume_parser.add_argument("--low_memory", help="decrease RAM consumption (deprecated, set by default)",
+ resume_parser.add_argument("--threads", "-t", help="number of threads to use",
+ type=int, default=argparse.SUPPRESS)
+ resume_parser.add_argument("--high_memory",
+ help="increase RAM consumption (store alignment and the genome in RAM)",
action='store_true', default=argparse.SUPPRESS)
resume_parser.add_argument("--keep_tmp", help="do not remove temporary files in the end", action='store_true',
default=argparse.SUPPRESS)
@@ -271,11 +278,12 @@ def check_and_load_args(args, parser):
logger.warning("Output folder already contains a previous run, will be overwritten.")
else:
logger.warning("Output folder already contains a previous run, some files may be overwritten. "
- "Use --resume to resume a failed run. Use --force to avoid this message. "
- "Press Ctrl+C to interrupt the run now.")
+ "Use --resume to resume a failed run. Use --force to avoid this message.")
+ logger.warning("Press Ctrl+C to interrupt the run now.")
delay = 9
for i in range(delay):
- sys.stdout.write("Resuming the run in %d seconds\r" % (delay - i))
+ countdown = delay - i
+ sys.stdout.write("Overwriting the previous run in %d second%s\r" % (countdown, "s" if countdown > 1 else ""))
time.sleep(1)
logger.info("Overwriting the previous run")
time.sleep(1)
@@ -288,9 +296,6 @@ def check_and_load_args(args, parser):
elif not os.path.exists(args.genedb_output):
os.makedirs(args.genedb_output)
- if args.high_memory:
- args.low_memory = False
-
if not check_input_params(args):
parser.print_usage()
exit(-1)
@@ -501,7 +506,7 @@ def set_data_dependent_options(args):
args.splice_correction_strategy = splice_correction_strategies[args.data_type]
args.resolve_ambiguous = 'monoexon_and_fsm' if args.fl_data else 'default'
- args.needs_polya_for_construction = False
+ args.requires_polya_for_construction = False
if args.read_group is None and args.input_data.has_replicas():
args.read_group = "file_name"
args.use_technical_replicas = args.read_group == "file_name"
@@ -591,16 +596,26 @@ def set_model_construction_options(args):
'min_known_count', 'min_nonfl_count',
'min_novel_count', 'min_novel_count_rel',
'min_mono_count_rel', 'singleton_adjacent_cov',
- 'fl_only', 'novel_monoexonic'))
+ 'fl_only', 'novel_monoexonic',
+ 'require_monointronic_polya', 'require_monoexonic_polya',
+ 'report_canonical'))
strategies = {
- 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50, True, False),
- 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100, False, True),
- 'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100, False, True),
- 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10, False, False),
- 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10, False, True),
- 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100, True, True),
- 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500, False, True),
- 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50, False, True)
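+ # trailing fields of each strategy: fl_only, novel_monoexonic,
+ # require_monointronic_polya, require_monoexonic_polya, report_canonical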
+ 'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50,
+ True, False, True, True, StrandnessReportingLevel.only_canonical),
+ 'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100,
+ False, True, False, True, StrandnessReportingLevel.only_canonical),
+ 'sensitive_pacbio': ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100,
+ False, True, False, False, StrandnessReportingLevel.only_stranded),
+ 'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10,
+ False, False, True, True, StrandnessReportingLevel.only_canonical),
+ 'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10,
+ False, True, False, False, StrandnessReportingLevel.only_stranded),
+ 'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100,
+ True, True, False, False, StrandnessReportingLevel.only_canonical),
+ 'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500,
+ False, True, False, False, StrandnessReportingLevel.all),
+ 'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50,
+ False, True, False, False, StrandnessReportingLevel.only_stranded)
}
strategy = strategies[args.model_construction_strategy]
@@ -633,6 +648,13 @@ def set_model_construction_options(args):
logger.info("Novel unspliced transcripts will not be reported, "
"set --report_novel_unspliced true to discover them")
+ args.require_monointronic_polya = strategy.require_monointronic_polya
+ args.require_monoexonic_polya = strategy.require_monoexonic_polya
+ args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement]
+ args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical]
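+ # 'auto' falls back to the reporting level defined by the model construction strategy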
+ if args.report_canonical_strategy == StrandnessReportingLevel.auto:
+ args.report_canonical_strategy = strategy.report_canonical
+
def set_configs_directory(args):
config_dir = os.path.join(os.environ['HOME'], '.config', 'IsoQuant')
@@ -656,8 +678,6 @@ def set_additional_params(args):
set_splice_correction_options(args)
args.print_additional_info = True
- args.no_polya = False
-
args.indel_near_splice_site_dist = 10
args.upstream_region_len = 20
@@ -673,6 +693,8 @@ def set_additional_params(args):
args.needs_reference = False
args.simple_models_mapq_cutoff = 30
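+ # fraction of polyA reads above which polyA tails become required for model construction;
+ # below the low threshold a warning about possibly trimmed reads is emitted (see process_sample)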
+ args.polya_percentage_threshold = 0.7
+ args.low_polya_percentage_threshold = 0.1
def run_pipeline(args):
diff --git a/src/alignment_processor.py b/src/alignment_processor.py
index d916ff94..67df2262 100644
--- a/src/alignment_processor.py
+++ b/src/alignment_processor.py
@@ -229,7 +229,7 @@ def __init__(self, chr_id, bam_pairs, params, illumina_bam, genedb=None, chr_rec
self.bam_merger = BAMOnlineMerger(self.bam_pairs, self.chr_id, 0,
self.bam_pairs[0][0].get_reference_length(self.chr_id),
- multiple_iterators=self.params.low_memory)
+ multiple_iterators=not self.params.high_memory)
self.strand_detector = StrandDetector(self.chr_record)
self.read_groupper = read_groupper
self.polya_finder = PolyAFinder(self.params.polya_window, self.params.polya_fraction)
@@ -238,7 +238,7 @@ def __init__(self, chr_id, bam_pairs, params, illumina_bam, genedb=None, chr_rec
def process(self):
current_region = None
- alignment_storage = BAMAlignmentStorage(self.bam_merger) if self.params.low_memory else InMemoryAlignmentStorage()
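+ # high-memory mode keeps all alignments in RAM instead of re-reading the BAM file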
+ alignment_storage = InMemoryAlignmentStorage() if self.params.high_memory else BAMAlignmentStorage(self.bam_merger)
for bam_index, alignment in self.bam_merger.get():
if alignment_storage.alignment_is_not_adjacent(alignment):
diff --git a/src/dataset_processor.py b/src/dataset_processor.py
index 2e5099e5..8559097a 100644
--- a/src/dataset_processor.py
+++ b/src/dataset_processor.py
@@ -10,7 +10,7 @@
import logging
import os
import shutil
-import time
+from enum import Enum, unique
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
@@ -72,9 +72,25 @@ def clean_locks(chr_ids, base_name, fname_function):
os.remove(fname)
+@unique
+class PolyAUsageStrategies(Enum):
+ auto = 1
+ never = 2
+ always = 3
+
+
+def set_polya_requirement_strategy(flag, polya_requirement_strategy):
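+ # 'auto' keeps the data-derived flag; 'never' and 'always' override it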
+ if polya_requirement_strategy == PolyAUsageStrategies.auto:
+ return flag
+ elif polya_requirement_strategy == PolyAUsageStrategies.never:
+ return False
+ else:
+ return True
+
+
def collect_reads_in_parallel(sample, chr_id, args):
current_chr_record = Fasta(args.reference)[chr_id]
- if not args.low_memory:
+ if args.high_memory:
current_chr_record = str(current_chr_record)
read_grouper = create_read_grouper(args, sample, chr_id)
lock_file = reads_collected_lock_file_name(sample.out_raw_file, chr_id)
@@ -406,7 +422,21 @@ def process_sample(self, sample):
polya_fraction = polya_found / total_alignments if total_alignments > 0 else 0.0
logger.info("Total alignments processed: %d, polyA tail detected in %d (%.1f%%)" %
(total_alignments, polya_found, polya_fraction * 100.0))
- self.args.needs_polya_for_construction = polya_fraction >= 0.7
+ if polya_fraction < self.args.low_polya_percentage_threshold:
+ logger.warning("PolyA percentage is suspiciously low. IsoQuant expects non-polya-trimmed reads. "
+ "If you aim to construct transcript models, consider using --polya_requirement option.")
+
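+ # polyA tails are required by default only when most reads have them; --polya_requirement overrides this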
+ self.args.requires_polya_for_construction = set_polya_requirement_strategy(
+ polya_fraction >= self.args.polya_percentage_threshold,
+ self.args.polya_requirement_strategy)
+ self.args.require_monointronic_polya = set_polya_requirement_strategy(
+ # polyA tails are required for mono-intronic transcripts if the strategy demands it or the overall polyA percentage is high
+ self.args.require_monointronic_polya or self.args.requires_polya_for_construction,
+ self.args.polya_requirement_strategy)
+ self.args.require_monoexonic_polya = set_polya_requirement_strategy(
+ # same rule for mono-exonic transcripts
+ self.args.require_monoexonic_polya or self.args.requires_polya_for_construction,
+ self.args.polya_requirement_strategy)
self.process_assigned_reads(sample, saves_file)
if not self.args.read_assignments and not self.args.keep_tmp:
@@ -510,12 +540,26 @@ def process_assigned_reads(self, sample, dump_filename):
reverse=True
)
logger.info("Processing assigned reads " + sample.prefix)
+ logger.info("Transcript models construction is turned %s" %
+ ("off" if self.args.no_model_construction else "on"))
# set up aggregators and outputs
aggregator = ReadAssignmentAggregator(self.args, sample, self.all_read_groups)
transcript_stat_counter = EnumStats()
if not self.args.no_model_construction:
+ logger.info("Transcript construction options:")
+ logger.info(" Novel monoexonic transcripts will be reported: %s"
+ % ("yes" if self.args.report_novel_unspliced else "no"))
+ logger.info(" PolyA tails are required for multi-exon transcripts to be reported: %s"
+ % ("yes" if self.args.requires_polya_for_construction else "no"))
+ logger.info(" PolyA tails are required for 2-exon transcripts to be reported: %s"
+ % ("yes" if self.args.require_monointronic_polya else "no"))
+ logger.info(" PolyA tails are required for known monoexon transcripts to be reported: %s"
+ % ("yes" if self.args.require_monoexonic_polya else "no"))
+ logger.info(" PolyA tails are required for novel monoexon transcripts to be reported: %s" % "yes")
+ logger.info(" Splice site reporting level: %s" % self.args.report_canonical_strategy.name)
+
gff_printer = GFFPrinter(
sample.out_dir, sample.prefix, self.io_support, header=self.common_header
)
diff --git a/src/gene_info.py b/src/gene_info.py
index 88ce30f2..2a0e318d 100644
--- a/src/gene_info.py
+++ b/src/gene_info.py
@@ -635,7 +635,7 @@ def set_strand(self, intron, strand=None):
elif self.chr_record:
self.strand_dict[intron] = get_intron_strand(intron, self.chr_record)
- def get_strand(self, introns, has_polya=False, has_polyt=False):
+ def count_canonical_sites(self, introns):
count_fwd = 0
count_rev = 0
for intron in introns:
@@ -648,6 +648,19 @@ def get_strand(self, introns, has_polya=False, has_polyt=False):
count_fwd += 1
elif strand == '-':
count_rev += 1
+ return count_fwd, count_rev
+
+ # unlike get_strand, all canonical splice sites must support the same strand, not just the majority
+ def get_clean_strand(self, introns):
+ count_fwd, count_rev = self.count_canonical_sites(introns)
+ if count_fwd == 0 and count_rev > 0:
+ return '-'
+ elif count_fwd > 0 and count_rev == 0:
+ return '+'
+ return '.'
+
+ def get_strand(self, introns, has_polya=False, has_polyt=False):
+ count_fwd, count_rev = self.count_canonical_sites(introns)
if count_fwd == count_rev:
if has_polya and not has_polyt:
return '+'
diff --git a/src/graph_based_model_construction.py b/src/graph_based_model_construction.py
index 8b7b3820..526deff8 100644
--- a/src/graph_based_model_construction.py
+++ b/src/graph_based_model_construction.py
@@ -7,6 +7,7 @@
import logging
from collections import defaultdict
from functools import cmp_to_key
+from enum import unique, Enum
from .common import (
AtomicCounter,
@@ -30,6 +31,14 @@
logger = logging.getLogger('IsoQuant')
+@unique
+class StrandnessReportingLevel(Enum):
+ only_canonical = 1
+ only_stranded = 2
+ all = 3
+ auto = 10
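+ # 'auto' is resolved to a strategy-specific level in set_model_construction_options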
+
+
class GraphBasedModelConstructor:
transcript_id_counter = AtomicCounter()
transcript_prefix = "transcript"
@@ -128,6 +137,7 @@ def process(self, read_assignment_storage):
self.construct_fl_isoforms()
self.construct_assignment_based_isoforms(read_assignment_storage)
+ # FIXME: assign reads AFTER filtering
self.assign_reads_to_models(read_assignment_storage)
self.filter_transcripts()
@@ -369,18 +379,22 @@ def construct_fl_isoforms(self):
has_polya = path[-1][0] == VERTEX_polya
polya_site = has_polya or has_polyt
transcript_strand = self.strand_detector.get_strand(intron_path, has_polya, has_polyt)
- transcript_ss_strand = self.strand_detector.get_strand(intron_path)
+ transcript_clean_strand = self.strand_detector.get_clean_strand(intron_path)
#logger.debug("uuu Novel isoform %s has coverage: %d cutoff = %d, component cov = %d, max_coverage = %d"
# % (new_transcript_id, count, novel_isoform_cutoff, component_coverage, self.intron_graph.max_coverage))
if count < novel_isoform_cutoff:
# logger.debug("uuu Novel isoform %s has low coverage: %d\t%d" % (new_transcript_id, count, novel_isoform_cutoff))
pass
- elif len(novel_exons) == 2 and (not polya_site or transcript_ss_strand == '.'):
+ elif (len(novel_exons) == 2 and
+ ((self.params.require_monointronic_polya and not polya_site) or transcript_clean_strand == '.')):
# logger.debug("uuu Avoiding single intron %s isoform: %d\t%s" % (new_transcript_id, count, str(path)))
pass
- elif transcript_strand == '.' and not self.params.report_unstranded:
- logger.info("Avoiding unreliable transcript with %d exons" % len(novel_exons))
+ elif ((self.params.report_canonical_strategy == StrandnessReportingLevel.only_canonical
+ and transcript_clean_strand == '.')
+ or (self.params.report_canonical_strategy == StrandnessReportingLevel.only_stranded
+ and transcript_strand == '.')):
+ logger.debug("Avoiding unreliable transcript with %d exons" % len(novel_exons))
pass
else:
if self.params.use_technical_replicas and \
@@ -468,13 +482,13 @@ def construct_assignment_based_isoforms(self, read_assignment_storage):
# (read_assignment.read_id, refrenence_isoform_id))
spliced_isoform_reads[refrenence_isoform_id].append(read_assignment)
- if self.params.needs_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '-':
+ if self.params.requires_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '-':
if any(x.event_type == MatchEventSubtype.correct_polya_site_left for x in events):
isoform_left_support[refrenence_isoform_id] += 1
elif abs(self.gene_info.all_isoforms_exons[refrenence_isoform_id][0][0] - read_assignment.corrected_exons[0][0]) <= self.params.apa_delta:
isoform_left_support[refrenence_isoform_id] += 1
- if self.params.needs_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '+':
+ if self.params.requires_polya_for_construction and self.gene_info.isoform_strands[refrenence_isoform_id] == '+':
if any(x.event_type == MatchEventSubtype.correct_polya_site_right for x in events):
isoform_right_support[refrenence_isoform_id] += 1
elif abs(self.gene_info.all_isoforms_exons[refrenence_isoform_id][-1][1] - read_assignment.corrected_exons[-1][1]) <= self.params.apa_delta:
@@ -596,7 +610,8 @@ def construct_monoexon_isoforms(self, mono_exon_isoform_reads, mono_exon_isoform
polya_support = polya_sites[isoform_id]
# logger.debug(">> Monoexon transcript %s: %d\t%d\t%.4f\t%d" % (isoform_id, self.intron_graph.max_coverage, count, coverage, polya_support))
- if count < self.params.min_known_count or coverage < self.params.min_mono_exon_coverage or polya_support == 0:
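+ # known mono-exon isoforms need polyA support only when the strategy requires it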
+ if (count < self.params.min_known_count or coverage < self.params.min_mono_exon_coverage or
+ (self.params.require_monoexonic_polya and polya_support == 0)):
pass # logger.debug(">> Will NOT be added, abs cutoff=%d" % (self.params.min_known_count))
elif isoform_id not in GraphBasedModelConstructor.detected_known_isoforms:
new_model = self.transcript_from_reference(isoform_id)
@@ -761,7 +776,7 @@ def fill(self, read_assignments):
path_tuple = tuple(intron_path)
self.paths[path_tuple] += 1
if terminal_vertex and starting_vertex:
- if not self.params.needs_polya_for_construction or\
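+ # a path counts as full-length if it ends with polyA/polyT, or if polyA tails are not required at all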
+ if not self.params.requires_polya_for_construction or \
(terminal_vertex[0] == VERTEX_polya or starting_vertex[0] == VERTEX_polyt):
self.fl_paths.add(path_tuple)
self.paths_to_reads[path_tuple].append(a)