diff --git a/tallytrin/pipeline_10x.py b/tallytrin/pipeline_10x.py index a194fc2..72ca27c 100644 --- a/tallytrin/pipeline_10x.py +++ b/tallytrin/pipeline_10x.py @@ -153,7 +153,7 @@ def correct_polyA(infile, outfile): PYTHON_ROOT = os.path.join(os.path.dirname(__file__), "python/") - statement = '''python %(PYTHON_ROOT)s/complement_polyA_singlecell.py --infile=%(infile)s --outname=%(outfile)s && truncate -s 0 %(infile)s''' + statement = '''python %(PYTHON_ROOT)s/complement_polyA_singlecell.py --infile=%(infile)s --outname=%(outfile)s''' P.run(statement, job_options='-t 24:00:00') @@ -177,7 +177,7 @@ def identify_bcumi(infile, outfile): PYTHON_ROOT = os.path.join(os.path.dirname(__file__), "python/") statement = '''python %(PYTHON_ROOT)s/10x_identify_barcode.py --outfile=%(outfile)s --infile=%(infile)s --whitelist=polyA_umi.dir/%(name)s.whitelist.txt - --cmimode=%(cmimode)s --barcode=%(barcode)s --umi=%(umi_length)s && truncate -s 0 %(infile)s''' + --cmimode=%(cmimode)s --barcode=%(barcode)s --umi=%(umi_length)s''' P.run(statement, job_options='-t 24:00:00') diff --git a/tallytrin/python/10x_identify_barcode.py b/tallytrin/python/10x_identify_barcode.py index 90e8c8f..a3c7089 100644 --- a/tallytrin/python/10x_identify_barcode.py +++ b/tallytrin/python/10x_identify_barcode.py @@ -53,50 +53,32 @@ n = 0 y = 0 + +# Construct a regex pattern that matches the sequence with up to 2 mismatches in the specified middle part +sequence_pattern = r"AGATCGGAAGAGCGT" + + + with pysam.FastxFile(args.infile) as fh: for record in fh: n+=1 - first = 0 - seq = record.sequence - for a, b in zip(Bio.pairwise2.align.localms(seq,"AGATCGGAAGAGCGT", 2, -1, -0.5, -0.1), Bio.pairwise2.align.localms(seq,"AAAAAAAAA", 2, -1, -0.5, -0.1)): - first +=1 - if first == 1: - al1_a, al2_a, score_a, begin_a, end_a = a - al1_a, al2_b, score_b, begin_b, end_b = b - else: - first = 0 - break - length_umibarcode = len(record.sequence[end_b:begin_a]) - - if length_umibarcode >= 20: - y+=1 - if args.cmimode == '1': - umi = record.sequence[begin_a:end_a] - umi = umi[:15] - else: - umi_start = int(16 + args.umi) - print(umi_start) - umi = record.sequence[begin_a-umi_start:begin_a-16] - print(umi) - if len(umi) == 15 and args.cmimode == '1': - barcode = record.sequence[begin_a-16:begin_a][:int(args.barcode)] - barcodes.append(barcode) - seq_new = record.sequence[:begin_b] - quality_new = record.quality[:begin_b] - - outfile.write("@%s\n%s\n+\n%s\n" % (record.name + "_" + barcode + "_" + umi, seq_new, quality_new)) - else: - barcode = record.sequence[begin_a-16:begin_a][:int(args.barcode)] - barcodes.append(barcode) - seq_new = record.sequence[:begin_b] - quality_new = record.quality[:begin_b] - - outfile.write("@%s\n%s\n+\n%s\n" % (record.name + "_" + barcode + "_" + umi, seq_new, quality_new)) - + match = regex.search(sequence_pattern, seq) + if match: + y += 1 + barcode = seq[match.start()-16:match.start()] + umi = seq[match.start() - 16 - args.umi:match.start() - 16] + barcodes.append(barcode) + + seq_new = seq[:match.start()-28] + quality_new = record.quality[:match.start()-28] + + outfile.write(f"@{record.name}_{barcode}_{umi}\n{seq_new}\n+\n{quality_new}\n") + + outfile.close() @@ -108,9 +90,5 @@ out_barcodes.close() -log.write("The number of total reads: %s\n" %(n)) -log.write("The number of total reads that have a correct barcode and UMI: %s\n" %(y)) -log.write("The number of total recovered percent is: %s\n" %((y/n)*100)) - log.close()