From 946b1d18bfbfb3de1a6dc4b50cb83196fb62400c Mon Sep 17 00:00:00 2001 From: Derek Croote Date: Tue, 31 Jul 2018 18:07:02 -0700 Subject: [PATCH 1/9] Download IgBLAST optional_file directory This directory is necessary for IgBLAST CDR3 calling --- Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3e1d87d..0817286 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,10 @@ RUN cd /trinityrnaseq-Trinity-v2.4.0 && make RUN wget ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/1.7.0/ncbi-igblast-1.7.0-x64-linux.tar.gz RUN tar -xzvf ncbi-igblast-1.7.0-x64-linux.tar.gz && rm ncbi-igblast-1.7.0-x64-linux.tar.gz RUN cd /ncbi-igblast-1.7.0/bin/ && wget -r ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data && \ - mv ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data . && rm -r ftp.ncbi.nih.gov + wget -r ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file && \ + mv ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data . && \ + mv ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file . && \ + rm -r ftp.ncbi.nih.gov #aligners - kallisto and salmon RUN wget https://github.com/pachterlab/kallisto/releases/download/v0.43.1/kallisto_linux-v0.43.1.tar.gz From 19185714530590cc78565c5271bfd56f26275236 Mon Sep 17 00:00:00 2001 From: Derek Croote Date: Tue, 31 Jul 2018 18:10:18 -0700 Subject: [PATCH 2/9] Report CDR3 nucleotide seqs parsed from IgBLAST --- tracerlib/core.py | 7 +++++-- tracerlib/tasks.py | 14 +++++++++++--- tracerlib/tracer_func.py | 24 ++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/tracerlib/core.py b/tracerlib/core.py index 54c20d8..72732c4 100644 --- a/tracerlib/core.py +++ b/tracerlib/core.py @@ -302,7 +302,7 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers, productive, stop_codon, in_frame, TPM, dna_seq, hit_table, summary, junction_details, best_VJ_names, alignment_summary, trinity_seq, - imgt_reconstructed_seq, has_D): + imgt_reconstructed_seq, has_D, cdr3nt): self.contig_name = contig_name self.locus = locus self.identifier = identifier @@ -321,6 +321,7 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers, self.trinity_seq = trinity_seq self.imgt_reconstructed_seq = imgt_reconstructed_seq self.has_D_segment = has_D + self.cdr3nt = cdr3nt def __str__(self): return ( @@ -367,10 +368,12 @@ def get_summary(self): summary_string += segments_string summary_string += "ID:\t{}\n".format(self.identifier) summary_string += "TPM:\t{TPM}\nProductive:\t{productive}\nStop codon:" \ - "\t{stop_codon}\nIn frame:\t{in_frame}\n\n".format( + "\t{stop_codon}\nIn frame:\t{in_frame}\n".format( TPM=self.TPM, productive=self.productive, stop_codon=self.stop_codon, in_frame=self.in_frame) + summary_string += "CDR3nt:\t{}\n\n".format(self.cdr3nt) + summary_string += 'Segment\tquery_id\tsubject_id\t% identity\t' \ 'alignment length\tmismatches\tgap opens\tgaps' \ '\tq start\tq end\ts start\ts end\te value\tbit score\n' diff --git a/tracerlib/tasks.py b/tracerlib/tasks.py index c07cfcc..332e11d 100644 --- a/tracerlib/tasks.py +++ b/tracerlib/tasks.py @@ -1014,7 +1014,7 @@ def run(self): # Write out recombinant details for each cell with open("{}/recombinants.txt".format(outdir), 'w') as f: f.write( - "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\n") + "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\tCDR3nt\n") sorted_cell_names = sorted(list(cells.keys())) for cell_name in sorted_cell_names: cell = cells[cell_name] @@ -1025,12 +1025,20 @@ def run(self): recombinants = cell.recombinants[self.receptor_name][locus] if recombinants is not None: for r in recombinants: + + # check for cdr3nt attribute (to make backwards compatible) + if hasattr(r, 'cdr3nt'): + cdr3nt = r.cdr3nt + else: + cdr3nt = 'N/A' + f.write( - "{name}\t{locus}\t{ident}\t{productive}\t{length}\n".format( + "{name}\t{locus}\t{ident}\t{productive}\t{length}\t{cdr3nt}\n".format( name=cell_name, locus=locus, ident=r.identifier, productive=r.productive, - length=len(r.trinity_seq))) + length=len(r.trinity_seq), + cdr3nt=cdr3nt)) if r.productive: cell_data[locus + "_productive"] = r.identifier else: diff --git a/tracerlib/tracer_func.py b/tracerlib/tracer_func.py index 5a04205..37d2b71 100644 --- a/tracerlib/tracer_func.py +++ b/tracerlib/tracer_func.py @@ -42,6 +42,7 @@ def process_chunk(chunk): store_junction_details = False store_alignment_summary = False store_hit_table = False + store_CDR3 = False alignment_summary = [] hit_table = [] looking_for_end = False @@ -80,6 +81,13 @@ def process_chunk(chunk): else: return_dict['hit_table'].append(line_x) + elif store_CDR3: + # single tab-separated line, example: + # CDR3 GCGTGGAAAGTG AWKV 51 59 + _, cdr3nt, _cdr3aa, _start, _end = line_x.split('\t') + return_dict['cdr3'].append(cdr3nt) + store_CDR3 = False + elif line_x.startswith('# Query'): query_name = line_x.split(" ")[2] query_length = None @@ -100,6 +108,9 @@ def process_chunk(chunk): elif line_x.startswith('# V-(D)-J junction details'): store_junction_details = True + elif line_x.startswith('# Sub-region sequence'): + store_CDR3 = True + elif line_x.startswith('# Alignment summary'): store_alignment_summary = True @@ -152,6 +163,12 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs, identifier = best_V + "_" + junc_string + "_" + best_J + # CDR3 nucleotide sequences + if 'cdr3' in query_data.keys(): + cdr3nt = query_data['cdr3'][0] + else: + cdr3nt = 'N/A' + # line attempting to add alignment summary to data for use # with PCR comparisons alignment_summary = query_data['alignment_summary'] @@ -227,7 +244,8 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs, alignment_summary=alignment_summary, trinity_seq=trinity_seq, imgt_reconstructed_seq=imgt_reconstructed_seq, - has_D=has_D) + has_D=has_D, + cdr3nt=cdr3nt) recombinants[locus].append(rec) if recombinants: @@ -1242,7 +1260,9 @@ def run_IgBlast(igblast, receptor, loci, output_dir, cell_name, index_location, '-ig_seqtype', ig_seqtype, '-show_translation', '-num_alignments_V', '5', '-num_alignments_D', '5', '-num_alignments_J', '5', - '-outfmt', fmt, '-query', trinity_fasta] + '-outfmt', fmt, + '-auxiliary_data', 'optional_file/{}_gl.aux'.format(igblast_species), + '-query', trinity_fasta] if fmt == '7': igblast_out = "{output_dir}/IgBLAST_output/{cell_name}_{locus}.IgBLASTOut".format( output_dir=output_dir, From 70e44c586ee6535bea0cbe1c604363dba9c91e81 Mon Sep 17 00:00:00 2001 From: Derek Croote Date: Thu, 2 Aug 2018 10:57:12 -0700 Subject: [PATCH 3/9] Update README to reflect IgBLAST optional_file use --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 953c63d..9831c17 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Note that TraCeR is compatible with both Python 2 and 3. 6. [Graphviz](http://www.graphviz.org) - Dot and Neato drawing programs required for visualisation of clonotype graphs. This is optional - see the [`--no_networks` option](#options-1) to [`summarise`](#summarise-summary-and-clonotype-networks). ##### Installing IgBlast -Downloading the executable files from `ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/` is not sufficient for a working IgBlast installation. You must also download the `internal_data` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data) and put it into the same directory as the igblast executable. This is also described in the igblast README file. +Downloading the executable files from `ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/` is not sufficient for a working IgBlast installation. You must also download the `internal_data` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data) and `optional_file` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file/) and put them into the same directory as the igblast executable. This is also described in the igblast README file. You should also ensure to set the `$IGDATA` environment variable to point to the location of the IgBlast executable. For example run `export IGDATA=//igblast/1.4.0/bin`. From e702ccbf7f9c99079b9b4d32ea4e31373cf6bdd5 Mon Sep 17 00:00:00 2001 From: Derek Croote Date: Mon, 6 Aug 2018 21:16:39 -0700 Subject: [PATCH 4/9] Report amino acid CDR3 from IgBLAST --- tracerlib/core.py | 28 ++++------------------------ tracerlib/tasks.py | 7 +++++-- tracerlib/tracer_func.py | 10 ++++++---- 3 files changed, 15 insertions(+), 30 deletions(-) diff --git a/tracerlib/core.py b/tracerlib/core.py index 72732c4..4b83e19 100644 --- a/tracerlib/core.py +++ b/tracerlib/core.py @@ -302,7 +302,7 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers, productive, stop_codon, in_frame, TPM, dna_seq, hit_table, summary, junction_details, best_VJ_names, alignment_summary, trinity_seq, - imgt_reconstructed_seq, has_D, cdr3nt): + imgt_reconstructed_seq, has_D, cdr3nt, cdr3): self.contig_name = contig_name self.locus = locus self.identifier = identifier @@ -310,7 +310,6 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers, self.productive = productive self.TPM = TPM self.dna_seq = dna_seq - self.cdr3 = self._get_cdr3(dna_seq) self.hit_table = hit_table self.summary = summary self.junction_details = junction_details @@ -322,32 +321,12 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers, self.imgt_reconstructed_seq = imgt_reconstructed_seq self.has_D_segment = has_D self.cdr3nt = cdr3nt + self.cdr3 = cdr3 def __str__(self): return ( "{} {} {} {}".format(self.identifier, self.productive, self.TPM)) - def _get_cdr3(self, dna_seq): - aaseq = Seq(str(dna_seq), generic_dna).translate() - if re.findall('FG.G', str(aaseq)) and re.findall('C', str(aaseq)): - indices = [i for i, x in enumerate(aaseq) if x == 'C'] - upper = str(aaseq).find(re.findall('FG.G', str(aaseq))[0]) - lower = False - for i in indices: - if i < upper: - lower = i - if lower: - cdr3 = aaseq[lower:upper + 4] - else: - cdr3 = "Couldn't find conserved cysteine" - elif re.findall('FG.G', str(aaseq)): - cdr3 = "Couldn't find conserved cysteine" - elif re.findall('C', str(aaseq)): - cdr3 = "Couldn't find FGXG" - else: - cdr3 = "Couldn't find either conserved boundary" - return (cdr3) - def get_summary(self): summary_string = "##{contig_name}##\n".format( contig_name=self.contig_name) @@ -372,7 +351,8 @@ def get_summary(self): TPM=self.TPM, productive=self.productive, stop_codon=self.stop_codon, in_frame=self.in_frame) - summary_string += "CDR3nt:\t{}\n\n".format(self.cdr3nt) + summary_string += "CDR3aa:\t{}\nCDR3nt:\t{}\n\n".format(self.cdr3, + self.cdr3nt) summary_string += 'Segment\tquery_id\tsubject_id\t% identity\t' \ 'alignment length\tmismatches\tgap opens\tgaps' \ diff --git a/tracerlib/tasks.py b/tracerlib/tasks.py index 332e11d..5489a45 100644 --- a/tracerlib/tasks.py +++ b/tracerlib/tasks.py @@ -1014,7 +1014,7 @@ def run(self): # Write out recombinant details for each cell with open("{}/recombinants.txt".format(outdir), 'w') as f: f.write( - "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\tCDR3nt\n") + "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\tCDR3aa\tCDR3nt\n") sorted_cell_names = sorted(list(cells.keys())) for cell_name in sorted_cell_names: cell = cells[cell_name] @@ -1029,15 +1029,18 @@ def run(self): # check for cdr3nt attribute (to make backwards compatible) if hasattr(r, 'cdr3nt'): cdr3nt = r.cdr3nt + cdr3 = r.cdr3 else: cdr3nt = 'N/A' + cdr3 = 'N/A' f.write( - "{name}\t{locus}\t{ident}\t{productive}\t{length}\t{cdr3nt}\n".format( + "{name}\t{locus}\t{ident}\t{productive}\t{length}\t{cdr3}\t{cdr3nt}\n".format( name=cell_name, locus=locus, ident=r.identifier, productive=r.productive, length=len(r.trinity_seq), + cdr3=cdr3, cdr3nt=cdr3nt)) if r.productive: cell_data[locus + "_productive"] = r.identifier diff --git a/tracerlib/tracer_func.py b/tracerlib/tracer_func.py index 37d2b71..15e41ac 100644 --- a/tracerlib/tracer_func.py +++ b/tracerlib/tracer_func.py @@ -84,8 +84,8 @@ def process_chunk(chunk): elif store_CDR3: # single tab-separated line, example: # CDR3 GCGTGGAAAGTG AWKV 51 59 - _, cdr3nt, _cdr3aa, _start, _end = line_x.split('\t') - return_dict['cdr3'].append(cdr3nt) + _, cdr3nt, cdr3, _start, _end = line_x.split('\t') + return_dict['cdr3'] += [cdr3nt, cdr3] store_CDR3 = False elif line_x.startswith('# Query'): @@ -165,9 +165,10 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs, # CDR3 nucleotide sequences if 'cdr3' in query_data.keys(): - cdr3nt = query_data['cdr3'][0] + cdr3nt, cdr3 = query_data['cdr3'] else: cdr3nt = 'N/A' + cdr3 = 'N/A' # line attempting to add alignment summary to data for use # with PCR comparisons @@ -245,7 +246,8 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs, trinity_seq=trinity_seq, imgt_reconstructed_seq=imgt_reconstructed_seq, has_D=has_D, - cdr3nt=cdr3nt) + cdr3nt=cdr3nt, + cdr3=cdr3) recombinants[locus].append(rec) if recombinants: From 40a82492c2076569a3212a63bc32cc62ecbef506 Mon Sep 17 00:00:00 2001 From: Derek Croote Date: Tue, 7 Aug 2018 18:24:54 -0700 Subject: [PATCH 5/9] Update expected recombinants.txt --- test_data/expected_summary/recombinants.txt | 26 ++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test_data/expected_summary/recombinants.txt b/test_data/expected_summary/recombinants.txt index 75113c6..3da80f2 100644 --- a/test_data/expected_summary/recombinants.txt +++ b/test_data/expected_summary/recombinants.txt @@ -1,18 +1,18 @@ -cell_name locus recombinant_id productive reconstructed_length -cell1 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325 -cell1 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 -cell1 B TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2 False 100 -cell1 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334 +cell_name locus recombinant_id productive reconstructed_length CDR3aa CDR3nt +cell1 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325 AVENNNNAPR GCTGTTGAGAATAACAACAATGCCCCACGA +cell1 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 AVRGKERQYRKTH GCTGTGAGGGGGAAGGAGAGGCAATACCGGAAAACTCATC +cell1 B TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2 False 100 ASSTTGGAPGSST GCCAGCTCTACAACAGGGGGGGCACCGGGCAGCTCTAC +cell1 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334 ASSYNSYEQY GCCAGCAGCTACAACTCCTATGAACAGTAC -cell2 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325 -cell2 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 -cell2 B TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2 False 100 -cell2 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334 +cell2 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 N/A N/A +cell2 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325 N/A N/A +cell2 B TRBV12-1_GCTCTACAACAGGGGGGG(C)ACCGG_TRBJ2-2 False 100 N/A N/A +cell2 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334 N/A N/A -cell3 A TRAV3-3_CAGTGGGGGAACTA_TRAJ26 False 339 -cell3 A TRAV7-5_TGAGCGACACC_TRAJ27 True 334 -cell3 B TRBV31_AGTCTTGACACAAGA_TRBJ2-5 False 335 -cell3 B TRBV31_TGGAGCCCCGGGACAGGGCTCAACC_TRBJ1-5 True 343 +cell3 A TRAV7-5_TGAGCGACACC_TRAJ27 True 334 N/A N/A +cell3 A TRAV3-3_CAGTGGGGGAACTA_TRAJ26 False 339 N/A N/A +cell3 B TRBV31_AGTCTTGACACAAGA_TRBJ2-5 False 335 N/A N/A +cell3 B TRBV31_TGGAGCCCCGGGACAGGGCTCAACC_TRBJ1-5 True 343 N/A N/A From 361b16d2ba7ad4604078a7ad443fdfb07785d72f Mon Sep 17 00:00:00 2001 From: Derek Croote Date: Tue, 21 Aug 2018 21:57:15 -0700 Subject: [PATCH 6/9] Change CDR3 seqs to lowercase if non-productive --- tracerlib/core.py | 11 +++++++++-- tracerlib/tasks.py | 6 ++++++ tracerlib/tracer_func.py | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tracerlib/core.py b/tracerlib/core.py index 4b83e19..24d948d 100644 --- a/tracerlib/core.py +++ b/tracerlib/core.py @@ -351,8 +351,15 @@ def get_summary(self): TPM=self.TPM, productive=self.productive, stop_codon=self.stop_codon, in_frame=self.in_frame) - summary_string += "CDR3aa:\t{}\nCDR3nt:\t{}\n\n".format(self.cdr3, - self.cdr3nt) + # lowercase CDR3 sequences if non-productive + cdr3 = self.cdr3 + cdr3nt = self.cdr3nt + if not self.productive: + cdr3 = cdr3.lower() + cdr3nt = cdr3nt.lower() + + summary_string += "CDR3aa:\t{}\nCDR3nt:\t{}\n\n".format(cdr3, + cdr3nt) summary_string += 'Segment\tquery_id\tsubject_id\t% identity\t' \ 'alignment length\tmismatches\tgap opens\tgaps' \ diff --git a/tracerlib/tasks.py b/tracerlib/tasks.py index 5489a45..0522c24 100644 --- a/tracerlib/tasks.py +++ b/tracerlib/tasks.py @@ -1030,6 +1030,12 @@ def run(self): if hasattr(r, 'cdr3nt'): cdr3nt = r.cdr3nt cdr3 = r.cdr3 + + # lowercase CDR3 sequences if non-productive + if not r.productive: + cdr3 = cdr3.lower() + cdr3nt = cdr3nt.lower() + else: cdr3nt = 'N/A' cdr3 = 'N/A' diff --git a/tracerlib/tracer_func.py b/tracerlib/tracer_func.py index 15e41ac..a587700 100644 --- a/tracerlib/tracer_func.py +++ b/tracerlib/tracer_func.py @@ -163,7 +163,7 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs, identifier = best_V + "_" + junc_string + "_" + best_J - # CDR3 nucleotide sequences + # CDR3 sequences if 'cdr3' in query_data.keys(): cdr3nt, cdr3 = query_data['cdr3'] else: From 510b5f3052f828c46a6577eb70ecdb7b503d8f55 Mon Sep 17 00:00:00 2001 From: Derek Croote Date: Fri, 31 Aug 2018 10:07:23 -0700 Subject: [PATCH 7/9] Include CDR3 seqs in README summarise output --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9831c17..f8d5e6d 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,7 @@ The following output files are generated: 1. `TCR_summary.txt` Summary statistics describing successful TCR reconstruction rates and the numbers of cells with 0, 1, 2 or more recombinants for each locus. 2. `recombinants.txt` - List of TCR identifiers, lengths and productivities for each cell. + List of TCR identifiers, lengths, productivities, and CDR3 sequences (nucleotide and amino acid) for each cell. 3. `reconstructed_lengths_TCR[A|B].pdf` and `reconstructed_lengths_TCR[A|B].txt` Distribution plots (and text files with underlying data) showing the lengths of the VDJ regions from assembled TCR contigs. Longer contigs give higher-confidence segment assignments. Text files are only generated if at least one TCR is found for a locus. Plots are only generated if at least two TCRs are found for a locus. 4. `clonotype_sizes.pdf` and `clonotype_sizes.txt` From ccaf9242c61d23b6eacb114618baa4e9dfe2b1b2 Mon Sep 17 00:00:00 2001 From: Derek Croote Date: Fri, 31 Aug 2018 10:10:13 -0700 Subject: [PATCH 8/9] Update expected recombinants with lowercase CDR3s --- test_data/expected_summary/recombinants.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_data/expected_summary/recombinants.txt b/test_data/expected_summary/recombinants.txt index 3da80f2..7832cc7 100644 --- a/test_data/expected_summary/recombinants.txt +++ b/test_data/expected_summary/recombinants.txt @@ -1,7 +1,7 @@ cell_name locus recombinant_id productive reconstructed_length CDR3aa CDR3nt cell1 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325 AVENNNNAPR GCTGTTGAGAATAACAACAATGCCCCACGA -cell1 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 AVRGKERQYRKTH GCTGTGAGGGGGAAGGAGAGGCAATACCGGAAAACTCATC -cell1 B TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2 False 100 ASSTTGGAPGSST GCCAGCTCTACAACAGGGGGGGCACCGGGCAGCTCTAC +cell1 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 avrgkerqyrkth gctgtgagggggaaggagaggcaataccggaaaactcatc +cell1 B TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2 False 100 assttggapgsst gccagctctacaacagggggggcaccgggcagctctac cell1 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334 ASSYNSYEQY GCCAGCAGCTACAACTCCTATGAACAGTAC cell2 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 N/A N/A From e63f01054d518f00f83e68983dc1cc8c7090a248 Mon Sep 17 00:00:00 2001 From: Mike Stubbington Date: Fri, 31 Aug 2018 20:57:33 +0100 Subject: [PATCH 9/9] added extra info about CDR3 reporting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f8d5e6d..2f4fc21 100644 --- a/README.md +++ b/README.md @@ -286,7 +286,7 @@ The following output files are generated: 1. `TCR_summary.txt` Summary statistics describing successful TCR reconstruction rates and the numbers of cells with 0, 1, 2 or more recombinants for each locus. 2. `recombinants.txt` - List of TCR identifiers, lengths, productivities, and CDR3 sequences (nucleotide and amino acid) for each cell. + List of TCR identifiers, lengths, productivities, and CDR3 sequences (nucleotide and amino acid) for each cell. **Note:** It's possible for non-productive rearrangements to still have a detectable CDR3 if a frameshift introduces a stop-codon after this region. In these cases, the CDR3 nucleotides and amino acids are still reported but are shown in lower-case. 3. `reconstructed_lengths_TCR[A|B].pdf` and `reconstructed_lengths_TCR[A|B].txt` Distribution plots (and text files with underlying data) showing the lengths of the VDJ regions from assembled TCR contigs. Longer contigs give higher-confidence segment assignments. Text files are only generated if at least one TCR is found for a locus. Plots are only generated if at least two TCRs are found for a locus. 4. `clonotype_sizes.pdf` and `clonotype_sizes.txt`