Merge pull request #73 from dcroote/add-cdr3

Add cdr3 nucleotide sequences
Teichlab · Aug 31, 2018 · 6bc0182 · 6bc0182
2 parents c90ab4f + e63f010
commit 6bc0182
Show file tree

Hide file tree

Showing 6 changed files with 77 additions and 45 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -18,7 +18,10 @@ RUN cd /trinityrnaseq-Trinity-v2.4.0 && make
 RUN wget ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/1.7.0/ncbi-igblast-1.7.0-x64-linux.tar.gz
 RUN tar -xzvf ncbi-igblast-1.7.0-x64-linux.tar.gz && rm ncbi-igblast-1.7.0-x64-linux.tar.gz
 RUN cd /ncbi-igblast-1.7.0/bin/ && wget -r ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data && \
-	mv ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data . && rm -r ftp.ncbi.nih.gov
+	wget -r ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file && \
+	mv ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data . && \
+	mv ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file . && \
+	rm -r ftp.ncbi.nih.gov
 
 #aligners - kallisto and salmon
 RUN wget https://github.com/pachterlab/kallisto/releases/download/v0.43.1/kallisto_linux-v0.43.1.tar.gz

diff --git a/README.md b/README.md
@@ -42,7 +42,7 @@ Note that TraCeR is compatible with both Python 2 and 3.
 6. [Graphviz](http://www.graphviz.org) - Dot and Neato drawing programs required for visualisation of clonotype graphs. This is optional - see the [`--no_networks` option](#options-1) to [`summarise`](#summarise-summary-and-clonotype-networks).
 
 ##### Installing IgBlast 
-Downloading the executable files from `ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/<version_number>` is not sufficient for a working IgBlast installation. You must also download the `internal_data` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data) and put it into the same directory as the igblast executable. This is also described in the igblast README file.
+Downloading the executable files from `ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/<version_number>` is not sufficient for a working IgBlast installation. You must also download the `internal_data` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data) and `optional_file` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file/) and put them into the same directory as the igblast executable. This is also described in the igblast README file.
 
 You should also ensure to set the `$IGDATA` environment variable to point to the location of the IgBlast executable. For example run `export IGDATA=/<path_to_igblast>/igblast/1.4.0/bin`.
 
@@ -286,7 +286,7 @@ The following output files are generated:
 1. `TCR_summary.txt`
     Summary statistics describing successful TCR reconstruction rates and the numbers of cells with 0, 1, 2 or more recombinants for each locus.
 2. `recombinants.txt`
-    List of TCR identifiers, lengths and productivities for each cell. 
+    List of TCR identifiers, lengths, productivities, and CDR3 sequences (nucleotide and amino acid) for each cell. **Note:** It's possible for non-productive rearrangements to still have a detectable CDR3 if a frameshift introduces a stop-codon after this region. In these cases, the CDR3 nucleotides and amino acids are still reported but are shown in lower-case.
 3. `reconstructed_lengths_TCR[A|B].pdf` and  `reconstructed_lengths_TCR[A|B].txt`
     Distribution plots (and text files with underlying data) showing the lengths of the VDJ regions from assembled TCR contigs. Longer contigs give higher-confidence segment assignments. Text files are only generated if at least one TCR is found for a locus. Plots are only generated if at least two TCRs are found for a locus. 
 4. `clonotype_sizes.pdf` and `clonotype_sizes.txt`

diff --git a/test_data/expected_summary/recombinants.txt b/test_data/expected_summary/recombinants.txt
@@ -1,18 +1,18 @@
-cell_name	locus	recombinant_id	productive	reconstructed_length
-cell1	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325
-cell1	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347
-cell1	B	TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2	False	100
-cell1	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334
+cell_name	locus	recombinant_id	productive	reconstructed_length	CDR3aa	CDR3nt
+cell1	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325	AVENNNNAPR	GCTGTTGAGAATAACAACAATGCCCCACGA
+cell1	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347	avrgkerqyrkth	gctgtgagggggaaggagaggcaataccggaaaactcatc
+cell1	B	TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2	False	100	assttggapgsst	gccagctctacaacagggggggcaccgggcagctctac
+cell1	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334	ASSYNSYEQY	GCCAGCAGCTACAACTCCTATGAACAGTAC
 
-cell2	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325
-cell2	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347
-cell2	B	TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2	False	100
-cell2	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334
+cell2	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347	N/A	N/A
+cell2	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325	N/A	N/A
+cell2	B	TRBV12-1_GCTCTACAACAGGGGGGG(C)ACCGG_TRBJ2-2	False	100	N/A	N/A
+cell2	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334	N/A	N/A
 
-cell3	A	TRAV3-3_CAGTGGGGGAACTA_TRAJ26	False	339
-cell3	A	TRAV7-5_TGAGCGACACC_TRAJ27	True	334
-cell3	B	TRBV31_AGTCTTGACACAAGA_TRBJ2-5	False	335
-cell3	B	TRBV31_TGGAGCCCCGGGACAGGGCTCAACC_TRBJ1-5	True	343
+cell3	A	TRAV7-5_TGAGCGACACC_TRAJ27	True	334	N/A	N/A
+cell3	A	TRAV3-3_CAGTGGGGGAACTA_TRAJ26	False	339	N/A	N/A
+cell3	B	TRBV31_AGTCTTGACACAAGA_TRBJ2-5	False	335	N/A	N/A
+cell3	B	TRBV31_TGGAGCCCCGGGACAGGGCTCAACC_TRBJ1-5	True	343	N/A	N/A
 
 
 
diff --git a/tracerlib/core.py b/tracerlib/core.py
@@ -302,15 +302,14 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
                  productive, stop_codon, in_frame, TPM,
                  dna_seq, hit_table, summary, junction_details, best_VJ_names,
                  alignment_summary, trinity_seq,
-                 imgt_reconstructed_seq, has_D):
+                 imgt_reconstructed_seq, has_D, cdr3nt, cdr3):
         self.contig_name = contig_name
         self.locus = locus
         self.identifier = identifier
         self.all_poss_identifiers = all_poss_identifiers
         self.productive = productive
         self.TPM = TPM
         self.dna_seq = dna_seq
-        self.cdr3 = self._get_cdr3(dna_seq)
         self.hit_table = hit_table
         self.summary = summary
         self.junction_details = junction_details
@@ -321,32 +320,13 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
         self.trinity_seq = trinity_seq
         self.imgt_reconstructed_seq = imgt_reconstructed_seq
         self.has_D_segment = has_D
+        self.cdr3nt = cdr3nt
+        self.cdr3 = cdr3
 
     def __str__(self):
         return (
         "{} {} {} {}".format(self.identifier, self.productive, self.TPM))
 
-    def _get_cdr3(self, dna_seq):
-        aaseq = Seq(str(dna_seq), generic_dna).translate()
-        if re.findall('FG.G', str(aaseq)) and re.findall('C', str(aaseq)):
-            indices = [i for i, x in enumerate(aaseq) if x == 'C']
-            upper = str(aaseq).find(re.findall('FG.G', str(aaseq))[0])
-            lower = False
-            for i in indices:
-                if i < upper:
-                    lower = i
-            if lower:
-                cdr3 = aaseq[lower:upper + 4]
-            else:
-                cdr3 = "Couldn't find conserved cysteine"
-        elif re.findall('FG.G', str(aaseq)):
-            cdr3 = "Couldn't find conserved cysteine"
-        elif re.findall('C', str(aaseq)):
-            cdr3 = "Couldn't find FGXG"
-        else:
-            cdr3 = "Couldn't find either conserved boundary"
-        return (cdr3)
-
     def get_summary(self):
         summary_string = "##{contig_name}##\n".format(
             contig_name=self.contig_name)
@@ -367,10 +347,20 @@ def get_summary(self):
         summary_string += segments_string
         summary_string += "ID:\t{}\n".format(self.identifier)
         summary_string += "TPM:\t{TPM}\nProductive:\t{productive}\nStop codon:" \
-                          "\t{stop_codon}\nIn frame:\t{in_frame}\n\n".format(
+                          "\t{stop_codon}\nIn frame:\t{in_frame}\n".format(
             TPM=self.TPM, productive=self.productive,
             stop_codon=self.stop_codon, in_frame=self.in_frame)
 
+        # lowercase CDR3 sequences if non-productive
+        cdr3 = self.cdr3
+        cdr3nt = self.cdr3nt
+        if not self.productive:
+            cdr3 = cdr3.lower()
+            cdr3nt = cdr3nt.lower()
+
+        summary_string += "CDR3aa:\t{}\nCDR3nt:\t{}\n\n".format(cdr3,
+                                                                cdr3nt)
+
         summary_string += 'Segment\tquery_id\tsubject_id\t% identity\t' \
                           'alignment length\tmismatches\tgap opens\tgaps' \
                           '\tq start\tq end\ts start\ts end\te value\tbit score\n'

diff --git a/tracerlib/tasks.py b/tracerlib/tasks.py
@@ -1014,7 +1014,7 @@ def run(self):
         # Write out recombinant details for each cell
         with open("{}/recombinants.txt".format(outdir), 'w') as f:
             f.write(
-                "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\n")
+                "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\tCDR3aa\tCDR3nt\n")
             sorted_cell_names = sorted(list(cells.keys()))
             for cell_name in sorted_cell_names:
                 cell = cells[cell_name]
@@ -1025,12 +1025,29 @@ def run(self):
                     recombinants = cell.recombinants[self.receptor_name][locus]
                     if recombinants is not None:
                         for r in recombinants:
+
+                            # check for cdr3nt attribute (to make backwards compatible)
+                            if hasattr(r, 'cdr3nt'):
+                                cdr3nt = r.cdr3nt
+                                cdr3 = r.cdr3
+
+                                # lowercase CDR3 sequences if non-productive
+                                if not r.productive:
+                                    cdr3 = cdr3.lower()
+                                    cdr3nt = cdr3nt.lower()
+
+                            else:
+                                cdr3nt = 'N/A'
+                                cdr3 = 'N/A'
+
                             f.write(
-                                "{name}\t{locus}\t{ident}\t{productive}\t{length}\n".format(
+                                "{name}\t{locus}\t{ident}\t{productive}\t{length}\t{cdr3}\t{cdr3nt}\n".format(
                                     name=cell_name, locus=locus,
                                     ident=r.identifier,
                                     productive=r.productive,
-                                    length=len(r.trinity_seq)))
+                                    length=len(r.trinity_seq),
+                                    cdr3=cdr3,
+                                    cdr3nt=cdr3nt))
                             if r.productive:
                                 cell_data[locus + "_productive"] = r.identifier
                             else:

diff --git a/tracerlib/tracer_func.py b/tracerlib/tracer_func.py
@@ -42,6 +42,7 @@ def process_chunk(chunk):
     store_junction_details = False
     store_alignment_summary = False
     store_hit_table = False
+    store_CDR3 = False
     alignment_summary = []
     hit_table = []
     looking_for_end = False
@@ -80,6 +81,13 @@ def process_chunk(chunk):
                 else:
                     return_dict['hit_table'].append(line_x)
 
+        elif store_CDR3:
+            # single tab-separated line, example:
+            # CDR3 GCGTGGAAAGTG AWKV 51 59
+            _, cdr3nt, cdr3, _start, _end = line_x.split('\t')
+            return_dict['cdr3'] += [cdr3nt, cdr3]
+            store_CDR3 = False
+
         elif line_x.startswith('# Query'):
             query_name = line_x.split(" ")[2]
             query_length = None
@@ -100,6 +108,9 @@ def process_chunk(chunk):
         elif line_x.startswith('# V-(D)-J junction details'):
             store_junction_details = True
 
+        elif line_x.startswith('# Sub-region sequence'):
+            store_CDR3 = True
+
         elif line_x.startswith('# Alignment summary'):
             store_alignment_summary = True
 
@@ -152,6 +163,13 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,
 
                     identifier = best_V + "_" + junc_string + "_" + best_J
 
+                    # CDR3 sequences
+                    if 'cdr3' in query_data.keys():
+                        cdr3nt, cdr3 = query_data['cdr3']
+                    else:
+                        cdr3nt = 'N/A'
+                        cdr3 = 'N/A'
+
                     # line attempting to add alignment summary to data for use
                     # with PCR comparisons
                     alignment_summary = query_data['alignment_summary']
@@ -227,7 +245,9 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,
                                           alignment_summary=alignment_summary,
                                           trinity_seq=trinity_seq,
                                           imgt_reconstructed_seq=imgt_reconstructed_seq,
-                                          has_D=has_D)
+                                          has_D=has_D,
+                                          cdr3nt=cdr3nt,
+                                          cdr3=cdr3)
                         recombinants[locus].append(rec)
 
     if recombinants:
@@ -1242,7 +1262,9 @@ def run_IgBlast(igblast, receptor, loci, output_dir, cell_name, index_location,
                        '-ig_seqtype', ig_seqtype, '-show_translation',
                        '-num_alignments_V', '5',
                        '-num_alignments_D', '5', '-num_alignments_J', '5',
-                       '-outfmt', fmt, '-query', trinity_fasta]
+                       '-outfmt', fmt,
+                       '-auxiliary_data', 'optional_file/{}_gl.aux'.format(igblast_species),
+                       '-query', trinity_fasta]
             if fmt == '7':
                 igblast_out = "{output_dir}/IgBLAST_output/{cell_name}_{locus}.IgBLASTOut".format(
                     output_dir=output_dir,