From 946b1d18bfbfb3de1a6dc4b50cb83196fb62400c Mon Sep 17 00:00:00 2001
From: Derek Croote <derek.croote@gmail.com>
Date: Tue, 31 Jul 2018 18:07:02 -0700
Subject: [PATCH 1/9] Download IgBLAST optional_file directory

This directory is necessary for IgBLAST CDR3 calling
---
 Dockerfile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 3e1d87d..0817286 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,7 +18,10 @@ RUN cd /trinityrnaseq-Trinity-v2.4.0 && make
 RUN wget ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/1.7.0/ncbi-igblast-1.7.0-x64-linux.tar.gz
 RUN tar -xzvf ncbi-igblast-1.7.0-x64-linux.tar.gz && rm ncbi-igblast-1.7.0-x64-linux.tar.gz
 RUN cd /ncbi-igblast-1.7.0/bin/ && wget -r ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data && \
-	mv ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data . && rm -r ftp.ncbi.nih.gov
+	wget -r ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file && \
+	mv ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data . && \
+	mv ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file . && \
+	rm -r ftp.ncbi.nih.gov
 
 #aligners - kallisto and salmon
 RUN wget https://github.com/pachterlab/kallisto/releases/download/v0.43.1/kallisto_linux-v0.43.1.tar.gz

From 19185714530590cc78565c5271bfd56f26275236 Mon Sep 17 00:00:00 2001
From: Derek Croote <derek.croote@gmail.com>
Date: Tue, 31 Jul 2018 18:10:18 -0700
Subject: [PATCH 2/9] Report CDR3 nucleotide seqs parsed from IgBLAST

---
 tracerlib/core.py        |  7 +++++--
 tracerlib/tasks.py       | 14 +++++++++++---
 tracerlib/tracer_func.py | 24 ++++++++++++++++++++++--
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/tracerlib/core.py b/tracerlib/core.py
index 54c20d8..72732c4 100644
--- a/tracerlib/core.py
+++ b/tracerlib/core.py
@@ -302,7 +302,7 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
                  productive, stop_codon, in_frame, TPM,
                  dna_seq, hit_table, summary, junction_details, best_VJ_names,
                  alignment_summary, trinity_seq,
-                 imgt_reconstructed_seq, has_D):
+                 imgt_reconstructed_seq, has_D, cdr3nt):
         self.contig_name = contig_name
         self.locus = locus
         self.identifier = identifier
@@ -321,6 +321,7 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
         self.trinity_seq = trinity_seq
         self.imgt_reconstructed_seq = imgt_reconstructed_seq
         self.has_D_segment = has_D
+        self.cdr3nt = cdr3nt
 
     def __str__(self):
         return (
@@ -367,10 +368,12 @@ def get_summary(self):
         summary_string += segments_string
         summary_string += "ID:\t{}\n".format(self.identifier)
         summary_string += "TPM:\t{TPM}\nProductive:\t{productive}\nStop codon:" \
-                          "\t{stop_codon}\nIn frame:\t{in_frame}\n\n".format(
+                          "\t{stop_codon}\nIn frame:\t{in_frame}\n".format(
             TPM=self.TPM, productive=self.productive,
             stop_codon=self.stop_codon, in_frame=self.in_frame)
 
+        summary_string += "CDR3nt:\t{}\n\n".format(self.cdr3nt)
+
         summary_string += 'Segment\tquery_id\tsubject_id\t% identity\t' \
                           'alignment length\tmismatches\tgap opens\tgaps' \
                           '\tq start\tq end\ts start\ts end\te value\tbit score\n'
diff --git a/tracerlib/tasks.py b/tracerlib/tasks.py
index c07cfcc..332e11d 100644
--- a/tracerlib/tasks.py
+++ b/tracerlib/tasks.py
@@ -1014,7 +1014,7 @@ def run(self):
         # Write out recombinant details for each cell
         with open("{}/recombinants.txt".format(outdir), 'w') as f:
             f.write(
-                "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\n")
+                "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\tCDR3nt\n")
             sorted_cell_names = sorted(list(cells.keys()))
             for cell_name in sorted_cell_names:
                 cell = cells[cell_name]
@@ -1025,12 +1025,20 @@ def run(self):
                     recombinants = cell.recombinants[self.receptor_name][locus]
                     if recombinants is not None:
                         for r in recombinants:
+
+                            # check for cdr3nt attribute (to make backwards compatible)
+                            if hasattr(r, 'cdr3nt'):
+                                cdr3nt = r.cdr3nt
+                            else:
+                                cdr3nt = 'N/A'
+
                             f.write(
-                                "{name}\t{locus}\t{ident}\t{productive}\t{length}\n".format(
+                                "{name}\t{locus}\t{ident}\t{productive}\t{length}\t{cdr3nt}\n".format(
                                     name=cell_name, locus=locus,
                                     ident=r.identifier,
                                     productive=r.productive,
-                                    length=len(r.trinity_seq)))
+                                    length=len(r.trinity_seq),
+                                    cdr3nt=cdr3nt))
                             if r.productive:
                                 cell_data[locus + "_productive"] = r.identifier
                             else:
diff --git a/tracerlib/tracer_func.py b/tracerlib/tracer_func.py
index 5a04205..37d2b71 100644
--- a/tracerlib/tracer_func.py
+++ b/tracerlib/tracer_func.py
@@ -42,6 +42,7 @@ def process_chunk(chunk):
     store_junction_details = False
     store_alignment_summary = False
     store_hit_table = False
+    store_CDR3 = False
     alignment_summary = []
     hit_table = []
     looking_for_end = False
@@ -80,6 +81,13 @@ def process_chunk(chunk):
                 else:
                     return_dict['hit_table'].append(line_x)
 
+        elif store_CDR3:
+            # single tab-separated line, example:
+            # CDR3 GCGTGGAAAGTG AWKV 51 59
+            _, cdr3nt, _cdr3aa, _start, _end = line_x.split('\t')
+            return_dict['cdr3'].append(cdr3nt)
+            store_CDR3 = False
+
         elif line_x.startswith('# Query'):
             query_name = line_x.split(" ")[2]
             query_length = None
@@ -100,6 +108,9 @@ def process_chunk(chunk):
         elif line_x.startswith('# V-(D)-J junction details'):
             store_junction_details = True
 
+        elif line_x.startswith('# Sub-region sequence'):
+            store_CDR3 = True
+
         elif line_x.startswith('# Alignment summary'):
             store_alignment_summary = True
 
@@ -152,6 +163,12 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,
 
                     identifier = best_V + "_" + junc_string + "_" + best_J
 
+                    # CDR3 nucleotide sequences
+                    if 'cdr3' in query_data.keys():
+                        cdr3nt = query_data['cdr3'][0]
+                    else:
+                        cdr3nt = 'N/A'
+
                     # line attempting to add alignment summary to data for use
                     # with PCR comparisons
                     alignment_summary = query_data['alignment_summary']
@@ -227,7 +244,8 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,
                                           alignment_summary=alignment_summary,
                                           trinity_seq=trinity_seq,
                                           imgt_reconstructed_seq=imgt_reconstructed_seq,
-                                          has_D=has_D)
+                                          has_D=has_D,
+                                          cdr3nt=cdr3nt)
                         recombinants[locus].append(rec)
 
     if recombinants:
@@ -1242,7 +1260,9 @@ def run_IgBlast(igblast, receptor, loci, output_dir, cell_name, index_location,
                        '-ig_seqtype', ig_seqtype, '-show_translation',
                        '-num_alignments_V', '5',
                        '-num_alignments_D', '5', '-num_alignments_J', '5',
-                       '-outfmt', fmt, '-query', trinity_fasta]
+                       '-outfmt', fmt,
+                       '-auxiliary_data', 'optional_file/{}_gl.aux'.format(igblast_species),
+                       '-query', trinity_fasta]
             if fmt == '7':
                 igblast_out = "{output_dir}/IgBLAST_output/{cell_name}_{locus}.IgBLASTOut".format(
                     output_dir=output_dir,

From 70e44c586ee6535bea0cbe1c604363dba9c91e81 Mon Sep 17 00:00:00 2001
From: Derek Croote <derek.croote@gmail.com>
Date: Thu, 2 Aug 2018 10:57:12 -0700
Subject: [PATCH 3/9] Update README to reflect IgBLAST optional_file use

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 953c63d..9831c17 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ Note that TraCeR is compatible with both Python 2 and 3.
 6. [Graphviz](http://www.graphviz.org) - Dot and Neato drawing programs required for visualisation of clonotype graphs. This is optional - see the [`--no_networks` option](#options-1) to [`summarise`](#summarise-summary-and-clonotype-networks).
 
 ##### Installing IgBlast 
-Downloading the executable files from `ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/<version_number>` is not sufficient for a working IgBlast installation. You must also download the `internal_data` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data) and put it into the same directory as the igblast executable. This is also described in the igblast README file.
+Downloading the executable files from `ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/<version_number>` is not sufficient for a working IgBlast installation. You must also download the `internal_data` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data) and `optional_file` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file/) and put them into the same directory as the igblast executable. This is also described in the igblast README file.
 
 You should also ensure to set the `$IGDATA` environment variable to point to the location of the IgBlast executable. For example run `export IGDATA=/<path_to_igblast>/igblast/1.4.0/bin`.
 

From e702ccbf7f9c99079b9b4d32ea4e31373cf6bdd5 Mon Sep 17 00:00:00 2001
From: Derek Croote <derek.croote@gmail.com>
Date: Mon, 6 Aug 2018 21:16:39 -0700
Subject: [PATCH 4/9] Report amino acid CDR3 from IgBLAST

---
 tracerlib/core.py        | 28 ++++------------------------
 tracerlib/tasks.py       |  7 +++++--
 tracerlib/tracer_func.py | 10 ++++++----
 3 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/tracerlib/core.py b/tracerlib/core.py
index 72732c4..4b83e19 100644
--- a/tracerlib/core.py
+++ b/tracerlib/core.py
@@ -302,7 +302,7 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
                  productive, stop_codon, in_frame, TPM,
                  dna_seq, hit_table, summary, junction_details, best_VJ_names,
                  alignment_summary, trinity_seq,
-                 imgt_reconstructed_seq, has_D, cdr3nt):
+                 imgt_reconstructed_seq, has_D, cdr3nt, cdr3):
         self.contig_name = contig_name
         self.locus = locus
         self.identifier = identifier
@@ -310,7 +310,6 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
         self.productive = productive
         self.TPM = TPM
         self.dna_seq = dna_seq
-        self.cdr3 = self._get_cdr3(dna_seq)
         self.hit_table = hit_table
         self.summary = summary
         self.junction_details = junction_details
@@ -322,32 +321,12 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
         self.imgt_reconstructed_seq = imgt_reconstructed_seq
         self.has_D_segment = has_D
         self.cdr3nt = cdr3nt
+        self.cdr3 = cdr3
 
     def __str__(self):
         return (
         "{} {} {} {}".format(self.identifier, self.productive, self.TPM))
 
-    def _get_cdr3(self, dna_seq):
-        aaseq = Seq(str(dna_seq), generic_dna).translate()
-        if re.findall('FG.G', str(aaseq)) and re.findall('C', str(aaseq)):
-            indices = [i for i, x in enumerate(aaseq) if x == 'C']
-            upper = str(aaseq).find(re.findall('FG.G', str(aaseq))[0])
-            lower = False
-            for i in indices:
-                if i < upper:
-                    lower = i
-            if lower:
-                cdr3 = aaseq[lower:upper + 4]
-            else:
-                cdr3 = "Couldn't find conserved cysteine"
-        elif re.findall('FG.G', str(aaseq)):
-            cdr3 = "Couldn't find conserved cysteine"
-        elif re.findall('C', str(aaseq)):
-            cdr3 = "Couldn't find FGXG"
-        else:
-            cdr3 = "Couldn't find either conserved boundary"
-        return (cdr3)
-
     def get_summary(self):
         summary_string = "##{contig_name}##\n".format(
             contig_name=self.contig_name)
@@ -372,7 +351,8 @@ def get_summary(self):
             TPM=self.TPM, productive=self.productive,
             stop_codon=self.stop_codon, in_frame=self.in_frame)
 
-        summary_string += "CDR3nt:\t{}\n\n".format(self.cdr3nt)
+        summary_string += "CDR3aa:\t{}\nCDR3nt:\t{}\n\n".format(self.cdr3,
+                                                                self.cdr3nt)
 
         summary_string += 'Segment\tquery_id\tsubject_id\t% identity\t' \
                           'alignment length\tmismatches\tgap opens\tgaps' \
diff --git a/tracerlib/tasks.py b/tracerlib/tasks.py
index 332e11d..5489a45 100644
--- a/tracerlib/tasks.py
+++ b/tracerlib/tasks.py
@@ -1014,7 +1014,7 @@ def run(self):
         # Write out recombinant details for each cell
         with open("{}/recombinants.txt".format(outdir), 'w') as f:
             f.write(
-                "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\tCDR3nt\n")
+                "cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\tCDR3aa\tCDR3nt\n")
             sorted_cell_names = sorted(list(cells.keys()))
             for cell_name in sorted_cell_names:
                 cell = cells[cell_name]
@@ -1029,15 +1029,18 @@ def run(self):
                             # check for cdr3nt attribute (to make backwards compatible)
                             if hasattr(r, 'cdr3nt'):
                                 cdr3nt = r.cdr3nt
+                                cdr3 = r.cdr3
                             else:
                                 cdr3nt = 'N/A'
+                                cdr3 = 'N/A'
 
                             f.write(
-                                "{name}\t{locus}\t{ident}\t{productive}\t{length}\t{cdr3nt}\n".format(
+                                "{name}\t{locus}\t{ident}\t{productive}\t{length}\t{cdr3}\t{cdr3nt}\n".format(
                                     name=cell_name, locus=locus,
                                     ident=r.identifier,
                                     productive=r.productive,
                                     length=len(r.trinity_seq),
+                                    cdr3=cdr3,
                                     cdr3nt=cdr3nt))
                             if r.productive:
                                 cell_data[locus + "_productive"] = r.identifier
diff --git a/tracerlib/tracer_func.py b/tracerlib/tracer_func.py
index 37d2b71..15e41ac 100644
--- a/tracerlib/tracer_func.py
+++ b/tracerlib/tracer_func.py
@@ -84,8 +84,8 @@ def process_chunk(chunk):
         elif store_CDR3:
             # single tab-separated line, example:
             # CDR3 GCGTGGAAAGTG AWKV 51 59
-            _, cdr3nt, _cdr3aa, _start, _end = line_x.split('\t')
-            return_dict['cdr3'].append(cdr3nt)
+            _, cdr3nt, cdr3, _start, _end = line_x.split('\t')
+            return_dict['cdr3'] += [cdr3nt, cdr3]
             store_CDR3 = False
 
         elif line_x.startswith('# Query'):
@@ -165,9 +165,10 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,
 
                     # CDR3 nucleotide sequences
                     if 'cdr3' in query_data.keys():
-                        cdr3nt = query_data['cdr3'][0]
+                        cdr3nt, cdr3 = query_data['cdr3']
                     else:
                         cdr3nt = 'N/A'
+                        cdr3 = 'N/A'
 
                     # line attempting to add alignment summary to data for use
                     # with PCR comparisons
@@ -245,7 +246,8 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,
                                           trinity_seq=trinity_seq,
                                           imgt_reconstructed_seq=imgt_reconstructed_seq,
                                           has_D=has_D,
-                                          cdr3nt=cdr3nt)
+                                          cdr3nt=cdr3nt,
+                                          cdr3=cdr3)
                         recombinants[locus].append(rec)
 
     if recombinants:

From 40a82492c2076569a3212a63bc32cc62ecbef506 Mon Sep 17 00:00:00 2001
From: Derek Croote <derek.croote@gmail.com>
Date: Tue, 7 Aug 2018 18:24:54 -0700
Subject: [PATCH 5/9] Update expected recombinants.txt

---
 test_data/expected_summary/recombinants.txt | 26 ++++++++++-----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/test_data/expected_summary/recombinants.txt b/test_data/expected_summary/recombinants.txt
index 75113c6..3da80f2 100644
--- a/test_data/expected_summary/recombinants.txt
+++ b/test_data/expected_summary/recombinants.txt
@@ -1,18 +1,18 @@
-cell_name	locus	recombinant_id	productive	reconstructed_length
-cell1	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325
-cell1	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347
-cell1	B	TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2	False	100
-cell1	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334
+cell_name	locus	recombinant_id	productive	reconstructed_length	CDR3aa	CDR3nt
+cell1	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325	AVENNNNAPR	GCTGTTGAGAATAACAACAATGCCCCACGA
+cell1	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347	AVRGKERQYRKTH	GCTGTGAGGGGGAAGGAGAGGCAATACCGGAAAACTCATC
+cell1	B	TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2	False	100	ASSTTGGAPGSST	GCCAGCTCTACAACAGGGGGGGCACCGGGCAGCTCTAC
+cell1	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334	ASSYNSYEQY	GCCAGCAGCTACAACTCCTATGAACAGTAC
 
-cell2	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325
-cell2	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347
-cell2	B	TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2	False	100
-cell2	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334
+cell2	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347	N/A	N/A
+cell2	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325	N/A	N/A
+cell2	B	TRBV12-1_GCTCTACAACAGGGGGGG(C)ACCGG_TRBJ2-2	False	100	N/A	N/A
+cell2	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334	N/A	N/A
 
-cell3	A	TRAV3-3_CAGTGGGGGAACTA_TRAJ26	False	339
-cell3	A	TRAV7-5_TGAGCGACACC_TRAJ27	True	334
-cell3	B	TRBV31_AGTCTTGACACAAGA_TRBJ2-5	False	335
-cell3	B	TRBV31_TGGAGCCCCGGGACAGGGCTCAACC_TRBJ1-5	True	343
+cell3	A	TRAV7-5_TGAGCGACACC_TRAJ27	True	334	N/A	N/A
+cell3	A	TRAV3-3_CAGTGGGGGAACTA_TRAJ26	False	339	N/A	N/A
+cell3	B	TRBV31_AGTCTTGACACAAGA_TRBJ2-5	False	335	N/A	N/A
+cell3	B	TRBV31_TGGAGCCCCGGGACAGGGCTCAACC_TRBJ1-5	True	343	N/A	N/A
 
 
 

From 361b16d2ba7ad4604078a7ad443fdfb07785d72f Mon Sep 17 00:00:00 2001
From: Derek Croote <derek.croote@gmail.com>
Date: Tue, 21 Aug 2018 21:57:15 -0700
Subject: [PATCH 6/9] Change CDR3 seqs to lowercase if non-productive

---
 tracerlib/core.py        | 11 +++++++++--
 tracerlib/tasks.py       |  6 ++++++
 tracerlib/tracer_func.py |  2 +-
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/tracerlib/core.py b/tracerlib/core.py
index 4b83e19..24d948d 100644
--- a/tracerlib/core.py
+++ b/tracerlib/core.py
@@ -351,8 +351,15 @@ def get_summary(self):
             TPM=self.TPM, productive=self.productive,
             stop_codon=self.stop_codon, in_frame=self.in_frame)
 
-        summary_string += "CDR3aa:\t{}\nCDR3nt:\t{}\n\n".format(self.cdr3,
-                                                                self.cdr3nt)
+        # lowercase CDR3 sequences if non-productive
+        cdr3 = self.cdr3
+        cdr3nt = self.cdr3nt
+        if not self.productive:
+            cdr3 = cdr3.lower()
+            cdr3nt = cdr3nt.lower()
+
+        summary_string += "CDR3aa:\t{}\nCDR3nt:\t{}\n\n".format(cdr3,
+                                                                cdr3nt)
 
         summary_string += 'Segment\tquery_id\tsubject_id\t% identity\t' \
                           'alignment length\tmismatches\tgap opens\tgaps' \
diff --git a/tracerlib/tasks.py b/tracerlib/tasks.py
index 5489a45..0522c24 100644
--- a/tracerlib/tasks.py
+++ b/tracerlib/tasks.py
@@ -1030,6 +1030,12 @@ def run(self):
                             if hasattr(r, 'cdr3nt'):
                                 cdr3nt = r.cdr3nt
                                 cdr3 = r.cdr3
+
+                                # lowercase CDR3 sequences if non-productive
+                                if not r.productive:
+                                    cdr3 = cdr3.lower()
+                                    cdr3nt = cdr3nt.lower()
+
                             else:
                                 cdr3nt = 'N/A'
                                 cdr3 = 'N/A'
diff --git a/tracerlib/tracer_func.py b/tracerlib/tracer_func.py
index 15e41ac..a587700 100644
--- a/tracerlib/tracer_func.py
+++ b/tracerlib/tracer_func.py
@@ -163,7 +163,7 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,
 
                     identifier = best_V + "_" + junc_string + "_" + best_J
 
-                    # CDR3 nucleotide sequences
+                    # CDR3 sequences
                     if 'cdr3' in query_data.keys():
                         cdr3nt, cdr3 = query_data['cdr3']
                     else:

From 510b5f3052f828c46a6577eb70ecdb7b503d8f55 Mon Sep 17 00:00:00 2001
From: Derek Croote <derek.croote@gmail.com>
Date: Fri, 31 Aug 2018 10:07:23 -0700
Subject: [PATCH 7/9] Include CDR3 seqs in README summarise output

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 9831c17..f8d5e6d 100644
--- a/README.md
+++ b/README.md
@@ -286,7 +286,7 @@ The following output files are generated:
 1. `TCR_summary.txt`
     Summary statistics describing successful TCR reconstruction rates and the numbers of cells with 0, 1, 2 or more recombinants for each locus.
 2. `recombinants.txt`
-    List of TCR identifiers, lengths and productivities for each cell. 
+    List of TCR identifiers, lengths, productivities, and CDR3 sequences (nucleotide and amino acid) for each cell.
 3. `reconstructed_lengths_TCR[A|B].pdf` and  `reconstructed_lengths_TCR[A|B].txt`
     Distribution plots (and text files with underlying data) showing the lengths of the VDJ regions from assembled TCR contigs. Longer contigs give higher-confidence segment assignments. Text files are only generated if at least one TCR is found for a locus. Plots are only generated if at least two TCRs are found for a locus. 
 4. `clonotype_sizes.pdf` and `clonotype_sizes.txt`

From ccaf9242c61d23b6eacb114618baa4e9dfe2b1b2 Mon Sep 17 00:00:00 2001
From: Derek Croote <derek.croote@gmail.com>
Date: Fri, 31 Aug 2018 10:10:13 -0700
Subject: [PATCH 8/9] Update expected recombinants with lowercase CDR3s

---
 test_data/expected_summary/recombinants.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_data/expected_summary/recombinants.txt b/test_data/expected_summary/recombinants.txt
index 3da80f2..7832cc7 100644
--- a/test_data/expected_summary/recombinants.txt
+++ b/test_data/expected_summary/recombinants.txt
@@ -1,7 +1,7 @@
 cell_name	locus	recombinant_id	productive	reconstructed_length	CDR3aa	CDR3nt
 cell1	A	TRAV4-2_TTGAGAATAA_TRAJ43	True	325	AVENNNNAPR	GCTGTTGAGAATAACAACAATGCCCCACGA
-cell1	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347	AVRGKERQYRKTH	GCTGTGAGGGGGAAGGAGAGGCAATACCGGAAAACTCATC
-cell1	B	TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2	False	100	ASSTTGGAPGSST	GCCAGCTCTACAACAGGGGGGGCACCGGGCAGCTCTAC
+cell1	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347	avrgkerqyrkth	gctgtgagggggaaggagaggcaataccggaaaactcatc
+cell1	B	TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2	False	100	assttggapgsst	gccagctctacaacagggggggcaccgggcagctctac
 cell1	B	TRBV4_AGCTACAACTCCT_TRBJ2-7	True	334	ASSYNSYEQY	GCCAGCAGCTACAACTCCTATGAACAGTAC
 
 cell2	A	TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37	False	347	N/A	N/A

From e63f01054d518f00f83e68983dc1cc8c7090a248 Mon Sep 17 00:00:00 2001
From: Mike Stubbington <mstubb@users.noreply.github.com>
Date: Fri, 31 Aug 2018 20:57:33 +0100
Subject: [PATCH 9/9] added extra info about CDR3 reporting

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f8d5e6d..2f4fc21 100644
--- a/README.md
+++ b/README.md
@@ -286,7 +286,7 @@ The following output files are generated:
 1. `TCR_summary.txt`
     Summary statistics describing successful TCR reconstruction rates and the numbers of cells with 0, 1, 2 or more recombinants for each locus.
 2. `recombinants.txt`
-    List of TCR identifiers, lengths, productivities, and CDR3 sequences (nucleotide and amino acid) for each cell.
+    List of TCR identifiers, lengths, productivities, and CDR3 sequences (nucleotide and amino acid) for each cell. **Note:** It's possible for non-productive rearrangements to still have a detectable CDR3 if a frameshift introduces a stop-codon after this region. In these cases, the CDR3 nucleotides and amino acids are still reported but are shown in lower-case.
 3. `reconstructed_lengths_TCR[A|B].pdf` and  `reconstructed_lengths_TCR[A|B].txt`
     Distribution plots (and text files with underlying data) showing the lengths of the VDJ regions from assembled TCR contigs. Longer contigs give higher-confidence segment assignments. Text files are only generated if at least one TCR is found for a locus. Plots are only generated if at least two TCRs are found for a locus. 
 4. `clonotype_sizes.pdf` and `clonotype_sizes.txt`