Skip to content

Commit

Permalink
Merge pull request #73 from dcroote/add-cdr3
Browse files Browse the repository at this point in the history
Add cdr3 nucleotide sequences
  • Loading branch information
mstubb authored Aug 31, 2018
2 parents c90ab4f + e63f010 commit 6bc0182
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 45 deletions.
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ RUN cd /trinityrnaseq-Trinity-v2.4.0 && make
RUN wget ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/1.7.0/ncbi-igblast-1.7.0-x64-linux.tar.gz
RUN tar -xzvf ncbi-igblast-1.7.0-x64-linux.tar.gz && rm ncbi-igblast-1.7.0-x64-linux.tar.gz
RUN cd /ncbi-igblast-1.7.0/bin/ && wget -r ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data && \
mv ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data . && rm -r ftp.ncbi.nih.gov
wget -r ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file && \
mv ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data . && \
mv ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file . && \
rm -r ftp.ncbi.nih.gov

#aligners - kallisto and salmon
RUN wget https://github.com/pachterlab/kallisto/releases/download/v0.43.1/kallisto_linux-v0.43.1.tar.gz
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Note that TraCeR is compatible with both Python 2 and 3.
6. [Graphviz](http://www.graphviz.org) - Dot and Neato drawing programs required for visualisation of clonotype graphs. This is optional - see the [`--no_networks` option](#options-1) to [`summarise`](#summarise-summary-and-clonotype-networks).

##### Installing IgBlast
Downloading the executable files from `ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/<version_number>` is not sufficient for a working IgBlast installation. You must also download the `internal_data` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data) and put it into the same directory as the igblast executable. This is also described in the igblast README file.
Downloading the executable files from `ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/<version_number>` is not sufficient for a working IgBlast installation. You must also download the `internal_data` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/internal_data) and `optional_file` directory (ftp://ftp.ncbi.nih.gov/blast/executables/igblast/release/optional_file/) and put them into the same directory as the igblast executable. This is also described in the igblast README file.

You should also ensure to set the `$IGDATA` environment variable to point to the location of the IgBlast executable. For example run `export IGDATA=/<path_to_igblast>/igblast/1.4.0/bin`.

Expand Down Expand Up @@ -286,7 +286,7 @@ The following output files are generated:
1. `TCR_summary.txt`
Summary statistics describing successful TCR reconstruction rates and the numbers of cells with 0, 1, 2 or more recombinants for each locus.
2. `recombinants.txt`
List of TCR identifiers, lengths and productivities for each cell.
List of TCR identifiers, lengths, productivities, and CDR3 sequences (nucleotide and amino acid) for each cell. **Note:** It's possible for non-productive rearrangements to still have a detectable CDR3 if a frameshift introduces a stop-codon after this region. In these cases, the CDR3 nucleotides and amino acids are still reported but are shown in lower-case.
3. `reconstructed_lengths_TCR[A|B].pdf` and `reconstructed_lengths_TCR[A|B].txt`
Distribution plots (and text files with underlying data) showing the lengths of the VDJ regions from assembled TCR contigs. Longer contigs give higher-confidence segment assignments. Text files are only generated if at least one TCR is found for a locus. Plots are only generated if at least two TCRs are found for a locus.
4. `clonotype_sizes.pdf` and `clonotype_sizes.txt`
Expand Down
26 changes: 13 additions & 13 deletions test_data/expected_summary/recombinants.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
cell_name locus recombinant_id productive reconstructed_length
cell1 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325
cell1 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347
cell1 B TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2 False 100
cell1 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334
cell_name locus recombinant_id productive reconstructed_length CDR3aa CDR3nt
cell1 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325 AVENNNNAPR GCTGTTGAGAATAACAACAATGCCCCACGA
cell1 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 avrgkerqyrkth gctgtgagggggaaggagaggcaataccggaaaactcatc
cell1 B TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2 False 100 assttggapgsst gccagctctacaacagggggggcaccgggcagctctac
cell1 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334 ASSYNSYEQY GCCAGCAGCTACAACTCCTATGAACAGTAC

cell2 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325
cell2 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347
cell2 B TRBV12-1_GCTCTACAACAGGGGGGGCACCG_TRBJ2-2 False 100
cell2 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334
cell2 A TRAV9D-4_GTGAGGGGGAAGGAGAGGCA_TRAJ37 False 347 N/A N/A
cell2 A TRAV4-2_TTGAGAATAA_TRAJ43 True 325 N/A N/A
cell2 B TRBV12-1_GCTCTACAACAGGGGGGG(C)ACCGG_TRBJ2-2 False 100 N/A N/A
cell2 B TRBV4_AGCTACAACTCCT_TRBJ2-7 True 334 N/A N/A

cell3 A TRAV3-3_CAGTGGGGGAACTA_TRAJ26 False 339
cell3 A TRAV7-5_TGAGCGACACC_TRAJ27 True 334
cell3 B TRBV31_AGTCTTGACACAAGA_TRBJ2-5 False 335
cell3 B TRBV31_TGGAGCCCCGGGACAGGGCTCAACC_TRBJ1-5 True 343
cell3 A TRAV7-5_TGAGCGACACC_TRAJ27 True 334 N/A N/A
cell3 A TRAV3-3_CAGTGGGGGAACTA_TRAJ26 False 339 N/A N/A
cell3 B TRBV31_AGTCTTGACACAAGA_TRBJ2-5 False 335 N/A N/A
cell3 B TRBV31_TGGAGCCCCGGGACAGGGCTCAACC_TRBJ1-5 True 343 N/A N/A



38 changes: 14 additions & 24 deletions tracerlib/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,15 +302,14 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
productive, stop_codon, in_frame, TPM,
dna_seq, hit_table, summary, junction_details, best_VJ_names,
alignment_summary, trinity_seq,
imgt_reconstructed_seq, has_D):
imgt_reconstructed_seq, has_D, cdr3nt, cdr3):
self.contig_name = contig_name
self.locus = locus
self.identifier = identifier
self.all_poss_identifiers = all_poss_identifiers
self.productive = productive
self.TPM = TPM
self.dna_seq = dna_seq
self.cdr3 = self._get_cdr3(dna_seq)
self.hit_table = hit_table
self.summary = summary
self.junction_details = junction_details
Expand All @@ -321,32 +320,13 @@ def __init__(self, contig_name, locus, identifier, all_poss_identifiers,
self.trinity_seq = trinity_seq
self.imgt_reconstructed_seq = imgt_reconstructed_seq
self.has_D_segment = has_D
self.cdr3nt = cdr3nt
self.cdr3 = cdr3

def __str__(self):
return (
"{} {} {} {}".format(self.identifier, self.productive, self.TPM))

def _get_cdr3(self, dna_seq):
aaseq = Seq(str(dna_seq), generic_dna).translate()
if re.findall('FG.G', str(aaseq)) and re.findall('C', str(aaseq)):
indices = [i for i, x in enumerate(aaseq) if x == 'C']
upper = str(aaseq).find(re.findall('FG.G', str(aaseq))[0])
lower = False
for i in indices:
if i < upper:
lower = i
if lower:
cdr3 = aaseq[lower:upper + 4]
else:
cdr3 = "Couldn't find conserved cysteine"
elif re.findall('FG.G', str(aaseq)):
cdr3 = "Couldn't find conserved cysteine"
elif re.findall('C', str(aaseq)):
cdr3 = "Couldn't find FGXG"
else:
cdr3 = "Couldn't find either conserved boundary"
return (cdr3)

def get_summary(self):
summary_string = "##{contig_name}##\n".format(
contig_name=self.contig_name)
Expand All @@ -367,10 +347,20 @@ def get_summary(self):
summary_string += segments_string
summary_string += "ID:\t{}\n".format(self.identifier)
summary_string += "TPM:\t{TPM}\nProductive:\t{productive}\nStop codon:" \
"\t{stop_codon}\nIn frame:\t{in_frame}\n\n".format(
"\t{stop_codon}\nIn frame:\t{in_frame}\n".format(
TPM=self.TPM, productive=self.productive,
stop_codon=self.stop_codon, in_frame=self.in_frame)

# lowercase CDR3 sequences if non-productive
cdr3 = self.cdr3
cdr3nt = self.cdr3nt
if not self.productive:
cdr3 = cdr3.lower()
cdr3nt = cdr3nt.lower()

summary_string += "CDR3aa:\t{}\nCDR3nt:\t{}\n\n".format(cdr3,
cdr3nt)

summary_string += 'Segment\tquery_id\tsubject_id\t% identity\t' \
'alignment length\tmismatches\tgap opens\tgaps' \
'\tq start\tq end\ts start\ts end\te value\tbit score\n'
Expand Down
23 changes: 20 additions & 3 deletions tracerlib/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,7 +1014,7 @@ def run(self):
# Write out recombinant details for each cell
with open("{}/recombinants.txt".format(outdir), 'w') as f:
f.write(
"cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\n")
"cell_name\tlocus\trecombinant_id\tproductive\treconstructed_length\tCDR3aa\tCDR3nt\n")
sorted_cell_names = sorted(list(cells.keys()))
for cell_name in sorted_cell_names:
cell = cells[cell_name]
Expand All @@ -1025,12 +1025,29 @@ def run(self):
recombinants = cell.recombinants[self.receptor_name][locus]
if recombinants is not None:
for r in recombinants:

# check for cdr3nt attribute (to make backwards compatible)
if hasattr(r, 'cdr3nt'):
cdr3nt = r.cdr3nt
cdr3 = r.cdr3

# lowercase CDR3 sequences if non-productive
if not r.productive:
cdr3 = cdr3.lower()
cdr3nt = cdr3nt.lower()

else:
cdr3nt = 'N/A'
cdr3 = 'N/A'

f.write(
"{name}\t{locus}\t{ident}\t{productive}\t{length}\n".format(
"{name}\t{locus}\t{ident}\t{productive}\t{length}\t{cdr3}\t{cdr3nt}\n".format(
name=cell_name, locus=locus,
ident=r.identifier,
productive=r.productive,
length=len(r.trinity_seq)))
length=len(r.trinity_seq),
cdr3=cdr3,
cdr3nt=cdr3nt))
if r.productive:
cell_data[locus + "_productive"] = r.identifier
else:
Expand Down
26 changes: 24 additions & 2 deletions tracerlib/tracer_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def process_chunk(chunk):
store_junction_details = False
store_alignment_summary = False
store_hit_table = False
store_CDR3 = False
alignment_summary = []
hit_table = []
looking_for_end = False
Expand Down Expand Up @@ -80,6 +81,13 @@ def process_chunk(chunk):
else:
return_dict['hit_table'].append(line_x)

elif store_CDR3:
# single tab-separated line, example:
# CDR3 GCGTGGAAAGTG AWKV 51 59
_, cdr3nt, cdr3, _start, _end = line_x.split('\t')
return_dict['cdr3'] += [cdr3nt, cdr3]
store_CDR3 = False

elif line_x.startswith('# Query'):
query_name = line_x.split(" ")[2]
query_length = None
Expand All @@ -100,6 +108,9 @@ def process_chunk(chunk):
elif line_x.startswith('# V-(D)-J junction details'):
store_junction_details = True

elif line_x.startswith('# Sub-region sequence'):
store_CDR3 = True

elif line_x.startswith('# Alignment summary'):
store_alignment_summary = True

Expand Down Expand Up @@ -152,6 +163,13 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,

identifier = best_V + "_" + junc_string + "_" + best_J

# CDR3 sequences
if 'cdr3' in query_data.keys():
cdr3nt, cdr3 = query_data['cdr3']
else:
cdr3nt = 'N/A'
cdr3 = 'N/A'

# line attempting to add alignment summary to data for use
# with PCR comparisons
alignment_summary = query_data['alignment_summary']
Expand Down Expand Up @@ -227,7 +245,9 @@ def find_possible_alignments(sample_dict, locus_names, cell_name, IMGT_seqs,
alignment_summary=alignment_summary,
trinity_seq=trinity_seq,
imgt_reconstructed_seq=imgt_reconstructed_seq,
has_D=has_D)
has_D=has_D,
cdr3nt=cdr3nt,
cdr3=cdr3)
recombinants[locus].append(rec)

if recombinants:
Expand Down Expand Up @@ -1242,7 +1262,9 @@ def run_IgBlast(igblast, receptor, loci, output_dir, cell_name, index_location,
'-ig_seqtype', ig_seqtype, '-show_translation',
'-num_alignments_V', '5',
'-num_alignments_D', '5', '-num_alignments_J', '5',
'-outfmt', fmt, '-query', trinity_fasta]
'-outfmt', fmt,
'-auxiliary_data', 'optional_file/{}_gl.aux'.format(igblast_species),
'-query', trinity_fasta]
if fmt == '7':
igblast_out = "{output_dir}/IgBLAST_output/{cell_name}_{locus}.IgBLASTOut".format(
output_dir=output_dir,
Expand Down

0 comments on commit 6bc0182

Please sign in to comment.