Skip to content

Commit

Permalink
Merge pull request #168 from FrickTobias/ema
Browse files Browse the repository at this point in the history
Ema (Emerald) mapper added.
  • Loading branch information
FrickTobias authored Dec 4, 2019
2 parents 338bbd6 + 616434d commit c69f36d
Show file tree
Hide file tree
Showing 10 changed files with 75 additions and 38 deletions.
25 changes: 13 additions & 12 deletions environment.linux.lock.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ dependencies:
- bowtie2=2.3.5=py36he860b03_0
- bwa=0.7.17=hed695b0_6
- bzip2=1.0.8=h516909a_1
- ca-certificates=2019.9.11=hecc5488_0
- certifi=2019.9.11=py36_0
- ca-certificates=2019.11.28=hecc5488_0
- certifi=2019.11.28=py36_0
- cffi=1.13.2=py36h8022711_0
- chardet=3.0.4=py36_1003
- configargparse=0.13.0=py_1
Expand All @@ -21,13 +21,14 @@ dependencies:
- datrie=0.8=py36h516909a_0
- dnaio=0.4.1=py36h516909a_0
- docutils=0.15.2=py36_0
- ema=0.6.2=h8b12597_1
- freebayes=1.3.1=py36h56106d0_0
- gitdb2=2.0.6=py_0
- gitpython=3.0.5=py_0
- hapcut2=1.2=hed50d52_0
- htslib=1.9=h244ad75_9
- idna=2.8=py36_1000
- importlib_metadata=0.23=py36_0
- importlib_metadata=1.1.0=py36_0
- importlib_resources=1.0.2=py36_1000
- jsonschema=3.2.0=py36_0
- krb5=1.16.3=h05b26f9_1001
Expand All @@ -37,13 +38,13 @@ dependencies:
- libdeflate=1.3=h516909a_0
- libedit=3.1.20170329=hf8c457e_1001
- libffi=3.2.1=he1b5a44_1006
- libgcc-ng=9.1.0=hdf63c60_0
- libgcc-ng=9.2.0=hdf63c60_0
- libgfortran-ng=7.3.0=hdf63c60_2
- liblapack=3.8.0=14_openblas
- liblapacke=3.8.0=14_openblas
- libopenblas=0.3.7=h6e990d7_3
- libopenblas=0.3.7=h5ec1e0e_4
- libssh2=1.8.2=h22169c7_2
- libstdcxx-ng=9.1.0=hdf63c60_0
- libstdcxx-ng=9.2.0=hdf63c60_0
- minimap2=2.17=h8b12597_1
- more-itertools=7.2.0=py_0
- ncurses=6.1=hf484d3e_1002
Expand All @@ -55,12 +56,12 @@ dependencies:
- pip=19.3.1=py36_0
- psutil=5.6.7=py36h516909a_0
- pycparser=2.19=py36_1
- pyopenssl=19.0.0=py36_0
- pyopenssl=19.1.0=py36_0
- pyrsistent=0.15.6=py36h516909a_0
- pysam=0.15.3=py36hbcae180_3
- pysocks=1.7.1=py36_0
- python=3.6.7=h357f687_1006
- pyyaml=5.1.2=py36h516909a_0
- pyyaml=5.1.2=py36h516909a_1
- ratelimiter=1.2.0=py36_1000
- readline=8.0=hf8c457e_0
- requests=2.22.0=py36_1
Expand All @@ -69,21 +70,21 @@ dependencies:
- sambamba=0.6.6=2
- samblaster=0.1.24=hc9558a2_3
- samtools=1.9=h10a08f8_12
- setuptools=42.0.1=py36_0
- setuptools=42.0.2=py36_0
- six=1.13.0=py36_0
- smmap2=2.0.5=py_0
- snakemake-minimal=5.8.1=py_0
- sqlite=3.30.1=hcee41ef_0
- starcode=1.3=h14c3975_1
- tbb=2019.9=hc9558a2_0
- tbb=2019.9=hc9558a2_1
- tk=8.6.10=hed695b0_0
- tqdm=4.39.0=py_0
- tqdm=4.40.0=py_0
- urllib3=1.25.7=py36_0
- wheel=0.33.6=py36_0
- wrapt=1.11.2=py36h516909a_0
- xopen=0.8.4=py36_0
- xz=5.2.4=h14c3975_1001
- yaml=0.1.7=h14c3975_1001
- yaml=0.2.2=he1b5a44_0
- zipp=0.6.0=py_0
- zlib=1.2.11=h516909a_1006

21 changes: 11 additions & 10 deletions environment.osx.lock.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ dependencies:
- bowtie2=2.3.5=py36h5c9b4e4_0
- bwa=0.7.17=h2573ce8_6
- bzip2=1.0.8=h01d97ff_1
- ca-certificates=2019.9.11=hecc5488_0
- certifi=2019.9.11=py36_0
- ca-certificates=2019.11.28=hecc5488_0
- certifi=2019.11.28=py36_0
- cffi=1.13.2=py36h33e799b_0
- chardet=3.0.4=py36_1003
- configargparse=0.13.0=py_1
Expand All @@ -20,13 +20,14 @@ dependencies:
- datrie=0.8=py36h01d97ff_0
- dnaio=0.4.1=py36h01d97ff_0
- docutils=0.15.2=py36_0
- ema=0.6.2=hfbae3c0_1
- freebayes=1.3.1=py36he0122c3_0
- gitdb2=2.0.6=py_0
- gitpython=3.0.5=py_0
- hapcut2=1.2=hd50730b_0
- htslib=1.9=h356306b_9
- idna=2.8=py36_1000
- importlib_metadata=0.23=py36_0
- importlib_metadata=1.1.0=py36_0
- importlib_resources=1.0.2=py36_1000
- jsonschema=3.2.0=py36_0
- krb5=1.16.3=hcfa6398_1001
Expand All @@ -40,7 +41,7 @@ dependencies:
- libgfortran=4.0.0=2
- liblapack=3.8.0=14_openblas
- liblapacke=3.8.0=14_openblas
- libopenblas=0.3.7=h4bb4525_3
- libopenblas=0.3.7=h3d69b6c_4
- libssh2=1.8.2=hcdc9a53_2
- llvm-openmp=9.0.0=h40edb58_0
- minimap2=2.17=hfbae3c0_1
Expand All @@ -54,12 +55,12 @@ dependencies:
- pip=19.3.1=py36_0
- psutil=5.6.7=py36h0b31af3_0
- pycparser=2.19=py36_1
- pyopenssl=19.0.0=py36_0
- pyopenssl=19.1.0=py36_0
- pyrsistent=0.15.6=py36h0b31af3_0
- pysam=0.15.3=py36h4ace0ce_3
- pysocks=1.7.1=py36_0
- python=3.6.7=h4285619_1006
- pyyaml=5.1.2=py36h0b31af3_0
- pyyaml=5.1.2=py36h0b31af3_1
- ratelimiter=1.2.0=py36_1000
- readline=8.0=hcfe32e1_0
- requests=2.22.0=py36_1
Expand All @@ -68,21 +69,21 @@ dependencies:
- sambamba=0.6.6=2
- samblaster=0.1.24=h770b8ee_3
- samtools=1.9=h8aa4d43_12
- setuptools=42.0.1=py36_0
- setuptools=42.0.2=py36_0
- six=1.13.0=py36_0
- smmap2=2.0.5=py_0
- snakemake-minimal=5.8.1=py_0
- sqlite=3.30.1=h93121df_0
- starcode=1.3=h1de35cc_1
- tbb=2019.9=ha1b3eb9_0
- tbb=2019.9=ha1b3eb9_1
- tk=8.6.10=hbbe82c9_0
- tqdm=4.39.0=py_0
- tqdm=4.40.0=py_0
- urllib3=1.25.7=py36_0
- wheel=0.33.6=py36_0
- wrapt=1.11.2=py36h0b31af3_0
- xopen=0.8.4=py36_0
- xz=5.2.4=h1de35cc_1001
- yaml=0.1.7=h1de35cc_1001
- yaml=0.2.2=h4a8c4bd_0
- zipp=0.6.0=py_0
- zlib=1.2.11=h0b31af3_1006

1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ dependencies:
- importlib_resources
- freebayes
- ruamel.yaml
- ema=0.6.2
26 changes: 24 additions & 2 deletions src/blr/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,27 @@ rule starcode_clustering:
" 2> {log}"


rule barcode_sort_fastq:
# Assumes barcode read name is followed by barcode sequence.
# Exmaple: @ST-E00269:339:H27G2CCX2:7:1102:21186:8060:AAAAAAAATATCTACGCTCA BX:Z:AAAAAAAATATCTACGCTCA
output:
fastq = "sorted.{nr}.fastq.gz"
input:
fastq = "trimmed.barcoded.{nr}.fastq.gz"
shell:
"pigz -cd -p 1 {input.fastq} |"
" paste - - - - |"
" sort -t ' ' -k2 |"
" tr '\t' '\n' |"
" pigz > {output.fastq}"


rule map_reads:
output:
bam = "mapped.sorted.bam"
input:
r1_fastq = "trimmed.barcoded.1.fastq.gz",
r2_fastq = "trimmed.barcoded.2.fastq.gz"
r1_fastq = "trimmed.barcoded.1.fastq.gz" if config["read_mapper"] != "ema" else "sorted.1.fastq.gz",
r2_fastq = "trimmed.barcoded.2.fastq.gz" if config["read_mapper"] != "ema" else "sorted.2.fastq.gz"
threads: 20
log: "read_mapping.log"
run:
Expand All @@ -63,6 +78,13 @@ rule map_reads:
" {config[genome_reference]}"
" {input.r1_fastq}"
" {input.r2_fastq}",
"ema":
"ema align"
" -1 <(pigz -cd {input.r1_fastq})"
" -2 <(pigz -cd {input.r2_fastq})"
" -r {config[genome_reference]}"
" -t {threads}"
" -p 10x"
}
command = commands[config["read_mapper"]].format(**locals(), **globals())

Expand Down
2 changes: 1 addition & 1 deletion src/blr/blr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ molecule_tag: MI # Used to store molecule ID, same as 10x default.
num_mol_tag: MN # Used to store number of molecules per barcode
sequence_tag: RX # Used to store original barcode sequence in bam file. 'RX' is 10x genomic default
genome_reference: # Path to indexed reference
read_mapper: bowtie2 # Choose bwa, bowtie2 or minimap2
read_mapper: bowtie2 # Choose bwa, bowtie2, minimap2 or ema
duplicate_marker: sambamba # Choose sambamba or samblaster
barcode_max_dist: 2 # Max edit distance (Leveshtein distance) allowed to cluster two barcode sequences together
max_molecules_per_bc: 260 # Max number of molecules allowed for a single barcode (removes if bc has > 260 molecules).
Expand Down
30 changes: 19 additions & 11 deletions src/blr/cli/tagfastq.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def main(args):
logger.info(f"Output detected as {'interleaved' if out_interleaved else 'paired'} FASTQ.")

reads_missing_barcode = 0
separator = args.sep
# Parse input FASTA/FASTQ for read1 and read2, uncorrected barcodes and write output
with dnaio.open(args.input1, file2=args.input2, interleaved=in_interleaved, mode="r",
fileformat="fastq") as reader, \
Expand All @@ -58,10 +57,10 @@ def main(args):

for read1, read2 in tqdm(reader, desc="Read pairs processed"):
# Header parsing
name_and_pos_r1, read_and_index_r1 = read1.name.split(maxsplit=1)
name_and_pos_r2, read_and_index_r2 = read2.name.split(maxsplit=1)
name_and_pos, nr_and_index1 = read1.name.split(maxsplit=1)
_, nr_and_index2 = read2.name.split(maxsplit=1)

uncorrected_barcode_seq = uncorrected_barcode_reader.get_barcode(name_and_pos_r1)
uncorrected_barcode_seq = uncorrected_barcode_reader.get_barcode(name_and_pos)

# Check if barcode was found and update header with barcode info.
if uncorrected_barcode_seq:
Expand All @@ -71,14 +70,23 @@ def main(args):
corr_barcode_id = f"{args.barcode_tag}:Z:{corrected_barcode_seq}"

# Create new name with barcode information.
new_name = separator.join([name_and_pos_r1, raw_barcode_id, corr_barcode_id])

# Save header to read instances
read1.name = " ".join([new_name, read_and_index_r1])
read2.name = " ".join([new_name, read_and_index_r2])
if args.mapper == "ema":
# The EMA aligner requires reads in 10x format e.g.
# @READNAME:AAAAAAAATATCTACGCTCA BX:Z:AAAAAAAATATCTACGCTCA
new_name = ":".join([name_and_pos, corrected_barcode_seq])
new_name = " ".join((new_name, corr_barcode_id))
read1.name, read2.name = new_name, new_name
else:
new_name = "_".join([name_and_pos, raw_barcode_id, corr_barcode_id])
read1.name = " ".join([new_name, nr_and_index1])
read2.name = " ".join([new_name, nr_and_index2])
else:
reads_missing_barcode += 1

# EMA aligner cannot handle reads without barcodes so these are skipped.
if args.mapper == "ema":
continue

# Write to out
writer.write(read1, read2)

Expand Down Expand Up @@ -167,6 +175,6 @@ def add_arguments(parser):
"-s", "--sequence-tag", default="RX",
help="SAM tag for storing the uncorrected barcode sequence. Default: %(default)s")
parser.add_argument(
"--sep", default="_",
help="Character used as separator for storing SAM tags in the FASTQ/FASTA header. Default: %(default)s"
"-m", "--mapper",
help="Specify read mapper for labeling reads with barcodes. "
)
2 changes: 1 addition & 1 deletion src/blr/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ properties:
default: RX
read_mapper:
type: string
pattern: "bwa|bowtie2|minimap2"
pattern: "bwa|bowtie2|minimap2|ema"
description: Which read mapper to use
duplicate_marker:
type: string
Expand Down
2 changes: 2 additions & 0 deletions src/blr/rules/trim.smk
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ rule trim_and_tag:
" --o2 {output.r2_fastq}"
" -b {config[cluster_tag]}"
" -s {config[sequence_tag]}"
" --mapper {config[read_mapper]}"
" {input.uncorrected_barcodes}"
" {input.corrected_barcodes}"
" -"
Expand All @@ -75,6 +76,7 @@ rule extract_DBS:
" -j {threads}"
" -m 19"
" -M 21"
" --max-n 0"
" -o {output.fastq}"
" {input.fastq}"
" > {log}"
2 changes: 2 additions & 0 deletions tests/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ snakemake --version
blr --version
samblaster --version
sambamba --version
ema

( cd testdata && bwa index chr1mini.fasta )
( cd testdata && bowtie2-build chr1mini.fasta chr1mini.fasta > /dev/null )
( cd testdata && samtools faidx chr1mini.fasta )

pytest -v tests/

Expand Down
2 changes: 1 addition & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_config(tmpdir):
change_config(workdir / "blr.yaml", [("read_mapper", "bwa")])


@pytest.mark.parametrize("read_mapper", ["bwa", "bowtie2", "minimap2"])
@pytest.mark.parametrize("read_mapper", ["bwa", "bowtie2", "minimap2", "ema"])
def test_mappers(tmpdir, read_mapper):
workdir = tmpdir / "analysis"
init(workdir, TESTDATA_READS)
Expand Down

0 comments on commit c69f36d

Please sign in to comment.