diff --git a/src/hostile/aligner.py b/src/hostile/aligner.py index 3b6723c..28ae4ee 100644 --- a/src/hostile/aligner.py +++ b/src/hostile/aligner.py @@ -31,7 +31,7 @@ def check_index(self, index: str, offline: bool = False) -> Path: if self.name == "Bowtie2": if Path(f"{index}.1.bt2").is_file(): index_path = Path(index) - logging.info(f"Using custom index {index_path}") + logging.info(f"Found custom index {index_path}") elif (self.data_dir / f"{index}.1.bt2").is_file(): index_path = self.data_dir / index logging.info(f"Found cached standard index {index} (Bowtie2)") @@ -63,14 +63,14 @@ def check_index(self, index: str, offline: bool = False) -> Path: elif self.name == "Minimap2": if Path(f"{index}").is_file(): index_path = Path(index) - logging.info(f"Using custom index {index}") + logging.info(f"Found custom index {index}") elif (self.data_dir / f"{index}.fa.gz").is_file(): index_path = self.data_dir / f"{index}.fa.gz" logging.info(f"Found cached standard index {index} (Minimap2)") elif not offline and util.fetch_manifest(util.BUCKET_URL).get(index): file_name = f"{index}.fa.gz" file_url = f"{util.BUCKET_URL}/{file_name}" - logging.info(f"Fetching standard reference {index} ({file_url})") + logging.info(f"Fetching standard index {index} ({file_url})") manifest = util.fetch_manifest(util.BUCKET_URL) with tempfile.NamedTemporaryFile() as temporary_file: tmp_path = Path(temporary_file.name) diff --git a/src/hostile/cli.py b/src/hostile/cli.py index a208c84..15dcc17 100644 --- a/src/hostile/cli.py +++ b/src/hostile/cli.py @@ -98,8 +98,6 @@ def clean( def mask( reference: Path, target: Path, - kmer_length: int = 150, - kmer_step: int = 10, out_dir: Path = Path("masked"), threads: int = util.CPU_COUNT, ) -> None: @@ -116,8 +114,6 @@ def mask( lib.mask( reference=reference, target=target, - k=kmer_length, - i=kmer_step, out_dir=out_dir, threads=threads, ) diff --git a/src/hostile/lib.py b/src/hostile/lib.py index 074a9b3..2b8c753 100644 --- a/src/hostile/lib.py +++ b/src/hostile/lib.py @@ -264,147 +264,172 @@ def clean_paired_fastqs( return stats +# def mask( +# reference: Path, +# target: Path, +# out_dir=Path("masked"), +# k: int = 150, +# i: int = 50, +# threads: int = util.CPU_COUNT, +# ) -> tuple[Path, int, int]: +# """Mask a fasta[.gz] reference genome against fasta.[gz] target genomes""" +# ref_path, target_path = Path(reference), Path(target) +# out_dir.mkdir(exist_ok=True, parents=True) +# ref_index_path = out_dir / "existing" +# kmers_path = out_dir / "kmers.fasta.gz" +# alignments_path = out_dir / "alignments.sam" +# mask_path = out_dir / "mask.bed" +# masked_ref_path = out_dir / "masked.fa" +# masked_ref_index_path = out_dir / "masked" +# masked_alignments_path = out_dir / "masked-alignments.sam" + +# if ref_path.suffix == ".gz": # Decompress reference if necessary +# new_ref_path = out_dir / ref_path.stem +# logging.info(f"Decompressing reference into {new_ref_path}") +# with gzip.open(ref_path, "rb") as in_fh: +# with open(new_ref_path, "wb") as out_fh: +# shutil.copyfileobj(in_fh, out_fh) +# ref_path = new_ref_path + +# build_existing_cmd = ( +# f"bowtie2-build --threads '{threads}' '{ref_path}' '{out_dir}/existing'" +# ) +# logging.info(f"Indexing existing reference ({build_existing_cmd})") +# build_existing_cmd_run = util.run(build_existing_cmd) +# if build_existing_cmd_run.stderr.strip(): +# logging.info(build_existing_cmd_run.stderr.strip()) + +# logging.info(f"k-merising target genome(s) {target_path} ({k=}, {i=})") +# kmerise(path=target_path, out_dir=out_dir, k=k, i=i) + +# align_cmd = ( +# f"bowtie2 -a -p '{threads}'" +# f" -x '{ref_index_path}'" +# f" -f '{kmers_path}'" +# f" > '{alignments_path}'" +# ) +# logging.info(f"Aligning target k-mers to existing reference ({align_cmd})") +# align_cmd_run = util.run(align_cmd) +# if align_cmd_run.stderr: +# logging.info(align_cmd_run.stderr.strip()) + +# count_alignments_cmd = ( # Exclude unmapped reads and secondary alignments +# f"samtools view -c -F 0x904 {alignments_path}" +# ) +# logging.info( +# f"Counting target k-mers aligned to existing reference ({count_alignments_cmd})" +# ) +# count_alignments_cmd_run = util.run(count_alignments_cmd) +# if count_alignments_cmd_run.stderr: +# logging.info(count_alignments_cmd_run.stderr) +# n_alignments = int(count_alignments_cmd_run.stdout.strip()) +# logging.info(f"{n_alignments} k-mers aligned before masking") + +# make_cmd = ( +# f"samtools view -F 4 -bS '{alignments_path}'" +# f" | samtools sort -" +# f" | bedtools bamtobed -i stdin" +# f" | bedtools merge -i stdin" +# f" > '{mask_path}'" +# ) +# logging.info(f"Making mask ({make_cmd=})") +# make_cmd_run = util.run(make_cmd) +# if make_cmd_run.stderr: +# logging.info(make_cmd_run.stderr) + +# apply_cmd = ( +# f"bedtools maskfasta" +# f" -fi '{ref_path}' -bed '{mask_path}' -fo '{masked_ref_path}'" +# ) +# logging.info(f"Applying mask ({apply_cmd=})") +# apply_cmd_run = util.run(apply_cmd) +# if apply_cmd_run.stderr: +# logging.info(apply_cmd_run.stderr) + +# build_masked_index_cmd = f"bowtie2-build --threads '{threads}' '{masked_ref_path}' '{masked_ref_index_path}'" +# logging.info(f"Indexing masked reference ({build_masked_index_cmd})") +# build_masked_index_cmd_run = util.run(build_masked_index_cmd) +# if build_masked_index_cmd_run.stderr: +# logging.info(build_masked_index_cmd_run.stderr.strip()) + +# align_masked_cmd = ( +# f"bowtie2 -a -p '{threads}'" +# f" -x '{masked_ref_index_path}'" +# f" -f '{kmers_path}'" +# f" > '{masked_alignments_path}'" +# ) +# logging.info(f"Aligning target k-mers to masked reference ({align_masked_cmd})") +# align_masked_cmd_run = util.run(align_masked_cmd) +# if align_masked_cmd_run.stderr: +# logging.info(align_masked_cmd_run.stderr.strip()) + +# count_masked_alignments_cmd = f"samtools view -c -F 0x904 {masked_alignments_path}" # Exclude secondaries (0x100), supplementaries (0x800), and unmapped (0x4) +# logging.info( +# f"Counting target k-mers aligned to masked reference ({count_masked_alignments_cmd})" +# ) +# count_masked_alignments_cmd_run = util.run(count_masked_alignments_cmd) +# if count_masked_alignments_cmd_run.stderr: +# logging.info(count_masked_alignments_cmd_run.stderr) +# n_masked_alignments = int(count_masked_alignments_cmd_run.stdout.strip()) +# logging.info( +# f"{n_masked_alignments} k-mers aligned after masking ({n_alignments} aligned before masking)" +# ) +# logging.info( +# f"Masked genome path (for use with long reads / Minimap2): {masked_ref_path.absolute()}" +# ) +# logging.info( +# f"Masked Bowtie2 index path (for use with short reads): {masked_ref_index_path.absolute()} (multiple files)" +# ) + +# return masked_ref_path, n_alignments, n_masked_alignments + + +# def kmerise(path: Path, out_dir: Path, k: int, i: int) -> Path: +# out_path = out_dir / "kmers.fasta.gz" +# with dnaio.open(path) as reader, dnaio.open(out_path, mode="w") as writer: +# for r in reader: +# for offset in range(0, len(r.sequence) - k + 1, i): +# kmer = r.sequence[offset : offset + k] +# name = r.name.partition(" ")[0] +# kmer_id = f"{name}_{offset}" +# writer.write(dnaio.SequenceRecord(kmer_id, kmer)) +# return out_path.absolute() + + def mask( - reference: Path, - target: Path, - out_dir=Path("masked"), - k: int = 150, - i: int = 50, - threads: int = util.CPU_COUNT, -) -> tuple[Path, int, int]: + reference: Path, target: Path, out_dir=Path("masked"), threads: int = 1 +) -> Path: """Mask a fasta[.gz] reference genome against fasta.[gz] target genomes""" - ref_path, target_path = Path(reference), Path(target) + reference_path, target_path = Path(reference), Path(target) out_dir.mkdir(exist_ok=True, parents=True) - ref_index_path = out_dir / "existing" - kmers_path = out_dir / "kmers.fasta.gz" - alignments_path = out_dir / "alignments.sam" - mask_path = out_dir / "mask.bed" - masked_ref_path = out_dir / "masked.fa" - masked_ref_index_path = out_dir / "masked" - masked_alignments_path = out_dir / "masked-alignments.sam" - - if ref_path.suffix == ".gz": # Decompress reference if necessary - new_ref_path = out_dir / ref_path.stem - logging.info(f"Decompressing reference into {new_ref_path}") - with gzip.open(ref_path, "rb") as in_fh: - with open(new_ref_path, "wb") as out_fh: + bed_path = out_dir / "mask.bed" + masked_reference_path = out_dir / "masked.fa" + + if reference_path.suffix == ".gz": # Decompress reference if necessary + new_reference_path = out_dir / reference_path.stem + logging.info(f"Decompressing reference into {new_reference_path}") + with gzip.open(reference_path, "rb") as in_fh: + with open(new_reference_path, "wb") as out_fh: shutil.copyfileobj(in_fh, out_fh) - ref_path = new_ref_path - - build_existing_cmd = ( - f"bowtie2-build --threads '{threads}' '{ref_path}' '{out_dir}/existing'" - ) - logging.info(f"Indexing existing reference ({build_existing_cmd})") - build_existing_cmd_run = util.run(build_existing_cmd) - if build_existing_cmd_run.stderr.strip(): - logging.info(build_existing_cmd_run.stderr.strip()) - - logging.info(f"k-merising target genome(s) {target_path} ({k=}, {i=})") - kmerise(path=target_path, out_dir=out_dir, k=k, i=i) - - align_cmd = ( - f"bowtie2 -a -p '{threads}'" - f" -x '{ref_index_path}'" - f" -f '{kmers_path}'" - f" > '{alignments_path}'" - ) - logging.info(f"Aligning target k-mers to existing reference ({align_cmd})") - align_cmd_run = util.run(align_cmd) - if align_cmd_run.stderr: - logging.info(align_cmd_run.stderr.strip()) - - count_alignments_cmd = ( # Exclude unmapped reads and secondary alignments - f"samtools view -c -F 0x904 {alignments_path}" - ) - logging.info( - f"Counting target k-mers aligned to existing reference ({count_alignments_cmd})" - ) - count_alignments_cmd_run = util.run(count_alignments_cmd) - if count_alignments_cmd_run.stderr: - logging.info(count_alignments_cmd_run.stderr) - n_alignments = int(count_alignments_cmd_run.stdout.strip()) - logging.info(f"{n_alignments} k-mers aligned before masking") + reference_path = new_reference_path make_cmd = ( - f"samtools view -F 4 -bS '{alignments_path}'" - f" | samtools sort -" - f" | bedtools bamtobed -i stdin" - f" | bedtools merge -i stdin" - f" > '{mask_path}'" + f"minimap2 -x asm10 -N 10000 -p 0.0001 -t {threads} '{reference_path}' '{target}'" + f" | awk -v OFS='\t' '{{print $6, $8, $9}}'" + f" | sort -k1,1 -k2,2n" + f" | bedtools merge -i stdin > '{bed_path}'" ) logging.info(f"Making mask ({make_cmd=})") make_cmd_run = util.run(make_cmd) - if make_cmd_run.stderr: - logging.info(make_cmd_run.stderr) + logging.info(make_cmd_run.stderr.strip()) apply_cmd = ( f"bedtools maskfasta" - f" -fi '{ref_path}' -bed '{mask_path}' -fo '{masked_ref_path}'" + f" -fi '{reference_path}' -bed '{bed_path}' -fo '{masked_reference_path}'" ) logging.info(f"Applying mask ({apply_cmd=})") apply_cmd_run = util.run(apply_cmd) if apply_cmd_run.stderr: logging.info(apply_cmd_run.stderr) - build_masked_index_cmd = f"bowtie2-build --threads '{threads}' '{masked_ref_path}' '{masked_ref_index_path}'" - logging.info(f"Indexing masked reference ({build_masked_index_cmd})") - build_masked_index_cmd_run = util.run(build_masked_index_cmd) - if build_masked_index_cmd_run.stderr: - logging.info(build_masked_index_cmd_run.stderr.strip()) - - align_masked_cmd = ( - f"bowtie2 -a -p '{threads}'" - f" -x '{masked_ref_index_path}'" - f" -f '{kmers_path}'" - f" > '{masked_alignments_path}'" - ) - logging.info(f"Aligning target k-mers to masked reference ({align_masked_cmd})") - align_masked_cmd_run = util.run(align_masked_cmd) - if align_masked_cmd_run.stderr: - logging.info(align_masked_cmd_run.stderr.strip()) - - count_masked_alignments_cmd = f"samtools view -c -F 0x904 {masked_alignments_path}" # Exclude secondaries (0x100), supplementaries (0x800), and unmapped (0x4) - logging.info( - f"Counting target k-mers aligned to masked reference ({count_masked_alignments_cmd})" - ) - count_masked_alignments_cmd_run = util.run(count_masked_alignments_cmd) - if count_masked_alignments_cmd_run.stderr: - logging.info(count_masked_alignments_cmd_run.stderr) - n_masked_alignments = int(count_masked_alignments_cmd_run.stdout.strip()) - logging.info( - f"{n_masked_alignments} k-mers aligned after masking ({n_alignments} aligned before masking)" - ) - logging.info( - f"Masked genome path (for use with long reads / Minimap2): {masked_ref_path.absolute()}" - ) - logging.info( - f"Masked Bowtie2 index path (for use with short reads): {masked_ref_index_path.absolute()} (multiple files)" - ) - - return masked_ref_path, n_alignments, n_masked_alignments - - -def get_default_reference_filenames() -> list[Path]: - return [ALIGNER.minimap2.value.ref_archive_fn, ALIGNER.bowtie2.value.idx_archive_fn] - - -def fetch_reference(filename: str) -> None: - util.download(url=f"{util.BUCKET_URL}/{filename}", path=Path(filename)) - if filename.endswith(".tar"): - logging.info("Extracting…") - util.untar_file(input_path=Path(filename), output_path=Path(".")) - logging.info(f"Downloaded and extracted {filename}") - else: - logging.info(f"Downloaded {filename}") - - -def kmerise(path: Path, out_dir: Path, k: int, i: int) -> Path: - out_path = out_dir / "kmers.fasta.gz" - with dnaio.open(path) as reader, dnaio.open(out_path, mode="w") as writer: - for r in reader: - for offset in range(0, len(r.sequence) - k + 1, i): - kmer = r.sequence[offset : offset + k] - name = r.name.partition(" ")[0] - kmer_id = f"{name}_{offset}" - writer.write(dnaio.SequenceRecord(kmer_id, kmer)) - return out_path.absolute() + return masked_reference_path diff --git a/tests/test_all.py b/tests/test_all.py index cd2ad54..3b8683e 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -694,3 +694,14 @@ def test_mismatched_number_of_reads_bowtie2(): force=True, ) shutil.rmtree(out_dir, ignore_errors=True) + + +def test_offline_invalid_standard_index_name(): + with pytest.raises(FileNotFoundError): + stats = lib.clean_fastqs( + fastqs=[data_dir / "sars-cov-2_1_1.fastq"], + index="invalid_index_name", + out_dir=out_dir, + offline=True, + ) + shutil.rmtree(out_dir, ignore_errors=True)