Skip to content

Commit

Permalink
Minimap2-based masking with -N and -p to report more alignments
Browse files Browse the repository at this point in the history
  • Loading branch information
bede committed Jan 21, 2024
1 parent 9dae0a3 commit 40316d3
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 133 deletions.
6 changes: 3 additions & 3 deletions src/hostile/aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def check_index(self, index: str, offline: bool = False) -> Path:
if self.name == "Bowtie2":
if Path(f"{index}.1.bt2").is_file():
index_path = Path(index)
logging.info(f"Using custom index {index_path}")
logging.info(f"Found custom index {index_path}")
elif (self.data_dir / f"{index}.1.bt2").is_file():
index_path = self.data_dir / index
logging.info(f"Found cached standard index {index} (Bowtie2)")
Expand Down Expand Up @@ -63,14 +63,14 @@ def check_index(self, index: str, offline: bool = False) -> Path:
elif self.name == "Minimap2":
if Path(f"{index}").is_file():
index_path = Path(index)
logging.info(f"Using custom index {index}")
logging.info(f"Found custom index {index}")
elif (self.data_dir / f"{index}.fa.gz").is_file():
index_path = self.data_dir / f"{index}.fa.gz"
logging.info(f"Found cached standard index {index} (Minimap2)")
elif not offline and util.fetch_manifest(util.BUCKET_URL).get(index):
file_name = f"{index}.fa.gz"
file_url = f"{util.BUCKET_URL}/{file_name}"
logging.info(f"Fetching standard reference {index} ({file_url})")
logging.info(f"Fetching standard index {index} ({file_url})")
manifest = util.fetch_manifest(util.BUCKET_URL)
with tempfile.NamedTemporaryFile() as temporary_file:
tmp_path = Path(temporary_file.name)
Expand Down
4 changes: 0 additions & 4 deletions src/hostile/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,6 @@ def clean(
def mask(
reference: Path,
target: Path,
kmer_length: int = 150,
kmer_step: int = 10,
out_dir: Path = Path("masked"),
threads: int = util.CPU_COUNT,
) -> None:
Expand All @@ -116,8 +114,6 @@ def mask(
lib.mask(
reference=reference,
target=target,
k=kmer_length,
i=kmer_step,
out_dir=out_dir,
threads=threads,
)
Expand Down
277 changes: 151 additions & 126 deletions src/hostile/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,147 +264,172 @@ def clean_paired_fastqs(
return stats


# def mask(
# reference: Path,
# target: Path,
# out_dir=Path("masked"),
# k: int = 150,
# i: int = 50,
# threads: int = util.CPU_COUNT,
# ) -> tuple[Path, int, int]:
# """Mask a fasta[.gz] reference genome against fasta.[gz] target genomes"""
# ref_path, target_path = Path(reference), Path(target)
# out_dir.mkdir(exist_ok=True, parents=True)
# ref_index_path = out_dir / "existing"
# kmers_path = out_dir / "kmers.fasta.gz"
# alignments_path = out_dir / "alignments.sam"
# mask_path = out_dir / "mask.bed"
# masked_ref_path = out_dir / "masked.fa"
# masked_ref_index_path = out_dir / "masked"
# masked_alignments_path = out_dir / "masked-alignments.sam"

# if ref_path.suffix == ".gz": # Decompress reference if necessary
# new_ref_path = out_dir / ref_path.stem
# logging.info(f"Decompressing reference into {new_ref_path}")
# with gzip.open(ref_path, "rb") as in_fh:
# with open(new_ref_path, "wb") as out_fh:
# shutil.copyfileobj(in_fh, out_fh)
# ref_path = new_ref_path

# build_existing_cmd = (
# f"bowtie2-build --threads '{threads}' '{ref_path}' '{out_dir}/existing'"
# )
# logging.info(f"Indexing existing reference ({build_existing_cmd})")
# build_existing_cmd_run = util.run(build_existing_cmd)
# if build_existing_cmd_run.stderr.strip():
# logging.info(build_existing_cmd_run.stderr.strip())

# logging.info(f"k-merising target genome(s) {target_path} ({k=}, {i=})")
# kmerise(path=target_path, out_dir=out_dir, k=k, i=i)

# align_cmd = (
# f"bowtie2 -a -p '{threads}'"
# f" -x '{ref_index_path}'"
# f" -f '{kmers_path}'"
# f" > '{alignments_path}'"
# )
# logging.info(f"Aligning target k-mers to existing reference ({align_cmd})")
# align_cmd_run = util.run(align_cmd)
# if align_cmd_run.stderr:
# logging.info(align_cmd_run.stderr.strip())

# count_alignments_cmd = ( # Exclude unmapped reads and secondary alignments
# f"samtools view -c -F 0x904 {alignments_path}"
# )
# logging.info(
# f"Counting target k-mers aligned to existing reference ({count_alignments_cmd})"
# )
# count_alignments_cmd_run = util.run(count_alignments_cmd)
# if count_alignments_cmd_run.stderr:
# logging.info(count_alignments_cmd_run.stderr)
# n_alignments = int(count_alignments_cmd_run.stdout.strip())
# logging.info(f"{n_alignments} k-mers aligned before masking")

# make_cmd = (
# f"samtools view -F 4 -bS '{alignments_path}'"
# f" | samtools sort -"
# f" | bedtools bamtobed -i stdin"
# f" | bedtools merge -i stdin"
# f" > '{mask_path}'"
# )
# logging.info(f"Making mask ({make_cmd=})")
# make_cmd_run = util.run(make_cmd)
# if make_cmd_run.stderr:
# logging.info(make_cmd_run.stderr)

# apply_cmd = (
# f"bedtools maskfasta"
# f" -fi '{ref_path}' -bed '{mask_path}' -fo '{masked_ref_path}'"
# )
# logging.info(f"Applying mask ({apply_cmd=})")
# apply_cmd_run = util.run(apply_cmd)
# if apply_cmd_run.stderr:
# logging.info(apply_cmd_run.stderr)

# build_masked_index_cmd = f"bowtie2-build --threads '{threads}' '{masked_ref_path}' '{masked_ref_index_path}'"
# logging.info(f"Indexing masked reference ({build_masked_index_cmd})")
# build_masked_index_cmd_run = util.run(build_masked_index_cmd)
# if build_masked_index_cmd_run.stderr:
# logging.info(build_masked_index_cmd_run.stderr.strip())

# align_masked_cmd = (
# f"bowtie2 -a -p '{threads}'"
# f" -x '{masked_ref_index_path}'"
# f" -f '{kmers_path}'"
# f" > '{masked_alignments_path}'"
# )
# logging.info(f"Aligning target k-mers to masked reference ({align_masked_cmd})")
# align_masked_cmd_run = util.run(align_masked_cmd)
# if align_masked_cmd_run.stderr:
# logging.info(align_masked_cmd_run.stderr.strip())

# count_masked_alignments_cmd = f"samtools view -c -F 0x904 {masked_alignments_path}" # Exclude secondaries (0x100), supplementaries (0x800), and unmapped (0x4)
# logging.info(
# f"Counting target k-mers aligned to masked reference ({count_masked_alignments_cmd})"
# )
# count_masked_alignments_cmd_run = util.run(count_masked_alignments_cmd)
# if count_masked_alignments_cmd_run.stderr:
# logging.info(count_masked_alignments_cmd_run.stderr)
# n_masked_alignments = int(count_masked_alignments_cmd_run.stdout.strip())
# logging.info(
# f"{n_masked_alignments} k-mers aligned after masking ({n_alignments} aligned before masking)"
# )
# logging.info(
# f"Masked genome path (for use with long reads / Minimap2): {masked_ref_path.absolute()}"
# )
# logging.info(
# f"Masked Bowtie2 index path (for use with short reads): {masked_ref_index_path.absolute()} (multiple files)"
# )

# return masked_ref_path, n_alignments, n_masked_alignments


# def kmerise(path: Path, out_dir: Path, k: int, i: int) -> Path:
# out_path = out_dir / "kmers.fasta.gz"
# with dnaio.open(path) as reader, dnaio.open(out_path, mode="w") as writer:
# for r in reader:
# for offset in range(0, len(r.sequence) - k + 1, i):
# kmer = r.sequence[offset : offset + k]
# name = r.name.partition(" ")[0]
# kmer_id = f"{name}_{offset}"
# writer.write(dnaio.SequenceRecord(kmer_id, kmer))
# return out_path.absolute()


def mask(
reference: Path,
target: Path,
out_dir=Path("masked"),
k: int = 150,
i: int = 50,
threads: int = util.CPU_COUNT,
) -> tuple[Path, int, int]:
reference: Path, target: Path, out_dir=Path("masked"), threads: int = 1
) -> Path:
"""Mask a fasta[.gz] reference genome against fasta.[gz] target genomes"""
ref_path, target_path = Path(reference), Path(target)
reference_path, target_path = Path(reference), Path(target)
out_dir.mkdir(exist_ok=True, parents=True)
ref_index_path = out_dir / "existing"
kmers_path = out_dir / "kmers.fasta.gz"
alignments_path = out_dir / "alignments.sam"
mask_path = out_dir / "mask.bed"
masked_ref_path = out_dir / "masked.fa"
masked_ref_index_path = out_dir / "masked"
masked_alignments_path = out_dir / "masked-alignments.sam"

if ref_path.suffix == ".gz": # Decompress reference if necessary
new_ref_path = out_dir / ref_path.stem
logging.info(f"Decompressing reference into {new_ref_path}")
with gzip.open(ref_path, "rb") as in_fh:
with open(new_ref_path, "wb") as out_fh:
bed_path = out_dir / "mask.bed"
masked_reference_path = out_dir / "masked.fa"

if reference_path.suffix == ".gz": # Decompress reference if necessary
new_reference_path = out_dir / reference_path.stem
logging.info(f"Decompressing reference into {new_reference_path}")
with gzip.open(reference_path, "rb") as in_fh:
with open(new_reference_path, "wb") as out_fh:
shutil.copyfileobj(in_fh, out_fh)
ref_path = new_ref_path

build_existing_cmd = (
f"bowtie2-build --threads '{threads}' '{ref_path}' '{out_dir}/existing'"
)
logging.info(f"Indexing existing reference ({build_existing_cmd})")
build_existing_cmd_run = util.run(build_existing_cmd)
if build_existing_cmd_run.stderr.strip():
logging.info(build_existing_cmd_run.stderr.strip())

logging.info(f"k-merising target genome(s) {target_path} ({k=}, {i=})")
kmerise(path=target_path, out_dir=out_dir, k=k, i=i)

align_cmd = (
f"bowtie2 -a -p '{threads}'"
f" -x '{ref_index_path}'"
f" -f '{kmers_path}'"
f" > '{alignments_path}'"
)
logging.info(f"Aligning target k-mers to existing reference ({align_cmd})")
align_cmd_run = util.run(align_cmd)
if align_cmd_run.stderr:
logging.info(align_cmd_run.stderr.strip())

count_alignments_cmd = ( # Exclude unmapped reads and secondary alignments
f"samtools view -c -F 0x904 {alignments_path}"
)
logging.info(
f"Counting target k-mers aligned to existing reference ({count_alignments_cmd})"
)
count_alignments_cmd_run = util.run(count_alignments_cmd)
if count_alignments_cmd_run.stderr:
logging.info(count_alignments_cmd_run.stderr)
n_alignments = int(count_alignments_cmd_run.stdout.strip())
logging.info(f"{n_alignments} k-mers aligned before masking")
reference_path = new_reference_path

make_cmd = (
f"samtools view -F 4 -bS '{alignments_path}'"
f" | samtools sort -"
f" | bedtools bamtobed -i stdin"
f" | bedtools merge -i stdin"
f" > '{mask_path}'"
f"minimap2 -x asm10 -N 10000 -p 0.0001 -t {threads} '{reference_path}' '{target}'"
f" | awk -v OFS='\t' '{{print $6, $8, $9}}'"
f" | sort -k1,1 -k2,2n"
f" | bedtools merge -i stdin > '{bed_path}'"
)
logging.info(f"Making mask ({make_cmd=})")
make_cmd_run = util.run(make_cmd)
if make_cmd_run.stderr:
logging.info(make_cmd_run.stderr)
logging.info(make_cmd_run.stderr.strip())

apply_cmd = (
f"bedtools maskfasta"
f" -fi '{ref_path}' -bed '{mask_path}' -fo '{masked_ref_path}'"
f" -fi '{reference_path}' -bed '{bed_path}' -fo '{masked_reference_path}'"
)
logging.info(f"Applying mask ({apply_cmd=})")
apply_cmd_run = util.run(apply_cmd)
if apply_cmd_run.stderr:
logging.info(apply_cmd_run.stderr)

build_masked_index_cmd = f"bowtie2-build --threads '{threads}' '{masked_ref_path}' '{masked_ref_index_path}'"
logging.info(f"Indexing masked reference ({build_masked_index_cmd})")
build_masked_index_cmd_run = util.run(build_masked_index_cmd)
if build_masked_index_cmd_run.stderr:
logging.info(build_masked_index_cmd_run.stderr.strip())

align_masked_cmd = (
f"bowtie2 -a -p '{threads}'"
f" -x '{masked_ref_index_path}'"
f" -f '{kmers_path}'"
f" > '{masked_alignments_path}'"
)
logging.info(f"Aligning target k-mers to masked reference ({align_masked_cmd})")
align_masked_cmd_run = util.run(align_masked_cmd)
if align_masked_cmd_run.stderr:
logging.info(align_masked_cmd_run.stderr.strip())

count_masked_alignments_cmd = f"samtools view -c -F 0x904 {masked_alignments_path}" # Exclude secondaries (0x100), supplementaries (0x800), and unmapped (0x4)
logging.info(
f"Counting target k-mers aligned to masked reference ({count_masked_alignments_cmd})"
)
count_masked_alignments_cmd_run = util.run(count_masked_alignments_cmd)
if count_masked_alignments_cmd_run.stderr:
logging.info(count_masked_alignments_cmd_run.stderr)
n_masked_alignments = int(count_masked_alignments_cmd_run.stdout.strip())
logging.info(
f"{n_masked_alignments} k-mers aligned after masking ({n_alignments} aligned before masking)"
)
logging.info(
f"Masked genome path (for use with long reads / Minimap2): {masked_ref_path.absolute()}"
)
logging.info(
f"Masked Bowtie2 index path (for use with short reads): {masked_ref_index_path.absolute()} (multiple files)"
)

return masked_ref_path, n_alignments, n_masked_alignments


def get_default_reference_filenames() -> list[Path]:
return [ALIGNER.minimap2.value.ref_archive_fn, ALIGNER.bowtie2.value.idx_archive_fn]


def fetch_reference(filename: str) -> None:
util.download(url=f"{util.BUCKET_URL}/{filename}", path=Path(filename))
if filename.endswith(".tar"):
logging.info("Extracting…")
util.untar_file(input_path=Path(filename), output_path=Path("."))
logging.info(f"Downloaded and extracted {filename}")
else:
logging.info(f"Downloaded {filename}")


def kmerise(path: Path, out_dir: Path, k: int, i: int) -> Path:
out_path = out_dir / "kmers.fasta.gz"
with dnaio.open(path) as reader, dnaio.open(out_path, mode="w") as writer:
for r in reader:
for offset in range(0, len(r.sequence) - k + 1, i):
kmer = r.sequence[offset : offset + k]
name = r.name.partition(" ")[0]
kmer_id = f"{name}_{offset}"
writer.write(dnaio.SequenceRecord(kmer_id, kmer))
return out_path.absolute()
return masked_reference_path
11 changes: 11 additions & 0 deletions tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,3 +694,14 @@ def test_mismatched_number_of_reads_bowtie2():
force=True,
)
shutil.rmtree(out_dir, ignore_errors=True)


def test_offline_invalid_standard_index_name():
with pytest.raises(FileNotFoundError):
stats = lib.clean_fastqs(
fastqs=[data_dir / "sars-cov-2_1_1.fastq"],
index="invalid_index_name",
out_dir=out_dir,
offline=True,
)
shutil.rmtree(out_dir, ignore_errors=True)

0 comments on commit 40316d3

Please sign in to comment.