Skip to content

Commit

Permalink
Fix preprocess.knockin_handler to correctly identify the flox knock…
Browse files Browse the repository at this point in the history
…-in sites as deletions not present in the control.
  • Loading branch information
akikuno committed May 1, 2024
1 parent d8bbd9f commit d4d267c
Showing 1 changed file with 11 additions and 11 deletions.
22 changes: 11 additions & 11 deletions src/DAJIN2/core/preprocess/knockin_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,35 @@
###########################################################
# Consider all mutations are possible in the knockin region
# For large deletion alleles, the deleted sequence becomes the knock-in region, so all mutations within this region are taken into consideration.
# TODO: Maybe this knock-in consideration isn't necessary?
# The code identifies the flox knock-in sites as deletions not present in the control.
###########################################################


def _is_invalid_file(reference: Path, query: Path) -> bool:
return query == reference or query.suffix != ".fasta"
def is_valid_file(control_fasta: Path, knockin_fasta: Path) -> bool:
return knockin_fasta != control_fasta and knockin_fasta.suffix == ".fasta"


def _get_knockin_loci(reference: Path, query: Path) -> set[int]:
alignments = mapping.to_sam(reference, query, preset="splice")
def get_index_of_knockin_loci(control_fasta: Path, knockin_fasta: Path) -> set[int]:
alignments = mapping.to_sam(knockin_fasta, control_fasta, preset="map-ont")
alignments = [a.split("\t") for a in alignments]
alignments_midsv = midsv.transform(alignments, midsv=False, cssplit=True, qscore=False)[0]
alignments_midsv = next(iter(midsv.transform(alignments, midsv=False, cssplit=True, qscore=False)))
cssplits = alignments_midsv["CSSPLIT"].split(",")

return {i for i, cs in enumerate(cssplits) if cs.startswith("-")}


def extract_knockin_loci(TEMPDIR: str | Path, SAMPLE_NAME: str) -> None:
reference = Path(TEMPDIR, SAMPLE_NAME, "fasta", "control.fasta")
for query in Path(TEMPDIR, SAMPLE_NAME, "fasta").iterdir():
if _is_invalid_file(reference, query):
control_fasta = Path(TEMPDIR, SAMPLE_NAME, "fasta", "control.fasta")
for knockin_fasta in Path(TEMPDIR, SAMPLE_NAME, "fasta").iterdir():
if not is_valid_file(control_fasta, knockin_fasta):
continue

allele = query.stem
allele = knockin_fasta.stem
path_output = Path(TEMPDIR, SAMPLE_NAME, "knockin_loci", f"{allele}.pickle")
if path_output.exists():
continue

knockin_loci = _get_knockin_loci(reference, query)
knockin_loci = get_index_of_knockin_loci(control_fasta, knockin_fasta)

with open(path_output, "wb") as p:
pickle.dump(knockin_loci, p)

0 comments on commit d4d267c

Please sign in to comment.