diff --git a/Cargo.lock b/Cargo.lock index 8b1644b..cd79ec6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -314,6 +314,15 @@ version = "1.70.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "kseq" version = "0.5.3" @@ -586,6 +595,7 @@ dependencies = [ "anyhow", "assert_cmd", "clap", + "itertools", "kseq", "predicates", "pretty_assertions", diff --git a/Cargo.toml b/Cargo.toml index 0a5f5ac..91034d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ anyhow = "1.0.86" kseq = "0.5" clap = { version = "4.5.7", features = ["derive"] } rayon = "1.10.0" +itertools = "0.13.0" [dev-dependencies] assert_cmd = "2" diff --git a/mk-outs.sh b/mk-outs.sh index 952916a..d8ad34d 100755 --- a/mk-outs.sh +++ b/mk-outs.sh @@ -17,8 +17,10 @@ OUT_FA_100K="tests/outputs/out-100k-fasta.txt" OUT_FQ_50K="tests/outputs/out-50k-fastq.txt" OUT_FQ_100K="tests/outputs/out-100k-fastq.txt" -$PRG -d $DNA_FA -r $RNA_FA_50K -o $OUT_FA_50K -$PRG -d $DNA_FA -r $RNA_FA_100K -o $OUT_FA_100K +OUT_DIR="tests/outputs" -$PRG -d $DNA_FA -r $RNA_FQ_50K -o $OUT_FQ_50K -$PRG -d $DNA_FA -r $RNA_FQ_100K -o $OUT_FQ_100K +$PRG -j $DNA_FA -r $RNA_FA_50K -o $OUT_DIR +$PRG -j $DNA_FA -r $RNA_FA_100K -o $OUT_DIR + +$PRG -j $DNA_FA -r $RNA_FQ_50K -o $OUT_DIR +$PRG -j $DNA_FA -r $RNA_FQ_100K -o $OUT_DIR diff --git a/src/main.rs b/src/main.rs index 5f4f29a..061e14f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -109,22 +109,30 @@ fn run(args: Args) -> Result<()> { .try_for_each(|reads_file| -> Result<()> { let basename = Path::new(&reads_file) .file_name() - .ok_or(anyhow!("basename"))?; + .ok_or(anyhow!("basename"))? + .to_os_string(); - let mut basename = basename.to_os_string(); - basename.push(".txt"); - let out_path = &outdir.join(basename); - let mut out_file = File::create(out_path)?; + let mut out_data_file = basename.clone(); + out_data_file.push(".txt"); + let out_data_path = &outdir.join(out_data_file); + let mut out_data = File::create(out_data_path)?; + + let mut out_count_file = basename.clone(); + out_count_file.push(".count"); + let out_count_path = &outdir.join(out_count_file); + let mut out_count = File::create(out_count_path)?; // Search through each of the RNA sequences, reusing // the sequence and search results instances. let timer = Instant::now(); let mut reads: kseq::Paths = get_reader(&reads_file)?; - writeln!(out_file, "File: {}", &reads_file)?; + writeln!(out_data, "File: {}", &reads_file)?; let mut search: Search = Search::new(&junctions)?; + let mut read_count = 0; while let Some(rec) = reads.iter_record()? { search.search(rec.seq()); + read_count += 1; } if args.verbose { @@ -137,10 +145,12 @@ fn run(args: Args) -> Result<()> { for (i, count) in search.junctions.hits.into_iter().enumerate() { if count > 0 { if let Some(name) = map.get(&search.junctions.key[i]) { - writeln!(out_file, "{name}\t{count}")?; + writeln!(out_data, "{name}\t{count}")?; } } } + + writeln!(out_count, "{read_count}")?; Ok(()) })?; diff --git a/tests/cli.rs b/tests/cli.rs index 642b685..c60f449 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -1,22 +1,30 @@ use anyhow::{anyhow, Result}; use assert_cmd::Command; +use itertools::izip; use predicates::prelude::*; use pretty_assertions::assert_eq; use rand::{distributions::Alphanumeric, Rng}; -use std::{fs, iter::zip, path::Path}; +use std::{fs, path::Path}; use tempfile::TempDir; const PRG: &str = "tallyman"; const DNA_FA: &str = "tests/inputs/dna.fasta"; const DNA_FQ: &str = "tests/inputs/dna.fastq"; + const RNA_FA_50K: &str = "tests/inputs/rna-50k.fasta"; const RNA_FQ_50K: &str = "tests/inputs/rna-50k.fastq"; const RNA_FA_100K: &str = "tests/inputs/rna-100k.fasta"; const RNA_FQ_100K: &str = "tests/inputs/rna-100k.fastq"; -const OUT_FA_50K: &str = "tests/outputs/out-50k-fasta.txt"; -const OUT_FA_100K: &str = "tests/outputs/out-100k-fasta.txt"; -const OUT_FQ_50K: &str = "tests/outputs/out-50k-fastq.txt"; -const OUT_FQ_100K: &str = "tests/outputs/out-100k-fastq.txt"; + +const OUT_FA_50K: &str = "tests/outputs/rna-50k.fasta.txt"; +const OUT_FA_100K: &str = "tests/outputs/rna-100k.fasta.txt"; +const OUT_FA_50K_COUNT: &str = "tests/outputs/rna-50k.fasta.count"; +const OUT_FA_100K_COUNT: &str = "tests/outputs/rna-100k.fasta.count"; + +const OUT_FQ_50K: &str = "tests/outputs/rna-50k.fastq.txt"; +const OUT_FQ_100K: &str = "tests/outputs/rna-100k.fastq.txt"; +const OUT_FQ_50K_COUNT: &str = "tests/outputs/rna-50k.fastq.count"; +const OUT_FQ_100K_COUNT: &str = "tests/outputs/rna-100k.fastq.count"; // -------------------------------------------------- fn gen_bad_file() -> String { @@ -64,9 +72,11 @@ fn run( read_files: &[&str], junction_file: &str, expected_files: &[&str], + expected_counts: &[&str], ) -> Result<()> { // outdir will be removed when var leaves scope let outdir = TempDir::new()?; + let mut args: Vec = vec![ "-j".to_string(), junction_file.to_string(), @@ -81,19 +91,30 @@ fn run( Command::cargo_bin(PRG)?.args(&args).assert().success(); - for (read_file, expected_file) in zip(read_files, expected_files) { + for (read_file, expected_file, expected_count) in + izip!(read_files, expected_files, expected_counts) + { // Output file is read basename + ".txt" - let mut read_base = Path::new(&read_file) + let read_base = Path::new(&read_file) .file_name() .ok_or(anyhow!("No basename"))? .to_os_string(); - read_base.push(".txt"); - let outpath = &outdir.path().join(&read_base); + let mut data_basename = read_base.clone(); + data_basename.push(".txt"); + let outpath = &outdir.path().join(&data_basename); assert!(outpath.exists()); let expected = fs::read_to_string(expected_file)?; let actual = fs::read_to_string(outpath)?; assert_eq!(&actual, &expected); + + let mut count_basename = read_base.clone(); + count_basename.push(".count"); + let outpath = &outdir.path().join(&count_basename); + assert!(outpath.exists()); + let expected = fs::read_to_string(expected_count)?; + let actual = fs::read_to_string(outpath)?; + assert_eq!(&actual, &expected); } Ok(()) @@ -102,25 +123,25 @@ fn run( // -------------------------------------------------- #[test] fn run_50k_fasta() -> Result<()> { - run(&[RNA_FA_50K], DNA_FA, &[OUT_FA_50K]) + run(&[RNA_FA_50K], DNA_FA, &[OUT_FA_50K], &[OUT_FA_50K_COUNT]) } // -------------------------------------------------- #[test] fn run_50k_fastq() -> Result<()> { - run(&[RNA_FQ_50K], DNA_FQ, &[OUT_FQ_50K]) + run(&[RNA_FQ_50K], DNA_FQ, &[OUT_FQ_50K], &[OUT_FQ_50K_COUNT]) } // -------------------------------------------------- #[test] fn run_100k_fasta() -> Result<()> { - run(&[RNA_FA_100K], DNA_FA, &[OUT_FA_100K]) + run(&[RNA_FA_100K], DNA_FA, &[OUT_FA_100K], &[OUT_FA_100K_COUNT]) } // -------------------------------------------------- #[test] fn run_100k_fastq() -> Result<()> { - run(&[RNA_FQ_100K], DNA_FQ, &[OUT_FQ_100K]) + run(&[RNA_FQ_100K], DNA_FQ, &[OUT_FQ_100K], &[OUT_FQ_100K_COUNT]) } // -------------------------------------------------- @@ -130,5 +151,6 @@ fn run_50k_100k_fastq() -> Result<()> { &[RNA_FA_50K, RNA_FQ_100K], DNA_FQ, &[OUT_FA_50K, OUT_FQ_100K], + &[OUT_FA_50K_COUNT, OUT_FQ_100K_COUNT], ) } diff --git a/tests/outputs/full.txt b/tests/outputs/full.txt deleted file mode 100644 index 6ddf255..0000000 --- a/tests/outputs/full.txt +++ /dev/null @@ -1,501 +0,0 @@ -File: fixtures/test-rna.fasta -testSeq938 1 -testSeq921 1 -testSeq181 1 -testSeq707 1 -testSeq391 1 -testSeq611 1 -testSeq414 1 -testSeq6 1 -testSeq913 1 -testSeq101 1 -testSeq876 1 -testSeq632 1 -testSeq280 1 -testSeq433 1 -testSeq995 1 -testSeq542 1 -testSeq417 1 -testSeq355 1 -testSeq536 1 -testSeq724 1 -testSeq241 1 -testSeq853 1 -testSeq92 1 -testSeq249 1 -testSeq227 1 -testSeq200 1 -testSeq955 1 -testSeq675 1 -testSeq353 1 -testSeq150 1 -testSeq18 1 -testSeq721 1 -testSeq859 1 -testSeq177 1 -testSeq65 1 -testSeq595 1 -testSeq852 1 -testSeq791 1 -testSeq886 1 -testSeq418 1 -testSeq314 1 -testSeq281 1 -testSeq654 1 -testSeq95 1 -testSeq225 1 -testSeq321 1 -testSeq233 1 -testSeq847 1 -testSeq175 1 -testSeq837 1 -testSeq162 1 -testSeq53 1 -testSeq684 1 -testSeq941 1 -testSeq678 1 -testSeq560 1 -testSeq960 1 -testSeq891 1 -testSeq730 1 -testSeq378 1 -testSeq78 1 -testSeq824 1 -testSeq664 1 -testSeq124 1 -testSeq565 1 -testSeq816 1 -testSeq224 1 -testSeq726 1 -testSeq928 1 -testSeq511 1 -testSeq394 1 -testSeq588 1 -testSeq572 1 -testSeq100 1 -testSeq138 1 -testSeq146 1 -testSeq846 1 -testSeq222 1 -testSeq777 1 -testSeq25 1 -testSeq890 1 -testSeq672 1 -testSeq381 1 -testSeq371 1 -testSeq46 1 -testSeq164 1 -testSeq839 1 -testSeq134 1 -testSeq443 1 -testSeq187 1 -testSeq785 1 -testSeq448 1 -testSeq112 1 -testSeq352 1 -testSeq191 1 -testSeq504 1 -testSeq144 1 -testSeq368 1 -testSeq133 1 -testSeq120 1 -testSeq549 1 -testSeq655 1 -testSeq751 1 -testSeq316 1 -testSeq620 1 -testSeq740 1 -testSeq634 1 -testSeq718 1 -testSeq198 1 -testSeq512 1 -testSeq704 1 -testSeq540 1 -testSeq405 1 -testSeq966 1 -testSeq346 1 -testSeq403 1 -testSeq490 1 -testSeq682 1 -testSeq127 1 -testSeq284 1 -testSeq384 1 -testSeq693 1 -testSeq819 1 -testSeq698 1 -testSeq750 1 -testSeq348 1 -testSeq697 1 -testSeq480 1 -testSeq765 1 -testSeq539 1 -testSeq607 1 -testSeq947 1 -testSeq135 1 -testSeq214 1 -testSeq323 1 -testSeq229 1 -testSeq330 1 -testSeq449 1 -testSeq147 1 -testSeq739 1 -testSeq874 1 -testSeq379 1 -testSeq428 1 -testSeq236 1 -testSeq854 1 -testSeq39 1 -testSeq934 1 -testSeq860 1 -testSeq48 1 -testSeq256 1 -testSeq842 1 -testSeq500 1 -testSeq294 1 -testSeq831 1 -testSeq516 1 -testSeq495 1 -testSeq599 1 -testSeq578 1 -testSeq939 1 -testSeq731 1 -testSeq658 1 -testSeq424 1 -testSeq689 1 -testSeq756 1 -testSeq829 1 -testSeq176 1 -testSeq465 1 -testSeq143 1 -testSeq173 1 -testSeq971 1 -testSeq676 1 -testSeq325 1 -testSeq780 1 -testSeq559 1 -testSeq723 1 -testSeq121 1 -testSeq550 1 -testSeq729 1 -testSeq999 1 -testSeq631 1 -testSeq808 1 -testSeq219 1 -testSeq419 1 -testSeq341 1 -testSeq383 1 -testSeq450 1 -testSeq924 1 -testSeq126 1 -testSeq647 1 -testSeq52 1 -testSeq474 1 -testSeq118 1 -testSeq667 1 -testSeq3 1 -testSeq958 1 -testSeq50 1 -testSeq201 1 -testSeq494 1 -testSeq786 1 -testSeq401 1 -testSeq927 1 -testSeq42 1 -testSeq533 1 -testSeq708 1 -testSeq484 1 -testSeq743 1 -testSeq423 1 -testSeq88 1 -testSeq618 1 -testSeq503 1 -testSeq23 1 -testSeq948 1 -testSeq396 1 -testSeq950 1 -testSeq72 1 -testSeq278 1 -testSeq411 1 -testSeq265 1 -testSeq40 1 -testSeq31 1 -testSeq906 1 -testSeq643 1 -testSeq90 1 -testSeq288 1 -testSeq435 1 -testSeq577 1 -testSeq197 1 -testSeq875 1 -testSeq614 1 -testSeq364 1 -testSeq645 1 -testSeq589 1 -testSeq392 1 -testSeq775 1 -testSeq85 1 -testSeq275 1 -testSeq303 1 -testSeq286 1 -testSeq148 1 -testSeq311 1 -testSeq561 1 -testSeq840 1 -testSeq238 1 -testSeq978 1 -testSeq189 1 -testSeq439 1 -testSeq648 1 -testSeq361 1 -testSeq159 1 -testSeq962 1 -testSeq988 1 -testSeq155 1 -testSeq319 1 -testSeq192 1 -testSeq692 1 -testSeq475 1 -testSeq817 1 -testSeq339 1 -testSeq367 1 -testSeq123 1 -testSeq534 1 -testSeq661 1 -testSeq796 1 -testSeq926 1 -testSeq122 1 -testSeq861 1 -testSeq81 1 -testSeq235 1 -testSeq89 1 -testSeq359 1 -testSeq681 1 -testSeq20 1 -testSeq245 1 -testSeq59 1 -testSeq807 1 -testSeq305 1 -testSeq605 1 -testSeq753 1 -testSeq160 1 -testSeq429 1 -testSeq421 1 -testSeq916 1 -testSeq246 1 -testSeq12 1 -testSeq365 1 -testSeq903 1 -testSeq151 1 -testSeq463 1 -testSeq558 1 -testSeq877 1 -testSeq982 1 -testSeq526 1 -testSeq57 1 -testSeq520 1 -testSeq920 1 -testSeq784 1 -testSeq795 1 -testSeq257 1 -testSeq297 1 -testSeq94 1 -testSeq738 1 -testSeq207 1 -testSeq37 1 -testSeq710 1 -testSeq626 1 -testSeq454 1 -testSeq16 1 -testSeq652 1 -testSeq932 1 -testSeq583 1 -testSeq683 1 -testSeq570 1 -testSeq343 1 -testSeq109 1 -testSeq163 1 -testSeq97 1 -testSeq438 1 -testSeq213 1 -testSeq272 1 -testSeq700 1 -testSeq663 1 -testSeq345 1 -testSeq64 1 -testSeq930 1 -testSeq274 1 -testSeq382 1 -testSeq709 1 -testSeq900 1 -testSeq205 1 -testSeq531 1 -testSeq507 1 -testSeq653 1 -testSeq395 1 -testSeq116 1 -testSeq942 1 -testSeq945 1 -testSeq459 1 -testSeq803 1 -testSeq119 1 -testSeq825 1 -testSeq794 1 -testSeq601 1 -testSeq468 1 -testSeq240 1 -testSeq289 1 -testSeq762 1 -testSeq705 1 -testSeq298 1 -testSeq711 1 -testSeq260 1 -testSeq351 1 -testSeq943 1 -testSeq185 1 -testSeq137 1 -testSeq93 1 -testSeq902 1 -testSeq268 1 -testSeq527 1 -testSeq769 1 -testSeq337 1 -testSeq194 1 -testSeq47 1 -testSeq290 1 -testSeq834 1 -testSeq881 1 -testSeq625 1 -testSeq26 1 -testSeq28 1 -testSeq598 1 -testSeq269 1 -testSeq102 1 -testSeq139 1 -testSeq329 1 -testSeq851 1 -testSeq981 1 -testSeq771 1 -testSeq244 1 -testSeq685 1 -testSeq145 1 -testSeq149 1 -testSeq206 1 -testSeq909 1 -testSeq719 1 -testSeq14 1 -testSeq234 1 -testSeq34 1 -testSeq313 1 -testSeq892 1 -testSeq580 1 -testSeq110 1 -testSeq261 1 -testSeq532 1 -testSeq372 1 -testSeq637 1 -testSeq742 1 -testSeq571 1 -testSeq202 1 -testSeq872 1 -testSeq674 1 -testSeq773 1 -testSeq266 1 -testSeq83 1 -testSeq250 1 -testSeq248 1 -testSeq767 1 -testSeq992 1 -testSeq630 1 -testSeq845 1 -testSeq17 1 -testSeq815 1 -testSeq283 1 -testSeq220 1 -testSeq690 1 -testSeq712 1 -testSeq530 1 -testSeq887 1 -testSeq444 1 -testSeq778 1 -testSeq670 1 -testSeq422 1 -testSeq591 1 -testSeq358 1 -testSeq153 1 -testSeq717 1 -testSeq259 1 -testSeq783 1 -testSeq432 1 -testSeq639 1 -testSeq479 1 -testSeq58 1 -testSeq340 1 -testSeq409 1 -testSeq375 1 -testSeq464 1 -testSeq862 1 -testSeq73 1 -testSeq888 1 -testSeq258 1 -testSeq567 1 -testSeq668 1 -testSeq169 1 -testSeq390 1 -testSeq376 1 -testSeq657 1 -testSeq68 1 -testSeq918 1 -testSeq350 1 -testSeq373 1 -testSeq552 1 -testSeq55 1 -testSeq231 1 -testSeq608 1 -testSeq836 1 -testSeq362 1 -testSeq789 1 -testSeq701 1 -testSeq855 1 -testSeq612 1 -testSeq597 1 -testSeq758 1 -testSeq998 1 -testSeq954 1 -testSeq107 1 -testSeq117 1 -testSeq212 1 -testSeq651 1 -testSeq706 1 -testSeq897 1 -testSeq182 1 -testSeq736 1 -testSeq263 1 -testSeq885 1 -testSeq763 1 -testSeq857 1 -testSeq863 1 -testSeq270 1 -testSeq370 1 -testSeq650 1 -testSeq606 1 -testSeq295 1 -testSeq41 1 -testSeq4 1 -testSeq638 1 -testSeq154 1 -testSeq152 1 -testSeq759 1 -testSeq713 1 -testSeq959 1 -testSeq790 1 -testSeq209 1 -testSeq487 1 -testSeq915 1 -testSeq32 1 -testSeq43 1 -testSeq322 1 -testSeq546 1 -testSeq673 1 -testSeq935 1 -testSeq716 1 -testSeq749 1 diff --git a/tests/outputs/rna-100k.fasta.count b/tests/outputs/rna-100k.fasta.count new file mode 100644 index 0000000..ccfc37a --- /dev/null +++ b/tests/outputs/rna-100k.fasta.count @@ -0,0 +1 @@ +50000 diff --git a/tests/outputs/out-100k-fasta.txt b/tests/outputs/rna-100k.fasta.txt similarity index 100% rename from tests/outputs/out-100k-fasta.txt rename to tests/outputs/rna-100k.fasta.txt diff --git a/tests/outputs/rna-100k.fastq.count b/tests/outputs/rna-100k.fastq.count new file mode 100644 index 0000000..ccfc37a --- /dev/null +++ b/tests/outputs/rna-100k.fastq.count @@ -0,0 +1 @@ +50000 diff --git a/tests/outputs/out-100k-fastq.txt b/tests/outputs/rna-100k.fastq.txt similarity index 100% rename from tests/outputs/out-100k-fastq.txt rename to tests/outputs/rna-100k.fastq.txt diff --git a/tests/outputs/rna-50k.fasta.count b/tests/outputs/rna-50k.fasta.count new file mode 100644 index 0000000..e87f3b8 --- /dev/null +++ b/tests/outputs/rna-50k.fasta.count @@ -0,0 +1 @@ +25000 diff --git a/tests/outputs/out-50k-fasta.txt b/tests/outputs/rna-50k.fasta.txt similarity index 100% rename from tests/outputs/out-50k-fasta.txt rename to tests/outputs/rna-50k.fasta.txt diff --git a/tests/outputs/rna-50k.fastq.count b/tests/outputs/rna-50k.fastq.count new file mode 100644 index 0000000..e87f3b8 --- /dev/null +++ b/tests/outputs/rna-50k.fastq.count @@ -0,0 +1 @@ +25000 diff --git a/tests/outputs/out-50k-fastq.txt b/tests/outputs/rna-50k.fastq.txt similarity index 100% rename from tests/outputs/out-50k-fastq.txt rename to tests/outputs/rna-50k.fastq.txt