diff --git a/Cargo.lock b/Cargo.lock index bd3f819..3f17f4d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,9 +19,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.5" +version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" dependencies = [ "anstyle", "anstyle-parse", @@ -33,9 +33,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.4" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" [[package]] name = "anstyle-parse" @@ -73,9 +73,9 @@ checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" [[package]] name = "assert_cmd" -version = "2.0.12" +version = "2.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88903cb14723e4d4003335bb7f8a14f27691649105346a0f0957466c096adfe6" +checksum = "ed72493ac66d5804837f480ab3766c72bdfab91a65e565fc54fa9e42db0073a8" dependencies = [ "anstyle", "bstr", @@ -103,10 +103,17 @@ dependencies = [ ] [[package]] -name = "bitflags" -version = "1.3.2" +name = "bio-types" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +checksum = "9d45749b87f21808051025e9bf714d14ff4627f9d8ca967eade6946ea769aa4a" +dependencies = [ + "derive-new", + "lazy_static", + "regex", + "strum_macros", + "thiserror", +] [[package]] name = "bitflags" @@ -141,6 +148,12 @@ version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bzip2" version = "0.4.4" @@ -180,9 +193,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.4.12" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcfab8ba68f3668e89f6ff60f5b205cea56aa7b769451a59f34b8682f51c056d" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", "clap_derive", @@ -190,9 +203,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.4.12" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb7fb5e4e979aec3be7791562fcba452f94ad85e954da024396433e0e25a79e9" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ "anstream", "anstyle", @@ -202,21 +215,30 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.4.7" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] name = "clap_lex" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] [[package]] name = "colorchoice" @@ -233,6 +255,23 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "custom_derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" + +[[package]] +name = "derive-new" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "difflib" version = "0.4.0" @@ -246,22 +285,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] -name = "either" -version = "1.9.0" +name = "env_filter" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" +dependencies = [ + "log", + "regex", +] [[package]] name = "env_logger" -version = "0.10.1" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95b3f3e67048839cb0d0781f445682a35113da7121f7c949db0e2be96a4fbece" +checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" dependencies = [ + "anstream", + "anstyle", + "env_filter", "humantime", - "is-terminal", "log", - "regex", - "termcolor", ] [[package]] @@ -299,6 +342,24 @@ dependencies = [ "num-traits", ] +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs-utils" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fc7a9dc005c944c98a935e7fd626faf5bf7e5a609f94bc13e42fc4a02e52593" +dependencies = [ + "quick-error", +] + [[package]] name = "getrandom" version = "0.2.11" @@ -310,6 +371,12 @@ dependencies = [ "wasi", ] +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "heck" version = "0.4.1" @@ -317,10 +384,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] -name = "hermit-abi" -version = "0.3.3" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hts-sys" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deebfb779c734d542e7f14c298597914b9b5425e4089aef482eacb5cab941915" +dependencies = [ + "bzip2-sys", + "cc", + "fs-utils", + "glob", + "libz-sys", + "lzma-sys", +] [[package]] name = "humantime" @@ -329,24 +410,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] -name = "is-terminal" -version = "0.4.10" +name = "idna" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ - "hermit-abi", - "rustix", - "windows-sys", + "unicode-bidi", + "unicode-normalization", ] [[package]] -name = "itertools" -version = "0.11.0" +name = "ieee754" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] +checksum = "9007da9cacbd3e6343da136e98b0d2df013f553d35bdec8b518f07bea768e19c" [[package]] name = "jobserver" @@ -357,11 +434,36 @@ dependencies = [ "libc", ] +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" -version = "0.2.151" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "libz-sys" +version = "1.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e143b5e666b2695d28f6bca6497720813f699c9602dd7f5cac91008b8ada7f9" +dependencies = [ + "cc", + "cmake", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linear-map" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfae20f6b19ad527b550c223fddc3077a547fc70cda94b9b566575423fd303ee" [[package]] name = "linux-raw-sys" @@ -371,9 +473,9 @@ checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" [[package]] name = "log" -version = "0.4.20" +version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "lzma-sys" @@ -415,6 +517,15 @@ dependencies = [ "xz2", ] +[[package]] +name = "newtype_derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" +dependencies = [ + "rustc_version", +] + [[package]] name = "niffler" version = "2.5.0" @@ -445,6 +556,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "pkg-config" version = "0.3.28" @@ -459,14 +576,13 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "predicates" -version = "3.0.4" +version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dfc28575c2e3f19cb3c73b93af36460ae898d426eba6fc15b9bd2a5220758a0" +checksum = "68b87bfd4605926cdfefc1c3b5f8fe560e3feca9d5552cf68c466d3d8236c7e8" dependencies = [ "anstyle", "difflib", "float-cmp", - "itertools", "normalize-line-endings", "predicates-core", "regex", @@ -490,13 +606,19 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.74" +version = "1.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2de98502f212cfcea8d0bb305bd0f49d7ebdd75b64ba0a68f937d888f4e0d6db" +checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" dependencies = [ "unicode-ident", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.35" @@ -560,24 +682,16 @@ dependencies = [ "rand", "rand_pcg", "regex", + "rust-htslib", "tempfile", "thiserror", ] -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "regex" -version = "1.10.2" +version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", @@ -587,9 +701,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", @@ -602,56 +716,112 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "rust-htslib" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aec6f9ca4601beb4ae75ff8c99144dd15de5a873f6adf058da299962c760968e" +dependencies = [ + "bio-types", + "byteorder", + "custom_derive", + "derive-new", + "hts-sys", + "ieee754", + "lazy_static", + "libc", + "libz-sys", + "linear-map", + "newtype_derive", + "regex", + "thiserror", + "url", +] + +[[package]] +name = "rustc_version" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" +dependencies = [ + "semver", +] + [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" dependencies = [ - "bitflags 2.4.1", + "bitflags", "errno", "libc", "linux-raw-sys", "windows-sys", ] +[[package]] +name = "rustversion" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80af6f9131f277a45a3fba6ce8e2258037bb0477a67e610d3c1fe046ab31de47" + [[package]] name = "safemem" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" +[[package]] +name = "semver" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" + [[package]] name = "serde" -version = "1.0.194" +version = "1.0.195" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b114498256798c94a0689e1a15fec6005dee8ac1f41de56404b67afc2a4b773" +checksum = "63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.194" +version = "1.0.195" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3385e45322e8f9931410f01b3031ec534c3947d0e94c18049af4d9f9907d4e0" +checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.48", +] [[package]] name = "syn" -version = "2.0.46" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89456b690ff72fddcecf231caedbe615c59480c93358a93dfae7fc29e3ebbf0e" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", @@ -659,25 +829,26 @@ dependencies = [ ] [[package]] -name = "tempfile" -version = "3.9.0" +name = "syn" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ - "cfg-if", - "fastrand", - "redox_syscall", - "rustix", - "windows-sys", + "proc-macro2", + "quote", + "unicode-ident", ] [[package]] -name = "termcolor" -version = "1.4.0" +name = "tempfile" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff1bc3d3f05aff0403e8ac0d92ced918ec05b666a43f83297ccef5bea8a3d449" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ - "winapi-util", + "cfg-if", + "fastrand", + "rustix", + "windows-sys", ] [[package]] @@ -703,66 +874,82 @@ checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] -name = "unicode-ident" -version = "1.0.12" +name = "tinyvec" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] [[package]] -name = "utf8parse" -version = "0.2.1" +name = "tinyvec_macros" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] -name = "wait-timeout" -version = "0.2.0" +name = "unicode-bidi" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" -dependencies = [ - "libc", -] +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" +name = "unicode-ident" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] -name = "winapi" -version = "0.3.9" +name = "unicode-normalization" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", + "tinyvec", ] [[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" +name = "url" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] [[package]] -name = "winapi-util" -version = "0.1.6" +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "vcpkg" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" dependencies = [ - "winapi", + "libc", ] [[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "windows-sys" diff --git a/Cargo.toml b/Cargo.toml index 6afb3ce..95363f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,13 @@ [package] name = "rasusa" -description = "Randomly subsample reads to a specified coverage" +description = "Randomly subsample reads or alignments" version = "0.8.0" authors = ["Michael Hall "] edition = "2018" repository = "https://github.com/mbhall88/rasusa" homepage = "https://github.com/mbhall88/rasusa" readme = "README.md" -keywords = ["bioinformatics", "subsampling", "fastq", "fasta", "coverage"] +keywords = ["bioinformatics", "subsampling", "fastq", "alignment", "coverage"] categories = ["science", "command-line-utilities"] license-file = "LICENSE" @@ -16,8 +16,8 @@ license-file = "LICENSE" maintenance = { status = "actively-developed" } [dependencies] -clap = { version = "4.4.12", features = ["derive"]} -regex = "1.10.2" +clap = { version = "4.5.4", features = ["derive"] } +regex = "1.10.4" thiserror = "1.0" anyhow = "1.0" needletail = { version = "0.5.1", features = ["compression"] } @@ -25,9 +25,18 @@ rand = "0.8.5" rand_pcg = "0.3.1" log = "0.4" niffler = "2.5" -env_logger = "0.10.1" +env_logger = "0.11.3" +rust-htslib = { version = "0.46.0", default-features = false, features = ["bzip2", "lzma"] } [dev-dependencies] -assert_cmd = "2.0.12" -predicates = "3.0.4" -tempfile = "3.9.0" +assert_cmd = "2.0.14" +predicates = "3.1.0" +tempfile = "3.10.1" + +[profile.release] +strip = true # https://github.com/johnthagen/min-sized-rust?tab=readme-ov-file#strip-symbols-from-binary + +[profile.profiling] +inherits = "release" +debug = true +strip = false \ No newline at end of file diff --git a/README.md b/README.md index 25bfadc..70a056d 100644 --- a/README.md +++ b/README.md @@ -6,36 +6,39 @@ [![github release version](https://img.shields.io/github/v/release/mbhall88/rasusa)](https://github.com/mbhall88/rasusa/releases) [![DOI](https://joss.theoj.org/papers/10.21105/joss.03941/status.svg)](https://doi.org/10.21105/joss.03941) -**Ra**ndomly **su**b**sa**mple sequencing reads. +**Ra**ndomly **su**b**sa**mple sequencing reads or alignments. -> Hall, M. B., (2022). Rasusa: Randomly subsample sequencing reads to a specified coverage. Journal of Open Source Software, 7(69), 3941, https://doi.org/10.21105/joss.03941 +> Hall, M. B., (2022). Rasusa: Randomly subsample sequencing reads to a specified coverage. Journal of Open Source +> Software, 7(69), 3941, https://doi.org/10.21105/joss.03941 [TOC]: # + ## Table of Contents + - [Motivation](#motivation) - [Install](#install) - - [`cargo`](#cargo) - - [`conda`](#conda) - - [Container](#container) - - [`homebrew`](#homebrew) - - [Release binaries](#release-binaries) - - [Build locally](#build-locally) + - [`cargo`](#cargo) + - [`conda`](#conda) + - [Container](#container) + - [`homebrew`](#homebrew) + - [Release binaries](#release-binaries) + - [Build locally](#build-locally) - [Usage](#usage) - - [Basic usage](#basic-usage) - - [Required parameters](#required-parameters) - - [Optional parameters](#optional-parameters) - - [Full usage](#full-usage) - - [Snakemake](#snakemake) + - [Basic usage - reads](#basic-usage---reads) + - [Basic usage - alignments](#basic-usage---alignments) + - [Required parameters](#required-parameters) + - [Optional parameters](#optional-parameters) + - [Full usage](#full-usage) - [Benchmark](#benchmark) - - [Single long read input](#single-long-read-input) - - [Paired-end input](#paired-end-input) + - [Single long read input](#single-long-read-input) + - [Paired-end input](#paired-end-input) - [Contributing](#contributing) - [Citing](#citing) - - [Bibtex](#bibtex) + - [Bibtex](#bibtex) ## Motivation -I couldn't find a tool for subsampling reads that met my requirements. All the +I couldn't find a tool for subsampling fastq reads that met my requirements. All the strategies I could find fell short as they either just wanted a number or percentage of reads to subsample to or, if they did subsample to a coverage, they assume all reads are the same size (i.e Illumina). As I mostly work with long-read data this posed a problem @@ -112,7 +115,7 @@ Options [![Crates.io](https://img.shields.io/crates/v/rasusa.svg)](https://crates.io/crates/rasusa) -Prerequisite: [`rust` toolchain][rust] (min. v1.64.0) +Prerequisite: [`rust` toolchain][rust] (min. v1.74.1) ```sh cargo install rasusa @@ -182,13 +185,14 @@ target/release/rasusa --help cargo test --all ``` - ## Usage -### Basic usage +### Basic usage - reads + +Subsample fastq reads ``` -rasusa --input in.fq --coverage 30 --genome-size 4.6mb +rasusa reads --coverage 30 --genome-size 4.6mb in.fq ``` The above command will output the subsampled file to `stdout`. @@ -196,48 +200,67 @@ The above command will output the subsampled file to `stdout`. Or, if you have paired Illumina ``` -rasusa -i r1.fq -i r2.fq --coverage 30 --genome-size 4g -o out.r1.fq -o out.r2.fq +rasusa --coverage 30 --genome-size 4g -o out.r1.fq -o out.r2.fq r1.fq r2.fq ``` For more details on the above options, and additional options, see below. +### Basic usage - alignments + +Subsample alignments + +``` +rasusa aln --coverage 30 in.bam | samtools sort -o out.bam +``` + +this will subsample each position in the alignment to 30x coverage. + ### Required parameters -There are three required options to run `rasusa`. +There are three required options to run `rasusa reads`. #### Input -##### `-i`, `--input` - -This option specifies the file(s) containing the reads you would like to subsample. The -file(s) must be valid fasta or fastq format and can be compressed (with a tool such as -`gzip`). -Illumina paired files can be passed in two ways. -1. Using `--input` twice `-i r1.fq -i r2.fq` -2. Using `--input` once, but passing both files immediately after `-i r1.fq r2.fq` +This positional argument specifies the file(s) containing the reads or alignments you would like to subsample. The +file(s) must be valid fasta or fastq format for the `reads` command and can be compressed (with a tool such as +`gzip`). For the `aln` command, the file must be a valid **indexed** SAM/BAM file. +If two files are passed to `reads`, `rasusa` will assume they are paired-end reads. -> Bash wizard tip 🧙: Let globs do the work for you `-i r*.fq` +> Bash wizard tip 🧙: Let globs do the work for you `r*.fq` #### Coverage ##### `-c`, `--coverage` -> Not required if [`--bases`](#target-number-of-bases) is present +> Not required if [`--bases`](#target-number-of-bases) is present for `reads` -This option is used to determine the minimum coverage to subsample the reads to. It can +This option is used to determine the minimum coverage to subsample the reads to. For the `reads` command, it can be specified as an integer (100), a decimal/float (100.7), or either of the previous -suffixed with an 'x' (100x). +suffixed with an 'x' (100x). For the `aln` command, it is an integer only. -_**Note**: Due to the method for determining how many bases are required to achieve the -desired coverage, the actual coverage, in the end, could be slightly higher than +Due to the method for determining how many bases are required to achieve the +desired coverage in the `reads` command, the actual coverage, in the end, could be slightly higher than requested. For example, if the last included read is very long. The log messages should -inform you of the actual coverage in the end._ +inform you of the actual coverage in the end. + +For the `aln` command, the coverage is the minimum number of reads that should be present at each position in the +alignment. If a position has fewer than the requested number of reads, all reads at that position will be included. In +addition, there will be (small) regions with more than the requested number of reads - usually localised to where the +alignment of a read ends. This is +because when the alignment of a selected read ends, the next read is selected based on it spanning the end of the +previous alignment. +When selecting this next alignment, we preference alignments whose start is closest to the end of the previous +alignment, ensuring minimal overlap with the previous alignment. See the below screenshot from IGV for a visual example. + +![IGV screenshot](img/igv_panel.png) #### Genome size ##### `-g`, `--genome-size` -> Not required if [`--bases`](#target-number-of-bases) is present +> Not valid for `aln` + +> Not required if [`--bases`](#target-number-of-bases) is present for `reads` The genome size of the input is also required. It is used to determine how many bases are necessary to achieve the desired coverage. This can, of course, be as precise or @@ -264,13 +287,15 @@ set to the sum of all reference sequences in it. ##### `-o`, `--output` -NOTE: This parameter is required if passing paired Illumina data. +**`reads`** + +NOTE: This parameter is required if passing paired Illumina data to `reads`. By default, `rasusa` will output the subsampled file to `stdout` (if one file is given). If you would prefer to specify an output file path, then use this option. -Output for Illumina paired files can be specified in the same manner as -[`--input`](#input) +Output for Illumina paired files can be specified in the two ways + 1. Using `--output` twice `-o out.r1.fq -o out.r2.fq` 2. Using `--output` once, but passing both files immediately after `-o out.r1.fq out.r2.fq` @@ -279,42 +304,71 @@ The ordering of the output files is assumed to be the same as the input. _Note: The output will always be in the same format as the input. You cannot pass fastq as input and ask for fasta as output._ -`rasusa` will also attempt to automatically infer whether comression of the output +`rasusa reads` will also attempt to automatically infer whether compression of the output file(s) is required. It does this by detecting any of the supported extensions: + - `.gz`: will compress the output with [`gzip`][gzip] - `.bz` or `.bz2`: will compress the output with [`bzip2`][bzip] - `.lzma`: will compress the output with the [`xz`][xz] LZMA algorithm +**`aln`** + +For the `aln` command, the output file format will be the same as the input if writing to stdout, otherwise it will be +inferred from the file extension. + +**Note:** the output alignment will most likely **not be sorted**. You can use `samtools sort` to sort the output. e.g., + +``` +rasusa aln -c 5 in.bam | samtools sort -o out.bam +``` + [gzip]: http://www.gzip.org/ + [bzip]: https://sourceware.org/bzip2/ + [xz]: https://tukaani.org/xz/ -#### Output compression format +#### Output compression/format ##### `-O`, `--output-type` +**`reads`** + Use this option to manually set the compression algoritm to use for the output file(s). It will override any format automatically detected from the output path. Valid options are: + - `g`: [`gzip`][gzip] - `b`: [`bzip2`][bzip] - `l` or `x`: [`xz`][xz] LZMA algorithm - `z`: [`zstd`][zstd] - `u`: no compression -*Note: these options are case insensitive.* +**`aln`** + +Use this option to manually set the output file format. By default, the same format as the input will be used, or the +format will be guessed from the `--output` path extension if given. Valid options are: + +- `b` or `bam`: BAM +- `c` or `cram`: CRAM +- `s` or `sam`: SAM + +*Note: all values to this option are case insensitive.* #### Compresion level ##### `-l`, `--compress-level` -Compression level to use if compressing the output. By default this is set to the default for the compression type being output. +Compression level to use if compressing the output. By default this is set to the default for the compression type being +output. #### Target number of bases ##### `-b`, `--bases` +> `reads` only + Explicitly set the number of bases required in the subsample. This option takes the number in the same format as [genome size](#genome-size). @@ -325,11 +379,13 @@ they are provided.* ##### `-n`, `--num` -Explicitly set the number of reads in the subsample. This option takes the number in +> `reads` only + +Explicitly set the number of reads in the subsample. This option takes the number in the same format as [genome size](#genome-size). -When providing paired reads as input, this option will sample this many total read -pairs. For example, when passing `-n 20 -i r1.fq r2.fq`, the two output files will have +When providing paired reads as input, this option will sample this many total read +pairs. For example, when passing `-n 20 -i r1.fq r2.fq`, the two output files will have 20 reads each, and the read ids will be the same in both. *Note: if this option is given, genome size and coverage are not required.* @@ -338,8 +394,10 @@ pairs. For example, when passing `-n 20 -i r1.fq r2.fq`, the two output files wi ##### `-f`, `--frac` -Explicitly set the fraction of total reads in the subsample. The value given to this -option can be a float or a percentage - i.e., `-f 0.5` and `-f 50` will both take half +> `reads` only + +Explicitly set the fraction of total reads in the subsample. The value given to this +option can be a float or a percentage - i.e., `-f 0.5` and `-f 50` will both take half of the reads. *Note: if this option is given, genome size and coverage are not required.* @@ -367,20 +425,41 @@ verbosity is switched on, you will additionally get "debug" level logging messag ```text $ rasusa --help -Randomly subsample reads to a specified coverage +Randomly subsample reads or alignments + +Usage: rasusa [OPTIONS] -Usage: rasusa [OPTIONS] --input ... +Commands: + reads Randomly subsample reads + aln Randomly subsample alignments to a specified depth of coverage + cite Get a bibtex formatted citation for this package + help Print this message or the help of the given subcommand(s) Options: - -i, --input ... + -v Switch on verbosity + -h, --help Print help + -V, --version Print version +``` + +#### `reads` command + +```text +$ rasusa reads --help +Randomly subsample reads + +Usage: rasusa reads [OPTIONS] ... + +Arguments: + ... The fast{a,q} file(s) to subsample. - For paired Illumina you may either pass this flag twice `-i r1.fq -i r2.fq` or give two files consecutively `-i r1.fq r2.fq`. + For paired Illumina, the order matters. i.e., R1 then R2. +Options: -o, --output ... Output filepath(s); stdout if not present. - For paired Illumina you may either pass this flag twice `-o o1.fq -o o2.fq` or give two files consecutively `-o o1.fq o2.fq`. NOTE: The order of the pairs is assumed to be the same as that given for --input. This option is required for paired input. + For paired Illumina you may either pass this flag twice `-o o1.fq -o o2.fq` or give two files consecutively `-o o1.fq o2.fq`. NOTE: The order of the pairs is assumed to be the same as the input - e.g., R1 then R2. This option is required for paired input. -g, --genome-size Genome size to calculate coverage with respect to. e.g., 4.3kb, 7Tb, 9000, 4.1MB @@ -390,7 +469,7 @@ Options: If --bases is not provided, this option and --coverage are required -c, --coverage - The desired coverage to sub-sample the reads to + The desired depth of coverage to subsample the reads to If --bases is not provided, this option and --genome-size are required @@ -430,6 +509,47 @@ Options: Print version ``` +#### `aln` command + +```text +$ rasusa aln --help +Randomly subsample alignments to a specified depth of coverage + +Usage: rasusa aln [OPTIONS] --coverage + +Arguments: + + Path to the indexed alignment file (SAM/BAM/CRAM) to subsample + +Options: + -o, --output + Path to the output subsampled alignment file. Defaults to stdout (same format as input) + + The output is not guaranteed to be sorted. We recommend piping the output to `samtools sort` + + -O, --output-type + Output format. Rasusa will attempt to infer the format from the output file extension if not provided + + -c, --coverage + The desired depth of coverage to subsample the alignment to + + -s, --seed + Random seed to use + + --step-size + When a region has less than the desired coverage, the step size to move along the chromosome to find more reads. + + The lowest of the step and the minimum end coordinate of the reads in the region will be used. This parameter can have a significant impact on the runtime of the subsampling process. + + [default: 100] + + -h, --help + Print help (see a summary with '-h') + + -V, --version + Print version +``` + ## Benchmark > “Time flies like an arrow; fruit flies like a banana.” @@ -447,6 +567,8 @@ The data I used comes from > identifies novel variation in repetitive PE/PPE gene regions." Microbial genomics 4.7 > (2018).][1] +_Note, these benchmarks are for `reads` only as there is no other tool that replicates the functionality of `aln`._ + ### Single long read input Download and rename the fastq @@ -465,17 +587,17 @@ TB_GENOME_SIZE=4411532 COVG=50 TARGET_BASES=$(( TB_GENOME_SIZE * COVG )) FILTLONG_CMD="filtlong --target_bases $TARGET_BASES tb.fq" -RASUSA_CMD="rasusa -i tb.fq -c $COVG -g $TB_GENOME_SIZE -s 1" +RASUSA_CMD="rasusa reads tb.fq -c $COVG -g $TB_GENOME_SIZE -s 1" hyperfine --warmup 3 --runs 10 --export-markdown results-single.md \ "$FILTLONG_CMD" "$RASUSA_CMD" ``` #### Results -| Command | Mean [s] | Min [s] | Max [s] | Relative | -|:------------------------------------------|---------------:|--------:|--------:|-------------:| -| `filtlong --target_bases 220576600 tb.fq` | 21.685 ± 0.055 | 21.622 | 21.787 | 21.77 ± 0.29 | -| `rasusa -i tb.fq -c 50 -g 4411532 -s 1` | 0.996 ± 0.013 | 0.983 | 1.023 | 1.00 | +| Command | Mean [s] | Min [s] | Max [s] | Relative | +|:---------------------------------------------|---------------:|--------:|--------:|-------------:| +| `filtlong --target_bases 220576600 tb.fq` | 21.685 ± 0.055 | 21.622 | 21.787 | 21.77 ± 0.29 | +| `rasusa reads tb.fq -c 50 -g 4411532 -s 1` | 0.996 ± 0.013 | 0.983 | 1.023 | 1.00 | **Summary**: `rasusa` ran 21.77 ± 0.29 times faster than `filtlong`. @@ -489,8 +611,8 @@ wget "$URL" -O - | gzip -d -c - | fastaq deinterleave - r1.fq r2.fq ``` Each file's size is 179M and has 283,590 reads. -For this benchmark, we will use [`seqtk`][seqtk]. We will also test `seqtk`'s 2-pass -mode as this is analogous to `rasusa`. +For this benchmark, we will use [`seqtk`][seqtk]. We will also test `seqtk`'s 2-pass +mode as this is analogous to `rasusa reads`. ```shell NUM_READS=140000 @@ -507,12 +629,12 @@ hyperfine --warmup 10 --runs 100 --export-markdown results-paired.md \ |:--------------------------------------------------------------------------------------------------|--------------:|---------:|---------:|------------:| | `seqtk sample -s 1 r1.fq 140000 > /tmp/r1.fq; seqtk sample -s 1 r2.fq 140000 > /tmp/r2.fq;` | 907.7 ± 23.6 | 875.4 | 997.8 | 1.84 ± 0.62 | | `seqtk sample -2 -s 1 r1.fq 140000 > /tmp/r1.fq; seqtk sample -2 -s 1 r2.fq 140000 > /tmp/r2.fq;` | 870.8 ± 54.9 | 818.2 | 1219.8 | 1.77 ± 0.61 | -| `rasusa -i r1.fq r2.fq -n 140000 -s 1 -o /tmp/r1.fq -o /tmp/r2.fq` | 492.2 ± 165.4 | 327.4 | 887.4 | 1.00 | +| `rasusa reads r1.fq r2.fq -n 140000 -s 1 -o /tmp/r1.fq -o /tmp/r2.fq` | 492.2 ± 165.4 | 327.4 | 887.4 | 1.00 | -**Summary**: `rasusa` ran 1.84 times faster than `seqtk` (1-pass) and 1.77 times faster +**Summary**: `rasusa reads` ran 1.84 times faster than `seqtk` (1-pass) and 1.77 times faster than `seqtk` (2-pass) -So, `rasusa` is faster than `seqtk` but doesn't require a fixed number of reads - +So, `rasusa reads` is faster than `seqtk` but doesn't require a fixed number of reads - allowing you to avoid doing maths to determine how many reads you need to downsample to a specific coverage. 🤓 @@ -536,10 +658,13 @@ cite it. [![DOI](https://joss.theoj.org/papers/10.21105/joss.03941/status.svg)](https://doi.org/10.21105/joss.03941) -> Hall, M. B., (2022). Rasusa: Randomly subsample sequencing reads to a specified coverage. Journal of Open Source Software, 7(69), 3941, https://doi.org/10.21105/joss.03941 +> Hall, M. B., (2022). Rasusa: Randomly subsample sequencing reads to a specified coverage. Journal of Open Source +> Software, 7(69), 3941, https://doi.org/10.21105/joss.03941 ### Bibtex +You can get the following citation by running `rasusa cite` + ```Bibtex @article{Hall2022, doi = {10.21105/joss.03941}, @@ -557,28 +682,52 @@ cite it. ``` [1]: https://doi.org/10.1099/mgen.0.000188 + [brew-tap]: https://github.com/brewsci/homebrew-bio + [channels]: https://bioconda.github.io/user/install.html#set-up-channels + [conda]: https://docs.conda.io/projects/conda/en/latest/user-guide/install/ + [docker]: https://docs.docker.com/v17.12/install/ + [dockerhub]: https://hub.docker.com/r/mbhall88/rasusa + [dpryan79]: https://github.com/dpryan79 + [filtlong]: https://github.com/rrwick/Filtlong + [homebrew]: https://docs.brew.sh/Installation + [hyperfine]: https://github.com/sharkdp/hyperfine + [kcov]: https://github.com/SimonKagstrom/kcov + [log-lvl]: https://docs.rs/log/0.4.6/log/enum.Level.html#variants + [mgen-ref]: https://doi.org/10.1099/mgen.0.000294 + [pr-help]: https://github.com/bioconda/bioconda-recipes/pull/18690 + [pyfastaq]: https://github.com/sanger-pathogens/Fastaq + [quay.io]: https://quay.io/repository/mbhall88/rasusa + [rust]: https://www.rust-lang.org/tools/install + [score]: https://github.com/rrwick/Filtlong#read-scoring + [seed]: https://en.wikipedia.org/wiki/Random_seed + [seqtk]: https://github.com/lh3/seqtk + [singularity]: https://sylabs.io/guides/3.4/user-guide/quick_start.html#quick-installation-steps + [snakemake]: https://snakemake.readthedocs.io/en/stable/ + [triples]: https://clang.llvm.org/docs/CrossCompilation.html#target-triple + [wrapper]: https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/rasusa.html + [zstd]: https://github.com/facebook/zstd diff --git a/img/igv_panel.png b/img/igv_panel.png new file mode 100644 index 0000000..7ff88c9 Binary files /dev/null and b/img/igv_panel.png differ diff --git a/src/alignment.rs b/src/alignment.rs new file mode 100644 index 0000000..5268498 --- /dev/null +++ b/src/alignment.rs @@ -0,0 +1,406 @@ +use std::cmp::{Ordering, Reverse}; +use std::collections::BinaryHeap; +use std::collections::HashSet; +use std::path::{Path, PathBuf}; + +use anyhow::{anyhow, Context, Result}; +use clap::Parser; +use log::{info, warn}; +use rand::prelude::SliceRandom; +use rand::{random, Rng, SeedableRng}; +use rust_htslib::bam; +use rust_htslib::bam::ext::BamRecordExtensions; +use rust_htslib::bam::{Format, Read}; + +use crate::cli::check_path_exists; +use crate::Runner; + +#[derive(Debug, Parser)] +#[command(author, version, about)] +pub struct Alignment { + /// Path to the indexed alignment file (SAM/BAM/CRAM) to subsample + #[arg(value_parser = check_path_exists, name = "FILE")] + pub aln: PathBuf, + + /// Path to the output subsampled alignment file. Defaults to stdout (same format as input) + /// + /// The output is not guaranteed to be sorted. We recommend piping the output to `samtools sort` + #[arg(short, long, value_name = "FILE")] + pub output: Option, + + /// Output format. Rasusa will attempt to infer the format from the output file extension if not provided + #[arg(short='O', long, value_name = "FMT", value_parser = infer_format_from_char)] + pub output_type: Option, + + /// The desired depth of coverage to subsample the alignment to + #[arg(short, long, value_name = "INT", value_parser = clap::value_parser!(u32).range(1..))] + pub coverage: u32, + + /// Random seed to use. + #[arg(short, long, value_name = "INT")] + pub seed: Option, + + /// When a region has less than the desired coverage, the step size to move along the chromosome + /// to find more reads. + /// + /// The lowest of the step and the minimum end coordinate of the reads in the region will be used. + /// This parameter can have a significant impact on the runtime of the subsampling process. + #[arg(long, default_value_t = 100, value_name = "INT", value_parser = clap::value_parser!(i64).range(1..))] + pub step_size: i64, +} + +impl Runner for Alignment { + fn run(&mut self) -> Result<()> { + info!("Subsampling alignment file: {:?}", self.aln); + + let mut rng = match self.seed { + Some(s) => rand_pcg::Pcg64::seed_from_u64(s), + None => rand_pcg::Pcg64::seed_from_u64(random()), + }; + + let mut reader = + bam::IndexedReader::from_path(&self.aln).context("Failed to read alignment file")?; + let header = bam::Header::from_template(reader.header()); + + let input_fmt = match infer_format_from_path(&self.aln) { + Some(fmt) => fmt, + None => { + return Err(anyhow::anyhow!( + "Output file format not recognized. Please use .sam, .bam, or .cram extensions" + )); + } + }; + + let output_fmt = match &self.output_type { + Some(fmt) => *fmt, + None => match &self.output { + None => input_fmt, + Some(path) => match infer_format_from_path(path) { + Some(fmt) => fmt, + None => { + return Err(anyhow::anyhow!( + "Output file format not recognized. Please use .sam, .bam, or .cram extensions" + )); + } + }, + }, + }; + + let mut writer = match &self.output { + Some(path) => { + let path = path.as_path(); + + let output = bam::Writer::from_path(path, &header, output_fmt) + .context("Failed to create output alignment file")?; + info!("Writing subsampled alignment to: {:?}", path); + output + } + None => { + let output = bam::Writer::from_stdout(&header, output_fmt) + .context("Failed to create output alignment file")?; + info!("Writing subsampled alignment to stdout"); + output + } + }; + + let header = reader.header().clone(); + let chroms = header.target_names(); + + for chrom in chroms { + let chrom_name = String::from_utf8_lossy(chrom); + + info!("Subsampling chromosome: {}", chrom_name); + + let tid = header + .tid(chrom) + .context(format!("Failed to get tid for chromosome {}", chrom_name))?; + let chrom_len = header.target_len(tid).context(format!( + "Failed to get chromosome length for chromosome {}", + chrom_name + ))?; + let mut n_reads_needed = self.coverage; + let mut current_reads = HashSet::new(); + let mut heap = BinaryHeap::new(); + + // get the 0-based position of the first record in the chromosome + reader.fetch(tid).context(format!( + "Failed to get all records for chromosome {}", + chrom_name + ))?; + + let first_record = if let Some(first_record) = reader.records().next() { + first_record.context("Failed to get first record")? + } else { + warn!("Chromosome {} has no records", chrom_name); + continue; + }; + + let mut next_pos = first_record.pos(); + let first_pos = next_pos; + let mut regions_below_coverage = false; + + loop { + reader + .fetch((tid, next_pos, next_pos + 1)) + .context(format!( + "Failed to fetch records in region {}:{}-{}", + chrom_name, + next_pos, + next_pos + 1 + ))?; + let records = reader.records(); + + let mut records: Vec<_> = records.filter_map(Result::ok).collect(); + + if next_pos == first_pos { + // we just shuffle all the reads in the first position + records.shuffle(&mut rng); + } else { + // need to sort records by their alignment start positions. those with the same start + // position should be shuffled so that the order is random + random_sort(&mut records, |record| record.pos(), &mut rng); + } + + let mut num_output = 0; + let mut record_iter = records.into_iter().rev(); + + while num_output < n_reads_needed { + let record = match record_iter.next() { + Some(r) => r, + None => break, + }; + let qname = record.qname().to_owned(); + + if current_reads.contains(&qname) { + continue; + } + current_reads.insert(qname.to_owned()); + heap.push(Reverse((record.reference_end(), qname.to_owned()))); + // write the record + writer.write(&record).context("Failed to write record")?; + num_output += 1; + } + + n_reads_needed -= num_output; + + if n_reads_needed > 0 { + // increment next_pos by step_size or the minimum end position of the reads in + // the region, whichever is smaller + let min_end = heap.peek().map(|Reverse((end, _))| *end); + match min_end { + Some(min_end) => { + next_pos += self.step_size.min(min_end - next_pos); + } + None => { + next_pos += self.step_size; + } + } + regions_below_coverage = true; + } + + // remove smallest end position from heap and update next_pos and n_reads_needed + // allowing for the fact that there may be multiple reads with the same end position + while let Some(Reverse((end, qname))) = heap.pop() { + next_pos = end; + + if current_reads.contains(&qname) { + current_reads.remove(&qname); + n_reads_needed += 1; + } else { + return Err(anyhow!("A read in the heap was not found in the current reads set. This should not happen, please raise an issue.")); + } + if heap.is_empty() { + break; + } + if let Some(Reverse((next_end, _))) = heap.peek() { + if *next_end != end { + break; + } + } else { + break; + } + } + + if next_pos as u64 >= chrom_len { + break; + } + } + if regions_below_coverage { + warn!( + "Chromosome {} has regions with less than the requested coverage", + chrom_name + ); + } + } + + Ok(()) + } +} + +/// Sorts the vector with a custom order where equal keys are randomly ordered. +fn random_sort(vec: &mut [T], key_extractor: fn(&T) -> K, mut rng: impl Rng) { + vec.sort_by(|a, b| random_compare(key_extractor(a), key_extractor(b), &mut rng)); +} + +/// Custom comparison function for random sorting. +fn random_compare(a: T, b: T, rng: &mut impl Rng) -> Ordering { + if a == b { + // Introduce randomness when elements are equal + if rng.gen::() { + Ordering::Less + } else { + Ordering::Greater + } + } else { + a.cmp(&b) + } +} + +pub fn infer_format_from_path(path: &Path) -> Option { + match path.extension().and_then(|ext| ext.to_str()) { + Some("sam") => Some(Format::Sam), + Some("bam") => Some(Format::Bam), + Some("cram") => Some(Format::Cram), + _ => None, + } +} + +// a function which infers a format from a single character (case insensitive) this will be used in the CLI, so should return a Result +pub fn infer_format_from_char(c: &str) -> Result { + match c.to_ascii_lowercase().as_str() { + "s" | "sam" => Ok(Format::Sam), + "b" | "bam" => Ok(Format::Bam), + "c" | "cram" => Ok(Format::Cram), + _ => Err(String::from("Invalid output format. Please use 's', 'b', or 'c' to specify SAM, BAM, or CRAM format, respectively")), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use assert_cmd::Command; + use rand::prelude::StdRng; + const SUB: &str = "aln"; + + #[test] + fn test_infer_format() { + // have to test this way as rust-htslib does not derive PartialEq or Eq for Format + let fmt = infer_format_from_path(Path::new("file.sam")).unwrap(); + assert!(matches!(fmt, Format::Sam)); + + let fmt = infer_format_from_path(Path::new("file.bam")).unwrap(); + assert!(matches!(fmt, Format::Bam)); + + let fmt = infer_format_from_path(Path::new("file.cram")).unwrap(); + assert!(matches!(fmt, Format::Cram)); + + let fmt = infer_format_from_path(Path::new("file.txt")); + assert!(fmt.is_none()); + } + + #[test] + fn test_infer_format_from_char() { + let fmt = infer_format_from_char("s").unwrap(); + assert!(matches!(fmt, Format::Sam)); + + let fmt = infer_format_from_char("b").unwrap(); + assert!(matches!(fmt, Format::Bam)); + + let fmt = infer_format_from_char("c").unwrap(); + assert!(matches!(fmt, Format::Cram)); + + let fmt = infer_format_from_char("x"); + assert!(fmt.is_err()); + + let fmt = infer_format_from_char("sam").unwrap(); + assert!(matches!(fmt, Format::Sam)); + + let fmt = infer_format_from_char("B").unwrap(); + assert!(matches!(fmt, Format::Bam)); + + let fmt = infer_format_from_char("CRAM").unwrap(); + assert!(matches!(fmt, Format::Cram)); + + let fmt = infer_format_from_char("x"); + assert!(fmt.is_err()); + } + + #[test] + fn test_inequalities() { + let mut rng = StdRng::from_entropy(); // Use a random seed for general testing + assert_eq!(random_compare(1, 2, &mut rng), Ordering::Less); + assert_eq!(random_compare(2, 1, &mut rng), Ordering::Greater); + } + + #[test] + fn test_random_behavior_for_equality() { + let mut outcomes = HashSet::new(); + let mut rng = StdRng::seed_from_u64(1234); // Use a fixed seed for reproducibility + + // Perform multiple tests to check the outcomes + for _ in 0..100 { + let outcome = random_compare(1, 1, &mut rng); + outcomes.insert(outcome); + } + + // Check if both outcomes are possible + assert!( + outcomes.contains(&Ordering::Less) && outcomes.contains(&Ordering::Greater), + "Random outcomes should include both Ordering::Less and Ordering::Greater" + ); + } + + #[test] + fn no_coverage_given_raises_error() { + let infile = "tests/cases/test.bam"; + let passed_args = vec![SUB, infile]; + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn zero_coverage_raises_error() { + let infile = "tests/cases/test.bam"; + let passed_args = vec![SUB, infile, "-c", "0"]; + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn bam_with_regions_of_zero_coverage_doesnt_endless_loop() { + let infile = "tests/cases/test.bam"; + let passed_args = vec![SUB, infile, "-c", "1"]; + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + + cmd.args(passed_args).assert().success(); + } + + #[test] + fn excess_coverage_doesnt_endless_loop() { + let infile = "tests/cases/test.bam"; + let passed_args = vec![SUB, infile, "-c", "10000"]; + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + + cmd.args(passed_args).assert().success(); + } + + #[test] + fn bam_no_index_fails() { + let infile = "tests/cases/no_index.bam"; + let passed_args = vec![SUB, infile, "-c", "1"]; + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn bam_with_no_start_or_end_regions_and_missing_chromosomes() { + let infile = "tests/cases/no_start_end.bam"; + let passed_args = vec![SUB, infile, "-c", "1"]; + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + + cmd.args(passed_args).assert().success(); + } +} diff --git a/src/cli.rs b/src/cli.rs index 7f81f99..c98a0b3 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,4 +1,7 @@ -use clap::Parser; +use crate::alignment::Alignment; +use crate::reads::Reads; +use crate::Runner; +use clap::{Parser, Subcommand}; use regex::Regex; use std::ffi::OsStr; use std::fs::File; @@ -8,131 +11,41 @@ use std::path::{Path, PathBuf}; use std::str::FromStr; use thiserror::Error; -/// Randomly subsample reads to a specified coverage. +const CITATION: &str = r#"@article{Hall2022, + doi = {10.21105/joss.03941}, + url = {https://doi.org/10.21105/joss.03941}, + year = {2022}, + publisher = {The Open Journal}, + volume = {7}, + number = {69}, + pages = {3941}, + author = {Michael B. Hall}, + title = {Rasusa: Randomly subsample sequencing reads to a specified coverage}, + journal = {Journal of Open Source Software} +}"#; + +/// Randomly subsample reads or alignments #[derive(Debug, Parser)] -#[clap(author, version, about)] +#[command(author, version, about)] +#[command(propagate_version = true)] pub struct Cli { - /// The fast{a,q} file(s) to subsample. - /// - /// For paired Illumina you may either pass this flag twice `-i r1.fq -i r2.fq` or give two - /// files consecutively `-i r1.fq r2.fq`. - #[clap( - short = 'i', - long = "input", - value_parser = check_path_exists, - num_args = 1..=2, - required = true - )] - pub input: Vec, - - /// Output filepath(s); stdout if not present. - /// - /// For paired Illumina you may either pass this flag twice `-o o1.fq -o o2.fq` or give two - /// files consecutively `-o o1.fq o2.fq`. NOTE: The order of the pairs is assumed to be the - /// same as that given for --input. This option is required for paired input. - #[clap(short = 'o', long = "output", num_args = 1..=2)] - pub output: Vec, - - /// Genome size to calculate coverage with respect to. e.g., 4.3kb, 7Tb, 9000, 4.1MB - /// - /// Alternatively, a FASTA/Q index file can be provided and the genome size will be - /// set to the sum of all reference sequences. - /// - /// If --bases is not provided, this option and --coverage are required - #[clap( - short, - long, - required_unless_present_any = &["bases", "num", "frac"], - requires = "coverage", - value_name = "size|faidx", - conflicts_with_all = &["num", "frac"] - )] - pub genome_size: Option, - - /// The desired coverage to sub-sample the reads to - /// - /// If --bases is not provided, this option and --genome-size are required - #[clap( - short, - long, - value_name = "FLOAT", - required_unless_present_any = &["bases", "num", "frac"], - requires = "genome_size", - conflicts_with_all = &["num", "frac"] - )] - pub coverage: Option, - - /// Explicitly set the number of bases required e.g., 4.3kb, 7Tb, 9000, 4.1MB - /// - /// If this option is given, --coverage and --genome-size are ignored - #[clap(short, long, value_name = "bases", conflicts_with_all = &["num", "frac"])] - pub bases: Option, - - /// Subsample to a specific number of reads - /// - /// If paired-end reads are passed, this is the number of (matched) reads from EACH file. - /// This option accepts the same format as genome size - e.g., 1k will take 1000 reads - #[clap(short, long, value_name = "INT", conflicts_with = "frac")] - pub num: Option, - - /// Subsample to a fraction of the reads - e.g., 0.5 samples half the reads - /// - /// Values >1 and <=100 will be automatically converted - e.g., 25 => 0.25 - #[clap(short, long, value_name = "FLOAT", value_parser = parse_fraction, conflicts_with = "num")] - pub frac: Option, - - /// Random seed to use. - #[clap(short = 's', long = "seed", value_name = "INT")] - pub seed: Option, - /// Switch on verbosity. - #[clap(short)] + #[arg(short)] pub verbose: bool, - /// u: uncompressed; b: Bzip2; g: Gzip; l: Lzma; x: Xz (Lzma); z: Zstd - /// - /// Rasusa will attempt to infer the output compression format automatically from the filename - /// extension. This option is used to override that. If writing to stdout, the default is - /// uncompressed - #[clap(short = 'O', long, value_name = "u|b|g|l|x|z", value_parser = parse_compression_format)] - pub output_type: Option, - - /// Compression level to use if compressing output. Uses the default level for the format if - /// not specified. - #[clap(short = 'l', long, value_parser = parse_level, value_name = "1-21")] - pub compress_level: Option, + #[command(subcommand)] + pub(crate) command: Commands, } -impl Cli { - /// Checks there is a valid and equal number of `--input` and `--output` arguments given. - /// - /// # Errors - /// A [`CliError::BadInputOutputCombination`](#clierror) is returned for the following: - /// - Either `--input` or `--output` are passed more than twice - /// - An unequal number of `--input` and `--output` are passed. The only exception to - /// this is if one `--input` and zero `--output` are passed, in which case, the output - /// will be sent to STDOUT. - pub fn validate_input_output_combination(&self) -> Result<(), CliError> { - let out_len = self.output.len(); - let in_len = self.input.len(); - - if in_len > 2 { - let msg = String::from("Got more than 2 files for input."); - return Err(CliError::BadInputOutputCombination(msg)); - } - if out_len > 2 { - let msg = String::from("Got more than 2 files for output."); - return Err(CliError::BadInputOutputCombination(msg)); - } - match in_len as isize - out_len as isize { - diff if diff == 1 && in_len == 1 => Ok(()), - diff if diff != 0 => Err(CliError::BadInputOutputCombination(format!( - "Got {} --input but {} --output", - in_len, out_len - ))), - _ => Ok(()), - } - } +#[derive(Subcommand, Debug)] +pub enum Commands { + /// Randomly subsample reads + Reads(Reads), + /// Randomly subsample alignments to a specified depth of coverage + #[command(name = "aln")] + Alignment(Alignment), + /// Get a bibtex formatted citation for this package. + Cite(Cite), } /// A collection of custom errors relating to the command line interface for this package. @@ -167,6 +80,17 @@ pub enum CliError { FaidxError(String), } +#[derive(Debug, Parser)] +#[command(author, version, about)] +pub struct Cite {} + +impl Runner for Cite { + fn run(&mut self) -> anyhow::Result<()> { + println!("{}", CITATION); + Ok(()) + } +} + /// A metric suffix is a unit suffix used to indicate the multiples of (in this case) base pairs. /// For instance, the metric suffix 'Kb' refers to kilobases. Therefore, 6.9kb means 6900 base pairs. #[derive(PartialEq, Debug)] @@ -436,7 +360,7 @@ impl CompressionExt for niffler::compression::Format { } } -fn parse_compression_format(s: &str) -> Result { +pub(crate) fn parse_compression_format(s: &str) -> Result { match s { "b" | "B" => Ok(niffler::Format::Bzip), "g" | "G" => Ok(niffler::Format::Gzip), @@ -448,7 +372,7 @@ fn parse_compression_format(s: &str) -> Result + ?Sized>(s: &S) -> Result { +pub(crate) fn check_path_exists + ?Sized>(s: &S) -> Result { let path = PathBuf::from(s); if path.exists() { Ok(path) @@ -459,7 +383,7 @@ fn check_path_exists + ?Sized>(s: &S) -> Result /// A utility function to validate compression level is in allowed range #[allow(clippy::redundant_clone)] -fn parse_level(s: &str) -> Result { +pub(crate) fn parse_level(s: &str) -> Result { let lvl = match s.parse::() { Ok(1) => niffler::Level::One, Ok(2) => niffler::Level::Two, @@ -488,7 +412,7 @@ fn parse_level(s: &str) -> Result { } /// A utility function to parse the fraction CLI option to the range 0-1 -fn parse_fraction(s: &str) -> Result { +pub(crate) fn parse_fraction(s: &str) -> Result { let f = f32::from_str(s).map_err(|_| CliError::FractionOutOfRange(s.to_string()))?; if !(0.0..=100.0).contains(&f) { return Err(CliError::FractionOutOfRange(s.to_string())); @@ -500,10 +424,10 @@ fn parse_fraction(s: &str) -> Result { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; - const ERROR: f32 = f32::EPSILON; + pub(crate) const ERROR: f32 = f32::EPSILON; #[test] fn parse_fraction_negative_fraction_returns_error() { @@ -615,425 +539,6 @@ mod tests { assert!((expected - actual).abs() < ERROR) } - #[test] - fn no_args_given_raises_error() { - let passed_args = vec!["rasusa"]; - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::MissingRequiredArgument; - - assert_eq!(actual, expected) - } - - #[test] - fn no_input_file_given_raises_error() { - let passed_args = vec!["rasusa", "-c", "30", "-g", "3mb"]; - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::MissingRequiredArgument; - - assert_eq!(actual, expected) - } - - #[test] - fn no_coverage_given_raises_error() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-g", "3mb"]; - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::MissingRequiredArgument; - - assert_eq!(actual, expected) - } - - #[test] - fn invalid_coverage_given_raises_error() { - let passed_args = vec!["rasusa", "-i", "in.fq", "-g", "3mb", "-c", "foo"]; - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::ValueValidation; - - assert_eq!(actual, expected) - } - - #[test] - fn no_genome_size_given_raises_error() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-c", "5"]; - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::MissingRequiredArgument; - - assert_eq!(actual, expected) - } - - #[test] - fn no_genome_size_but_bases_and_coverage() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-b", "5", "-c", "7"]; - - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::MissingRequiredArgument; - - assert_eq!(actual, expected) - } - - #[test] - fn bases_and_coverage_and_genome_size_all_allowed() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-b", "5", "-c", "7", "-g", "5m"]; - - let args = Cli::try_parse_from(passed_args).unwrap(); - - assert_eq!(args.bases.unwrap(), GenomeSize(5)); - } - - #[test] - fn no_genome_size_or_coverage_given_but_bases_given() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-b", "5"]; - - let args = Cli::try_parse_from(passed_args).unwrap(); - - assert_eq!(args.bases.unwrap(), GenomeSize(5)); - } - - #[test] - fn no_genome_size_or_coverage_given_but_num_given() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-n", "5"]; - - let args = Cli::try_parse_from(passed_args).unwrap(); - - assert_eq!(args.num.unwrap(), GenomeSize(5)); - } - - #[test] - fn no_genome_size_or_coverage_given_but_frac_given() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-f", "5"]; - - let args = Cli::try_parse_from(passed_args).unwrap(); - - assert!((args.frac.unwrap() - 0.05_f32).abs() < ERROR); - } - - #[test] - fn num_and_coverage_and_genome_size_not_allowed() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-n", "5", "-c", "7", "-g", "5m"]; - - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::ArgumentConflict; - - assert_eq!(actual, expected) - } - - #[test] - fn frac_and_coverage_and_genome_size_not_allowed() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-f", "5", "-c", "7", "-g", "5m"]; - - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::ArgumentConflict; - - assert_eq!(actual, expected) - } - - #[test] - fn frac_and_bases_not_allowed() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-f", "5", "-b", "7"]; - - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::ArgumentConflict; - - assert_eq!(actual, expected) - } - - #[test] - fn frac_and_num_not_allowed() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-f", "5", "-n", "7"]; - - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::ArgumentConflict; - - assert_eq!(actual, expected) - } - - #[test] - fn bases_and_num_not_allowed() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-b", "5", "-n", "7"]; - - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::ArgumentConflict; - - assert_eq!(actual, expected) - } - - #[test] - fn invalid_genome_size_given_raises_error() { - let passed_args = vec!["rasusa", "-i", "in.fq", "-c", "5", "-g", "8jb"]; - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::ValueValidation; - - assert_eq!(actual, expected) - } - - #[test] - fn faidx_given_as_genome_size() { - let infile = "tests/cases/r1.fq.gz"; - let faidx = "tests/cases/h37rv.fa.fai"; - let passed_args = vec!["rasusa", "-i", infile, "-c", "5", "-g", faidx]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - assert_eq!(args.genome_size.unwrap(), GenomeSize(4411532)) - } - - #[test] - fn invalid_seed_given_raises_error() { - let passed_args = vec!["rasusa", "-i", "in.fq", "-c", "5", "-g", "8mb", "-s", "foo"]; - let args: Result = Cli::try_parse_from(passed_args); - - let actual = args.unwrap_err().kind(); - let expected = clap::error::ErrorKind::ValueValidation; - - assert_eq!(actual, expected) - } - - #[test] - fn all_valid_args_parsed_as_expected() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec![ - "rasusa", - "-i", - infile, - "-c", - "5", - "-g", - "8mb", - "-s", - "88", - "-o", - "my/output/file.fq", - ]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - assert_eq!(args.input[0], PathBuf::from_str(infile).unwrap()); - assert_eq!(args.coverage.unwrap(), Coverage(5.0)); - assert_eq!(args.genome_size.unwrap(), GenomeSize(8_000_000)); - assert_eq!(args.seed, Some(88)); - assert_eq!( - args.output[0], - PathBuf::from_str("my/output/file.fq").unwrap() - ) - } - - #[test] - fn all_valid_args_with_two_inputs_parsed_as_expected() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec![ - "rasusa", - "-i", - infile, - infile, - "-c", - "5", - "-g", - "8mb", - "-s", - "88", - "-o", - "my/output/file.fq", - ]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - let expected_input = vec![ - PathBuf::from_str(infile).unwrap(), - PathBuf::from_str(infile).unwrap(), - ]; - assert_eq!(args.input, expected_input); - assert_eq!(args.coverage.unwrap(), Coverage(5.0)); - assert_eq!(args.genome_size.unwrap(), GenomeSize(8_000_000)); - assert_eq!(args.seed, Some(88)); - assert_eq!( - args.output[0], - PathBuf::from_str("my/output/file.fq").unwrap() - ) - } - - #[test] - fn all_valid_args_with_two_inputs_using_flag_twice_parsed_as_expected() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec![ - "rasusa", - "-i", - infile, - "-i", - infile, - "-c", - "5", - "-g", - "8mb", - "-s", - "88", - "-o", - "my/output/file.fq", - ]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - let expected_input = vec![ - PathBuf::from_str(infile).unwrap(), - PathBuf::from_str(infile).unwrap(), - ]; - assert_eq!(args.input, expected_input); - assert_eq!(args.coverage.unwrap(), Coverage(5.0)); - assert_eq!(args.genome_size.unwrap(), GenomeSize(8_000_000)); - assert_eq!(args.seed, Some(88)); - assert_eq!( - args.output[0], - PathBuf::from_str("my/output/file.fq").unwrap() - ) - } - - #[test] - fn three_inputs_raises_error() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec![ - "rasusa", - "-i", - infile, - "-i", - infile, - "-i", - infile, - "-c", - "5", - "-g", - "8mb", - "-s", - "88", - "-o", - "my/output/file.fq", - ]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - let actual: CliError = args.validate_input_output_combination().unwrap_err(); - let expected = - CliError::BadInputOutputCombination(String::from("Got more than 2 files for input.")); - - assert_eq!(actual, expected) - } - - #[test] - fn three_outputs_raises_error() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec![ - "rasusa", - "-i", - infile, - "-i", - infile, - "-c", - "5", - "-g", - "8mb", - "-s", - "88", - "-o", - "my/output/file.fq", - "-o", - "out.fq", - "-o", - "out.fq", - ]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - let actual: CliError = args.validate_input_output_combination().unwrap_err(); - let expected = - CliError::BadInputOutputCombination(String::from("Got more than 2 files for output.")); - - assert_eq!(actual, expected) - } - - #[test] - fn one_input_no_output_is_ok() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec!["rasusa", "-i", infile, "-c", "5", "-g", "8mb", "-s", "88"]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - let actual = args.validate_input_output_combination(); - - assert!(actual.is_ok()) - } - - #[test] - fn two_inputs_one_output_raises_error() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec![ - "rasusa", "-i", infile, "-i", infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", - "out.fq", - ]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - let actual: CliError = args.validate_input_output_combination().unwrap_err(); - let expected = - CliError::BadInputOutputCombination(String::from("Got 2 --input but 1 --output")); - - assert_eq!(actual, expected) - } - - #[test] - fn one_input_two_outputs_raises_error() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec![ - "rasusa", "-i", infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", "out.fq", "-o", - "out.fq", - ]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - let actual: CliError = args.validate_input_output_combination().unwrap_err(); - let expected = - CliError::BadInputOutputCombination(String::from("Got 1 --input but 2 --output")); - - assert_eq!(actual, expected) - } - - #[test] - fn two_input_two_outputs_is_ok() { - let infile = "tests/cases/r1.fq.gz"; - let passed_args = vec![ - "rasusa", "-i", infile, "-i", infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", - "out.fq", "-o", "out.fq", - ]; - let args = Cli::try_parse_from(passed_args).unwrap(); - - let actual = args.validate_input_output_combination(); - - assert!(actual.is_ok()) - } - #[test] fn float_multiply_with_base_unit() { let actual = 4.5 * MetricSuffix::Base; diff --git a/src/main.rs b/src/main.rs index f7b32d9..a740ecc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,24 +2,26 @@ extern crate core; -// due to a structopt problem -use std::io::stdout; - -use anyhow::{Context, Result}; +use anyhow::Result; use clap::Parser; use env_logger::Builder; -use log::{debug, error, info, warn, LevelFilter}; -use niffler::compression; +use log::{debug, LevelFilter}; pub use crate::cli::Cli; -use crate::cli::Coverage; +use crate::cli::Commands; pub use crate::fastx::Fastx; pub use crate::subsampler::SubSampler; +mod alignment; mod cli; mod fastx; +mod reads; mod subsampler; +pub trait Runner { + fn run(&mut self) -> Result<()>; +} + fn main() -> Result<()> { let args: Cli = Cli::parse(); // Initialize logger @@ -37,151 +39,13 @@ fn main() -> Result<()> { debug!("{:?}", args); - args.validate_input_output_combination()?; - let is_paired = args.input.len() == 2; - if is_paired { - info!("Two input files given. Assuming paired Illumina...") - } - - let input_fastx = Fastx::from_path(&args.input[0]); - - let mut output_handle = match args.output.len() { - 0 => match args.output_type { - None => Box::new(stdout()), - Some(fmt) => { - let lvl = match fmt { - compression::Format::Gzip => compression::Level::Six, - compression::Format::Bzip => compression::Level::Nine, - compression::Format::Lzma => compression::Level::Six, - compression::Format::Zstd => compression::Level::Three, - _ => compression::Level::Zero, - }; - niffler::basic::get_writer(Box::new(stdout()), fmt, lvl)? - } - }, - _ => { - let out_fastx = Fastx::from_path(&args.output[0]); - out_fastx - .create(args.compress_level, args.output_type) - .context("unable to create the first output file")? - } - }; - - let target_total_bases: Option = match (args.genome_size, args.coverage, args.bases) { - (_, _, Some(bases)) => Some(u64::from(bases)), - (Some(gsize), Some(cov), _) => Some(gsize * cov), - _ => None, + let mut subcmd: Box = match args.command { + Commands::Reads(cmd) => Box::new(cmd), + Commands::Alignment(cmd) => Box::new(cmd), + Commands::Cite(cmd) => Box::new(cmd), }; - if target_total_bases.is_some() { - info!( - "Target number of bases to subsample to is: {}", - target_total_bases.unwrap() - ); - } - - info!("Gathering read lengths..."); - let mut read_lengths = input_fastx - .read_lengths() - .context("unable to gather read lengths for the first input file")?; - - if is_paired { - let second_input_fastx = Fastx::from_path(&args.input[1]); - let expected_num_reads = read_lengths.len(); - info!("Gathering read lengths for second input file..."); - let mate_lengths = second_input_fastx - .read_lengths() - .context("unable to gather read lengths for the second input file")?; - - if mate_lengths.len() != expected_num_reads { - error!("First input has {} reads, but the second has {} reads. Paired Illumina files are assumed to have the same number of reads. The results of this subsample may not be as expected now.", expected_num_reads, read_lengths.len()); - std::process::exit(1); - } else { - info!( - "Both input files have the same number of reads ({}) 👍", - expected_num_reads - ); - } - // add the paired read lengths to the existing lengths - for (i, len) in mate_lengths.iter().enumerate() { - read_lengths[i] += len; - } - } - info!("{} reads detected", read_lengths.len()); - - // calculate the depth of coverage if using coverage-based subsampling - if args.genome_size.is_some() { - let number_of_bases: u64 = read_lengths.iter().map(|&x| x as u64).sum(); - let depth_of_covg = (number_of_bases as f64) / f64::from(args.genome_size.unwrap()); - info!("Input coverage is {:.2}x", depth_of_covg); - } - - let num_reads = match (args.num, args.frac) { - (Some(n), None) => Some(u64::from(n)), - (None, Some(f)) => { - let n = ((f as f64) * (read_lengths.len() as f64)).round() as u64; - if n == 0 { - warn!( - "Requested fraction of reads ({} * {}) was rounded to 0", - f, - read_lengths.len() - ); - } - Some(n) - } - _ => None, - }; - - let subsampler = SubSampler { - target_total_bases, - seed: args.seed, - num_reads, - }; - - let (reads_to_keep, nb_reads_to_keep) = subsampler.indices(&read_lengths); - if is_paired { - info!("Keeping {} reads from each input", nb_reads_to_keep); - } else { - info!("Keeping {} reads", nb_reads_to_keep); - } - debug!("Indices of reads being kept:\n{:?}", reads_to_keep); - - let mut total_kept_bases = - input_fastx.filter_reads_into(&reads_to_keep, nb_reads_to_keep, &mut output_handle)? as u64; - - // repeat the same process for the second input fastx (if illumina) - if is_paired { - let second_input_fastx = Fastx::from_path(&args.input[1]); - let second_out_fastx = Fastx::from_path(&args.output[1]); - let mut second_output_handle = second_out_fastx - .create(args.compress_level, args.output_type) - .context("unable to create the second output file")?; - - total_kept_bases += second_input_fastx.filter_reads_into( - &reads_to_keep, - nb_reads_to_keep, - &mut second_output_handle, - )? as u64; - } - - if let Some(gsize) = args.genome_size { - let actual_covg = total_kept_bases / gsize; - // safe to unwrap args.coverage as we have already ensured genome size and coverage co-occur - if Coverage(actual_covg as f32) < args.coverage.unwrap() { - warn!( - "Requested coverage ({:.2}x) is not possible as the actual coverage is {:.2}x - \ - output will be the same as the input", - args.coverage.unwrap().0, - actual_covg - ); - } else { - info!("Actual coverage of kept reads is {:.2}x", actual_covg); - } - } else { - info!("Kept {} bases", total_kept_bases); - } - - info!("Done 🎉"); + subcmd.run()?; Ok(()) } diff --git a/src/reads.rs b/src/reads.rs new file mode 100644 index 0000000..d1d7a55 --- /dev/null +++ b/src/reads.rs @@ -0,0 +1,584 @@ +use crate::cli::{ + check_path_exists, parse_compression_format, parse_fraction, parse_level, CliError, Coverage, + GenomeSize, +}; +use crate::{Fastx, Runner, SubSampler}; +use anyhow::{Context, Result}; +use clap::Parser; +use log::{debug, error, info, warn}; +use niffler::compression; +use std::io::stdout; +use std::path::PathBuf; + +#[derive(Debug, Parser)] +#[command(author, version, about)] +pub struct Reads { + /// The fast{a,q} file(s) to subsample. + /// + /// For paired Illumina, the order matters. i.e., R1 then R2. + #[arg( + value_parser = check_path_exists, + num_args = 1..=2, + required = true, + name = "FILE(S)" + )] + pub input: Vec, + + /// Output filepath(s); stdout if not present. + /// + /// For paired Illumina you may either pass this flag twice `-o o1.fq -o o2.fq` or give two + /// files consecutively `-o o1.fq o2.fq`. NOTE: The order of the pairs is assumed to be the + /// same as the input - e.g., R1 then R2. This option is required for paired input. + #[clap(short = 'o', long = "output", num_args = 1..=2)] + pub output: Vec, + + /// Genome size to calculate coverage with respect to. e.g., 4.3kb, 7Tb, 9000, 4.1MB + /// + /// Alternatively, a FASTA/Q index file can be provided and the genome size will be + /// set to the sum of all reference sequences. + /// + /// If --bases is not provided, this option and --coverage are required + #[clap( + short, + long, + required_unless_present_any = &["bases", "num", "frac"], + requires = "coverage", + value_name = "size|faidx", + conflicts_with_all = &["num", "frac"] + )] + pub genome_size: Option, + + /// The desired depth of coverage to subsample the reads to + /// + /// If --bases is not provided, this option and --genome-size are required + #[clap( + short, + long, + value_name = "FLOAT", + required_unless_present_any = &["bases", "num", "frac"], + requires = "genome_size", + conflicts_with_all = &["num", "frac"] + )] + pub coverage: Option, + + /// Explicitly set the number of bases required e.g., 4.3kb, 7Tb, 9000, 4.1MB + /// + /// If this option is given, --coverage and --genome-size are ignored + #[clap(short, long, value_name = "bases", conflicts_with_all = &["num", "frac"])] + pub bases: Option, + + /// Subsample to a specific number of reads + /// + /// If paired-end reads are passed, this is the number of (matched) reads from EACH file. + /// This option accepts the same format as genome size - e.g., 1k will take 1000 reads + #[clap(short, long, value_name = "INT", conflicts_with = "frac")] + pub num: Option, + + /// Subsample to a fraction of the reads - e.g., 0.5 samples half the reads + /// + /// Values >1 and <=100 will be automatically converted - e.g., 25 => 0.25 + #[clap(short, long, value_name = "FLOAT", value_parser = parse_fraction, conflicts_with = "num")] + pub frac: Option, + + /// Random seed to use. + #[clap(short = 's', long = "seed", value_name = "INT")] + pub seed: Option, + + /// Switch on verbosity. + #[clap(short)] + pub verbose: bool, + + /// u: uncompressed; b: Bzip2; g: Gzip; l: Lzma; x: Xz (Lzma); z: Zstd + /// + /// Rasusa will attempt to infer the output compression format automatically from the filename + /// extension. This option is used to override that. If writing to stdout, the default is + /// uncompressed + #[clap(short = 'O', long, value_name = "u|b|g|l|x|z", value_parser = parse_compression_format)] + pub output_type: Option, + + /// Compression level to use if compressing output. Uses the default level for the format if + /// not specified. + #[clap(short = 'l', long, value_parser = parse_level, value_name = "1-21")] + pub compress_level: Option, +} + +impl Reads { + /// Checks there is a valid and equal number of `--input` and `--output` arguments given. + /// + /// # Errors + /// A [`CliError::BadInputOutputCombination`](#clierror) is returned for the following: + /// - Either `--input` or `--output` are passed more than twice + /// - An unequal number of `--input` and `--output` are passed. The only exception to + /// this is if one `--input` and zero `--output` are passed, in which case, the output + /// will be sent to STDOUT. + pub fn validate_input_output_combination(&self) -> std::result::Result<(), CliError> { + let out_len = self.output.len(); + let in_len = self.input.len(); + + if in_len > 2 { + let msg = String::from("Got more than 2 files for input."); + return Err(CliError::BadInputOutputCombination(msg)); + } + if out_len > 2 { + let msg = String::from("Got more than 2 files for output."); + return Err(CliError::BadInputOutputCombination(msg)); + } + match in_len as isize - out_len as isize { + diff if diff == 1 && in_len == 1 => Ok(()), + diff if diff != 0 => Err(CliError::BadInputOutputCombination(format!( + "Got {} --input but {} --output", + in_len, out_len + ))), + _ => Ok(()), + } + } +} + +impl Runner for Reads { + fn run(&mut self) -> Result<()> { + self.validate_input_output_combination()?; + let is_paired = self.input.len() == 2; + if is_paired { + info!("Two input files given. Assuming paired Illumina...") + } + + let input_fastx = Fastx::from_path(&self.input[0]); + + let mut output_handle = match self.output.len() { + 0 => match self.output_type { + None => Box::new(stdout()), + Some(fmt) => { + let lvl = match fmt { + compression::Format::Gzip => compression::Level::Six, + compression::Format::Bzip => compression::Level::Nine, + compression::Format::Lzma => compression::Level::Six, + compression::Format::Zstd => compression::Level::Three, + _ => compression::Level::Zero, + }; + niffler::basic::get_writer(Box::new(stdout()), fmt, lvl)? + } + }, + _ => { + let out_fastx = Fastx::from_path(&self.output[0]); + out_fastx + .create(self.compress_level, self.output_type) + .context("unable to create the first output file")? + } + }; + + let target_total_bases: Option = match (self.genome_size, self.coverage, self.bases) { + (_, _, Some(bases)) => Some(u64::from(bases)), + (Some(gsize), Some(cov), _) => Some(gsize * cov), + _ => None, + }; + + if target_total_bases.is_some() { + info!( + "Target number of bases to subsample to is: {}", + target_total_bases.unwrap() + ); + } + + info!("Gathering read lengths..."); + let mut read_lengths = input_fastx + .read_lengths() + .context("unable to gather read lengths for the first input file")?; + + if is_paired { + let second_input_fastx = Fastx::from_path(&self.input[1]); + let expected_num_reads = read_lengths.len(); + info!("Gathering read lengths for second input file..."); + let mate_lengths = second_input_fastx + .read_lengths() + .context("unable to gather read lengths for the second input file")?; + + if mate_lengths.len() != expected_num_reads { + error!("First input has {} reads, but the second has {} reads. Paired Illumina files are assumed to have the same number of reads. The results of this subsample may not be as expected now.", expected_num_reads, read_lengths.len()); + std::process::exit(1); + } else { + info!( + "Both input files have the same number of reads ({}) 👍", + expected_num_reads + ); + } + // add the paired read lengths to the existing lengths + for (i, len) in mate_lengths.iter().enumerate() { + read_lengths[i] += len; + } + } + info!("{} reads detected", read_lengths.len()); + + // calculate the depth of coverage if using coverage-based subsampling + if self.genome_size.is_some() { + let number_of_bases: u64 = read_lengths.iter().map(|&x| x as u64).sum(); + let depth_of_covg = (number_of_bases as f64) / f64::from(self.genome_size.unwrap()); + info!("Input coverage is {:.2}x", depth_of_covg); + } + + let num_reads = match (self.num, self.frac) { + (Some(n), None) => Some(u64::from(n)), + (None, Some(f)) => { + let n = ((f as f64) * (read_lengths.len() as f64)).round() as u64; + if n == 0 { + warn!( + "Requested fraction of reads ({} * {}) was rounded to 0", + f, + read_lengths.len() + ); + } + Some(n) + } + _ => None, + }; + + let subsampler = SubSampler { + target_total_bases, + seed: self.seed, + num_reads, + }; + + let (reads_to_keep, nb_reads_to_keep) = subsampler.indices(&read_lengths); + if is_paired { + info!("Keeping {} reads from each input", nb_reads_to_keep); + } else { + info!("Keeping {} reads", nb_reads_to_keep); + } + debug!("Indices of reads being kept:\n{:?}", reads_to_keep); + + let mut total_kept_bases = + input_fastx.filter_reads_into(&reads_to_keep, nb_reads_to_keep, &mut output_handle)? + as u64; + + // repeat the same process for the second input fastx (if illumina) + if is_paired { + let second_input_fastx = Fastx::from_path(&self.input[1]); + let second_out_fastx = Fastx::from_path(&self.output[1]); + let mut second_output_handle = second_out_fastx + .create(self.compress_level, self.output_type) + .context("unable to create the second output file")?; + + total_kept_bases += second_input_fastx.filter_reads_into( + &reads_to_keep, + nb_reads_to_keep, + &mut second_output_handle, + )? as u64; + } + + if let Some(gsize) = self.genome_size { + let actual_covg = total_kept_bases / gsize; + // safe to unwrap self.coverage as we have already ensured genome size and coverage co-occur + if Coverage(actual_covg as f32) < self.coverage.unwrap() { + warn!( + "Requested coverage ({:.2}x) is not possible as the actual coverage is {:.2}x - \ + output will be the same as the input", + self.coverage.unwrap().0, + actual_covg + ); + } else { + info!("Actual coverage of kept reads is {:.2}x", actual_covg); + } + } else { + info!("Kept {} bases", total_kept_bases); + } + + info!("Done 🎉"); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use assert_cmd::Command; + + const SUB: &str = "reads"; + + #[test] + fn no_args_given_raises_error() { + let passed_args = vec![SUB]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn no_input_file_given_raises_error() { + let passed_args = vec![SUB, "-c", "30", "-g", "3mb"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn no_coverage_given_raises_error() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-g", "3mb"]; + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn invalid_coverage_given_raises_error() { + let passed_args = vec![SUB, "in.fq", "-g", "3mb", "-c", "foo"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn no_genome_size_given_raises_error() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-c", "5"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn no_genome_size_but_bases_and_coverage() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-b", "5", "-c", "7"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn bases_and_coverage_and_genome_size_all_allowed() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-b", "5", "-c", "7", "-g", "5m"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } + + #[test] + fn no_genome_size_or_coverage_given_but_bases_given() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-b", "5"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } + + #[test] + fn no_genome_size_or_coverage_given_but_num_given() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-n", "5"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } + + #[test] + fn no_genome_size_or_coverage_given_but_frac_given() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-f", "5"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } + + #[test] + fn num_and_coverage_and_genome_size_not_allowed() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-n", "5", "-c", "7", "-g", "5m"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn frac_and_coverage_and_genome_size_not_allowed() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-f", "5", "-c", "7", "-g", "5m"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn frac_and_bases_not_allowed() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-f", "5", "-b", "7"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn frac_and_num_not_allowed() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-f", "5", "-n", "7"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn bases_and_num_not_allowed() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-b", "5", "-n", "7"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn invalid_genome_size_given_raises_error() { + let passed_args = vec![SUB, "in.fq", "-c", "5", "-g", "8jb"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn faidx_given_as_genome_size() { + let infile = "tests/cases/r1.fq.gz"; + let faidx = "tests/cases/h37rv.fa.fai"; + let passed_args = vec![SUB, infile, "-c", "5", "-g", faidx]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } + + #[test] + fn invalid_seed_given_raises_error() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-c", "5", "-g", "8mb", "-s", "foo"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn all_valid_args_parsed_as_expected() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![ + SUB, + infile, + "-c", + "5", + "-g", + "8mb", + "-s", + "88", + "-o", + "/dev/null", + ]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } + + #[test] + fn all_valid_args_with_two_inputs_parsed_as_expected() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![ + SUB, + infile, + infile, + "-c", + "5", + "-g", + "8mb", + "-s", + "88", + "-o", + "/dev/null", + "/dev/null", + ]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } + + #[test] + fn three_inputs_raises_error() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![ + SUB, + infile, + infile, + infile, + "-c", + "5", + "-g", + "8mb", + "-s", + "88", + "-o", + "my/output/file.fq", + ]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn three_outputs_raises_error() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![ + SUB, + infile, + infile, + "-c", + "5", + "-g", + "8mb", + "-s", + "88", + "-o", + "my/output/file.fq", + "-o", + "out.fq", + "-o", + "out.fq", + ]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn one_input_no_output_is_ok() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![SUB, infile, "-c", "5", "-g", "8mb", "-s", "88"]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } + + #[test] + fn two_inputs_one_output_raises_error() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![ + SUB, infile, infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", "out.fq", + ]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn one_input_two_outputs_raises_error() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![ + SUB, infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", "out.fq", "-o", "out.fq", + ]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().failure(); + } + + #[test] + fn two_input_two_outputs_is_ok() { + let infile = "tests/cases/r1.fq.gz"; + let passed_args = vec![ + SUB, infile, infile, "-c", "5", "-g", "8mb", "-s", "88", "-o", "out.fq", "-o", "out.fq", + ]; + + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); + cmd.args(passed_args).assert().success(); + } +} diff --git a/tests/cases/no_index.bam b/tests/cases/no_index.bam new file mode 100644 index 0000000..aa1ed45 Binary files /dev/null and b/tests/cases/no_index.bam differ diff --git a/tests/cases/no_start_end.bam b/tests/cases/no_start_end.bam new file mode 100644 index 0000000..a335911 Binary files /dev/null and b/tests/cases/no_start_end.bam differ diff --git a/tests/cases/no_start_end.bam.bai b/tests/cases/no_start_end.bam.bai new file mode 100644 index 0000000..b0b218f Binary files /dev/null and b/tests/cases/no_start_end.bam.bai differ diff --git a/tests/cases/test.bam b/tests/cases/test.bam new file mode 100644 index 0000000..8e8204b Binary files /dev/null and b/tests/cases/test.bam differ diff --git a/tests/cases/test.bam.bai b/tests/cases/test.bam.bai new file mode 100644 index 0000000..8c107b3 Binary files /dev/null and b/tests/cases/test.bam.bai differ diff --git a/tests/main.rs b/tests/main.rs index 6d3e33e..760a7eb 100644 --- a/tests/main.rs +++ b/tests/main.rs @@ -3,10 +3,11 @@ use assert_cmd::Command; use predicates::prelude::*; const BIN: &str = "rasusa"; +const READS: &str = "reads"; #[test] fn input_file_doesnt_exist() -> Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; - cmd.args(vec!["-i", "file/doesnt/exist.fa", "-g", "5mb", "-c", "20"]); + cmd.args(vec![READS, "file/doesnt/exist.fa", "-g", "5mb", "-c", "20"]); cmd.assert() .failure() .stderr(predicate::str::contains("does not exist")); @@ -18,7 +19,7 @@ fn input_file_doesnt_exist() -> Result<(), Box> { fn output_file_in_nonexistant_dir() -> Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; cmd.args(vec![ - "-i", + READS, "tests/cases/file1.fq.gz", "-g", "5mb", @@ -38,7 +39,7 @@ fn output_file_in_nonexistant_dir() -> Result<(), Box> { fn valid_inputs_raises_no_errors() -> Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; cmd.args(vec![ - "-i", + READS, "tests/cases/file1.fq.gz", "-g", "5mb", @@ -56,7 +57,7 @@ fn input_and_output_filetypes_different_raises_no_errors() -> Result<(), Box Result<(), Box Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; cmd.args(vec![ - "-i", + READS, "tests/cases/file1.fq.gz", "-g", "5mb", @@ -97,7 +98,7 @@ fn invalid_input_and_output_combination_raises_error() -> Result<(), Box Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; - cmd.args(vec!["-i", "tests/cases/file1.fq.gz", "-n", "5m"]); + cmd.args(vec![READS, "tests/cases/file1.fq.gz", "-n", "5m"]); cmd.assert().success(); @@ -107,7 +108,7 @@ fn num_instead_of_coverage_based_raises_no_errors() -> Result<(), Box Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; - cmd.args(vec!["-i", "tests/cases/file1.fq.gz", "-f", "0.2"]); + cmd.args(vec![READS, "tests/cases/file1.fq.gz", "-f", "0.2"]); cmd.assert().success(); @@ -118,7 +119,7 @@ fn frac_instead_of_coverage_based_raises_no_errors() -> Result<(), Box Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; cmd.args(vec![ - "-i", + READS, "tests/cases/file1.fq.gz", "tests/cases/r2.fq.gz", "-g", @@ -142,7 +143,7 @@ fn unequal_number_of_reads_raises_error() -> Result<(), Box Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; cmd.args(vec![ - "-i", + READS, "tests/cases/r1.fq.gz", "tests/cases/r2.fq.gz", "-g", @@ -163,9 +164,8 @@ fn two_valid_illumina_inputs_suceeds() -> Result<(), Box> fn num_from_each_with_paired_reads() -> Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; cmd.args(vec![ - "-i", + READS, "tests/cases/r1.fq.gz", - "-i", "tests/cases/r2.fq.gz", "-n", "1", @@ -184,7 +184,7 @@ fn num_from_each_with_paired_reads() -> Result<(), Box> { fn frac_from_each_with_paired_reads() -> Result<(), Box> { let mut cmd = Command::cargo_bin(BIN)?; cmd.args(vec![ - "-i", + READS, "tests/cases/r1.fq.gz", "tests/cases/r2.fq.gz", "-f",