diff --git a/.editorconfig b/.editorconfig index 2951ad8..2813def 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,8 +8,62 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,cff}] +[*.{md,yml,yaml,html,css,scss,js,cff}] indent_size = 2 -[*.nf.test] -insert_final_newline = false +# These files are edited and tested upstream in nf-core/modules +[/modules/nf-core/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +indent_size = unset + +[/subworkflows/nf-core/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +indent_size = unset + +# These files are edited and tested upstream in pfr/modules +[/modules/pfr/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +indent_size = unset + +[/subworkflows/pfr/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +indent_size = unset + +[/assets/email*] +indent_size = unset + +# ignore Readme +[README.md] +indent_style = unset + +# ignore python +[*.{py}] +indent_style = unset + +# ignore perl +[*.{pl,pm}] +indent_size = unset + +# ignore drawio +[*.drawio] +indent_size = unset + +# ignore LICENSE +[LICENSE] +indent_size = unset diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4de425f..04497a0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,37 +13,22 @@ concurrency: cancel-in-progress: true jobs: - prettier: - runs-on: ubuntu-latest - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Install NodeJS - uses: actions/setup-node@v4 - with: - node-version: "20" - - - name: Install Prettier - run: npm install -g prettier - - - name: Run Prettier --check - run: prettier --check . 
- - editorconfig: + pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v5 with: - node-version: "20" + python-version: 3.11 + cache: "pip" - - name: Install editorconfig-checker - run: npm install -g editorconfig-checker + - name: Install pre-commit + run: pip install pre-commit - - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(git ls-files | grep -v test | grep -v LICENSE) + - name: Run pre-commit + run: pre-commit run --all-files stub-test: runs-on: ubuntu-latest @@ -54,31 +39,22 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/checkout@v4 - with: - repository: PlantandFoodResearch/pangene-test - ssh-key: ${{ secrets.PANGENE_TEST_DEPLOY_KEY }} - path: pangene-test - - - uses: actions/setup-java@v3 - with: - distribution: "temurin" - java-version: "17" - - name: Setup Nextflow + - name: Install Nextflow uses: nf-core/setup-nextflow@v1 + with: + version: "23.04.4" - name: Run stub-test run: | - nextflow \ + nextflow run \ main.nf \ -profile local,docker \ - -resume \ -stub \ - -params-file conf/test_params.json + -params-file tests/stub/params.json confirm-pass: runs-on: ubuntu-latest - needs: [prettier, editorconfig, stub-test] + needs: [pre-commit, stub-test] if: always() steps: - name: All tests ok diff --git a/.gitignore b/.gitignore index 62d31c0..8bdcb04 100644 --- a/.gitignore +++ b/.gitignore @@ -12,5 +12,4 @@ __pycache__ *.stdout *.stderr -.literature pangene-test/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fc52181..bc85d76 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,3 +3,18 @@ repos: rev: "v3.1.0" hooks: - id: prettier + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python + rev: "2.7.3" + hooks: + - id: editorconfig-checker + alias: ec + - repo: local + hooks: + - id: version_checks + name: Version checks + language: system + entry: 
> + ./version_check.sh + always_run: true + fail_fast: true + pass_filenames: false diff --git a/.prettierignore b/.prettierignore index 24a3687..543341f 100644 --- a/.prettierignore +++ b/.prettierignore @@ -15,5 +15,4 @@ __pycache__ *.stdout *.stderr -.literature pangene-test/ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3eb74a0 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,63 @@ +# PlantandFoodResearch/pangene: Changelog + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## 0.3.0 - [30-April-2024] + +### `Added` + +1. Added changelog and semantic versioning +2. Changed license to MIT +3. Updated `.editorconfig` +4. Moved .literature to test/ branch +5. Renamed `pangene_local` to `local_pangene` +6. Renamed `pangene_pfr` to `pfr_pangene` +7. Added version checking +8. Updated GitHub workflow to use pre-commit instead of prettier and editorconfig check +9. Added central singularity cache dir for pfr config +10. Added `SORTMERNA_INDEX` before `SORTMERNA` +11. Fixed sample contamination bug introduced by `file.simpleName` +12. Now using empty files for stub testing in CI +13. Now BRAKER can be skipped by including BRAKER outputs from previous runs in the `target_assemblies` param +14. Added `gffcompare` to merge liftoff annotations +15. Renamed `samplesheet` param to `fastq` +16. Now using assemblysheet in combination with nf-validation for assembly input +17. Added nextflow_schema.json +18. Now using nf-validation to validate fastqsheet provided by params.fastq +19. Moved `manifest.config` and `reporting_defaults.config` content to `nextflow.config` +20. Now using a txt file for `params.external_protein_fastas` +21. Now using nf-validation for `params.liftoff_annotations` +22. Now using nf-validation for all the parameters +23. Added `PURGE_BREAKER_MODELS` sub-workflow +24. 
Added `GFF_EGGNOGMAPPER` sub-workflow +25. Now using a custom version of `GFFREAD` which supports `meta` and `fasta` +26. Now using TSEBRA to purge models which do not have full intron support from BRAKER hints +27. Added params `eggnogmapper_evalue` and `eggnogmapper_pident` +28. Added `PURGE_NOHIT_BRAKER_MODELS` sub-workflow +29. Now merging BRAKER and liftoff models before running eggnogmapper +30. Added `GFF_MERGE_CLEANUP` sub-workflow +31. Now using `description` field to store notes and textual annotations in the gff files +32. Now using `mRNA` in place of `transcript` in gff files +33. Now `eggnogmapper_purge_nohits` is set to `false` by default +34. Added `GFF_STORE` sub-workflow +35. `external_protein_fastas` and `eggnogmapper_db_dir` are not mandatory parameters +36. Added contributors +37. Added a document for the pipeline parameters +38. Updated `pfr_pangene` and `pfr/profile.config` +39. Now using local tests/stub files for GitHub CI +40. Now removing isoforms left by TSEBRA using `AGAT_SPFILTERFEATUREFROMKILLLIST` +41. Added `pyproject.toml` +42. Now using PFAMs from eggnog if description is '-' + +### `Fixed` + +1. Removed liftoff models with `valid_ORF=False` +2. Updated license text to include 'Copyright (c) 2024 The New Zealand Institute for Plant and Food Research Limited' + +### `Dependencies` + +1. Nextflow!>=23.04.4 +2. nf-validation=1.1.3 + +### `Deprecated` diff --git a/LICENSE b/LICENSE index f288702..2ef2204 100644 --- a/LICENSE +++ b/LICENSE @@ -1,674 +1,21 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. 
By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. 
- - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. 
- - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. 
- - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. 
The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. 
This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. 
For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. 
Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. 
Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. 
- - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. 
+MIT License + +Copyright (c) 2024 The New Zealand Institute for Plant and Food Research Limited + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 0717c20..d255a36 100644 --- a/README.md +++ b/README.md @@ -1,109 +1,67 @@ # PANGENE -[![Lint and -stub on Linux/Docker](https://github.com/PlantandFoodResearch/pangene/actions/workflows/test.yml/badge.svg)](https://github.com/PlantandFoodResearch/pangene/actions/workflows/test.yml) - -A NextFlow pipeline for pan-genome annotation. 
- -## Pipeline Flowchart - -```mermaid -flowchart TD - subgraph PrepareAssembly [ ] - TARGET_ASSEMBLIES - TE_LIBRARIES - FASTA_VALIDATE - fasta_file_from_fasta_validate - EDTA - REPEATMODELER - te_lib_absent_node - REPEATMASKER - end - - TARGET_ASSEMBLIES(["[target_assemblies]"]) - TE_LIBRARIES(["[te_libs]"]) - TARGET_ASSEMBLIES --> FASTA_VALIDATE - FASTA_VALIDATE --- |Fasta|fasta_file_from_fasta_validate(( )) - fasta_file_from_fasta_validate --> |or|EDTA - fasta_file_from_fasta_validate --> |default|REPEATMODELER - REPEATMODELER --- te_lib_absent_node(( )) - EDTA --- te_lib_absent_node - TE_LIBRARIES --> REPEATMASKER - te_lib_absent_node --> REPEATMASKER - - subgraph Samplesheet [ ] - SAMPLESHEET - CAT_FASTQ - FASTQC - FASTP - FASTP_FASTQC - SORTMERNA - fasta_file_for_star - STAR - SAMTOOLS_CAT - end - - SAMPLESHEET([samplesheet]) - SAMPLESHEET --> |Tech. reps|CAT_FASTQ - CAT_FASTQ --> FASTQC - SAMPLESHEET --> FASTQC - FASTQC --> FASTP - FASTP --> FASTP_FASTQC[FASTQC] - FASTP_FASTQC --> SORTMERNA - fasta_file_for_star(( )) - fasta_file_for_star --> |Fasta|STAR - SORTMERNA --> STAR - STAR --> SAMTOOLS_CAT - - subgraph Annotation [ ] - anno_fasta(( )) - anno_masked_fasta(( )) - anno_bam(( )) - EXTERNAL_PROTEIN_SEQS(["[ext_prots]"]) - XREF_ANNOTATIONS(["[xref_annotations]"]) - CAT - BRAKER3 - GFFREAD - LIFTOFF - end - - PrepareAssembly --> |Fasta, Masked fasta|Annotation - Samplesheet --> |RNASeq bam|Annotation - - XREF_ANNOTATIONS --> |xref_gff|GFFREAD - XREF_ANNOTATIONS --> |xref_fasta|LIFTOFF - GFFREAD --> LIFTOFF - anno_fasta --> |Fasta|LIFTOFF - - EXTERNAL_PROTEIN_SEQS --> CAT - anno_masked_fasta --> |Masked fasta|BRAKER3 - anno_bam --> |RNASeq bam|BRAKER3 - CAT --> BRAKER3 - - style Samplesheet fill:#00FFFF21,stroke:#00FFFF21 - style PrepareAssembly fill:#00FFFF21,stroke:#00FFFF21 - style Annotation fill:#00FFFF21,stroke:#00FFFF21 -``` +[![Lint/stub on 
Linux/Docker](https://github.com/PlantandFoodResearch/pangene/actions/workflows/test.yml/badge.svg)](https://github.com/PlantandFoodResearch/pangene/actions/workflows/test.yml) + +A Nextflow pipeline for pan-genome annotation. It can also be used for annotation of a single genome. + +## Flowchart + +

+ +## Alpha Release + +This release is not fully documented and is under alpha testing by the Bioinformatics Team. There are several [outstanding issues](https://github.com/PlantandFoodResearch/pangene/issues) which will be addressed before a general release. ## Plant&Food Users -Configure the pipeline by modifying `nextflow.config` and submit to SLURM for execution. +Download the pipeline to your `/workspace/$USER` folder. Change the parameters defined in the [pfr/params.json](./pfr/params.json) file. Submit the pipeline to SLURM for execution. For a description of the parameters, see [parameters.md](./docs/parameters.md). ```bash -sbatch ./pangene_pfr +sbatch ./pfr_pangene ``` -## Third-party Sources - -Some software components of this pipeline have been adopted from following third-party sources: - -1. nf-core [MIT](https://github.com/nf-core/modules/blob/master/LICENSE): https://github.com/nf-core/modules +## Credits + +plantandfoodresearch/pangene workflows were originally scripted by Jason Shiller. Usman Rashid wrote the Nextflow pipeline. + +We thank the following people for their extensive assistance in the development of this pipeline. + +- Cecilia Deng [@CeciliaDeng](https://github.com/CeciliaDeng) +- Charles David [@charlesdavid](https://github.com/charlesdavid) +- Chen Wu [@christinawu2008](https://github.com/christinawu2008) +- Leonardo Salgado [@leorippel](https://github.com/leorippel) +- Ross Crowhurst [@rosscrowhurst](https://github.com/rosscrowhurst) +- Susan Thomson [@cflsjt](https://github.com/cflsjt) +- Ting-Hsuan Chen [@ting-hsuan-chen](https://github.com/ting-hsuan-chen) + +The pipeline uses nf-core modules contributed by the following authors. + + + + + + + + + + + + + + + + + + + + + +## Citations + +This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). 
> **The nf-core framework for community-curated bioinformatics pipelines.** > > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). - -2. nf-core/rnaseq [MIT](https://github.com/nf-core/rnaseq/blob/master/LICENSE): https://github.com/nf-core/rnaseq -3. rewarewaannotation [MIT](https://github.com/kherronism/rewarewaannotation/blob/master/LICENSE): https://github.com/kherronism/rewarewaannotation -4. assembly_qc [GPL-3.0](https://github.com/Plant-Food-Research-Open/assembly_qc/blob/main/LICENSE): https://github.com/Plant-Food-Research-Open/assembly_qc diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 3742442..0000000 --- a/TODO.md +++ /dev/null @@ -1,21 +0,0 @@ -- [ ] Add --eval=reference.gtf -- [ ] From Ross regarding post-processing: - -> [9:49 am] Ross Crowhurst -> Here is an easy one: BLATSp vs swissprot & Arabidpsis and check query is with set thresholds of reference - if so accept; If not move to BLASTp vs Uniref90 or Refeq (or some other predetermined model species) - same deal accept if within threshold limits. Else BLASTn of cds vs NCBI nt (really scrapping the bottom of the barrel here). If not a hit to anything then chances are its garbage and should be removed. Some ppl might try to claim its a unique protein to the genotype but in 20 years I have never seen one of those be supported - mostly this category is garbage. The screen agains NCBI nt also assists to classify "bits" as well retroposonss etc. Idea being you want to remove garbage predictions - as this does take time you can see why some papers just filter out by size. 
- -- [ ] From Cecilia: - -> https://github.com/zhaotao1987/SynNet-Pipeline - -- [ ] From Ross: - -> https://www.biorxiv.org/content/10.1101/096529v2.full.pdf - -- [ ] Sort out EDTA testing - -- Mib finder, eggnog, blastp against TAIR and uniprot (Wait) -- entap to merge (Wait) -- trinity and PASA + StringTie2 -> Evigene (Do) -- othrofinder paper -- gffcompre on braker and liftoff diff --git a/assets/schema_fastq.json b/assets/schema_fastq.json new file mode 100644 index 0000000..0890ce3 --- /dev/null +++ b/assets/schema_fastq.json @@ -0,0 +1,44 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/plantandfoodresearch/pangene/master/assets/schema_fastq.json", + "title": "plantandfoodresearch/pangene pipeline - params.fastq schema", + "description": "Schema for the file provided with params.fastq", + "type": "array", + "items": { + "type": "object", + "properties": { + "sample": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sample name must be provided and cannot contain spaces", + "meta": ["id"] + }, + "fastq_1": { + "type": "string", + "pattern": "^\\S+\\.f(ast)?q\\.gz$", + "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + }, + "fastq_2": { + "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.f(ast)?q\\.gz$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "dependentRequired": ["fastq_1"] + }, + "target_assemblies": { + "type": "string", + "pattern": "^(\\w+;)*\\w+$", + "errorMessage": "One or more target assemblies must be specified by their tags from params.input sheet. 
Multiple tags should be separated by ';'", + "meta": ["target_assemblies"] + } + }, + "required": ["sample", "fastq_1", "target_assemblies"] + } +} diff --git a/assets/schema_input.json b/assets/schema_input.json new file mode 100644 index 0000000..d8e0fdd --- /dev/null +++ b/assets/schema_input.json @@ -0,0 +1,69 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/plantandfoodresearch/pangene/master/assets/schema_input.json", + "title": "plantandfoodresearch/pangene pipeline - params.input schema", + "description": "Schema for the file provided with params.input", + "type": "array", + "items": { + "type": "object", + "properties": { + "tag": { + "type": "string", + "pattern": "^\\w+$", + "errorMessage": "Assembly tags must be provided and can only contain alphanumeric characters including '_'", + "unique": true + }, + "fasta": { + "type": "string", + "pattern": "^\\S+\\.f(a|asta|as|sa|na)(\\.gz)?$", + "errorMessage": "FASTA file path cannot contain spaces and must have extension '.f(a|asta|as|sa|na)' or '.f(a|asta|as|sa|na).gz'" + }, + "is_masked": { + "type": "string", + "pattern": "^(yes|no)$", + "errorMessage": "Masking information must be provided as 'yes' or 'no'" + }, + "te_lib": { + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.f(a|asta|as|sa|na)(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "errorMessage": "TE library FASTA file path cannot contain spaces and must have extension '.f(a|asta|as|sa|na)' or '.f(a|asta|as|sa|na).gz'" + }, + "braker_gff3": { + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.gff(3)?(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "errorMessage": "BRAKER GFF3 file path cannot contain spaces and must have extension '.gff.gz', '.gff3.gz', '.gff' or '.gff3'", + "dependentRequired": ["braker_hints"] + }, + "braker_hints": { + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.gff(3)?(\\.gz)?$" + }, + { + "type": 
"string", + "maxLength": 0 + } + ], + "errorMessage": "BRAKER hints GFF/GFF3 file path cannot contain spaces and must have extension '.gff.gz', '.gff3.gz', '.gff' or '.gff3'" + } + }, + "required": ["tag", "fasta", "is_masked"] + } +} diff --git a/assets/schema_liftoff.json b/assets/schema_liftoff.json new file mode 100644 index 0000000..f4bb651 --- /dev/null +++ b/assets/schema_liftoff.json @@ -0,0 +1,25 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/plantandfoodresearch/pangene/master/assets/schema_fastq.json", + "title": "plantandfoodresearch/pangene pipeline - params.fastq schema", + "description": "Schema for the file provided with params.fastq", + "type": "array", + "items": { + "type": "object", + "properties": { + "fasta": { + "type": "string", + "pattern": "^\\S+\\.f(a|asta|as|sa|na)(\\.gz)?$", + "errorMessage": "FASTA file path cannot contain spaces and must have extension '.f(a|asta|as|sa|na)' or '.f(a|asta|as|sa|na).gz'", + "unique": true + }, + "gff3": { + "type": "string", + "pattern": "^\\S+\\.gff(3)?(\\.gz)?$", + "errorMessage": "Annotation GFF3 file path cannot contain spaces and must have extension '.gff.gz', '.gff3.gz', '.gff' or '.gff3'", + "unique": true + } + }, + "required": ["fasta", "gff3"] + } +} diff --git a/assets/tsebra-1form.cfg b/assets/tsebra-1form.cfg new file mode 100644 index 0000000..2b076fa --- /dev/null +++ b/assets/tsebra-1form.cfg @@ -0,0 +1,16 @@ +# Weight for each hint source +# Values have to be >= 0 +P 1 +E 20 +C 1 +M 1 +# Required fraction of supported introns or supported start/stop-codons for a transcript +# Values have to be in [0,1] +intron_support 1.0 +stasto_support 2 +# Allowed difference for each feature +# Values have to be in [0,1] +e_1 0 +e_2 0 +e_3 0 +e_4 0 diff --git a/assets/tsebra-default.cfg b/assets/tsebra-default.cfg new file mode 100644 index 0000000..8ec1a98 --- /dev/null +++ b/assets/tsebra-default.cfg @@ -0,0 +1,16 @@ +# Weight for each hint 
source +# Values have to be >= 0 +P 1 +E 20 +C 1 +M 1 +# Required fraction of supported introns or supported start/stop-codons for a transcript +# Values have to be in [0,1] +intron_support 1.0 +stasto_support 2 +# Allowed difference for each feature +# Values have to be in [0,1] +e_1 0.1 +e_2 0.5 +e_3 0.05 +e_4 0.2 diff --git a/bin/make-samplesheet.py b/bin/make-samplesheet.py deleted file mode 100755 index b4ad0b7..0000000 --- a/bin/make-samplesheet.py +++ /dev/null @@ -1,285 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import pathlib -import os -import re -import pandas as pd -import subprocess -import hashlib - - -def remove_dna_symbols(string): - pattern = r"[ACGT]{5,}" - result = re.sub(pattern, "", string) - return result - - -def remove_lane_suffix(string): - pattern = r"_L0*\d*" - result = re.sub(pattern, "", string) - return result - - -def sort_list_of_files(files_list): - def natural_key(string): - return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string)] - - return sorted(files_list, key=lambda x: natural_key(x)) - - -def validate_fastq_columns(fastq_1, fastq_2): - pattern = re.compile(r"(.*)([R]?[12])\.([fastq]+)\.gz$") - - for fq1, fq2 in zip(fastq_1, fastq_2): - match1 = pattern.match(fq1) - match2 = pattern.match(fq2) - - if match1 and match2: - base1 = match1.group(1) - base2 = match2.group(1) - if base1 != base2: - raise ValueError(f"Failed to match fq1 and fq2 files:\n{fq1}\n{fq2}") - else: - raise ValueError(f"Failed to match fq1 and fq2 files:\n{fq1}\n{fq2}") - - -def validate_samplesheet(pd_df): - if len(pd.unique(pd_df["sample"])) != len(pd_df["sample"]): - raise ValueError(f"Failed to create a sample-sheet with unique sample IDs") - - validate_fastq_columns(list(pd_df["fastq_1"]), list(pd_df["fastq_2"])) - - -def compute_md5(file_path): - with open(file_path, "rb") as file: - data = file.read(10_000_000) - md5_hash = hashlib.md5(data).hexdigest() - - print(f"Sample: {os.path.basename(file_path)}") - - return md5_hash 
- - -def remove_duplicate_samples(pd_df): - print("Checking for duplicate samples and removing them...") - - pd_df["md5sum"] = pd_df.apply( - lambda row: compute_md5(row["fastq_1"]) + compute_md5(row["fastq_2"]), axis=1 - ) - - duplicates = pd_df.duplicated(subset="md5sum") - - if duplicates.any(): - print("Following samples have duplicates:") - print(pd_df[duplicates].iloc[:, 0]) - else: - print("No duplicates detected...") - - df_unique = pd_df.drop_duplicates(subset="md5sum", keep="first") - return df_unique.iloc[:, :4] - - -def extract_r1_r2_files(list_of_files): - list_of_files_sorted = sort_list_of_files(list_of_files) - list_of_files_R1 = [ - f - for f in list_of_files_sorted - if len(re.findall(r"_[R]?1\.([fastq]+)\.gz", str(f))) > 0 - ] - list_of_files_R2 = [ - f - for f in list_of_files_sorted - if len(re.findall(r"_[R]?2\.([fastq]+)\.gz", str(f))) > 0 - ] - - if len(list_of_files_R1) != len(list_of_files_R2): - raise ValueError("Number of R1 and R2 files do not match") - - return list_of_files_R1, list_of_files_R2 - - -def get_common_literals(list_of_lists): - if len(list_of_lists) == 0: - return [] - - common_elements = set(list_of_lists[0]) - - for sublist in list_of_lists[1:]: - common_elements = common_elements.intersection(sublist) - - common_elements = list(common_elements) - - return sorted(common_elements) - - -def get_unique_elements(input_list): - unique_elements = [] - seen_elements = set() - for item in input_list: - if item not in seen_elements: - unique_elements.append(item) - seen_elements.add(item) - return unique_elements - - -def create_sample_ids_from_files_list(list_of_files_R1): - file_names_normalized = [ - f.replace(".fastq.gz", "") - .replace(".fq.gz", "") - .replace("-", "_") - .replace("/", "_") - .replace("R1", "") - for f in list_of_files_R1 - ] - - file_name_literals = [ - [l for l in f.split("_") if l != ""] for f in file_names_normalized - ] - - common_literals = get_common_literals(file_name_literals) - - cleaved_names = 
[] - for f in file_names_normalized: - cleaved_name = f - for l in common_literals: - cleaved_name = cleaved_name.replace(f"_{l}_", "__") - - cleaved_name = remove_dna_symbols(cleaved_name) - - cleaved_name_literals = [ - e for e in get_unique_elements(cleaved_name.split("_")) if e != "" - ] - - cleaved_name = "" - is_first = True - for em in cleaved_name_literals: - cleaved_name += em if is_first else "_" + em - is_first = False - - cleaved_names.append(cleaved_name) - - sample_ids = cleaved_names - - return sample_ids - - -def save_samplesheet(exp_name, list_of_files_R1, list_of_files_R2, sample_ids): - strandedness = ["auto" for _ in sample_ids] - file_data = pd.DataFrame( - { - "sample": sample_ids, - "fastq_1": list_of_files_R1, - "fastq_2": list_of_files_R2, - "strandedness": strandedness, - } - ) - - validate_samplesheet(file_data) - file_data.sort_values(by=["sample"], inplace=True) - - file_data_dedup = remove_duplicate_samples(file_data) - - file_data_dedup["sample"] = file_data_dedup["sample"].apply(remove_lane_suffix) - - file_data_dedup.to_csv(f"{exp_name}_samplesheet.csv", index=False) - - -def make_samplesheet_from_metadata_file(file_path, exp_name): - file_data = pd.read_excel(file_path, sheet_name="Samplesheet") - - sample_id_col = "isolate" - - file_data.loc[:, "sample"] = file_data[sample_id_col] - file_data.loc[:, "fastq_1"] = ( - file_data["directory"] + "/" + file_data["file_name_F"] - ) - file_data.loc[:, "fastq_2"] = ( - file_data["directory"] + "/" + file_data["file_name_R"] - ) - file_data.loc[:, "strandedness"] = "auto" - - file_data = file_data[["sample", "fastq_1", "fastq_2", "strandedness"]] - - validate_samplesheet(file_data) - file_data.sort_values(by=["sample"], inplace=True) - - file_data_dedup = remove_duplicate_samples(file_data) - file_data_dedup.to_csv(f"{exp_name}_samplesheet.csv", index=False) - - -def make_samplesheet_from_folder(file_path, exp_name): - if os.path.isfile(file_path): - raise ValueError( - "The provided path 
is for a file. Path to an input folder is required" - ) - - fastq_gz_list = [str(f) for f in file_path.glob("*.fastq.gz")] - fq_gz_list = [str(f) for f in file_path.glob("*.fq.gz")] - - list_of_files = fastq_gz_list + fq_gz_list - - if len(list_of_files) < 1: - raise ValueError( - "Could not find any fastq.gz or fq.gz files in the command output" - ) - - list_of_files_R1, list_of_files_R2 = extract_r1_r2_files(list_of_files) - sample_ids = create_sample_ids_from_files_list(list_of_files_R1) - save_samplesheet(exp_name, list_of_files_R1, list_of_files_R2, sample_ids) - - -def make_samplesheet_from_command(input_path_or_command, exp_name): - result = subprocess.run( - input_path_or_command, shell=True, capture_output=True, text=True - ) - - if result.returncode != 0: - raise ValueError(f"Failed to execute the provided command...\n{result.stderr}") - - list_of_files = [ - f - for f in result.stdout.split("\n") - if f != "" and (f.endswith(".fq.gz") or f.endswith(".fastq.gz")) - ] - - if len(list_of_files) < 1: - raise ValueError( - "Could not find any fastq.gz or fq.gz files in the command output" - ) - - list_of_files_R1, list_of_files_R2 = extract_r1_r2_files(list_of_files) - sample_ids = create_sample_ids_from_files_list(list_of_files_R1) - save_samplesheet(exp_name, list_of_files_R1, list_of_files_R2, sample_ids) - -def main(): - parser = argparse.ArgumentParser( - prog="make-sample-sheet", - description="Read an RNASeq input folder or metadata file and create a sample-sheet.csv", - ) - parser.add_argument( - "path", - help="RNASeq input folder path or metadata file path or a bash command which lists all the fastq samples", - ) - parser.add_argument( - "experiment-name", - help="RNASeq experiment name", - ) - parser.add_argument("-v", action="version", version="%(prog)s v0.3") - - args = vars(parser.parse_args()) - - input_path_or_command = args["path"] - exp_name = args["experiment-name"] - - print("Creating sample sheet...") - - if 
os.path.isfile(input_path_or_command): - make_samplesheet_from_metadata_file(input_path_or_command, exp_name) - elif os.path.isdir(input_path_or_command): - make_samplesheet_from_folder(pathlib.Path(input_path_or_command), exp_name) - else: - make_samplesheet_from_command(input_path_or_command, exp_name) - -if __name__ == "__main__": - main() diff --git a/conf/base.config b/conf/base.config index 6b0d419..3d30f22 100644 --- a/conf/base.config +++ b/conf/base.config @@ -6,6 +6,7 @@ profiles { apptainer { envWhitelist = 'APPTAINER_BINDPATH,APPTAINER_BIND' + cacheDir = "/workspace/pangene/singularity" } } @@ -70,12 +71,6 @@ process { } } -nextflow { - enable { - moduleBinaries = true - } -} - def check_max(obj, type) { if (type == 'memory') { try { diff --git a/conf/manifest.config b/conf/manifest.config deleted file mode 100644 index 95537cd..0000000 --- a/conf/manifest.config +++ /dev/null @@ -1,10 +0,0 @@ -manifest { - name = 'pangene' - author = """Usman Rashid, Jason Shiller""" - homePage = 'https://github.com/PlantandFoodResearch/pan-gene' - description = """A NextFlow pipeline for pan-genome annotation""" - mainScript = 'main.nf' - nextflowVersion = '!>=23.04.4' - version = '0.2' - doi = '' -} diff --git a/conf/modules.config b/conf/modules.config index fc489bf..f5e0cbf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,7 +43,7 @@ process { if(!params.skip_fastqc) { process { withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_RAW' { - ext.args = '--quiet' + ext.args = '--quiet' } withName: '.*:FASTQ_FASTQC_UMITOOLS_FASTP:FASTQC_TRIM' { @@ -90,8 +90,12 @@ if(!params.skip_fastp) { if (params.remove_ribo_rna) { process { - withName: 'SORTMERNA' { - ext.args = '--num_alignments 1 -v' + withName: SORTMERNA_INDEX { + ext.args = '--index 1' + } + + withName: SORTMERNA_READS { + ext.args = '--index 0 --num_alignments 1 -v' publishDir = [ [ path: { "${params.outdir}/sortmerna" }, @@ -150,10 +154,9 @@ process { } } -if(params.liftoff_xref_annotations) { 
+if(params.liftoff_annotations) { process { withName: LIFTOFF { - ext.args = ' ' ext.args = [ '-exclude_partial', '-copies', @@ -161,23 +164,97 @@ if(params.liftoff_xref_annotations) { "-a $params.liftoff_coverage", "-s $params.liftoff_identity" ].join(' ').trim() - publishDir = [ - path: { "${params.outdir}/liftoff/${meta.id}" }, - mode: "copy", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - ] } - withName: GFFREAD { + withName: '.*:FASTA_LIFTOFF:GFFREAD_BEFORE_LIFTOFF' { ext.args = '--no-pseudo --keep-genes' } + + withName: MERGE_LIFTOFF_ANNOTATIONS { + ext.prefix = { "${meta.id}.merged.liftoffs" } + } + + withName: '.*:FASTA_LIFTOFF:AGAT_SPFILTERFEATUREFROMKILLLIST' { + ext.prefix = { "${meta.id}.invalid.orf.purged" } + } + + withName: '.*:FASTA_LIFTOFF:GFFREAD_AFTER_LIFTOFF' { + ext.prefix = { "${meta.id}.liftoff" } + ext.args = '--keep-genes' + } + } +} + +process { + + withName: 'AGAT_CONVERTSPGFF2GTF' { + ext.args = '--gtf_version relax' + } + + withName: 'KILL_TSEBRA_ISOFORMS' { + ext.prefix = { "${meta.id}.1form" } + } + + withName: 'AGAT_SPFILTERFEATUREFROMKILLLIST' { + ext.prefix = { "${meta.id}.purged" } + } +} + +process { + withName: '.*:GFF_MERGE_CLEANUP:AGAT_SPMERGEANNOTATIONS' { + ext.prefix = { "${meta.id}.liftoff.braker" } + } + + withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' { + ext.args = '-tidy -retainids -sort' + } +} + +process { + withName: GFF2FASTA_FOR_EGGNOGMAPPER { + ext.args = '-y' + } + + withName: EGGNOGMAPPER { + ext.args = [ + "--evalue $params.eggnogmapper_evalue", + "--pident $params.eggnogmapper_pident", + params.eggnogmapper_tax_scope ? "--tax_scope $params.eggnogmapper_tax_scope" : '', + '--mp_start_method fork', + "--itype proteins", + '--go_evidence all' + ].join(' ').trim() + + publishDir = [ + path: { "${params.outdir}/final/$meta.id" }, + mode: "copy", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } +} + +process { + withName: '.*:PURGE_NOHIT_MODELS:AGAT_SPFILTERFEATUREFROMKILLLIST' { + ext.prefix = { "${meta.id}.nohits.purged" } + } +} + +process { + withName: 'FINAL_GFF_CHECK' { + ext.args = '-tidy -retainids -sort' + + publishDir = [ + path: { "${params.outdir}/final/$meta.id" }, + mode: "copy", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] } } process { withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ - path: params.outdir, + path: "$params.outdir/pipeline_info", pattern: "software_versions.yml", mode: "copy", enabled: true diff --git a/conf/reporting_defaults.config b/conf/reporting_defaults.config deleted file mode 100644 index 178522d..0000000 --- a/conf/reporting_defaults.config +++ /dev/null @@ -1,13 +0,0 @@ -def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') -timeline { - enabled = true - file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" -} -report { - enabled = true - file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" -} -trace { - enabled = true - file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" -} diff --git a/conf/test_params.json b/conf/test_params.json deleted file mode 100644 index 0dc25f8..0000000 --- a/conf/test_params.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "target_assemblies": [ - ["red5_v2p1", "pangene-test/target/red5_v2p1_chr1_1200k.fasta.gz"], - ["donghong", "pangene-test/target/donghong_chr1_600k.fsa.gz"] - ], - - "te_libraries": [["donghong", "pangene-test/te_lib/donghong.TElib.fa.gz"]], - - "samplesheet": "pangene-test/samplesheet/samplesheet.csv", - - "remove_ribo_rna": true, - "ribo_database_manifest": "assets/rrna-db-test.txt", - - "external_protein_fastas": [ - "pangene-test/ext_prot/RU01_20221115150135_chr1_600k.pep.fasta.gz", - "pangene-test/ext_prot/RU01_20221115150135_chr2_600k.pep.fasta.gz" - ], - - "braker_extra_args": "--testMode 
--species=arabidopsis --useexisting", - - "liftoff_xref_annotations": [ - ["pangene-test/liftoff/Russell_V2a_chr1_600k.fsa.gz", "pangene-test/liftoff/Russell_V2a_chr1_600k.gff3.gz"], - ["pangene-test/liftoff/TAIR10_chr1_600k.fas.gz", "pangene-test/liftoff/TAIR10_chr1_600k.gff3.gz"] - ], - - "max_cpus": 2, - "max_memory": "3.GB" -} diff --git a/docs/contributors.sh b/docs/contributors.sh new file mode 100755 index 0000000..8dbfc36 --- /dev/null +++ b/docs/contributors.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +module_authors=$(find ./modules -name meta.yml | xargs -I {} grep -A20 'authors:' {} | grep '\- ' | tr -d '[-" ]' | tr '[:upper:]' '[:lower:]') +workflow_authors=$(find ./subworkflows -name meta.yml | xargs -I {} grep -A20 'authors:' {} | grep '\- ' | tr -d '[-" ]' | tr '[:upper:]' '[:lower:]') +echo -e "${module_authors}\n${workflow_authors}" | sort -V | uniq | sed -n 's|@\(.*\)||p' diff --git a/docs/img/pangene.drawio b/docs/img/pangene.drawio new file mode 100644 index 0000000..0c61685 --- /dev/null +++ b/docs/img/pangene.drawio @@ -0,0 +1,432 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/img/pangene.png b/docs/img/pangene.png new 
file mode 100644 index 0000000..a81e1c1 Binary files /dev/null and b/docs/img/pangene.png differ diff --git a/docs/parameters.md b/docs/parameters.md new file mode 100644 index 0000000..8403d49 --- /dev/null +++ b/docs/parameters.md @@ -0,0 +1,68 @@ +# plantandfoodresearch/pangene pipeline parameters + +A NextFlow pipeline for pan-genome annotation + +## Input/output options + +| Parameter | Description | Type | Default | Required | Hidden | +| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | --------- | -------- | ------ | +| `input` | Target assemblies listed in a CSV sheet
HelpFASTA and other associated files for target assemblies provided as a CSV sheet
| `string` | | True | | +| `external_protein_fastas` | External protein fastas listed in a text sheet
HelpA text file listing FASTA files to provide protein evidence for annotation
 | `string` | | True | | +| `eggnogmapper_db_dir` | Eggnogmapper database directory | `string` | | True | | +| `eggnogmapper_tax_scope` | Eggnogmapper taxonomy scope | `integer` | | True | | +| `fastq` | FASTQ samples listed in a CSV sheet
HelpFASTQ files for RNASeq samples corresponding to each target assembly provided in a CSV sheet
| `string` | | | | +| `liftoff_annotations` | Reference annotations listed in a CSV sheet
HelpFASTA and GFF3 files for reference annotations for liftoff listed in a CSV sheet
| `string` | | | | +| `outdir` | The output directory where the results will be saved
Help Use absolute paths to storage on Cloud infrastructure
 | `string` | ./results | True | | + +## Repeat annotation options + +| Parameter | Description | Type | Default | Required | Hidden | +| --------------------------- | ------------------------------------------ | --------- | ------------- | -------- | ------ | +| `repeat_annotator` | 'edta' or 'repeatmodeler' | `string` | repeatmodeler | | | +| `save_annotated_te_lib` | Save annotated TE library or not? | `boolean` | | | | +| `edta_is_sensitive` | Use '--sensitive 1' flag with EDTA or not? | `boolean` | | | | +| `repeatmasker_save_outputs` | Save the repeat-masked genome or not? | `boolean` | | | | + +## RNASeq pre-processing options + +| Parameter | Description | Type | Default | Required | Hidden | +| ------------------------ | ------------------------------------------------------------------ | --------- | ----------------------------------------- | -------- | ------ | +| `skip_fastqc` | Skip FASTQC or not? | `boolean` | | | | +| `skip_fastp` | Skip trimming by FASTP or not? | `boolean` | | | | +| `min_trimmed_reads` | Exclude a sample if its reads after trimming are below this number | `integer` | 10000 | | | +| `extra_fastp_args` | Extra FASTP arguments | `string` | | | | +| `save_trimmed` | Save FASTQ files after trimming or not? | `boolean` | | | | +| `remove_ribo_rna` | Remove Ribosomal RNA or not? | `boolean` | | | | +| `save_non_ribo_reads` | Save FASTQ files after Ribosomal RNA removal or not? 
 | `boolean` | | | | +| `ribo_database_manifest` | Ribosomal RNA fastas listed in a text sheet | `string` | ${projectDir}/assets/rrna-db-defaults.txt | | | + +## RNAseq alignment options + +| Parameter | Description | Type | Default | Required | Hidden | +| ------------------------ | ------------------------------------------------- | --------- | ------- | -------- | ------ | +| `star_max_intron_length` | Maximum intron length for STAR alignment | `integer` | 16000 | | | +| `star_align_extra_args` | Extra arguments for STAR | `string` | | | | +| `star_save_outputs` | Save BAM files from STAR or not? | `boolean` | | | | +| `save_cat_bam` | Save a concatenated BAM file per assembly or not? | `boolean` | | | | + +## Annotation options + +| Parameter | Description | Type | Default | Required | Hidden | +| --------------------------- | --------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | +| `braker_extra_args` | Extra arguments for BRAKER | `string` | | | | +| `braker_allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | +| `liftoff_coverage` | Liftoff coverage parameter | `number` | 0.9 | | | +| `liftoff_identity` | Liftoff identity parameter | `number` | 0.9 | | | +| `eggnogmapper_evalue` | Only report alignments below or equal to the e-value threshold | `number` | 1e-05 | | | +| `eggnogmapper_pident` | Only report alignments above or equal to the given percentage of identity (0-100) | `integer` | 35 | | | +| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | + +## Max job request options + +Set the top limit for requested resources for any single job. 
+ +| Parameter | Description | Type | Default | Required | Hidden | +| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | +| `max_cpus` | Maximum number of CPUs that can be requested for any single job.
HelpUse to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`
| `integer` | 12 | | True | +| `max_memory` | Maximum amount of memory that can be requested for any single job.
HelpUse to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`
| `string` | 200.GB | | True | +| `max_time` | Maximum amount of time that can be requested for any single job.
HelpUse to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`
| `string` | 7.day | | True | diff --git a/pangene_local b/local_pangene similarity index 75% rename from pangene_local rename to local_pangene index 255edb9..8a1aa8a 100755 --- a/pangene_local +++ b/local_pangene @@ -12,9 +12,10 @@ F_BOLD="\033[1m" && echo 'Executing with -stub' \ || echo -e "${C_RED}${F_BOLD}Executing without -stub${NO_FORMAT}" -nextflow \ +nextflow run \ main.nf \ -profile local,docker \ -resume \ $stub \ - -params-file conf/test_params.json + -params-file pangene-test/params.json \ + --eggnogmapper_db_dir ../dbs/emapperdb/5.0.2 diff --git a/main.nf b/main.nf index 9ed32f7..067425b 100755 --- a/main.nf +++ b/main.nf @@ -2,7 +2,11 @@ nextflow.enable.dsl=2 -include { PANGENE } from './workflows/pangene.nf' +include { validateParameters } from 'plugin/nf-validation' + +validateParameters() + +include { PANGENE } from './workflows/pangene.nf' workflow { PFR_PANGENE() diff --git a/modules.json b/modules.json index 4e8f0a9..27dc611 100644 --- a/modules.json +++ b/modules.json @@ -5,39 +5,49 @@ "git@github.com:PlantandFoodResearch/nxf-modules.git": { "modules": { "pfr": { + "agat/spfilterfeaturefromkilllist": { + "branch": "main", + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", + "installed_by": ["modules"] + }, + "agat/spmergeannotations": { + "branch": "main", + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", + "installed_by": ["modules"] + }, "custom/restoregffids": { "branch": "main", - "git_sha": "e9f6bdd634bdbcd52c5568ba82f16176ec06631f", - "installed_by": ["fasta_edta_lai", "modules"] + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", + "installed_by": ["fasta_edta_lai"] }, "custom/shortenfastaids": { "branch": "main", - "git_sha": "5e0e41b51d7fc7f68ae43692b6fe19b95d7f3a8c", - "installed_by": ["fasta_edta_lai", "modules"] + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", + "installed_by": ["fasta_edta_lai"] }, "edta/edta": { "branch": "main", - "git_sha": "35468dbb1f35eb17a43d7e05544601c7c3f8cd90", - "installed_by": 
["fasta_edta_lai", "modules"] + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", + "installed_by": ["fasta_edta_lai"] }, - "lai": { + "ltrretriever/lai": { "branch": "main", - "git_sha": "7e6e3cb41362a045c6bb6065903efa0eba246e87", + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", "installed_by": ["fasta_edta_lai"] }, - "liftoff": { + "repeatmodeler/builddatabase": { "branch": "main", - "git_sha": "444b35f4e6285115f84d2bfce49fc0e6d8a2754e", + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", "installed_by": ["modules"] }, - "repeatmodeler/builddatabase": { + "repeatmodeler/repeatmodeler": { "branch": "main", - "git_sha": "9da0567f685b2772f65290f2bd6d6347671c8310", + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", "installed_by": ["modules"] }, - "repeatmodeler/repeatmodeler": { + "tsebra": { "branch": "main", - "git_sha": "9da0567f685b2772f65290f2bd6d6347671c8310", + "git_sha": "a3b86c357980e5244cb313027c1d980d89c19ef4", "installed_by": ["modules"] } } @@ -46,7 +56,7 @@ "pfr": { "fasta_edta_lai": { "branch": "main", - "git_sha": "5ae026a98da1331433fa4cf5b667c9abdf43e395", + "git_sha": "10b046eaac396f279c08e7e2bb067482ccd9c74e", "installed_by": ["subworkflows"] } } @@ -71,21 +81,37 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "agat/convertspgff2gtf": { + "branch": "master", + "git_sha": "15f1cf0a1a12da63839c336eb1ecd96d03320e94", + "installed_by": ["modules"] + }, + "agat/convertspgxf2gxf": { + "branch": "master", + "git_sha": "71ccbccbd498af48c33939e1123517340bab3d6f", + "installed_by": ["modules"] + }, "cat/cat": { "branch": "master", - "git_sha": "d593e8f6b7d1bbbb2acf43a4b9efeeac8d6720f2", + "git_sha": "9437e6053dccf4aafa022bfd6e7e9de67e625af8", "installed_by": ["modules"] }, "cat/fastq": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "git_sha": "0997b47c93c06b49aa7b3fefda87e728312cf2ca", "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", - 
"git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", + "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", "installed_by": ["modules"] }, + "eggnogmapper": { + "branch": "master", + "git_sha": "9be0d5f9aeb31bded6780b5b589de7158ccf2c7b", + "installed_by": ["modules"], + "patch": "modules/nf-core/eggnogmapper/eggnogmapper.diff" + }, "fastavalidator": { "branch": "master", "git_sha": "89ff95427f695086369d7927a3c17cea2a37a382", @@ -93,47 +119,63 @@ }, "fastp": { "branch": "master", - "git_sha": "d086322563bdbb08c94bf15a7db58a39ccdb1520", + "git_sha": "95cf5fe0194c7bf5cb0e3027a2eb7e7c89385080", "installed_by": ["fastq_fastqc_umitools_fastp"] }, "fastqc": { "branch": "master", - "git_sha": "617777a807a1770f73deb38c80004bac06807eef", + "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", "installed_by": ["fastq_fastqc_umitools_fastp", "modules"] }, + "gffcompare": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "gffread": { "branch": "master", - "git_sha": "a2d6c3082c5c44b4155a3246daff36701ee49af8", + "git_sha": "b1b959609bda44341120aed1766329909f54b8d0", + "installed_by": ["modules"], + "patch": "modules/nf-core/gffread/gffread.diff" + }, + "gt/gff3": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", "installed_by": ["modules"] }, "gunzip": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", + "installed_by": ["modules"] + }, + "liftoff": { + "branch": "master", + "git_sha": "8ce34a40589137b75b65dfe8bb334c9b94f1d6c8", "installed_by": ["modules"] }, "samtools/cat": { "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", "installed_by": ["modules"] }, "sortmerna": { "branch": "master", - "git_sha": "ce558e30784469b88a16923ca96d81899d240b42", + "git_sha": "df05c8db5195867c0bc7b92c1788115b66f0d17d", 
"installed_by": ["modules"] }, "star/align": { "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "git_sha": "a21faa6a3481af92a343a10926f59c189a2c16c9", "installed_by": ["modules"] }, "star/genomegenerate": { "branch": "master", - "git_sha": "d87a6e2156c2099c09280fa70776eaf0a824817a", + "git_sha": "a21faa6a3481af92a343a10926f59c189a2c16c9", "installed_by": ["modules"] }, "umitools/extract": { "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "git_sha": "d2c5e76f291379f3dd403e48e46ed7e6ba5da744", "installed_by": ["fastq_fastqc_umitools_fastp"] } } @@ -142,7 +184,7 @@ "nf-core": { "fastq_fastqc_umitools_fastp": { "branch": "master", - "git_sha": "3e8b0c1144ccf60b7848efbdc2be285ff20b49ee", + "git_sha": "cabcc0dadf8366aa7a9930066a7b3dd90d9825d5", "installed_by": ["subworkflows"] } } diff --git a/modules/kherronism/braker3/main.nf b/modules/kherronism/braker3/main.nf index ae0ec81..82de764 100644 --- a/modules/kherronism/braker3/main.nf +++ b/modules/kherronism/braker3/main.nf @@ -62,9 +62,11 @@ process BRAKER3 { """ stub: + def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" def rna_ids = rnaseq_sets_ids ? "--rnaseq_sets_ids=${rnaseq_sets_ids}" : '' def touch_hints = (rna_ids || bam || proteins || hints) ? "touch ${prefix}/hintsfile.gff" : '' + def touch_gff = args.contains('--gff3') ? 
"touch ${prefix}/braker.gff3" : '' """ mkdir "$prefix" @@ -74,6 +76,7 @@ process BRAKER3 { $touch_hints touch "${prefix}/braker.log" touch "${prefix}/what-to-cite.txt" + $touch_gff cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/samplesheet_check/main.nf b/modules/local/samplesheet_check/main.nf deleted file mode 100644 index 4fb60f8..0000000 --- a/modules/local/samplesheet_check/main.nf +++ /dev/null @@ -1,40 +0,0 @@ -// Source: -// https://github.com/nf-core/rnaseq -// MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE -// -// Changes: -// Added channel permissible_target_assemblies - -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.9.5" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'biocontainers/python:3.9--1' }" - - input: - path samplesheet - val permissible_target_assemblies - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - """ - check_samplesheet.py \\ - $samplesheet \\ - "$permissible_target_assemblies" \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/samplesheet_check/resources/usr/bin/check_samplesheet.py b/modules/local/samplesheet_check/resources/usr/bin/check_samplesheet.py deleted file mode 100755 index ac75f2b..0000000 --- a/modules/local/samplesheet_check/resources/usr/bin/check_samplesheet.py +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys -import errno -import argparse - -# https://github.com/nf-core/rnaseq -# MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE -# -# Changes: -# -# 1. Formatted with black -# 2. 
Added checks for the fifth column: target_assemblies -# 3. Removed strandedness - - -def parse_args(args=None): - Description = ( - "Reformat nf-core/rnaseq style samplesheet file and check its contents." - ) - Epilog = 'Example usage: python check_samplesheet.py "target_assembly_a,target_assembly_b" ' - - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("TARGET_ASSEMBLIES", help="Permissible target assemblies") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) - - -def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception - - -def print_error(error, context="Line", context_str=""): - error_str = f"ERROR: Please check samplesheet -> {error}" - if context != "" and context_str != "": - error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'" - print(error_str) - sys.exit(1) - - -def check_samplesheet(file_in, file_out, permissible_target_assemblies): - """ - This function checks that the samplesheet follows the following structure: - - sample,fastq_1,fastq_2,target_assemblies - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,red5;red3 - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz,red5;red3 - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,,red5 - """ - - sample_mapping_dict = {} - with open(file_in, "r", encoding="utf-8-sig") as fin: - ## Check header - MIN_COLS = 4 - HEADER = ["sample", "fastq_1", "fastq_2", "target_assemblies"] - header = [x.strip('"') for x in fin.readline().strip().split(",")] - if header[: len(HEADER)] != HEADER: - print( - f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}" - ) - sys.exit(1) - - ## Check sample entries - for line in fin: - if line.strip(): - lspl = [x.strip().strip('"') for x in 
line.strip().split(",")] - - ## Check valid number of columns per row - if len(lspl) < len(HEADER): - print_error( - f"Invalid number of columns (minimum = {len(HEADER)})!", - "Line", - line, - ) - - num_cols = len([x for x in lspl[: len(HEADER)] if x]) - if num_cols < MIN_COLS: - print_error( - f"Invalid number of populated columns (minimum = {MIN_COLS})!", - "Line", - line, - ) - - ## Check sample name entries - sample, fastq_1, fastq_2, target_assemblies = lspl[ - : len(HEADER) - ] - - if sample.find(" ") != -1: - print( - f"WARNING: Spaces have been replaced by underscores for sample: {sample}" - ) - sample = sample.replace(" ", "_") - if not sample: - print_error("Sample entry has not been specified!", "Line", line) - - ## Check FastQ file extension - for fastq in [fastq_1, fastq_2]: - if fastq: - if fastq.find(" ") != -1: - print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(".fastq.gz") and not fastq.endswith( - ".fq.gz" - ): - print_error( - "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", - "Line", - line, - ) - - ## Auto-detect paired-end/single-end - sample_info = [] ## [single_end, fastq_1, fastq_2] - if sample and fastq_1 and fastq_2: ## Paired-end short reads - sample_info = ["0", fastq_1, fastq_2] - elif sample and fastq_1 and not fastq_2: ## Single-end short reads - sample_info = ["1", fastq_1, fastq_2] - else: - print_error( - "Invalid combination of columns provided!", "Line", line - ) - - ## Check if the target assemblies are permissible - target_assemblies_list = sorted( - [x.strip() for x in target_assemblies.strip().split(";")] - ) - - for assembly in target_assemblies_list: - if assembly in permissible_target_assemblies: - continue - - print_error( - f"Target assembly '{assembly}' is not one of {permissible_target_assemblies}!", - "Line", - line, - ) - - ## Create sample mapping dictionary = {sample: [[ single_end, fastq_1, fastq_2, strandedness, [target_assemblies] ]]} - sample_target_assemblies = 
";".join(target_assemblies_list) - sample_info = ( - sample_info + lspl[len(HEADER) :] + [sample_target_assemblies] - ) - if sample not in sample_mapping_dict: - sample_mapping_dict[sample] = [sample_info] - else: - if sample_info in sample_mapping_dict[sample]: - print_error( - "Samplesheet contains duplicate rows!", "Line", line - ) - else: - sample_mapping_dict[sample].append(sample_info) - - ## Write validated samplesheet with appropriate columns - if len(sample_mapping_dict) > 0: - out_dir = os.path.dirname(file_out) - make_dir(out_dir) - with open(file_out, "w") as fout: - fout.write( - ",".join( - [ - "sample", - "single_end", - "fastq_1", - "fastq_2", - "target_assemblies", - ] - + header[len(HEADER) :] - ) - + "\n" - ) - for sample in sorted(sample_mapping_dict.keys()): - ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end - if not all( - x[0] == sample_mapping_dict[sample][0][0] - for x in sample_mapping_dict[sample] - ): - print_error( - f"Multiple runs of a sample must be of the same datatype i.e. 
single-end or paired-end!", - "Sample", - sample, - ) - - ## Check that multiple runs of the same sample have same target assemblies - if not all( - x[3] == sample_mapping_dict[sample][0][3] - for x in sample_mapping_dict[sample] - ): - print_error( - f"Multiple runs of a sample must have the same target assemblies!", - "Sample", - sample, - ) - - for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write(",".join([f"{sample}_T{idx+1}"] + val) + "\n") - else: - print_error(f"No entries to process!", "Samplesheet: {file_in}") - - -def main(args=None): - args = parse_args(args) - permissible_target_assemblies = [ - x.strip() for x in args.TARGET_ASSEMBLIES.strip().split(",") - ] - check_samplesheet(args.FILE_IN, args.FILE_OUT, permissible_target_assemblies) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/modules/local/utils.nf b/modules/local/utils.nf new file mode 100644 index 0000000..7e1c86a --- /dev/null +++ b/modules/local/utils.nf @@ -0,0 +1,40 @@ +def idFromFileName(fileName) { + + def trial = ( fileName + ).replaceFirst( + /\.f(ast)?q$/, '' + ).replaceFirst( + /\.f(asta|sa|a|as|aa)?$/, '' + ).replaceFirst( + /\.gff(3)?$/, '' + ).replaceFirst( + /\.gz$/, '' + ) + + if ( trial == fileName ) { return fileName } + + return idFromFileName ( trial ) +} + +def validateFastqMetadata(metas, fqs, permAssString) { + def permAssList = permAssString.split(",") + + // Check if each listed assembly is permissible + metas.each { meta -> + if ( meta.target_assemblies.any { !permAssList.contains( it ) } ) { + exit 1, "Sample ${meta.id} targets ${meta.target_assemblies} which are not in $permAssList" + } + } + + // Check if multiple runs of a sample have the same target assemblies + if ( metas.collect { meta -> meta.target_assemblies }.unique().size() > 1 ) { + error "Multiple runs of sample ${metas.first().id} must target same assemblies" + } + + // Check if multiple runs of a sample have the same endedness + if ( metas.collect { meta -> 
meta.single_end }.unique().size() > 1 ) { + error "Multiple runs of sample ${metas.first().id} must have same endedness" + } + + [ metas.first(), fqs ] +} diff --git a/modules/local/validate_params.nf b/modules/local/validate_params.nf deleted file mode 100644 index 460ce80..0000000 --- a/modules/local/validate_params.nf +++ /dev/null @@ -1,104 +0,0 @@ -def validateParams(params) { - validateFastaTags(params) - - if (!params['repeat_annotator']) { - error "Error: repeat_annotator must be either 'repeatmodeler' or 'edta'" - } - - if ( !(params['repeat_annotator'] in ['repeatmodeler', 'edta']) ) { - error "Error: repeat_annotator must be either 'repeatmodeler' or 'edta'" - } - - validateTETags(params) - validateTEFastaCorrespondence(params) - - validateRiboDBManifest(params) - - validateLiftoffXrefs(params) -} - -def validateFastaTags(params) { - def listOfFastaTuples = params["target_assemblies"] - - if (isNotListOfLists(listOfFastaTuples, 2)) { - error 'Error: target_assemblies must be a list of sublists, with each sublist containing 2 elements' - } - - def fastaTags = listOfFastaTuples.collect { it[0] } - - fastaTags.each { - if (!(it =~ /^\w+$/)) { - error "Error: $it is not a valid tag in target_assemblies" - } - } - - if (fastaTags.size() != (fastaTags as Set).size()) { - error "All the tags in target_assemblies should be unique" - } -} - -def validateTETags(params) { - - if(!params["te_libraries"]) { - return - } - - def listOfTETuples = params["te_libraries"] - - if (listOfTETuples.isEmpty()) { - return - } - - if (isNotListOfLists(listOfTETuples, 2)) { - error 'Error: te_libraries must be a list of sublists, with each sublist containing 2 elements' - } - - def teTags = listOfTETuples.collect { it[0] } - - teTags.each { - if (!(it =~ /^\w+$/)) { - error "Error: $it is not a valid tag in te_libraries" - } - } -} - -def validateTEFastaCorrespondence(params) { - - if(!params["te_libraries"]) { - return - } - - def listOfTETuples = params["te_libraries"] - def 
listOfFastaTuples = params["target_assemblies"] - - def fastaTags = listOfFastaTuples.collect { it[0] } - def teTags = listOfTETuples.collect { it[0] } - - teTags.each { - if(!fastaTags.contains(it)) { - error "Error: $it in te_libraries does not have a corresponding tag in target_assemblies" - } - } -} - -def validateRiboDBManifest(params) { - if (params.remove_ribo_rna) { - file_ribo_db = file(params.ribo_database_manifest, checkIfExists: true) - - if (file_ribo_db.isEmpty()) {exit 1, "File provided with --ribo_database_manifest is empty: ${file_ribo_db.getName()}!"} - } -} - -def validateLiftoffXrefs(params) { - if(!params["liftoff_xref_annotations"]) { - return - } - - if(isNotListOfLists(params["liftoff_xref_annotations"]), 2) { - error "Error: liftoff_xref_annotations must be a list of sublists, with each sublist containing 2 elements" - } -} - -def isNotListOfLists(thisOne, subListSize) { - return (!(thisOne instanceof List) || thisOne.isEmpty() || thisOne.any { !(it instanceof List) || it.size() != subListSize }) -} diff --git a/modules/nf-core/agat/convertspgff2gtf/environment.yml b/modules/nf-core/agat/convertspgff2gtf/environment.yml new file mode 100644 index 0000000..381154f --- /dev/null +++ b/modules/nf-core/agat/convertspgff2gtf/environment.yml @@ -0,0 +1,7 @@ +name: agat_convertspgff2gtf +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::agat=1.0.0 diff --git a/modules/nf-core/agat/convertspgff2gtf/main.nf b/modules/nf-core/agat/convertspgff2gtf/main.nf new file mode 100644 index 0000000..8f1f8b4 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2gtf/main.nf @@ -0,0 +1,48 @@ +process AGAT_CONVERTSPGFF2GTF { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/agat:1.0.0--pl5321hdfd78af_0' : + 'biocontainers/agat:1.0.0--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gff) + + output: + tuple val(meta), path("*.agat.gtf"), emit: output_gtf + tuple val(meta), path("*.log"), emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + agat_convert_sp_gff2gtf.pl \\ + --gff $gff \\ + --output ${prefix}.agat.gtf \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_convert_sp_gff2gtf.pl --help | sed '4!d; s/.*v//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.agat.gtf + touch ${gff}.agat.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_convert_sp_gff2gtf.pl --help | sed '4!d; s/.*v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/agat/convertspgff2gtf/meta.yml b/modules/nf-core/agat/convertspgff2gtf/meta.yml new file mode 100644 index 0000000..dcdc8d9 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2gtf/meta.yml @@ -0,0 +1,43 @@ +name: agat_convertspgff2gtf +description: | + Converts a GFF/GTF file into a proper GTF file +keywords: + - genome + - gff + - gtf + - conversion +tools: + - agat: + description: "AGAT is a toolkit for manipulation and getting information from GFF/GTF files" + homepage: "https://github.com/NBISweden/AGAT" + documentation: "https://agat.readthedocs.io/" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - gff: + type: file + description: Annotation file in GFF3/GTF format + pattern: "*.{gff, gtf}" +output: + - output_gtf: + type: file + description: Annotation file in GTF format + pattern: "*.{gtf}" + - log: + type: file + description: Log file of the conversion process + pattern: "*.{log}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@toniher" +maintainers: + - "@toniher" diff --git a/modules/nf-core/agat/convertspgff2gtf/tests/main.nf.test b/modules/nf-core/agat/convertspgff2gtf/tests/main.nf.test new file mode 100644 index 0000000..9accfec --- /dev/null +++ b/modules/nf-core/agat/convertspgff2gtf/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process AGAT_CONVERTSPGFF2GTF" + script "../main.nf" + process "AGAT_CONVERTSPGFF2GTF" + + tag "modules" + tag "modules_nfcore" + tag "agat" + tag "agat/convertspgff2gtf" + + test("sarscov2 - genome [gff3]") { + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.output_gtf, + process.out.versions).match() }, + { assert path(process.out.log[0][1]).exists() } + ) + } + + } + + test("sarscov2 - genome [gff3] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.output_gtf.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + + process.out.versions ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/agat/convertspgff2gtf/tests/main.nf.test.snap b/modules/nf-core/agat/convertspgff2gtf/tests/main.nf.test.snap new file 
mode 100644 index 0000000..6193be8 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2gtf/tests/main.nf.test.snap @@ -0,0 +1,28 @@ +{ + "sarscov2 - genome [gff3] - stub": { + "content": [ + [ + "test.agat.gtf", + "genome.gff3.agat.log", + "versions.yml:md5,dcbde1b24eb36571645f2d4bd4b4e551" + ] + ], + "timestamp": "2023-12-24T23:36:49.538312808" + }, + "sarscov2 - genome [gff3]": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.agat.gtf:md5,bbe333239767d048eb8392bba6856616" + ] + ], + [ + "versions.yml:md5,dcbde1b24eb36571645f2d4bd4b4e551" + ] + ], + "timestamp": "2023-12-24T23:36:39.319717066" + } +} \ No newline at end of file diff --git a/modules/nf-core/agat/convertspgff2gtf/tests/tags.yml b/modules/nf-core/agat/convertspgff2gtf/tests/tags.yml new file mode 100644 index 0000000..7a59648 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2gtf/tests/tags.yml @@ -0,0 +1,2 @@ +agat/convertspgff2gtf: + - "modules/nf-core/agat/convertspgff2gtf/**" diff --git a/modules/nf-core/agat/convertspgxf2gxf/environment.yml b/modules/nf-core/agat/convertspgxf2gxf/environment.yml new file mode 100644 index 0000000..6ed34fa --- /dev/null +++ b/modules/nf-core/agat/convertspgxf2gxf/environment.yml @@ -0,0 +1,7 @@ +name: agat_convertspgxf2gxf +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::agat=1.4.0 diff --git a/modules/nf-core/agat/convertspgxf2gxf/main.nf b/modules/nf-core/agat/convertspgxf2gxf/main.nf new file mode 100644 index 0000000..b9a7668 --- /dev/null +++ b/modules/nf-core/agat/convertspgxf2gxf/main.nf @@ -0,0 +1,48 @@ +process AGAT_CONVERTSPGXF2GXF { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/agat:1.4.0--pl5321hdfd78af_0' : + 'biocontainers/agat:1.4.0--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gxf) + + output: + tuple val(meta), path("*.agat.gff") , emit: output_gff + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + agat_convert_sp_gxf2gxf.pl \\ + --gxf $gxf \\ + --output ${prefix}.agat.gff \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_convert_sp_gxf2gxf.pl --help | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.agat.gff + touch ${gxf}.agat.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_convert_sp_gxf2gxf.pl --help | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/nf-core/agat/convertspgxf2gxf/meta.yml b/modules/nf-core/agat/convertspgxf2gxf/meta.yml new file mode 100644 index 0000000..0ef9881 --- /dev/null +++ b/modules/nf-core/agat/convertspgxf2gxf/meta.yml @@ -0,0 +1,43 @@ +name: agat_convertspgxf2gxf +description: | + Fixes and standardizes GFF/GTF files and outputs a cleaned GFF/GTF file +keywords: + - genome + - gff + - gtf + - conversion +tools: + - agat: + description: "AGAT is a toolkit for manipulation and getting information from GFF/GTF files" + homepage: "https://github.com/NBISweden/AGAT" + documentation: "https://agat.readthedocs.io/" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - gxf: + type: file + description: Annotation file in GFF3/GTF format + pattern: "*.{gff, gtf}" +output: + - output_gff: + type: file + description: Cleaned annotation file in GFF3 format + pattern: "*.{gff}" + - log: + type: file + description: Log file of the conversion process + pattern: "*.{log}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@toniher" +maintainers: + - "@toniher" diff --git a/modules/nf-core/agat/convertspgxf2gxf/tests/main.nf.test b/modules/nf-core/agat/convertspgxf2gxf/tests/main.nf.test new file mode 100644 index 0000000..db85991 --- /dev/null +++ b/modules/nf-core/agat/convertspgxf2gxf/tests/main.nf.test @@ -0,0 +1,60 @@ +nextflow_process { + + name "Test Process AGAT_CONVERTSPGXF2GXF" + script "../main.nf" + process "AGAT_CONVERTSPGXF2GXF" + + tag "modules" + tag "modules_nfcore" + tag "agat" + tag "agat/convertspgxf2gxf" + + test("sarscov2 genome [gtf]") { + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.output_gff, + process.out.versions).match() }, + { assert path(process.out.log[0][1]).exists() } + ) + } + + } + + test("sarscov2 genome [gtf] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/agat/convertspgxf2gxf/tests/main.nf.test.snap b/modules/nf-core/agat/convertspgxf2gxf/tests/main.nf.test.snap new file mode 100644 index 0000000..e89073f --- /dev/null +++ b/modules/nf-core/agat/convertspgxf2gxf/tests/main.nf.test.snap @@ -0,0 
+1,71 @@ +{ + "sarscov2 genome [gtf] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.agat.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "genome.gtf.agat.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,5ec6166c5c080ec4bc08a8fe55ada486" + ], + "log": [ + [ + { + "id": "test" + }, + "genome.gtf.agat.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "output_gff": [ + [ + { + "id": "test" + }, + "test.agat.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,5ec6166c5c080ec4bc08a8fe55ada486" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-12T12:25:34.583294" + }, + "sarscov2 genome [gtf]": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.agat.gff:md5,7d7e9bcd82a2f0bb7d8a38f85e82f0bc" + ] + ], + [ + "versions.yml:md5,5ec6166c5c080ec4bc08a8fe55ada486" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-12T12:21:21.310464" + } +} \ No newline at end of file diff --git a/modules/nf-core/agat/convertspgxf2gxf/tests/tags.yml b/modules/nf-core/agat/convertspgxf2gxf/tests/tags.yml new file mode 100644 index 0000000..85c7000 --- /dev/null +++ b/modules/nf-core/agat/convertspgxf2gxf/tests/tags.yml @@ -0,0 +1,2 @@ +agat/convertspgxf2gxf: + - "modules/nf-core/agat/convertspgxf2gxf/**" diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf index 970ab76..adbdbd7 100644 --- a/modules/nf-core/cat/cat/main.nf +++ b/modules/nf-core/cat/cat/main.nf @@ -22,6 +22,8 @@ process CAT_CAT { def args2 = task.ext.args2 ?: '' def file_list = files_in.collect { it.toString() } + // choose appropriate concatenation tool depending on input and output format + // | input | output | command1 | command2 | // |-----------|------------|----------|----------| // | gzipped | gzipped | cat | | @@ -30,7 +32,7 @@ process CAT_CAT { // | ungzipped | 
gzipped | cat | pigz | // Use input file ending as default - prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" out_zip = prefix.endsWith('.gz') in_zip = file_list[0].endsWith('.gz') command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' @@ -68,3 +70,10 @@ process CAT_CAT { END_VERSIONS """ } + +// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz +def getFileSuffix(filename) { + def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ + return match ? match[0][1] : filename.substring(filename.lastIndexOf('.')) +} + diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test index ed5a4f1..fcee2d1 100644 --- a/modules/nf-core/cat/cat/tests/main.nf.test +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -19,8 +19,8 @@ nextflow_process { [ [ id:'genome', single_end:true ], [ - file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), - file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) ] ] """ @@ -45,8 +45,8 @@ nextflow_process { [ [ id:'test', single_end:true ], [ - file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), - file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) ] ] """ @@ -72,8 +72,8 @@ nextflow_process { [ [ id:'test', single_end:true ], [ - file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), - 
file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true) ] ] """ @@ -102,8 +102,8 @@ nextflow_process { [ [ id:'test', single_end:true ], [ - file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true) ] ] """ @@ -131,8 +131,8 @@ nextflow_process { [ [ id:'test', single_end:true ], [ - file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), - file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) ] ] """ @@ -160,7 +160,7 @@ nextflow_process { [ [ id:'test', single_end:true ], [ - file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] ] """ @@ -176,4 +176,3 @@ nextflow_process { } } } - diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml index bff93ad..8c69b12 100644 --- a/modules/nf-core/cat/fastq/environment.yml +++ b/modules/nf-core/cat/fastq/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - conda-forge::sed=4.7 + - conda-forge::coreutils=8.30 diff --git 
a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf index 3d96378..f132b2a 100644 --- a/modules/nf-core/cat/fastq/main.nf +++ b/modules/nf-core/cat/fastq/main.nf @@ -76,5 +76,4 @@ process CAT_FASTQ { """ } } - } diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test index f5f9418..dab2e14 100644 --- a/modules/nf-core/cat/fastq/tests/main.nf.test +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -16,11 +16,11 @@ nextflow_process { } process { """ - input[0] = [ - [ id:'test', single_end:true ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true) ] - ] + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) """ } } @@ -28,8 +28,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out.reads).match() }, - { assert path(process.out.versions.get(0)).getText().contains("cat") } + { assert snapshot(process.out).match() } ) } } @@ -42,13 +41,13 @@ nextflow_process { } process { """ - input[0] = [ + input[0] = Channel.of([ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) ] - ] + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true)] + ]) """ } } @@ -56,8 +55,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out.reads).match() }, - { assert path(process.out.versions.get(0)).getText().contains("cat") } + { assert snapshot(process.out).match() } ) } } @@ -70,11 +68,11 @@ nextflow_process { } process { """ - input[0] = [ + input[0] = Channel.of([ [ id:'test', single_end:true ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] - ] + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) """ } } @@ -82,8 +80,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out.reads).match() }, - { assert path(process.out.versions.get(0)).getText().contains("cat") } + { assert snapshot(process.out).match() } ) } } @@ -96,13 +93,13 @@ nextflow_process { } process { """ - input[0] = [ + input[0] = Channel.of([ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] - ] + [ 
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) """ } } @@ -110,8 +107,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out.reads).match() }, - { assert path(process.out.versions.get(0)).getText().contains("cat") } + { assert snapshot(process.out).match() } ) } } @@ -124,10 +120,10 @@ nextflow_process { } process { """ - input[0] = [ + input[0] = Channel.of([ [ id:'test', single_end:true ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)] - ] + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) """ } } @@ -135,8 +131,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out.reads).match() }, - { assert path(process.out.versions.get(0)).getText().contains("cat") } + { assert snapshot(process.out).match() } ) } } diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap index ec2342e..43dfe28 100644 --- a/modules/nf-core/cat/fastq/tests/main.nf.test.snap +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -1,78 +1,169 @@ { "test_cat_fastq_single_end": { "content": [ - [ - [ - { - "id": "test", - "single_end": true - }, - "test.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d" + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,ee314a9bd568d06617171b0c85f508da" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + 
], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,ee314a9bd568d06617171b0c85f508da" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" ] - ] + } ], - "timestamp": "2023-10-17T23:19:12.990284837" + "timestamp": "2024-01-17T17:30:39.816981" }, "test_cat_fastq_single_end_same_name": { "content": [ - [ - [ - { - "id": "test", - "single_end": true - }, - "test.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66" + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" ] - ] + } ], - "timestamp": "2023-10-17T23:19:31.554568147" + "timestamp": "2024-01-17T17:32:35.229332" }, "test_cat_fastq_single_end_single_file": { "content": [ - [ - [ - { - "id": "test", - "single_end": true - }, - "test.merged.fastq.gz:md5,e325ef7deb4023447a1f074e285761af" + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" ] - ] + } ], - "timestamp": "2023-10-17T23:19:49.629360033" + "timestamp": "2024-01-17T17:34:00.058829" }, "test_cat_fastq_paired_end_same_name": { "content": [ - [ - [ - { - "id": "test", - "single_end": false - }, + { + "0": [ [ - "test_1.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66", - "test_2.merged.fastq.gz:md5,fe9f266f43a6fc3dcab690a18419a56e" + { + "id": "test", + "single_end": false + }, + [ + 
"test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" ] - ] + } ], - "timestamp": "2023-10-17T23:19:40.711617539" + "timestamp": "2024-01-17T17:33:33.031555" }, "test_cat_fastq_paired_end": { "content": [ - [ - [ - { - "id": "test", - "single_end": false - }, + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ [ - "test_1.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d", - "test_2.merged.fastq.gz:md5,77c8e966e130d8c6b6ec9be52fcb2bda" + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" ] - ] + } ], - "timestamp": "2023-10-18T07:53:20.923560211" + "timestamp": "2024-01-17T17:32:02.270935" } } \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml index 9b3272b..b48ced2 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/environment.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::multiqc=1.19 + - bioconda::multiqc=1.20 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf 
b/modules/nf-core/custom/dumpsoftwareversions/main.nf index f218761..105f926 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -4,8 +4,8 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : - 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.20--pyhdfd78af_0' : + 'biocontainers/multiqc:1.20--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/eggnogmapper/eggnogmapper.diff b/modules/nf-core/eggnogmapper/eggnogmapper.diff new file mode 100644 index 0000000..b38223d --- /dev/null +++ b/modules/nf-core/eggnogmapper/eggnogmapper.diff @@ -0,0 +1,53 @@ +Changes in module 'nf-core/eggnogmapper' +--- modules/nf-core/eggnogmapper/meta.yml ++++ modules/nf-core/eggnogmapper/meta.yml +@@ -60,3 +60,6 @@ + pattern: "versions.yml" + authors: + - "@vagkaratzas" ++maintainers: ++ - "@vagkaratzas" ++ - "@gallvp" + +--- modules/nf-core/eggnogmapper/main.nf ++++ modules/nf-core/eggnogmapper/main.nf +@@ -1,6 +1,6 @@ + process EGGNOGMAPPER { + tag "$meta.id" +- label 'process_long' ++ label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +@@ -23,11 +23,13 @@ + task.ext.when == null || task.ext.when + + script: +- def args = task.ext.args ?: '' +- def prefix = task.ext.prefix ?: "${meta.id}" +- def is_compressed = fasta.name.endsWith(".gz") +- def fasta_name = fasta.name.replace(".gz", "") +- def dbmem = task.memory.toMega() > 40000 ? 
'--dbmem' : '' ++ def args = task.ext.args ?: '' ++ def prefix = task.ext.prefix ?: "${meta.id}" ++ def is_compressed = fasta.name.endsWith(".gz") ++ def fasta_name = fasta.name.replace(".gz", "") ++ def dbmem = task.memory.toMega() > 40000 ? '--dbmem' : '' ++ def database_arg = eggnog_db ? "--database $eggnog_db" : '' ++ def dmnd_db_arg = eggnog_diamond_db ? "--dmnd_db $eggnog_diamond_db" : '' + """ + if [ "$is_compressed" == "true" ]; then + gzip -c -d $fasta > $fasta_name +@@ -38,8 +40,8 @@ + -i ${fasta_name} \\ + --data_dir ${eggnog_data_dir} \\ + -m diamond \\ +- --dmnd_db ${eggnog_diamond_db} \\ +- --database ${eggnog_db} \\ ++ $dmnd_db_arg \\ ++ $database_arg \\ + --output ${prefix} \\ + ${dbmem} \\ + $args + +************************************************************ diff --git a/modules/nf-core/eggnogmapper/environment.yml b/modules/nf-core/eggnogmapper/environment.yml new file mode 100644 index 0000000..f4fb6fd --- /dev/null +++ b/modules/nf-core/eggnogmapper/environment.yml @@ -0,0 +1,7 @@ +name: eggnogmapper +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::eggnog-mapper=2.1.12 diff --git a/modules/nf-core/eggnogmapper/main.nf b/modules/nf-core/eggnogmapper/main.nf new file mode 100644 index 0000000..134451d --- /dev/null +++ b/modules/nf-core/eggnogmapper/main.nf @@ -0,0 +1,68 @@ +process EGGNOGMAPPER { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/eggnog-mapper:2.1.12--pyhdfd78af_0': + 'biocontainers/eggnog-mapper:2.1.12--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + path(eggnog_db) + path(eggnog_data_dir) + tuple val(meta2), path(eggnog_diamond_db) + + output: + tuple val(meta), path("*.emapper.annotations") , emit: annotations + tuple val(meta), path("*.emapper.seed_orthologs"), emit: orthologs + tuple val(meta), path("*.emapper.hits") , emit: hits + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.name.endsWith(".gz") + def fasta_name = fasta.name.replace(".gz", "") + def dbmem = task.memory.toMega() > 40000 ? '--dbmem' : '' + def database_arg = eggnog_db ? "--database $eggnog_db" : '' + def dmnd_db_arg = eggnog_diamond_db ? "--dmnd_db $eggnog_diamond_db" : '' + """ + if [ "$is_compressed" == "true" ]; then + gzip -c -d $fasta > $fasta_name + fi + + emapper.py \\ + --cpu ${task.cpus} \\ + -i ${fasta_name} \\ + --data_dir ${eggnog_data_dir} \\ + -m diamond \\ + $dmnd_db_arg \\ + $database_arg \\ + --output ${prefix} \\ + ${dbmem} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eggnog-mapper: \$(echo \$(emapper.py --version) | grep -o "emapper-[0-9]\\+\\.[0-9]\\+\\.[0-9]\\+" | sed "s/emapper-//") + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.emapper.annotations + touch ${prefix}.emapper.seed_orthologs + touch ${prefix}.emapper.hits + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eggnog-mapper: \$(echo \$(emapper.py --version) | grep -o "emapper-[0-9]\\+\\.[0-9]\\+\\.[0-9]\\+" | sed "s/emapper-//") + END_VERSIONS + """ +} diff --git a/modules/nf-core/eggnogmapper/meta.yml b/modules/nf-core/eggnogmapper/meta.yml new file mode 100644 index 0000000..b07c27e --- 
/dev/null +++ b/modules/nf-core/eggnogmapper/meta.yml @@ -0,0 +1,65 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "eggnogmapper" +description: Fast genome-wide functional annotation through orthology assignment. +keywords: + - annotation + - orthology + - genomics +tools: + - "eggnogmapper": + description: "Fast genome-wide functional annotation through orthology assignment." + homepage: "https://github.com/eggnogdb/eggnog-mapper" + documentation: "https://github.com/eggnogdb/eggnog-mapper/wiki" + tool_dev_url: "https://github.com/eggnogdb/eggnog-mapper" + doi: "10.1093/molbev/msab293" + licence: ["AGPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - fasta: + type: file + description: Database of sequences in FASTA format + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + - eggnog_db: + type: file + description: The eggnog database file (e.g. eggnog-mapper/data/eggnog.db) + pattern: "*.db" + - eggnog_data_dir: + type: directory + description: Directory containing eggnog database files (e.g. eggnog-mapper/data) + pattern: "*" + - eggnog_diamond_db: + type: file + description: The eggnog Diamond protein database file (e.g. eggnog-mapper/data/eggnog_proteins.dmnd) + pattern: "*.dmnd" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - annotations: + type: file + description: TSV with the results from the annotation phase + pattern: "*.emapper.annotations" + - orthologs: + type: file + description: TSV with the results from parsing the hits, linking queries with seed orthologs (with commented metadata) + pattern: "*.emapper.seed_orthologs" + - hits: + type: file + description: TSV with the results from the Diamond search phase + pattern: "*.emapper.hits" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@vagkaratzas" +maintainers: + - "@vagkaratzas" + - "@gallvp" diff --git a/modules/nf-core/eggnogmapper/tests/main.nf.test b/modules/nf-core/eggnogmapper/tests/main.nf.test new file mode 100644 index 0000000..fb707ea --- /dev/null +++ b/modules/nf-core/eggnogmapper/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process EGGNOGMAPPER" + script "../main.nf" + process "EGGNOGMAPPER" + tag "modules" + tag "modules_nfcore" + tag "eggnogmapper" + tag "diamond/makedb" + + test("Should search for protein annotations against the eggnogmapper db") { + + setup { + run("DIAMOND_MAKEDB") { + script "../../diamond/makedb/main.nf" + process { + """ + input[0] = [ [id:'test2'], file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true) ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [id:'test'], file(params.test_data['sarscov2']['genome']['proteome_fasta'], checkIfExists: true) ] + eggnog_db = file("https://github.com/nf-core/test-datasets/raw/eddf5b0e3336e0f93c81d4b4843b07257f9efaec/data/delete_me/eggnogmapper/eggnog.db", checkIfExists: true) + eggnog_db.copyTo("${workDir}/tmp/eggnog.db") + eggnog_data_dir = "${workDir}/tmp/" + input[1] = eggnog_db + input[2] = eggnog_data_dir + input[3] = DIAMOND_MAKEDB.out.db + """ + } + } + + then { + assertAll( + { assert 
process.success }, + { assert path(process.out.annotations.get(0).get(1)).readLines().contains("ENSSASP00005000002.1\tENSSASP00005000002.1\t0.0\t14179.0\tCOG0498@1|root,COG0498@2|Bacteria,1MUWQ@1224|Proteobacteria,2VHR6@28216|Betaproteobacteria,2KUMA@206389|Rhodocyclales\t1224|Proteobacteria\tE\tthreonine synthase\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-\t-") }, + { assert path(process.out.orthologs.get(0).get(1)).readLines().contains("ENSSASP00005000002.1\tENSSASP00005000002.1\t0.0\t14179.0\t1\t7096\t1\t7096\t100.0\t100.0\t100.0") }, + { assert snapshot(process.out.hits).match("hits") }, + { assert process.out.versions } + ) + } + + } + +} diff --git a/modules/nf-core/eggnogmapper/tests/main.nf.test.snap b/modules/nf-core/eggnogmapper/tests/main.nf.test.snap new file mode 100644 index 0000000..4e1c837 --- /dev/null +++ b/modules/nf-core/eggnogmapper/tests/main.nf.test.snap @@ -0,0 +1,15 @@ +{ + "hits": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.emapper.hits:md5,864b7a1f902893d8aee6621baeab7be8" + ] + ] + ], + "timestamp": "2023-11-08T20:43:50.173213923" + } +} \ No newline at end of file diff --git a/modules/nf-core/eggnogmapper/tests/tags.yml b/modules/nf-core/eggnogmapper/tests/tags.yml new file mode 100644 index 0000000..284ba2e --- /dev/null +++ b/modules/nf-core/eggnogmapper/tests/tags.yml @@ -0,0 +1,2 @@ +eggnogmapper: + - modules/nf-core/eggnogmapper/** diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index 2a3b679..4fc19b7 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -29,7 +29,7 @@ process FASTP { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" - def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? 
"--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--failed_out ${prefix}.paired.fail.fastq.gz --unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' // Added soft-links to original fastqs for consistent naming in MultiQC // Use single ended for interleaved. Add --interleaved_in in config. if ( task.ext.args?.contains('--interleaved_in') ) { diff --git a/modules/nf-core/fastp/tests/main.nf.test b/modules/nf-core/fastp/tests/main.nf.test index 17dce8a..6f1f489 100644 --- a/modules/nf-core/fastp/tests/main.nf.test +++ b/modules/nf-core/fastp/tests/main.nf.test @@ -19,11 +19,10 @@ nextflow_process { save_trimmed_fail = false save_merged = false - input[0] = [ + input[0] = Channel.of([ [ id:'test', single_end:true ], - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] - ] - + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) input[1] = adapter_fasta input[2] = save_trimmed_fail input[3] = save_merged @@ -68,9 +67,9 @@ nextflow_process { process.out.reads_fail.collect { file(it[1]).getName() } + process.out.reads_merged.collect { file(it[1]).getName() } ).sort() - ).match("test_fastp_single_end-for_stub_match") + ).match("test_fastp_single_end-_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("versions_single_end") } ) } } @@ -89,11 +88,10 @@ nextflow_process { save_trimmed_fail = false save_merged = false - input[0] = [ + input[0] = Channel.of([ [ id:'test', single_end:true ], - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] - ] - + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] 
+ ]) input[1] = adapter_fasta input[2] = save_trimmed_fail input[3] = save_merged @@ -118,7 +116,7 @@ nextflow_process { ).sort() ).match("test_fastp_single_end-for_stub_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("versions_single_end_stub") } ) } } @@ -135,12 +133,11 @@ nextflow_process { save_trimmed_fail = false save_merged = false - input[0] = [ + input[0] = Channel.of([ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] - ] - + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) input[1] = adapter_fasta input[2] = save_trimmed_fail input[3] = save_merged @@ -199,9 +196,9 @@ nextflow_process { process.out.reads_fail.collect { file(it[1]).getName() } + process.out.reads_merged.collect { file(it[1]).getName() } ).sort() - ).match("test_fastp_paired_end-for_stub_match") + ).match("test_fastp_paired_end_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("versions_paired_end") } ) } } @@ -220,12 +217,11 @@ nextflow_process { save_trimmed_fail = false save_merged = false - input[0] = [ + input[0] = Channel.of([ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] - ] - + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: 
true) ] + ]) input[1] = adapter_fasta input[2] = save_trimmed_fail input[3] = save_merged @@ -249,13 +245,14 @@ nextflow_process { ).sort() ).match("test_fastp_paired_end-for_stub_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("versions_paired_end-stub") } ) } } test("fastp test_fastp_interleaved") { - config './nextflow.config' + + config './nextflow.interleaved.config' when { params { outdir = "$outputDir" @@ -266,10 +263,10 @@ nextflow_process { save_trimmed_fail = false save_merged = false - input[0] = [ [ id:'test', single_end:true ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) ] - ] - + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) ] + ]) input[1] = adapter_fasta input[2] = save_trimmed_fail input[3] = save_merged @@ -281,7 +278,7 @@ nextflow_process { def html_text = [ "Q20 bases:25.719000 K (93.033098%)", "paired end (151 cycles + 151 cycles)"] def log_text = [ "Q20 bases: 12922(92.9841%)", - "reads passed filter: 198"] + "reads passed filter: 162"] def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE - { assert path(process.out.reads_fail.get(0).get(1).get(1)).linesGzip.contains(failed_read2_line) } + { assert path(process.out.reads_fail.get(0).get(1).get(2)).linesGzip.contains(failed_read2_line) } } }, { html_text.each { html_part -> @@ -503,7 +500,7 @@ nextflow_process { { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } } }, - { assert 
snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("versions_paired_end_trim_fail") } ) } } @@ -519,11 +516,11 @@ nextflow_process { adapter_fasta = [] save_trimmed_fail = false save_merged = true - - input[0] = [ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] - ] + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) input[1] = adapter_fasta input[2] = save_trimmed_fail input[3] = save_merged @@ -592,9 +589,9 @@ nextflow_process { process.out.reads_fail.collect { file(it[1]).getName() } + process.out.reads_merged.collect { file(it[1]).getName() } ).sort() - ).match("test_fastp_paired_end_merged-for_stub_match") + ).match("test_fastp_paired_end_merged_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("versions_paired_end_merged") } ) } } @@ -613,10 +610,11 @@ nextflow_process { save_trimmed_fail = false save_merged = true - input[0] = [ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] - ] + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) input[1] = adapter_fasta input[2] = 
save_trimmed_fail input[3] = save_merged @@ -640,7 +638,7 @@ nextflow_process { ).sort() ).match("test_fastp_paired_end_merged-for_stub_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("versions_paired_end_merged_stub") } ) } } @@ -653,14 +651,15 @@ nextflow_process { } process { """ - adapter_fasta = file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/fastp/adapters.fasta", checkIfExists: true) + adapter_fasta = Channel.of([ file(params.modules_testdata_base_path + 'delete_me/fastp/adapters.fasta', checkIfExists: true) ]) save_trimmed_fail = false save_merged = true - input[0] = [ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] - ] + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) input[1] = adapter_fasta input[2] = save_trimmed_fail input[3] = save_merged @@ -719,7 +718,7 @@ nextflow_process { { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } } }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("versions_paired_end_merged_adapterlist") } ) } } diff --git a/modules/nf-core/fastp/tests/main.nf.test.snap b/modules/nf-core/fastp/tests/main.nf.test.snap index 1b7d241..3e87628 100644 --- a/modules/nf-core/fastp/tests/main.nf.test.snap +++ b/modules/nf-core/fastp/tests/main.nf.test.snap @@ -1,5 +1,23 @@ { - "test_fastp_paired_end-for_stub_match": { + "fastp test_fastp_interleaved_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + 
}, + "test.fastp.json:md5,b24e0624df5cc0b11cd5ba21b726fb22" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T16:19:15.063001" + }, + "test_fastp_paired_end_merged-for_stub_match": { "content": [ [ [ @@ -9,12 +27,29 @@ "test.fastp.html", "test.fastp.json", "test.fastp.log", + "test.merged.fastq.gz", "{id=test, single_end=false}" ] ], - "timestamp": "2023-12-21T09:44:37.202512" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-17T18:10:13.467574" }, - "fastp test_fastp_interleaved_json": { + "versions_interleaved": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:56:24.615634793" + }, + "test_fastp_single_end_json": { "content": [ [ [ @@ -22,13 +57,64 @@ "id": "test", "single_end": true }, - "test.fastp.json:md5,168f516f7bd4b7b6c32da7cba87299a4" + "test.fastp.json:md5,c852d7a6dba5819e4ac8d9673bedcacc" ] ] ], - "timestamp": "2023-10-17T11:04:45.794175881" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T16:18:43.526412" }, - "test_fastp_paired_end_merged-for_stub_match": { + "versions_paired_end": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:55:42.333545689" + }, + "test_fastp_paired_end_match": { + "content": [ + [ + [ + "test_1.fastp.fastq.gz", + "test_2.fastp.fastq.gz" + ], + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=false}" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T12:03:06.431833729" + }, + "test_fastp_interleaved-_match": { + "content": [ + [ + "test.fastp.fastq.gz", + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=true}" + ] + ], + "meta": { + 
"nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T16:19:15.111894" + }, + "test_fastp_paired_end_merged_match": { "content": [ [ [ @@ -42,29 +128,102 @@ "{id=test, single_end=false}" ] ], - "timestamp": "2023-12-21T09:53:45.237014" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T12:08:44.496251446" }, - "test_fastp_single_end_json": { + "versions_single_end_stub": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:55:27.354051299" + }, + "versions_interleaved-stub": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:56:46.535528418" + }, + "versions_single_end_trim_fail": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:59:03.724591407" + }, + "test_fastp_paired_end-for_stub_match": { "content": [ [ [ - { - "id": "test", - "single_end": true - }, - "test.fastp.json:md5,c852d7a6dba5819e4ac8d9673bedcacc" - ] + "test_1.fastp.fastq.gz", + "test_2.fastp.fastq.gz" + ], + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=false}" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-17T18:07:15.398827" + }, + "versions_paired_end-stub": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" ] ], - "timestamp": "2023-10-17T11:04:10.566343705" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:56:06.50017282" }, - "versions": { + "versions_single_end": { "content": [ [ "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" ] ], - "timestamp": "2023-10-17T11:04:10.582076024" + "meta": { + "nf-test": 
"0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:55:07.67921647" + }, + "versions_paired_end_merged_stub": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:59:47.350653154" }, "test_fastp_interleaved-for_stub_match": { "content": [ @@ -76,7 +235,23 @@ "{id=test, single_end=true}" ] ], - "timestamp": "2023-12-21T09:48:43.148485" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-17T18:08:06.127974" + }, + "versions_paired_end_trim_fail": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:59:18.140484878" }, "test_fastp_single_end-for_stub_match": { "content": [ @@ -88,7 +263,51 @@ "{id=test, single_end=true}" ] ], - "timestamp": "2023-12-21T09:20:07.254788" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-17T18:06:00.244202" + }, + "test_fastp_single_end-_match": { + "content": [ + [ + "test.fastp.fastq.gz", + "test.fastp.html", + "test.fastp.json", + "test.fastp.log", + "{id=test, single_end=true}" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T16:18:43.580336" + }, + "versions_paired_end_merged_adapterlist": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T12:05:37.845370554" + }, + "versions_paired_end_merged": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T11:59:32.860543858" }, "test_fastp_single_end_trim_fail_json": { "content": [ @@ -102,6 +321,10 @@ ] ] ], - "timestamp": "2023-10-17T11:05:00.379878948" + "meta": { + 
"nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-17T18:08:41.942317" } } \ No newline at end of file diff --git a/modules/nf-core/fastp/tests/nextflow.interleaved.config b/modules/nf-core/fastp/tests/nextflow.interleaved.config new file mode 100644 index 0000000..4be8dbd --- /dev/null +++ b/modules/nf-core/fastp/tests/nextflow.interleaved.config @@ -0,0 +1,5 @@ +process { + withName: FASTP { + ext.args = "--interleaved_in -e 30" + } +} diff --git a/modules/nf-core/fastp/tests/nextflow.config b/modules/nf-core/fastp/tests/nextflow.save_failed.config similarity index 50% rename from modules/nf-core/fastp/tests/nextflow.config rename to modules/nf-core/fastp/tests/nextflow.save_failed.config index 0f7849a..53b61b0 100644 --- a/modules/nf-core/fastp/tests/nextflow.config +++ b/modules/nf-core/fastp/tests/nextflow.save_failed.config @@ -1,6 +1,5 @@ process { - withName: FASTP { - ext.args = "--interleaved_in" + ext.args = "-e 30" } } diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 9e19a74..1fd7ac4 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -25,6 +25,11 @@ process FASTQC { def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } def rename_to = old_new_pairs*.join(' ').join(' ') def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') + + def memory_in_mb = MemoryUnit.of("${task.memory}").toUnit('MB') + // FastQC memory value allowed range (100 - 10000) + def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 
100 : memory_in_mb) + """ printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index ad9bc54..70edae4 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -13,12 +13,10 @@ nextflow_process { when { process { """ - input[0] = [ - [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - ] + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) """ } } @@ -35,7 +33,7 @@ nextflow_process { { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("fastqc_versions_single") } ) } } @@ -44,15 +42,13 @@ nextflow_process { when { process { - """ - input[0] = [ - [id: 'test', single_end: false], // meta map - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) - ] - ] - """ + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + """ } } @@ -67,7 +63,7 @@ nextflow_process { { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - { assert 
snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("fastqc_versions_paired") } ) } } @@ -76,11 +72,11 @@ nextflow_process { when { process { - """ - input[0] = [ - [id: 'test', single_end: false], // meta map - file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) - ] + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + ]) """ } } @@ -93,7 +89,7 @@ nextflow_process { { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("fastqc_versions_interleaved") } ) } } @@ -102,12 +98,12 @@ nextflow_process { when { process { - """ - input[0] = [ - [id: 'test', single_end: false], // meta map - file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) - ] - """ + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ } } @@ -119,7 +115,7 @@ nextflow_process { { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("fastqc_versions_bam") } ) } } @@ -128,17 +124,15 @@ nextflow_process { when { process { - """ - input[0] = [ - [id: 'test', single_end: false], // meta map - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], 
checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) - ] - ] - """ + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] + ]) + """ } } @@ -159,7 +153,7 @@ nextflow_process { { assert path(process.out.html[0][1][2]).text.contains("File typeConventional base calls") }, { assert path(process.out.html[0][1][3]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("fastqc_versions_multiple") } ) } } @@ -168,12 +162,12 @@ nextflow_process { when { process { - """ - input[0] = [ - [ id:'mysample', single_end:true ], // meta map - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - """ + """ + input[0] = Channel.of([ + [ id:'mysample', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + """ } } @@ -185,7 +179,7 @@ nextflow_process { { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("fastqc_versions_custom_prefix") } ) } } @@ -197,12 +191,10 @@ nextflow_process { when { process { 
""" - input[0] = [ - [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] - ] + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) """ } } @@ -212,7 +204,7 @@ nextflow_process { { assert process.success }, { assert snapshot(process.out.html.collect { file(it[1]).getName() } + process.out.zip.collect { file(it[1]).getName() } + - process.out.versions ).match() } + process.out.versions ).match("fastqc_stub") } ) } } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap index 5ef5afb..86f7c31 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -1,5 +1,17 @@ { - "sarscov2 single-end [fastq] - stub": { + "fastqc_versions_interleaved": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:40:07.293713" + }, + "fastqc_stub": { "content": [ [ "test.html", @@ -7,14 +19,70 @@ "versions.yml:md5,e1cc25ca8af856014824abd842e93978" ] ], - "timestamp": "2023-12-29T02:48:05.126117287" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:31:01.425198" + }, + "fastqc_versions_multiple": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:40:55.797907" + }, + "fastqc_versions_bam": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:40:26.795862" + }, + "fastqc_versions_single": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] 
+ ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:39:27.043675" + }, + "fastqc_versions_paired": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:39:47.584191" }, - "versions": { + "fastqc_versions_custom_prefix": { "content": [ [ "versions.yml:md5,e1cc25ca8af856014824abd842e93978" ] ], - "timestamp": "2023-12-29T02:46:49.507942667" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:41:14.576531" } } \ No newline at end of file diff --git a/modules/nf-core/gffcompare/environment.yml b/modules/nf-core/gffcompare/environment.yml new file mode 100644 index 0000000..bcd633e --- /dev/null +++ b/modules/nf-core/gffcompare/environment.yml @@ -0,0 +1,7 @@ +name: gffcompare +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gffcompare=0.12.6 diff --git a/modules/nf-core/gffcompare/main.nf b/modules/nf-core/gffcompare/main.nf new file mode 100644 index 0000000..edca0f2 --- /dev/null +++ b/modules/nf-core/gffcompare/main.nf @@ -0,0 +1,63 @@ +process GFFCOMPARE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gffcompare:0.12.6--h9f5acd7_0' : + 'biocontainers/gffcompare:0.12.6--h9f5acd7_0' }" + + input: + tuple val(meta), path(gtfs) + tuple val(meta2), path(fasta), path(fai) + tuple val(meta3), path(reference_gtf) + + output: + tuple val(meta), path("*.annotated.gtf"), optional: true, emit: annotated_gtf + tuple val(meta), path("*.combined.gtf") , optional: true, emit: combined_gtf + tuple val(meta), path("*.tmap") , optional: true, emit: tmap + tuple val(meta), path("*.refmap") , optional: true, emit: refmap + tuple val(meta), path("*.loci") , emit: loci + tuple val(meta), path("*.stats") , emit: stats + tuple val(meta), path("*.tracking") , emit: tracking + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def ref_fasta = fasta ? "-s ${fasta}" : '' + def ref_gtf = reference_gtf ? "-r ${reference_gtf}" : '' + """ + gffcompare \\ + $args \\ + $ref_fasta \\ + $ref_gtf \\ + -o $prefix \\ + $gtfs + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffcompare: \$(echo \$(gffcompare --version 2>&1) | sed 's/^gffcompare v//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.annotated.gtf + touch ${prefix}.combined.gtf + touch ${prefix}.tmap + touch ${prefix}.refmap + touch ${prefix}.loci + touch ${prefix}.stats + touch ${prefix}.tracking + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffcompare: \$(echo \$(gffcompare --version 2>&1) | sed 's/^gffcompare v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gffcompare/meta.yml b/modules/nf-core/gffcompare/meta.yml new file mode 100644 index 0000000..674f08c --- /dev/null +++ b/modules/nf-core/gffcompare/meta.yml @@ -0,0 +1,91 @@ +name: "gffcompare" +description: Compare, merge, annotate and estimate accuracy of generated gtf files +keywords: + - transcripts + - gtf + - 
merge + - compare +tools: + - "gffcompare": + description: "GffCompare by Geo Pertea" + homepage: "http://ccb.jhu.edu/software/stringtie/gffcompare.shtml" + documentation: "http://ccb.jhu.edu/software/stringtie/gffcompare.shtml" + tool_dev_url: "https://github.com/gpertea/gffcompare" + doi: "10.12688/f1000research.23297.1" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. [ id:'test', single_end:false ] + - gtfs: + type: file + description: | + GTF/GFF files + e.g. [ 'file_1.gtf', 'file_2.gtf' ] + pattern: "*.{gtf,gff}" + - fasta: + type: file + description: Genome reference fasta file (optional) + pattern: "*.{fasta,fa}" + - fai: + type: file + description: Index for fasta file + pattern: "*.fai" + - reference_gtf: + type: file + description: Reference annotation in gtf/gff format (optional) + pattern: "*.{gtf,gff}" +output: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. [ id:'test', single_end:false ] + - annotated_gtf: + type: file + description: | + Annotated gtf file when reference gtf is provided (optional) + pattern: "*.annotated.gtf" + - combined_gtf: + type: file + description: | + Combined gtf file when multiple input files are + provided (optional) + pattern: "*.annotated.gtf" + - tmap: + type: file + description: | + File listing the most closely matching reference transcript + for each query transcript (optional) + pattern: "*.tmap" + - refmap: + type: file + description: | + File listing the reference transcripts with overlapping + query transcripts (optional) + pattern: "*.refmap" + - loci: + type: file + description: File with loci + pattern: "*.loci" + - stats: + type: file + description: | + File with stats for input transcripts as compared to + reference alternatively stats for the combined gtf + pattern: "*.stats" + - tracking: + type: file + description: | + This file matches transcripts up between samples + pattern: "*.tracking" + - versions: + type: file 
+ description: File containing software versions + pattern: "versions.yml" +authors: + - "@jemten" +maintainers: + - "@jemten" diff --git a/modules/nf-core/gffread/environment.yml b/modules/nf-core/gffread/environment.yml index 5398f71..c6df58a 100644 --- a/modules/nf-core/gffread/environment.yml +++ b/modules/nf-core/gffread/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::gffread=0.12.1 + - bioconda::gffread=0.12.7 diff --git a/modules/nf-core/gffread/gffread.diff b/modules/nf-core/gffread/gffread.diff new file mode 100644 index 0000000..fa3668c --- /dev/null +++ b/modules/nf-core/gffread/gffread.diff @@ -0,0 +1,675 @@ +Changes in module 'nf-core/gffread' +--- modules/nf-core/gffread/environment.yml ++++ modules/nf-core/gffread/environment.yml +@@ -4,4 +4,4 @@ + - bioconda + - defaults + dependencies: +- - bioconda::gffread=0.12.1 ++ - bioconda::gffread=0.12.7 + +--- modules/nf-core/gffread/meta.yml ++++ modules/nf-core/gffread/meta.yml +@@ -13,11 +13,25 @@ + doi: 10.12688/f1000research.23297.1 + licence: ["MIT"] + input: ++ - meta: ++ type: map ++ description: | ++ Groovy Map containing meta data ++ e.g. [ id:'test' ] + - gff: + type: file + description: A reference file in either the GFF3, GFF2 or GTF format. + pattern: "*.{gff, gtf}" ++ - fasta: ++ type: file ++ description: A multi-fasta file with the genomic sequences ++ pattern: "*.{fasta,fa,faa,fas,fsa}" + output: ++ - meta: ++ type: map ++ description: | ++ Groovy Map containing meta data ++ e.g. 
[ id:'test' ] + - gtf: + type: file + description: GTF file resulting from the conversion of the GFF input file if '-T' argument is present +@@ -25,7 +39,11 @@ + - gffread_gff: + type: file + description: GFF3 file resulting from the conversion of the GFF input file if '-T' argument is absent +- pattern: "*.{gff3}" ++ pattern: "*.gff3" ++ - gffread_fasta: ++ type: file ++ description: Fasta file produced when either of '-w', '-x', '-y' parameters is present ++ pattern: "*.fasta" + - versions: + type: file + description: File containing software versions +@@ -34,3 +52,4 @@ + - "@edmundmiller" + maintainers: + - "@edmundmiller" ++ - "@gallvp" + +--- modules/nf-core/gffread/main.nf ++++ modules/nf-core/gffread/main.nf +@@ -1,32 +1,59 @@ + process GFFREAD { +- tag "$gff" ++ tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +- 'https://depot.galaxyproject.org/singularity/gffread:0.12.1--h8b12597_0' : +- 'biocontainers/gffread:0.12.1--h8b12597_0' }" ++ 'https://depot.galaxyproject.org/singularity/gffread:0.12.7--hdcf5f25_4' : ++ 'biocontainers/gffread:0.12.7--hdcf5f25_4' }" + + input: +- path gff ++ tuple val(meta), path(gff) ++ path fasta + + output: +- path "*.gtf" , emit: gtf , optional: true +- path "*.gff3" , emit: gffread_gff , optional: true +- path "versions.yml" , emit: versions ++ tuple val(meta), path("*.gtf") , emit: gtf , optional: true ++ tuple val(meta), path("*.gff3") , emit: gffread_gff , optional: true ++ tuple val(meta), path("*.fasta"), emit: gffread_fasta , optional: true ++ path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: +- def args = task.ext.args ?: '' +- def prefix = task.ext.prefix ?: "${gff.baseName}" +- def extension = args.contains("-T") ? 
'gtf' : 'gffread.gff3' ++ def args = task.ext.args ?: '' ++ def prefix = task.ext.prefix ?: "${meta.id}" ++ def extension = args.contains("-T") ? 'gtf' : ( ( ['-w', '-x', '-y' ].any { args.contains(it) } ) ? 'fasta' : 'gff3' ) ++ def fasta_arg = fasta ? "-g $fasta" : '' ++ def output_name = "${prefix}.${extension}" ++ def output = extension == "fasta" ? "$output_name" : "-o $output_name" ++ def args_sorted = args.replaceAll(/(.*)(-[wxy])(.*)/) { all, pre, param, post -> "$pre $post $param" }.trim() ++ if ( "$output_name" in [ "$gff", "$fasta" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + gffread \\ + $gff \\ +- $args \\ +- -o ${prefix}.${extension} ++ $fasta_arg \\ ++ $args_sorted \\ ++ $output ++ ++ cat <<-END_VERSIONS > versions.yml ++ "${task.process}": ++ gffread: \$(gffread --version 2>&1) ++ END_VERSIONS ++ """ ++ ++ stub: ++ def args = task.ext.args ?: '' ++ def prefix = task.ext.prefix ?: "${meta.id}" ++ def extension = args.contains("-T") ? 'gtf' : ( ( ['-w', '-x', '-y' ].any { args.contains(it) } ) ? 'fasta' : 'gff3' ) ++ def fasta_arg = fasta ? "-g $fasta" : '' ++ def output_name = "${prefix}.${extension}" ++ def output = extension == "fasta" ? "$output_name" : "-o $output_name" ++ def args_sorted = args.replaceAll(/(.*)(-[wxy])(.*)/) { all, pre, param, post -> "$pre $post $param" }.trim() ++ if ( "$output_name" in [ "$gff", "$fasta" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
++ """ ++ touch $output_name ++ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + +--- modules/nf-core/gffread/tests/main.nf.test.snap ++++ modules/nf-core/gffread/tests/main.nf.test.snap +@@ -1,24 +1,272 @@ + { + "sarscov2-gff3-gtf": { + "content": [ +- [ +- "genome.gtf:md5,2394072d7d31530dfd590c4a117bf6e3" +- ], +- [ +- "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" +- ] +- ], +- "timestamp": "2024-01-23T20:00:32.688779117" ++ { ++ "0": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.gtf:md5,1ea0ae98d3388e0576407dc4a24ef428" ++ ] ++ ], ++ "1": [ ++ ++ ], ++ "2": [ ++ ++ ], ++ "3": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ], ++ "gffread_fasta": [ ++ ++ ], ++ "gffread_gff": [ ++ ++ ], ++ "gtf": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.gtf:md5,1ea0ae98d3388e0576407dc4a24ef428" ++ ] ++ ], ++ "versions": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ] ++ } ++ ], ++ "meta": { ++ "nf-test": "0.8.4", ++ "nextflow": "23.10.1" ++ }, ++ "timestamp": "2024-04-09T10:48:56.496187" + }, + "sarscov2-gff3-gff3": { + "content": [ +- [ +- "genome.gffread.gff3:md5,a7d40d99dcddac23ac673c473279ea2d" +- ], +- [ +- "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" +- ] +- ], +- "timestamp": "2024-01-23T20:07:11.457356625" ++ { ++ "0": [ ++ ++ ], ++ "1": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.gff3:md5,c4e5da6267c6bee5899a2c204ae1ad91" ++ ] ++ ], ++ "2": [ ++ ++ ], ++ "3": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ], ++ "gffread_fasta": [ ++ ++ ], ++ "gffread_gff": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.gff3:md5,c4e5da6267c6bee5899a2c204ae1ad91" ++ ] ++ ], ++ "gtf": [ ++ ++ ], ++ "versions": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ] ++ } ++ ], ++ "meta": { ++ "nf-test": "0.8.4", ++ "nextflow": "23.10.1" ++ }, ++ "timestamp": "2024-04-09T10:49:00.892782" ++ }, ++ "sarscov2-gff3-gtf-stub": { ++ "content": [ ++ { ++ "0": [ ++ [ ++ { ++ "id": "test" ++ }, ++ 
"test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" ++ ] ++ ], ++ "1": [ ++ ++ ], ++ "2": [ ++ ++ ], ++ "3": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ], ++ "gffread_fasta": [ ++ ++ ], ++ "gffread_gff": [ ++ ++ ], ++ "gtf": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" ++ ] ++ ], ++ "versions": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ] ++ } ++ ], ++ "meta": { ++ "nf-test": "0.8.4", ++ "nextflow": "23.10.1" ++ }, ++ "timestamp": "2024-04-09T11:11:26.975666" ++ }, ++ "sarscov2-gff3-fasta-stub": { ++ "content": [ ++ { ++ "0": [ ++ ++ ], ++ "1": [ ++ ++ ], ++ "2": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" ++ ] ++ ], ++ "3": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ], ++ "gffread_fasta": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" ++ ] ++ ], ++ "gffread_gff": [ ++ ++ ], ++ "gtf": [ ++ ++ ], ++ "versions": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ] ++ } ++ ], ++ "meta": { ++ "nf-test": "0.8.4", ++ "nextflow": "23.10.1" ++ }, ++ "timestamp": "2024-04-09T11:11:44.34792" ++ }, ++ "sarscov2-gff3-gff3-stub": { ++ "content": [ ++ { ++ "0": [ ++ ++ ], ++ "1": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" ++ ] ++ ], ++ "2": [ ++ ++ ], ++ "3": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ], ++ "gffread_fasta": [ ++ ++ ], ++ "gffread_gff": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" ++ ] ++ ], ++ "gtf": [ ++ ++ ], ++ "versions": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ] ++ } ++ ], ++ "meta": { ++ "nf-test": "0.8.4", ++ "nextflow": "23.10.1" ++ }, ++ "timestamp": "2024-04-09T11:11:35.221671" ++ }, ++ "sarscov2-gff3-fasta": { ++ "content": [ ++ { ++ "0": [ ++ ++ ], ++ "1": [ ++ ++ ], ++ "2": [ ++ [ ++ { ++ "id": "test" ++ }, ++ 
"test.fasta:md5,5f8108fb51739a0588ccf0a251de919a" ++ ] ++ ], ++ "3": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ], ++ "gffread_fasta": [ ++ [ ++ { ++ "id": "test" ++ }, ++ "test.fasta:md5,5f8108fb51739a0588ccf0a251de919a" ++ ] ++ ], ++ "gffread_gff": [ ++ ++ ], ++ "gtf": [ ++ ++ ], ++ "versions": [ ++ "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ++ ] ++ } ++ ], ++ "meta": { ++ "nf-test": "0.8.4", ++ "nextflow": "23.10.1" ++ }, ++ "timestamp": "2024-04-09T10:54:02.88143" + } + } +--- modules/nf-core/gffread/tests/main.nf.test ++++ modules/nf-core/gffread/tests/main.nf.test +@@ -18,47 +18,203 @@ + } + process { + """ +- input[0] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) +- """ +- } +- } +- +- then { +- assertAll ( +- { assert process.success }, +- { assert snapshot( +- process.out.gtf, +- process.out.versions +- ).match() }, ++ input[0] = [ ++ [id: 'test'], ++ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) ++ ] ++ input[1] = [] ++ """ ++ } ++ } ++ ++ then { ++ assertAll ( ++ { assert process.success }, ++ { assert snapshot(process.out).match() }, ++ { assert process.out.gffread_gff == [] }, ++ { assert process.out.gffread_fasta == [] } ++ ) ++ } ++ ++ } ++ ++ test("sarscov2-gff3-gtf-stub") { ++ ++ options '-stub' ++ config "./nextflow.config" ++ ++ when { ++ params { ++ outdir = "$outputDir" ++ } ++ process { ++ """ ++ input[0] = [ ++ [id: 'test'], ++ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) ++ ] ++ input[1] = [] ++ """ ++ } ++ } ++ ++ then { ++ assertAll ( ++ { assert process.success }, ++ { assert snapshot(process.out).match() }, ++ { assert process.out.gffread_gff == [] }, ++ { assert process.out.gffread_fasta == [] } ++ ) ++ } ++ ++ } ++ ++ test("sarscov2-gff3-gff3") { ++ ++ config "./nextflow-gff3.config" ++ ++ when { ++ params { ++ outdir = "$outputDir" ++ } ++ 
process { ++ """ ++ input[0] = [ ++ [id: 'test'], ++ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) ++ ] ++ input[1] = [] ++ """ ++ } ++ } ++ ++ then { ++ assertAll ( ++ { assert process.success }, ++ { assert snapshot(process.out).match() }, ++ { assert process.out.gtf == [] }, ++ { assert process.out.gffread_fasta == [] } ++ ) ++ } ++ ++ } ++ ++ test("sarscov2-gff3-gff3-stub") { ++ ++ options '-stub' ++ config "./nextflow-gff3.config" ++ ++ when { ++ params { ++ outdir = "$outputDir" ++ } ++ process { ++ """ ++ input[0] = [ ++ [id: 'test'], ++ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) ++ ] ++ input[1] = [] ++ """ ++ } ++ } ++ ++ then { ++ assertAll ( ++ { assert process.success }, ++ { assert snapshot(process.out).match() }, ++ { assert process.out.gtf == [] }, ++ { assert process.out.gffread_fasta == [] } ++ ) ++ } ++ ++ } ++ ++ test("sarscov2-gff3-fasta") { ++ ++ config "./nextflow-fasta.config" ++ ++ when { ++ params { ++ outdir = "$outputDir" ++ } ++ process { ++ """ ++ input[0] = [ ++ [id: 'test'], ++ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) ++ ] ++ input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ++ """ ++ } ++ } ++ ++ then { ++ assertAll ( ++ { assert process.success }, ++ { assert snapshot(process.out).match() }, ++ { assert process.out.gtf == [] }, + { assert process.out.gffread_gff == [] } + ) + } + + } + +- test("sarscov2-gff3-gff3") { +- +- config "./nextflow-gff3.config" +- +- when { +- params { +- outdir = "$outputDir" +- } +- process { +- """ +- input[0] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) +- """ +- } +- } +- +- then { +- assertAll ( +- { assert process.success }, +- { assert snapshot( +- process.out.gffread_gff, +- process.out.versions +- 
).match() }, +- { assert process.out.gtf == [] }, ++ test("sarscov2-gff3-fasta-stub") { ++ ++ options '-stub' ++ config "./nextflow-fasta.config" ++ ++ when { ++ params { ++ outdir = "$outputDir" ++ } ++ process { ++ """ ++ input[0] = [ ++ [id: 'test'], ++ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) ++ ] ++ input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ++ """ ++ } ++ } ++ ++ then { ++ assertAll ( ++ { assert process.success }, ++ { assert snapshot(process.out).match() }, ++ { assert process.out.gtf == [] }, ++ { assert process.out.gffread_gff == [] } ++ ) ++ } ++ ++ } ++ ++ test("sarscov2-gff3-fasta-fail-catch") { ++ ++ options '-stub' ++ config "./nextflow-fasta.config" ++ ++ when { ++ params { ++ outdir = "$outputDir" ++ } ++ process { ++ """ ++ input[0] = [ ++ [id: 'genome'], ++ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) ++ ] ++ input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ++ """ ++ } ++ } ++ ++ then { ++ assertAll ( ++ { assert ! process.success }, ++ { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + +--- /dev/null ++++ modules/nf-core/gffread/tests/nextflow-fasta.config +@@ -0,0 +1,5 @@ ++process { ++ withName: GFFREAD { ++ ext.args = '-w -S' ++ } ++} + +************************************************************ diff --git a/modules/nf-core/gffread/main.nf b/modules/nf-core/gffread/main.nf index d8a473e..cfd3e2f 100644 --- a/modules/nf-core/gffread/main.nf +++ b/modules/nf-core/gffread/main.nf @@ -1,32 +1,59 @@ process GFFREAD { - tag "$gff" + tag "$meta.id" label 'process_low' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/gffread:0.12.1--h8b12597_0' : - 'biocontainers/gffread:0.12.1--h8b12597_0' }" + 'https://depot.galaxyproject.org/singularity/gffread:0.12.7--hdcf5f25_4' : + 'biocontainers/gffread:0.12.7--hdcf5f25_4' }" input: - path gff + tuple val(meta), path(gff) + path fasta output: - path "*.gtf" , emit: gtf , optional: true - path "*.gff3" , emit: gffread_gff , optional: true - path "versions.yml" , emit: versions + tuple val(meta), path("*.gtf") , emit: gtf , optional: true + tuple val(meta), path("*.gff3") , emit: gffread_gff , optional: true + tuple val(meta), path("*.fasta"), emit: gffread_fasta , optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${gff.baseName}" - def extension = args.contains("-T") ? 'gtf' : 'gffread.gff3' + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("-T") ? 'gtf' : ( ( ['-w', '-x', '-y' ].any { args.contains(it) } ) ? 'fasta' : 'gff3' ) + def fasta_arg = fasta ? "-g $fasta" : '' + def output_name = "${prefix}.${extension}" + def output = extension == "fasta" ? "$output_name" : "-o $output_name" + def args_sorted = args.replaceAll(/(.*)(-[wxy])(.*)/) { all, pre, param, post -> "$pre $post $param" }.trim() + if ( "$output_name" in [ "$gff", "$fasta" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" """ gffread \\ $gff \\ - $args \\ - -o ${prefix}.${extension} + $fasta_arg \\ + $args_sorted \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("-T") ? 'gtf' : ( ( ['-w', '-x', '-y' ].any { args.contains(it) } ) ? 'fasta' : 'gff3' ) + def fasta_arg = fasta ? 
"-g $fasta" : '' + def output_name = "${prefix}.${extension}" + def output = extension == "fasta" ? "$output_name" : "-o $output_name" + def args_sorted = args.replaceAll(/(.*)(-[wxy])(.*)/) { all, pre, param, post -> "$pre $post $param" }.trim() + if ( "$output_name" in [ "$gff", "$fasta" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch $output_name + cat <<-END_VERSIONS > versions.yml "${task.process}": gffread: \$(gffread --version 2>&1) diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml index 27ac310..c060282 100644 --- a/modules/nf-core/gffread/meta.yml +++ b/modules/nf-core/gffread/meta.yml @@ -13,11 +13,25 @@ tools: doi: 10.12688/f1000research.23297.1 licence: ["MIT"] input: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. [ id:'test' ] - gff: type: file description: A reference file in either the GFF3, GFF2 or GTF format. pattern: "*.{gff, gtf}" + - fasta: + type: file + description: A multi-fasta file with the genomic sequences + pattern: "*.{fasta,fa,faa,fas,fsa}" output: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. 
[ id:'test' ] - gtf: type: file description: GTF file resulting from the conversion of the GFF input file if '-T' argument is present @@ -25,12 +39,17 @@ output: - gffread_gff: type: file description: GFF3 file resulting from the conversion of the GFF input file if '-T' argument is absent - pattern: "*.{gff3}" + pattern: "*.gff3" + - gffread_fasta: + type: file + description: Fasta file produced when either of '-w', '-x', '-y' parameters is present + pattern: "*.fasta" - versions: type: file description: File containing software versions pattern: "versions.yml" authors: - - "@emiller88" + - "@edmundmiller" maintainers: - - "@emiller88" + - "@edmundmiller" + - "@gallvp" diff --git a/modules/nf-core/gffread/tests/main.nf.test b/modules/nf-core/gffread/tests/main.nf.test index 3c064b3..4cd13dc 100644 --- a/modules/nf-core/gffread/tests/main.nf.test +++ b/modules/nf-core/gffread/tests/main.nf.test @@ -18,17 +18,52 @@ nextflow_process { } process { """ - input[0] = file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] """ } } then { assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() }, - { assert process.out.gtf != null }, - { assert process.out.gffread_gff == [] } + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gffread_gff == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-gtf-stub") { + + options '-stub' + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { 
assert process.out.gffread_gff == [] }, + { assert process.out.gffread_fasta == [] } ) } @@ -36,23 +71,150 @@ nextflow_process { test("sarscov2-gff3-gff3") { + config "./nextflow-gff3.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-gff3-stub") { + + options '-stub' + config "./nextflow-gff3.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-fasta") { + + config "./nextflow-fasta.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-fasta-stub") { + + options '-stub' + config "./nextflow-fasta.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + 
"genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-fasta-fail-catch") { + + options '-stub' + config "./nextflow-fasta.config" + when { params { outdir = "$outputDir" } process { """ - input[0] = file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + input[0] = [ + [id: 'genome'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) """ } } then { assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() }, - { assert process.out.gtf == [] }, - { assert process.out.gffread_gff != null }, + { assert ! 
process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } ) } diff --git a/modules/nf-core/gffread/tests/main.nf.test.snap b/modules/nf-core/gffread/tests/main.nf.test.snap index 1f1342e..1526232 100644 --- a/modules/nf-core/gffread/tests/main.nf.test.snap +++ b/modules/nf-core/gffread/tests/main.nf.test.snap @@ -3,26 +3,46 @@ "content": [ { "0": [ - "genome.gtf:md5,2394072d7d31530dfd590c4a117bf6e3" + [ + { + "id": "test" + }, + "test.gtf:md5,1ea0ae98d3388e0576407dc4a24ef428" + ] ], "1": [ ], "2": [ - "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + ], "gffread_gff": [ ], "gtf": [ - "genome.gtf:md5,2394072d7d31530dfd590c4a117bf6e3" + [ + { + "id": "test" + }, + "test.gtf:md5,1ea0ae98d3388e0576407dc4a24ef428" + ] ], "versions": [ - "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ] } ], - "timestamp": "2023-11-29T15:39:30.006985" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:48:56.496187" }, "sarscov2-gff3-gff3": { "content": [ @@ -31,22 +51,222 @@ ], "1": [ - "genome.gffread.gff3:md5,a7d40d99dcddac23ac673c473279ea2d" + [ + { + "id": "test" + }, + "test.gff3:md5,c4e5da6267c6bee5899a2c204ae1ad91" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + [ + { + "id": "test" + }, + "test.gff3:md5,c4e5da6267c6bee5899a2c204ae1ad91" + ] + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:49:00.892782" + }, + "sarscov2-gff3-gtf-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + 
"versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:26.975666" + }, + "sarscov2-gff3-fasta-stub": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gffread_gff": [ + + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:44.34792" + }, + "sarscov2-gff3-gff3-stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:35.221671" + }, + "sarscov2-gff3-fasta": { + "content": [ + { + "0": [ + + ], + "1": [ + ], "2": [ - "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + [ + { + "id": "test" + }, + "test.fasta:md5,5f8108fb51739a0588ccf0a251de919a" + ] + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + [ + { + "id": "test" + }, + "test.fasta:md5,5f8108fb51739a0588ccf0a251de919a" + ] ], 
"gffread_gff": [ - "genome.gffread.gff3:md5,a7d40d99dcddac23ac673c473279ea2d" + ], "gtf": [ ], "versions": [ - "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" ] } ], - "timestamp": "2023-11-29T15:39:34.636061" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:54:02.88143" } } \ No newline at end of file diff --git a/modules/nf-core/gffread/tests/nextflow-fasta.config b/modules/nf-core/gffread/tests/nextflow-fasta.config new file mode 100644 index 0000000..ac6cb14 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow-fasta.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '-w -S' + } +} diff --git a/modules/nf-core/gffread/tests/nextflow-gff3.config b/modules/nf-core/gffread/tests/nextflow-gff3.config new file mode 100644 index 0000000..afe0830 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow-gff3.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '' + } +} diff --git a/modules/nf-core/gt/gff3/environment.yml b/modules/nf-core/gt/gff3/environment.yml new file mode 100644 index 0000000..8289fb3 --- /dev/null +++ b/modules/nf-core/gt/gff3/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "gt_gff3" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::genometools-genometools=1.6.5" diff --git a/modules/nf-core/gt/gff3/main.nf b/modules/nf-core/gt/gff3/main.nf new file mode 100644 index 0000000..6324a39 --- /dev/null +++ b/modules/nf-core/gt/gff3/main.nf @@ -0,0 +1,51 @@ +process GT_GFF3 { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/genometools-genometools:1.6.5--py310h3db02ab_0': + 'biocontainers/genometools-genometools:1.6.5--py310h3db02ab_0' }" + + input: + tuple val(meta), path(gff3) + + output: + tuple val(meta), path("*.gt.gff3") , emit: gt_gff3 , optional: true + tuple val(meta), path("*.error.log"), emit: error_log , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gt \\ + gff3 \\ + $args \\ + "$gff3" \\ + > "${prefix}.gt.gff3" \\ + 2> >(tee "${prefix}.error.log" >&2) \\ + || echo "Errors from gt-gff3 printed to ${prefix}.error.log" + + if grep -q "gt gff3: error:" "${prefix}.error.log"; then + echo "gt-gff3 failed to parse $gff3" + + rm \\ + "${prefix}.gt.gff3" + else + echo "gt-gff3 successfully parsed $gff3" + + mv \\ + "${prefix}.error.log" \\ + gt_gff3.stderr + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genometools: \$(gt --version | head -1 | sed 's/gt (GenomeTools) //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gt/gff3/meta.yml b/modules/nf-core/gt/gff3/meta.yml new file mode 100644 index 0000000..5cecd8d --- /dev/null +++ b/modules/nf-core/gt/gff3/meta.yml @@ -0,0 +1,48 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "gt_gff3" +description: "GenomeTools gt-gff3 utility to parse, possibly transform, and output GFF3 files" +keywords: + - genome + - gff3 + - annotation +tools: + - "gt": + description: "The GenomeTools genome analysis system" + homepage: "https://genometools.org/index.html" + documentation: "https://genometools.org/documentation.html" + tool_dev_url: "https://github.com/genometools/genometools" + doi: "10.1109/TCBB.2013.68" + licence: ["ISC"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test' ]` + - gff3: + type: file + description: Input gff3 file + pattern: "*.{gff,gff3}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - gt_gff3: + type: file + description: Parsed gff3 file produced only if there is no parsing error + pattern: "*.gt.gff3" + - error_log: + type: file + description: Error log if gt-gff3 failed to parse the input gff3 file + pattern: "*.error.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@gallvp" +maintainers: + - "@gallvp" diff --git a/modules/nf-core/gt/gff3/tests/main.nf.test b/modules/nf-core/gt/gff3/tests/main.nf.test new file mode 100644 index 0000000..cb44bc8 --- /dev/null +++ b/modules/nf-core/gt/gff3/tests/main.nf.test @@ -0,0 +1,61 @@ +nextflow_process { + + name "Test Process GT_GFF3" + script "../main.nf" + process "GT_GFF3" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "gt" + tag "gt/gff3" + + test("sarscov2-gff3-valid") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gt_gff3 != null }, + { assert process.out.error_log == [] } + ) + } + + } + + test("sarscov2-gff3-invalid") { + when { + process { + """ + input[0] = Channel.of( + '##gff-version 3', + 'chr22\tID=gene:ENSG00000233995;Name=AP000547.1' + ) + .collectFile(name: 'sample.gff3', newLine: true) + .map { file -> [ [ id:'test' ], file ] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gt_gff3 == [] }, + { assert process.out.error_log != null }, + { assert path(process.out.error_log.get(0).get(1)).getText().contains("gt gff3: error:") } + ) + } + } + +} diff 
--git a/modules/nf-core/gt/gff3/tests/main.nf.test.snap b/modules/nf-core/gt/gff3/tests/main.nf.test.snap new file mode 100644 index 0000000..6e454f1 --- /dev/null +++ b/modules/nf-core/gt/gff3/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2-gff3-invalid": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.error.log:md5,31e6117c516f936ec403f792c732bc76" + ] + ], + "2": [ + "versions.yml:md5,9753770dd19a2a306dcf16d4aaf049eb" + ], + "error_log": [ + [ + { + "id": "test" + }, + "test.error.log:md5,31e6117c516f936ec403f792c732bc76" + ] + ], + "gt_gff3": [ + + ], + "versions": [ + "versions.yml:md5,9753770dd19a2a306dcf16d4aaf049eb" + ] + } + ], + "timestamp": "2023-11-29T10:42:11.408352" + }, + "sarscov2-gff3-valid": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gt.gff3:md5,2ae900237ace415557b8735fac088b85" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,9753770dd19a2a306dcf16d4aaf049eb" + ], + "error_log": [ + + ], + "gt_gff3": [ + [ + { + "id": "test" + }, + "test.gt.gff3:md5,2ae900237ace415557b8735fac088b85" + ] + ], + "versions": [ + "versions.yml:md5,9753770dd19a2a306dcf16d4aaf049eb" + ] + } + ], + "timestamp": "2023-11-29T10:42:07.817894" + } +} \ No newline at end of file diff --git a/modules/nf-core/gt/gff3/tests/nextflow.config b/modules/nf-core/gt/gff3/tests/nextflow.config new file mode 100644 index 0000000..af56226 --- /dev/null +++ b/modules/nf-core/gt/gff3/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = '-tidy -retainids' +} diff --git a/modules/nf-core/gt/gff3/tests/tags.yml b/modules/nf-core/gt/gff3/tests/tags.yml new file mode 100644 index 0000000..0ce15a9 --- /dev/null +++ b/modules/nf-core/gt/gff3/tests/tags.yml @@ -0,0 +1,2 @@ +gt/gff3: + - "modules/nf-core/gt/gff3/**" diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test index d031792..6406008 100644 --- a/modules/nf-core/gunzip/tests/main.nf.test +++ 
b/modules/nf-core/gunzip/tests/main.nf.test @@ -15,10 +15,11 @@ nextflow_process { } process { """ - input[0] = [ - [], - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] + input[0] = Channel.of([ + [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) """ } } diff --git a/modules/pfr/liftoff/environment.yml b/modules/nf-core/liftoff/environment.yml similarity index 100% rename from modules/pfr/liftoff/environment.yml rename to modules/nf-core/liftoff/environment.yml diff --git a/modules/pfr/liftoff/main.nf b/modules/nf-core/liftoff/main.nf similarity index 85% rename from modules/pfr/liftoff/main.nf rename to modules/nf-core/liftoff/main.nf index 317eca1..4db5da2 100644 --- a/modules/pfr/liftoff/main.nf +++ b/modules/nf-core/liftoff/main.nf @@ -11,6 +11,7 @@ process LIFTOFF { tuple val(meta), path(target_fa) path ref_fa, name: 'ref_assembly.fa' path ref_annotation + path ref_db output: tuple val(meta), path("${prefix}.gff3") , emit: gff3 @@ -22,11 +23,14 @@ process LIFTOFF { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + def arg_g = ref_annotation ? "-g $ref_annotation" : '' + def arg_db = ref_db ? 
"-db $ref_db" : '' + prefix = task.ext.prefix ?: "${meta.id}" """ liftoff \\ - -g $ref_annotation \\ + $arg_g \\ + $arg_db \\ -p $task.cpus \\ -o "${prefix}.gff3" \\ -u "${prefix}.unmapped.txt" \\ diff --git a/modules/pfr/liftoff/meta.yml b/modules/nf-core/liftoff/meta.yml similarity index 91% rename from modules/pfr/liftoff/meta.yml rename to modules/nf-core/liftoff/meta.yml index 46b3c58..10e502c 100644 --- a/modules/pfr/liftoff/meta.yml +++ b/modules/nf-core/liftoff/meta.yml @@ -38,6 +38,11 @@ input: type: file description: Reference assembly annotations in gtf or gff3 format pattern: "*.{gtf,gff3}" + - ref_db: + type: file + description: | + Name of feature database; if not specified, the -g argument must + be provided and a database will be built automatically output: - meta: type: map diff --git a/modules/pfr/liftoff/tests/main.nf.test b/modules/nf-core/liftoff/tests/main.nf.test similarity index 70% rename from modules/pfr/liftoff/tests/main.nf.test rename to modules/nf-core/liftoff/tests/main.nf.test index 272c882..fc0f567 100644 --- a/modules/pfr/liftoff/tests/main.nf.test +++ b/modules/nf-core/liftoff/tests/main.nf.test @@ -9,6 +9,7 @@ nextflow_process { tag "modules_nfcore" tag "nf-core/gunzip" tag "liftoff" + tag "gunzip" test("homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf") { @@ -38,6 +39,7 @@ nextflow_process { input[2] = [ file(params.test_data['homo_sapiens']['genome']['genome_1_gtf'], checkIfExists: true) ] + input[3] = [] """ } } @@ -48,16 +50,6 @@ nextflow_process { { assert snapshot(process.out.unmapped_txt).match("unmapped_txt") }, { assert file(process.out.gff3[0][1]).text.contains("chr21\tLiftoff\texon\t34608061\t34608118\t.\t+\t.") }, { assert file(process.out.polished_gff3[0][1]).text.contains("chr21\tLiftoff\texon\t34608061\t34608118\t.\t+\t.") }, - { - assert snapshot( - ( - [process.out.gff3[0][0].toString()] + // meta - process.out.gff3.collect { file(it[1]).getName() } + - process.out.polished_gff3.collect { 
file(it[1]).getName() } + - process.out.unmapped_txt.collect { file(it[1]).getName() } - ).sort() - ).match("homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf-for_stub_match") - }, { assert snapshot(process.out.versions).match("versions") } ) } @@ -93,6 +85,7 @@ nextflow_process { input[2] = [ file(params.test_data['homo_sapiens']['genome']['genome_1_gtf'], checkIfExists: true) ] + input[3] = [] """ } } @@ -100,17 +93,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { - assert snapshot( - ( - [process.out.gff3[0][0].toString()] + // meta - process.out.gff3.collect { file(it[1]).getName() } + - process.out.polished_gff3.collect { file(it[1]).getName() } + - process.out.unmapped_txt.collect { file(it[1]).getName() } - ).sort() - ).match("homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf-for_stub_match") - }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out).match() } ) } diff --git a/modules/nf-core/liftoff/tests/main.nf.test.snap b/modules/nf-core/liftoff/tests/main.nf.test.snap new file mode 100644 index 0000000..f606446 --- /dev/null +++ b/modules/nf-core/liftoff/tests/main.nf.test.snap @@ -0,0 +1,96 @@ +{ + "unmapped_txt": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.unmapped.txt:md5,7391d10df6e15db356b084c9af5259e4" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2023-12-01T13:57:40.748507" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,205d9c609e7fe27d8199550d842bdce8" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2023-12-01T13:57:40.752414" + }, + "homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.polished.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + 
"test.unmapped.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,205d9c609e7fe27d8199550d842bdce8" + ], + "gff3": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "polished_gff3": [ + [ + { + "id": "test" + }, + "test.polished.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "unmapped_txt": [ + [ + { + "id": "test" + }, + "test.unmapped.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,205d9c609e7fe27d8199550d842bdce8" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:15:25.661428" + } +} \ No newline at end of file diff --git a/modules/nf-core/liftoff/tests/nextflow.config b/modules/nf-core/liftoff/tests/nextflow.config new file mode 100644 index 0000000..f35ef80 --- /dev/null +++ b/modules/nf-core/liftoff/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: LIFTOFF { + ext.args = '-polish' + } +} diff --git a/modules/nf-core/liftoff/tests/tags.yml b/modules/nf-core/liftoff/tests/tags.yml new file mode 100644 index 0000000..4d0adb6 --- /dev/null +++ b/modules/nf-core/liftoff/tests/tags.yml @@ -0,0 +1,2 @@ +liftoff: + - "modules/nf-core/liftoff/**" diff --git a/modules/nf-core/samtools/cat/environment.yml b/modules/nf-core/samtools/cat/environment.yml index 0455a7d..75f10f7 100644 --- a/modules/nf-core/samtools/cat/environment.yml +++ b/modules/nf-core/samtools/cat/environment.yml @@ -4,4 +4,5 @@ channels: - bioconda - defaults dependencies: - - bioconda::samtools=1.18 + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/cat/main.nf b/modules/nf-core/samtools/cat/main.nf index b3b2508..06615aa 100644 --- a/modules/nf-core/samtools/cat/main.nf +++ b/modules/nf-core/samtools/cat/main.nf @@ -4,8 +4,8 @@ process SAMTOOLS_CAT { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : - 'biocontainers/samtools:1.18--h50ea8bc_1' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" input: tuple val(meta), path(input_files, stageAs: "?/*") diff --git a/modules/nf-core/samtools/cat/tests/main.nf.test b/modules/nf-core/samtools/cat/tests/main.nf.test index 49c633f..dad80b8 100644 --- a/modules/nf-core/samtools/cat/tests/main.nf.test +++ b/modules/nf-core/samtools/cat/tests/main.nf.test @@ -9,18 +9,16 @@ nextflow_process { tag "samtools" tag "samtools/cat" - test("sarscov2 - [bam1, bam2]") { + test("bams") { when { process { """ - input[0] = [ - [ id:'test', single_end:false ], // meta map - [ - file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_unaligned_bam'], checkIfExists: true) - ] - ] + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.unaligned.bam', checkIfExists: true) ] + ]) """ } } @@ -28,30 +26,25 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot( - file(process.out.bam[0][1]).name, - process.out.cram, - process.out.versions - ).match() } + { assert snapshot(file(process.out.bam[0][1]).name).match("bams_bam") }, + { assert snapshot(process.out.cram).match("bams_cram") }, + { assert snapshot(process.out.versions).match("bams_versions") } ) } - } - test("sarscov2 - [bam1, bam2] - stub") { + test("bams_stub") { options "-stub" when { process { """ - input[0] = [ - [ id:'test', single_end:false ], // meta map - [ - file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: 
true), - file(params.test_data['sarscov2']['illumina']['test_unaligned_bam'], checkIfExists: true) - ] - ] + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.unaligned.bam', checkIfExists: true) ] + ]) """ } } @@ -59,14 +52,10 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot( - file(process.out.bam[0][1]).name, - process.out.cram, - process.out.versions - ).match() } + { assert snapshot(file(process.out.bam[0][1]).name).match("bams_stub_bam") }, + { assert snapshot(process.out.cram).match("bams_stub_cram") }, + { assert snapshot(process.out.versions).match("bams_stub_versions") } ) } - } - } diff --git a/modules/nf-core/samtools/cat/tests/main.nf.test.snap b/modules/nf-core/samtools/cat/tests/main.nf.test.snap index 298e25d..f99cdd6 100644 --- a/modules/nf-core/samtools/cat/tests/main.nf.test.snap +++ b/modules/nf-core/samtools/cat/tests/main.nf.test.snap @@ -1,26 +1,70 @@ { - "sarscov2 - [bam1, bam2]": { + "bams_stub_cram": { "content": [ - "test.bam", [ - ], + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-02T16:45:42.587418" + }, + "bams_stub_versions": { + "content": [ [ - "versions.yml:md5,f10a4f6b2e0272bef2ceb4ca826a15a1" + "versions.yml:md5,e214a92343158372aa79dabe0fb0064a" ] ], - "timestamp": "2023-12-04T14:00:18.264348819" + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:15:40.92408626" + }, + "bams_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-02T16:45:37.965199" }, - "sarscov2 - [bam1, bam2] - stub": { + "bams_cram": { "content": [ - "test.bam", [ - ], + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + 
"timestamp": "2024-02-02T16:45:37.96805" + }, + "bams_stub_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-02T16:45:42.583881" + }, + "bams_versions": { + "content": [ [ - "versions.yml:md5,f10a4f6b2e0272bef2ceb4ca826a15a1" + "versions.yml:md5,e214a92343158372aa79dabe0fb0064a" ] ], - "timestamp": "2023-12-04T14:03:17.714482742" + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:15:33.224336325" } } \ No newline at end of file diff --git a/modules/nf-core/sortmerna/main.nf b/modules/nf-core/sortmerna/main.nf index 29c640c..7c17e50 100644 --- a/modules/nf-core/sortmerna/main.nf +++ b/modules/nf-core/sortmerna/main.nf @@ -9,88 +9,102 @@ process SORTMERNA { input: tuple val(meta), path(reads) - path fastas + tuple val(meta2), path(fastas) + tuple val(meta3), path(index) output: - tuple val(meta), path("*non_rRNA.fastq.gz"), emit: reads - tuple val(meta), path("*.log") , emit: log - path "versions.yml" , emit: versions + tuple val(meta), path("*non_rRNA.fastq.gz"), emit: reads, optional: true + tuple val(meta), path("*.log") , emit: log, optional: true + tuple val(meta2), path("idx") , emit: index, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - if (meta.single_end) { - """ - sortmerna \\ - ${'--ref '+fastas.join(' --ref ')} \\ - --reads $reads \\ - --threads $task.cpus \\ - --workdir . \\ - --aligned rRNA_reads \\ - --fastx \\ - --other non_rRNA_reads \\ - $args + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" - mv non_rRNA_reads.f*q.gz ${prefix}.non_rRNA.fastq.gz - mv rRNA_reads.log ${prefix}.sortmerna.log + def index_only = args.contains('--index 1')? true : false + def skip_index = args.contains('--index 0')? 
true : false + def paired_end = reads instanceof List + def paired_cmd = '' + def reads_args = '' + def out2_cmd = '' + def mv_cmd = '' + def reads_input = '' + def refs_input = '' - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') - END_VERSIONS - """ - } else { - """ - sortmerna \\ - ${'--ref '+fastas.join(' --ref ')} \\ - --reads ${reads[0]} \\ - --reads ${reads[1]} \\ - --threads $task.cpus \\ - --workdir . \\ - --aligned rRNA_reads \\ - --fastx \\ - --other non_rRNA_reads \\ - --paired_in \\ - --out2 \\ - $args + if (! index_only){ + reads_args = '--aligned rRNA_reads --fastx --other non_rRNA_reads' + reads_input = paired_end ? reads.collect{"--reads $it"}.join(' ') : "--reads $reads" + def n_fastq = paired_end ? reads.size() : 1 + if ( n_fastq == 1 ) { + mv_cmd = """ + mv non_rRNA_reads.f*q.gz ${prefix}.non_rRNA.fastq.gz + mv rRNA_reads.log ${prefix}.sortmerna.log + """ + } else { + mv_cmd = """ + mv non_rRNA_reads_fwd.f*q.gz ${prefix}_1.non_rRNA.fastq.gz + mv non_rRNA_reads_rev.f*q.gz ${prefix}_2.non_rRNA.fastq.gz + mv rRNA_reads.log ${prefix}.sortmerna.log + """ + paired_cmd = "--paired_in" + out2_cmd = "--out2" + } + } + """ + sortmerna \\ + ${'--ref '+fastas.join(' --ref ')} \\ + $refs_input \\ + $reads_input \\ + --threads $task.cpus \\ + --workdir . 
\\ + $reads_args \\ + $paired_cmd \\ + $out2_cmd \\ + $args - mv non_rRNA_reads_fwd.f*q.gz ${prefix}_1.non_rRNA.fastq.gz - mv non_rRNA_reads_rev.f*q.gz ${prefix}_2.non_rRNA.fastq.gz - mv rRNA_reads.log ${prefix}.sortmerna.log + $mv_cmd - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') - END_VERSIONS - """ - } + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') + END_VERSIONS + """ stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - if (meta.single_end) { - """ - touch ${prefix}.non_rRNA.fastq.gz - touch ${prefix}.sortmerna.log + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') - END_VERSIONS - """ - } else { - """ - touch ${prefix}_1.non_rRNA.fastq.gz - touch ${prefix}_2.non_rRNA.fastq.gz - touch ${prefix}.sortmerna.log + def index_only = args.contains('--index 1')? true : false + def paired_end = reads instanceof List + def paired_cmd = '' + def out2_cmd = '' + def mv_cmd = '' + def reads_input = '' - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') - END_VERSIONS - """ + if (! index_only){ + reads_input = paired_end ? reads.collect{"--reads $it"}.join(' ') : "--reads $reads" + def n_fastq = paired_end ? 
reads.size() : 1 + if ( n_fastq == 1 ) { + mv_cmd = "touch ${prefix}.non_rRNA.fastq.gz" + } else { + mv_cmd = """ + touch ${prefix}_1.non_rRNA.fastq.gz + touch ${prefix}_2.non_rRNA.fastq.gz + """ + } } + """ + $mv_cmd + mkdir -p idx + touch ${prefix}.sortmerna.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sortmerna: \$(echo \$(sortmerna --version 2>&1) | sed 's/^.*SortMeRNA version //; s/ Build Date.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/sortmerna/meta.yml b/modules/nf-core/sortmerna/meta.yml index de0b18e..c0a2a58 100644 --- a/modules/nf-core/sortmerna/meta.yml +++ b/modules/nf-core/sortmerna/meta.yml @@ -23,16 +23,31 @@ input: description: | List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] - fastas: type: file description: | Path to reference file(s) + - meta3: + type: map + description: | + Groovy Map containing index information + e.g. [ id:'test' ] + - index: + type: directory + description: | + Path to index directory of a previous sortmerna run output: - meta: type: map description: | Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] + e.g. [ id:'test', single_end:false ], or reference information from an + indexing-only run - reads: type: file description: The filtered fastq reads @@ -41,6 +56,15 @@ output: type: file description: SortMeRNA log file pattern: "*sortmerna.log" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - index: + type: directory + description: | + Path to index directory generated by sortmerna - versions: type: file description: File containing software versions diff --git a/modules/nf-core/sortmerna/tests/indexing_only.config b/modules/nf-core/sortmerna/tests/indexing_only.config new file mode 100644 index 0000000..3e74a32 --- /dev/null +++ b/modules/nf-core/sortmerna/tests/indexing_only.config @@ -0,0 +1,5 @@ +process { + withName: 'SORTMERNA' { + ext.args = '--index 1' + } +} diff --git a/modules/nf-core/sortmerna/tests/main.nf.test b/modules/nf-core/sortmerna/tests/main.nf.test index 8a01e2a..73bc119 100644 --- a/modules/nf-core/sortmerna/tests/main.nf.test +++ b/modules/nf-core/sortmerna/tests/main.nf.test @@ -7,15 +7,73 @@ nextflow_process { tag "modules_nfcore" tag "sortmerna" + test("sarscov2 indexing only") { + + config './indexing_only.config' + + when { + process { + """ + input[0] = Channel.of([[],[]]) + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = Channel.of([[],[]]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert ! process.out.reads }, + { assert snapshot(process.out.index).match("index_index_only") }, + { assert snapshot(process.out.versions).match("versions_index_only") } + ) + } + + } + + test("sarscov2 indexing only stub") { + + options '-stub' + config './indexing_only.config' + + when { + process { + """ + input[0] = Channel.of([[],[]]) + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = Channel.of([[],[]]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert !
process.out.reads }, + { assert snapshot(process.out.index).match("index_only_stub") }, + { assert snapshot(process.out.versions).match("versions_index_only_stub") } + ) + } + + } + test("sarscov2 single_end") { when { process { """ - input[0] = [ [ id:'test', single_end:true ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] - ] - input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[0] = [ [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true) ] + ] + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = Channel.of([[],[]]) """ } } @@ -32,9 +90,10 @@ nextflow_process { process.out.reads.collect { file(it[1]).getName() } + process.out.log.collect { file(it[1]).getName() } ).sort() - ).match("sarscov2 single_end-for_stub_match") + ).match("sarscov2 single_end_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.index).match("index_single_end") }, + { assert snapshot(process.out.versions).match("versions_single_end") } ) } @@ -47,10 +106,13 @@ nextflow_process { when { process { """ - input[0] = [ [ id:'test', single_end:true ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] - ] - input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[0] = [ [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true) ] + ] + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = Channel.of([[],[]]) """ } } @@ -67,7 +129,8 @@ 
nextflow_process { ).sort() ).match("sarscov2 single_end-for_stub_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.index).match("index_single_end_stub") }, + { assert snapshot(process.out.versions).match("versions_single_end_stub") } ) } @@ -78,11 +141,17 @@ nextflow_process { when { process { """ - input[0] = [ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] - ] - input[1] = [ file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) + ] + ] + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = Channel.of([[],[]]) """ } } @@ -99,9 +168,10 @@ nextflow_process { process.out.reads.collect { it[1].collect { item -> file(item).getName() } } + process.out.log.collect { file(it[1]).getName() } ).sort() - ).match("sarscov2 paired_end-for_stub_match") + ).match("sarscov2 paired_end_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.index).match("index_paired_end") }, + { assert snapshot(process.out.versions).match("versions_paired_end") } ) } @@ -114,11 +184,16 @@ nextflow_process { when { process { """ - input[0] = [ [ id:'test', single_end:false ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), - file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] - ] - input[1] = [ 
file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) ] + input[0] = [ [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) + ] + ] + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = Channel.of([[],[]]) """ } } @@ -135,10 +210,118 @@ nextflow_process { ).sort() ).match("sarscov2 paired_end-for_stub_match") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.index).match("index_paired_end_stub") }, + { assert snapshot(process.out.versions).match("versions_paired_end_stub") } ) } } + test("sarscov2 single_end premade_index") { + + config './premade_index.config' + + setup { + + run("SORTMERNA", alias: "SORTMERNA_INDEX") { + script "../main.nf" + process { + """ + input[0] = Channel.of([[],[]]) + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = Channel.of([[],[]]) + """ + } + } + } + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true) ] + ] + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = SORTMERNA_INDEX.out.index + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.reads }, + { assert file(process.out.log[0][1]).text.contains("Total reads passing E-value threshold = 100 (100.00)") }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + 
process.out.reads.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 single_end_premade_index_match") + }, + { assert snapshot(process.out.index).match("index_single_end_premade_index") }, + { assert snapshot(process.out.versions).match("versions_single_end_premade_index") } + ) + } + } + + test("sarscov2 single_end premade_index stub") { + + config './premade_index.config' + options '-stub' + + setup { + + run("SORTMERNA", alias: "SORTMERNA_INDEX") { + script "../main.nf" + process { + """ + input[0] = Channel.of([[],[]]) + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = Channel.of([[],[]]) + """ + } + } + } + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true) ] + ] + input[1] = [ [id:'test2'], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) ] + ] + input[2] = SORTMERNA_INDEX.out.index + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.reads }, + { + assert snapshot( + ( + [process.out.reads[0][0].toString()] + // meta + process.out.reads.collect { file(it[1]).getName() } + + process.out.log.collect { file(it[1]).getName() } + ).sort() + ).match("sarscov2 single_end_premade_index_match_stub") + }, + { assert snapshot(process.out.index).match("index_single_end_premade_index_stub") }, + { assert snapshot(process.out.versions).match("versions_single_end_premade_index_stub") } + ) + } + } } diff --git a/modules/nf-core/sortmerna/tests/main.nf.test.snap b/modules/nf-core/sortmerna/tests/main.nf.test.snap index e502000..86e8473 100644 --- a/modules/nf-core/sortmerna/tests/main.nf.test.snap +++ b/modules/nf-core/sortmerna/tests/main.nf.test.snap 
@@ -1,33 +1,352 @@ { - "sarscov2 single_end-for_stub_match": { + "versions_paired_end_stub": { + "content": [ + [ + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-07T12:27:11.223149" + }, + "index_paired_end_stub": { + "content": [ + [ + [ + { + "id": "test2" + }, + [ + + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T14:24:25.384097178" + }, + "versions_paired_end": { + "content": [ + [ + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-07T12:27:04.517155" + }, + "versions_single_end_stub": { + "content": [ + [ + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-01T12:10:35.228450189" + }, + "sarscov2 single_end_match": { "content": [ [ "test.non_rRNA.fastq.gz", "test.sortmerna.log", - "{id=test, single_end=true}" + "{id=test}" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-07T12:28:23.20327" + }, + "index_only_stub": { + "content": [ + [ + [ + { + "id": "test2" + }, + [ + + ] + ] ] ], - "timestamp": "2023-12-21T11:56:00.15356" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:00:47.128504164" }, - "versions": { + "index_single_end_premade_index": { + "content": [ + [ + [ + { + "id": "test2" + }, + [ + "2415186086593376314.bursttrie_0.dat:md5,74f7f020e8d46e24a8a2e9c5fbcd564a", + "2415186086593376314.kmer_0.dat:md5,4a0bcb71b120f6a6949b7969292ef2e7", + "2415186086593376314.pos_0.dat:md5,bc2875e4cc4017707306565e396839ef", + "2415186086593376314.stats:md5,67c9d4c768f28a450fc82a2b5d43db5c" + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:01:53.832643452" + }, + 
"versions_single_end_premade_index": { "content": [ [ "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" ] ], - "timestamp": "2023-12-21T11:56:00.200244" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:01:53.902154982" }, "sarscov2 paired_end-for_stub_match": { "content": [ [ + "{id=test}", [ "test_1.non_rRNA.fastq.gz", "test_2.non_rRNA.fastq.gz" ], + "test.sortmerna.log" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-07T12:28:56.063579" + }, + "index_paired_end": { + "content": [ + [ + [ + { + "id": "test2" + }, + [ + "2415186086593376314.bursttrie_0.dat:md5,74f7f020e8d46e24a8a2e9c5fbcd564a", + "2415186086593376314.kmer_0.dat:md5,4a0bcb71b120f6a6949b7969292ef2e7", + "2415186086593376314.pos_0.dat:md5,bc2875e4cc4017707306565e396839ef", + "2415186086593376314.stats:md5,67c9d4c768f28a450fc82a2b5d43db5c" + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T14:24:14.272659781" + }, + "sarscov2 single_end_premade_index_match_stub": { + "content": [ + [ + "test.non_rRNA.fastq.gz", + "test.sortmerna.log", + "{id=test}" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:05:01.913287272" + }, + "sarscov2 single_end-for_stub_match": { + "content": [ + [ + "test.non_rRNA.fastq.gz", + "test.sortmerna.log", + "{id=test}" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-07T12:28:29.197913" + }, + "sarscov2 paired_end_match": { + "content": [ + [ + "{id=test}", + [ + "test_1.non_rRNA.fastq.gz", + "test_2.non_rRNA.fastq.gz" + ], + "test.sortmerna.log" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-07T12:28:49.914992" + }, + "versions_single_end": { + "content": [ + [ + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, 
+ "timestamp": "2024-02-26T14:36:27.14244294" + }, + "versions_index_only": { + "content": [ + [ + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:00:35.609161481" + }, + "versions_single_end_premade_index_stub": { + "content": [ + [ + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:05:02.059858431" + }, + "index_single_end_stub": { + "content": [ + [ + [ + { + "id": "test2" + }, + [ + + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T14:23:58.068772508" + }, + "versions_index_only_stub": { + "content": [ + [ + "versions.yml:md5,7df9d50209f351e1f75e05a1fad6ba4b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:00:47.169402699" + }, + "index_single_end_premade_index_stub": { + "content": [ + [ + [ + { + "id": "test2" + }, + [ + + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:05:01.953316205" + }, + "index_single_end": { + "content": [ + [ + [ + { + "id": "test2" + }, + [ + "2415186086593376314.bursttrie_0.dat:md5,74f7f020e8d46e24a8a2e9c5fbcd564a", + "2415186086593376314.kmer_0.dat:md5,4a0bcb71b120f6a6949b7969292ef2e7", + "2415186086593376314.pos_0.dat:md5,bc2875e4cc4017707306565e396839ef", + "2415186086593376314.stats:md5,67c9d4c768f28a450fc82a2b5d43db5c" + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T14:36:26.88061978" + }, + "index_index_only": { + "content": [ + [ + [ + { + "id": "test2" + }, + [ + "2415186086593376314.bursttrie_0.dat:md5,74f7f020e8d46e24a8a2e9c5fbcd564a", + "2415186086593376314.kmer_0.dat:md5,4a0bcb71b120f6a6949b7969292ef2e7", + "2415186086593376314.pos_0.dat:md5,bc2875e4cc4017707306565e396839ef", + 
"2415186086593376314.stats:md5,67c9d4c768f28a450fc82a2b5d43db5c" + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:00:35.34089252" + }, + "sarscov2 single_end_premade_index_match": { + "content": [ + [ + "test.non_rRNA.fastq.gz", "test.sortmerna.log", - "{id=test, single_end=false}" + "{id=test}" ] ], - "timestamp": "2023-12-21T12:00:47.879193" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-26T15:01:53.797737296" } } \ No newline at end of file diff --git a/modules/nf-core/sortmerna/tests/premade_index.config b/modules/nf-core/sortmerna/tests/premade_index.config new file mode 100644 index 0000000..ab86d2e --- /dev/null +++ b/modules/nf-core/sortmerna/tests/premade_index.config @@ -0,0 +1,8 @@ +process { + withName: 'SORTMERNA_INDEX' { + ext.args = '--index 1' + } + withName: 'SORTMERNA' { + ext.args = '--index 0' + } +} diff --git a/modules/nf-core/star/align/environment.yml b/modules/nf-core/star/align/environment.yml index 36fcd02..8bd58cf 100644 --- a/modules/nf-core/star/align/environment.yml +++ b/modules/nf-core/star/align/environment.yml @@ -6,4 +6,5 @@ channels: dependencies: - bioconda::star=2.7.10a - bioconda::samtools=1.18 + - bioconda::htslib=1.18 - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/align/tests/main.nf.test b/modules/nf-core/star/align/tests/main.nf.test index 4c87847..6ecd778 100644 --- a/modules/nf-core/star/align/tests/main.nf.test +++ b/modules/nf-core/star/align/tests/main.nf.test @@ -7,39 +7,40 @@ nextflow_process { tag "modules_nfcore" tag "star" tag "star/align" + tag "star/genomegenerate" - test("homo_sapiens - single_end") { - config "./nextflow.config" - - setup { - run("STAR_GENOMEGENERATE") { - script "../../../star/genomegenerate/main.nf" - process { - """ - input[0] = Channel.of([ - [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] - ]) - input[1] = 
Channel.of([ - [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] - ]) - """ - } + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ } } + } + + test("homo_sapiens - single_end") { + config "./nextflow.config" when { process { """ input[0] = Channel.of([ [ id:'test', single_end:true ], // meta map - [ file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true) ] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true) ] ]) input[1] = STAR_GENOMEGENERATE.out.index input[2] = Channel.of([ [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] ]) input[3] = false input[4] = 'illumina' @@ -74,38 +75,20 @@ nextflow_process { test("homo_sapiens - paired_end") { config "./nextflow.config" - setup { - run("STAR_GENOMEGENERATE") { - script "../../../star/genomegenerate/main.nf" - process { - """ - input[0] = Channel.of([ - [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] - ]) - input[1] = Channel.of([ - [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] - ]) - """ - } - } - } - when { process { """ input[0] = Channel.of([ [ id:'test', single_end:false ], // meta map [ - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], 
checkIfExists: true), - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) ] ]) input[1] = STAR_GENOMEGENERATE.out.index input[2] = Channel.of([ [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] ]) input[3] = false input[4] = 'illumina' @@ -140,38 +123,20 @@ nextflow_process { test("homo_sapiens - paired_end - arriba") { config "./nextflow.arriba.config" - setup { - run("STAR_GENOMEGENERATE") { - script "../../../star/genomegenerate/main.nf" - process { - """ - input[0] = Channel.of([ - [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] - ]) - input[1] = Channel.of([ - [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] - ]) - """ - } - } - } - when { process { """ input[0] = Channel.of([ [ id:'test', single_end:false ], // meta map [ - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) ] ]) input[1] = STAR_GENOMEGENERATE.out.index input[2] = Channel.of([ [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + [ 
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] ]) input[3] = false input[4] = 'illumina' @@ -206,38 +171,20 @@ nextflow_process { test("homo_sapiens - paired_end - starfusion") { config "./nextflow.starfusion.config" - setup { - run("STAR_GENOMEGENERATE") { - script "../../../star/genomegenerate/main.nf" - process { - """ - input[0] = Channel.of([ - [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] - ]) - input[1] = Channel.of([ - [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] - ]) - """ - } - } - } - when { process { """ input[0] = Channel.of([ [ id:'test', single_end:false ], // meta map [ - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) ] ]) input[1] = STAR_GENOMEGENERATE.out.index input[2] = Channel.of([ [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] ]) input[3] = false input[4] = 'illumina' @@ -272,40 +219,22 @@ nextflow_process { test("homo_sapiens - paired_end - multiple") { config "./nextflow.config" - setup { - run("STAR_GENOMEGENERATE") { - script "../../../star/genomegenerate/main.nf" - process { - """ - input[0] = Channel.of([ - [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] - ]) - input[1] = Channel.of([ - [ id:'test_gtf' ], - 
[file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] - ]) - """ - } - } - } - when { process { """ input[0] = Channel.of([ [ id:'test', single_end:false ], // meta map [ - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true), - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_1_fastq_gz'], checkIfExists: true), - file(params.test_data['homo_sapiens']['illumina']['test_rnaseq_2_fastq_gz'], checkIfExists: true) + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) ] ]) input[1] = STAR_GENOMEGENERATE.out.index input[2] = Channel.of([ [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] ]) input[3] = false input[4] = 'illumina' diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml index 93e4476..791f255 100644 --- a/modules/nf-core/star/genomegenerate/environment.yml +++ b/modules/nf-core/star/genomegenerate/environment.yml @@ -1,11 +1,10 @@ name: star_genomegenerate - channels: - conda-forge - bioconda - defaults - dependencies: - bioconda::samtools=1.18 + - bioconda::htslib=1.18 - bioconda::star=2.7.10a - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test 
b/modules/nf-core/star/genomegenerate/tests/main.nf.test index af0c942..c17c8ba 100644 --- a/modules/nf-core/star/genomegenerate/tests/main.nf.test +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test @@ -8,18 +8,18 @@ nextflow_process { tag "star" tag "star/genomegenerate" - test("homo_sapiens") { + test("fasta_gtf") { when { process { """ input[0] = Channel.of([ [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] ]) input[1] = Channel.of([ [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] ]) """ } @@ -28,14 +28,13 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_gtf_index") }, + { assert snapshot(process.out.versions).match("fasta_gtf_versions") } ) } - } - test("homo_sapiens-stub") { + test("fasta_gtf_stub") { options '-stub' @@ -44,11 +43,11 @@ nextflow_process { """ input[0] = Channel.of([ [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] ]) input[1] = Channel.of([ [ id:'test_gtf' ], - [file(params.test_data['homo_sapiens']['genome']['genome_gtf'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] ]) """ } @@ -57,21 +56,20 @@ 
nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_with_gtf") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_gtf_stub_index") }, + { assert snapshot(process.out.versions).match("fasta_gtf_stub_versions") } ) } - } - test("homo_sapiens-without_gtf") { + test("fasta") { when { process { """ input[0] = Channel.of([ [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] ]) input[1] = Channel.of([ [], [] ]) """ @@ -81,14 +79,14 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_index") }, + { assert snapshot(process.out.versions).match("fasta_versions") } ) } } - test("homo_sapiens-without_gtf-stub") { + test("fasta_stub") { options '-stub' @@ -97,7 +95,7 @@ nextflow_process { """ input[0] = Channel.of([ [ id:'test_fasta' ], - [file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true)] + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] ]) input[1] = Channel.of([ [], [] ]) """ @@ -107,11 +105,11 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("index_without_gtf") }, - { assert 
snapshot(process.out.versions).match("versions") } + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_stub_index") }, + { assert snapshot(process.out.versions).match("fasta_stub_versions") } ) } } -} \ No newline at end of file +} diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap index 9de08c7..5653d6e 100644 --- a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap @@ -1,22 +1,90 @@ { - "versions": { + "fasta_gtf_versions": { "content": [ [ "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" ] ], - "timestamp": "2023-12-19T11:05:51.741109" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:31.798555" }, - "index_with_gtf": { + "fasta_stub_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:55:07.521209" + }, + "fasta_gtf_stub_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:46.478098" + }, + "fasta_gtf_stub_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:46.491657" + }, + "fasta_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": 
"2024-02-01T15:54:57.552329" + }, + "fasta_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:57.560541" + }, + "fasta_gtf_index": { "content": [ "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" ], - "timestamp": "2023-12-19T11:38:14.551548" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:31.786814" }, - "index_without_gtf": { + "fasta_stub_index": { "content": [ "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" ], - "timestamp": "2023-12-19T11:38:22.382905" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:55:07.517472" } } \ No newline at end of file diff --git a/modules/nf-core/umitools/extract/environment.yml b/modules/nf-core/umitools/extract/environment.yml index 7d08ac0..aab452d 100644 --- a/modules/nf-core/umitools/extract/environment.yml +++ b/modules/nf-core/umitools/extract/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::umi_tools=1.1.4 + - bioconda::umi_tools=1.1.5 diff --git a/modules/nf-core/umitools/extract/main.nf b/modules/nf-core/umitools/extract/main.nf index 4bd79e7..8719e5f 100644 --- a/modules/nf-core/umitools/extract/main.nf +++ b/modules/nf-core/umitools/extract/main.nf @@ -5,8 +5,8 @@ process UMITOOLS_EXTRACT { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.4--py38hbff2b2d_1' : - 'biocontainers/umi_tools:1.1.4--py38hbff2b2d_1' }" + 'https://depot.galaxyproject.org/singularity/umi_tools:1.1.5--py39hf95cd2a_0' : + 'biocontainers/umi_tools:1.1.5--py39hf95cd2a_0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/umitools/extract/tests/main.nf.test b/modules/nf-core/umitools/extract/tests/main.nf.test index 22242d1..2a8eba1 100644 --- a/modules/nf-core/umitools/extract/tests/main.nf.test +++ b/modules/nf-core/umitools/extract/tests/main.nf.test @@ -12,24 +12,20 @@ nextflow_process { test("Should run without failures") { when { - params { - outdir = "$outputDir" - } process { """ input[0] = [ [ id:'test', single_end:true ], // meta map - [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] - ] + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true) ] + ] """ } } then { assertAll ( - { assert process.success }, - { assert snapshot(process.out.versions).match("versions") } + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") } ) } - } } \ No newline at end of file diff --git a/modules/nf-core/umitools/extract/tests/main.nf.test.snap b/modules/nf-core/umitools/extract/tests/main.nf.test.snap index 6d5944f..bf82701 100644 --- a/modules/nf-core/umitools/extract/tests/main.nf.test.snap +++ b/modules/nf-core/umitools/extract/tests/main.nf.test.snap @@ -2,9 +2,13 @@ "versions": { "content": [ [ - "versions.yml:md5,5a18da2d3a5a4de15e7aaae9082d7abb" + "versions.yml:md5,568d243174c081a0301e74ed42e59b48" ] ], - "timestamp": "2023-12-08T09:41:43.540658352" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-16T10:01:33.326046137" } } \ No newline at end of file diff --git a/modules/nf-core/umitools/extract/tests/nextflow.config b/modules/nf-core/umitools/extract/tests/nextflow.config index 
628f5fc..c866f5a 100644 --- a/modules/nf-core/umitools/extract/tests/nextflow.config +++ b/modules/nf-core/umitools/extract/tests/nextflow.config @@ -1,7 +1,7 @@ process { publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } - + withName: UMITOOLS_EXTRACT { ext.args = '--bc-pattern="NNNN"' } diff --git a/modules/pfr/agat/spfilterfeaturefromkilllist/environment.yml b/modules/pfr/agat/spfilterfeaturefromkilllist/environment.yml new file mode 100644 index 0000000..b0811b4 --- /dev/null +++ b/modules/pfr/agat/spfilterfeaturefromkilllist/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "agat_spfilterfeaturefromkilllist" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::agat=1.3.3" diff --git a/modules/pfr/agat/spfilterfeaturefromkilllist/main.nf b/modules/pfr/agat/spfilterfeaturefromkilllist/main.nf new file mode 100644 index 0000000..ffb91d1 --- /dev/null +++ b/modules/pfr/agat/spfilterfeaturefromkilllist/main.nf @@ -0,0 +1,53 @@ +process AGAT_SPFILTERFEATUREFROMKILLLIST { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/agat:1.3.3--pl5321hdfd78af_0': + 'biocontainers/agat:1.3.3--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gff) + path kill_list + path config + + output: + tuple val(meta), path("*.gff"), emit: gff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def config_param = config ? "--config $config" : '' + if( "$gff" == "${prefix}.gff" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + agat_sp_filter_feature_from_kill_list.pl \\ + --gff $gff \\ + --kill_list $kill_list \\ + $config_param \\ + $args \\ + --output "${prefix}.gff" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_filter_feature_from_kill_list.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if( "$gff" == "${prefix}.gff" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch "${prefix}.gff" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_filter_feature_from_kill_list.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/pfr/agat/spfilterfeaturefromkilllist/meta.yml b/modules/pfr/agat/spfilterfeaturefromkilllist/meta.yml new file mode 100644 index 0000000..d408fe7 --- /dev/null +++ b/modules/pfr/agat/spfilterfeaturefromkilllist/meta.yml @@ -0,0 +1,60 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "agat_spfilterfeaturefromkilllist" +description: | + The script aims to remove features based on a kill list. The default behaviour is to look at the features's ID. + If the feature has an ID (case insensitive) listed among the kill list it will be removed. /!\ Removing a level1 + or level2 feature will automatically remove all linked subfeatures, and removing all children of a feature will + automatically remove this feature too. +keywords: + - genomics + - gff + - remove + - feature +tools: + - "agat": + description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene annotations in any GTF/GFF format." 
+ homepage: "https://agat.readthedocs.io/en/latest/" + documentation: "https://agat.readthedocs.io/en/latest/tools/agat_sp_filter_feature_from_kill_list.html" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - gff: + type: file + description: Input GFF3 file that will be read + pattern: "*.{gff,gff3}" + - kill_list: + type: file + description: Kill list. One value per line. + pattern: "*.txt" + - config: + type: file + description: | + Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, otherwise it takes the orignal agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). + pattern: "*.yaml" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - gff: + type: file + description: Output GFF file. 
+ pattern: "*.gff" + +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/agat/spfilterfeaturefromkilllist/tests/main.nf.test b/modules/pfr/agat/spfilterfeaturefromkilllist/tests/main.nf.test new file mode 100644 index 0000000..891e0a7 --- /dev/null +++ b/modules/pfr/agat/spfilterfeaturefromkilllist/tests/main.nf.test @@ -0,0 +1,74 @@ +nextflow_process { + + name "Test Process AGAT_SPFILTERFEATUREFROMKILLLIST" + script "../main.nf" + process "AGAT_SPFILTERFEATUREFROMKILLLIST" + + tag "modules" + tag "modules_nfcore" + tag "agat" + tag "agat/spfilterfeaturefromkilllist" + + test("sarscov2-genome_gff3") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + + def kill_list = "unknown_transcript_1" + def kill_list_file = new File('kill.list.txt') + kill_list_file.text = kill_list + + input[1] = kill_list_file.toPath() + + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2-genome_gff3-stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + + def kill_list = "unknown_transcript_1" + def kill_list_file = new File('kill.list.txt') + kill_list_file.text = kill_list + + input[1] = kill_list_file.toPath() + + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/pfr/agat/spfilterfeaturefromkilllist/tests/main.nf.test.snap b/modules/pfr/agat/spfilterfeaturefromkilllist/tests/main.nf.test.snap new file mode 100644 index 0000000..bbc8cea --- /dev/null +++ b/modules/pfr/agat/spfilterfeaturefromkilllist/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + 
"sarscov2-genome_gff3-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,ace24108f514da465e068372b18d4651" + ], + "gff": [ + [ + { + "id": "test" + }, + "test.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,ace24108f514da465e068372b18d4651" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-02T13:29:56.638311" + }, + "sarscov2-genome_gff3": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gff:md5,df19e1b84ba6f691d20c72b397c88abf" + ] + ], + "1": [ + "versions.yml:md5,ace24108f514da465e068372b18d4651" + ], + "gff": [ + [ + { + "id": "test" + }, + "test.gff:md5,df19e1b84ba6f691d20c72b397c88abf" + ] + ], + "versions": [ + "versions.yml:md5,ace24108f514da465e068372b18d4651" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-02T13:11:42.236263" + } +} \ No newline at end of file diff --git a/modules/pfr/agat/spfilterfeaturefromkilllist/tests/tags.yml b/modules/pfr/agat/spfilterfeaturefromkilllist/tests/tags.yml new file mode 100644 index 0000000..2ab17b0 --- /dev/null +++ b/modules/pfr/agat/spfilterfeaturefromkilllist/tests/tags.yml @@ -0,0 +1,2 @@ +agat/spfilterfeaturefromkilllist: + - "modules/pfr/agat/spfilterfeaturefromkilllist/**" diff --git a/modules/pfr/agat/spmergeannotations/environment.yml b/modules/pfr/agat/spmergeannotations/environment.yml new file mode 100644 index 0000000..6df7aea --- /dev/null +++ b/modules/pfr/agat/spmergeannotations/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "agat_spmergeannotations" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::agat=1.3.3" diff --git a/modules/pfr/agat/spmergeannotations/main.nf 
b/modules/pfr/agat/spmergeannotations/main.nf new file mode 100644 index 0000000..7738ac5 --- /dev/null +++ b/modules/pfr/agat/spmergeannotations/main.nf @@ -0,0 +1,53 @@ +process AGAT_SPMERGEANNOTATIONS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/agat:1.3.3--pl5321hdfd78af_0': + 'biocontainers/agat:1.3.3--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gffs) + path config + + output: + tuple val(meta), path("*.gff") , emit: gff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def config_param = config ? "--config $config" : '' + def gff_param = "$gffs".split(' ').collect { "--gff $it" }.join(' ') + def file_names = gffs.collect { "$it" } + if ( file_names.contains ( "${prefix}.gff" ) ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + agat_sp_merge_annotations.pl \\ + $gff_param \\ + $config_param \\ + $args \\ + --output ${prefix}.gff + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_merge_annotations.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def file_names = gffs.collect { "$it" } + if ( file_names.contains ( "${prefix}.gff" ) ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + touch ${prefix}.gff + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_merge_annotations.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/pfr/agat/spmergeannotations/meta.yml b/modules/pfr/agat/spmergeannotations/meta.yml new file mode 100644 index 0000000..afa9ddd --- /dev/null +++ b/modules/pfr/agat/spmergeannotations/meta.yml @@ -0,0 +1,54 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "agat_spmergeannotations" +description: | + This script merges different gff annotation files into one. It uses the AGAT parser that takes care of duplicated names and fixes other oddities met in those files. +keywords: + - genomics + - gff + - merge + - combine +tools: + - "agat": + description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene annotations in any GTF/GFF format." + homepage: "https://agat.readthedocs.io/en/latest/" + documentation: "https://agat.readthedocs.io/en/latest/tools/agat_sp_merge_annotations.html" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - gffs: + type: list + description: A list of GFFs to merge + pattern: "[ *.{gff,gff3} ]" + - config: + type: file + description: | + Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, + otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml + locally type: "agat config --expose". The --config option gives you the possibility to use your + own AGAT config file (located elsewhere or named differently). + pattern: "*.yaml" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
`[ id:'sample1' ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - gff: + type: file + description: Output GFF file. + pattern: "*.gff" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/agat/spmergeannotations/tests/main.nf.test b/modules/pfr/agat/spmergeannotations/tests/main.nf.test new file mode 100644 index 0000000..5e25599 --- /dev/null +++ b/modules/pfr/agat/spmergeannotations/tests/main.nf.test @@ -0,0 +1,71 @@ +nextflow_process { + + name "Test Process AGAT_SPMERGEANNOTATIONS" + script "../main.nf" + process "AGAT_SPMERGEANNOTATIONS" + + tag "modules" + tag "modules_nfcore" + tag "agat" + tag "agat/spmergeannotations" + + test("candidatus_portiera_aleyrodidarum-multi_gffs") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['test1_gff'], checkIfExists: true), + file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['test2_gff'], checkIfExists: true), + file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['test3_gff'], checkIfExists: true), + ] + ] + + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.gff[0][1]).text.contains('AGAT gene') }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("candidatus_portiera_aleyrodidarum-multi_gffs-stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['test1_gff'], checkIfExists: true), + file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['test2_gff'], checkIfExists: true), + file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['test3_gff'], checkIfExists: true), + ] + ] + + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { 
assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/pfr/agat/spmergeannotations/tests/main.nf.test.snap b/modules/pfr/agat/spmergeannotations/tests/main.nf.test.snap new file mode 100644 index 0000000..c7e2154 --- /dev/null +++ b/modules/pfr/agat/spmergeannotations/tests/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,0ae449590befbaac9269ad8a7a84b66d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-02T17:08:15.459625" + }, + "candidatus_portiera_aleyrodidarum-multi_gffs-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,0ae449590befbaac9269ad8a7a84b66d" + ], + "gff": [ + [ + { + "id": "test" + }, + "test.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,0ae449590befbaac9269ad8a7a84b66d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-02T17:08:20.581403" + } +} \ No newline at end of file diff --git a/modules/pfr/agat/spmergeannotations/tests/tags.yml b/modules/pfr/agat/spmergeannotations/tests/tags.yml new file mode 100644 index 0000000..7d9b839 --- /dev/null +++ b/modules/pfr/agat/spmergeannotations/tests/tags.yml @@ -0,0 +1,2 @@ +agat/spmergeannotations: + - "modules/pfr/agat/spmergeannotations/**" diff --git a/modules/pfr/custom/restoregffids/tests/main.nf.test b/modules/pfr/custom/restoregffids/tests/main.nf.test index 521b924..cc374b7 100644 --- a/modules/pfr/custom/restoregffids/tests/main.nf.test +++ b/modules/pfr/custom/restoregffids/tests/main.nf.test @@ -60,4 +60,4 @@ nextflow_process { } -} +} \ No newline at end of file diff --git a/modules/pfr/custom/shortenfastaids/tests/main.nf.test b/modules/pfr/custom/shortenfastaids/tests/main.nf.test index dc46bae..efff639 100644 --- 
a/modules/pfr/custom/shortenfastaids/tests/main.nf.test +++ b/modules/pfr/custom/shortenfastaids/tests/main.nf.test @@ -128,4 +128,4 @@ nextflow_process { } -} +} \ No newline at end of file diff --git a/modules/pfr/edta/edta/tests/main.nf.test b/modules/pfr/edta/edta/tests/main.nf.test index 3aed0a2..7601876 100644 --- a/modules/pfr/edta/edta/tests/main.nf.test +++ b/modules/pfr/edta/edta/tests/main.nf.test @@ -74,4 +74,4 @@ nextflow_process { } -} +} \ No newline at end of file diff --git a/modules/pfr/lai/tests/main.nf.test b/modules/pfr/lai/tests/main.nf.test deleted file mode 100644 index 353043c..0000000 --- a/modules/pfr/lai/tests/main.nf.test +++ /dev/null @@ -1,120 +0,0 @@ -nextflow_process { - - name "Test Process LAI" - script "../main.nf" - process "LAI" - config "./nextflow.config" - - tag "modules" - tag "modules_nfcore" - tag "lai" - tag "gt/suffixerator" - tag "nf-core/gunzip" - tag "gt/ltrharvest" - tag "ltrretriever" - - test("homo_sapiens-genome_21_fasta-success") { - - setup { - run("GUNZIP") { - script "../../../nf-core/gunzip" - - process { - """ - input[0] = [ - [ id:'test' ], - file('/Users/hrauxr/Projects/nxf-modules/tests/data/chr1.fa.gz', checkIfExists: true) - ] - """ - } - } - - run("GT_SUFFIXERATOR") { - script "../../../pfr/gt/suffixerator" - - process { - """ - input[0] = GUNZIP.out.gunzip - """ - } - } - - run("GT_LTRHARVEST") { - script "../../../pfr/gt/ltrharvest" - - process { - """ - input[0] = GT_SUFFIXERATOR.out.index - """ - } - } - - run("LTRRETRIEVER") { - script "../../../pfr/ltrretriever" - - process { - """ - input[0] = GUNZIP.out.gunzip - input[1] = GT_LTRHARVEST.out.tabout.map { meta, tabout -> tabout } - input[2] = [] - input[3] = [] - input[4] = [] - """ - } - } - } - - when { - process { - """ - input[0] = GUNZIP.out.gunzip - input[1] = LTRRETRIEVER.out.pass_list.map { meta, pass_list -> pass_list } - input[2] = LTRRETRIEVER.out.annotation_out.map { meta, annotation_out -> annotation_out } - input[3] = [] - """ - } 
- } - - then { - assertAll( - { assert process.success }, - { assert path(process.out.log.get(0).get(1)).getText().contains("Dependency checking: Passed!") }, - { assert path(process.out.log.get(0).get(1)).getText().contains("Calculate LAI:") }, - { assert path(process.out.log.get(0).get(1)).getText().contains("Total LTR sequence content (0%) is too low for accurate LAI calculation") }, - { assert path(process.out.log.get(0).get(1)).getText().contains("Sorry, LAI is not applicable on the current genome assembly.") }, - { assert process.out.lai_out == [] }, - { assert snapshot(process.out.versions).match("versions") } - ) - } - - } - - test("stub") { - - options '-stub' - - when { - process { - """ - input[0] = [ - [ id:'test' ], - file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) - ] - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() }, - { assert snapshot(process.out.versions).match("versions") } - ) - } - - } - -} diff --git a/modules/pfr/lai/tests/main.nf.test.snap b/modules/pfr/lai/tests/main.nf.test.snap deleted file mode 100644 index 751ddb6..0000000 --- a/modules/pfr/lai/tests/main.nf.test.snap +++ /dev/null @@ -1,10 +0,0 @@ -{ - "versions": { - "content": [ - [ - "versions.yml:md5,2ac93e1e6324236af6f9a794bbac2099" - ] - ], - "timestamp": "2023-12-05T12:15:32.969684" - } -} \ No newline at end of file diff --git a/modules/pfr/lai/tests/nextflow.config b/modules/pfr/lai/tests/nextflow.config deleted file mode 100644 index 516a3e2..0000000 --- a/modules/pfr/lai/tests/nextflow.config +++ /dev/null @@ -1,10 +0,0 @@ -process { - - withName: GT_SUFFIXERATOR { - ext.args = '-tis -suf -lcp -des -ssp -sds -dna' - } - - withName: GT_LTRHARVEST { - ext.args = '-minlenltr 100 -maxlenltr 7000 -mintsd 4 -maxtsd 6 -motif TGCA -motifmis 1 -similar 85 -vic 10 -seed 20 -seqids yes' - } -} diff --git a/modules/pfr/lai/tests/tags.yml 
b/modules/pfr/lai/tests/tags.yml deleted file mode 100644 index 252295d..0000000 --- a/modules/pfr/lai/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -lai: - - "modules/pfr/lai/**" diff --git a/modules/pfr/liftoff/tests/main.nf.test.snap b/modules/pfr/liftoff/tests/main.nf.test.snap deleted file mode 100644 index baa4d70..0000000 --- a/modules/pfr/liftoff/tests/main.nf.test.snap +++ /dev/null @@ -1,34 +0,0 @@ -{ - "unmapped_txt": { - "content": [ - [ - [ - { - "id": "test" - }, - "test.unmapped.txt:md5,7391d10df6e15db356b084c9af5259e4" - ] - ] - ], - "timestamp": "2023-12-01T13:57:40.748507" - }, - "versions": { - "content": [ - [ - "versions.yml:md5,205d9c609e7fe27d8199550d842bdce8" - ] - ], - "timestamp": "2023-12-01T13:57:40.752414" - }, - "homo_sapiens-genome_21_fasta-genome_1_fasta-genome_1_gtf-for_stub_match": { - "content": [ - [ - "test.gff3", - "test.polished.gff3", - "test.unmapped.txt", - "{id=test}" - ] - ], - "timestamp": "2023-12-21T15:20:04.816416" - } -} \ No newline at end of file diff --git a/modules/pfr/liftoff/tests/nextflow.config b/modules/pfr/liftoff/tests/nextflow.config deleted file mode 100644 index 06b9d76..0000000 --- a/modules/pfr/liftoff/tests/nextflow.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - withName: LIFTOFF { - ext.args = '-exclude_partial -copies -polish -a 0.1 -s 0.1' - } -} diff --git a/modules/pfr/liftoff/tests/tags.yml b/modules/pfr/liftoff/tests/tags.yml deleted file mode 100644 index 4ae1fb0..0000000 --- a/modules/pfr/liftoff/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -liftoff: - - "modules/pfr/liftoff/**" diff --git a/modules/pfr/ltrretriever/lai/environment.yml b/modules/pfr/ltrretriever/lai/environment.yml new file mode 100644 index 0000000..e0e4968 --- /dev/null +++ b/modules/pfr/ltrretriever/lai/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "ltrretriever_lai" +channels: + - conda-forge + - 
bioconda + - defaults +dependencies: + - "bioconda::LTR_retriever=2.9.9" diff --git a/modules/pfr/lai/main.nf b/modules/pfr/ltrretriever/lai/main.nf similarity index 54% rename from modules/pfr/lai/main.nf rename to modules/pfr/ltrretriever/lai/main.nf index d4fced9..464b215 100644 --- a/modules/pfr/lai/main.nf +++ b/modules/pfr/ltrretriever/lai/main.nf @@ -1,11 +1,11 @@ -process LAI { +process LTRRETRIEVER_LAI { tag "$meta.id" - label 'process_high' + label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ltr_retriever:2.9.0--hdfd78af_2': - 'biocontainers/ltr_retriever:2.9.0--hdfd78af_2' }" + 'https://depot.galaxyproject.org/singularity/ltr_retriever:2.9.9--hdfd78af_0': + 'biocontainers/ltr_retriever:2.9.9--hdfd78af_0' }" input: tuple val(meta), path(fasta) @@ -26,44 +26,46 @@ process LAI { def prefix = task.ext.prefix ?: "${meta.id}" def monoploid_param = monoploid_seqs ? "-mono $monoploid_seqs" : '' def lai_output_name = monoploid_seqs ? "${annotation_out}.${monoploid_seqs}.out.LAI" : "${annotation_out}.LAI" + def VERSION = 'beta3.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ - # Remove comments from genome fasta, - # otherwise LAI triggers its sequence name change logic - - sed \\ - '/^>/ s/\\s.*\$//' \\ - $fasta \\ - > for_lai_no_comments.fsa - LAI \\ - -genome for_lai_no_comments.fsa \\ + -genome $fasta \\ -intact $pass_list \\ -all $annotation_out \\ -t $task.cpus \\ $monoploid_param \\ $args \\ - > "${prefix}.LAI.log" + > >(tee "${prefix}.LAI.log") \\ + || echo "LAI failed! See ${prefix}.LAI.log" mv \\ $lai_output_name \\ "${prefix}.LAI.out" \\ - || echo "LAI did not produce the output file" + || echo "LAI failed to estimate assembly index. 
See ${prefix}.LAI.log" cat <<-END_VERSIONS > versions.yml "${task.process}": - lai: \$(cat /usr/local/share/LTR_retriever/LAI | grep "my \\\$version" | sed 's/my \$version="//; s/";//') + lai: $VERSION END_VERSIONS """ stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def monoploid_param = monoploid_seqs ? "-mono $monoploid_seqs" : '' + def lai_output_name = monoploid_seqs ? "${annotation_out}.${monoploid_seqs}.out.LAI" : "${annotation_out}.LAI" + def VERSION = 'beta3.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ - touch ${prefix}.LAI.log + touch "${prefix}.LAI.log" + touch "$lai_output_name" + + mv \\ + $lai_output_name \\ + "${prefix}.LAI.out" cat <<-END_VERSIONS > versions.yml "${task.process}": - lai: \$(cat /usr/local/share/LTR_retriever/LAI | grep "my \\\$version" | sed 's/my \$version="//; s/";//') + lai: $VERSION END_VERSIONS """ } diff --git a/modules/pfr/lai/meta.yml b/modules/pfr/ltrretriever/lai/meta.yml similarity index 91% rename from modules/pfr/lai/meta.yml rename to modules/pfr/ltrretriever/lai/meta.yml index 6fd7aef..f84cf6c 100644 --- a/modules/pfr/lai/meta.yml +++ b/modules/pfr/ltrretriever/lai/meta.yml @@ -1,7 +1,9 @@ --- # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json -name: "lai" -description: Estimates the mean LTR sequence identity in the genome +name: "ltrretriever_lai" +description: | + Estimates the mean LTR sequence identity in the genome. 
The input genome fasta should + have short alphanumeric IDs without comments keywords: - genomics - annotation diff --git a/modules/pfr/ltrretriever/lai/tests/main.nf.test b/modules/pfr/ltrretriever/lai/tests/main.nf.test new file mode 100644 index 0000000..a617811 --- /dev/null +++ b/modules/pfr/ltrretriever/lai/tests/main.nf.test @@ -0,0 +1,166 @@ +nextflow_process { + + name "Test Process LTRRETRIEVER_LAI" + script "../main.nf" + process "LTRRETRIEVER_LAI" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "gunzip" + tag "ltrretriever" + tag "ltrretriever/ltrretriever" + tag "ltrretriever/lai" + tag "ltrharvest" + tag "ltrfinder" + tag "cat/cat" + + test("actinidia_chinensis-genome_21_fasta_gz-success") { + + setup { + + run("GUNZIP") { + script "../../../gunzip" + + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['actinidia_chinensis']['genome']['genome_21_fasta_gz'], checkIfExists: true) + ] + """ + } + } + + run("LTRHARVEST") { + script "../../../ltrharvest" + + process { + """ + input[0] = GUNZIP.out.gunzip + """ + } + } + + run("LTRFINDER") { + script "../../../ltrfinder" + + process { + """ + input[0] = GUNZIP.out.gunzip + """ + } + } + + run("CAT_CAT") { + script "../../../cat/cat" + + process { + """ + input[0] = LTRHARVEST.out.scn.mix(LTRFINDER.out.scn).groupTuple() + """ + } + } + + run("LTRRETRIEVER_LTRRETRIEVER") { + script "../../ltrretriever" + + process { + """ + input[0] = GUNZIP.out.gunzip + input[1] = CAT_CAT.out.file_out.map { meta, tabout -> tabout } + input[2] = [] + input[3] = [] + input[4] = [] + """ + } + } + } + + when { + process { + """ + input[0] = GUNZIP.out.gunzip + input[1] = LTRRETRIEVER_LTRRETRIEVER.out.pass_list.map { meta, pass_list -> pass_list } + input[2] = LTRRETRIEVER_LTRRETRIEVER.out.annotation_out.map { meta, annotation_out -> annotation_out } + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
path(process.out.log[0][1]).text.contains("Dependency checking: Passed!") }, + { assert path(process.out.log[0][1]).text.contains("Calculate LAI:") }, + { assert path(process.out.log[0][1]).text.contains("Done!") }, + { assert path(process.out.log[0][1]).text.contains("Result file:") }, + { assert Math.abs(Float.parseFloat(path(process.out.lai_out[0][1]).text.split("\n")[1].split("\t")[6]) - 31.29) <= 1.0 } + ) + } + + } + + test("stub") { + + options '-stub' + + when { + process { + """ + def pass_list = new File('test.pass.list') + def out_file = new File('test.out') + def monoploid_seqs = new File('some_seqs.list.txt') + + input[0] = [ + [ id:'test' ], + file(params.test_data['actinidia_chinensis']['genome']['genome_21_fasta_gz'], checkIfExists: true) + ] + input[1] = pass_list.toPath() + input[2] = out_file.toPath() + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("stub_with_monoploid_seqs") { + + options '-stub' + + when { + process { + """ + def pass_list = new File('test.pass.list') + def out_file = new File('test.out') + def monoploid_seqs = new File('some_seqs.list.txt') + + input[0] = [ + [ id:'test' ], + file(params.test_data['actinidia_chinensis']['genome']['genome_21_fasta_gz'], checkIfExists: true) + ] + input[1] = pass_list.toPath() + input[2] = out_file.toPath() + input[3] = monoploid_seqs.toPath() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/pfr/ltrretriever/lai/tests/main.nf.test.snap b/modules/pfr/ltrretriever/lai/tests/main.nf.test.snap new file mode 100644 index 0000000..e1c8086 --- /dev/null +++ b/modules/pfr/ltrretriever/lai/tests/main.nf.test.snap @@ -0,0 +1,100 @@ +{ + "stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.LAI.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + 
{ + "id": "test" + }, + "test.LAI.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e04e27f9408e771795cd44d96518b7cd" + ], + "lai_out": [ + [ + { + "id": "test" + }, + "test.LAI.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.LAI.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e04e27f9408e771795cd44d96518b7cd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-22T20:09:00.558021" + }, + "stub_with_monoploid_seqs": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.LAI.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.LAI.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e04e27f9408e771795cd44d96518b7cd" + ], + "lai_out": [ + [ + { + "id": "test" + }, + "test.LAI.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.LAI.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e04e27f9408e771795cd44d96518b7cd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-22T20:10:08.213842" + } +} \ No newline at end of file diff --git a/modules/pfr/ltrretriever/lai/tests/nextflow.config b/modules/pfr/ltrretriever/lai/tests/nextflow.config new file mode 100644 index 0000000..75edf1a --- /dev/null +++ b/modules/pfr/ltrretriever/lai/tests/nextflow.config @@ -0,0 +1,15 @@ +process { + + withName: LTRHARVEST { + ext.prefix = { "${meta.id}_ltrharvest" } + } + + withName: LTRFINDER { + ext.args = '-harvest_out -size 1000000 -time 300' + // recommended parameters: https://github.com/oushujun/LTR_retriever#usage + } + + withName: CAT_CAT { + ext.prefix = { "${meta.id}_ltrharvest_ltrfinder.tabout" } + } +} diff --git a/modules/pfr/ltrretriever/lai/tests/tags.yml b/modules/pfr/ltrretriever/lai/tests/tags.yml new 
file mode 100644 index 0000000..470f468 --- /dev/null +++ b/modules/pfr/ltrretriever/lai/tests/tags.yml @@ -0,0 +1,2 @@ +ltrretriever/lai: + - "modules/nf-core/ltrretriever/lai/**" diff --git a/modules/pfr/repeatmodeler/builddatabase/main.nf b/modules/pfr/repeatmodeler/builddatabase/main.nf index 486e25d..6fe244b 100644 --- a/modules/pfr/repeatmodeler/builddatabase/main.nf +++ b/modules/pfr/repeatmodeler/builddatabase/main.nf @@ -26,7 +26,7 @@ process REPEATMODELER_BUILDDATABASE { cat <<-END_VERSIONS > versions.yml "${task.process}": - repeatmodeler: \$(RepeatModeler | grep '/usr/local/bin/RepeatModeler - ' | sed 's|/usr/local/bin/RepeatModeler - ||') + repeatmodeler: \$(RepeatModeler --version | sed 's/RepeatModeler version //') END_VERSIONS """ @@ -44,7 +44,7 @@ process REPEATMODELER_BUILDDATABASE { cat <<-END_VERSIONS > versions.yml "${task.process}": - repeatmodeler: \$(RepeatModeler | grep '/usr/local/bin/RepeatModeler - ' | sed 's|/usr/local/bin/RepeatModeler - ||') + repeatmodeler: \$(RepeatModeler --version | sed 's/RepeatModeler version //') END_VERSIONS """ } diff --git a/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test b/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test index 616f88c..fdeda4a 100644 --- a/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test +++ b/modules/pfr/repeatmodeler/builddatabase/tests/main.nf.test @@ -57,4 +57,4 @@ nextflow_process { } -} +} \ No newline at end of file diff --git a/modules/pfr/repeatmodeler/repeatmodeler/main.nf b/modules/pfr/repeatmodeler/repeatmodeler/main.nf index 34df322..c7df9ca 100644 --- a/modules/pfr/repeatmodeler/repeatmodeler/main.nf +++ b/modules/pfr/repeatmodeler/repeatmodeler/main.nf @@ -35,7 +35,7 @@ process REPEATMODELER_REPEATMODELER { cat <<-END_VERSIONS > versions.yml "${task.process}": - repeatmodeler: \$(RepeatModeler | grep '/usr/local/bin/RepeatModeler - ' | sed 's|/usr/local/bin/RepeatModeler - ||') + repeatmodeler: \$(RepeatModeler --version | sed 's/RepeatModeler 
version //') END_VERSIONS """ @@ -48,7 +48,7 @@ process REPEATMODELER_REPEATMODELER { cat <<-END_VERSIONS > versions.yml "${task.process}": - repeatmodeler: \$(RepeatModeler | grep '/usr/local/bin/RepeatModeler - ' | sed 's|/usr/local/bin/RepeatModeler - ||') + repeatmodeler: \$(RepeatModeler --version | sed 's/RepeatModeler version //') END_VERSIONS """ } diff --git a/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test b/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test index 78b7957..dd7185f 100644 --- a/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test +++ b/modules/pfr/repeatmodeler/repeatmodeler/tests/main.nf.test @@ -89,4 +89,4 @@ nextflow_process { } -} +} \ No newline at end of file diff --git a/modules/pfr/lai/environment.yml b/modules/pfr/tsebra/environment.yml similarity index 79% rename from modules/pfr/lai/environment.yml rename to modules/pfr/tsebra/environment.yml index 94fadbd..3505512 100644 --- a/modules/pfr/lai/environment.yml +++ b/modules/pfr/tsebra/environment.yml @@ -1,9 +1,9 @@ --- # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -name: "lai" +name: "tsebra" channels: - conda-forge - bioconda - defaults dependencies: - - "bioconda::LTR_retriever=2.9.0" + - "bioconda::tsebra=1.1.2.4" diff --git a/modules/pfr/tsebra/main.nf b/modules/pfr/tsebra/main.nf new file mode 100644 index 0000000..c92ade7 --- /dev/null +++ b/modules/pfr/tsebra/main.nf @@ -0,0 +1,61 @@ +process TSEBRA { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/tsebra:1.1.2.4--pyhca03a8a_0': + 'biocontainers/tsebra:1.1.2.4--pyhca03a8a_0' }" + + input: + tuple val(meta), path(gtfs) + path hints_files + path keep_gtfs + path config + + output: + tuple val(meta), path("*.gtf"), emit: tsebra_gtf + tuple val(meta), path("*.tsv"), emit: tsebra_scores + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def gtf_arg = '-g ' + gtfs.collect { "$it" }.join(',') + def hints_arg = '-e ' + hints_files.collect { "$it" }.join(',') + def keep_arg = keep_gtfs ? ( '-k ' + keep_gtfs.collect { "$it" }.join(',') ) : '' + def config_arg = config ? "-c $config" : '' + def VERSION = '1.1.2.4' + """ + tsebra.py \\ + $gtf_arg \\ + $hints_arg \\ + $keep_arg \\ + $config_arg \\ + $args \\ + -o ${prefix}.gtf \\ + -s ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tsebra: $VERSION + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.1.2.4' + """ + touch ${prefix}.gtf + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tsebra: $VERSION + END_VERSIONS + """ +} diff --git a/modules/pfr/tsebra/meta.yml b/modules/pfr/tsebra/meta.yml new file mode 100644 index 0000000..18660d4 --- /dev/null +++ b/modules/pfr/tsebra/meta.yml @@ -0,0 +1,66 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "tsebra" +description: Transcript Selector for BRAKER TSEBRA combines gene predictions by selecting transcripts based on their extrinsic evidence support +keywords: + - genomics + - transcript + - selector + - gene + - prediction + - evidence +tools: + - "tsebra": + description: TSEBRA is a combiner tool that selects transcripts from gene predictions based on the support by extrinsic
evidence in form of introns and start/stop codons + homepage: "https://github.com/Gaius-Augustus/TSEBRA" + documentation: "https://github.com/Gaius-Augustus/TSEBRA" + tool_dev_url: "https://github.com/Gaius-Augustus/TSEBRA" + doi: "10.1186/s12859-021-04482-0" + licence: ["Artistic-2.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - gtfs: + type: list + description: List of gene prediction files in gtf + pattern: "*.gtf" + - hints_files: + type: list + description: List of files containing extrinsic evidence in gff + pattern: "*.gff" + - keep_gtfs: + type: list + description: | + List of gene prediction files in gtf. These gene sets are used the same way as other inputs, but TSEBRA ensures that all + transcripts from these gene sets are included in the output + pattern: "*.gtf" + - config: + type: file + description: Configuration file that sets the parameter for TSEBRA + pattern: "*.cfg" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - tsebra_gtf: + type: file + description: Output file for the combined gene predictions in gtf + pattern: "*.gtf" + - tsebra_scores: + type: file + description: Transcript scores as a table + pattern: "*.tsv" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/tsebra/tests/main.nf.test b/modules/pfr/tsebra/tests/main.nf.test new file mode 100644 index 0000000..ddf7a6c --- /dev/null +++ b/modules/pfr/tsebra/tests/main.nf.test @@ -0,0 +1,91 @@ +nextflow_process { + + name "Test Process TSEBRA" + script "../main.nf" + process "TSEBRA" + + tag "modules" + tag "modules_nfcore" + tag "tsebra" + tag "nf-core/gunzip/main" + + test("actinidia_chinensis-genome") { + + setup { + run('GUNZIP', alias: 'GUNZIP_GTF') { + script "../../../nf-core/gunzip/main" + + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['actinidia_chinensis']['genome']['genome_1_gtf_gz'], checkIfExists: true) + ] + """ + } + } + + run('GUNZIP', alias: 'GUNZIP_HINTS') { + script "../../../nf-core/gunzip/main" + + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['actinidia_chinensis']['genome']['genome_1_hints_gff_gz'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = GUNZIP_GTF.out.gunzip.map { meta, gtf -> [ meta, [ gtf ] ] } + input[1] = GUNZIP_HINTS.out.gunzip.map { meta, gff -> [ gff ] } + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("actinidia_chinensis-genome-stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file(params.test_data['actinidia_chinensis']['genome']['genome_1_gtf_gz'], checkIfExists: true) ] + ] + input[1] = [ + 
file(params.test_data['actinidia_chinensis']['genome']['genome_1_hints_gff_gz'], checkIfExists: true) + ] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/pfr/tsebra/tests/main.nf.test.snap b/modules/pfr/tsebra/tests/main.nf.test.snap new file mode 100644 index 0000000..4d9e15f --- /dev/null +++ b/modules/pfr/tsebra/tests/main.nf.test.snap @@ -0,0 +1,100 @@ +{ + "actinidia_chinensis-genome-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,6d60045f4f9b66baa508c174ae6a6408" + ], + "tsebra_gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tsebra_scores": [ + [ + { + "id": "test" + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,6d60045f4f9b66baa508c174ae6a6408" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-11T12:20:49.618044" + }, + "actinidia_chinensis-genome": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,7c781c919e6aa20561f72dea09474f74" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.tsv:md5,ad7bd858c1838b40286609a311f3a891" + ] + ], + "2": [ + "versions.yml:md5,6d60045f4f9b66baa508c174ae6a6408" + ], + "tsebra_gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,7c781c919e6aa20561f72dea09474f74" + ] + ], + "tsebra_scores": [ + [ + { + "id": "test" + }, + "test.tsv:md5,ad7bd858c1838b40286609a311f3a891" + ] + ], + "versions": [ + "versions.yml:md5,6d60045f4f9b66baa508c174ae6a6408" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-11T12:20:45.666076" + } +} \ No newline at end of file diff --git 
a/modules/pfr/tsebra/tests/tags.yml b/modules/pfr/tsebra/tests/tags.yml new file mode 100644 index 0000000..7594182 --- /dev/null +++ b/modules/pfr/tsebra/tests/tags.yml @@ -0,0 +1,2 @@ +tsebra: + - "modules/pfr/tsebra/**" diff --git a/nextflow.config b/nextflow.config index 409da80..a9ae34a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,87 +1,83 @@ includeConfig './conf/base.config' params { - target_assemblies = [ - ["red5_v2p1", "/workspace/pangene/test_data/red5_v2p1_chr1.fasta"], - ["donghong", "/workspace/pangene/test_data/donghong.chr1.fsa.gz"] - ] - // Pattern: [ [tag, fasta(.gz) ] ] - // Permissible tags: tag, tag_1, tag_tag2_3, tag_tag2_tag3; - // Any name with alphanumeric characters including "_". - // "." is not allowed in the tag name - - te_libraries = [ - ["donghong", "/workspace/pangene/test_data/donghong.TElib.fa.gz"] - ] - // Pattern: [ [tag, fasta(.gz) ] ] - // Optional Set to null if libraries are not available. - // - // Each TE library should have an associated (by tag) assembly in target_assemblies. - // Not all target_assemblies need to have an associated (by tag) TE library. - // When the TE lib is not available for a traget assembly, EDTA is used to create one. 
+ // Input/output options + input = null + external_protein_fastas = null + eggnogmapper_db_dir = null + eggnogmapper_tax_scope = null + fastq = null + liftoff_annotations = null + outdir = "./results" + // Repeat annotation options repeat_annotator = 'repeatmodeler' - // 'repeatmodeler' or 'edta' - - save_annotated_te_lib = true - + save_annotated_te_lib = false edta_is_sensitive = false + repeatmasker_save_outputs = false - repeatmasker_save_outputs = true - - samplesheet = "/workspace/pangene/test_data/samplesheet.csv" - // Optional: Set to null if not available - + // RNASeq pre-processing options skip_fastqc = false skip_fastp = false min_trimmed_reads = 10000 extra_fastp_args = "" - - save_trimmed = true - // toggling this parameter results in rerun of FASTP and FASTQC_TRIM - + save_trimmed = false remove_ribo_rna = false - save_non_ribo_reads = true + save_non_ribo_reads = false ribo_database_manifest = "${projectDir}/assets/rrna-db-defaults.txt" + // RNAseq alignment options star_max_intron_length = 16000 star_align_extra_args = "" - star_save_outputs = true - save_cat_bam = true - // A single BAM is created for each assembly from all the RNAseq samples, if there - // are more than one - - external_protein_fastas = [ - "/workspace/ComparativeDataSources/OrthoDB11/Viridiplantae.fa.gz", - "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.pep.fasta" - ] - // Optional: Set to null if not available + star_save_outputs = false + save_cat_bam = false + // Annotation options braker_extra_args = "" - - liftoff_xref_annotations = [ - [ - "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/Russell_V2a.chromosomes.and.unassiged.and.haplotigs.fsa", - "/output/genomic/fairGenomes/Plant/Actinidia/chinensis/var_chinensis/male/2x/assembly_russell/v2.1/RU01.20221115150135.gff3" - ], - [ - 
"/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_chr_all.fas", - "/output/genomic/fairGenomes/Plant/Arabidopsis/thaliana/var_na/sex_na/2x/assembly_tair/v10/TAIR10_GFF3_genes_transposons.fixed.gff3" - ] - ] - // Format: [ [ fasta(.gz), gff3(.gz) ] ] - // Optional: Set to null if not available - + braker_allow_isoforms = true liftoff_coverage = 0.9 liftoff_identity = 0.9 + eggnogmapper_evalue = 0.00001 + eggnogmapper_pident = 35 + eggnogmapper_purge_nohits = false - outdir = "./results" - + // Max job request options max_cpus = 12 - max_memory = 200.GB - max_time = 1.days + max_memory = '200.GB' + max_time = '7.day' + + // Infrastructure options + validationSkipDuplicateCheck= true + validationS3PathCheck = true +} + +manifest { + name = 'pangene' + author = """Usman Rashid, Jason Shiller""" + homePage = 'https://github.com/PlantandFoodResearch/pan-gene' + description = """A NextFlow pipeline for pan-genome annotation""" + mainScript = 'main.nf' + nextflowVersion = '!>=23.04.4' + version = '0.3.0' + doi = '' +} + +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = true + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" +} +report { + enabled = true + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" +} +trace { + enabled = true + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" +} + +plugins { + id 'nf-validation@1.1.3' } -includeConfig './conf/manifest.config' includeConfig './conf/modules.config' -includeConfig './conf/reporting_defaults.config' diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000..988f87a --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,293 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/plantandfoodresearch/pangene/master/nextflow_schema.json", + "title": 
"plantandfoodresearch/pangene pipeline parameters", + "description": "A NextFlow pipeline for pan-genome annotation", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "", + "required": ["input", "outdir", "external_protein_fastas", "eggnogmapper_db_dir", "eggnogmapper_tax_scope"], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "schema": "assets/schema_input.json", + "pattern": "^\\S+\\.csv$", + "description": "Target assemblies listed in a CSV sheet", + "fa_icon": "fas fa-file-csv", + "help_text": "FASTA and other associated files for target assemblies provided as a CSV sheet" + }, + "external_protein_fastas": { + "type": "string", + "description": "External protein fastas listed in a text sheet", + "help_text": "A text file listing FASTA files to provide protein evidence for annotation", + "format": "file-path", + "mimetype": "text/txt", + "fa_icon": "far fa-file-alt" + }, + "eggnogmapper_db_dir": { + "type": "string", + "description": "Eggnogmapper database directory", + "format": "directory-path" + }, + "eggnogmapper_tax_scope": { + "type": "integer", + "description": "Eggnogmapper taxonomy scope", + "minimum": 0 + }, + "fastq": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "schema": "assets/schema_fastq.json", + "pattern": "^\\S+\\.csv$", + "help_text": "FASTQ files for RNASeq samples corresponding to each target assembly provided in a CSV sheet", + "fa_icon": "fas fa-file-csv", + "description": "FASTQ samples listed in a CSV sheet" + }, + "liftoff_annotations": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "schema": "assets/schema_liftoff.json", + "pattern": "^\\S+\\.csv$", + "description": "Reference annotations listed in a CSV sheet", + "help_text": "FASTA and GFF3 files for reference annotations for liftoff listed in a CSV
sheet", + "fa_icon": "fas fa-file-csv" + }, + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved", + "fa_icon": "fas fa-folder-open", + "default": "./results", + "help_text": " Use absolute paths to storage on Cloud infrastructure" + } + } + }, + "repeat_annotation_options": { + "title": "Repeat annotation options", + "type": "object", + "description": "", + "default": "", + "properties": { + "repeat_annotator": { + "type": "string", + "default": "repeatmodeler", + "enum": ["edta", "repeatmodeler"], + "description": "'edta' or 'repeatmodeler'" + }, + "save_annotated_te_lib": { + "type": "boolean", + "description": "Save annotated TE library or not?" + }, + "edta_is_sensitive": { + "type": "boolean", + "description": "Use '--sensitive 1' flag with EDTA or not?" + }, + "repeatmasker_save_outputs": { + "type": "boolean", + "description": "Save the repeat-masked genome or not?" + } + } + }, + "rnaseq_pre_processing_options": { + "title": "RNASeq pre-processing options", + "type": "object", + "description": "", + "default": "", + "properties": { + "skip_fastqc": { + "type": "boolean", + "description": "Skip FASTQC or not?" + }, + "skip_fastp": { + "type": "boolean", + "description": "Skip trimming by FASTP or not?" + }, + "min_trimmed_reads": { + "type": "integer", + "default": 10000, + "description": "Exclude a sample if its reads after trimming are below this number", + "minimum": 0 + }, + "extra_fastp_args": { + "type": "string", + "description": "Extra FASTP arguments" + }, + "save_trimmed": { + "type": "boolean", + "description": "Save FASTQ files after trimming or not?" + }, + "remove_ribo_rna": { + "type": "boolean", + "description": "Remove Ribosomal RNA or not?" + }, + "save_non_ribo_reads": { + "type": "boolean", + "description": "Save FASTQ files after Ribosomal RNA removal or not?"
+ }, + "ribo_database_manifest": { + "type": "string", + "default": "${projectDir}/assets/rrna-db-defaults.txt", + "description": "Ribosomal RNA fastas listed in a text sheet", + "format": "file-path", + "mimetype": "text/txt" + } + } + }, + "rnaseq_alignment_options": { + "title": "RNAseq alignment options", + "type": "object", + "description": "", + "default": "", + "properties": { + "star_max_intron_length": { + "type": "integer", + "default": 16000, + "minimum": 0, + "description": "Maximum intron length for STAR alignment" + }, + "star_align_extra_args": { + "type": "string", + "description": "EXTRA arguments for STAR" + }, + "star_save_outputs": { + "type": "boolean", + "description": "Save BAM files from STAR or not?" + }, + "save_cat_bam": { + "type": "boolean", + "description": "SAVE a concatenated BAM file per assembly or not?" + } + } + }, + "annotation_options": { + "title": "Annotation options", + "type": "object", + "description": "", + "default": "", + "properties": { + "braker_extra_args": { + "type": "string", + "description": "Extra arguments for BRAKER" + }, + "braker_allow_isoforms": { + "type": "boolean", + "default": true, + "description": "Allow multiple isoforms for gene models" + }, + "liftoff_coverage": { + "type": "number", + "default": 0.9, + "minimum": 0, + "maximum": 1, + "description": "Liftoff coverage parameter" + }, + "liftoff_identity": { + "type": "number", + "default": 0.9, + "description": "Liftoff identity parameter" + }, + "eggnogmapper_evalue": { + "type": "number", + "default": 1e-5, + "description": "Only report alignments below or equal the e-value threshold" + }, + "eggnogmapper_pident": { + "type": "integer", + "default": 35, + "description": "Only report alignments above or equal to the given percentage of identity (0-100)", + "minimum": 0, + "maximum": 100 + }, + "eggnogmapper_purge_nohits": { + "type": "boolean", + "description": "Purge transcripts which do not have a hit against eggnog" + } + } + }, + 
"max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 12, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "200.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "7.day", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" + } + } + }, + "infrastructure_options": { + "title": "Infrastructure options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "", + "help_text": "", + "properties": { + "validationSkipDuplicateCheck": { + "type": "boolean", + "default": true, + "hidden": true + }, + "validationS3PathCheck": { + "type": "boolean", + "default": true, + "hidden": true + } + } + } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/repeat_annotation_options" + }, + { + "$ref": "#/definitions/rnaseq_pre_processing_options" + }, + { + "$ref": "#/definitions/rnaseq_alignment_options" + }, + { + "$ref": "#/definitions/annotation_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/infrastructure_options" + } + ] +} diff --git a/pangene_pfr b/pangene_pfr deleted file mode 100644 index 608798c..0000000 --- a/pangene_pfr +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -e - - -#SBATCH --job-name PANGENE -#SBATCH --time=1-00:00:00 -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=1 -#SBATCH --output pangene_pfr.stdout -#SBATCH --error pangene_pfr.stderr -#SBATCH --mem=4G - -ml apptainer/1.1 -ml nextflow/23.04.4 - -export TMPDIR="/workspace/$USER/tmp" -export APPTAINER_BINDPATH="$APPTAINER_BINDPATH,$TMPDIR:$TMPDIR,$TMPDIR:/tmp" - -nextflow \ - main.nf \ - -profile pfr,apptainer \ - -resume diff --git a/pfr/params.json b/pfr/params.json new file mode 100644 index 0000000..cb7d801 --- /dev/null +++ b/pfr/params.json @@ -0,0 +1,8 @@ +{ + "input": "/workspace/pangene/test_data/assemblysheet.csv", + "external_protein_fastas": "/workspace/pangene/test_data/external-protein-fastas.txt", + "eggnogmapper_db_dir": "/workspace/ComparativeDataSources/emapperdb/5.0.2", + "eggnogmapper_tax_scope": 33090, + "fastq": "/workspace/pangene/test_data/fastqsheet.csv", + "liftoff_annotations": "/workspace/pangene/test_data/liftoffannotations.csv" +} diff --git 
a/pfr/profile.config b/pfr/profile.config new file mode 100644 index 0000000..b0eba29 --- /dev/null +++ b/pfr/profile.config @@ -0,0 +1,17 @@ +profiles { + pfr { + process { + executor = 'slurm' + } + + apptainer { + envWhitelist = "APPTAINER_BINDPATH,APPTAINER_BIND" + cacheDir = "/workspace/pangene/singularity" + } + } +} + +// params { +// config_profile_name = 'Plant&Food profile' +// config_profile_description = 'Plant&Food profile using SLURM in combination with Apptainer' +// } diff --git a/pfr_pangene b/pfr_pangene new file mode 100644 index 0000000..4f9bf4c --- /dev/null +++ b/pfr_pangene @@ -0,0 +1,49 @@ +#!/bin/bash -e + + +#SBATCH --job-name PANGENE +#SBATCH --time=7-00:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 +#SBATCH --output pfr_pangene.stdout +#SBATCH --error pfr_pangene.stderr +#SBATCH --mem=4G + +full_test_flag=0 + +# Parse command line options +while getopts "t" opt; do + case ${opt} in + t ) + full_test_flag=1 + ;; + \? ) + echo "Invalid option: $OPTARG" 1>&2 + exit 1 + ;; + esac +done +shift $((OPTIND -1)) + +ml unload perl +ml apptainer/1.1 +ml nextflow/23.04.4 + +export TMPDIR="/workspace/$USER/tmp" +export APPTAINER_BINDPATH="$APPTAINER_BINDPATH,$TMPDIR:$TMPDIR,$TMPDIR:/tmp" + +if [ $full_test_flag -eq 1 ]; then + nextflow \ + main.nf \ + -c pfr/profile.config \ + -profile pfr,apptainer,test_full \ + -resume +else + nextflow \ + main.nf \ + -c pfr/profile.config \ + -profile pfr,apptainer \ + -params-file pfr/params.json \ + -resume +fi diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5611062 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,15 @@ +# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff. +# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. 
+[tool.ruff] +line-length = 120 +target-version = "py38" +cache-dir = "~/.cache/ruff" + +[tool.ruff.lint] +select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"] + +[tool.ruff.lint.isort] +known-first-party = ["nf_core"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/subworkflows/local/align_rnaseq.nf b/subworkflows/local/align_rnaseq.nf index 05cd2fa..94d8b8d 100644 --- a/subworkflows/local/align_rnaseq.nf +++ b/subworkflows/local/align_rnaseq.nf @@ -71,6 +71,6 @@ workflow ALIGN_RNASEQ { ch_versions = ch_versions.mix(SAMTOOLS_CAT.out.versions.first()) emit: - bam = ch_samtools_bam // channel: [ [ id, single_end, target_assembly ], [ bam ] ] + bam = ch_samtools_bam // channel: [ [ id: target_assembly, single_end ], [ bam ] ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/extract_samples.nf b/subworkflows/local/extract_samples.nf deleted file mode 100644 index 947c0b7..0000000 --- a/subworkflows/local/extract_samples.nf +++ /dev/null @@ -1,71 +0,0 @@ -// Source: -// https://github.com/nf-core/rnaseq -// MIT: https://github.com/nf-core/rnaseq/blob/master/LICENSE -// -// Check input samplesheet and get read channels -// -// Changes: -// Added channel permissible_target_assemblies -// Changed file name from input_check.nf to extract_samples.nf -// Removed strandedness -// Nowing emitting an extra channel 'assemblies' which indicates the -// assemblies targeted by each read - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' - -workflow EXTRACT_SAMPLES { - take: - samplesheet // file: /path/to/samplesheet.csv - permissible_target_assemblies // val: assembly_a,assembly_b - - main: - SAMPLESHEET_CHECK ( samplesheet, permissible_target_assemblies ) - .csv - | splitCsv ( header:true, sep:',' ) - | combine ( samplesheet ) - | map { row, sheet -> - create_fastq_channel(row, sheet.getParent()) - } - | set { ch_reads } - - reads = ch_reads.map { meta, fastq -> [[id:meta.id, 
single_end:meta.single_end], fastq]} - - ch_reads - | flatMap { meta, fastq -> - meta.target_assemblies.collect { assembly -> [[id:meta.id, single_end:meta.single_end], assembly] } - } - | set { assemblies } - - emit: - reads // channel: [ val(meta), [ reads ] ] - assemblies // channel: [ val(meta), val(assembly) ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row, sheetPath) { - // create meta map - def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - meta.target_assemblies = row.target_assemblies.split(";").sort() - - def fq1 = row.fastq_1.startsWith("/") ? row.fastq_1 : "$sheetPath/${row.fastq_1}" - def fq2 = row.fastq_2.startsWith("/") ? row.fastq_2 : "$sheetPath/${row.fastq_2}" - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(fq1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${fq1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(fq1) ] ] - } else { - if (!file(fq2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${fq2}" - } - fastq_meta = [ meta, [ file(fq1), file(fq2) ] ] - } - - return fastq_meta -} diff --git a/subworkflows/local/fasta_braker3.nf b/subworkflows/local/fasta_braker3.nf new file mode 100644 index 0000000..b76bdf7 --- /dev/null +++ b/subworkflows/local/fasta_braker3.nf @@ -0,0 +1,65 @@ +include { BRAKER3 } from '../../modules/kherronism/braker3' +include { FILE_GUNZIP as BRAKER_GFF3_GUNZIP } from '../../subworkflows/local/file_gunzip' +include { FILE_GUNZIP as BRAKER_HINTS_GUNZIP } from '../../subworkflows/local/file_gunzip' + +workflow FASTA_BRAKER3 { + take: + ch_masked_target_assembly // channel: [ meta, fasta ]; meta ~ [ id: target_assembly ] + ch_braker_ex_asm_str // channel: val(assembly_x,assembly_y) + ch_rnaseq_bam //
channel: [ meta, bam ] + ch_ext_prots_fasta // channel: [ meta2, fasta ]; meta2 ~ [ id: ext_protein_seqs ] + ch_braker_annotation // channel: [ meta, gff3, hints.gff ] + + main: + ch_versions = Channel.empty() + + + ch_braker_inputs = ch_masked_target_assembly + | combine( ch_braker_ex_asm_str ) + | filter { meta, fasta, ex_str -> !( ex_str.split(",").contains( meta.id ) ) } + | map { meta, fasta, ex_str -> + [ meta, fasta ] + } + | join(ch_rnaseq_bam, remainder: true) + | combine( + ch_ext_prots_fasta.map { meta, fasta -> fasta }.ifEmpty(null) + ) + | map { meta, fasta, bam, prots -> [ meta, fasta, bam ?: [], prots ?: [] ] } + + def rnaseq_sets_dirs = [] + def rnaseq_sets_ids = [] + def hintsfile = [] + + // MODULE: BRAKER3 + BRAKER3( + ch_braker_inputs.map { meta, fasta, bam, prots -> [meta, fasta] }, + ch_braker_inputs.map { meta, fasta, bam, prots -> bam }, + rnaseq_sets_dirs, + rnaseq_sets_ids, + ch_braker_inputs.map { meta, fasta, bam, prots -> prots }, + hintsfile + ) + + ch_braker_gff3 = BRAKER3.out.gff3 + | mix( ch_braker_annotation.map { meta, gff3, hints -> [ meta, gff3 ] } ) + ch_braker_hints = BRAKER3.out.hintsfile + | mix( ch_braker_annotation.map { meta, gff3, hints -> [ meta, hints ] } ) + ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) + + // WORKFLOW: FILE_GUNZIP as BRAKER_GFF3_GUNZIP + BRAKER_GFF3_GUNZIP ( ch_braker_gff3 ) + + ch_braker_gff3 = BRAKER_GFF3_GUNZIP.out.gunzip + ch_versions = ch_versions.mix(BRAKER_GFF3_GUNZIP.out.versions) + + // WORKFLOW: FILE_GUNZIP as BRAKER_HINTS_GUNZIP + BRAKER_HINTS_GUNZIP ( ch_braker_hints ) + + ch_braker_hints = BRAKER_HINTS_GUNZIP.out.gunzip + ch_versions = ch_versions.mix(BRAKER_HINTS_GUNZIP.out.versions) + + emit: + braker_gff3 = ch_braker_gff3 // [ meta, gff3 ] + braker_hints = ch_braker_hints // [ meta, hints.gff ] + versions = ch_versions // [ versions.yml ] +} diff --git a/subworkflows/local/fasta_liftoff.nf b/subworkflows/local/fasta_liftoff.nf index 4c59ba3..01cd776 100644 --- 
a/subworkflows/local/fasta_liftoff.nf +++ b/subworkflows/local/fasta_liftoff.nf @@ -1,7 +1,10 @@ -include { GUNZIP as GUNZIP_FASTA } from '../../modules/nf-core/gunzip' -include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip' -include { GFFREAD } from '../../modules/nf-core/gffread' -include { LIFTOFF } from '../../modules/pfr/liftoff' +include { GUNZIP as GUNZIP_FASTA } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip/main' +include { GFFREAD as GFFREAD_BEFORE_LIFTOFF } from '../../modules/nf-core/gffread/main' +include { LIFTOFF } from '../../modules/nf-core/liftoff/main' +include { AGAT_SPMERGEANNOTATIONS as MERGE_LIFTOFF_ANNOTATIONS } from '../../modules/pfr/agat/spmergeannotations/main' +include { AGAT_SPFILTERFEATUREFROMKILLLIST } from '../../modules/pfr/agat/spfilterfeaturefromkilllist/main' +include { GFFREAD as GFFREAD_AFTER_LIFTOFF } from '../../modules/nf-core/gffread/main' workflow FASTA_LIFTOFF { take: @@ -44,21 +47,11 @@ workflow FASTA_LIFTOFF { ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions.first()) - // MODULE: GFFREAD - ch_gffread_inputs = ch_xref_gunzip_gff - | map { meta, gff -> - [ gff.getSimpleName(), meta, gff ] - } // For meta insertion later, remove when GFFREAD has meta - - GFFREAD ( ch_gffread_inputs.map { name, meta, gff -> gff } ) + // MODULE: GFFREAD as GFFREAD_BEFORE_LIFTOFF + GFFREAD_BEFORE_LIFTOFF ( ch_xref_gunzip_gff, [] ) - ch_gffread_gff = GFFREAD.out.gffread_gff - | map { gff -> [ gff.getSimpleName(), gff ] } - | join(ch_gffread_inputs) - | map { fid, gffread_gff, meta, gff -> [ meta, gffread_gff ] } - // meta insertion - - ch_versions = ch_versions.mix(GFFREAD.out.versions.first()) + ch_gffread_gff = GFFREAD_BEFORE_LIFTOFF.out.gffread_gff + ch_versions = ch_versions.mix(GFFREAD_BEFORE_LIFTOFF.out.versions.first()) // MODULE: LIFTOFF ch_liftoff_inputs = target_assemby @@ -83,7 +76,8 @@ workflow FASTA_LIFTOFF { LIFTOFF( ch_liftoff_inputs.map { meta, 
target_fa, ref_fa, ref_gff -> [ meta, target_fa ] }, ch_liftoff_inputs.map { meta, target_fa, ref_fa, ref_gff -> ref_fa }, - ch_liftoff_inputs.map { meta, target_fa, ref_fa, ref_gff -> ref_gff } + ch_liftoff_inputs.map { meta, target_fa, ref_fa, ref_gff -> ref_gff }, + [] ) ch_liftoff_gff3 = LIFTOFF.out.polished_gff3 @@ -92,7 +86,73 @@ workflow FASTA_LIFTOFF { ch_versions = ch_versions.mix(LIFTOFF.out.versions.first()) + // MODULE: AGAT_SPMERGEANNOTATIONS as MERGE_LIFTOFF_ANNOTATIONS + ch_merge_inputs = ch_liftoff_gff3 + | branch { meta, list_polished -> + one: list_polished.size() == 1 + many: list_polished.size() > 1 + } + + MERGE_LIFTOFF_ANNOTATIONS( + ch_merge_inputs.many, + [] + ) + + ch_merged_gff = MERGE_LIFTOFF_ANNOTATIONS.out.gff.mix(ch_merge_inputs.one) + ch_versions = ch_versions.mix(MERGE_LIFTOFF_ANNOTATIONS.out.versions.first()) + + // COLLECTFILE: Transcript level kill list + ch_kill_list = ch_merged_gff + | map { meta, gff -> + + def tx_from_gff = gff.readLines() + .findAll { it -> + if ( it.startsWith('#') ) { return false } + + def cols = it.split('\t') + def feat = cols[2] + if ( ! 
( feat == 'transcript' || feat == 'mRNA' ) ) { return false } + + def attrs = cols[8] + attrs.contains('valid_ORF=False') + } + .collect { + def cols = it.split('\t') + def attrs = cols[8] + + def matches = attrs =~ /ID=([^;]*)/ + + return matches[0][1] + } + + [ "${meta.id}.kill.list.txt" ] + tx_from_gff.join('\n') + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName.replace('.kill.list', '') ], file ] + } + + // MODULE: AGAT_SPFILTERFEATUREFROMKILLLIST + ch_agat_kill_inputs = ch_merged_gff + | join(ch_kill_list) + + + AGAT_SPFILTERFEATUREFROMKILLLIST( + ch_agat_kill_inputs.map { meta, gff, kill -> [ meta, gff ] }, + ch_agat_kill_inputs.map { meta, gff, kill -> kill }, + [] // default config + ) + + ch_liftoff_purged_gff = AGAT_SPFILTERFEATUREFROMKILLLIST.out.gff + ch_versions = ch_versions.mix(AGAT_SPFILTERFEATUREFROMKILLLIST.out.versions.first()) + + // MODULE: GFFREAD as GFFREAD_AFTER_LIFTOFF + GFFREAD_AFTER_LIFTOFF ( ch_liftoff_purged_gff, [] ) + + ch_attr_trimmed_gff = GFFREAD_AFTER_LIFTOFF.out.gffread_gff + ch_versions = ch_versions.mix(GFFREAD_AFTER_LIFTOFF.out.versions.first()) + emit: - gff3 = ch_liftoff_gff3 // [ meta, [ gff3 ] ] - versions = ch_versions // [ versions.yml ] + gff3 = ch_attr_trimmed_gff // [ meta, gff3 ] + versions = ch_versions // [ versions.yml ] } diff --git a/subworkflows/local/file_gunzip.nf b/subworkflows/local/file_gunzip.nf new file mode 100644 index 0000000..30f3368 --- /dev/null +++ b/subworkflows/local/file_gunzip.nf @@ -0,0 +1,20 @@ +include { GUNZIP } from '../../modules/nf-core/gunzip' + +workflow FILE_GUNZIP { + take: + ch_input // channel [ meta, archive ] + + main: + ch_input_branch = ch_input + | branch { meta, archive -> + gz: "$archive".endsWith('.gz') + rest: ! 
"$archive".endsWith('.gz') + } + + // MODULE: GUNZIP + GUNZIP ( ch_input_branch.gz ) + + emit: + versions = GUNZIP.out.versions.first() + gunzip = GUNZIP.out.gunzip.mix( ch_input_branch.rest ) +} diff --git a/subworkflows/local/gff_eggnogmapper.nf b/subworkflows/local/gff_eggnogmapper.nf new file mode 100644 index 0000000..7ea0d19 --- /dev/null +++ b/subworkflows/local/gff_eggnogmapper.nf @@ -0,0 +1,47 @@ +include { GFFREAD as GFF2FASTA_FOR_EGGNOGMAPPER } from '../../modules/nf-core/gffread/main' +include { EGGNOGMAPPER } from '../../modules/nf-core/eggnogmapper/main' + +workflow GFF_EGGNOGMAPPER { + take: + ch_gff // Channel: [ meta, gff ] + ch_fasta // Channel: [ meta, fasta ] + db_folder // val(db_folder) + + main: + // Versions + ch_versions = Channel.empty() + + // MODULE: GFFREAD as GFF2FASTA_FOR_EGGNOGMAPPER + ch_gffread_inputs = ch_gff + | join(ch_fasta) + + GFF2FASTA_FOR_EGGNOGMAPPER( + ch_gffread_inputs.map { meta, gff, fasta -> [ meta, gff ] }, + ch_gffread_inputs.map { meta, gff, fasta -> fasta } + ) + + ch_gffread_fasta = GFF2FASTA_FOR_EGGNOGMAPPER.out.gffread_fasta + ch_versions = ch_versions.mix(GFF2FASTA_FOR_EGGNOGMAPPER.out.versions.first()) + + + ch_eggnogmapper_inputs = ch_gffread_fasta + | combine(Channel.fromPath(db_folder)) + + EGGNOGMAPPER( + ch_eggnogmapper_inputs.map { meta, fasta, db -> [ meta, fasta ] }, + [], + ch_eggnogmapper_inputs.map { meta, fasta, db -> db }, + [ [], [] ] + ) + + ch_eggnogmapper_annotations = EGGNOGMAPPER.out.annotations + ch_eggnogmapper_orthologs = EGGNOGMAPPER.out.orthologs + ch_eggnogmapper_hits = EGGNOGMAPPER.out.hits + ch_versions = ch_versions.mix(EGGNOGMAPPER.out.versions.first()) + + emit: + eggnogmapper_annotations = ch_eggnogmapper_annotations + eggnogmapper_orthologs = ch_eggnogmapper_orthologs + eggnogmapper_hits = ch_eggnogmapper_hits + versions = ch_versions +} diff --git a/subworkflows/local/gff_merge_cleanup.nf b/subworkflows/local/gff_merge_cleanup.nf new file mode 100644 index 0000000..834af94 --- 
/dev/null +++ b/subworkflows/local/gff_merge_cleanup.nf @@ -0,0 +1,118 @@ +include { AGAT_SPMERGEANNOTATIONS } from '../../modules/pfr/agat/spmergeannotations/main' +include { GT_GFF3 } from '../../modules/nf-core/gt/gff3/main' +include { AGAT_CONVERTSPGXF2GXF } from '../../modules/nf-core/agat/convertspgxf2gxf/main' + +workflow GFF_MERGE_CLEANUP { + take: + ch_braker_gff // Channel: [ meta, gff ] + ch_liftoff_gff // Channel: [ meta, gff ] + + main: + ch_versions = Channel.empty() + + ch_gff_branch = ch_braker_gff + | join(ch_liftoff_gff, remainder:true) + | branch { meta, braker_gff, liftoff_gff -> + both : ( braker_gff && liftoff_gff ) + braker_only : ( braker_gff && ( ! liftoff_gff ) ) + liftoff_only: ( ( ! braker_gff ) && liftoff_gff ) + } + + // MODULE: AGAT_SPMERGEANNOTATIONS + AGAT_SPMERGEANNOTATIONS( + ch_gff_branch.both.map { meta, bg, lg -> [ meta, [ bg, lg ] ] }, + [] + ) + + ch_merged_gff = AGAT_SPMERGEANNOTATIONS.out.gff + | mix ( ch_gff_branch.liftoff_only.map { meta, braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } ) + | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, liftoff_gff -> [ meta, braker_gff ] } ) + ch_versions = ch_versions.mix(AGAT_SPMERGEANNOTATIONS.out.versions.first()) + + // MODULE: GT_GFF3 + GT_GFF3 ( ch_merged_gff ) + + ch_gt_gff = GT_GFF3.out.gt_gff3 + ch_versions = ch_versions.mix(GT_GFF3.out.versions.first()) + + // COLLECTFILE: Format GT_GFF3 output + ch_gt_formatted_gff = ch_gt_gff + | map { meta, gff -> + + def lines = gff.readLines() + .collect { line -> + if ( line.startsWith('##') ) { return line } + if ( line.startsWith('#') ) { return '' } + + def cols = line.split('\t') + def program = cols[1] + def feat = cols[2] + def atts = cols[8] + + def atts_r = '' + // Remove attributes and use AGAT_CONVERTSPGXF2GXF + // to create attributes based on sequential layout + + def feat_r = feat == 'transcript' ? 
'mRNA' : feat + // Use mRNA in place of transcript + + if ( feat != 'gene' || program != 'Liftoff' ) { + return ( cols[0..1] + [ feat_r ] + cols[3..7] + [ atts_r ] ).join('\t') + } + + def gene_id = ( atts =~ /ID=([^;]*)/ )[0][1] + def atts_g = "liftoffID=$gene_id" + + return ( cols[0..7] + [ atts_g ] ).join('\t') + + }.join('\n') + + [ "${meta.id}.bare.gff" ] + [ lines ] + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName.replace('.bare', '') ], file ] + } + + // MODULE: AGAT_CONVERTSPGXF2GXF + AGAT_CONVERTSPGXF2GXF ( ch_gt_formatted_gff ) + + ch_agat_gff = AGAT_CONVERTSPGXF2GXF.out.output_gff + ch_versions = ch_versions.mix(AGAT_CONVERTSPGXF2GXF.out.versions.first()) + + // COLLECTFILE: Format AGAT_CONVERTSPGXF2GXF output + ch_final_gff = ch_agat_gff + | map { meta, gff -> + + def lines = gff.readLines() + .collect { line -> + if ( line.startsWith('#') ) { return line } + + def cols = line.split('\t') + def program = cols[1] + def feat = cols[2] + def atts = cols[8] + def atts_r = atts.replace('-', '').replace('agat', '') + + if ( feat != 'gene' || program != 'Liftoff' ) { + return ( cols[0..7] + [ atts_r ] ).join('\t') + } + + def oldID = ( atts =~ /liftoffID=([^;]*)/ )[0][1] + def newID = ( atts =~ /ID=([^;]*)/ )[0][1].replace('-', '').replace('agat', '') + def atts_g = "ID=${newID};liftoffID=${oldID}" + + return ( cols[0..7] + [ atts_g ] ).join('\t') + }.join('\n') + + [ "${meta.id}.agat.cleanup.gff" ] + [ lines ] + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName.replace('.agat.cleanup', '') ], file ] + } + + emit: + gff = ch_final_gff // [ meta, gff ] + versions = ch_versions // [ versions.yml ] +} diff --git a/subworkflows/local/gff_store.nf b/subworkflows/local/gff_store.nf new file mode 100644 index 0000000..3326e2a --- /dev/null +++ b/subworkflows/local/gff_store.nf @@ -0,0 +1,119 @@ +import java.net.URLEncoder + +include { GT_GFF3 as FINAL_GFF_CHECK } from '../../modules/nf-core/gt/gff3/main' + +workflow 
GFF_STORE { + take: + ch_target_gff // [ meta, gff ] + ch_eggnogmapper_annotations // [ meta, annotations ] + + main: + ch_versions = Channel.empty() + + // COLLECTFILE: Add eggnogmapper hits to gff + ch_described_gff = ch_target_gff + | join(ch_eggnogmapper_annotations) + | map { meta, gff, annotations -> + def tx_annotations = annotations.readLines() + .findAll { ! it.startsWith('#') } + .collect { line -> + def cols = line.split('\t') + def id = cols[0] + def txt = cols[7] + def pfams = cols[20] + + [ id, txt, pfams ] + } + .collect { id, txt, pfams -> + if ( txt != '-' ) { return [ id, txt ] } + if ( pfams != '-' ) { return [ id, "PFAMs: $pfams" ] } + + [ id, 'No eggnog description and PFAMs' ] + } + .collectEntries { id, txt -> + [ id, txt ] + } + + def gene_tx_annotations = [:] + gff.readLines() + .findAll { line -> + if ( line.startsWith('#') || line == '' ) { return false } + + def cols = line.split('\t') + def feat = cols[2] + + if ( ! ( feat == 'transcript' || feat == 'mRNA' ) ) { return false } + + return true + } + .each { line -> + def cols = line.split('\t') + def atts = cols[8] + + def matches = atts =~ /ID=([^;]*)/ + def tx_id = matches[0][1] + + def matches_p= atts =~ /Parent=([^;]*)/ + def gene_id = matches_p[0][1] + + if ( ! gene_tx_annotations.containsKey(gene_id) ) { + gene_tx_annotations[gene_id] = [:] + } + + def anno = tx_annotations.containsKey(tx_id) + ? 
URLEncoder.encode(tx_annotations[tx_id], "UTF-8").replace('+', '%20') + : URLEncoder.encode('Hypothetical protein | no eggnog hit', "UTF-8").replace('+', '%20') + + gene_tx_annotations[gene_id] += [ ( tx_id ): anno ] + } + + gene_tx_annotations = gene_tx_annotations + .collectEntries { gene_id, tx_annos -> + def default_anno = tx_annos.values().first() + + if ( tx_annos.values().findAll { it != default_anno }.size() > 0 ) { + return [ gene_id, ( tx_annos + [ 'default': 'Differing%20isoform%20descriptions' ] ) ] + } + + [ gene_id, ( tx_annos + [ 'default': default_anno ] ) ] + } + + def gff_lines = gff.readLines() + .collect { line -> + + if ( line.startsWith('#') || line == '' ) { return line } + + def cols = line.split('\t') + def feat = cols[2] + def atts = cols[8] + + if ( ! ( feat == 'gene' || feat == 'transcript' || feat == 'mRNA' ) ) { return line } + + def id = feat == 'gene' ? ( atts =~ /ID=([^;]*)/ )[0][1] : ( atts =~ /Parent=([^;]*)/ )[0][1] + + if ( ! gene_tx_annotations.containsKey(id) ) { return line } + + def tx_id = feat == 'gene' ? null : ( atts =~ /ID=([^;]*)/ )[0][1] + def desc = feat == 'gene' ? 
gene_tx_annotations[id]['default'] : gene_tx_annotations[id][tx_id] + + return ( line + ";description=$desc" ) + } + + [ "${meta.id}.described.gff" ] + gff_lines.join('\n') + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName.replace('.described', '') ], file ] + } + + // MODULE: GT_GFF3 as FINAL_GFF_CHECK + FINAL_GFF_CHECK ( ch_described_gff ) + + ch_final_gff = FINAL_GFF_CHECK.out.gt_gff3 + ch_versions = ch_versions.mix(FINAL_GFF_CHECK.out.versions.first()) + + + emit: + final_gff = ch_final_gff // [ meta, gff ] + versions = ch_versions // [ versions.yml ] +} diff --git a/subworkflows/local/prepare_assembly.nf b/subworkflows/local/prepare_assembly.nf index d18f5ce..f0c9d7f 100644 --- a/subworkflows/local/prepare_assembly.nf +++ b/subworkflows/local/prepare_assembly.nf @@ -13,6 +13,7 @@ workflow PREPARE_ASSEMBLY { target_assembly // channel: [ meta, fasta ] te_library // channel: [ meta, fasta ] repeat_annotator // val(String), 'repeatmodeler' or 'edta' + exclude_assemblies // channel: val(assembly_x,assembly_y) main: ch_versions = Channel.empty() @@ -74,6 +75,14 @@ workflow PREPARE_ASSEMBLY { ch_edta_inputs = repeat_annotator != 'edta' ? Channel.empty() : ch_annotator_inputs + | combine( exclude_assemblies ) + | map { meta, fasta, ex_assemblies -> + def ex_list = ex_assemblies.split(",") + + if ( !( ex_list.contains( meta.id ) ) ) { + [ meta, fasta ] + } + } FASTA_EDTA_LAI( ch_edta_inputs, @@ -87,6 +96,14 @@ workflow PREPARE_ASSEMBLY { ch_repeatmodeler_inputs = repeat_annotator != 'repeatmodeler' ? 
Channel.empty() : ch_annotator_inputs + | combine( exclude_assemblies ) + | map { meta, fasta, ex_assemblies -> + def ex_list = ex_assemblies.split(",") + + if ( !( ex_list.contains( meta.id ) ) ) { + [ meta, fasta ] + } + } REPEATMODELER_BUILDDATABASE ( ch_repeatmodeler_inputs ) @@ -113,9 +130,20 @@ workflow PREPARE_ASSEMBLY { ch_versions = ch_versions.mix(REPEATMASKER.out.versions.first()) // MODULE: STAR_GENOMEGENERATE + ch_genomegenerate_inputs = ch_validated_assembly + | combine( exclude_assemblies ) + | map { meta, fasta, ex_assemblies -> + def ex_list = ex_assemblies.split(",") + + if ( !( ex_list.contains( meta.id ) ) ) { + [ meta, fasta ] + } + } + + STAR_GENOMEGENERATE( - ch_validated_assembly, - ch_validated_assembly.map { meta, fasta -> [ [], [] ] } + ch_genomegenerate_inputs, + ch_genomegenerate_inputs.map { meta, fasta -> [ [], [] ] } ) ch_assembly_index = STAR_GENOMEGENERATE.out.index diff --git a/subworkflows/local/preprocess_rnaseq.nf b/subworkflows/local/preprocess_rnaseq.nf index 9466104..72fa176 100644 --- a/subworkflows/local/preprocess_rnaseq.nf +++ b/subworkflows/local/preprocess_rnaseq.nf @@ -1,12 +1,13 @@ include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq' -include { SORTMERNA } from '../../modules/nf-core/sortmerna' -include { EXTRACT_SAMPLES } from '../../subworkflows/local/extract_samples' +include { SORTMERNA as SORTMERNA_INDEX } from '../../modules/nf-core/sortmerna' +include { SORTMERNA as SORTMERNA_READS } from '../../modules/nf-core/sortmerna' include { FASTQ_FASTQC_UMITOOLS_FASTP } from '../../subworkflows/nf-core/fastq_fastqc_umitools_fastp' workflow PREPROCESS_RNASEQ { take: - samplesheet // path: csv + ch_reads // channel: [ [ id, single_end, target_assemblies ], [ [ fq ] ] ] permissible_assemblies // val: assembly_a,assembly_b + exclude_assemblies // channel: val(assembly_x,assembly_y) skip_fastqc // val: true|false skip_fastp // val: true|false save_trimmed // val: true|false @@ -15,36 +16,37 @@ workflow 
PREPROCESS_RNASEQ { sortmerna_fastas // channel: [ [ fasta ] ] main: - ch_versions = Channel.empty() + ch_versions = Channel.empty() - // SUBWORKFLOW: EXTRACT_SAMPLES - EXTRACT_SAMPLES( - samplesheet, - permissible_assemblies - ) + ch_fastq = ch_reads + | combine( exclude_assemblies ) + | map { meta, fqs, ex_assemblies -> + def ex_list = ex_assemblies.split(",") - ch_fastq = EXTRACT_SAMPLES.out.reads - | map { meta, fastq -> - groupID = meta.id - ~/_T\d+/ - [ meta + [id: groupID], fastq ] + if ( !( meta.target_assemblies.every { ex_list.contains( it ) } ) ) { + [ [ id:meta.id, single_end:meta.single_end ], fqs ] + } } - | groupTuple() - | branch { meta, fastq -> - single : fastq.size() == 1 - return [ meta, fastq.flatten() ] - multiple: fastq.size() > 1 - return [ meta, fastq.flatten() ] + | branch { meta, fqs -> + single : fqs.size() == 1 + return [ meta, fqs.flatten() ] + multiple: fqs.size() > 1 + return [ meta, fqs.flatten() ] } - ch_reads_target = EXTRACT_SAMPLES.out.assemblies - | map { meta, assembly -> - groupID = meta.id - ~/_T\d+/ - [ meta + [id: groupID], assembly ] + + ch_reads_target = ch_reads + | combine( exclude_assemblies ) + | flatMap { meta, fqs, ex_assemblies -> + def ex_list = ex_assemblies.split(",") + + meta + .target_assemblies + .collect { assembly -> [ [ id:meta.id, single_end:meta.single_end ], assembly ] } + .findAll { _meta, assembly -> !( ex_list.contains( assembly ) ) } } | unique - ch_versions = ch_versions.mix(EXTRACT_SAMPLES.out.versions) - // MODULES: CAT_FASTQ CAT_FASTQ ( ch_fastq.multiple ) @@ -81,21 +83,41 @@ workflow PREPROCESS_RNASEQ { ch_versions = ch_versions.mix(FASTQ_FASTQC_UMITOOLS_FASTP.out.versions.first()) - // MODULE: SORTMERNA - SORTMERNA( - remove_ribo_rna ? 
ch_trim_reads : Channel.empty(), - sortmerna_fastas + + // MODULE: SORTMERNA as SORTMERNA_INDEX + SORTMERNA_INDEX( + [ [ id: 'idx' ], [] ], + sortmerna_fastas.map { fastas -> [ [ id: 'fastas' ], fastas ] }, + [ [], [] ] + ) + + ch_versions = ch_versions.mix(SORTMERNA_INDEX.out.versions) + + // MODULE: SORTMERNA as SORTMERNA_READS + ch_sortmerna_inputs = remove_ribo_rna + ? ch_trim_reads + | combine( + sortmerna_fastas + | map { fastas -> [ [ id: 'fastas' ], fastas ] } + | join(SORTMERNA_INDEX.out.index) + ) + : Channel.empty() + + SORTMERNA_READS( + ch_sortmerna_inputs.map { meta, reads, meta2, fastas, idx -> [ meta, reads ] }, + ch_sortmerna_inputs.map { meta, reads, meta2, fastas, idx -> [ meta2, fastas ] }, + ch_sortmerna_inputs.map { meta, reads, meta2, fastas, idx -> [ meta2, idx ] } ) ch_emitted_reads = remove_ribo_rna - ? SORTMERNA.out.reads + ? SORTMERNA_READS.out.reads : ch_trim_reads - ch_versions = ch_versions.mix(SORTMERNA.out.versions.first()) + ch_versions = ch_versions.mix(SORTMERNA_READS.out.versions.first()) emit: - trim_reads = ch_emitted_reads // channel: [ meta, [ fq ] ] - reads_target = ch_reads_target // channel: [ meta, assembly_id ] + trim_reads = ch_emitted_reads // channel: [ [ id, single_end ], [ fq ] ] + reads_target = ch_reads_target // channel: [ [ id, single_end ], assembly_id ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/purge_breaker_models.nf b/subworkflows/local/purge_breaker_models.nf new file mode 100644 index 0000000..db828ab --- /dev/null +++ b/subworkflows/local/purge_breaker_models.nf @@ -0,0 +1,241 @@ +include { AGAT_CONVERTSPGFF2GTF } from '../../modules/nf-core/agat/convertspgff2gtf/main' +include { TSEBRA } from '../../modules/pfr/tsebra/main' +include { AGAT_CONVERTSPGXF2GXF } from '../../modules/nf-core/agat/convertspgxf2gxf/main' +include { AGAT_SPFILTERFEATUREFROMKILLLIST as KILL_TSEBRA_ISOFORMS } from '../../modules/pfr/agat/spfilterfeaturefromkilllist/main' +include { 
GFFCOMPARE as COMPARE_BRAKER_TO_LIFTOFF } from '../../modules/nf-core/gffcompare/main' +include { AGAT_SPFILTERFEATUREFROMKILLLIST } from '../../modules/pfr/agat/spfilterfeaturefromkilllist/main' +include { GFFCOMPARE as VALIDATE_PURGING_BY_AGAT } from '../../modules/nf-core/gffcompare/main' +include { AGAT_SPMERGEANNOTATIONS as MERGE_BRAKER_LIFTOFF } from '../../modules/pfr/agat/spmergeannotations/main' + +workflow PURGE_BREAKER_MODELS { + take: + braker_gff3 // [ meta, gff3 ] + braker_hints // [ meta, gff ] + liftoff_gff3 // [ meta, gff3 ] + tsebra_config // val(tsebra_config) + braker_allow_isoforms // val(true|false) + + main: + ch_versions = Channel.empty() + + // MODULE: AGAT_CONVERTSPGFF2GTF + AGAT_CONVERTSPGFF2GTF ( braker_gff3 ) + + ch_braker_gtf = AGAT_CONVERTSPGFF2GTF.out.output_gtf + ch_versions = ch_versions.mix(AGAT_CONVERTSPGFF2GTF.out.versions.first()) + + // COLLECTFILE: Prepare for TSEBRA + ch_tsebra_input_gtf = ch_braker_gtf + | map { meta, gtf -> + + def lines = gtf.readLines() + .collect { line -> + if ( line.startsWith('#') ) { return line } + + def cols = line.split('\t') + def feat = cols[2] + + if ( ! ( feat in [ 'gene', 'transcript', 'mRNA' ] ) ) { return line } + + def atts = cols[8] + def matches = atts =~ /ID ([^;]*)/ + def id = matches[0][1] + + def feat_format = ( feat == 'mRNA' ) ? 
'transcript' : feat + + return ( cols[0..1] + [ feat_format ] + cols[3..7] + [ id ] ).join('\t') + }.join('\n') + + [ "${meta.id}.clean.gtf" ] + [ lines ] + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName.replace(".clean", "") ], file ] + } + + // MODULE: TSEBRA + ch_tsebra_inputs = ch_tsebra_input_gtf + | join(braker_hints) + | combine(Channel.fromPath(tsebra_config)) + TSEBRA( + ch_tsebra_inputs.map { meta, gtf, gff, cfg -> [ meta, [ gtf ] ] }, + ch_tsebra_inputs.map { meta, gtf, gff, cfg -> [ gff ] }, + [], + ch_tsebra_inputs.map { meta, gtf, gff, cfg -> cfg } + ) + + ch_tsebra_gtf = TSEBRA.out.tsebra_gtf + ch_versions = ch_versions.mix(TSEBRA.out.versions.first()) + + // COLLECTFILE: Format TSEBRA output + ch_tsebra_formatted_gtf = ch_tsebra_gtf + | map { meta, gtf -> + + def lines = gtf.readLines() + .collect { line -> + if ( line.startsWith('#') ) { return line } + + def cols = line.split('\t') + def atts_r = '' + // Remove attributes and use AGAT_CONVERTSPGXF2GXF + // to create attributes based on sequential layout + + return ( cols[0..7] + [ atts_r ] ).join('\t') + }.join('\n') + + [ "${meta.id}.gtf" ] + [ lines ] + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName ], file ] + } + + // MODULE: AGAT_CONVERTSPGXF2GXF + AGAT_CONVERTSPGXF2GXF ( ch_tsebra_formatted_gtf ) + + ch_tsebra_formatted_gff = AGAT_CONVERTSPGXF2GXF.out.output_gff + ch_versions = ch_versions.mix(AGAT_CONVERTSPGXF2GXF.out.versions.first()) + + // COLLECTFILE: Format AGAT_CONVERTSPGXF2GXF output + ch_tsebra_gff = ch_tsebra_formatted_gff + | map { meta, gff -> + + def lines = gff.readLines() + .collect { line -> + if ( line.startsWith('#') ) { return line } + + def cols = line.split('\t') + def atts_r = cols[8].replaceAll('-', '').replaceAll('agat', '') + + return ( cols[0..7] + [ atts_r ] ).join('\t') + }.join('\n') + + [ "${meta.id}.gff3" ] + [ lines ] + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName ], file ] + 
} + + // COLLECTFILE: Iso-form kill list if braker_allow_isoforms=false + ch_post_tsebra_kill_list = braker_allow_isoforms + ? Channel.empty() + : ch_tsebra_gff + | map { meta, gff -> + def kill_list = gff.readLines() + .findAll { line -> + if ( line.startsWith('#') ) { return false } + + def cols = line.split('\t') + def feat = cols[2] + + ( feat == 'mRNA' || feat == 'transcript' ) + } + .collect { line -> + def cols = line.split('\t') + def atts = cols[8] + def tx_id = ( atts =~ /ID=([^;]*)/ )[0][1] + def g_id = ( atts =~ /Parent=([^;]*)/ )[0][1] + + [ g_id, tx_id ] + } + .groupBy { g_id, tx_id -> g_id } + .findAll { key, value -> value.size() > 1 } + .collect { key, value -> + value.collect { it[1] }[1..-1] + } + .flatten() + .join('\n') + + [ "${meta.id}.kill.list.txt" ] + [ kill_list ] + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName.replace('.kill.list', '') ], file ] + } + + // MODULE: AGAT_SPFILTERFEATUREFROMKILLLIST as KILL_TSEBRA_ISOFORMS + ch_tsebra_kill_inputs = ch_tsebra_gff + | join(ch_post_tsebra_kill_list) + + + KILL_TSEBRA_ISOFORMS( + ch_tsebra_kill_inputs.map { meta, gff, kill -> [ meta, gff ] }, + ch_tsebra_kill_inputs.map { meta, gff, kill -> kill }, + [] // default config + ) + + ch_tsebra_killed_gff = ch_tsebra_gff + | join(KILL_TSEBRA_ISOFORMS.out.gff, remainder: true) + | map { meta, tsebra, killed -> + if ( tsebra ) { [ meta, killed ?: tsebra ] } + } + ch_versions = ch_versions.mix(KILL_TSEBRA_ISOFORMS.out.versions.first()) + + // MODULE: GFFCOMPARE as COMPARE_BRAKER_TO_LIFTOFF + ch_comparison_inputs = ch_tsebra_killed_gff + | join(liftoff_gff3) + + + COMPARE_BRAKER_TO_LIFTOFF ( + ch_comparison_inputs.map { meta, braker, liftoff -> [ meta, braker ] }, + [ [], [], [] ], + ch_comparison_inputs.map { meta, braker, liftoff -> [ meta, liftoff ] }, + ) + + ch_tracking = COMPARE_BRAKER_TO_LIFTOFF.out.tracking + ch_versions = ch_versions.mix(COMPARE_BRAKER_TO_LIFTOFF.out.versions.first()) + + // COLLECTFILE: Transcript 
level kill list + ch_kill_list = ch_tracking + | map { meta, tracking -> + + def kept_lines = tracking.readLines() + .findAll { line -> + def cols = line.split('\t') + + ( cols[3] != 'u' ) && ( cols[3] != 'p' ) + } + + def tx_kill_list = kept_lines + .collect { line -> + def cols = line.split('\t') + + def matched = cols[4] =~ /q1:([^\|]+)\|([^\|]+)/ + + matched[0][2].trim() + }.join('\n') + + [ "${meta.id}.kill.list.txt" ] + tx_kill_list + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName.replace('.kill.list', '') ], file ] + } + + // MODULE: AGAT_SPFILTERFEATUREFROMKILLLIST + ch_agat_kill_inputs = ch_tsebra_killed_gff + | join(ch_kill_list) + + + AGAT_SPFILTERFEATUREFROMKILLLIST( + ch_agat_kill_inputs.map { meta, gff, kill -> [ meta, gff ] }, + ch_agat_kill_inputs.map { meta, gff, kill -> kill }, + [] // default config + ) + + ch_braker_purged_gff = AGAT_SPFILTERFEATUREFROMKILLLIST.out.gff + ch_versions = ch_versions.mix(AGAT_SPFILTERFEATUREFROMKILLLIST.out.versions.first()) + + // Handle case where liftoff is not present + ch_all_braker_gff = ch_tsebra_killed_gff + | join(ch_braker_purged_gff, remainder:true) + | map { meta, tsebra_gff, purged_gff -> + if ( purged_gff ) { return [ meta, purged_gff ] } + if ( tsebra_gff ) { return [ meta, tsebra_gff ] } + } + + emit: + braker_purged_gff = ch_all_braker_gff // [ meta, gff3 ] + versions = ch_versions // [ versions.yml ] +} diff --git a/subworkflows/local/purge_nohit_models.nf b/subworkflows/local/purge_nohit_models.nf new file mode 100644 index 0000000..d213dc2 --- /dev/null +++ b/subworkflows/local/purge_nohit_models.nf @@ -0,0 +1,65 @@ +include { AGAT_SPFILTERFEATUREFROMKILLLIST } from '../../modules/pfr/agat/spfilterfeaturefromkilllist/main' + +workflow PURGE_NOHIT_MODELS { + take: + ch_target_gff // [ meta, gff ] + ch_eggnogmapper_hits // [ meta, hits ] + val_purge_nohits // val(true|false) + + main: + ch_versions = Channel.empty() + + // COLLECTFILE: Transcript level kill list + 
ch_kill_list = ch_target_gff + | join(ch_eggnogmapper_hits) + | map { meta, gff, hits -> + + def tx_with_hits = hits.readLines() + .collect { it.split('\t')[0] } + .sort(false) + .unique() + + def tx_in_gff = gff.readLines() + .findAll { line -> + if ( line.startsWith('#') || line == '' ) { return false } + + def feat = line.split('\t')[2] + ( feat == 'transcript' || feat == 'mRNA' ) + } + .collect { it -> + def attrs = it.split('\t')[8] + + ( attrs =~ /ID=([^;]*)/ )[0][1] + } + .sort(false) + .unique() + + def tx_without_hits = tx_in_gff - tx_with_hits + + [ "${meta.id}.kill.list.txt" ] + tx_without_hits.join('\n') + } + | collectFile(newLine: true) + | map { file -> + [ [ id: file.baseName.replace('.kill.list', '') ], file ] + } + + // MODULE: AGAT_SPFILTERFEATUREFROMKILLLIST + ch_agat_kill_inputs = ! val_purge_nohits + ? Channel.empty() + : ch_target_gff + | join(ch_kill_list) + + + AGAT_SPFILTERFEATUREFROMKILLLIST( + ch_agat_kill_inputs.map { meta, gff, kill -> [ meta, gff ] }, + ch_agat_kill_inputs.map { meta, gff, kill -> kill }, + [] // default config + ) + + ch_target_purged_gff = AGAT_SPFILTERFEATUREFROMKILLLIST.out.gff + ch_versions = ch_versions.mix(AGAT_SPFILTERFEATUREFROMKILLLIST.out.versions.first()) + + emit: + purged_gff = ch_target_purged_gff.mix(val_purge_nohits ? 
Channel.empty() : ch_target_gff) + versions = ch_versions // [ versions.yml ] +} diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf index 2c67b3c..833d82b 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/main.nf @@ -20,6 +20,19 @@ def getFastpReadsAfterFiltering(json_file, min_trimmed_reads) { return json['after_filtering']['total_reads'].toLong() } +def getFastpAdapterSequence(json_file){ + + if (!json_file.text) { return "" } // Usman Rashid: To allow -stub with FASTP + + def Map json = (Map) new JsonSlurper().parseText(json_file.text) + try{ + adapter = json['adapter_cutting']['read1_adapter_sequence'] + } catch(Exception ex){ + adapter = "" + } + return adapter +} + workflow FASTQ_FASTQC_UMITOOLS_FASTP { take: reads // channel: [ val(meta), [ reads ] ] @@ -28,7 +41,7 @@ workflow FASTQ_FASTQC_UMITOOLS_FASTP { skip_umi_extract // boolean: true/false umi_discard_read // integer: 0, 1 or 2 skip_trimming // boolean: true/false - adapter_fasta // file: adapter.fasta + adapter_fasta // file: adapter.fasta save_trimmed_fail // boolean: true/false save_merged // boolean: true/false min_trimmed_reads // integer: > 0 @@ -78,6 +91,8 @@ workflow FASTQ_FASTQC_UMITOOLS_FASTP { fastqc_trim_html = Channel.empty() fastqc_trim_zip = Channel.empty() trim_read_count = Channel.empty() + adapter_seq = Channel.empty() + if (!skip_trimming) { FASTP ( umi_reads, @@ -111,6 +126,10 @@ workflow FASTQ_FASTQC_UMITOOLS_FASTP { .map { meta, reads, num_reads -> [ meta, num_reads ] } .set { trim_read_count } + trim_json + .map { meta, json -> [meta, getFastpAdapterSequence(json)] } + .set { adapter_seq } + if (!skip_fastqc) { FASTQC_TRIM ( trim_reads @@ -128,6 +147,7 @@ workflow FASTQ_FASTQC_UMITOOLS_FASTP { fastqc_raw_zip // channel: [ val(meta), [ zip ] ] umi_log // channel: [ val(meta), [ log ] ] + adapter_seq // channel: [ val(meta), [ 
adapter_seq] ] trim_json // channel: [ val(meta), [ json ] ] trim_html // channel: [ val(meta), [ html ] ] diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml index 220e8db..9308fe9 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/meta.yml @@ -43,7 +43,7 @@ input: - skip_trimming: type: boolean description: | - Allows to skip trimgalore execution + Allows to skip FastP execution - adapter_fasta: type: file description: | @@ -70,10 +70,7 @@ output: type: file description: > Extracted FASTQ files. | For single-end reads, pattern is \${prefix}.umi_extract.fastq.gz. | - - - - For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. + For paired-end reads, pattern is \${prefix}.umi_extract_{1,2}.fastq.gz. pattern: "*.{fastq.gz}" - fastqc_html: type: file @@ -118,6 +115,10 @@ output: type: file description: FastQC report archive pattern: "*_{fastqc.zip}" + - adapter_seq: + type: string + description: | + Adapter Sequence found in read1 - versions: type: file description: File containing software versions diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test index cdd7398..961b5b4 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test @@ -3,6 +3,8 @@ nextflow_workflow { name "Test Workflow FASTQ_FASTQC_UMITOOLS_FASTP" script "../main.nf" workflow "FASTQ_FASTQC_UMITOOLS_FASTP" + config './nextflow.config' + tag "subworkflows" tag "subworkflows_nfcore" tag "subworkflows/fastq_fastqc_umitools_fastp" @@ -17,22 +19,518 @@ nextflow_workflow { when { workflow { """ - input[0] = [ - [ id:'test', single_end:false ], // meta map - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), 
- file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) - ] - ] - input[1] = false // skip_fastqc - input[2] = false // with_umi - input[3] = false // skip_umi_extract - input[4] = 1 // umi_discard_read - input[5] = false // skip_trimming - input[6] = [] // adapter_fasta - input[7] = false // save_trimmed_fail - input[8] = false // save_merged - input[9] = 1 // min_trimmed_reads + skip_fastqc = false + with_umi = false + skip_umi_extract = false + umi_discard_read = 1 + skip_trimming = false + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + min_trimmed_reads = 1 + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = save_merged + input[9] = min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.reads, + workflow.out.umi_log, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert workflow.out.fastqc_trim_html }, + { assert workflow.out.fastqc_trim_zip } + ) + } + } + + test("skip_fastqc") { + + when { + workflow { + """ + skip_fastqc = true + with_umi = false + skip_umi_extract = false + umi_discard_read = 1 + skip_trimming = false + adapter_fasta = [] + save_trimmed_fail = false 
+ save_merged = false + min_trimmed_reads = 1 + + input[0] = Channel.of([ + [ id:'test', single_end: false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = save_merged + input[9] = min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.reads, + workflow.out.umi_log, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, + + { assert !workflow.out.fastqc_raw_html }, + { assert !workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert !workflow.out.fastqc_trim_html }, + { assert !workflow.out.fastqc_trim_zip } + ) + } + } + + test("with_umi") { + + when { + workflow { + """ + skip_fastqc = false + with_umi = true + skip_umi_extract = false + umi_discard_read = 1 + skip_trimming = false + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + min_trimmed_reads = 1 + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = 
save_merged + input[9] = min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.reads, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert workflow.out.fastqc_trim_html }, + { assert workflow.out.fastqc_trim_zip } + ) + } + } + + + test("skip_umi_extract") { + + when { + workflow { + """ + skip_fastqc = false + with_umi = true + skip_umi_extract = true + umi_discard_read = 1 + skip_trimming = false + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + min_trimmed_reads = 1 + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = save_merged + input[9] = min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.reads, + workflow.out.umi_log, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert workflow.out.fastqc_trim_html }, + { assert 
workflow.out.fastqc_trim_zip } + ) + } + } + + test("umi_discard_read = 2") { + + when { + workflow { + """ + skip_fastqc = false + with_umi = true + skip_umi_extract = true + umi_discard_read = 2 + skip_trimming = false + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + min_trimmed_reads = 1 + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = save_merged + input[9] = min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.reads, + workflow.out.umi_log, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert workflow.out.fastqc_trim_html }, + { assert workflow.out.fastqc_trim_zip } + ) + } + } + + test("skip_trimming") { + + when { + workflow { + """ + skip_fastqc = false + with_umi = false + skip_umi_extract = false + umi_discard_read = 1 + skip_trimming = true + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + min_trimmed_reads = 1 + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = save_merged + input[9] = min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.reads.get(0).get(0), // Reads meta map + // Because the input file is passed to the output file, we have to do check the filename only + file(workflow.out.reads.get(0).get(1).get(0)).name, + file(workflow.out.reads.get(0).get(1).get(1)).name, + workflow.out.umi_log, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert !workflow.out.trim_html }, + { assert !workflow.out.trim_log }, + { assert !workflow.out.fastqc_trim_html }, + { assert !workflow.out.fastqc_trim_zip } + ) + } + } + + test("save_trimmed_fail") { + + config './nextflow.save_trimmed.config' + + when { + workflow { + """ + skip_fastqc = false + with_umi = false + skip_umi_extract = false + umi_discard_read = 1 + skip_trimming = false + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + min_trimmed_reads = 1 + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = 
save_merged + input[9] = min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.reads, + workflow.out.umi_log, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert workflow.out.fastqc_trim_html }, + { assert workflow.out.fastqc_trim_zip } + ) + } + } + + test("save_merged") { + + when { + workflow { + """ + skip_fastqc = false + with_umi = false + skip_umi_extract = false + umi_discard_read = 1 + skip_trimming = false + adapter_fasta = [] + save_trimmed_fail = false + save_merged = true + min_trimmed_reads = 1 + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = save_merged + input[9] = min_trimmed_reads + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.reads, + workflow.out.umi_log, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, + + { assert workflow.out.fastqc_raw_html }, + { assert workflow.out.fastqc_raw_zip }, + { assert workflow.out.trim_html }, + { assert workflow.out.trim_log }, + { assert workflow.out.fastqc_trim_html }, + 
{ assert workflow.out.fastqc_trim_zip } + ) + } + } + + test("min_trimmed_reads = 26") { + // Subworkflow should stop after FASTP which trims down to 25 reads + + when { + workflow { + """ + skip_fastqc = false + with_umi = false + skip_umi_extract = false + umi_discard_read = 1 + skip_trimming = false + adapter_fasta = [] + save_trimmed_fail = false + save_merged = true + min_trimmed_reads = 26 + + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = skip_fastqc + input[2] = with_umi + input[3] = skip_umi_extract + input[4] = umi_discard_read + input[5] = skip_trimming + input[6] = adapter_fasta + input[7] = save_trimmed_fail + input[8] = save_merged + input[9] = min_trimmed_reads """ } } @@ -40,13 +538,17 @@ nextflow_workflow { then { assertAll( { assert workflow.success }, - { assert snapshot(workflow.out.reads).match("reads") }, - { assert snapshot(workflow.out.umi_log).match("umi_log") }, - { assert snapshot(workflow.out.trim_json).match("trim_json") }, - { assert snapshot(workflow.out.trim_reads_fail).match("trim_reads_fail") }, - { assert snapshot(workflow.out.trim_reads_merged).match("trim_reads_merged") }, - { assert snapshot(workflow.out.trim_read_count).match("trim_read_count") }, - { assert snapshot(workflow.out.versions).match("versions") }, + { assert snapshot( + workflow.out.reads, + workflow.out.umi_log, + workflow.out.trim_json, + workflow.out.trim_reads_fail, + workflow.out.trim_reads_merged, + workflow.out.adapter_seq, + workflow.out.trim_read_count, + workflow.out.versions + ).match() + }, { assert workflow.out.fastqc_raw_html }, { assert workflow.out.fastqc_raw_zip }, diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap 
b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap index 38a65ae..3e11d9e 100644 --- a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/main.nf.test.snap @@ -1,32 +1,215 @@ { - "trim_reads_merged": { + "skip_fastqc": { "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastp.fastq.gz:md5,67b2bbae47f073e05a97a9c2edce23c7", + "test_2.fastp.fastq.gz:md5,25cbdca08e2083dbd4f0502de6b62f39" + ] + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastp.json:md5,1e0f8e27e71728e2b63fc64086be95cd" + ] + ], [ + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "unspecified" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 198 + ] + ], + [ + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad" ] ], - "timestamp": "2023-11-26T02:28:26.26920982" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T16:53:49.315194" }, - "trim_reads_fail": { + "save_trimmed_fail": { "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastp.fastq.gz:md5,6ff32a64c5188b9a9192be1398c262c7", + "test_2.fastp.fastq.gz:md5,db0cb7c9977e94ac2b4b446ebd017a8a" + ] + ] + ], [ + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastp.json:md5,4c3268ddb50ea5b33125984776aa3519" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.paired.fail.fastq.gz:md5,409b687c734cedd7a1fec14d316e1366", + "test_1.fail.fastq.gz:md5,4f273cf3159c13f79e8ffae12f5661f6", + "test_2.fail.fastq.gz:md5,f97b9edefb5649aab661fbc9e71fc995" + ] + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "unspecified" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 162 + ] + ], + [ + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", + "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", + 
"versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" ] ], - "timestamp": "2023-11-26T02:28:26.25861515" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T16:51:45.34934" }, - "versions": { + "skip_umi_extract": { "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastp.fastq.gz:md5,67b2bbae47f073e05a97a9c2edce23c7", + "test_2.fastp.fastq.gz:md5,25cbdca08e2083dbd4f0502de6b62f39" + ] + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastp.json:md5,1e0f8e27e71728e2b63fc64086be95cd" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "unspecified" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 198 + ] + ], [ "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" ] ], - "timestamp": "2023-11-26T02:28:26.30891403" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T12:07:40.34249" }, - "trim_json": { + "umi_discard_read = 2": { "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastp.fastq.gz:md5,67b2bbae47f073e05a97a9c2edce23c7", + "test_2.fastp.fastq.gz:md5,25cbdca08e2083dbd4f0502de6b62f39" + ] + ] + ], + [ + + ], [ [ { @@ -35,11 +218,44 @@ }, "test.fastp.json:md5,1e0f8e27e71728e2b63fc64086be95cd" ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "unspecified" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 198 + ] + ], + [ + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", + "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", + "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" ] ], - "timestamp": "2023-11-26T02:28:26.24768259" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T12:08:24.141938" }, - "reads": { + "save_merged": { "content": [ [ [ 
@@ -48,24 +264,142 @@ "single_end": false }, [ - "test_1.fastp.fastq.gz:md5,67b2bbae47f073e05a97a9c2edce23c7", - "test_2.fastp.fastq.gz:md5,25cbdca08e2083dbd4f0502de6b62f39" + "test_1.fastp.fastq.gz:md5,54b726a55e992a869fd3fa778afe1672", + "test_2.fastp.fastq.gz:md5,29d3b33b869f7b63417b8ff07bb128ba" ] ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastp.json:md5,b712fd68ed0322f4bec49ff2a5237fcc" + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.merged.fastq.gz:md5,c873bb1ab3fa859dcc47306465e749d5" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "unspecified" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 75 + ] + ], + [ + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", + "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", + "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" ] ], - "timestamp": "2023-12-04T11:30:32.061644815" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T12:10:18.546963" }, - "umi_log": { + "skip_trimming": { "content": [ + { + "id": "test", + "single_end": false + }, + "test_1.fastq.gz", + "test_2.fastq.gz", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], [ + ], + [ + "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" ] ], - "timestamp": "2023-11-26T02:28:26.238536" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T15:49:26.574759" }, - "trim_read_count": { + "sarscov2 paired-end [fastq]": { "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastp.fastq.gz:md5,67b2bbae47f073e05a97a9c2edce23c7", + "test_2.fastp.fastq.gz:md5,25cbdca08e2083dbd4f0502de6b62f39" + ] + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastp.json:md5,1e0f8e27e71728e2b63fc64086be95cd" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "unspecified" + ] + ], [ [ { @@ -74,8 
+408,142 @@ }, 198 ] + ], + [ + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", + "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", + "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T16:53:39.139038" + }, + "min_trimmed_reads = 26": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastp.fastq.gz:md5,54b726a55e992a869fd3fa778afe1672", + "test_2.fastp.fastq.gz:md5,29d3b33b869f7b63417b8ff07bb128ba" + ] + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fastp.json:md5,b712fd68ed0322f4bec49ff2a5237fcc" + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.merged.fastq.gz:md5,c873bb1ab3fa859dcc47306465e749d5" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "unspecified" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 75 + ] + ], + [ + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", + "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", + "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T11:52:23.849945" + }, + "with_umi": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.fastq.gz:md5,ba8c6c3a7ce718d9a2c5857e2edf53bc" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,d39c5c6d9a2e35fb60d26ced46569af6" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + 99 + ] + ], + [ + "versions.yml:md5,01f264f78de3c6d893c449cc6d3cd721", + "versions.yml:md5,85bd0117e5778fff18e3920972a296ad", + "versions.yml:md5,c50aa59475ab901bc6f9a2cf7b1a14e0", + "versions.yml:md5,f3dcaae948e8eed92b4a5557b4c6668e" ] ], - "timestamp": "2023-11-26T02:28:26.27984169" + "meta": { + "nf-test": 
"0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T17:31:09.193212" } } \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/nextflow.config b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/nextflow.config new file mode 100644 index 0000000..12f7b25 --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/nextflow.config @@ -0,0 +1,11 @@ +process { + + withName: UMITOOLS_EXTRACT { + ext.args = '--bc-pattern="NNNN" --bc-pattern2="NNNN"' + } + + withName: UMICOLLAPSE { + ext.prefix = { "${meta.id}.dedup" } + } + +} \ No newline at end of file diff --git a/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/nextflow.save_trimmed.config b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/nextflow.save_trimmed.config new file mode 100644 index 0000000..2430e9d --- /dev/null +++ b/subworkflows/nf-core/fastq_fastqc_umitools_fastp/tests/nextflow.save_trimmed.config @@ -0,0 +1,6 @@ +process { + // Make filtering more aggressive to make more reads fail + withName: FASTP { + ext.args = "-e 30" + } +} \ No newline at end of file diff --git a/subworkflows/pfr/fasta_edta_lai/main.nf b/subworkflows/pfr/fasta_edta_lai/main.nf index 2e73ca5..628e255 100644 --- a/subworkflows/pfr/fasta_edta_lai/main.nf +++ b/subworkflows/pfr/fasta_edta_lai/main.nf @@ -1,33 +1,45 @@ -include { CUSTOM_SHORTENFASTAIDS } from '../../../modules/pfr/custom/shortenfastaids' -include { EDTA_EDTA } from '../../../modules/pfr/edta/edta' -include { LAI } from '../../../modules/pfr/lai' -include { CUSTOM_RESTOREGFFIDS } from '../../../modules/pfr/custom/restoregffids' +include { CUSTOM_SHORTENFASTAIDS } from '../../../modules/pfr/custom/shortenfastaids/main' +include { EDTA_EDTA } from '../../../modules/pfr/edta/edta/main' +include { LTRRETRIEVER_LAI } from '../../../modules/pfr/ltrretriever/lai/main' +include { CUSTOM_RESTOREGFFIDS } from '../../../modules/pfr/custom/restoregffids/main' workflow FASTA_EDTA_LAI { take: - 
ch_fasta // channel: [ val(meta), fasta ] - ch_monoploid_seqs // channel: [ val(meta), txt ]; Optional: Set to [] if not needed - skip_lai // val; true|false + ch_fasta // channel: [ val(meta), fasta ] + ch_monoploid_seqs // channel: [ val(meta), txt ]; Optional: Set to [] if not needed + skip_lai // val; true|false main: - - ch_versions = Channel.empty() + ch_versions = Channel.empty() // MOUDLE: CUSTOM_SHORTENFASTAIDS CUSTOM_SHORTENFASTAIDS ( ch_fasta ) - ch_short_ids_fasta = ch_fasta - | join(CUSTOM_SHORTENFASTAIDS.out.short_ids_fasta, by:0, remainder:true) - | map { meta, fasta, short_ids_fasta -> - [ meta, short_ids_fasta ?: fasta ] - } + ch_short_ids_fasta = ch_fasta + | join(CUSTOM_SHORTENFASTAIDS.out.short_ids_fasta, by:0, remainder:true) + | map { meta, fasta, short_ids_fasta -> + if ( fasta ) { [ meta, short_ids_fasta ?: fasta ] } + } + + ch_short_ids_tsv = CUSTOM_SHORTENFASTAIDS.out.short_ids_tsv + ch_short_monoploid_seqs = ch_short_ids_tsv + | join( + ch_monoploid_seqs ?: Channel.empty() + ) + | map { meta, short_ids_tsv, monoploid_seqs -> + map_monoploid_seqs_to_new_ids(meta, short_ids_tsv, monoploid_seqs) + } + | collectFile(newLine:true) + | map { seqs -> + def id = seqs.name.split('.mapped.monoploid.seqs.txt')[0] - ch_short_ids_tsv = CUSTOM_SHORTENFASTAIDS.out.short_ids_tsv - ch_versions = ch_versions.mix(CUSTOM_SHORTENFASTAIDS.out.versions.first()) + [ [ id: id ], seqs ] + } + ch_versions = ch_versions.mix(CUSTOM_SHORTENFASTAIDS.out.versions.first()) // MODULE: EDTA_EDTA - EDTA_EDTA ( + EDTA_EDTA( ch_short_ids_fasta, [], [], @@ -35,54 +47,85 @@ workflow FASTA_EDTA_LAI { [] ) - ch_te_lib_fasta = EDTA_EDTA.out.te_lib_fasta - ch_pass_list = EDTA_EDTA.out.pass_list - ch_out_file = EDTA_EDTA.out.out_file - ch_te_anno_gff3 = EDTA_EDTA.out.te_anno_gff3 - ch_versions = ch_versions.mix(EDTA_EDTA.out.versions.first()) + ch_te_lib_fasta = EDTA_EDTA.out.te_lib_fasta + ch_pass_list = EDTA_EDTA.out.pass_list + ch_out_file = EDTA_EDTA.out.out_file + 
ch_te_anno_gff3 = EDTA_EDTA.out.te_anno_gff3 + ch_versions = ch_versions.mix(EDTA_EDTA.out.versions.first()) // MODULE: LAI - ch_lai_inputs = skip_lai - ? Channel.empty() - : ch_short_ids_fasta - | join(ch_pass_list) - | join(ch_out_file) - | join( - ch_monoploid_seqs ?: Channel.empty(), - by:0, - remainder: true - ) - | map { meta, fasta, pass, out, mono -> - [ meta, fasta, pass, out, mono ?: [] ] - } - LAI ( + ch_lai_inputs = skip_lai + ? Channel.empty() + : ch_short_ids_fasta + | join(ch_pass_list) + | join(ch_out_file) + | map { meta, fasta, pass, out -> + [ meta.id, meta, fasta, pass, out ] + } + | join( + ch_short_monoploid_seqs + | map { meta, mono -> [ meta.id, mono ] }, + by:0, + remainder: true + ) + | map { id, meta, fasta, pass, out, mono -> + [ meta, fasta, pass, out, mono ?: [] ] + } + LTRRETRIEVER_LAI( ch_lai_inputs.map { meta, fasta, pass, out, mono -> [ meta, fasta ] }, ch_lai_inputs.map { meta, fasta, pass, out, mono -> pass }, ch_lai_inputs.map { meta, fasta, pass, out, mono -> out }, ch_lai_inputs.map { meta, fasta, pass, out, mono -> mono } ) - ch_lai_log = LAI.out.log - ch_lai_out = LAI.out.lai_out - ch_versions = ch_versions.mix(LAI.out.versions.first()) + ch_lai_log = LTRRETRIEVER_LAI.out.log + ch_lai_out = LTRRETRIEVER_LAI.out.lai_out + ch_versions = ch_versions.mix(LTRRETRIEVER_LAI.out.versions.first()) // MODULE: CUSTOM_RESTOREGFFIDS - ch_restorable_gff_tsv = ch_te_anno_gff3.join(ch_short_ids_tsv) + ch_restorable_gff_tsv = ch_te_anno_gff3.join(ch_short_ids_tsv) CUSTOM_RESTOREGFFIDS ( ch_restorable_gff_tsv.map { meta, gff, tsv -> [ meta, gff ] }, ch_restorable_gff_tsv.map { meta, gff, tsv -> tsv } ) - ch_restored_gff = ch_te_anno_gff3 - | join(CUSTOM_RESTOREGFFIDS.out.restored_ids_gff3, by:0, remainder:true) - | map { meta, gff, restored_gff -> [ meta, restored_gff ?: gff ] } - ch_versions = ch_versions.mix(CUSTOM_RESTOREGFFIDS.out.versions.first()) + ch_restored_gff = ch_te_anno_gff3 + | join(CUSTOM_RESTOREGFFIDS.out.restored_ids_gff3, 
by:0, remainder:true) + | map { meta, gff, restored_gff -> [ meta, restored_gff ?: gff ] } + ch_versions = ch_versions.mix(CUSTOM_RESTOREGFFIDS.out.versions.first()) emit: - te_lib_fasta = ch_te_lib_fasta // channel: [ val(meta), fasta ] - te_anno_gff3 = ch_restored_gff // channel: [ val(meta), gff ] - lai_log = ch_lai_log // channel: [ val(meta), log ] - lai_out = ch_lai_out // channel: [ val(meta), out ] - versions = ch_versions // channel: [ versions.yml ] + te_lib_fasta = ch_te_lib_fasta // channel: [ val(meta), fasta ] + te_anno_gff3 = ch_restored_gff // channel: [ val(meta), gff ] + lai_log = ch_lai_log // channel: [ val(meta), log ] + lai_out = ch_lai_out // channel: [ val(meta), out ] + versions = ch_versions // channel: [ versions.yml ] +} + +def map_monoploid_seqs_to_new_ids(meta, short_ids_tsv, monoploid_seqs) { + + def short_ids_head = short_ids_tsv.text.split('\n')[0] + + if (short_ids_head == "IDs have acceptable length and character. No change required.") { + return [ "${meta.id}.mapped.monoploid.seqs.txt" ] + monoploid_seqs.text.split('\n') + } + + def orig_to_new_ids = [:] + short_ids_tsv.text.eachLine { line -> + def (original_id, renamed_id) = line.split('\t') + orig_to_new_ids[original_id] = renamed_id + } + + def mapped_ids = [] + monoploid_seqs.text.eachLine { original_id -> + if (!orig_to_new_ids[original_id]) { + error "Faild to find $original_id in ${monoploid_seqs}" + + "The monoploid_seqs file is malformed!" 
+ } + + mapped_ids.add(orig_to_new_ids[original_id]) + } + + return [ "${meta.id}.mapped.monoploid.seqs.txt" ] + mapped_ids } diff --git a/subworkflows/pfr/fasta_edta_lai/meta.yml b/subworkflows/pfr/fasta_edta_lai/meta.yml index 52483ce..c356ce7 100644 --- a/subworkflows/pfr/fasta_edta_lai/meta.yml +++ b/subworkflows/pfr/fasta_edta_lai/meta.yml @@ -11,10 +11,10 @@ keywords: - stats - qc components: + - edta/edta - custom/restoregffids + - ltrretriever/lai - custom/shortenfastaids - - edta/edta - - lai input: - ch_fasta: type: file diff --git a/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test index e852a70..2c6850d 100644 --- a/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test +++ b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test @@ -3,17 +3,60 @@ nextflow_workflow { name "Test Workflow FASTA_EDTA_LAI" script "../main.nf" workflow "FASTA_EDTA_LAI" + config "./nextflow.config" tag "subworkflows" tag "subworkflows_nfcore" tag "subworkflows/fasta_edta_lai" tag "fasta_edta_lai" - tag "lai" + tag "modules/nf-core/gunzip" + tag "custom/shortenfastaids" tag "edta/edta" + tag "ltrretriever/lai" tag "custom/restoregffids" - tag "custom/shortenfastaids" - test("test_data") { + test("actinidia_chinensis-genome_21_fasta_gz") { + + setup { + run("GUNZIP") { + script "../../../../modules/nf-core/gunzip" + + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['actinidia_chinensis']['genome']['genome_21_fasta_gz'], checkIfExists: true) + ] + """ + } + } + } + + when { + workflow { + """ + input[0] = GUNZIP.out.gunzip + input[1] = [] + input[2] = false + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert file(workflow.out.te_anno_gff3[0][1]).text.contains('Copia_LTR_retrotransposon') }, + { assert file(workflow.out.lai_log[0][1]).text.contains('Calculate LAI:') }, + { assert file(workflow.out.lai_log[0][1]).text.contains('Done!') }, + { assert 
Math.abs(Float.parseFloat(path(workflow.out.lai_out[0][1]).text.split("\n")[1].split("\t")[6]) - 31.29) <= 1.0 }, + { assert file(workflow.out.te_lib_fasta[0][1]).text.contains('#LTR/Copia') } + ) + } + } + + test("actinidia_chinensis-genome_21_fasta_gz-stub") { + + options '-stub' setup { run("GUNZIP") { @@ -23,7 +66,7 @@ nextflow_workflow { """ input[0] = [ [ id:'test' ], - file('/Users/hrauxr/Projects/nxf-modules/tests/data/genome.fasta.gz', checkIfExists: true) + file(params.test_data['actinidia_chinensis']['genome']['genome_21_fasta_gz'], checkIfExists: true) ] """ } @@ -43,8 +86,8 @@ nextflow_workflow { then { assertAll( { assert workflow.success }, - { assert snapshot(workflow.out.versions).match("versions") } + { assert snapshot(workflow.out).match() } ) } } -} +} \ No newline at end of file diff --git a/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test.snap b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test.snap index 574acc9..2ab7da2 100644 --- a/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test.snap +++ b/subworkflows/pfr/fasta_edta_lai/tests/main.nf.test.snap @@ -1,11 +1,87 @@ { - "versions": { + "actinidia_chinensis-genome_21_fasta_gz-stub": { "content": [ - [ - "versions.yml:md5,0d4bc49e94acb8995ca552d4e666e3ce", - "versions.yml:md5,754bb19f86be761d90c002a0af2faf1c" - ] + { + "0": [ + [ + { + "id": "test" + }, + "test.EDTA.TElib.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.EDTA.TEanno.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.LAI.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.LAI.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,0d4bc49e94acb8995ca552d4e666e3ce", + "versions.yml:md5,65666e975bdfd71978843ca963e84d0c", + "versions.yml:md5,754bb19f86be761d90c002a0af2faf1c" + ], + "lai_log": [ + [ + { + "id": "test" + }, + 
"test.LAI.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "lai_out": [ + [ + { + "id": "test" + }, + "test.LAI.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "te_anno_gff3": [ + [ + { + "id": "test" + }, + "test.EDTA.TEanno.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "te_lib_fasta": [ + [ + { + "id": "test" + }, + "test.EDTA.TElib.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,0d4bc49e94acb8995ca552d4e666e3ce", + "versions.yml:md5,65666e975bdfd71978843ca963e84d0c", + "versions.yml:md5,754bb19f86be761d90c002a0af2faf1c" + ] + } ], - "timestamp": "2023-12-22T14:09:24.171934" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-20T18:05:46.667121" } } \ No newline at end of file diff --git a/subworkflows/pfr/fasta_edta_lai/tests/nextflow.config b/subworkflows/pfr/fasta_edta_lai/tests/nextflow.config new file mode 100644 index 0000000..1fa6315 --- /dev/null +++ b/subworkflows/pfr/fasta_edta_lai/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: EDTA_EDTA { + ext.args = '--anno 1' + } +} diff --git a/tests/stub/assemblysheet.csv b/tests/stub/assemblysheet.csv new file mode 100644 index 0000000..cfa0cdb --- /dev/null +++ b/tests/stub/assemblysheet.csv @@ -0,0 +1,3 @@ +tag,fasta,is_masked,te_lib,braker_gff3,braker_hints +red5_v2p1,tests/stub/target/red5_v2p1_chr1.fasta.gz,no,,tests/stub/braker/red5_v2p1.gff3.gz,tests/stub/braker/red5_v2p1.hints.gff.gz +donghong,tests/stub/target/donghong.chr1.fsa.gz,no,tests/stub/te_lib/donghong.TElib.fa.gz,tests/stub/braker/red5_v2p1.gff3.gz,tests/stub/braker/red5_v2p1.hints.gff.gz diff --git a/tests/stub/braker/donghong.gff3.gz b/tests/stub/braker/donghong.gff3.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/braker/donghong.hints.gff.gz b/tests/stub/braker/donghong.hints.gff.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/braker/red5_v2p1.gff3.gz b/tests/stub/braker/red5_v2p1.gff3.gz new 
file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/braker/red5_v2p1.hints.gff.gz b/tests/stub/braker/red5_v2p1.hints.gff.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/emapperdb/5.0.2/eggnog.db b/tests/stub/emapperdb/5.0.2/eggnog.db new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/emapperdb/5.0.2/eggnog_proteins.dmnd b/tests/stub/emapperdb/5.0.2/eggnog_proteins.dmnd new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/ext_prot/RU01.20221115150135.chr1.pep.fasta.gz b/tests/stub/ext_prot/RU01.20221115150135.chr1.pep.fasta.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/ext_prot/RU01.20221115150135.chr2.pep.fasta.gz b/tests/stub/ext_prot/RU01.20221115150135.chr2.pep.fasta.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/external-protein-fastas.txt b/tests/stub/external-protein-fastas.txt new file mode 100644 index 0000000..d510c4c --- /dev/null +++ b/tests/stub/external-protein-fastas.txt @@ -0,0 +1,2 @@ +tests/stub/ext_prot/RU01.20221115150135.chr1.pep.fasta.gz +tests/stub/ext_prot/RU01.20221115150135.chr2.pep.fasta.gz diff --git a/tests/stub/fastqsheet.csv b/tests/stub/fastqsheet.csv new file mode 100644 index 0000000..0b7e223 --- /dev/null +++ b/tests/stub/fastqsheet.csv @@ -0,0 +1,4 @@ +sample,fastq_1,fastq_2,target_assemblies +Root1,tests/stub/fq/1505KHS-0090_Root1_162bp_C728RACXX_Lane1_R1.1k.fastq.gz,tests/stub/fq/1505KHS-0090_Root1_162bp_C728RACXX_Lane1_R2.1k.fastq.gz,red5_v2p1 +Root1,tests/stub/fq/1505KHS-0090_Root2_156bp_C728RACXX_Lane1_R1.1k.fastq.gz,tests/stub/fq/1505KHS-0090_Root2_156bp_C728RACXX_Lane1_R2.1k.fastq.gz,red5_v2p1 +cane3,tests/stub/fq/1505KHS-0090_cane3_165bp_C728RACXX_Lane1_R1.1k.fastq.gz,tests/stub/fq/1505KHS-0090_cane3_165bp_C728RACXX_Lane1_R2.1k.fastq.gz,red5_v2p1;donghong diff --git a/tests/stub/fq/1505KHS-0090_Root1_162bp_C728RACXX_Lane1_R1.1k.fastq.gz b/tests/stub/fq/1505KHS-0090_Root1_162bp_C728RACXX_Lane1_R1.1k.fastq.gz 
new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/fq/1505KHS-0090_Root1_162bp_C728RACXX_Lane1_R2.1k.fastq.gz b/tests/stub/fq/1505KHS-0090_Root1_162bp_C728RACXX_Lane1_R2.1k.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/fq/1505KHS-0090_Root2_156bp_C728RACXX_Lane1_R1.1k.fastq.gz b/tests/stub/fq/1505KHS-0090_Root2_156bp_C728RACXX_Lane1_R1.1k.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/fq/1505KHS-0090_Root2_156bp_C728RACXX_Lane1_R2.1k.fastq.gz b/tests/stub/fq/1505KHS-0090_Root2_156bp_C728RACXX_Lane1_R2.1k.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/fq/1505KHS-0090_cane3_165bp_C728RACXX_Lane1_R1.1k.fastq.gz b/tests/stub/fq/1505KHS-0090_cane3_165bp_C728RACXX_Lane1_R1.1k.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/fq/1505KHS-0090_cane3_165bp_C728RACXX_Lane1_R2.1k.fastq.gz b/tests/stub/fq/1505KHS-0090_cane3_165bp_C728RACXX_Lane1_R2.1k.fastq.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/liftoff/RU01.20221115150135.chr1.gff3.gz b/tests/stub/liftoff/RU01.20221115150135.chr1.gff3.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/liftoff/RU01.20221115150135.chr2.gff3.gz b/tests/stub/liftoff/RU01.20221115150135.chr2.gff3.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/liftoff/Russell_V2a.chr1.fsa.gz b/tests/stub/liftoff/Russell_V2a.chr1.fsa.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/liftoff/Russell_V2a.chr2.fsa.gz b/tests/stub/liftoff/Russell_V2a.chr2.fsa.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/liftoffannotations.csv b/tests/stub/liftoffannotations.csv new file mode 100644 index 0000000..5215d2c --- /dev/null +++ b/tests/stub/liftoffannotations.csv @@ -0,0 +1,3 @@ +fasta,gff3 +tests/stub/liftoff/Russell_V2a.chr1.fsa.gz,tests/stub/liftoff/RU01.20221115150135.chr1.gff3.gz 
+tests/stub/liftoff/Russell_V2a.chr2.fsa.gz,tests/stub/liftoff/RU01.20221115150135.chr2.gff3.gz diff --git a/tests/stub/params.json b/tests/stub/params.json new file mode 100644 index 0000000..db97d3b --- /dev/null +++ b/tests/stub/params.json @@ -0,0 +1,10 @@ +{ + "input": "tests/stub/assemblysheet.csv", + "external_protein_fastas": "tests/stub/external-protein-fastas.txt", + "eggnogmapper_db_dir": "tests/stub/emapperdb/5.0.2", + "eggnogmapper_tax_scope": 33090, + "fastq": "tests/stub/fastqsheet.csv", + "liftoff_annotations": "tests/stub/liftoffannotations.csv", + "max_cpus": 2, + "max_memory": "3.GB" +} diff --git a/tests/stub/target/donghong.chr1.fsa.gz b/tests/stub/target/donghong.chr1.fsa.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/target/red5_v2p1_chr1.fasta.gz b/tests/stub/target/red5_v2p1_chr1.fasta.gz new file mode 100644 index 0000000..e69de29 diff --git a/tests/stub/te_lib/donghong.TElib.fa.gz b/tests/stub/te_lib/donghong.TElib.fa.gz new file mode 100644 index 0000000..e69de29 diff --git a/version_check.sh b/version_check.sh new file mode 100755 index 0000000..131d97c --- /dev/null +++ b/version_check.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +config_version=$(sed -n "s/.*version.*= '\(.*\)'.*/\1/p" nextflow.config) + +# Check CHANGELOG version + +grep "## $config_version - " CHANGELOG.md >/dev/null \ + || (echo 'Failed to match CHANGELOG version'; exit 1) diff --git a/workflows/pangene.nf b/workflows/pangene.nf index 8512ff9..c0f3c32 100644 --- a/workflows/pangene.nf +++ b/workflows/pangene.nf @@ -1,36 +1,102 @@ -include { validateParams } from '../modules/local/validate_params' -include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' -include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' -include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' -include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' -include { BRAKER3 } from 
'../modules/kherronism/braker3' -include { FASTA_LIFTOFF } from '../subworkflows/local/fasta_liftoff' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions' +include { fromSamplesheet; paramsSummaryLog } from 'plugin/nf-validation' +include { idFromFileName; validateFastqMetadata } from '../modules/local/utils' +include { PREPARE_ASSEMBLY } from '../subworkflows/local/prepare_assembly' +include { PREPROCESS_RNASEQ } from '../subworkflows/local/preprocess_rnaseq' +include { ALIGN_RNASEQ } from '../subworkflows/local/align_rnaseq' +include { PREPARE_EXT_PROTS } from '../subworkflows/local/prepare_ext_prots' +include { FASTA_BRAKER3 } from '../subworkflows/local/fasta_braker3' +include { FASTA_LIFTOFF } from '../subworkflows/local/fasta_liftoff' +include { PURGE_BREAKER_MODELS } from '../subworkflows/local/purge_breaker_models' +include { GFF_MERGE_CLEANUP } from '../subworkflows/local/gff_merge_cleanup' +include { GFF_EGGNOGMAPPER } from '../subworkflows/local/gff_eggnogmapper' +include { PURGE_NOHIT_MODELS } from '../subworkflows/local/purge_nohit_models' +include { GFF_STORE } from '../subworkflows/local/gff_store' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions' -validateParams(params) +log.info paramsSummaryLog(workflow) workflow PANGENE { + // Versions channel ch_versions = Channel.empty() - ch_target_assembly = Channel.fromList(params.target_assemblies) - | map { tag, filePath -> - [ [ id: tag ], file(filePath, checkIfExists: true) ] + // Input channels + ch_input = Channel.fromSamplesheet('input') + + ch_target_assembly = ch_input + | map { it -> + def tag = it[0] + def fasta = it[1] + + [ [ id: tag ], file(fasta, checkIfExists: true) ] + } + + ch_tar_assm_str = ch_input + | map { it -> + def tag = it[0].strip() + + tag } + | collect + | map { it -> + it.join(",") + } + + ch_masked = ch_input + | map { it -> + def tag = it[0] + def is_masked = it[2] - ch_te_library = 
Channel.fromList(params.te_libraries) - | map { tag, filePath -> - [ [ id:tag ], file(filePath, checkIfExists: true) ] + [ [ id: tag ], is_masked == "yes" ] } - ch_samplesheet = params.samplesheet - ? Channel.fromPath(params.samplesheet, checkIfExists: true) - : Channel.empty() + ch_te_library = ch_input + | map { it -> + def tag = it[0] + def te_fasta = it[3] - ch_tar_assm_str = Channel.of( - params.target_assemblies - .collect { tag, fastaPath -> tag.strip() }.join(",") - ) + if ( te_fasta ) { + [ [ id:tag ], file(te_fasta, checkIfExists: true) ] + } + } + + ch_braker_annotation = ch_input + | map { it -> + def tag = it[0] + def braker_gff3 = it[4] + def hints_gff = it[5] + + if ( braker_gff3 ) { + [ + [ id: tag ], + file(braker_gff3, checkIfExists: true), + file(hints_gff, checkIfExists: true) + ] + } + } + + ch_braker_ex_asm_str = ch_braker_annotation + | map { meta, braker_gff3, hints_gff -> meta.id } + | collect + | map { it.join(",") } + | ifEmpty( "" ) + + ch_reads = ! params.fastq + ? Channel.empty() + : Channel.fromSamplesheet('fastq') + | map { meta, fq1, fq2 -> + fq2 + ? [ meta + [ single_end: false ], [ file(fq1, checkIfExists:true), file(fq2, checkIfExists:true) ] ] + : [ meta + [ single_end: true ], [ file(fq1, checkIfExists:true) ] ] + } + | map { meta, fqs -> + [ meta.id, meta + [ target_assemblies: meta.target_assemblies.split(';').sort() ], fqs ] + } + | groupTuple + | combine(ch_tar_assm_str) + | map { id, metas, fqs, tar_assm_str -> + validateFastqMetadata(metas, fqs, tar_assm_str) + } ch_ribo_db = params.remove_ribo_rna ? file(params.ribo_database_manifest, checkIfExists: true) @@ -42,32 +108,43 @@ workflow PANGENE { | collect : Channel.empty() - ch_ext_prot_fastas = params.external_protein_fastas - ? Channel.fromList(params.external_protein_fastas) - | map { filePath -> - def fileHandle = file(filePath, checkIfExists: true) - [ [ id: fileHandle.getSimpleName() ], fileHandle] + ch_ext_prot_fastas = ! params.external_protein_fastas + ? 
Channel.empty() + : Channel.fromPath(params.external_protein_fastas) + | splitText + | map { file_path -> + def file_handle = file(file_path.strip(), checkIfExists: true) + [ [ id: idFromFileName( file_handle.baseName ) ], file_handle ] } - : Channel.empty() - ch_xref_mm = params.liftoff_xref_annotations - ? Channel.fromList(params.liftoff_xref_annotations) + ch_liftoff_mm = ! params.liftoff_annotations + ? Channel.empty() + : Channel.fromSamplesheet('liftoff_annotations') | multiMap { fasta, gff -> def fastaFile = file(fasta, checkIfExists:true) - fasta: [ [ id: fastaFile.getSimpleName() ], fastaFile ] - gff: [ [ id: fastaFile.getSimpleName() ], file(gff, checkIfExists:true) ] + fasta: [ [ id: idFromFileName( fastaFile.baseName ) ], fastaFile ] + gff: [ [ id: idFromFileName( fastaFile.baseName ) ], file(gff, checkIfExists:true) ] } + + ch_liftoff_fasta = params.liftoff_annotations + ? ch_liftoff_mm.fasta : Channel.empty() - ch_xref_fasta = ch_xref_mm.fasta - ch_xref_gff = ch_xref_mm.gff + ch_liftoff_gff = params.liftoff_annotations + ? ch_liftoff_mm.gff + : Channel.empty() + + val_tsebra_config = params.braker_allow_isoforms + ? 
"${projectDir}/assets/tsebra-default.cfg" + : "${projectDir}/assets/tsebra-1form.cfg" // SUBWORKFLOW: PREPARE_ASSEMBLY PREPARE_ASSEMBLY( ch_target_assembly, ch_te_library, - params.repeat_annotator + params.repeat_annotator, + ch_braker_ex_asm_str ) ch_valid_target_assembly = PREPARE_ASSEMBLY.out.target_assemby @@ -77,8 +154,9 @@ workflow PANGENE { // SUBWORKFLOW: PREPROCESS_RNASEQ PREPROCESS_RNASEQ( - ch_samplesheet, + ch_reads, ch_tar_assm_str, + ch_braker_ex_asm_str, params.skip_fastqc, params.skip_fastp, params.save_trimmed, @@ -95,7 +173,7 @@ workflow PANGENE { ALIGN_RNASEQ( ch_reads_target, ch_trim_reads, - ch_target_assemby_index + ch_target_assemby_index, ) ch_rnaseq_bam = ALIGN_RNASEQ.out.bam @@ -109,40 +187,77 @@ workflow PANGENE { ch_ext_prots_fasta = PREPARE_EXT_PROTS.out.ext_prots_fasta ch_versions = ch_versions.mix(PREPARE_EXT_PROTS.out.versions) - // MODULE: BRAKER3 - ch_braker_inputs = ch_masked_target_assembly - | join(ch_rnaseq_bam, remainder: true) - | combine( - ch_ext_prots_fasta.map { meta, filePath -> filePath }.ifEmpty(null) - ) - | map { meta, fasta, bam, prots -> [ meta, fasta, bam ?: [], prots ?: [] ] } - - def rnaseq_sets_dirs = [] - def rnaseq_sets_ids = [] - def hintsfile = [] - - BRAKER3( - ch_braker_inputs.map { meta, fasta, bam, prots -> [meta, fasta] }, - ch_braker_inputs.map { meta, fasta, bam, prots -> bam }, - rnaseq_sets_dirs, - rnaseq_sets_ids, - ch_braker_inputs.map { meta, fasta, bam, prots -> prots }, - hintsfile + // SUBWORKFLOW: FASTA_BRAKER3 + FASTA_BRAKER3( + ch_masked_target_assembly, + ch_braker_ex_asm_str, + ch_rnaseq_bam, + ch_ext_prots_fasta, + ch_braker_annotation ) - ch_braker_gff3 = BRAKER3.out.gff3 - ch_versions = ch_versions.mix(BRAKER3.out.versions.first()) + ch_braker_gff3 = FASTA_BRAKER3.out.braker_gff3 + ch_braker_hints = FASTA_BRAKER3.out.braker_hints + ch_versions = ch_versions.mix(FASTA_BRAKER3.out.versions) // SUBWORKFLOW: FASTA_LIFTOFF FASTA_LIFTOFF( ch_valid_target_assembly, - ch_xref_fasta, - 
ch_xref_gff + ch_liftoff_fasta, + ch_liftoff_gff ) ch_liftoff_gff3 = FASTA_LIFTOFF.out.gff3 ch_versions = ch_versions.mix(FASTA_LIFTOFF.out.versions) + // SUBWORKFLOW: PURGE_BREAKER_MODELS + PURGE_BREAKER_MODELS( + ch_braker_gff3, + ch_braker_hints, + ch_liftoff_gff3, + val_tsebra_config, + params.braker_allow_isoforms + ) + + ch_braker_purged_gff = PURGE_BREAKER_MODELS.out.braker_purged_gff + ch_versions = ch_versions.mix(PURGE_BREAKER_MODELS.out.versions) + + // SUBWORKFLOW: GFF_MERGE_CLEANUP + GFF_MERGE_CLEANUP( + ch_braker_purged_gff, + ch_liftoff_gff3 + ) + + ch_merged_gff = GFF_MERGE_CLEANUP.out.gff + ch_versions = ch_versions.mix(GFF_MERGE_CLEANUP.out.versions) + + // SUBWORKFLOW: GFF_EGGNOGMAPPER + GFF_EGGNOGMAPPER( + ch_merged_gff, + ch_valid_target_assembly, + params.eggnogmapper_db_dir, + ) + + ch_eggnogmapper_hits = GFF_EGGNOGMAPPER.out.eggnogmapper_hits + ch_eggnogmapper_annotations = GFF_EGGNOGMAPPER.out.eggnogmapper_annotations + ch_versions = ch_versions.mix(GFF_EGGNOGMAPPER.out.versions) + + // SUBWORKFLOW: PURGE_NOHIT_MODELS + PURGE_NOHIT_MODELS( + ch_merged_gff, + ch_eggnogmapper_hits, + params.eggnogmapper_purge_nohits + ) + + ch_purged_gff = PURGE_NOHIT_MODELS.out.purged_gff + ch_versions = ch_versions.mix(PURGE_NOHIT_MODELS.out.versions) + + // SUBWORKFLOW: GFF_STORE + GFF_STORE( + ch_purged_gff, + ch_eggnogmapper_annotations + ) + // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml')