From 0720aecc32aa7e3c49d0f8fb49e8ee691d9e390e Mon Sep 17 00:00:00 2001 From: John Chilton Date: Fri, 27 Oct 2017 09:45:56 -0400 Subject: [PATCH] Enhance cwltoil to support SoftwareRequirements & BioContainers. This enables the reproducibility stack described in [this preprint](https://www.biorxiv.org/content/early/2017/10/11/200683) and [presented at BOSC 2017](http://jmchilton.github.io/writing/bosc2017slides/biocontainers.html) under Toil. Concretely this enables all the same options in cwltoil as added to cwltool in common-workflow-language/cwltool#214 including ``--beta-conda-dependencies``, ``--beta-dependency-resolvers-configuration``, and ``--beta-use-biocontainers``. The first two of these are documented in depth in cwltool's README (https://github.com/common-workflow-language/cwltool/#leveraging-softwarerequirements-beta). Here I will quickly review a couple of the available options against test examples available in cwltool's ``tests`` directory using this branch of Toil. ``` git clone https://github.com/common-workflow-language/cwltool.git cd cwltool ``` From here we can quickly demonstrate installation and resolution of CWL ``SoftwareRequirement`` hints with Conda using the tests/seqtk_seq.cwl tool. This tool doesn't define an explicit ``DockerRequirement`` but does define the following ``SoftwareRequirement`` in its ``hints``: ``` hints: SoftwareRequirement: packages: - package: seqtk version: - r93 ``` We can try this tool out with ``cwltoil`` and see that by default we probably don't have the binary seqtk on our ``PATH`` and so the tool fails using the following command: ``` cwltoil tests/seqtk_seq.cwl tests/seqtk_seq_job.json ``` This should result in a tool execution failure. 
We can then instruct ``cwltoil`` to install the required package from Bioconda into an isolated environment and use it as needed by passing it the ``--beta-conda-dependencies`` flag as follows: ``` cwltoil --beta-conda-dependencies tests/seqtk_seq.cwl tests/seqtk_seq_job.json ``` The tool should now be successful. The Conda support can be endlessly tweaked but the defaults target the best practice Conda channels that work well for the Galaxy project. Additional ``SoftwareRequirement`` resolution options are available including targeting Software Modules, lmod, Homebrew, and simple scripts called "Galaxy packages". All of these options can be specified and configured with a YAML file passed to cwltoil using the ``--beta-dependency-resolvers-configuration`` option instead of the simple shortcut ``--beta-conda-dependencies``. The cwltool documentation walks through a few examples of adapting infrastructure to tools and tools to package managers. Reference documentation is available in [galaxy-lib's documentation](http://galaxy-lib.readthedocs.io/en/latest/topics/dependency_resolution.html). In addition to options that allow configuring tool execution environments, containers themselves can be discovered and/or built from these software requirements. The [Biocontainers](https://github.com/BioContainers) project (previously Biodocker) contains a registry we use for this purpose. Every version of every Bioconda package has a corresponding best-practice (very lightweight, very small) Docker container on quay.io. There are over 3000 such containers currently. Continuing with the example above, the new `--beta-use-biocontainers` flag instructs ``cwltoil`` to fetch the corresponding Biocontainers container from quay.io automatically or build one to use locally (required for instance for tools with multiple software requirements - fat tools). 
``` cwltoil --beta-use-biocontainers tests/seqtk_seq.cwl tests/seqtk_seq_job.json ``` These containers contain the same binaries that the package would use locally (outside of Docker). Therefore this technique allows cross platform reproducibility/remixability across cwltool, cwltoil, Galaxy, and CLI - both inside and outside of containers. --- setup.py | 1 + src/toil/cwl/cwltoil.py | 48 ++++++++++++++++++++++------ src/toil/test/cwl/2.fasta | 11 +++++++ src/toil/test/cwl/2.fastq | 12 +++++++ src/toil/test/cwl/cwlTest.py | 46 ++++++++++++++++++++------ src/toil/test/cwl/seqtk_seq.cwl | 24 ++++++++++++++ src/toil/test/cwl/seqtk_seq_job.json | 6 ++++ 7 files changed, 130 insertions(+), 18 deletions(-) create mode 100644 src/toil/test/cwl/2.fasta create mode 100644 src/toil/test/cwl/2.fastq create mode 100644 src/toil/test/cwl/seqtk_seq.cwl create mode 100644 src/toil/test/cwl/seqtk_seq_job.json diff --git a/setup.py b/setup.py index afe82c873d..fd9e0ab25c 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,7 @@ def runSetup(): 'cwl': [ 'cwltool==1.0.20170822192924', 'schema-salad >= 2.6, < 3', + 'galaxy-lib==17.9.3', 'cwltest>=1.0.20170214185319']}, package_dir={'': 'src'}, packages=find_packages(where='src', diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index fcf395fa4f..554747cdde 100755 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -40,6 +40,7 @@ import cwltool.draft2tool from cwltool.pathmapper import PathMapper, adjustDirObjs, adjustFileObjs, get_listing, MapperEnt, visit_class, normalizeFilesDirs from cwltool.process import shortname, fillInDefaults, compute_checksums, collectFilesAndDirs, stageFiles +from cwltool.software_requirements import DependenciesConfiguration, get_container_from_software_requirements from cwltool.utils import aslist import schema_salad.validate as validate import schema_salad.ref_resolver @@ -831,6 +832,14 @@ def main(args=None, stdout=sys.stdout): metavar=("VAR1 VAR2"), default=("PATH",), 
dest="preserve_environment") + # help="Dependency resolver configuration file describing how to adapt 'SoftwareRequirement' packages to current system." + parser.add_argument("--beta-dependency-resolvers-configuration", default=None) + # help="Defaut root directory used by dependency resolvers configuration." + parser.add_argument("--beta-dependencies-directory", default=None) + # help="Use biocontainers for tools without an explicitly annotated Docker container." + parser.add_argument("--beta-use-biocontainers", default=None, action="store_true") + # help="Short cut to use Conda to resolve 'SoftwareRequirement' packages." + parser.add_argument("--beta-conda-dependencies", default=None, action="store_true") # mkdtemp actually creates the directory, but # toil requires that the directory not exist, @@ -853,22 +862,32 @@ def main(args=None, stdout=sys.stdout): outdir = os.path.abspath(options.outdir) fileindex = {} existing = {} + make_tool_kwargs = {} + conf_file = getattr(options, "beta_dependency_resolvers_configuration", None) # Text + use_conda_dependencies = getattr(options, "beta_conda_dependencies", None) # Text + job_script_provider = None + if conf_file or use_conda_dependencies: + dependencies_configuration = DependenciesConfiguration(options) # type: DependenciesConfiguration + job_script_provider = dependencies_configuration + + options.default_container = None + make_tool_kwargs["find_default_container"] = functools.partial(find_default_container, options) with Toil(options) as toil: if options.restart: outobj = toil.restart() else: useStrict = not options.not_strict + make_tool_kwargs["hints"] = [{ + "class": "ResourceRequirement", + "coresMin": toil.config.defaultCores, + "ramMin": toil.config.defaultMemory / (2**20), + "outdirMin": toil.config.defaultDisk / (2**20), + "tmpdirMin": 0 + }] try: t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool, - kwargs={ - "hints": [{ - "class": "ResourceRequirement", - "coresMin": 
toil.config.defaultCores, - "ramMin": toil.config.defaultMemory / (2**20), - "outdirMin": toil.config.defaultDisk / (2**20), - "tmpdirMin": 0 - }]}, + kwargs=make_tool_kwargs, resolver=cwltool.resolver.tool_resolver, strict=useStrict) unsupportedRequirementsCheck(t.requirements) @@ -931,7 +950,8 @@ def setSecondary(fileobj): try: (wf1, wf2) = makeJob(t, {}, use_container=use_container, preserve_environment=options.preserve_environment, - tmpdir=os.path.realpath(outdir), workdir=options.workDir) + tmpdir=os.path.realpath(outdir), workdir=options.workDir, + job_script_provider=job_script_provider) except cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 @@ -948,3 +968,13 @@ def setSecondary(fileobj): stdout.write(json.dumps(outobj, indent=4)) return 0 + + +def find_default_container(args, builder): + default_container = None + if args.default_container: + default_container = args.default_container + elif args.beta_use_biocontainers: + default_container = get_container_from_software_requirements(args, builder) + + return default_container diff --git a/src/toil/test/cwl/2.fasta b/src/toil/test/cwl/2.fasta new file mode 100644 index 0000000000..3bfe7d3d3e --- /dev/null +++ b/src/toil/test/cwl/2.fasta @@ -0,0 +1,11 @@ +>Sequence 561 BP; 135 A; 106 C; 98 G; 222 T; 0 other; +gttcgatgcc taaaatacct tcttttgtcc ctacacagac cacagttttc ctaatggctt +tacaccgact agaaattctt gtgcaagcac taattgaaag cggttggcct agagtgttac +cggtttgtat agctgagcgc gtctcttgcc ctgatcaaag gttcattttc tctactttgg +aagacgttgt ggaagaatac aacaagtacg agtctctccc ccctggtttg ctgattactg +gatacagttg taataccctt cgcaacaccg cgtaactatc tatatgaatt attttccctt +tattatatgt agtaggttcg tctttaatct tcctttagca agtcttttac tgttttcgac +ctcaatgttc atgttcttag gttgttttgg ataatatgcg gtcagtttaa tcttcgttgt +ttcttcttaa aatatttatt catggtttaa tttttggttt gtacttgttc aggggccagt +tcattattta ctctgtttgt atacagcagt tcttttattt ttagtatgat tttaatttaa +aacaattcta atggtcaaaa a \ No newline at end of file diff --git 
a/src/toil/test/cwl/2.fastq b/src/toil/test/cwl/2.fastq new file mode 100644 index 0000000000..436c05ff2c --- /dev/null +++ b/src/toil/test/cwl/2.fastq @@ -0,0 +1,12 @@ +@EAS54_6_R1_2_1_413_324 +CCCTTCTTGTCTTCAGCGTTTCTCC ++ +;;3;;;;;;;;;;;;7;;;;;;;88 +@EAS54_6_R1_2_1_540_792 +TTGGCAGGCCAAGGCCGATGGATCA ++ +;;;;;;;;;;;7;;;;;-;;;3;83 +@EAS54_6_R1_2_1_443_348 +GTTGCTTCTGGCGTGGGTGGGGGGG ++EAS54_6_R1_2_1_443_348 +;;;;;;;;;;;9;7;;.7;393333 \ No newline at end of file diff --git a/src/toil/test/cwl/cwlTest.py b/src/toil/test/cwl/cwlTest.py index d813a95743..e4e302da60 100644 --- a/src/toil/test/cwl/cwlTest.py +++ b/src/toil/test/cwl/cwlTest.py @@ -27,22 +27,20 @@ from toil.test import ToilTest, needs_cwl, slow - @needs_cwl class CWLTest(ToilTest): - def _tester(self, cwlfile, jobfile, outDir, expect): + def _tester(self, cwlfile, jobfile, outDir, expect, main_args=[], out_name="output"): from toil.cwl import cwltoil rootDir = self._projectRootPath() st = StringIO() - cwltoil.main(['--outdir', outDir, - os.path.join(rootDir, cwlfile), - os.path.join(rootDir, jobfile)], - stdout=st) + main_args = main_args[:] + main_args.extend(['--outdir', outDir, os.path.join(rootDir, cwlfile), os.path.join(rootDir, jobfile)]) + cwltoil.main(main_args, stdout=st) out = json.loads(st.getvalue()) - out["output"].pop("http://commonwl.org/cwltool#generation", None) - out["output"].pop("nameext", None) - out["output"].pop("nameroot", None) + out[out_name].pop("http://commonwl.org/cwltool#generation", None) + out[out_name].pop("nameext", None) + out[out_name].pop("nameroot", None) self.assertEquals(out, expect) def _debug_worker_tester(self, cwlfile, jobfile, outDir, expect): @@ -148,3 +146,33 @@ def test_run_conformance(self): if not only_unsupported: print(e.output) raise e + + @slow + def test_bioconda(self): + outDir = self._createTempDir() + self._tester('src/toil/test/cwl/seqtk_seq.cwl', + 'src/toil/test/cwl/seqtk_seq_job.json', + outDir, + self._expected_seqtk_output(outDir), + 
main_args=["--beta-conda-dependencies"], + out_name="output1") + + def test_biocontainers(self): + outDir = self._createTempDir() + self._tester('src/toil/test/cwl/seqtk_seq.cwl', + 'src/toil/test/cwl/seqtk_seq_job.json', + outDir, + self._expected_seqtk_output(outDir), + main_args=["--beta-use-biocontainers"], + out_name="output1") + + def _expected_seqtk_output(self, outDir): + return { + u"output": { + u"location": "file://" + str(os.path.join(outDir, 'output.txt')), + u"checksum": u"sha1$322e001e5a99f19abdce9f02ad0f02a17b5066c2", + u"basename": str("out"), + u"location": u"file:///Users/john/workspace/toil/cwltool/out", + u"class": u"File", + u"size": 150} + } diff --git a/src/toil/test/cwl/seqtk_seq.cwl b/src/toil/test/cwl/seqtk_seq.cwl new file mode 100644 index 0000000000..b97d6c25e6 --- /dev/null +++ b/src/toil/test/cwl/seqtk_seq.cwl @@ -0,0 +1,24 @@ +cwlVersion: v1.0 +class: CommandLineTool +id: "seqtk_seq" +doc: "Convert to FASTA (seqtk)" +inputs: + - id: input1 + type: File + inputBinding: + position: 1 + prefix: "-a" +outputs: + - id: output1 + type: File + outputBinding: + glob: out +baseCommand: ["seqtk", "seq"] +arguments: [] +stdout: out +hints: + SoftwareRequirement: + packages: + - package: seqtk + version: + - r93 diff --git a/src/toil/test/cwl/seqtk_seq_job.json b/src/toil/test/cwl/seqtk_seq_job.json new file mode 100644 index 0000000000..79ea46c378 --- /dev/null +++ b/src/toil/test/cwl/seqtk_seq_job.json @@ -0,0 +1,6 @@ +{ + "input1": { + "class": "File", + "location": "2.fastq" + } +}