From e444256cd4482a9b87fdba1bccabb90bf96bbd37 Mon Sep 17 00:00:00 2001
From: Dan Fornika
Date: Thu, 8 Feb 2024 17:53:16 -0800
Subject: [PATCH] Work on checking outputs

---
 .github/data/pipeline-provenance.json      | 264 ++++++++++++++++++
 .github/environments/check-outputs.yml     |   9 +
 .github/scripts/check_outputs.py           |  59 +++-
 .github/scripts/check_outputs.sh           |  14 +
 .../create_output_checking_environment.sh  |   3 +
 .github/workflows/pull_request.yml         |   4 +-
 .gitignore                                 |   4 +
 7 files changed, 355 insertions(+), 2 deletions(-)
 create mode 100644 .github/data/pipeline-provenance.json
 create mode 100644 .github/environments/check-outputs.yml
 create mode 100644 .github/scripts/check_outputs.sh
 create mode 100755 .github/scripts/create_output_checking_environment.sh

diff --git a/.github/data/pipeline-provenance.json b/.github/data/pipeline-provenance.json
new file mode 100644
index 0000000..e9b9eba
--- /dev/null
+++ b/.github/data/pipeline-provenance.json
@@ -0,0 +1,264 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "type": "array",
+    "items": {
+        "anyOf": [
+            {
+                "$ref": "#/definitions/PipelineProvenanceRecord",
+                "description": "A record of the pipeline that was run."
+            },
+            {
+                "$ref": "#/definitions/ProcessProvenanceRecord",
+                "description": "A record of a process that was run."
+            },
+            {
+                "$ref": "#/definitions/InputFileProvenanceRecord",
+                "description": "A record of an input file that was used."
+            }
+        ]
+    },
+    "definitions": {
+        "PipelineProvenanceRecord": {
+            "title": "PipelineProvenanceRecord",
+            "type": "object",
+            "properties": {
+                "pipeline_name": {
+                    "type": "string",
+                    "description": "Name of the pipeline.",
+                    "examples": [
+                        "BCCDC-PHL/routine-assembly",
+                        "BCCDC-PHL/plasmid-screen"
+                    ]
+                },
+                "pipeline_version": {
+                    "type": "string",
+                    "description": "Version of the pipeline.",
+                    "examples": [
+                        "v0.1.0",
+                        "0.1.0",
+                        "1",
+                        "2.0-beta"
+                    ]
+                },
+                "timestamp_analysis_start": {
+                    "type": "string",
+                    "description": "Timestamp for the start of a pipeline run. ISO-8601-formatted date, followed by 'T' and a 24-hour timestamp, assumed to be in the local timezone if not specified. Timezone may be specified with an offset from UTC. Timestamp precision is not guaranteed.",
+                    "format": "date-time",
+                    "examples": [
+                        "2021-12-06T16:12:31.252055",
+                        "2022-01-12T01:22:51-08:00",
+                        "2022-02-04T16:55:03.182-08:00"
+                    ]
+                }
+            },
+            "required": [
+                "pipeline_name",
+                "pipeline_version"
+            ]
+        },
+        "ProcessProvenanceRecord": {
+            "title": "ProcessProvenanceRecord",
+            "type": "object",
+            "properties": {
+                "process_name": {
+                    "type": "string",
+                    "description": "Name of the process.",
+                    "examples": [
+                        "fastp",
+                        "bwa_mem",
+                        "samtools_mpileup",
+                        "align_reads_to_ref",
+                        "trim_reads",
+                        "CALL_VARIANTS"
+                    ]
+                },
+                "tools": {
+                    "type": "array",
+                    "description": "The tools used to run the process.",
+                    "items": {
+                        "$ref": "#/definitions/Tool"
+                    },
+                    "examples": [
+                        [
+                            {
+                                "tool_name": "fastp",
+                                "tool_version": "0.20.0",
+                                "subcommand": "trim",
+                                "parameters": [
+                                    {
+                                        "parameter": "cut_tail",
+                                        "value": null
+                                    }
+                                ]
+                            }
+                        ],
+                        [
+                            {
+                                "tool_name": "bwa",
+                                "tool_version": "0.7.17-r1188",
+                                "subcommand": "mem",
+                                "parameters": [
+                                    {
+                                        "parameter": "exclude_flags",
+                                        "value": 1540
+                                    },
+                                    {
+                                        "parameter": "min_base_quality",
+                                        "value": 20
+                                    }
+                                ]
+                            }
+                        ]
+                    ]
+                }
+            },
+            "required": [
+                "process_name"
+            ]
+        },
+        "Tool": {
+            "title": "Tool",
+            "type": "object",
+            "properties": {
+                "tool_name": {
+                    "type": "string",
+                    "description": "Name of the tool.",
+                    "examples": [
+                        "fastp",
+                        "bwa",
+                        "samtools",
+                        "bcftools",
+                        "medaka"
+                    ]
+                },
+                "tool_version": {
+                    "type": "string",
+                    "description": "A number or string associated with a specific snapshot of the development state of a tool. Should (but may not always) map to a tagged release on GitHub or another version control system.",
+                    "examples": [
+                        "0.1.0",
+                        "v0.1.1",
+                        "1.1",
+                        "2",
+                        "0.7.17-r1188"
+                    ]
+                },
+                "subcommand": {
+                    "type": "string",
+                    "description": "Subcommand of the tool.",
+                    "examples": [
+                        "mem",
+                        "mpileup",
+                        "filter"
+                    ]
+                },
+                "parameters": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/ToolParameter"
+                    },
+                    "description": "The specific invocation of a process may depend on values that can be varied. These values are the parameters to that process. Each parameter has a name and a value.",
+                    "examples": [
+                        [
+                            {
+                                "parameter": "cut_tail",
+                                "value": null
+                            }
+                        ],
+                        [
+                            {
+                                "parameter": "exclude_flags",
+                                "value": 1540
+                            }
+                        ],
+                        [
+                            {
+                                "parameter": "min_base_quality",
+                                "value": 20
+                            }
+                        ],
+                        [
+                            {
+                                "parameter": "min_coverage",
+                                "value": 10
+                            }
+                        ]
+                    ]
+                }
+            }
+        },
+        "ToolParameter": {
+            "title": "ToolParameter",
+            "type": "object",
+            "properties": {
+                "parameter": {
+                    "type": "string",
+                    "description": "Name of the parameter.",
+                    "examples": [
+                        "cut_tail",
+                        "exclude_flags",
+                        "min_base_quality",
+                        "min_coverage"
+                    ]
+                },
+                "value": {
+                    "type": "string",
+                    "description": "Value of the parameter, or null if the parameter is a flag without a value.",
+                    "examples": [
+                        "null",
+                        "1540",
+                        "20",
+                        "10"
+                    ]
+                }
+            },
+            "required": [
+                "parameter"
+            ]
+        },
+        "InputFileProvenanceRecord": {
+            "title": "InputFileProvenanceRecord",
+            "type": "object",
+            "properties": {
+                "input_filename": {
+                    "type": "string",
+                    "description": "Filename of an input file.",
+                    "examples": [
+                        "sample-01_R1.fastq.gz",
+                        "sample-01_R2.fastq.gz",
+                        "ref.fa",
+                        "sample-01.bam",
+                        "sample-01.bam.bai"
+                    ]
+                },
+                "input_path": {
+                    "type": "string",
+                    "description": "Absolute path to an input file, at the time that the pipeline was invoked. May be invalid if the input file is moved or renamed after pipeline invocation.",
+                    "examples": [
+                        "/data/ref_data/ecoli.fa",
+                        "/data/sequence/miseq/210101_M00123_0123_000000000-ABC123/Data/Intensities/BaseCalls/sample-01_S1_L001_R1_001.fastq.gz",
+                        "/data/sequence/miseq/210101_M00123_0123_000000000-ABC123/Data/Intensities/BaseCalls/sample-01_S1_L001_R2_001.fastq.gz"
+                    ]
+                },
+                "sha256": {
+                    "type": "string",
+                    "description": "The checksum of a file, calculated with the SHA256 algorithm. Files with identical contents have identical checksums. If a single byte differs, the checksums will be completely different.",
+                    "examples": [
+                        "b0534592d61321243897e842a9ea655d396d4496cbf6d926b6c6fea8e06aa98d",
+                        "cc66309103da91e337143eb649196d84ed3ebe2ff08a45b197cd4151d137a167"
+                    ]
+                },
+                "file_size": {
+                    "type": "integer",
+                    "description": "Size of the file in bytes.",
+                    "examples": [
+                        123456789,
+                        1234567890
+                    ]
+                }
+            },
+            "required": [
+                "input_filename"
+            ]
+        }
+    }
+}
diff --git a/.github/environments/check-outputs.yml b/.github/environments/check-outputs.yml
new file mode 100644
index 0000000..414072e
--- /dev/null
+++ b/.github/environments/check-outputs.yml
@@ -0,0 +1,9 @@
+name: check-outputs
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python=3
+  - jsonschema=4.20.0
+  - pyyaml=6.0.1
diff --git a/.github/scripts/check_outputs.py b/.github/scripts/check_outputs.py
index 6d7ae03..7f88d6b 100755
--- a/.github/scripts/check_outputs.py
+++ b/.github/scripts/check_outputs.py
@@ -1,12 +1,69 @@
 #!/usr/bin/env python3
 
 import argparse
+import csv
+import glob
+import json
+import urllib.request
+
+from jsonschema import validate
+import yaml
+
+
+def check_provenance_format_valid(provenance_files, schema):
+    """
+    Check that the provenance files are valid according to the schema.
+    """
+    for provenance_file in provenance_files:
+        with open(provenance_file) as f:
+            try:
+                provenance = yaml.load(f, Loader=yaml.BaseLoader)
+                validate(provenance, schema)
+            except Exception as e:
+                return False
+
+    return True
+
 
 def main(args):
-    exit(-1)
+    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
+    provenance_schema_path = ".github/data/pipeline-provenance.json"
+    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)
+
+    provenance_schema = None
+    with open(provenance_schema_path) as f:
+        provenance_schema = json.load(f)
+
+    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
+    provenance_files = glob.glob(provenance_files_glob, recursive=True)
+
+    tests = [
+        {
+            "test_name": "provenance_format_valid",
+            "test_result": check_provenance_format_valid(provenance_files, provenance_schema),
+        }
+    ]
+
+    output_fields = [
+        "test_name",
+        "test_result"
+    ]
+
+    output_path = args.output
+    with open(output_path, 'w') as f:
+        writer = csv.DictWriter(f, fieldnames=output_fields)
+        writer.writeheader()
+        for test in tests:
+            if test["test_result"]:
+                test["test_result"] = "PASS"
+            else:
+                test["test_result"] = "FAIL"
+            writer.writerow(test)
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Check outputs')
     parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
+    parser.add_argument('-o', '--output', type=str, help='Path to the output file')
     args = parser.parse_args()
     main(args)
diff --git a/.github/scripts/check_outputs.sh b/.github/scripts/check_outputs.sh
new file mode 100644
index 0000000..1f0127b
--- /dev/null
+++ b/.github/scripts/check_outputs.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+set -eo pipefail
+
+source ${HOME}/.bashrc
+
+eval "$(conda shell.bash hook)"
+
+conda activate check-outputs
+
+
+.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
+
+! grep -q 'FAIL' artifacts/check_outputs_results.csv
diff --git a/.github/scripts/create_output_checking_environment.sh b/.github/scripts/create_output_checking_environment.sh
new file mode 100755
index 0000000..0f9a4a4
--- /dev/null
+++ b/.github/scripts/create_output_checking_environment.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+conda env create -f .github/environments/check-outputs.yml
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 89cb7f9..57e73dd 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -34,8 +34,10 @@ jobs:
         run: bash .github/scripts/simulate_reads.sh
       - name: Run Pipeline
         run: bash .github/scripts/run_pipeline.sh
+      - name: Create Output Checking Environment
+        run: bash .github/scripts/create_output_checking_environment.sh
       - name: Check Outputs
-        run: .github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output
+        run: bash .github/scripts/check_outputs.sh
       - name: Prepare Artifacts
         if: always()
         run: bash .github/scripts/prepare_artifacts.sh
diff --git a/.gitignore b/.gitignore
index 6f952b0..aeaec71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,9 @@
 *~
 .nextflow*
+.github/data/assemblies
+.github/data/fastq
+.github/data/test_output
+artifacts
 work
 test_input
 test_output
\ No newline at end of file
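
Reviewer note: below is a minimal sketch of a *_provenance.yml file that the new provenance_format_valid check should accept, assuming the pipeline writes one such file per sample somewhere under the --pipeline-outdir tree so that it matches the **/*_provenance.yml glob. All names, versions, timestamps, and checksums here are illustrative only, not taken from a real run:

    # sample-01_provenance.yml (hypothetical example)
    - pipeline_name: BCCDC-PHL/routine-assembly
      pipeline_version: "0.1.0"
      timestamp_analysis_start: "2024-02-08T17:53:16-08:00"
    - input_filename: sample-01_R1.fastq.gz
      input_path: /data/fastq/sample-01_R1.fastq.gz
      sha256: b0534592d61321243897e842a9ea655d396d4496cbf6d926b6c6fea8e06aa98d
    - process_name: fastp
      tools:
        - tool_name: fastp
          tool_version: "0.20.0"
          parameters:
            - parameter: cut_tail
              value: "null"

Because check_outputs.py loads these files with yaml.BaseLoader, every scalar is read as a string, which lines up with the string-typed "value" field in the schema, so numeric-looking values such as "0.20.0" or "1540" validate without special handling. The script also assumes the artifacts/ directory exists when it writes artifacts/check_outputs_results.csv; if no earlier workflow step creates it, a mkdir -p artifacts in check_outputs.sh may be needed.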