Fix testing (#19)
* Make alignment cleaning optional

* updates

* Update test workflow

* re-working testing pipeline

* fix workflow structure

* update nextflow installation script

* updates

* update

* add conda enabled to config for compatibility

* add missing script

* updates

* Collect outputs when running pipeline

* fix sample_id in test
dfornika authored May 31, 2024
1 parent 1f8805a commit fc4684e
Showing 34 changed files with 286 additions and 1,129 deletions.
14 binary files changed (contents not shown).
1 change: 1 addition & 0 deletions .github/data/reads_to_simulate.csv
@@ -0,0 +1 @@
NC000962.3,.github/data/refs/NC_000962.3.fa
500 changes: 0 additions & 500 deletions .github/data/refs/MN908947.3/MN908947.3.fa

This file was deleted.

1 change: 0 additions & 1 deletion .github/data/refs/MN908947.3/MN908947.3.fa.fai

This file was deleted.

500 changes: 0 additions & 500 deletions .github/data/refs/MN908947.3_with_snps/MN908947.3.fa

This file was deleted.

7 changes: 7 additions & 0 deletions .github/environments/art.yml
@@ -0,0 +1,7 @@
name: art
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- art=2016.06.05
9 changes: 9 additions & 0 deletions .github/environments/check-outputs.yml
@@ -0,0 +1,9 @@
name: check-outputs
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3
- jsonschema=4.20.0
- pyyaml=6.0.1
116 changes: 116 additions & 0 deletions .github/scripts/check_outputs.py
@@ -0,0 +1,116 @@
#!/usr/bin/env python3

import argparse
import csv
import glob
import json
import os
import urllib.request

from jsonschema import validate
import yaml


def check_provenance_format_valid(provenance_files, schema):
"""
Check that the provenance files are valid according to the schema.
"""
for provenance_file in provenance_files:
with open(provenance_file) as f:
try:
provenance = yaml.load(f, Loader=yaml.BaseLoader)
validate(provenance, schema)
            except Exception as e:
                print(f"Error validating {provenance_file}: {e}")
                return False

return True

def check_expected_files_exist(output_dir, sample_ids):
"""
Check that the expected files exist in the output directory.
:param output_dir: Path to the output directory
:param sample_ids: List of sample IDs
:return: True if all expected files exist, False otherwise
:rtype: bool
"""
for sample_id in sample_ids:
expected_files = [
f"{sample_id}/{sample_id}_fastp.csv",
f"{sample_id}/{sample_id}_fastp.json",
f"{sample_id}/{sample_id}_short.bam",
f"{sample_id}/{sample_id}_short.bam.bai",
f"{sample_id}/{sample_id}_short_combined_alignment_qc.csv",
f"{sample_id}/{sample_id}_short_depths.tsv",
f"{sample_id}/{sample_id}_short_freebayes.vcf",
f"{sample_id}/{sample_id}_short_low_coverage_regions.bed",
]

for expected_file in expected_files:
expected_file_path = os.path.join(output_dir, expected_file)
if not os.path.exists(expected_file_path):
print(f"Expected file {expected_file_path} not found")
return False

return True


def main(args):

    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
provenance_schema_path = ".github/data/pipeline-provenance.json"
urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)

provenance_schema = None
with open(provenance_schema_path) as f:
provenance_schema = json.load(f)

    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
    provenance_files = glob.glob(provenance_files_glob, recursive=True)

sample_ids = [os.path.basename(provenance_file).split("_")[0] for provenance_file in provenance_files]

# TODO: Add more tests
tests = [
{
"test_name": "provenance_format_valid",
"test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
},
{
"test_name": "all_expected_files_exist",
"test_passed": check_expected_files_exist(args.pipeline_outdir, sample_ids),
},
]

output_fields = [
"test_name",
"test_result"
]

output_path = args.output
with open(output_path, 'w') as f:
writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
writer.writeheader()
for test in tests:
if test["test_passed"]:
test["test_result"] = "PASS"
else:
test["test_result"] = "FAIL"
writer.writerow(test)

for test in tests:
if not test['test_passed']:
exit(1)


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Check outputs')
parser.add_argument('--pipeline-outdir', type=str, help='Path to the pipeline output directory')
parser.add_argument('-o', '--output', type=str, help='Path to the output file')
args = parser.parse_args()
main(args)
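
For reference, the results CSV written by this script should look like the following when both checks pass (a sketch inferred from the field names above, not a captured artifact):

    test_name,test_result
    provenance_format_valid,PASS
    all_expected_files_exist,PASS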
12 changes: 12 additions & 0 deletions .github/scripts/check_outputs.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

set -e -o pipefail

source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate check-outputs


.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
3 changes: 3 additions & 0 deletions .github/scripts/create_art_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/art.yml
3 changes: 3 additions & 0 deletions .github/scripts/create_output_checking_environment.sh
@@ -0,0 +1,3 @@
#!/bin/bash

conda env create -f .github/environments/check-outputs.yml
6 changes: 6 additions & 0 deletions .github/scripts/download_refs.sh
@@ -0,0 +1,6 @@
#!/bin/bash

mkdir -p .github/data/refs

curl -o .github/data/refs/NC_000962.3.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_000962.3&db=nucleotide&rettype=fasta"
curl -o .github/data/refs/NC_002973.6.fa "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?retmode=text&id=NC_002973.6&db=nucleotide&rettype=fasta"
11 changes: 5 additions & 6 deletions .github/scripts/install_nextflow.sh
@@ -1,12 +1,11 @@
 #!/bin/bash
-set -eo pipefail
 
-echo Install Nextflow .. >> artifacts/test_artifact.log
+set -eo pipefail
 
-wget -qO- https://get.nextflow.io | bash
+artifacts_dir="artifacts"
 
-mkdir -p /opt/nextflow/bin
+echo Install Nextflow .. >> ${artifacts_dir}/test.log
 
-mv nextflow /opt/nextflow/bin
+wget -qO- https://get.nextflow.io | bash
 
-echo "export PATH=/opt/nextflow/bin:$PATH" >> ~/.bashrc
+sudo mv nextflow /usr/local/bin/
13 changes: 13 additions & 0 deletions .github/scripts/prepare_artifacts.sh
@@ -0,0 +1,13 @@
#!/bin/bash

artifacts_dir="artifacts"

echo "Prepare artifacts .." >> ${artifacts_dir}/test.log

mkdir -p ${artifacts_dir}/fastq

mv .github/data/fastq/*.fastq.gz ${artifacts_dir}/fastq

mkdir -p ${artifacts_dir}/pipeline_outputs

mv .github/data/test_output/* ${artifacts_dir}/pipeline_outputs
21 changes: 21 additions & 0 deletions .github/scripts/run_pipeline.sh
@@ -0,0 +1,21 @@
#!/bin/bash

set -eo pipefail

sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
sed -i 's/cpus = 12/cpus = 4/g' nextflow.config
sed -i 's/cpus = 16/cpus = 4/g' nextflow.config
sed -i 's/cpus = 24/cpus = 4/g' nextflow.config
sed -i 's/cpus = 24/cpus = 4/g' nextflow.config
sed -i "s/memory = '36G'/memory = '2G'/g" nextflow.config

nextflow run main.nf \
-profile conda \
--cache ${HOME}/.conda/envs \
--fastq_input .github/data/fastq \
--outdir .github/data/test_output \
--min_depth 5 \
--ref .github/data/refs/NC_000962.3.fa \
--collect_outputs \
-with-report .github/data/test_output/nextflow_report.html \
-with-trace .github/data/test_output/nextflow_trace.tsv
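
The sed edits above downscale CPU and memory requests so the test run fits on a CI runner. As an illustration only (the repository's nextflow.config is not part of this diff, and the process selector below is hypothetical), they rewrite directives such as:

    process {
        withName: 'align_short' {   // hypothetical process selector
            cpus = 16               // rewritten to: cpus = 4
            memory = '36G'          // rewritten to: memory = '2G'
        }
    }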
35 changes: 35 additions & 0 deletions .github/scripts/simulate_reads.sh
@@ -0,0 +1,35 @@
#!/bin/bash


source ${HOME}/.bashrc

eval "$(conda shell.bash hook)"

conda activate art

mkdir -p .github/data/fastq

while IFS=',' read -r sample_id assembly; do
art_illumina \
--paired \
--in ${assembly} \
--fcov 12 \
--len 150 \
--mflen 400 \
--sdev 100 \
--rndSeed 42 \
--qShift 0 \
--qShift2 0 \
--out .github/data/fastq/${sample_id}_R

rm -f .github/data/fastq/${sample_id}_R1.aln
rm -f .github/data/fastq/${sample_id}_R2.aln

mv .github/data/fastq/${sample_id}_R1.fq .github/data/fastq/${sample_id}_R1.fastq
mv .github/data/fastq/${sample_id}_R2.fq .github/data/fastq/${sample_id}_R2.fastq

gzip -f .github/data/fastq/${sample_id}_R1.fastq
gzip -f .github/data/fastq/${sample_id}_R2.fastq

done < .github/data/reads_to_simulate.csv
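
Given the single entry in .github/data/reads_to_simulate.csv, this loop should leave two gzipped read files for the pipeline run that follows (expected layout, not captured output):

    .github/data/fastq/NC000962.3_R1.fastq.gz
    .github/data/fastq/NC000962.3_R2.fastq.gz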

87 changes: 0 additions & 87 deletions .github/scripts/test_against_previous_release.sh

This file was deleted.

28 changes: 0 additions & 28 deletions .github/workflows/pull_request.yml

This file was deleted.
