diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 7adb289..4a3799d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -13,7 +13,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.8.17' + python-version: '3.10.5' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 7deb7c4..9c82f36 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8.17] + python-version: [3.10.5, 3.11] steps: - uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index 32b0a13..3701049 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,38 @@ -cromwell-executions/ -cromwell-workflow-logs/ + +# built files __pycache__/ +build/ .idea/ .pyc -.env .DS_Store *.egg-info/ pip-wheel-metadata/ -.coverage -.mypy_cache/ dist/ -venv/ + +# env & IDE +.env .python-version +.mypy_cache/ +.vscode/ +venv/ + +# tests +.coverage +.nextflow/ +cromwell-executions/ +cromwell-workflow-logs/ +sample_data/ +local_data/ + +# translation files +.janis/ +output/ +translated*/ +testout/ +*.log +involucro + +# misc files +test.py +ubu* +temp.txt \ No newline at end of file diff --git a/configs/nextflow.config.BwaAlignment b/configs/nextflow.config.BwaAlignment new file mode 100644 index 0000000..fe17e18 --- /dev/null +++ b/configs/nextflow.config.BwaAlignment @@ -0,0 +1,97 @@ +docker.enabled = true + +params { + + // OUTPUT DIRECTORY + outdir = './outputs' + + // INPUTS + fastqs = [ + [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/NA12878-BRCA1_R1.fastq.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/NA12878-BRCA1_R2.fastq.gz' + ], + ] + + // FastaWithIndexes + reference = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.amb', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.ann', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.bwt', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.dict', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.fai', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.pac', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.sa', + ] + + // CompressedIndexedVCF + snps_dbsnp = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + snps_1000gp = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + known_indels = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz', + 
'/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + mills_indels = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz.tbi' + ] + + adapter_file = '/home/grace/work/pp/translation/janis-assistant/local_data/data/adapter_list.txt' + contaminant_file = '/home/grace/work/pp/translation/janis-assistant/local_data/data/contaminant_list.txt' + sample_name = 'NA12878-BRCA1' + allele_freq_threshold = 0.05 + min_mapping_qual = null + filter = null + align_and_sort_sortsam_tmp_dir = './tmp' + + // PROCESSES + getfastqc_adapters.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_BwaAlignmentAndQC/templates/ParseFastqcAdapters.py' + calculate_performancesummary_genomefile.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_BwaAlignmentAndQC/templates/GenerateGenomeFileForBedtoolsCoverage.py' + fastqc.cpus = 1 + fastqc.memory = 8 + + // SUBWORKFLOW: ALIGN_AND_SORT + align_and_sort.five_prime_adapter_read1 = [ + // list values here + ] + align_and_sort.five_prime_adapter_read2 = [ + // list values here + ] + align_and_sort.cutadapt_quality_cutoff = 15 + align_and_sort.cutadapt_minimum_length = 50 + align_and_sort.bwamem_mark_shorter_splits = true + align_and_sort.sortsam_sort_order = 'coordinate' + align_and_sort.sortsam_create_index = true + align_and_sort.sortsam_validation_stringency = 'SILENT' + align_and_sort.sortsam_max_records_in_ram = 5000000 + align_and_sort.cutadapt.cpus = 5 + align_and_sort.cutadapt.memory = 4 + align_and_sort.bwamem.cpus = 16 + align_and_sort.bwamem.memory = 16 + align_and_sort.sortsam.cpus = 1 + align_and_sort.sortsam.memory = 8 + + // SUBWORKFLOW: MERGE_AND_MARKDUPS + merge_and_markdups.create_index = true + merge_and_markdups.max_records_in_ram = 5000000 + merge_and_markdups.merge_sam_files_use_threading = true + merge_and_markdups.merge_sam_files_validation_stringency = 'SILENT' + merge_and_markdups.merge_sam_files.cpus = 4 + merge_and_markdups.merge_sam_files.memory = 8 + merge_and_markdups.mark_duplicates.cpus = 4 + merge_and_markdups.mark_duplicates.memory = 8 + +} diff --git a/configs/nextflow.config.BwaAlignmentAndQC b/configs/nextflow.config.BwaAlignmentAndQC new file mode 100644 index 0000000..093867c --- /dev/null +++ b/configs/nextflow.config.BwaAlignmentAndQC @@ -0,0 +1,104 @@ +docker.enabled = true + +params { + + // OUTPUT DIRECTORY + outdir = './outputs' + + // INPUTS + fastqs = [ + [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/NA12878-BRCA1_R1.fastq.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/NA12878-BRCA1_R2.fastq.gz' + ], + ] + + // FastaWithIndexes + reference = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.amb', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.ann', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.bwt', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.dict', + 
'/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.fai', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.pac', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.sa', + ] + + // CompressedIndexedVCF + snps_dbsnp = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + snps_1000gp = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + known_indels = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + mills_indels = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz.tbi' + ] + + adapter_file = '/home/grace/work/pp/translation/janis-assistant/local_data/data/adapter_list.txt' + contaminant_file = '/home/grace/work/pp/translation/janis-assistant/local_data/data/contaminant_list.txt' + sample_name = 'NA12878-BRCA1' + allele_freq_threshold = 0.05 + min_mapping_qual = null + filter = null + align_and_sort_sortsam_tmp_dir = './tmp' + + // PROCESSES + getfastqc_adapters.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_BwaAlignmentAndQC/templates/ParseFastqcAdapters.py' + calculate_performancesummary_genomefile.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_BwaAlignmentAndQC/templates/GenerateGenomeFileForBedtoolsCoverage.py' + fastqc.cpus = 1 + fastqc.memory = 8 + + // SUBWORKFLOW: ALIGN_AND_SORT + align_and_sort.five_prime_adapter_read1 = [ + // list values here + ] + align_and_sort.five_prime_adapter_read2 = [ + // list values here + ] + align_and_sort.cutadapt_quality_cutoff = 15 + align_and_sort.cutadapt_minimum_length = 50 + align_and_sort.bwamem_mark_shorter_splits = true + align_and_sort.sortsam_sort_order = 'coordinate' + align_and_sort.sortsam_create_index = true + align_and_sort.sortsam_validation_stringency = 'SILENT' + align_and_sort.sortsam_max_records_in_ram = 5000000 + align_and_sort.cutadapt.cpus = 5 + align_and_sort.cutadapt.memory = 4 + align_and_sort.bwamem.cpus = 16 + align_and_sort.bwamem.memory = 16 + align_and_sort.sortsam.cpus = 1 + align_and_sort.sortsam.memory = 8 + + // SUBWORKFLOW: MERGE_AND_MARKDUPS + merge_and_markdups.create_index = true + merge_and_markdups.max_records_in_ram = 5000000 + merge_and_markdups.merge_sam_files_use_threading = true + merge_and_markdups.merge_sam_files_validation_stringency = 'SILENT' + merge_and_markdups.merge_sam_files.cpus = 4 + merge_and_markdups.merge_sam_files.memory = 8 + merge_and_markdups.mark_duplicates.cpus = 4 + merge_and_markdups.mark_duplicates.memory = 8 + + // SUBWORKFLOW: PERFORMANCE_SUMMARY + performance_summary.samtoolsview_do_not_output_alignments_with_bits_set = '0x400' + 
performance_summary.performancesummary_genome = true + performance_summary.gatk4collectinsertsizemetrics.cpus = 1 + performance_summary.gatk4collectinsertsizemetrics.memory = 8 + performance_summary.bedtoolsgenomecoveragebed.memory = 8 + +} diff --git a/configs/nextflow.config.wgsgermline b/configs/nextflow.config.wgsgermline new file mode 100644 index 0000000..c68e7a5 --- /dev/null +++ b/configs/nextflow.config.wgsgermline @@ -0,0 +1,182 @@ +docker.enabled = true + +params { + + // OUTPUT DIRECTORY + outdir = './outputs' + + // INPUTS + fastqs = [ + [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/NA12878-BRCA1_R1.fastq.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/NA12878-BRCA1_R2.fastq.gz' + ], + ] + + // FastaWithIndexes + reference = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.amb', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.ann', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.bwt', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.dict', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.fai', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.pac', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.chr17.fasta.sa', + ] + + // CompressedIndexedVCF + snps_dbsnp = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + snps_1000gp = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + known_indels = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + mills_indels = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz.tbi' + ] + + adapter_file = '/home/grace/work/pp/translation/janis-assistant/local_data/data/adapter_list.txt' + contaminant_file = '/home/grace/work/pp/translation/janis-assistant/local_data/data/contaminant_list.txt' + + gatk_intervals = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/BRCA1.hg38.bed' + ] + vardict_intervals = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/BRCA1.hg38.split-intervals.bed' + ] + // BedTABIX + strelka_intervals = [ + '/home/grace/work/pp/translation/janis-assistant/local_data/data/BRCA1.hg38.bed.gz', + '/home/grace/work/pp/translation/janis-assistant/local_data/data/BRCA1.hg38.bed.gz.tbi' + ] + + gridss_blacklist = 
'/home/grace/work/pp/translation/janis-assistant/local_data/data/consensusBlacklist.hg38.chr17.bed' + sample_name = 'NA12878-BRCA1' + allele_freq_threshold = 0.05 + min_mapping_qual = null + filter = null + align_and_sort_sortsam_tmp_dir = './tmp' + combine_variants_type = 'germline' + combine_variants_columns = ['AC', 'AN', 'AF', 'AD', 'DP', 'GT'] + + // PROCESSES + getfastqc_adapters.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/ParseFastqcAdapters.py' + calculate_performancesummary_genomefile.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/GenerateGenomeFileForBedtoolsCoverage.py' + generate_gatk_intervals.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/GenerateIntervalsByChromosome.py' + generate_manta_config.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/GenerateMantaConfig.py' + generate_vardict_headerlines.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/GenerateVardictHeaderLines.py' + fastqc.cpus = 1 + fastqc.memory = 8 + vc_gridss.cpus = 8 + vc_gridss.memory = 31 + vc_gatk_sort_combined.cpus = 1 + vc_gatk_sort_combined.memory = 8 + vc_vardict_sort_combined.cpus = 1 + vc_vardict_sort_combined.memory = 8 + combine_variants.memory = 8 + combined_sort.cpus = 1 + combined_sort.memory = 8 + + // SUBWORKFLOW: ALIGN_AND_SORT + align_and_sort.five_prime_adapter_read1 = [] // list values here + align_and_sort.five_prime_adapter_read2 = [] // list values here + align_and_sort.cutadapt_quality_cutoff = 15 + align_and_sort.cutadapt_minimum_length = 50 + align_and_sort.bwamem_mark_shorter_splits = true + align_and_sort.sortsam_sort_order = 'coordinate' + align_and_sort.sortsam_create_index = true + align_and_sort.sortsam_validation_stringency = 'SILENT' + align_and_sort.sortsam_max_records_in_ram = 5000000 + align_and_sort.cutadapt.cpus = 5 + align_and_sort.cutadapt.memory = 4 + align_and_sort.bwamem.cpus = 16 + align_and_sort.bwamem.memory = 16 + align_and_sort.sortsam.cpus = 1 + align_and_sort.sortsam.memory = 8 + + // SUBWORKFLOW: BQSR + bqsr.base_recalibrator.cpus = 1 + bqsr.base_recalibrator.memory = 16 + bqsr.apply_bqsr.cpus = 1 + bqsr.apply_bqsr.memory = 8 + + // SUBWORKFLOW: COMBINED_ADDBAMSTATS + combined_addbamstats.samtoolsmpileup_count_orphans = true + combined_addbamstats.samtoolsmpileup_nobaq = true + combined_addbamstats.samtoolsmpileup_minbq = 0 + combined_addbamstats.samtoolsmpileup_max_depth = 10000 + combined_addbamstats.addbamstats_type = 'germline' + + // SUBWORKFLOW: MERGE_AND_MARKDUPS + merge_and_markdups.create_index = true + merge_and_markdups.max_records_in_ram = 5000000 + merge_and_markdups.merge_sam_files_use_threading = true + merge_and_markdups.merge_sam_files_validation_stringency = 'SILENT' + merge_and_markdups.merge_sam_files.cpus = 4 + merge_and_markdups.merge_sam_files.memory = 8 + merge_and_markdups.mark_duplicates.cpus = 4 + merge_and_markdups.mark_duplicates.memory = 8 + + // SUBWORKFLOW: PERFORMANCE_SUMMARY + performance_summary.samtoolsview_do_not_output_alignments_with_bits_set = '0x400' + performance_summary.performancesummary_genome = true + performance_summary.gatk4collectinsertsizemetrics.cpus = 1 + performance_summary.gatk4collectinsertsizemetrics.memory = 8 + performance_summary.bedtoolsgenomecoveragebed.memory = 8 + + // SUBWORKFLOW: VC_GATK + vc_gatk.haplotype_caller_pair_hmm_implementation = 'LOGLESS_CACHING' + 
vc_gatk.split_bam.memory = 4 + vc_gatk.haplotype_caller.cpus = 1 + vc_gatk.haplotype_caller.memory = 8 + vc_gatk.splitnormalisevcf.cpus = 1 + vc_gatk.splitnormalisevcf.memory = 8 + + // SUBWORKFLOW: VC_STRELKA + vc_strelka.is_exome = null + vc_strelka.strelka_config = null + vc_strelka.filterpass_remove_filetered_all = true + vc_strelka.filterpass_recode = true + vc_strelka.filterpass_recode_infoall = true + vc_strelka.manta.cpus = 4 + vc_strelka.manta.memory = 4 + vc_strelka.strelka.cpus = 4 + vc_strelka.strelka.memory = 4 + vc_strelka.splitnormalisevcf.cpus = 1 + vc_strelka.splitnormalisevcf.memory = 8 + + // SUBWORKFLOW: VC_VARDICT + vc_vardict.vardict_vcf_format = true + vc_vardict.vardict_chrom_column = 1 + vc_vardict.vardict_reg_start_col = 2 + vc_vardict.vardict_gene_end_col = 3 + vc_vardict.vardict_threads = 4 + vc_vardict.compressvcf_stdout = true + vc_vardict.filterpass_remove_filetered_all = true + vc_vardict.filterpass_recode = true + vc_vardict.filterpass_recode_infoall = true + vc_vardict.vardict.cpus = 4 + vc_vardict.vardict.memory = 8 + vc_vardict.annotate.cpus = 1 + vc_vardict.annotate.memory = 8 + vc_vardict.splitnormalisevcf.cpus = 1 + vc_vardict.splitnormalisevcf.memory = 8 + vc_vardict.trim.cpus = 1 + vc_vardict.trim.memory = 1 + + +} diff --git a/configs/nextflow.config.wgsgermline.server b/configs/nextflow.config.wgsgermline.server new file mode 100644 index 0000000..135f30e --- /dev/null +++ b/configs/nextflow.config.wgsgermline.server @@ -0,0 +1,182 @@ +docker.enabled = true + +params { + + // OUTPUT DIRECTORY + outdir = './outputs' + + // INPUTS + fastqs = [ + [ + '/home/ubuntu/data/NA12878-BRCA1_R1.fastq.gz', + '/home/ubuntu/data/NA12878-BRCA1_R2.fastq.gz' + ], + ] + + // FastaWithIndexes + reference = [ + '/home/ubuntu/data/Homo_sapiens_assembly38.chr17.fasta', + '/home/ubuntu/data/Homo_sapiens_assembly38.chr17.fasta.amb', + '/home/ubuntu/data/Homo_sapiens_assembly38.chr17.fasta.ann', + '/home/ubuntu/data/Homo_sapiens_assembly38.chr17.fasta.bwt', + '/home/ubuntu/data/Homo_sapiens_assembly38.chr17.dict', + '/home/ubuntu/data/Homo_sapiens_assembly38.chr17.fasta.fai', + '/home/ubuntu/data/Homo_sapiens_assembly38.chr17.fasta.pac', + '/home/ubuntu/data/Homo_sapiens_assembly38.chr17.fasta.sa', + ] + + // CompressedIndexedVCF + snps_dbsnp = [ + '/home/ubuntu/data/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz', + '/home/ubuntu/data/Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + snps_1000gp = [ + '/home/ubuntu/data/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz', + '/home/ubuntu/data/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + known_indels = [ + '/home/ubuntu/data/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz', + '/home/ubuntu/data/Homo_sapiens_assembly38.known_indels.BRCA1.vcf.gz.tbi' + ] + + // CompressedIndexedVCF + mills_indels = [ + '/home/ubuntu/data/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz', + '/home/ubuntu/data/Mills_and_1000G_gold_standard.indels.hg38.BRCA1.vcf.gz.tbi' + ] + + adapter_file = '/home/ubuntu/data/adapter_list.txt' + contaminant_file = '/home/ubuntu/data/contaminant_list.txt' + + gatk_intervals = [ + '/home/ubuntu/data/BRCA1.hg38.bed' + ] + vardict_intervals = [ + '/home/ubuntu/data/BRCA1.hg38.split-intervals.bed' + ] + // BedTABIX + strelka_intervals = [ + '/home/ubuntu/data/BRCA1.hg38.bed.gz', + '/home/ubuntu/data/BRCA1.hg38.bed.gz.tbi' + ] + + gridss_blacklist = '/home/ubuntu/data/consensusBlacklist.hg38.chr17.bed' + sample_name = 
'NA12878-BRCA1' + allele_freq_threshold = 0.05 + min_mapping_qual = null + filter = null + align_and_sort_sortsam_tmp_dir = './tmp' + combine_variants_type = 'germline' + combine_variants_columns = ['AC', 'AN', 'AF', 'AD', 'DP', 'GT'] + + // PROCESSES + getfastqc_adapters.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/ParseFastqcAdapters.py' + calculate_performancesummary_genomefile.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/GenerateGenomeFileForBedtoolsCoverage.py' + generate_gatk_intervals.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/GenerateIntervalsByChromosome.py' + generate_manta_config.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/GenerateMantaConfig.py' + generate_vardict_headerlines.code_file = '/home/grace/work/pp/translation/janis-assistant/translated_wgsgermline/templates/GenerateVardictHeaderLines.py' + fastqc.cpus = 1 + fastqc.memory = 8 + vc_gridss.cpus = 8 + vc_gridss.memory = 31 + vc_gatk_sort_combined.cpus = 1 + vc_gatk_sort_combined.memory = 8 + vc_vardict_sort_combined.cpus = 1 + vc_vardict_sort_combined.memory = 8 + combine_variants.memory = 8 + combined_sort.cpus = 1 + combined_sort.memory = 8 + + // SUBWORKFLOW: ALIGN_AND_SORT + align_and_sort.five_prime_adapter_read1 = [] // list values here + align_and_sort.five_prime_adapter_read2 = [] // list values here + align_and_sort.cutadapt_quality_cutoff = 15 + align_and_sort.cutadapt_minimum_length = 50 + align_and_sort.bwamem_mark_shorter_splits = true + align_and_sort.sortsam_sort_order = 'coordinate' + align_and_sort.sortsam_create_index = true + align_and_sort.sortsam_validation_stringency = 'SILENT' + align_and_sort.sortsam_max_records_in_ram = 5000000 + align_and_sort.cutadapt.cpus = 5 + align_and_sort.cutadapt.memory = 4 + align_and_sort.bwamem.cpus = 16 + align_and_sort.bwamem.memory = 16 + align_and_sort.sortsam.cpus = 1 + align_and_sort.sortsam.memory = 8 + + // SUBWORKFLOW: BQSR + bqsr.base_recalibrator.cpus = 1 + bqsr.base_recalibrator.memory = 16 + bqsr.apply_bqsr.cpus = 1 + bqsr.apply_bqsr.memory = 8 + + // SUBWORKFLOW: COMBINED_ADDBAMSTATS + combined_addbamstats.samtoolsmpileup_count_orphans = true + combined_addbamstats.samtoolsmpileup_nobaq = true + combined_addbamstats.samtoolsmpileup_minbq = 0 + combined_addbamstats.samtoolsmpileup_max_depth = 10000 + combined_addbamstats.addbamstats_type = 'germline' + + // SUBWORKFLOW: MERGE_AND_MARKDUPS + merge_and_markdups.create_index = true + merge_and_markdups.max_records_in_ram = 5000000 + merge_and_markdups.merge_sam_files_use_threading = true + merge_and_markdups.merge_sam_files_validation_stringency = 'SILENT' + merge_and_markdups.merge_sam_files.cpus = 4 + merge_and_markdups.merge_sam_files.memory = 8 + merge_and_markdups.mark_duplicates.cpus = 4 + merge_and_markdups.mark_duplicates.memory = 8 + + // SUBWORKFLOW: PERFORMANCE_SUMMARY + performance_summary.samtoolsview_do_not_output_alignments_with_bits_set = '0x400' + performance_summary.performancesummary_genome = true + performance_summary.gatk4collectinsertsizemetrics.cpus = 1 + performance_summary.gatk4collectinsertsizemetrics.memory = 8 + performance_summary.bedtoolsgenomecoveragebed.memory = 8 + + // SUBWORKFLOW: VC_GATK + vc_gatk.haplotype_caller_pair_hmm_implementation = 'LOGLESS_CACHING' + vc_gatk.split_bam.memory = 4 + vc_gatk.haplotype_caller.cpus = 1 + vc_gatk.haplotype_caller.memory = 8 + 
vc_gatk.splitnormalisevcf.cpus = 1 + vc_gatk.splitnormalisevcf.memory = 8 + + // SUBWORKFLOW: VC_STRELKA + vc_strelka.is_exome = null + vc_strelka.strelka_config = null + vc_strelka.filterpass_remove_filetered_all = true + vc_strelka.filterpass_recode = true + vc_strelka.filterpass_recode_infoall = true + vc_strelka.manta.cpus = 4 + vc_strelka.manta.memory = 4 + vc_strelka.strelka.cpus = 4 + vc_strelka.strelka.memory = 4 + vc_strelka.splitnormalisevcf.cpus = 1 + vc_strelka.splitnormalisevcf.memory = 8 + + // SUBWORKFLOW: VC_VARDICT + vc_vardict.vardict_vcf_format = true + vc_vardict.vardict_chrom_column = 1 + vc_vardict.vardict_reg_start_col = 2 + vc_vardict.vardict_gene_end_col = 3 + vc_vardict.vardict_threads = 4 + vc_vardict.compressvcf_stdout = true + vc_vardict.filterpass_remove_filetered_all = true + vc_vardict.filterpass_recode = true + vc_vardict.filterpass_recode_infoall = true + vc_vardict.vardict.cpus = 4 + vc_vardict.vardict.memory = 8 + vc_vardict.annotate.cpus = 1 + vc_vardict.annotate.memory = 8 + vc_vardict.splitnormalisevcf.cpus = 1 + vc_vardict.splitnormalisevcf.memory = 8 + vc_vardict.trim.cpus = 1 + vc_vardict.trim.memory = 1 + + +} diff --git a/janis_assistant/__init__.py b/janis_assistant/__init__.py index 73fced5..19b282c 100644 --- a/janis_assistant/__init__.py +++ b/janis_assistant/__init__.py @@ -11,3 +11,6 @@ from janis_assistant.management.workflowmanager import WorkflowManager, TaskStatus from janis_assistant.validation import ValidationRequirements from janis_assistant.__meta__ import __version__ + +import collections +collections.Callable = collections.abc.Callable diff --git a/janis_assistant/__meta__.py b/janis_assistant/__meta__.py index 2d4166b..9328593 100644 --- a/janis_assistant/__meta__.py +++ b/janis_assistant/__meta__.py @@ -1,4 +1,4 @@ -__version__ = "v0.12.1" +__version__ = "v0.13.0" DOCS_URL = "https://janis.readthedocs.io" GITHUB_URL = "https://github.com/PMCC-BioinformaticsCore/janis-assistant" ISSUE_URL = "https://github.com/PMCC-BioinformaticsCore/janis-assistant/issues/new" diff --git a/janis_assistant/cli.py b/janis_assistant/cli.py index 01846b6..1ad6b66 100644 --- a/janis_assistant/cli.py +++ b/janis_assistant/cli.py @@ -3,15 +3,16 @@ import os.path import json from time import sleep -from typing import Optional, Tuple, List +from typing import Optional, Tuple import ruamel.yaml import tabulate -from janis_assistant.management.workflowmanager import WorkflowManager +from janis_assistant.management.workflowmanager import WorkflowManager from janis_assistant.management.envvariables import EnvVariables from janis_core import InputQualityType, HINTS, HintEnum, SupportedTranslation, Tool +from janis_core.ingestion import SupportedIngestion from janis_core.utils.logger import Logger, LogLevel from janis_assistant.__meta__ import DOCS_URL @@ -27,6 +28,7 @@ from janis_assistant.data.enums.taskstatus import TaskStatus from janis_assistant.main import ( + ingest, translate, generate_inputs, cleanup, @@ -42,7 +44,6 @@ parse_additional_arguments, parse_dict, get_file_from_searchname, - fully_qualify_filename, dict_to_yaml_string, ) @@ -94,7 +95,7 @@ def process_args(sysargs=None): ) add_translate_args( subparsers.add_parser( - "translate", help="Translate a janis workflow to CWL, WDL, or Nextflow" + "translate", help="Translate a janis workflow to CWL, WDL or Nextflow" ) ) add_inputs_args( @@ -285,42 +286,94 @@ def add_cleanup_args(parser): return parser -def add_translate_args(parser): - parser.add_argument("workflow", help="Path to workflow") +def 
add_translate_args(parser: argparse.ArgumentParser): + """ + intended syntax + fmt1: janis translate [OPTIONS] --from cwl --to nextflow infile.cwl [longform] + """ + ### --- MANDATORY ARGS --- ### + parser.add_argument( + "infile", + help="Path to input file", + ) parser.add_argument( - "translation", - help="language to translate to", + "--from", + help="Language of infile. Will be autodetected if not supplied", + choices=SupportedIngestion.all(), + type=str + ) + parser.add_argument( + "--to", + help="Language to translate to.", choices=SupportedTranslation.all(), + type=str ) - parser.add_argument("-c", "--config", help="Path to config file") + + ### --- OPTIONAL ARGS --- ### + + # translation features parser.add_argument( - "--name", - help="If you have multiple workflows in your file, you may want to " - "help Janis out to select the right workflow to run", + "--mode", + help="Translate mode (default: regular). Controls extent of tool translation\n\ + - skeleton: ignores inputs which aren't used in workflow. no CLI command generation.\n\ + - regular: ignores inputs which aren't used in workflow. \n\ + - extended: full translation of all inputs & CLI command", + type=str, + choices=["skeleton", "regular", "extended"], + default="regular" + ) + # parser.add_argument( + # "--no-comments", + # help="don't provide info comments in output translation", + # default=False, + # action="store_true" + # ) + parser.add_argument( + "--galaxy-build-images", + action="store_true", + help="Requires docker. \nFor Galaxy Tool Wrappers with multiple software requirements, build a local container image containing all requirements.\nAdds ~2-10 mins per affected Galaxy Wrapper. " ) parser.add_argument( - "-o", - "--output-dir", - help="output directory to write output to (default=stdout)", + "--galaxy-no-image-cache", + help="Turns off galaxy container image cache. Cache stores previously identified containers suitable for different tool wrappers so that quay.io API calls are unnecessary for previously parsed tools.", + action="store_true", ) parser.add_argument( - "--no-cache", - help="Force re-download of workflow if remote", + "--galaxy-no-wrapper-cache", + help="Turns off galaxy tool downloads cache. 
Cache stores local copies of tool.xml files so they don't have to be re-downloaded if they have been downloaded before.", action="store_true", ) + # accessory files & directories + parser.add_argument( + "-o", + "--output-dir", + help="Output directory to write output to (default: translated).", + type=str, + default="translated" + ) + parser.add_argument( + "-c", + "--config", + help="Path to config file" + ) + parser.add_argument( + "--name", + help="Specifies the name of the workflow/tool to translate, in case where infile has multiple such objects.", + ) parser.add_argument( "--resources", action="store_true", help="Add resource overrides into inputs file (eg: runtime_cpu / runtime_memory)", ) - parser.add_argument( - "--toolbox", help="Only look for tools in the toolbox", action="store_true" + "--toolbox", + help="Only look for tools in the toolbox", + action="store_true" ) + # workflow inputs inputargs = parser.add_argument_group("Inputs") - inputargs.add_argument( "-i", "--inputs", @@ -334,6 +387,7 @@ def add_translate_args(parser): action="append", ) + # hints hint_args = parser.add_argument_group("hints") for HintType in HINTS: if issubclass(HintType, HintEnum): @@ -341,21 +395,20 @@ def add_translate_args(parser): "--hint-" + HintType.key(), choices=HintType.symbols() ) + # containers container_args = parser.add_argument_group("container related args") container_args.add_argument( - "--allow-empty-container", + "--disallow-empty-container", action="store_true", - help="Some tools you use may not include a container, this would usually (and intentionally) cause an error. " - "Including this flag will disable this check, and empty containers can be used.", + help="Some tools you use may not include a container, this would usually (and intentionally) cause an error." + "Including this flag will check that all tools have a container." ) - container_args.add_argument( "--container-override", help="Override a tool's container by specifying a new container. This argument should be specified in the " "following (comma separated) format: t1=v1,t2=v2. Eg toolid=container/override:version,toolid2=.", ) - container_args.add_argument( "--skip-digest-lookup", action="store_true", @@ -1235,38 +1288,85 @@ def parse_container_override_format(container_override): return co -def do_translate(args): - jc = JanisConfiguration.initial_configuration(args.config) - +def do_translate(args: argparse.Namespace): + # setup + # (ensure all parameters are ready for ingest & translate) + # JanisConfiguration holds settings related to janis-assistant, not janis-core translate + # settings in janis should be a singleton module so they are globally available. + # this would be time consuming, so will avoid for now. will just pass things as arguments. 
+ # - GH + jc = JanisConfiguration.initial_configuration(args.config) container_override = parse_container_override_format(args.container_override) - + source_fmt = _get_source_fmt(args) + dest_fmt = _get_dest_fmt(args) + inputs = args.inputs if args.inputs else None hints = { - k[5:]: v - for k, v in vars(args).items() + k[5:]: v for k, v in vars(args).items() if k.startswith("hint_") and v is not None } + hints = hints if hints else None - inputs = args.inputs or [] - # the args.extra_inputs parameter are inputs that we MUST match - # we'll need to parse them manually and then pass them to fromjanis as requiring a match - # required_inputs = parse_additional_arguments(args.extra_inputs) + # ingest + internal_model = ingest( + infile=args.infile, + format=source_fmt, + galaxy_build_images=args.galaxy_build_images, + galaxy_no_image_cache=args.galaxy_no_image_cache, + galaxy_no_wrapper_cache=args.galaxy_no_wrapper_cache + ) + # translate translate( config=jc, - tool=args.workflow, - translation=args.translation, + tool=internal_model, + dest_fmt=dest_fmt, + mode=args.mode, name=args.name, output_dir=args.output_dir, - force=args.no_cache, - allow_empty_container=args.allow_empty_container, + inputs=inputs, + hints=hints, + allow_empty_container=not args.disallow_empty_container, container_override=container_override, skip_digest_lookup=args.skip_digest_lookup, skip_digest_cache=args.skip_digest_cache, - inputs=inputs, recipes=args.recipe, - hints=hints, - ) - + render_comments=False + ) + +def _get_source_fmt(args: argparse.Namespace) -> str: + # user supplied fmt + fmt: Optional[str] = None + for key, val in args._get_kwargs(): # workaround for '--from' name: usually a python error. + if key == 'from' and val is not None: + fmt = val + break + # auto-detect fmt + ext_map = { + '.cwl': 'cwl', # any cwl file + '.py': 'janis', # any janis file + '.xml': 'galaxy', # galaxy tool + '.ga': 'galaxy' # galaxy workflow + } + if not fmt: + name, ext = os.path.splitext(args.infile) + if ext in ext_map: + fmt = ext_map[ext] + # guard + if not fmt: + raise ValueError(f"unknown source language for {args.infile}. please specify with '--from'") + return fmt + +def _get_dest_fmt(args: argparse.Namespace) -> str: + # user supplied fmt (mandatory) + fmt: Optional[str] = None + for key, val in args._get_kwargs(): # workaround for '--from' name: usually a python error. + if key in ['to', 'dest_language'] and val is not None: + fmt = val + break + # guard + if not fmt: + raise ValueError(f"unsupplied dest language for {args.infile}. 
please specify with '--to'") + return fmt def do_cleanup(args): cleanup() diff --git a/janis_assistant/engines/nextflow/main.py b/janis_assistant/engines/nextflow/main.py index 7e48077..705e6cd 100644 --- a/janis_assistant/engines/nextflow/main.py +++ b/janis_assistant/engines/nextflow/main.py @@ -14,7 +14,7 @@ from janis_core import LogLevel from janis_core.types.data_types import is_python_primitive from janis_core.utils.logger import Logger -from janis_core.translations import nfgen, NextflowTranslator +from janis_core.translations import NextflowTranslator from janis_assistant.data.models.outputs import WorkflowOutputModel from janis_assistant.data.models.run import RunModel from janis_assistant.data.models.workflowjob import RunJobModel diff --git a/janis_assistant/main.py b/janis_assistant/main.py index 55bf6f0..40d5643 100644 --- a/janis_assistant/main.py +++ b/janis_assistant/main.py @@ -9,11 +9,10 @@ import os import sys import time -import subprocess from datetime import datetime from inspect import isclass +from typing import Optional, Type, Tuple, Any from textwrap import dedent -from typing import Optional, Dict, Union, Type, List, Tuple import janis_core as j from janis_assistant.data.enums import TaskStatus @@ -35,6 +34,8 @@ from janis_assistant.utils.batchrun import BatchRunRequirements from janis_core import InputQualityType, Tool, DynamicWorkflow, LogLevel, JanisShed +from janis_core import ingestion +from janis_core import translations import janis_assistant.templates as janistemplates from janis_assistant.data.models.preparedjob import PreparedJob @@ -42,8 +43,8 @@ from janis_assistant.management.configmanager import ConfigManager from janis_assistant.management.configuration import ( JanisConfiguration, - EnvVariables, stringify_dict_keys_or_return_value, + EnvVariables, JanisConfigurationEnvironment, DatabaseTypeToUse, JanisConfigurationCromwell, @@ -66,10 +67,10 @@ def run_with_outputs( - tool: Union[j.CommandTool, j.Workflow], - inputs: Dict[str, any], + tool: j.CommandTool | j.Workflow, + inputs: dict[str, Any], output_dir: str, - config: JanisConfiguration = None, + config: Optional[JanisConfiguration] = None, engine: Optional[str] = None, workflow_reference: Optional[str] = None, ): @@ -144,7 +145,7 @@ def run_with_outputs( def resolve_tool( - tool: Union[str, j.CommandTool, Type[j.CommandTool], j.Workflow, Type[j.Workflow]], + tool: str | j.CommandTool | Type[j.CommandTool] | j.Workflow | Type[j.Workflow], name=None, from_toolshed=False, force=False, @@ -208,22 +209,53 @@ def resolve_tool( raise Exception("Couldn't find tool with name: " + str(tool)) +def ingest( + infile: str, + format: str, + galaxy_build_images: bool=False, + galaxy_no_image_cache: bool=False, + galaxy_no_wrapper_cache: bool=False + ) -> str | j.CommandTool | j.Workflow: + """ + orchestrator of ingest process. + used translate() below as a guide. + currently just ingests the supplied file. + mainly a placeholder for more extensive logic in the future. + future extensions (related to ingesting a whole folder) + - gathering & loading files + - ingesting input dict + - ingesting script files + - ingesting tools (held in external file - like in a 'tools' or 'processes' subfolder) + """ + # if janis, just return the path. 
+ # all file loading etc handled in translate() (resolve_tool()) + if format == 'janis': + return infile + else: + return ingestion.ingest( + infile, + format, + galaxy_build_images, +# galaxy_no_image_cache, +# galaxy_no_wrapper_cache, + ) + def translate( config: JanisConfiguration, - tool: Union[str, j.CommandTool, Type[j.CommandTool], j.Workflow, Type[j.Workflow]], - translation: str, - name: str = None, - hints: Optional[Dict[str, str]] = None, - output_dir: Optional[str] = None, - inputs: Union[str, dict] = None, - allow_empty_container=False, - container_override=None, - skip_digest_lookup=False, - skip_digest_cache=False, - recipes: List[str] = None, - **kwargs, + tool: str | j.CommandTool | j.Workflow, + dest_fmt: str, + mode: Optional[str]=None, + name: Optional[str]=None, + output_dir: str='translated', + inputs: Optional[str | dict[str, Any]]=None, + hints: Optional[dict[str, str]]=None, + allow_empty_container: Optional[bool]=True, + container_override: Optional[bool]=None, + skip_digest_lookup: Optional[bool]=False, + skip_digest_cache: Optional[bool]=False, + recipes: Optional[list[str]]=None, + render_comments: bool=True, ): - toolref, _ = resolve_tool(tool, name, from_toolshed=True) if not toolref: @@ -252,37 +284,44 @@ def translate( skip_digest_cache=skip_digest_cache, ) - if isinstance(toolref, j.WorkflowBase): - wfstr, _, _ = toolref.translate( - translation, - to_console=False, - to_disk=bool(output_dir), - export_path=output_dir or "./{language}", - hints=hints, - additional_inputs=inputsdict, - allow_empty_container=allow_empty_container, - container_override=container_overrides, - ) - elif isinstance(toolref, (j.CommandTool, j.CodeTool)): - wfstr = toolref.translate( - translation=translation, - to_console=False, - to_disk=bool(output_dir), - export_path=output_dir or "./{language}", - allow_empty_container=allow_empty_container, - container_override=container_overrides, - ) - - else: - name = toolref.__name__ if isclass(toolref) else toolref.__class__.__name__ - raise Exception("Unsupported tool type: " + name) - - print(wfstr, file=sys.stdout) - return wfstr + translations.translate( + entity=toolref, + dest_fmt=dest_fmt, + mode=mode, + + # file io + to_disk=True, + export_path=output_dir, + should_zip=None, # TODO create cli args? + to_console=None, # TODO create cli args? + tool_to_console=None, # TODO create cli args? + write_inputs_file=None, # TODO create cli args? + source_files=None, # TODO create cli args? + + # inputs + additional_inputs=inputsdict if inputsdict else None, + hints=hints, + + # containers + with_container=None, # TODO create cli arg? + allow_empty_container=allow_empty_container, + container_override=container_overrides if container_overrides else None, + + # resouces + with_resource_overrides=None, # TODO create cli arg? + merge_resources=None, # TODO create cli arg? + max_cores=None, # TODO create cli arg? + max_mem=None, # TODO create cli arg? + max_duration=None, # TODO create cli arg? + + # misc + render_comments=render_comments, + should_validate=None, # TODO create cli arg? 
+ ) def spider_tool( - tool: Union[str, j.CommandTool, j.Workflow], + tool: str | j.CommandTool | j.Workflow, name=None, force=False, only_toolbox=False, @@ -312,15 +351,15 @@ def spider_tool( def generate_inputs( jc: JanisConfiguration, - tool: Union[str, j.CommandTool, j.Workflow], + tool: str | j.CommandTool | j.Workflow, all=False, name=None, force=False, additional_inputs=None, with_resources=False, - quality_type: List[InputQualityType] = None, - recipes: List[str] = None, - hints: dict = None, + quality_type: Optional[list[InputQualityType]] = None, + recipes: Optional[list[str]] = None, + hints: Optional[dict[str, Any]] = None, ): toolref, _ = resolve_tool(tool, name, from_toolshed=True, force=force) inputsdict = None @@ -358,7 +397,7 @@ def generate_inputs( class InitArgParser(argparse.ArgumentParser): def __init__( - self, templatename, schema: List[TemplateInput], description: str = None + self, templatename, schema: list[TemplateInput], description: Optional[str] = None ): super().__init__(f"janis init {templatename}", description=description) # self.add_usage( @@ -504,9 +543,9 @@ def get_config(): def run_from_jobfile( - workflow: Union[str, j.Tool, Type[j.Tool]], + workflow: str | j.Tool | Type[j.Tool], jobfile: PreparedJob, - engine: Union[str, Engine, None] = None, + engine: Optional[str | Engine] = None, wait: bool = False, # specific engine args cromwell_jar: Optional[str] = None, @@ -583,17 +622,17 @@ def run_from_jobfile( def prepare_job( - tool: Union[str, j.Tool, Type[j.Tool]], + tool: str | j.Tool | Type[j.Tool], # workflow search options workflow_reference: Optional[str], # if this is None, no jobfile will be written jc: JanisConfiguration, engine: Optional[str], batchrun_reqs: Optional[BatchRunRequirements], validation_reqs: Optional[ValidationRequirements], - hints: Optional[Dict[str, str]], + hints: Optional[dict[str, str]], output_dir: Optional[str], execution_dir: Optional[str], - inputs: Union[str, dict], + inputs: str | dict, required_inputs: dict, watch, max_cores, @@ -611,9 +650,9 @@ def prepare_job( skip_digest_lookup, skip_digest_cache, run_prepare_processing, - db_type: DatabaseTypeToUse = None, - source_hints: List[str] = None, - post_run_script: str = None, + db_type: Optional[DatabaseTypeToUse] = None, + source_hints: Optional[list[str]] = None, + post_run_script: Optional[str] = None, localise_all_files: bool = False, ): @@ -793,7 +832,7 @@ def get_filescheme_from_fs(fs, **kwargs): raise Exception(f"Couldn't initialise filescheme with unrecognised type: '{fs}'") -def abort_wids(sids: List[str], wait=True): +def abort_wids(sids: list[str], wait=True): cm = ConfigManager(db_path=None) for sid in sids: try: diff --git a/janis_assistant/management/workflowmanager.py b/janis_assistant/management/workflowmanager.py index 8db7d91..99da3c7 100644 --- a/janis_assistant/management/workflowmanager.py +++ b/janis_assistant/management/workflowmanager.py @@ -41,7 +41,7 @@ from janis_core.operators.operator import Operator from janis_core.translations import get_translator, CwlTranslator from janis_core.translations.translationbase import TranslatorBase -from janis_core.translations.wdl import apply_secondary_file_format_to_filename +from janis_core.utils.secondary import apply_secondary_file_format_to_filename from janis_assistant.data.enums import TaskStatus, ProgressKeys from janis_assistant.data.models.joblabel import JobLabelModel diff --git a/janis_assistant/tests/data/__init__.py b/janis_assistant/tests/data/__init__.py new file mode 100644 index 
0000000..e69de29 diff --git a/janis_assistant/tests/data/cwl/fastqc.cwl b/janis_assistant/tests/data/cwl/fastqc.cwl new file mode 100644 index 0000000..909c065 --- /dev/null +++ b/janis_assistant/tests/data/cwl/fastqc.cwl @@ -0,0 +1,50 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool + +doc: | + Run fastqc on raw reads in FASTQ format (single or paired end) or aligned reads in BAM. + +hints: + ResourceRequirement: + coresMin: 1 + ramMin: 5000 + DockerRequirement: + dockerPull: quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1 + +baseCommand: "fastqc" +arguments: + - valueFrom: $(runtime.outdir) + prefix: "-o" + - valueFrom: "--noextract" + +inputs: + fastq1: + type: File? + inputBinding: + position: 1 + fastq2: + type: File? + inputBinding: + position: 2 + bam: + type: File? + inputBinding: + position: 1 + +outputs: + fastqc_zip: + doc: all data e.g. figures + type: + type: array + items: File + outputBinding: + glob: "*_fastqc.zip" + fastqc_html: + doc: html report showing results from zip + type: + type: array + items: File + outputBinding: + glob: "*_fastqc.html" + \ No newline at end of file diff --git a/janis_assistant/tests/data/cwl/fastqc2.cwl b/janis_assistant/tests/data/cwl/fastqc2.cwl new file mode 100644 index 0000000..8a437a9 --- /dev/null +++ b/janis_assistant/tests/data/cwl/fastqc2.cwl @@ -0,0 +1,169 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool + +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1 + SoftwareRequirement: + packages: + fastqc: + specs: [ "http://identifiers.org/biotools/fastqc" ] + version: [ "0.11.9--hdfd78af_1", "0.11.9" ] + +inputs: + + reads_file: + type: File + inputBinding: + position: 50 + doc: | + Input bam,sam,bam_mapped,sam_mapped or fastq file + + format_enum: + type: + - "null" + - type: enum + name: "format" + symbols: ['bam','sam','bam_mapped','sam_mapped','fastq'] + inputBinding: + position: 6 + prefix: '--format' + doc: | + Bypasses the normal sequence file format detection and + forces the program to use the specified format. Valid + formats are bam,sam,bam_mapped,sam_mapped and fastq + + threads: + type: int? + inputBinding: + position: 7 + prefix: '--threads' + doc: | + Specifies the number of files which can be processed + simultaneously. Each thread will be allocated 250MB of + memory so you shouldn't run more threads than your + available memory will cope with, and not more than + 6 threads on a 32 bit machine + contaminants: + type: File? + inputBinding: + position: 8 + prefix: '--contaminants' + doc: | + Specifies a non-default file which contains the list of + contaminants to screen overrepresented sequences against. + The file must contain sets of named contaminants in the + form name[tab]sequence. Lines prefixed with a hash will + be ignored. + adapters: + type: File? + inputBinding: + position: 9 + prefix: '--adapters' + doc: | + Specifies a non-default file which contains the list of + adapter sequences which will be explicity searched against + the library. The file must contain sets of named adapters + in the form name[tab]sequence. Lines prefixed with a hash + will be ignored. + limits: + type: File? + inputBinding: + position: 10 + prefix: '--limits' + doc: | + Specifies a non-default file which contains a set of criteria + which will be used to determine the warn/error limits for the + various modules. This file can also be used to selectively + remove some modules from the output all together. 
The format + needs to mirror the default limits.txt file found in the + Configuration folder. + kmers: + type: int? + inputBinding: + position: 11 + prefix: '--kmers' + doc: | + Specifies the length of Kmer to look for in the Kmer content + module. Specified Kmer length must be between 2 and 10. Default + length is 7 if not specified. + casava: + type: boolean? + inputBinding: + position: 13 + prefix: '--casava' + doc: | + Files come from raw casava output. Files in the same sample + group (differing only by the group number) will be analysed + as a set rather than individually. Sequences with the filter + flag set in the header will be excluded from the analysis. + Files must have the same names given to them by casava + (including being gzipped and ending with .gz) otherwise they + won't be grouped together correctly. + + nofilter: + type: boolean? + inputBinding: + position: 14 + prefix: '--nofilter' + doc: | + If running with --casava then don't remove read flagged by + casava as poor quality when performing the QC analysis. + hide_group: + type: boolean? + inputBinding: + position: 15 + prefix: '--nogroup' + doc: | + Disable grouping of bases for reads >50bp. All reports will + show data for every base in the read. WARNING: Using this + option will cause fastqc to crash and burn if you use it on + really long reads, and your plots may end up a ridiculous size. + You have been warned! +outputs: + zipped_file: + type: File + outputBinding: + glob: '*.zip' + html_file: + type: File + outputBinding: + glob: '*.html' + summary_file: + type: File + outputBinding: + glob: "*/summary.txt" +baseCommand: [fastqc, --extract, --outdir, .] +$namespaces: + s: http://schema.org/ +$schemas: +- https://schema.org/version/latest/schemaorg-current-https.rdf +s:name: "fastqc_2" +s:license: http://www.apache.org/licenses/LICENSE-2.0 +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 +doc: | + Tool runs FastQC from Babraham Bioinformatics \ No newline at end of file diff --git a/janis_assistant/tests/data/cwl/gatk_haplotype_tool.cwl b/janis_assistant/tests/data/cwl/gatk_haplotype_tool.cwl new file mode 100644 index 0000000..7759397 --- /dev/null +++ b/janis_assistant/tests/data/cwl/gatk_haplotype_tool.cwl @@ -0,0 +1,1250 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 +class: CommandLineTool +baseCommand: +- gatk +- HaplotypeCaller +doc: |- + Call germline SNPs and indels via local re-assembly of haplotypes + +
+  The HaplotypeCaller is capable of calling SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. In other words, whenever the program encounters a region showing signs of variation, it discards the existing mapping information and completely reassembles the reads in that region. This allows the HaplotypeCaller to be more accurate when calling regions that are traditionally difficult to call, for example when they contain different types of variants close to each other. It also makes the HaplotypeCaller much better at calling indels than position-based callers like UnifiedGenotyper.
+
+  In the GVCF workflow used for scalable variant calling in DNA sequence data, HaplotypeCaller runs per-sample to generate an intermediate GVCF (not to be used in final analysis), which can then be used in GenotypeGVCFs for joint genotyping of multiple samples in a very efficient way. The GVCF workflow enables rapid incremental processing of samples as they roll off the sequencer, as well as scaling to very large cohort sizes (e.g. the 92K exomes of ExAC).
+
+  In addition, HaplotypeCaller is able to handle non-diploid organisms as well as pooled experiment data. Note however that the algorithms used to calculate variant likelihoods is not well suited to extreme allele frequencies (relative to ploidy) so its use is not recommended for somatic (cancer) variant discovery. For that purpose, use Mutect2 instead.
+
+  Finally, HaplotypeCaller is also able to correctly handle the splice junctions that make RNAseq a challenge for most variant callers, on the condition that the input read data has previously been processed according to our recommendations as documented here.
+
+  How HaplotypeCaller works
+
+  1. Define active regions
+  The program determines which regions of the genome it needs to operate on (active regions), based on the presence of evidence for variation.
+
+  2. Determine haplotypes by assembly of the active region
+  For each active region, the program builds a De Bruijn-like graph to reassemble the active region and identifies what are the possible haplotypes present in the data. The program then realigns each haplotype against the reference haplotype using the Smith-Waterman algorithm in order to identify potentially variant sites.
+
+  3. Determine likelihoods of the haplotypes given the read data
+  For each active region, the program performs a pairwise alignment of each read against each haplotype using the PairHMM algorithm. This produces a matrix of likelihoods of haplotypes given the read data. These likelihoods are then marginalized to obtain the likelihoods of alleles for each potentially variant site given the read data.
+
+  4. Assign sample genotypes
+  For each potentially variant site, the program applies Bayes' rule, using the likelihoods of alleles given the read data to calculate the likelihoods of each genotype per sample given the read data observed for that sample. The most likely genotype is then assigned to the sample.
+
+  Input
+  Input bam file(s) from which to make variant calls
+
+  Output
+  Either a VCF or GVCF file with raw, unfiltered SNP and indel calls. Regular VCFs must be filtered either by variant recalibration (Best Practice) or hard-filtering before use in downstream analyses. If using the GVCF workflow, the output is a GVCF file that must first be run through GenotypeGVCFs and then filtering before further analysis.
+
+  Caveats
+
+  Special note on ploidy
+  This tool is able to handle many non-diploid use cases; the desired ploidy can be specified using the -ploidy argument. Note however that very high ploidies (such as are encountered in large pooled experiments) may cause performance challenges including excessive slowness. We are working on resolving these limitations.
+
+  Additional Notes
+ +requirements: + ShellCommandRequirement: {} + InlineJavascriptRequirement: + expressionLib: + - | + /** + * File of functions to be added to cwl files + */ + function generateGATK4BooleanValue(){ + /** + * Boolean types in GATK 4 are expressed on the command line as -- "true"/"false", + * so patch here + */ + if(self === true || self === false){ + return self.toString() + } + return self; + } + function applyTagsToArgument(prefix, tags){ + /** + * Function to be used in the field valueFrom of File objects to add gatk tags. + */ + if(!self){ + return null; + } + else if(!tags){ + return generateArrayCmd(prefix); + } + else{ + function addTagToArgument(tagObject, argument){ + var allTags = Array.isArray(tagObject) ? tagObject.join(",") : tagObject; + return [prefix + ":" + allTags, argument]; + } + if(Array.isArray(self)){ + if(!Array.isArray(tags) || self.length !== tags.length){ + throw new TypeError("Argument '" + prefix + "' tag field is invalid"); + } + var value = self.map(function(element, i) { + return addTagToArgument(tags[i], element); + }).reduce(function(a, b){return a.concat(b)}) + return value; + } + else{ + return addTagToArgument(tags, self); + } + } + } + function generateArrayCmd(prefix){ + /** + * Function to be used in the field valueFrom of array objects, so that arrays are optional + * and prefixes are handled properly. + * + * The issue that this solves is documented here: + * https://www.biostars.org/p/258414/#260140 + */ + if(!self){ + return null; + } + if(!Array.isArray(self)){ + self = [self]; + } + var output = []; + self.forEach(function(element) { + output.push(prefix); + output.push(element); + }) + return output; + } + /* Polyfill String.endsWith (it was introduced in ES6, but CWL 1.0 only supports ES5) */ + String.prototype.endsWith = String.prototype.endsWith || function(suffix) { + return this.indexOf(suffix, this.length - suffix.length) >= 0; + }; + SchemaDefRequirement: + types: + - type: enum + name: annotation_type + symbols: + - AS_BaseQualityRankSumTest + - AS_FisherStrand + - AS_InbreedingCoeff + - AS_MappingQualityRankSumTest + - AS_QualByDepth + - AS_RMSMappingQuality + - AS_ReadPosRankSumTest + - AS_StrandOddsRatio + - AlleleFraction + - BaseQuality + - BaseQualityRankSumTest + - ChromosomeCounts + - ClippingRankSumTest + - CountNs + - Coverage + - DepthPerAlleleBySample + - DepthPerSampleHC + - ExcessHet + - FisherStrand + - FragmentLength + - GenotypeSummaries + - InbreedingCoeff + - LikelihoodRankSumTest + - MappingQuality + - MappingQualityRankSumTest + - MappingQualityZero + - OrientationBiasReadCounts + - OriginalAlignment + - PossibleDeNovo + - QualByDepth + - RMSMappingQuality + - ReadPosRankSumTest + - ReadPosition + - ReferenceBases + - SampleList + - StrandBiasBySample + - StrandOddsRatio + - TandemRepeat + - UniqueAltReadCount +hints: + DockerRequirement: + dockerPull: quay.io/biocontainers/gatk4:4.1.6.0--py38_0 +inputs: +- doc: Threshold number of ambiguous bases. If null, uses threshold fraction; otherwise, + overrides threshold fraction. + id: ambig-filter-bases + type: int? + inputBinding: + prefix: --ambig-filter-bases +- doc: Threshold fraction of ambiguous bases + id: ambig-filter-frac + type: double? + inputBinding: + prefix: --ambig-filter-frac +- doc: Maximum length of fragment (insert size) + id: max-fragment-length + type: int? + inputBinding: + prefix: --max-fragment-length +- doc: Minimum length of fragment (insert size) + id: min-fragment-length + type: int? 
+ inputBinding: + prefix: --min-fragment-length +- doc: One or more genomic intervals to keep + id: keep-intervals + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--keep-intervals")) +- doc: Name of the library to keep + id: library + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--library")) +- doc: Maximum mapping quality to keep (inclusive) + id: maximum-mapping-quality + type: int? + inputBinding: + prefix: --maximum-mapping-quality +- doc: Minimum mapping quality to keep (inclusive) + id: minimum-mapping-quality + type: int? + inputBinding: + prefix: --minimum-mapping-quality +- doc: Minimum start location difference at which mapped mates are considered distant + id: mate-too-distant-length + type: int? + inputBinding: + prefix: --mate-too-distant-length +- doc: Allow a read to be filtered out based on having only 1 soft-clipped block. + By default, both ends must have a soft-clipped block, setting this flag requires + only 1 soft-clipped block + id: dont-require-soft-clips-both-ends + type: boolean? + inputBinding: + prefix: --dont-require-soft-clips-both-ends + valueFrom: $(generateGATK4BooleanValue()) +- doc: Minimum number of aligned bases + id: filter-too-short + type: int? + inputBinding: + prefix: --filter-too-short +- doc: Platform attribute (PL) to match + id: platform-filter-name + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--platform-filter-name")) +- doc: Platform unit (PU) to filter out + id: black-listed-lanes + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--black-listed-lanes")) +- doc: A read group filter expression in the form "attribute:value", where "attribute" + is a two character read group attribute such as "RG" or "PU". + id: read-group-black-list + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--read-group-black-list")) +- doc: The name of the read group to keep + id: keep-read-group + type: string? + inputBinding: + prefix: --keep-read-group +- doc: Keep only reads with length at most equal to the specified value + id: max-read-length + type: int? + inputBinding: + prefix: --max-read-length +- doc: Keep only reads with length at least equal to the specified value + id: min-read-length + type: int? + inputBinding: + prefix: --min-read-length +- doc: Keep only reads with this read name + id: read-name + type: string? + inputBinding: + prefix: --read-name +- doc: Keep only reads on the reverse strand + id: keep-reverse-strand-only + type: boolean? + inputBinding: + prefix: --keep-reverse-strand-only + valueFrom: $(generateGATK4BooleanValue()) +- doc: The name of the sample(s) to keep, filtering out all others + id: sample + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--sample")) +- doc: Inverts the results from this filter, causing all variants that would pass + to fail and visa-versa. + id: invert-soft-clip-ratio-filter + type: boolean? 
+ inputBinding: + prefix: --invert-soft-clip-ratio-filter + valueFrom: $(generateGATK4BooleanValue()) +- doc: Threshold ratio of soft clipped bases (leading / trailing the cigar string) + to total bases in read for read to be filtered. + id: soft-clipped-leading-trailing-ratio + type: double? + inputBinding: + prefix: --soft-clipped-leading-trailing-ratio +- doc: Threshold ratio of soft clipped bases (anywhere in the cigar string) to total + bases in read for read to be filtered. + id: soft-clipped-ratio-threshold + type: double? + inputBinding: + prefix: --soft-clipped-ratio-threshold +- doc: Minimum probability for a locus to be considered active. + id: active-probability-threshold + type: double? + inputBinding: + prefix: --active-probability-threshold +- doc: Use Mutect2's adaptive graph pruning algorithm + id: adaptive-pruning + type: boolean? + inputBinding: + prefix: --adaptive-pruning + valueFrom: $(generateGATK4BooleanValue()) +- doc: Initial base error rate estimate for adaptive pruning + id: adaptive-pruning-initial-error-rate + type: double? + inputBinding: + prefix: --adaptive-pruning-initial-error-rate +- doc: If true, adds a PG tag to created SAM/BAM/CRAM files. + id: add-output-sam-program-record + type: boolean? + inputBinding: + prefix: --add-output-sam-program-record + valueFrom: $(generateGATK4BooleanValue()) +- doc: If true, adds a command line header line to created VCF files. + id: add-output-vcf-command-line + type: boolean? + inputBinding: + prefix: --add-output-vcf-command-line + valueFrom: $(generateGATK4BooleanValue()) +- doc: Annotate all sites with PLs + id: all-site-pls + type: boolean? + inputBinding: + prefix: --all-site-pls + valueFrom: $(generateGATK4BooleanValue()) +- doc: Likelihood and read-based annotations will only take into consideration reads + that overlap the variant or any base no further than this distance expressed in + base pairs + id: allele-informative-reads-overlap-margin + type: int? + inputBinding: + prefix: --allele-informative-reads-overlap-margin +- doc: The set of alleles to force-call regardless of evidence + id: alleles + type: File? + inputBinding: + valueFrom: $(applyTagsToArgument("--alleles", inputs['alleles_tags'])) +- doc: A argument to set the tags of 'alleles' + id: alleles_tags + type: + - 'null' + - string + - string[] +- doc: Allow graphs that have non-unique kmers in the reference + id: allow-non-unique-kmers-in-ref + type: boolean? + inputBinding: + prefix: --allow-non-unique-kmers-in-ref + valueFrom: $(generateGATK4BooleanValue()) +- doc: If provided, we will annotate records with the number of alternate alleles + that were discovered (but not necessarily genotyped) at a given site + id: annotate-with-num-discovered-alleles + type: boolean? 
+ inputBinding: + prefix: --annotate-with-num-discovered-alleles + valueFrom: $(generateGATK4BooleanValue()) +- doc: One or more specific annotations to add to variant calls [synonymous with -A] + id: annotation + type: + - 'null' + - annotation_type + - annotation_type[] + inputBinding: + valueFrom: $(generateArrayCmd("--annotation")) +- doc: One or more groups of annotations to apply to variant calls [synonymous with + -G] + id: annotation-group + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--annotation-group")) +- doc: One or more specific annotations to exclude from variant calls [synonymous + with -AX] + id: annotations-to-exclude + type: + - 'null' + - annotation_type + - annotation_type[] + inputBinding: + valueFrom: $(generateArrayCmd("--annotations-to-exclude")) +- doc: read one or more arguments files and add them to the command line + id: arguments_file + type: + - 'null' + - type: array + items: File + inputBinding: + valueFrom: $(null) + - File + inputBinding: + valueFrom: $(applyTagsToArgument("--arguments_file", inputs['arguments_file_tags'])) +- doc: A argument to set the tags of 'arguments_file' + id: arguments_file_tags + type: + - 'null' + - type: array + items: + - string + - type: array + items: string +- doc: Output the assembly region to this IGV formatted file + id: assembly-region-out-filename + type: string? + inputBinding: + prefix: --assembly-region-out +- doc: Number of additional bases of context to include around each assembly region + id: assembly-region-padding + type: int? + inputBinding: + prefix: --assembly-region-padding +- doc: File to which assembled haplotypes should be written [synonymous with -bamout] + id: bam-output-filename + type: File? + inputBinding: + valueFrom: $(applyTagsToArgument("--bam-output", inputs['bam-output_tags'])) +- doc: A argument to set the tags of 'bam-output' + id: bam-output_tags + type: + - 'null' + - string + - string[] +- doc: Which haplotypes should be written to the BAM + id: bam-writer-type + type: + - 'null' + - type: enum + symbols: + - ALL_POSSIBLE_HAPLOTYPES + - CALLED_HAPLOTYPES + inputBinding: + prefix: --bam-writer-type +- doc: Base qualities below this threshold will be reduced to the minimum (6) + id: base-quality-score-threshold + type: int? + inputBinding: + prefix: --base-quality-score-threshold +- doc: Size of the cloud-only prefetch buffer (in MB; 0 to disable). Defaults to cloudPrefetchBuffer + if unset. [synonymous with -CIPB] + id: cloud-index-prefetch-buffer + type: int? + inputBinding: + prefix: --cloud-index-prefetch-buffer +- doc: Size of the cloud-only prefetch buffer (in MB; 0 to disable). [synonymous with + -CPB] + id: cloud-prefetch-buffer + type: int? + inputBinding: + prefix: --cloud-prefetch-buffer +- doc: Comparison VCF file(s) [synonymous with -comp] + id: comparison + type: + - 'null' + - type: array + items: File + inputBinding: + valueFrom: $(null) + - File + inputBinding: + valueFrom: $(applyTagsToArgument("--comparison", inputs['comparison_tags'])) +- doc: A argument to set the tags of 'comparison' + id: comparison_tags + type: + - 'null' + - type: array + items: + - string + - type: array + items: string +- doc: Tab-separated File containing fraction of contamination in sequencing data + (per sample) to aggressively remove. Format should be "" + (Contamination is double) per line; No header. 
[synonymous with -contamination-file] + id: contamination-fraction-per-sample-file + type: File? + inputBinding: + valueFrom: $(applyTagsToArgument("--contamination-fraction-per-sample-file", inputs['contamination-fraction-per-sample-file_tags'])) +- doc: A argument to set the tags of 'contamination-fraction-per-sample-file' + id: contamination-fraction-per-sample-file_tags + type: + - 'null' + - string + - string[] +- doc: Fraction of contamination in sequencing data (for all samples) to aggressively + remove [synonymous with -contamination] + id: contamination-fraction-to-filter + type: double? + inputBinding: + prefix: --contamination-fraction-to-filter +- doc: Undocumented option + id: correct-overlapping-quality + type: boolean? + inputBinding: + prefix: --correct-overlapping-quality + valueFrom: $(generateGATK4BooleanValue()) +- doc: If true, create a BAM/CRAM index when writing a coordinate-sorted BAM/CRAM + file. [synonymous with -OBI] + id: create-output-bam-index + type: boolean? + inputBinding: + prefix: --create-output-bam-index + valueFrom: $(generateGATK4BooleanValue()) +- doc: If true, create a MD5 digest for any BAM/SAM/CRAM file created [synonymous + with -OBM] + id: create-output-bam-md5 + type: boolean? + inputBinding: + prefix: --create-output-bam-md5 + valueFrom: $(generateGATK4BooleanValue()) +- doc: If true, create a VCF index when writing a coordinate-sorted VCF file. [synonymous + with -OVI] + id: create-output-variant-index + type: boolean? + inputBinding: + prefix: --create-output-variant-index + valueFrom: $(generateGATK4BooleanValue()) +- doc: If true, create a a MD5 digest any VCF file created. [synonymous with -OVM] + id: create-output-variant-md5 + type: boolean? + inputBinding: + prefix: --create-output-variant-md5 + valueFrom: $(generateGATK4BooleanValue()) +- doc: dbSNP file [synonymous with -D] + id: dbsnp + type: File? + inputBinding: + valueFrom: $(applyTagsToArgument("--dbsnp", inputs['dbsnp_tags'])) +- doc: A argument to set the tags of 'dbsnp' + id: dbsnp_tags + type: + - 'null' + - string + - string[] +- doc: Print out verbose debug information about each assembly region [synonymous + with -debug] + id: debug-assembly + type: boolean? + inputBinding: + prefix: --debug-assembly + valueFrom: $(generateGATK4BooleanValue()) +- doc: If true, don't cache bam indexes, this will reduce memory requirements but + may harm performance if many intervals are specified. Caching is automatically + disabled if there are no intervals specified. [synonymous with -DBIC] + id: disable-bam-index-caching + type: boolean? + inputBinding: + prefix: --disable-bam-index-caching + valueFrom: $(generateGATK4BooleanValue()) +- doc: Don't skip calculations in ActiveRegions with no variants + id: disable-optimizations + type: boolean? + inputBinding: + prefix: --disable-optimizations + valueFrom: $(generateGATK4BooleanValue()) +- doc: Read filters to be disabled before analysis [synonymous with -DF] + id: disable-read-filter + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--disable-read-filter")) +- doc: If specified, do not check the sequence dictionaries from our inputs for compatibility. + Use at your own risk! + id: disable-sequence-dictionary-validation + type: boolean? 
+ inputBinding: + prefix: --disable-sequence-dictionary-validation + valueFrom: $(generateGATK4BooleanValue()) +- doc: Disable all tool default annotations + id: disable-tool-default-annotations + type: boolean? + inputBinding: + prefix: --disable-tool-default-annotations + valueFrom: $(generateGATK4BooleanValue()) +- doc: 'Disable all tool default read filters (WARNING: many tools will not function + correctly without their default read filters on)' + id: disable-tool-default-read-filters + type: boolean? + inputBinding: + prefix: --disable-tool-default-read-filters + valueFrom: $(generateGATK4BooleanValue()) +- doc: Disable physical phasing + id: do-not-run-physical-phasing + type: boolean? + inputBinding: + prefix: --do-not-run-physical-phasing + valueFrom: $(generateGATK4BooleanValue()) +- doc: Disable iterating over kmer sizes when graph cycles are detected + id: dont-increase-kmer-sizes-for-cycles + type: boolean? + inputBinding: + prefix: --dont-increase-kmer-sizes-for-cycles + valueFrom: $(generateGATK4BooleanValue()) +- doc: Do not analyze soft clipped bases in the reads + id: dont-use-soft-clipped-bases + type: boolean? + inputBinding: + prefix: --dont-use-soft-clipped-bases + valueFrom: $(generateGATK4BooleanValue()) +- doc: Mode for emitting reference confidence scores (For Mutect2, this is a BETA + feature) [synonymous with -ERC] + id: emit-ref-confidence + type: + - 'null' + - type: enum + symbols: + - NONE + - BP_RESOLUTION + - GVCF + inputBinding: + prefix: --emit-ref-confidence +- doc: Use all possible annotations (not for the faint of heart) + id: enable-all-annotations + type: boolean? + inputBinding: + prefix: --enable-all-annotations + valueFrom: $(generateGATK4BooleanValue()) +- doc: One or more genomic intervals to exclude from processing [synonymous with -XL] + id: exclude-intervals + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--exclude-intervals")) +- doc: Output the band lower bound for each GQ block regardless of the data it represents + id: floor-blocks + type: boolean? + inputBinding: + prefix: --floor-blocks + valueFrom: $(generateGATK4BooleanValue()) +- doc: If provided, all regions will be marked as active + id: force-active + type: boolean? + inputBinding: + prefix: --force-active + valueFrom: $(generateGATK4BooleanValue()) +- doc: Force-call filtered alleles included in the resource specified by --alleles + [synonymous with -genotype-filtered-alleles] + id: force-call-filtered-alleles + type: boolean? + inputBinding: + prefix: --force-call-filtered-alleles + valueFrom: $(generateGATK4BooleanValue()) +- doc: Samples representing the population "founders" + id: founder-id + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--founder-id")) +- doc: A configuration file to use with the GATK. + id: gatk-config-file + type: File? + inputBinding: + valueFrom: $(applyTagsToArgument("--gatk-config-file", inputs['gatk-config-file_tags'])) +- doc: A argument to set the tags of 'gatk-config-file' + id: gatk-config-file_tags + type: + - 'null' + - string + - string[] +- doc: If the GCS bucket channel errors out, how many times it will attempt to re-initiate + the connection [synonymous with -gcs-retries] + id: gcs-max-retries + type: int? + inputBinding: + prefix: --gcs-max-retries +- doc: Project to bill when accessing "requester pays" buckets. 
If unset, these buckets + cannot be accessed. + id: gcs-project-for-requester-pays + type: string? + inputBinding: + prefix: --gcs-project-for-requester-pays +- doc: Write debug assembly graph information to this file [synonymous with -graph] + id: graph-output-filename + type: string? + inputBinding: + prefix: --graph-output +- doc: Exclusive upper bounds for reference confidence GQ bands (must be in [1, 100] + and specified in increasing order) [synonymous with -GQB] + id: gvcf-gq-bands + type: + - 'null' + - type: array + items: int + inputBinding: + valueFrom: $(null) + - int + inputBinding: + valueFrom: $(generateArrayCmd("--gvcf-gq-bands")) +- doc: Heterozygosity value used to compute prior likelihoods for any locus. See + the GATKDocs for full details on the meaning of this population genetics concept + id: heterozygosity + type: double? + inputBinding: + prefix: --heterozygosity +- doc: Standard deviation of heterozygosity for SNP and indel calling. + id: heterozygosity-stdev + type: double? + inputBinding: + prefix: --heterozygosity-stdev +- doc: Heterozygosity for indel calling. See the GATKDocs for heterozygosity for + full details on the meaning of this population genetics concept + id: indel-heterozygosity + type: double? + inputBinding: + prefix: --indel-heterozygosity +- doc: The size of an indel to check for in the reference model + id: indel-size-to-eliminate-in-ref-model + type: int? + inputBinding: + prefix: --indel-size-to-eliminate-in-ref-model +- doc: BAM/SAM/CRAM file containing reads [synonymous with -I] + id: input + type: + - type: array + items: File + inputBinding: + valueFrom: $(null) + - File + inputBinding: + valueFrom: $(applyTagsToArgument("--input", inputs['input_tags'])) + secondaryFiles: $(self.basename + self.nameext.replace('m','i')) +- doc: A argument to set the tags of 'input' + id: input_tags + type: + - 'null' + - type: array + items: + - string + - type: array + items: string +- doc: Amount of padding (in bp) to add to each interval you are excluding. [synonymous + with -ixp] + id: interval-exclusion-padding + type: int? + inputBinding: + prefix: --interval-exclusion-padding +- doc: Interval merging rule for abutting intervals [synonymous with -imr] + id: interval-merging-rule + type: + - 'null' + - type: enum + symbols: + - ALL + - OVERLAPPING_ONLY + inputBinding: + prefix: --interval-merging-rule +- doc: Amount of padding (in bp) to add to each interval you are including. [synonymous + with -ip] + id: interval-padding + type: int? 
+ inputBinding: + prefix: --interval-padding +- doc: Set merging approach to use for combining interval inputs [synonymous with + -isr] + id: interval-set-rule + type: + - 'null' + - type: enum + symbols: + - UNION + - INTERSECTION + inputBinding: + prefix: --interval-set-rule +- doc: One or more genomic intervals over which to operate [synonymous with -L] + id: intervals + type: + - 'null' + - type: array + items: + - File + - string + inputBinding: + valueFrom: $(null) + - File + - string + inputBinding: + valueFrom: $(applyTagsToArgument("--intervals", inputs['intervals_tags'])) +- doc: A argument to set the tags of 'intervals' + id: intervals_tags + type: + - 'null' + - type: array + items: + - string + - type: array + items: string +- doc: Kmer size to use in the read threading assembler + id: kmer-size + type: + - 'null' + - type: array + items: int + inputBinding: + valueFrom: $(null) + - int + inputBinding: + valueFrom: $(generateArrayCmd("--kmer-size")) +- doc: Lenient processing of VCF files [synonymous with -LE] + id: lenient + type: boolean? + inputBinding: + prefix: --lenient + valueFrom: $(generateGATK4BooleanValue()) +- doc: Maximum number of alternate alleles to genotype + id: max-alternate-alleles + type: int? + inputBinding: + prefix: --max-alternate-alleles +- doc: Maximum size of an assembly region + id: max-assembly-region-size + type: int? + inputBinding: + prefix: --max-assembly-region-size +- doc: Maximum number of genotypes to consider at any site + id: max-genotype-count + type: int? + inputBinding: + prefix: --max-genotype-count +- doc: Two or more phased substitutions separated by this distance or less are merged + into MNPs. [synonymous with -mnp-dist] + id: max-mnp-distance + type: int? + inputBinding: + prefix: --max-mnp-distance +- doc: Maximum number of haplotypes to consider for your population + id: max-num-haplotypes-in-population + type: int? + inputBinding: + prefix: --max-num-haplotypes-in-population +- doc: Upper limit on how many bases away probability mass can be moved around when + calculating the boundaries between active and inactive assembly regions + id: max-prob-propagation-distance + type: int? + inputBinding: + prefix: --max-prob-propagation-distance +- doc: Maximum number of reads to retain per alignment start position. Reads above + this threshold will be downsampled. Set to 0 to disable. + id: max-reads-per-alignment-start + type: int? + inputBinding: + prefix: --max-reads-per-alignment-start +- doc: Maximum number of variants in graph the adaptive pruner will allow + id: max-unpruned-variants + type: int? + inputBinding: + prefix: --max-unpruned-variants +- doc: Minimum size of an assembly region + id: min-assembly-region-size + type: int? + inputBinding: + prefix: --min-assembly-region-size +- doc: Minimum base quality required to consider a base for calling [synonymous with + -mbq] + id: min-base-quality-score + type: int? + inputBinding: + prefix: --min-base-quality-score +- doc: Minimum length of a dangling branch to attempt recovery + id: min-dangling-branch-length + type: int? + inputBinding: + prefix: --min-dangling-branch-length +- doc: Minimum support to not prune paths in the graph + id: min-pruning + type: int? + inputBinding: + prefix: --min-pruning +- doc: How many threads should a native pairHMM implementation use + id: native-pair-hmm-threads + type: int? + inputBinding: + prefix: --native-pair-hmm-threads +- doc: use double precision in the native pairHmm. 
This is slower but matches the + java implementation better + id: native-pair-hmm-use-double-precision + type: boolean? + inputBinding: + prefix: --native-pair-hmm-use-double-precision + valueFrom: $(generateGATK4BooleanValue()) +- doc: Number of samples that must pass the minPruning threshold + id: num-pruning-samples + type: int? + inputBinding: + prefix: --num-pruning-samples +- doc: Number of hom-ref genotypes to infer at sites not present in a panel + id: num-reference-samples-if-no-call + type: int? + inputBinding: + prefix: --num-reference-samples-if-no-call +- doc: File to which variants should be written [synonymous with -O] + id: output_filename + type: string + inputBinding: + prefix: --output +- doc: Specifies which type of calls we should output + id: output_mode + type: + - 'null' + - type: enum + symbols: + - EMIT_VARIANTS_ONLY + - EMIT_ALL_CONFIDENT_SITES + - EMIT_ALL_ACTIVE_SITES + inputBinding: + prefix: --output-mode +- doc: Flat gap continuation penalty for use in the Pair HMM + id: pair-hmm-gap-continuation-penalty + type: int? + inputBinding: + prefix: --pair-hmm-gap-continuation-penalty +- doc: The PairHMM implementation to use for genotype likelihood calculations [synonymous + with -pairHMM] + id: pair-hmm-implementation + type: + - 'null' + - type: enum + symbols: + - EXACT + - ORIGINAL + - LOGLESS_CACHING + - AVX_LOGLESS_CACHING + - AVX_LOGLESS_CACHING_OMP + - EXPERIMENTAL_FPGA_LOGLESS_CACHING + - FASTEST_AVAILABLE + inputBinding: + prefix: --pair-hmm-implementation +- doc: The PCR indel model to use + id: pcr-indel-model + type: + - 'null' + - type: enum + symbols: + - NONE + - HOSTILE + - AGGRESSIVE + - CONSERVATIVE + inputBinding: + prefix: --pcr-indel-model +- doc: Pedigree file for determining the population "founders" [synonymous with -ped] + id: pedigree + type: File? + inputBinding: + valueFrom: $(applyTagsToArgument("--pedigree", inputs['pedigree_tags'])) +- doc: A argument to set the tags of 'pedigree' + id: pedigree_tags + type: + - 'null' + - string + - string[] +- doc: The global assumed mismapping rate for reads + id: phred-scaled-global-read-mismapping-rate + type: int? + inputBinding: + prefix: --phred-scaled-global-read-mismapping-rate +- doc: Callset to use in calculating genotype priors [synonymous with -population] + id: population-callset + type: File? + inputBinding: + valueFrom: $(applyTagsToArgument("--population-callset", inputs['population-callset_tags'])) +- doc: A argument to set the tags of 'population-callset' + id: population-callset_tags + type: + - 'null' + - string + - string[] +- doc: Ln likelihood ratio threshold for adaptive pruning algorithm + id: pruning-lod-threshold + type: double? + inputBinding: + prefix: --pruning-lod-threshold +- doc: Whether to suppress job-summary info on System.err. + id: QUIET + type: boolean? + inputBinding: + prefix: --QUIET + valueFrom: $(generateGATK4BooleanValue()) +- doc: Read filters to be applied before analysis [synonymous with -RF] + id: read-filter + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--read-filter")) +- doc: Indices to use for the read inputs. If specified, an index must be provided + for every read input and in the same order as the read inputs. If this argument + is not specified, the path to the index for each input will be inferred automatically. 
+ id: read-index + type: + - 'null' + - type: array + items: string + inputBinding: + valueFrom: $(null) + - string + inputBinding: + valueFrom: $(generateArrayCmd("--read-index")) +- doc: Validation stringency for all SAM/BAM/CRAM/SRA files read by this program. The + default stringency value SILENT can improve performance when processing a BAM + file in which variable-length data (read, qualities, tags) do not otherwise need + to be decoded. [synonymous with -VS] + id: read-validation-stringency + type: + - 'null' + - type: enum + symbols: + - STRICT + - LENIENT + - SILENT + inputBinding: + prefix: --read-validation-stringency +- doc: Recover all dangling branches + id: recover-all-dangling-branches + type: boolean? + inputBinding: + prefix: --recover-all-dangling-branches + valueFrom: $(generateGATK4BooleanValue()) +- doc: This argument is deprecated since version 3.3 + id: recover-dangling-heads + type: boolean? + inputBinding: + prefix: --recover-dangling-heads + valueFrom: $(generateGATK4BooleanValue()) +- doc: Reference sequence file [synonymous with -R] + id: reference + type: File + inputBinding: + valueFrom: $(applyTagsToArgument("--reference", inputs['reference_tags'])) + secondaryFiles: + - .fai + - ^.dict +- doc: A argument to set the tags of 'reference' + id: reference_tags + type: + - 'null' + - string + - string[] +- doc: Name of single sample to use from a multi-sample bam [synonymous with -ALIAS] + id: sample-name + type: string? + inputBinding: + prefix: --sample-name +- doc: Ploidy (number of chromosomes) per sample. For pooled data, set to (Number + of samples in each pool * Sample Ploidy). [synonymous with -ploidy] + id: sample-ploidy + type: int? + inputBinding: + prefix: --sample-ploidy +- doc: Output traversal statistics every time this many seconds elapse + id: seconds-between-progress-updates + type: double? + inputBinding: + prefix: --seconds-between-progress-updates +- doc: Use the given sequence dictionary as the master/canonical sequence dictionary. Must + be a .dict file. + id: sequence-dictionary + type: File? + inputBinding: + valueFrom: $(applyTagsToArgument("--sequence-dictionary", inputs['sequence-dictionary_tags'])) +- doc: A argument to set the tags of 'sequence-dictionary' + id: sequence-dictionary_tags + type: + - 'null' + - string + - string[] +- doc: display hidden arguments + id: showHidden + type: boolean? + inputBinding: + prefix: --showHidden + valueFrom: $(generateGATK4BooleanValue()) +- doc: If true, don't emit genotype fields when writing vcf file output. + id: sites-only-vcf-output + type: boolean? + inputBinding: + prefix: --sites-only-vcf-output + valueFrom: $(generateGATK4BooleanValue()) +- doc: Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is + the right choice + id: smith-waterman + type: + - 'null' + - type: enum + symbols: + - FASTEST_AVAILABLE + - AVX_ENABLED + - JAVA + inputBinding: + prefix: --smith-waterman +- doc: The minimum phred-scaled confidence threshold at which variants should be called + [synonymous with -stand-call-conf] + id: standard-min-confidence-threshold-for-calling + type: double? + inputBinding: + prefix: --standard-min-confidence-threshold-for-calling +- doc: Temp directory to use. + id: tmp-dir + type: string? + inputBinding: + prefix: --tmp-dir +- doc: Use the contamination-filtered read maps for the purposes of annotating variants + id: use-filtered-reads-for-annotations + type: boolean? 
+ inputBinding: + prefix: --use-filtered-reads-for-annotations + valueFrom: $(generateGATK4BooleanValue()) +- doc: Whether to use the JdkDeflater (as opposed to IntelDeflater) [synonymous with + -jdk-deflater] + id: use-jdk-deflater + type: boolean? + inputBinding: + prefix: --use-jdk-deflater + valueFrom: $(generateGATK4BooleanValue()) +- doc: Whether to use the JdkInflater (as opposed to IntelInflater) [synonymous with + -jdk-inflater] + id: use-jdk-inflater + type: boolean? + inputBinding: + prefix: --use-jdk-inflater + valueFrom: $(generateGATK4BooleanValue()) +- doc: Control verbosity of logging. + id: verbosity + type: + - 'null' + - type: enum + symbols: + - ERROR + - WARNING + - INFO + - DEBUG + inputBinding: + prefix: --verbosity +- doc: display the version number for this tool + id: version + type: boolean? + inputBinding: + prefix: --version + valueFrom: $(generateGATK4BooleanValue()) +outputs: +- id: assembly-region-out + doc: Output file from corresponding to the input argument assembly-region-out-filename + type: File? + outputBinding: + glob: $(inputs['assembly-region-out-filename']) +- id: bam-output + doc: Output file from corresponding to the input argument bam-output-filename + type: File? + outputBinding: + glob: $(inputs['bam-output-filename']) + secondaryFiles: + - "$(inputs['create-output-bam-index']? self.basename + self.nameext.replace('m',\ + \ 'i') : [])" + - "$(inputs['create-output-bam-md5']? self.basename + '.md5' : [])" +- id: graph-output + doc: Output file from corresponding to the input argument graph-output-filename + type: File? + outputBinding: + glob: $(inputs['graph-output-filename']) +- id: output + doc: Output file from corresponding to the input argument output-filename + type: File + outputBinding: + glob: $(inputs.output_filename) + secondaryFiles: + - "$(inputs['create-output-variant-index']? self.basename + (inputs.output_filename.endsWith('.gz')?\ + \ '.tbi':'.idx') : [])" + - "$(inputs['create-output-variant-md5']? self.basename + '.md5' : [])" \ No newline at end of file diff --git a/janis_assistant/tests/data/cwl/subworkflow_test/basic.cwl b/janis_assistant/tests/data/cwl/subworkflow_test/basic.cwl new file mode 100644 index 0000000..1e22771 --- /dev/null +++ b/janis_assistant/tests/data/cwl/subworkflow_test/basic.cwl @@ -0,0 +1,55 @@ +#!/usr/bin/env cwl-runner +class: CommandLineTool +cwlVersion: v1.2 +label: basic tool for testing + +requirements: + - class: ShellCommandRequirement + - class: InlineJavascriptRequirement + - class: DockerRequirement + dockerPull: ubuntu:latest + +baseCommand: echo + +arguments: + - position: 0 + valueFrom: test:\\t:escaped:\\n:characters + +inputs: + inFile: + type: File + inputBinding: + position: 1 + inString: + type: string + inputBinding: + position: 2 + inSecondary: + type: File + secondaryFiles: [.fai, ^.dict, .amb, .ann, .bwt, .pac, .sa] + inputBinding: + position: 3 + inFileOptional: + type: File? + inputBinding: + position: 4 + inStringOptional: + type: string? + inputBinding: + position: 5 + inIntOptional: + type: int? + inputBinding: + position: 6 + inIntOptional2: + type: int? 
+ default: 10 + inputBinding: + position: 6 +outputs: + out_stdout: + type: stdout + + + + diff --git a/janis_assistant/tests/data/cwl/subworkflow_test/main.cwl b/janis_assistant/tests/data/cwl/subworkflow_test/main.cwl new file mode 100644 index 0000000..ab67ba2 --- /dev/null +++ b/janis_assistant/tests/data/cwl/subworkflow_test/main.cwl @@ -0,0 +1,93 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "test workflow" +requirements: + - class: ScatterFeatureRequirement + - class: SubworkflowFeatureRequirement + - class: MultipleInputFeatureRequirement + +inputs: + inFile: + type: File + inFileOptional: + type: File? + inFileArr: + type: File[] + inFileArrOptional: + type: File[]? + inSecondary: + type: File + secondaryFiles: [.fai, ^.dict, .amb, .ann, .bwt, .pac, .sa] + inString: + type: string + inStringOptional: + type: string? + inStringOptional2: + type: string? + inStringArr: + type: string[] + inStringArrOptional: + type: string[]? + inInt: + type: int + inIntOptional: + type: int? + inIntOptional2: + type: int? + default: 5 + +outputs: + outFile: + type: File + outputSource: optional1/out_stdout + +steps: + basic: + run: basic.cwl + in: + inFile: inFile + inString: inString + inSecondary: inSecondary + inFileOptional: inFileOptional + inIntOptional: inIntOptional2 + out: + [out_stdout] + mandatory: + run: mandatory_input_types.cwl + in: + inFile: inFile + inFileArr: inFileArr + inSecondary: inSecondary + inString: inString + inStringArr: inStringArr + inInt: inInt + out: + [out_stdout] + optional1: + run: optional_input_types.cwl + in: + inFile: inFileOptional + inFileArr: inFileArrOptional + out: + [out_stdout] + optional2: + run: optional_input_types.cwl + in: + inString: inStringOptional + inStringArr: inStringArrOptional + inInt: inIntOptional + out: + [out_stdout] + sub: + run: subworkflow.cwl + in: + inFile: inFile + inFileArr: inFileArr + inSecondary: inSecondary + inString: inString + inStringArr: inStringArr + inInt: inInt + out: + [out_stdout] diff --git a/janis_assistant/tests/data/cwl/subworkflow_test/mandatory_input_types.cwl b/janis_assistant/tests/data/cwl/subworkflow_test/mandatory_input_types.cwl new file mode 100644 index 0000000..b7f7cfd --- /dev/null +++ b/janis_assistant/tests/data/cwl/subworkflow_test/mandatory_input_types.cwl @@ -0,0 +1,46 @@ + + +#!/usr/bin/env cwl-runner +class: CommandLineTool +cwlVersion: v1.2 +label: basic tool for testing mandatory inputs + +requirements: + - class: DockerRequirement + dockerPull: ubuntu:latest + +baseCommand: echo + +inputs: + inFile: + type: File + inputBinding: + position: 1 + inFileArr: + type: File[] + inputBinding: + position: 2 + inSecondary: + type: File + secondaryFiles: [.fai, ^.dict, .amb, .ann, .bwt, .pac, .sa] + inputBinding: + position: 3 + inString: + type: string + inputBinding: + position: 4 + inStringArr: + type: string[] + inputBinding: + position: 5 + inInt: + type: int + inputBinding: + position: 6 +outputs: + out_stdout: + type: stdout + + + + diff --git a/janis_assistant/tests/data/cwl/subworkflow_test/optional_input_types.cwl b/janis_assistant/tests/data/cwl/subworkflow_test/optional_input_types.cwl new file mode 100644 index 0000000..560ff9b --- /dev/null +++ b/janis_assistant/tests/data/cwl/subworkflow_test/optional_input_types.cwl @@ -0,0 +1,46 @@ + + +#!/usr/bin/env cwl-runner +class: CommandLineTool +cwlVersion: v1.2 +label: basic tool for testing optional inputs + +requirements: + - class: DockerRequirement + dockerPull: ubuntu:latest + +baseCommand: echo + +inputs: + 
inFile: + type: File? + inputBinding: + position: 1 + inFileArr: + type: File[]? + inputBinding: + position: 2 + inSecondary: + type: File? + secondaryFiles: [.fai, ^.dict, .amb, .ann, .bwt, .pac, .sa] + inputBinding: + position: 3 + inString: + type: string? + inputBinding: + position: 4 + inStringArr: + type: string[]? + inputBinding: + position: 5 + inInt: + type: int? + inputBinding: + position: 6 +outputs: + out_stdout: + type: stdout + + + + diff --git a/janis_assistant/tests/data/cwl/subworkflow_test/subworkflow.cwl b/janis_assistant/tests/data/cwl/subworkflow_test/subworkflow.cwl new file mode 100644 index 0000000..67e7529 --- /dev/null +++ b/janis_assistant/tests/data/cwl/subworkflow_test/subworkflow.cwl @@ -0,0 +1,38 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "test subworkflow" +requirements: + - class: ScatterFeatureRequirement + - class: SubworkflowFeatureRequirement + - class: MultipleInputFeatureRequirement + +inputs: + inFile: + type: File + inFileArr: + type: File[] + inSecondary: + type: File + secondaryFiles: [.fai, ^.dict, .amb, .ann, .bwt, .pac, .sa] + inString: + type: string + inStringArr: + type: string[] + inInt: + type: int + +outputs: + outFile: + type: File + outputSource: optional/out_stdout + +steps: + optional: + run: optional_input_types.cwl + in: + inFile: inFile + inFileArr: inFileArr + out: + [out_stdout] diff --git a/janis_assistant/tests/data/cwl/super_enhancer_wf.cwl b/janis_assistant/tests/data/cwl/super_enhancer_wf.cwl new file mode 100644 index 0000000..495a1c3 --- /dev/null +++ b/janis_assistant/tests/data/cwl/super_enhancer_wf.cwl @@ -0,0 +1,479 @@ +cwlVersion: v1.0 +class: Workflow + + +requirements: + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: MultipleInputFeatureRequirement + + +inputs: + + islands_file: + type: File + label: "Input peaks file" + format: "http://edamontology.org/format_3468" + doc: "Input XLS file generated by MACS2" + + islands_control_file: + type: File? 
+ label: "Control peaks file" + format: "http://edamontology.org/format_3468" + doc: "Control XLS file generated by MACS2" + + bambai_pair: + type: File + secondaryFiles: + - .bai + label: "Coordinate sorted BAM+BAI files" + format: "http://edamontology.org/format_2572" + doc: "Coordinate sorted BAM file and BAI index file" + + annotation_file: + type: File + label: "TSV annotation file" + format: "http://edamontology.org/format_3475" + doc: "TSV annotation file" + + chrom_length_file: + type: File + label: "Chromosome length file" + format: "http://edamontology.org/format_2330" + doc: "Chromosome length file" + + stitch_distance: + type: int + label: "Stitching distance" + doc: "Linking distance for stitching" + + tss_distance: + type: int + label: "TSS distance" + doc: "Distance from TSS to exclude, 0 = no TSS exclusion" + + promoter_bp: + type: int + label: "Promoter distance" + doc: "Promoter distance for gene names assignment" + + +outputs: + + png_file: + type: File + label: "ROSE visualization plot" + format: "http://edamontology.org/format_3603" + doc: "Generated by ROSE visualization plot" + outputSource: rename_png/target_file + + gene_names_file: + type: File + label: "Gateway Super Enhancer + gene names" + format: "http://edamontology.org/format_3475" + doc: "Gateway Super Enhancer results from ROSE with assigned gene names" + outputSource: add_island_names/output_file + + bigbed_file: + type: File + label: "Gateway Super Enhancer bigBed file" + format: "http://edamontology.org/format_3475" + doc: "Gateway Super Enhancer bigBed file" + outputSource: bed_to_bigbed/bigbed_file + + +steps: + + make_gff: + in: + islands_file: islands_file + islands_control_file: islands_control_file + out: [gff_file] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: DockerRequirement + dockerPull: biowardrobe2/rose:v0.0.2 + inputs: + islands_file: + type: File + inputBinding: + position: 5 + doc: Input XLS file generated by MACS2 + islands_control_file: + type: File? + inputBinding: + position: 7 + doc: Control XLS file generated by MACS2 + outputs: + gff_file: + type: File + outputBinding: + glob: "*" + baseCommand: ['makegff'] + arguments: + - valueFrom: + ${ + let root = inputs.islands_file.basename.split('.').slice(0,-1).join('.'); + return (root == "")?inputs.islands_file.basename+".gff":root+".gff"; + } + position: 6 + doc: Tool produces GFF file from XLS file generated by MACS2 + + run_rose: + in: + binding_sites_file: make_gff/gff_file + bam_file: bambai_pair + annotation_file: annotation_file + stitch_distance: stitch_distance + tss_distance: tss_distance + out: [plot_points_pic, gateway_super_enhancers_bed] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: DockerRequirement + dockerPull: biowardrobe2/rose:v0.0.2 + inputs: + binding_sites_file: + type: File + inputBinding: + position: 5 + prefix: "-i" + doc: GFF file of binding sites used to make enhancers + bam_file: + type: File + inputBinding: + position: 6 + prefix: "-r" + secondaryFiles: [".bai"] + doc: Indexed BAM+BAI file to rank enhancer by + annotation_file: + type: File + inputBinding: + position: 7 + prefix: "-g" + doc: TSV genome annotation file + stitch_distance: + type: int + inputBinding: + position: 8 + prefix: "-s" + doc: Linking distance for stitching + tss_distance: + type: int + inputBinding: + position: 9 + prefix: "-t" + doc: Distance from TSS to exclude. 
0 = no TSS exclusion + outputs: + plot_points_pic: + type: File + outputBinding: + glob: "*Plot_points.png" + gateway_super_enhancers_bed: + type: File + outputBinding: + glob: "*Gateway_SuperEnhancers.bed" + baseCommand: ['ROSE_main', '-o', './'] + doc: Tool runs ROSE to get Super Enhancers regions + + rename_png: + in: + source_file: run_rose/plot_points_pic + target_filename: + source: bambai_pair + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+"_default_s_enhcr.png") + out: [target_file] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + inputs: + source_file: + type: File + inputBinding: + position: 5 + doc: source file to rename + target_filename: + type: string + inputBinding: + position: 6 + doc: filename to rename to + outputs: + target_file: + type: File + outputBinding: + glob: "*" + baseCommand: ["cp"] + doc: Tool renames (copy) `source_file` to `target_filename` + + sort_bed: + in: + unsorted_file: run_rose/gateway_super_enhancers_bed + key: + default: ["1,1","2,2n","3,3n"] + out: [sorted_file] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: InlineJavascriptRequirement + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + inputs: + key: + type: + type: array + items: string + inputBinding: + prefix: "-k" + inputBinding: + position: 5 + doc: -k, --key=POS1[,POS2], start a key at POS1, end it at POS2 (origin 1) + unsorted_file: + type: File + inputBinding: + position: 6 + doc: File to be sorted + outputs: + sorted_file: + type: File + outputBinding: + glob: "*" + stdout: $(inputs.unsorted_file.location.split('/').slice(-1)[0]) + baseCommand: ["sort"] + doc: Tool sorts data from `unsorted_file` by `key` + + reduce_bed: + in: + input_file: sort_bed/sorted_file + out: [output_file] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + inputs: + input_file: + type: File + inputBinding: + position: 5 + doc: Input BED6 file to be reduced to BED4 + outputs: + output_file: + type: File + outputBinding: + glob: "*" + baseCommand: [bash, '-c'] + arguments: + - cat $0 | cut -f 1-4 > `basename $0` + doc: Tool converts BED6 to BED4 by reducing column numbers + + bed_to_bigbed: + in: + input_bed: reduce_bed/output_file + chrom_length_file: chrom_length_file + bed_type: + default: "bed4" + output_filename: + source: bambai_pair + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+"_default_s_enhcr.bb") + out: [bigbed_file] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: DockerRequirement + dockerPull: biowardrobe2/ucscuserapps:v358 + inputs: + bed_type: + type: string + inputBinding: + position: 5 + prefix: -type= + separate: false + doc: Type of BED file in a form of bedN[+[P]]. 
By default bed3 to three required BED fields + input_bed: + type: File + inputBinding: + position: 6 + doc: Input BED file + chrom_length_file: + type: File + inputBinding: + position: 7 + doc: Chromosome length files + output_filename: + type: string + inputBinding: + position: 8 + doc: Output filename + outputs: + bigbed_file: + type: File + outputBinding: + glob: "*" + baseCommand: ["bedToBigBed"] + doc: Tool converts bed to bigBed + + bed_to_macs: + in: + input_file: sort_bed/sorted_file + out: [output_file] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + inputs: + input_file: + type: File + inputBinding: + position: 5 + doc: Input file to be converted to MACS2 output format + outputs: + output_file: + type: File + outputBinding: + glob: "*" + baseCommand: [bash, '-c'] + arguments: + - cat $0 | grep -v "#" | awk + 'BEGIN {print "chr\tstart\tend\tlength\tabs_summit\tpileup\t-log10(pvalue)\tfold_enrichment\t-log10(qvalue)\tname"} + {print $1"\t"$2"\t"$3"\t"$3-$2+1"\t0\t0\t0\t0\t0\t"$4}' > `basename $0` + doc: Tool converts `input_file` to the format compatible with the input of iaintersect from `assign_genes` step + + assign_genes: + in: + input_filename: bed_to_macs/output_file + annotation_filename: annotation_file + promoter_bp: promoter_bp + out: [result_file] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: InlineJavascriptRequirement + - class: DockerRequirement + dockerPull: biowardrobe2/iaintersect:v0.0.2 + inputs: + input_filename: + type: File + inputBinding: + prefix: --in= + separate: false + doc: Input filename with MACS2 peak calling results, tsv + annotation_filename: + type: File + inputBinding: + prefix: --a= + separate: false + doc: Annotation file, tsv + promoter_bp: + type: int + inputBinding: + prefix: --promoter= + separate: false + doc: Promoter region around TSS, base pairs + outputs: + result_file: + type: File + outputBinding: + glob: "*_iaintersect.tsv" + baseCommand: [iaintersect] + arguments: + - valueFrom: | + ${ + let root = inputs.input_filename.basename.split('.').slice(0,-1).join('.'); + return (root == "")?inputs.input_filename.basename+"_iaintersect.tsv":root+"_iaintersect.tsv"; + } + prefix: --out= + separate: false + doc: Tool assigns each peak obtained from MACS2 to a gene and region (upstream, promoter, exon, intron, intergenic) + + add_island_names: + in: + input_file: [assign_genes/result_file, sort_bed/sorted_file] + param: + source: bambai_pair + valueFrom: $(self.location.split('/').slice(-1)[0].split('.').slice(0,-1).join('.')+"_default_s_enhcr.tsv") + out: [output_file] + run: + cwlVersion: v1.0 + class: CommandLineTool + requirements: + - class: DockerRequirement + dockerPull: biowardrobe2/scidap:v0.0.3 + inputs: + input_file: + type: File[] + inputBinding: + position: 5 + doc: TSV file to add extra columns too + param: + type: string + inputBinding: + position: 6 + doc: Param to set output filename + outputs: + output_file: + type: File + outputBinding: + glob: "*" + baseCommand: [bash, '-c'] + arguments: + - echo -e "refseq_id\tgene_id\ttxStart\ttxEnd\tstrand\tchrom\tstart\tend\tlength\tregion\tname\tscore" > `basename $2`; + cat $0 | grep -v refseq_id | paste - $1 | cut -f 1-9,15,19,20 >> `basename $2` + +$namespaces: + s: http://schema.org/ + +$schemas: +- http://schema.org/docs/schema_org_rdfa.html + +s:name: "super-enhancer" +s:downloadUrl: 
https://raw.githubusercontent.com/michael-kotliar/biowardrobe-airflow-plugins/cwls/super-enhancer.cwl +s:codeRepository: https://github.com/michael-kotliar/biowardrobe-airflow-plugins +s:license: http://www.apache.org/licenses/LICENSE-2.0 + +s:isPartOf: + class: s:CreativeWork + s:name: Common Workflow Language + s:url: http://commonwl.org/ + +s:creator: +- class: s:Organization + s:legalName: "Cincinnati Children's Hospital Medical Center" + s:location: + - class: s:PostalAddress + s:addressCountry: "USA" + s:addressLocality: "Cincinnati" + s:addressRegion: "OH" + s:postalCode: "45229" + s:streetAddress: "3333 Burnet Ave" + s:telephone: "+1(513)636-4200" + s:logo: "https://www.cincinnatichildrens.org/-/media/cincinnati%20childrens/global%20shared/childrens-logo-new.png" + s:department: + - class: s:Organization + s:legalName: "Allergy and Immunology" + s:department: + - class: s:Organization + s:legalName: "Barski Research Lab" + s:member: + - class: s:Person + s:name: Michael Kotliar + s:email: mailto:misha.kotliar@gmail.com + s:sameAs: + - id: http://orcid.org/0000-0002-6486-3898 + +doc: | + Workflow to run Super Enhancer Analysis + +s:about: | + Workflow to run Super Enhancer Analysis \ No newline at end of file diff --git a/janis_assistant/tests/data/galaxy/abricate_wf.ga b/janis_assistant/tests/data/galaxy/abricate_wf.ga new file mode 100644 index 0000000..2901da0 --- /dev/null +++ b/janis_assistant/tests/data/galaxy/abricate_wf.ga @@ -0,0 +1,104 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "my babricate workflow", + "format-version": "0.1", + "name": "abricate", + "steps": { + "0": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "in_fasta" + } + ], + "label": "in_fasta", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 330.2253723144531, + "height": 61.752044677734375, + "left": 447.98150634765625, + "right": 647.9814910888672, + "top": 268.47332763671875, + "width": 199.99998474121094, + "x": 447.98150634765625, + "y": 268.47332763671875 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fasta\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "58a6d4ba-d77b-4355-99cf-84da4da30e53", + "workflow_outputs": [ + { + "label": null, + "output_name": "output", + "uuid": "ca826a63-be54-4e04-8b62-21fd1d6b4c57" + } + ] + }, + "1": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/abricate/abricate/1.0.1", + "errors": null, + "id": 1, + "input_connections": { + "file_input": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool ABRicate", + "name": "file_input" + } + ], + "label": null, + "name": "ABRicate", + "outputs": [ + { + "name": "report", + "type": "tabular" + } + ], + "position": { + "bottom": 423.73973083496094, + "height": 154.3032684326172, + "left": 788.954833984375, + "right": 988.9548187255859, + "top": 269.43646240234375, + "width": 199.99998474121094, + "x": 788.954833984375, + "y": 269.43646240234375 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/abricate/abricate/1.0.1", + "tool_shed_repository": { + "changeset_revision": "c2ef298da409", + "name": "abricate", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"adv\": {\"db\": \"resfinder\", \"no_header\": \"false\", \"min_dna_id\": \"80.0\", \"min_cov\": \"80.0\"}, \"file_input\": {\"__class__\": 
\"RuntimeValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.0.1", + "type": "tool", + "uuid": "1afee518-0912-4c81-bfe2-ea282a848aa8", + "workflow_outputs": [ + { + "label": "ABRicate on input dataset(s) report file", + "output_name": "report", + "uuid": "15cb9b26-c0a9-401d-9948-f460d90187e0" + } + ] + } + }, + "tags": [], + "uuid": "933d732e-61a5-4350-8c15-e0d979c5ad16", + "version": 3 +} \ No newline at end of file diff --git a/janis_assistant/tests/data/galaxy/cutadapt_wf.ga b/janis_assistant/tests/data/galaxy/cutadapt_wf.ga new file mode 100644 index 0000000..4ebfca7 --- /dev/null +++ b/janis_assistant/tests/data/galaxy/cutadapt_wf.ga @@ -0,0 +1,141 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "", + "format-version": "0.1", + "name": "cutadapt_wf", + "steps": { + "0": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "forward" + } + ], + "label": "forward", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 0.99383544921875, + "top": 0 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fastq\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "d6925ea7-170c-4c3c-9121-8c651cabe3ae", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "reverse" + } + ], + "label": "reverse", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 0, + "top": 103.97540283203125 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fastq\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "c305e6bf-6d5c-4c5a-b078-ffafddb2137c", + "when": null, + "workflow_outputs": [] + }, + "2": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/lparsons/cutadapt/cutadapt/4.0+galaxy1", + "errors": null, + "id": 2, + "input_connections": { + "library|input_1": { + "id": 0, + "output_name": "output" + }, + "library|input_2": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Cutadapt", + "name": "library" + }, + { + "description": "runtime parameter for tool Cutadapt", + "name": "library" + } + ], + "label": null, + "name": "Cutadapt", + "outputs": [ + { + "name": "out1", + "type": "fastqsanger" + }, + { + "name": "out2", + "type": "fastqsanger" + }, + { + "name": "report", + "type": "txt" + } + ], + "position": { + "left": 243.96514892578125, + "top": 32.3052978515625 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/lparsons/cutadapt/cutadapt/4.0+galaxy1", + "tool_shed_repository": { + "changeset_revision": "135b80fb1ac2", + "name": "cutadapt", + "owner": "lparsons", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"adapter_options\": {\"action\": \"trim\", \"internal\": \"\", \"error_rate\": \"0.1\", \"no_indels\": false, \"times\": \"1\", \"overlap\": \"3\", \"match_read_wildcards\": \" \", \"revcomp\": false}, \"filter_options\": {\"discard_trimmed\": false, \"discard_untrimmed\": false, \"minimum_length\": null, \"maximum_length\": null, \"length_R2_options\": {\"length_R2_status\": \"False\", \"__current_case__\": 1}, \"max_n\": null, \"pair_filter\": \"any\", \"max_expected_errors\": null, \"discard_cassava\": false}, \"library\": {\"type\": \"paired\", \"__current_case__\": 1, 
\"input_1\": {\"__class__\": \"RuntimeValue\"}, \"input_2\": {\"__class__\": \"RuntimeValue\"}, \"r1\": {\"adapters\": [{\"__index__\": 0, \"adapter_source\": {\"adapter_source_list\": \"user\", \"__current_case__\": 0, \"adapter_name\": \"fwd\", \"adapter\": \"AATTGGCC\"}, \"single_noindels\": false}], \"front_adapters\": [], \"anywhere_adapters\": [], \"cut\": \"0\"}, \"r2\": {\"adapters2\": [{\"__index__\": 0, \"adapter_source2\": {\"adapter_source_list2\": \"user\", \"__current_case__\": 0, \"adapter_name2\": \"rev\", \"adapter2\": \"AATTGGCC\"}, \"single_noindels\": false}], \"front_adapters2\": [], \"anywhere_adapters2\": [], \"cut2\": \"0\", \"quality_cutoff2\": \"\"}}, \"output_selector\": [\"report\"], \"read_mod_options\": {\"quality_cutoff\": \"0\", \"nextseq_trim\": \"0\", \"trim_n\": false, \"strip_suffix\": \"\", \"shorten_options\": {\"shorten_values\": \"False\", \"__current_case__\": 1}, \"length_tag\": \"\", \"rename\": \"\", \"zero_cap\": false}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "4.0+galaxy1", + "type": "tool", + "uuid": "8b012d09-4090-4d5c-b3f5-2f7d3e658970", + "when": null, + "workflow_outputs": [ + { + "label": null, + "output_name": "report", + "uuid": "5541c5d2-9d73-442d-ad23-c02606329d6d" + }, + { + "label": "Cutadapt on input dataset(s): Read 1 Output", + "output_name": "out1", + "uuid": "4490227c-3f39-4f43-915c-367d499b091e" + }, + { + "label": null, + "output_name": "out2", + "uuid": "c83a39d3-e9a1-4163-9d47-469d603d605c" + } + ] + } + }, + "tags": [], + "uuid": "f9001d73-1606-4edc-aa3d-ac9bbc80433f", + "version": 3 +} \ No newline at end of file diff --git a/janis_assistant/tests/data/galaxy/fastqc-5ec9f6bceaee/.hg_archival.txt b/janis_assistant/tests/data/galaxy/fastqc-5ec9f6bceaee/.hg_archival.txt new file mode 100644 index 0000000..d60ed2e --- /dev/null +++ b/janis_assistant/tests/data/galaxy/fastqc-5ec9f6bceaee/.hg_archival.txt @@ -0,0 +1,6 @@ +repo: e28c965eeed4adeb19cb086d1e0f5b3ca6dc8a5d +node: 5ec9f6bceaee7c629268142c0b7bb8f7ab05b51b +branch: default +latesttag: null +latesttagdistance: 24 +changessincelatesttag: 24 diff --git a/janis_assistant/tests/data/galaxy/fastqc-5ec9f6bceaee/rgFastQC.xml b/janis_assistant/tests/data/galaxy/fastqc-5ec9f6bceaee/rgFastQC.xml new file mode 100644 index 0000000..7c2bbf1 --- /dev/null +++ b/janis_assistant/tests/data/galaxy/fastqc-5ec9f6bceaee/rgFastQC.xml @@ -0,0 +1,220 @@ + + Read Quality reports + + fastqc + + + fastqc + + + + + + + '' + --contaminants '${contaminants}' + #end if + + #if $adapters.dataset and str($adapters) > '' + --adapters '${adapters}' + #end if + + #if $limits.dataset and str($limits) > '' + --limits '${limits}' + #end if + --threads \${GALAXY_SLOTS:-2} + --quiet + --extract + #if $min_length: + --min_length $min_length + #end if + $nogroup + --kmers $kmers + -f '${format}' + '${input_file_sl}' + + && cp '${html_file.files_path}'/*/fastqc_data.txt output.txt + && cp '${html_file.files_path}'/*\.html output.html + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @unpublished{andrews_s, + author = {Andrews, S.}, + keywords = {bioinformatics, ngs, qc}, + priority = {2}, + title = {{FastQC A Quality Control tool for High Throughput Sequence Data}}, + url = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/} + } + + + diff --git a/janis_assistant/tests/data/galaxy/hisat2_wf.ga b/janis_assistant/tests/data/galaxy/hisat2_wf.ga new file mode 100644 index 
0000000..89f9231 --- /dev/null +++ b/janis_assistant/tests/data/galaxy/hisat2_wf.ga @@ -0,0 +1,123 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "", + "format-version": "0.1", + "name": "hisat2_wf", + "steps": { + "0": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "read1" + } + ], + "label": "read1", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 0.0, + "top": 0.0 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fastqsanger\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "c38e69e9-c6e2-4a4d-89d5-b72bbe3b28de", + "when": null, + "workflow_outputs": [] + }, + "1": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "read2" + } + ], + "label": "read2", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 2.520477294921875, + "top": 79.31350708007812 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fastqsanger\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "2c369543-cc2f-4bb0-bb06-45f9724042c5", + "when": null, + "workflow_outputs": [] + }, + "2": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/hisat2/hisat2/2.2.1+galaxy1", + "errors": null, + "id": 2, + "input_connections": { + "library|input_1": { + "id": 0, + "output_name": "output" + }, + "library|input_2": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool HISAT2", + "name": "library" + }, + { + "description": "runtime parameter for tool HISAT2", + "name": "library" + } + ], + "label": null, + "name": "HISAT2", + "outputs": [ + { + "name": "output_alignments", + "type": "bam" + } + ], + "position": { + "left": 316.78271484375, + "top": 27.008209228515625 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/hisat2/hisat2/2.2.1+galaxy1", + "tool_shed_repository": { + "changeset_revision": "f4af63aaf57a", + "name": "hisat2", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"adv\": {\"input_options\": {\"input_options_selector\": \"defaults\", \"__current_case__\": 0}, \"alignment_options\": {\"alignment_options_selector\": \"defaults\", \"__current_case__\": 0}, \"scoring_options\": {\"scoring_options_selector\": \"defaults\", \"__current_case__\": 0}, \"spliced_options\": {\"spliced_options_selector\": \"defaults\", \"__current_case__\": 0}, \"reporting_options\": {\"reporting_options_selector\": \"defaults\", \"__current_case__\": 0}, \"output_options\": {\"output_options_selector\": \"defaults\", \"__current_case__\": 0}, \"sam_options\": {\"sam_options_selector\": \"defaults\", \"__current_case__\": 0}, \"other_options\": {\"other_options_selector\": \"defaults\", \"__current_case__\": 0}}, \"library\": {\"type\": \"paired\", \"__current_case__\": 1, \"input_1\": {\"__class__\": \"RuntimeValue\"}, \"input_2\": {\"__class__\": \"RuntimeValue\"}, \"rna_strandness\": \"\", \"paired_options\": {\"paired_options_selector\": \"defaults\", \"__current_case__\": 0}}, \"reference_genome\": {\"source\": \"indexed\", \"__current_case__\": 0, \"index\": \"mm10\"}, \"sum\": {\"new_summary\": false, \"summary_file\": false}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "2.2.1+galaxy1", + "type": "tool", + "uuid": 
"758da7a3-8bc9-415b-b221-0e850efab66d", + "when": null, + "workflow_outputs": [ + { + "label": null, + "output_name": "output_alignments", + "uuid": "e988b2f4-5a34-4539-a2a6-fdcf1ba2cae4" + } + ] + } + }, + "tags": [], + "uuid": "56329289-8ed2-4b2c-9fca-7e6fba601e0d", + "version": 1 +} \ No newline at end of file diff --git a/janis_assistant/tests/data/galaxy/unicycler_assembly.ga b/janis_assistant/tests/data/galaxy/unicycler_assembly.ga new file mode 100644 index 0000000..aa332bb --- /dev/null +++ b/janis_assistant/tests/data/galaxy/unicycler_assembly.ga @@ -0,0 +1,488 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "", + "format-version": "0.1", + "name": "Unicycler Assembly", + "steps": { + "0": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "short_R1" + } + ], + "label": "short_R1", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 3.9754087149161705, + "top": 453.28884880181624 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fastqsanger\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "03640f53-ba88-4f80-9745-dc7d38f72c5e", + "workflow_outputs": [] + }, + "1": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "short_R2" + } + ], + "label": "short_R2", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 1.0348752702617503, + "top": 541.5060816266651 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fastqsanger\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "ce3f01a5-617b-402e-add8-543a7c612e00", + "workflow_outputs": [] + }, + "2": { + "annotation": "", + "content_id": null, + "errors": null, + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "", + "name": "long" + } + ], + "label": "long", + "name": "Input dataset", + "outputs": [], + "position": { + "left": 0.0, + "top": 628.5142913824635 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fastqsanger\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "5fdc6024-93ed-48a0-b195-038f8291e897", + "workflow_outputs": [] + }, + "3": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.73+galaxy0", + "errors": null, + "id": 3, + "input_connections": { + "input_file": { + "id": 0, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool FastQC", + "name": "adapters" + }, + { + "description": "runtime parameter for tool FastQC", + "name": "contaminants" + }, + { + "description": "runtime parameter for tool FastQC", + "name": "input_file" + }, + { + "description": "runtime parameter for tool FastQC", + "name": "limits" + } + ], + "label": null, + "name": "FastQC", + "outputs": [ + { + "name": "html_file", + "type": "html" + }, + { + "name": "text_file", + "type": "txt" + } + ], + "position": { + "left": 299.96923213247317, + "top": 2.4487613134046455 + }, + "post_job_actions": { + "HideDatasetActiontext_file": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "text_file" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.73+galaxy0", + "tool_shed_repository": { + "changeset_revision": "3d0c7bdf12f5", + "name": "fastqc", + "owner": "devteam", + "tool_shed": 
"toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"adapters\": {\"__class__\": \"RuntimeValue\"}, \"contaminants\": {\"__class__\": \"RuntimeValue\"}, \"input_file\": {\"__class__\": \"RuntimeValue\"}, \"kmers\": \"7\", \"limits\": {\"__class__\": \"RuntimeValue\"}, \"min_length\": null, \"nogroup\": \"false\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.73+galaxy0", + "type": "tool", + "uuid": "1450d138-93ec-47db-960e-78905af995b1", + "workflow_outputs": [ + { + "label": "FastQC on input dataset(s): Webpage", + "output_name": "html_file", + "uuid": "25238fe4-0880-4b36-a7fc-74dbc46e0576" + } + ] + }, + "4": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.73+galaxy0", + "errors": null, + "id": 4, + "input_connections": { + "input_file": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool FastQC", + "name": "adapters" + }, + { + "description": "runtime parameter for tool FastQC", + "name": "contaminants" + }, + { + "description": "runtime parameter for tool FastQC", + "name": "limits" + } + ], + "label": null, + "name": "FastQC", + "outputs": [ + { + "name": "html_file", + "type": "html" + }, + { + "name": "text_file", + "type": "txt" + } + ], + "position": { + "left": 520.9118529795632, + "top": 0.0 + }, + "post_job_actions": { + "HideDatasetActiontext_file": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "text_file" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.73+galaxy0", + "tool_shed_repository": { + "changeset_revision": "3d0c7bdf12f5", + "name": "fastqc", + "owner": "devteam", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"adapters\": {\"__class__\": \"RuntimeValue\"}, \"contaminants\": {\"__class__\": \"RuntimeValue\"}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"kmers\": \"7\", \"limits\": {\"__class__\": \"RuntimeValue\"}, \"min_length\": null, \"nogroup\": \"false\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.73+galaxy0", + "type": "tool", + "uuid": "d6a70043-c6d3-42ed-a9da-96c84c7fc802", + "workflow_outputs": [ + { + "label": null, + "output_name": "html_file", + "uuid": "3d489e1f-ad51-481d-be74-4e9a50bc871c" + } + ] + }, + "5": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/unicycler/unicycler/0.4.8.0", + "errors": null, + "id": 5, + "input_connections": { + "long": { + "id": 2, + "output_name": "output" + }, + "paired_unpaired|fastq_input1": { + "id": 0, + "output_name": "output" + }, + "paired_unpaired|fastq_input2": { + "id": 1, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool Create assemblies with Unicycler", + "name": "long" + }, + { + "description": "runtime parameter for tool Create assemblies with Unicycler", + "name": "lr_align" + }, + { + "description": "runtime parameter for tool Create assemblies with Unicycler", + "name": "paired_unpaired" + }, + { + "description": "runtime parameter for tool Create assemblies with Unicycler", + "name": "paired_unpaired" + }, + { + "description": "runtime parameter for tool Create assemblies with Unicycler", + "name": "rotation" + } + ], + "label": null, + "name": "Create assemblies with Unicycler", + "outputs": [ + { + "name": "assembly_graph", + "type": "gfa1" + }, + { + "name": "assembly", + "type": "fasta" + } + ], + "position": { + "left": 503.34013809490136, + "top": 381.4343956129509 + }, + 
"post_job_actions": { + "HideDatasetActionassembly_graph": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "assembly_graph" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/unicycler/unicycler/0.4.8.0", + "tool_shed_repository": { + "changeset_revision": "9e3e80cc4ad4", + "name": "unicycler", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"graph_clean\": {\"min_component_size\": \"1000\", \"min_dead_end_size\": \"1000\"}, \"linear_seqs\": \"0\", \"long\": {\"__class__\": \"RuntimeValue\"}, \"lr_align\": {\"contamination\": {\"__class__\": \"RuntimeValue\"}, \"scores\": \"3,-6,-5,-2\", \"low_score\": null}, \"min_anchor_seg_len\": null, \"min_fasta_length\": \"500\", \"mode\": \"normal\", \"paired_unpaired\": {\"fastq_input_selector\": \"paired\", \"__current_case__\": 0, \"fastq_input1\": {\"__class__\": \"RuntimeValue\"}, \"fastq_input2\": {\"__class__\": \"RuntimeValue\"}}, \"pilon\": {\"no_pilon\": \"false\", \"min_polish_size\": \"1000\"}, \"rotation\": {\"no_rotate\": \"false\", \"start_genes\": {\"__class__\": \"RuntimeValue\"}, \"start_gene_id\": \"90.0\", \"start_gene_cov\": \"95.0\"}, \"spades\": {\"no_correct\": \"false\", \"min_kmer_frac\": \"0.2\", \"max_kmer_frac\": \"0.95\", \"kmers\": \"\", \"kmer_count\": \"10\", \"depth_filter\": \"0.25\", \"largest_component\": \"false\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.4.8.0", + "type": "tool", + "uuid": "f827fd85-7920-41bb-8342-c9b2051770c2", + "workflow_outputs": [ + { + "label": "Create assemblies with Unicycler on input dataset(s): Final Assembly", + "output_name": "assembly", + "uuid": "e6632bf6-1c4a-4093-b517-1c86e160c678" + } + ] + }, + "6": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/nanoplot/nanoplot/1.28.2+galaxy1", + "errors": null, + "id": 6, + "input_connections": { + "mode|reads|files": { + "id": 2, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "NanoPlot", + "outputs": [ + { + "name": "output_html", + "type": "html" + }, + { + "name": "nanostats", + "type": "txt" + }, + { + "name": "nanostats_post_filtering", + "type": "txt" + }, + { + "name": "read_length", + "type": "png" + }, + { + "name": "log_read_length", + "type": "png" + } + ], + "position": { + "left": 739.0983192541909, + "top": 0.9835881726859839 + }, + "post_job_actions": { + "HideDatasetActionlog_read_length": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "log_read_length" + }, + "HideDatasetActionnanostats": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "nanostats" + }, + "HideDatasetActionnanostats_post_filtering": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "nanostats_post_filtering" + }, + "HideDatasetActionread_length": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "read_length" + } + }, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/nanoplot/nanoplot/1.28.2+galaxy1", + "tool_shed_repository": { + "changeset_revision": "edbb6c5028f5", + "name": "nanoplot", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"customization\": {\"color\": null, \"format\": \"png\", \"plots\": null, \"N50\": \"false\"}, \"filter\": {\"maxlength\": null, \"minlength\": null, \"drop_outliers\": \"false\", \"downsample\": null, \"loglength\": \"false\", \"percentqual\": \"false\", \"alength\": \"false\", \"minqual\": null, 
\"readtype\": null, \"barcoded\": \"false\"}, \"mode\": {\"choice\": \"batch\", \"__current_case__\": 0, \"reads\": {\"type\": \"fastq\", \"__current_case__\": 0, \"files\": {\"__class__\": \"ConnectedValue\"}}}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "1.28.2+galaxy1", + "type": "tool", + "uuid": "3d1ad8c6-4fde-48b5-943f-4383e4b4fe5b", + "workflow_outputs": [ + { + "label": "NanoPlot on input dataset(s): HTML report", + "output_name": "output_html", + "uuid": "4b13efe5-0fc6-4db6-9c22-854f321833d3" + } + ] + }, + "7": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/quast/quast/5.0.2+galaxy5", + "errors": null, + "id": 7, + "input_connections": { + "in|inputs": { + "id": 5, + "output_name": "assembly" + } + }, + "inputs": [], + "label": null, + "name": "Quast", + "outputs": [ + { + "name": "report_html", + "type": "html" + } + ], + "position": { + "left": 926.0962584174182, + "top": 475.95282798944174 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/quast/quast/5.0.2+galaxy5", + "tool_shed_repository": { + "changeset_revision": "675488238c96", + "name": "quast", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"advanced\": {\"contig_thresholds\": \"0,1000\", \"strict_NA\": \"false\", \"extensive_mis_size\": \"1000\", \"scaffold_gap_max_size\": \"1000\", \"unaligned_part_size\": \"500\", \"skip_unaligned_mis_contigs\": \"true\", \"fragmented_max_indent\": null}, \"alignments\": {\"use_all_alignments\": \"false\", \"min_alignment\": \"65\", \"min_identity\": \"95.0\", \"ambiguity_usage\": \"one\", \"ambiguity_score\": \"0.99\", \"fragmented\": \"false\", \"upper_bound_assembly\": \"false\", \"upper_bound_min_con\": null}, \"assembly\": {\"type\": \"genome\", \"__current_case__\": 0, \"ref\": {\"use_ref\": \"false\", \"__current_case__\": 1, \"est_ref_size\": null}, \"orga_type\": \"\"}, \"genes\": {\"gene_finding\": {\"tool\": \"none\", \"__current_case__\": 0}, \"rna_finding\": \"false\", \"conserved_genes_finding\": \"false\"}, \"in\": {\"custom\": \"false\", \"__current_case__\": 1, \"inputs\": {\"__class__\": \"ConnectedValue\"}}, \"large\": \"false\", \"min_contig\": \"500\", \"output_files\": [\"html\"], \"reads\": {\"reads_option\": \"disabled\", \"__current_case__\": 0}, \"split_scaffolds\": \"false\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "5.0.2+galaxy5", + "type": "tool", + "uuid": "2a71ea4f-1311-459e-905f-c42bdbef3f33", + "workflow_outputs": [ + { + "label": "Quast on input dataset(s): HTML report", + "output_name": "report_html", + "uuid": "63c115b7-5b97-4291-b280-3a28487d0452" + } + ] + }, + "8": { + "annotation": "", + "content_id": "toolshed.g2.bx.psu.edu/repos/iuc/busco/busco/5.3.2+galaxy0", + "errors": null, + "id": 8, + "input_connections": { + "input": { + "id": 5, + "output_name": "assembly" + } + }, + "inputs": [], + "label": null, + "name": "Busco", + "outputs": [ + { + "name": "busco_sum", + "type": "txt" + }, + { + "name": "busco_table", + "type": "tabular" + } + ], + "position": { + "left": 925.942572765436, + "top": 599.928233191543 + }, + "post_job_actions": {}, + "tool_id": "toolshed.g2.bx.psu.edu/repos/iuc/busco/busco/5.3.2+galaxy0", + "tool_shed_repository": { + "changeset_revision": "41030a6c03b7", + "name": "busco", + "owner": "iuc", + "tool_shed": "toolshed.g2.bx.psu.edu" + }, + "tool_state": "{\"adv\": {\"evalue\": \"0.001\", \"limit\": \"3\"}, \"busco_mode\": {\"mode\": \"geno\", \"__current_case__\": 0, 
\"use_augustus\": {\"use_augustus_selector\": \"no\", \"__current_case__\": 0}}, \"input\": {\"__class__\": \"ConnectedValue\"}, \"lineage\": {\"lineage_mode\": \"auto_detect\", \"__current_case__\": 0, \"auto_lineage\": \"--auto-lineage\"}, \"outputs\": [\"short_summary\"], \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "5.3.2+galaxy0", + "type": "tool", + "uuid": "896f095b-e08a-47b2-9173-1ca913ffb0dc", + "workflow_outputs": [ + { + "label": "Busco on input dataset(s): short summary", + "output_name": "busco_sum", + "uuid": "38b54017-0299-4515-93c8-eff788fd1e8c" + }, + { + "label": "Busco on input dataset(s): full table", + "output_name": "busco_table", + "uuid": "2b1d658e-9a8d-4131-bb59-ac502af97431" + } + ] + } + }, + "tags": [], + "uuid": "26da9aea-1aaf-488d-adf8-b90e1d7a9e61", + "version": 6 +} \ No newline at end of file diff --git a/janis_assistant/tests/data/janis/gatk_haplotype_caller_tool.py b/janis_assistant/tests/data/janis/gatk_haplotype_caller_tool.py new file mode 100644 index 0000000..09e648d --- /dev/null +++ b/janis_assistant/tests/data/janis/gatk_haplotype_caller_tool.py @@ -0,0 +1,7 @@ + + +from janis_bioinformatics.tools.gatk4 import Gatk4HaplotypeCaller_4_1_3 + + +class Gatk4HaplotypeCaller_4_1_3_Test(Gatk4HaplotypeCaller_4_1_3): + pass \ No newline at end of file diff --git a/janis_assistant/tests/data/janis/germline_variant_caller_wf.py b/janis_assistant/tests/data/janis/germline_variant_caller_wf.py new file mode 100644 index 0000000..63d8637 --- /dev/null +++ b/janis_assistant/tests/data/janis/germline_variant_caller_wf.py @@ -0,0 +1,7 @@ + + +from janis_bioinformatics.tools.variantcallers import IlluminaGermlineVariantCaller + + +class IlluminaGermlineVariantCaller_Test(IlluminaGermlineVariantCaller): + pass \ No newline at end of file diff --git a/janis_assistant/tests/data/janis/samtools_flagstat_tool.py b/janis_assistant/tests/data/janis/samtools_flagstat_tool.py new file mode 100644 index 0000000..477a839 --- /dev/null +++ b/janis_assistant/tests/data/janis/samtools_flagstat_tool.py @@ -0,0 +1,7 @@ + + +from janis_bioinformatics.tools.samtools import SamToolsFlagstat_1_9 + + +class SamToolsFlagstat_1_9_Test(SamToolsFlagstat_1_9): + pass \ No newline at end of file diff --git a/janis_assistant/tests/data/wdl/Multisample_jointgt_GATK4.wdl b/janis_assistant/tests/data/wdl/Multisample_jointgt_GATK4.wdl new file mode 100644 index 0000000..419e1b7 --- /dev/null +++ b/janis_assistant/tests/data/wdl/Multisample_jointgt_GATK4.wdl @@ -0,0 +1,850 @@ +## Copyright Broad Institute, 2018 +## +## This WDL implements the joint discovery and VQSR filtering portion of the GATK +## Best Practices (June 2016) for germline SNP and Indel discovery in human +## whole-genome sequencing (WGS) and exome sequencing data. +## +## Requirements/expectations : +## - One or more GVCFs produced by HaplotypeCaller in GVCF mode +## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. +## +## Outputs : +## - A VCF file and its index, filtered using variant quality score recalibration +## (VQSR) with genotypes for all samples present in the input VCF. All sites that +## are present in the input VCF are retained; filtered sites are annotated as such +## in the FILTER field. +## +## Note about VQSR wiring : +## The SNP and INDEL models are built in parallel, but then the corresponding +## recalibrations are applied in series. 
Because the INDEL model is generally ready +## first (because there are fewer indels than SNPs) we set INDEL recalibration to +## be applied first to the input VCF, while the SNP model is still being built. By +## the time the SNP model is available, the indel-recalibrated file is available to +## serve as input to apply the SNP recalibration. If we did it the other way around, +## we would have to wait until the SNP recal file was available despite the INDEL +## recal file being there already, then apply SNP recalibration, then apply INDEL +## recalibration. This would lead to a longer wall clock time for complete workflow +## execution. Wiring the INDEL recalibration to be applied first solves the problem. +## +## Cromwell version support +## - Successfully tested on v31 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +## Adapted to Yale Ruddle HPC by Sander Pajusalu (sander.pajusalu@yale.edu) + + +workflow JointGenotyping { + File unpadded_intervals_file + + String callset_name + + File ref_fasta + File ref_fasta_index + File ref_dict + + File dbsnp_vcf + File dbsnp_vcf_index + + File sample_sheet + + Array[String] snp_recalibration_tranche_values + Array[String] snp_recalibration_annotation_values + Array[String] indel_recalibration_tranche_values + Array[String] indel_recalibration_annotation_values + + File eval_interval_list + File hapmap_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf + File one_thousand_genomes_resource_vcf_index + File mills_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf = dbsnp_vcf + File dbsnp_resource_vcf_index = dbsnp_vcf_index + + # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme + # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 + Float excess_het_threshold = 54.69 + Float snp_filter_level + Float indel_filter_level + Int SNP_VQSR_downsampleFactor + + Int num_of_original_intervals = length(read_lines(unpadded_intervals_file)) + + + # Make a 2.5:1 interval number to samples in callset ratio interval list + Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5) + Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1 + + call samples { + input: + samples = sample_sheet + } + + Int num_gvcfs = length(read_lines(samples.input_gvcfs)) + + call DynamicallyCombineIntervals { + input: + intervals = unpadded_intervals_file, + merge_count = merge_count + } + + Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) + + scatter (idx in range(length(unpadded_intervals))) { + # the batch_size value was carefully chosen here as it + # is the optimal value for the amount of memory allocated + # within the task; please do not change it without consulting + # the Hellbender (GATK engine) team! + call ImportGVCFs { + input: + sample_names = read_lines(samples.sample_names), + interval = unpadded_intervals[idx], + workspace_dir_name = "genomicsdb", + input_gvcfs = read_lines(samples.input_gvcfs), + input_gvcfs_indices = read_lines(samples.input_gvcfs_indices), + batch_size = 50 + } + + call GenotypeGVCFs { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = unpadded_intervals[idx], + output_vcf_filename = "output.vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index + } + + call HardFilterAndMakeSitesOnlyVcf { + input: + vcf = GenotypeGVCFs.output_vcf, + vcf_index = GenotypeGVCFs.output_vcf_index, + excess_het_threshold = excess_het_threshold, + variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", + sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz" + } + } + + call GatherVcfs as SitesOnlyGatherVcf { + input: + input_vcfs_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, + input_vcf_indexes_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index, + output_vcf_name = callset_name + ".sites_only.vcf.gz" + } + + call IndelsVariantRecalibrator { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".indels.recal", + tranches_filename = callset_name + ".indels.tranches", + recalibration_tranche_values = indel_recalibration_tranche_values, + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index + } + + if (num_gvcfs > 10000) { + call SNPsVariantRecalibratorCreateModel { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + downsampleFactor = SNP_VQSR_downsampleFactor, + model_report_filename = callset_name + ".snps.model.report", + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { + call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], + sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], + recalibration_filename = callset_name + ".snps." + idx + ".recal", + tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + model_report = SNPsVariantRecalibratorCreateModel.model_report, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index + } + } + call GatherTranches as SNPGatherTranches { + input: + input_fofn = SNPsVariantRecalibratorScattered.tranches, + output_filename = callset_name + ".snps.gathered.tranches" + } + } + + if (num_gvcfs <= 10000){ + call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index + } + } + + # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. + # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. + Boolean is_small_callset = num_gvcfs <= 1000 + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { + call ApplyRecalibration { + input: + recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", + input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], + input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], + indels_recalibration = IndelsVariantRecalibrator.recalibration, + indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, + indels_tranches = IndelsVariantRecalibrator.tranches, + snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), + snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), + snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), + indel_filter_level = indel_filter_level, + snp_filter_level = snp_filter_level + } + + # for large callsets we need to collect metrics from the shards and gather them later + if (!is_small_callset) { + call CollectVariantCallingMetrics as CollectMetricsSharded { + input: + input_vcf = ApplyRecalibration.recalibrated_vcf, + input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, + metrics_filename_prefix = callset_name + "." + idx, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict + } + } + } + + # for small callsets we can gather the VCF shards and then collect metrics on it + if (is_small_callset) { + call GatherVcfs as FinalGatherVcf { + input: + input_vcfs_fofn = ApplyRecalibration.recalibrated_vcf, + input_vcf_indexes_fofn = ApplyRecalibration.recalibrated_vcf_index, + output_vcf_name = callset_name + ".vcf.gz" + } + + call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { + input: + input_vcf = FinalGatherVcf.output_vcf, + input_vcf_index = FinalGatherVcf.output_vcf_index, + metrics_filename_prefix = callset_name, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict + } + } + + # for large callsets we still need to gather the sharded metrics + if (!is_small_callset) { + call GatherMetrics { + input: + input_details_fofn = select_all(CollectMetricsSharded.detail_metrics_file), + input_summaries_fofn = select_all(CollectMetricsSharded.summary_metrics_file), + output_prefix = callset_name + } + } + + output { + # outputs from the small callset path through the wdl + FinalGatherVcf.output_vcf + FinalGatherVcf.output_vcf_index + CollectMetricsOnFullVcf.detail_metrics_file + CollectMetricsOnFullVcf.summary_metrics_file + + # outputs from the large callset path through the wdl + # (note that we do not list ApplyRecalibration here because it is run in both paths) + GatherMetrics.detail_metrics_file + GatherMetrics.summary_metrics_file + + # output the interval list generated/used by this run workflow + DynamicallyCombineIntervals.output_intervals + } +} + +task samples { + File samples + + command { + cut ${samples} -f1 > sample_names.txt + cut ${samples} -f2 > gvcfs.txt + cut ${samples} -f3 > gvcf_indices.txt + + } + runtime { + cpus: 2 + requested_memory: 4000 + } + output { + File sample_names = "sample_names.txt" + File input_gvcfs = "gvcfs.txt" + File input_gvcfs_indices = "gvcf_indices.txt" + } + +} + + +task GetNumberOfSamples { + File sample_name_map + command <<< + wc -l ${sample_name_map} | awk '{print 
$1}' + >>> + + runtime { + cpus: 4 + requested_memory: 8000 + } + output { + Int sample_count = read_int(stdout()) + } +} + +task ImportGVCFs { + Array[String] sample_names + Array[File] input_gvcfs + Array[File] input_gvcfs_indices + String interval + + String workspace_dir_name + + Int batch_size + + command <<< + set -e + set -o pipefail + + python << CODE + gvcfs = ['${sep="','" input_gvcfs}'] + sample_names = ['${sep="','" sample_names}'] + + if len(gvcfs)!= len(sample_names): + exit(1) + + with open("inputs.list", "w") as fi: + for i in range(len(gvcfs)): + fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n") + + CODE + + # The memory setting here is very important and must be several GB lower + # than the total memory allocated to the VM because this tool uses + # a significant amount of non-heap memory for native libraries. + # Also, testing has shown that the multithreaded reader initialization + # does not scale well beyond 5 threads, so don't increase beyond that. + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx4g -Xms4g" \ + GenomicsDBImport \ + --genomicsdb-workspace-path ${workspace_dir_name} \ + --batch-size ${batch_size} \ + -L ${interval} \ + --sample-name-map inputs.list \ + --reader-threads 5 \ + -ip 500 + + tar -cf ${workspace_dir_name}.tar ${workspace_dir_name} + + >>> + runtime { + cpus: 4 + requested_memory: 8000 + } + output { + File output_genomicsdb = "${workspace_dir_name}.tar" + } +} + +task GenotypeGVCFs { + File workspace_tar + String interval + + String output_vcf_filename + + File ref_fasta + File ref_fasta_index + File ref_dict + + File dbsnp_vcf + File dbsnp_vcf_index + + + command <<< + set -e + + tar -xf ${workspace_tar} + WORKSPACE=$( basename ${workspace_tar} .tar) + + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx14g -Xms5g" \ + GenotypeGVCFs \ + -R ${ref_fasta} \ + -O ${output_vcf_filename} \ + -D ${dbsnp_vcf} \ + -G StandardAnnotation \ + --only-output-calls-starting-in-intervals \ + --use-new-qual-calculator \ + -V gendb://$WORKSPACE \ + -L ${interval} + >>> + runtime { + cpus: 4 + requested_memory: 16000 + } + output { + File output_vcf = "${output_vcf_filename}" + File output_vcf_index = "${output_vcf_filename}.tbi" + } +} + +task HardFilterAndMakeSitesOnlyVcf { + File vcf + File vcf_index + Float excess_het_threshold + + String variant_filtered_vcf_filename + String sites_only_vcf_filename + + + command { + set -e + + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx15g -Xms3g" \ + VariantFiltration \ + --filter-expression "ExcessHet > ${excess_het_threshold}" \ + --filter-name ExcessHet \ + -O ${variant_filtered_vcf_filename} \ + -V ${vcf} + + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx15g -Xms3g" \ + MakeSitesOnlyVcf \ + --INPUT ${variant_filtered_vcf_filename} \ + --OUTPUT ${sites_only_vcf_filename} + + } + runtime { + cpus: 4 + requested_memory: 16000 + } + output { + File variant_filtered_vcf = "${variant_filtered_vcf_filename}" + File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi" + File sites_only_vcf = "${sites_only_vcf_filename}" + File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi" + } +} + +task IndelsVariantRecalibrator { + String recalibration_filename + String tranches_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + 
File mills_resource_vcf + File axiomPoly_resource_vcf + File dbsnp_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf_index + + + command { + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx24g -Xms24g" \ + VariantRecalibrator \ + -V ${sites_only_variant_filtered_vcf} \ + -O ${recalibration_filename} \ + --tranches-file ${tranches_filename} \ + --trust-all-polymorphic \ + -tranche ${sep=' -tranche ' recalibration_tranche_values} \ + -an ${sep=' -an ' recalibration_annotation_values} \ + -mode INDEL \ + --max-gaussians 4 \ + -resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ + -resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} + } + runtime { + cpus: 8 + requested_memory: 32000 + } + output { + File recalibration = "${recalibration_filename}" + File recalibration_index = "${recalibration_filename}.idx" + File tranches = "${tranches_filename}" + } +} + +task SNPsVariantRecalibratorCreateModel { + String recalibration_filename + String tranches_filename + Int downsampleFactor + String model_report_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + + + command { + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx100g -Xms100g" \ + VariantRecalibrator \ + -V ${sites_only_variant_filtered_vcf} \ + -O ${recalibration_filename} \ + --tranches-file ${tranches_filename} \ + --trust-all-polymorphic \ + -tranche ${sep=' -tranche ' recalibration_tranche_values} \ + -an ${sep=' -an ' recalibration_annotation_values} \ + -mode SNP \ + --sample-every-Nth-variant ${downsampleFactor} \ + --output-model ${model_report_filename} \ + --max-gaussians 6 \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} + } + runtime { + cpus: 8 + requested_memory: 104000 + } + output { + File model_report = "${model_report_filename}" + } +} + +task SNPsVariantRecalibrator { + String recalibration_filename + String tranches_filename + File? 
model_report + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + + + command { + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx3g -Xms3g" \ + VariantRecalibrator \ + -V ${sites_only_variant_filtered_vcf} \ + -O ${recalibration_filename} \ + --tranches-file ${tranches_filename} \ + --trust-all-polymorphic \ + -tranche ${sep=' -tranche ' recalibration_tranche_values} \ + -an ${sep=' -an ' recalibration_annotation_values} \ + -mode SNP \ + ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ + --max-gaussians 6 \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} + } + runtime { + cpus: 4 + requested_memory: 8000 + } + output { + File recalibration = "${recalibration_filename}" + File recalibration_index = "${recalibration_filename}.idx" + File tranches = "${tranches_filename}" + } +} + +task GatherTranches { + Array[File] input_fofn + String output_filename + + + command <<< + set -e + set -o pipefail + + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx6g -Xms6g" \ + GatherTranches \ + --input ${sep=" --input " input_fofn} \ + --output ${output_filename} + >>> + runtime { + cpus: 4 + requested_memory: 8000 + } + output { + File tranches = "${output_filename}" + } +} + +task ApplyRecalibration { + String recalibrated_vcf_filename + File input_vcf + File input_vcf_index + File indels_recalibration + File indels_recalibration_index + File indels_tranches + File snps_recalibration + File snps_recalibration_index + File snps_tranches + + Float indel_filter_level + Float snp_filter_level + + command { + set -e + + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx5g -Xms5g" \ + ApplyVQSR \ + -O tmp.indel.recalibrated.vcf \ + -V ${input_vcf} \ + --recal-file ${indels_recalibration} \ + --tranches-file ${indels_tranches} \ + --truth-sensitivity-filter-level ${indel_filter_level} \ + --create-output-variant-index true \ + -mode INDEL + + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx5g -Xms5g" \ + ApplyVQSR \ + -O ${recalibrated_vcf_filename} \ + -V tmp.indel.recalibrated.vcf \ + --recal-file ${snps_recalibration} \ + --tranches-file ${snps_tranches} \ + --truth-sensitivity-filter-level ${snp_filter_level} \ + --create-output-variant-index true \ + -mode SNP + } + runtime { + cpus: 4 + requested_memory: 8000 + } + output { + File recalibrated_vcf = "${recalibrated_vcf_filename}" + File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi" + } +} + +task GatherVcfs { + Array[File] input_vcfs_fofn + Array[File] input_vcf_indexes_fofn + String output_vcf_name + + command <<< + set -e + set -o pipefail + + # ignoreSafetyChecks make a big performance difference so we include it in our invocation + 
/scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx6g -Xms6g" \ + GatherVcfsCloud \ + --ignore-safety-checks \ + --gather-type BLOCK \ + --input ${sep=" --input " input_vcfs_fofn} \ + --output ${output_vcf_name} + + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx6g -Xms6g" \ + IndexFeatureFile \ + --input ${output_vcf_name} + >>> + runtime { + cpus: 4 + requested_memory: 8000 + } + output { + File output_vcf = "${output_vcf_name}" + File output_vcf_index = "${output_vcf_name}.tbi" + } +} + +task CollectVariantCallingMetrics { + File input_vcf + File input_vcf_index + + String metrics_filename_prefix + File dbsnp_vcf + File dbsnp_vcf_index + File interval_list + File ref_dict + + + command { + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx6g -Xms6g" \ + CollectVariantCallingMetrics \ + --INPUT ${input_vcf} \ + --DBSNP ${dbsnp_vcf} \ + --SEQUENCE_DICTIONARY ${ref_dict} \ + --OUTPUT ${metrics_filename_prefix} \ + --THREAD_COUNT 8 \ + --TARGET_INTERVALS ${interval_list} + } + output { + File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics" + } + runtime { + cpus: 4 + requested_memory: 8000 + } +} + +task GatherMetrics { + Array[File] input_details_fofn + Array[File] input_summaries_fofn + + String output_prefix + + command <<< + set -e + set -o pipefail + + + /scratch/pawsey0339/dtang/gatk4_multisample/gatk-4.1.4.1/gatk --java-options "-Xmx2g -Xms2g" \ + AccumulateVariantCallingMetrics \ + --INPUT ${sep=" --INPUT " input_details_fofn} \ + --OUTPUT ${output_prefix} + >>> + runtime { + cpus: 4 + requested_memory: 8000 + } + output { + File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics" + } +} + +task DynamicallyCombineIntervals { + File intervals + Int merge_count + + command { + python << CODE + def parse_interval(interval): + colon_split = interval.split(":") + chromosome = colon_split[0] + dash_split = colon_split[1].split("-") + start = int(dash_split[0]) + end = int(dash_split[1]) + return chromosome, start, end + + def add_interval(chr, start, end): + lines_to_write.append(chr + ":" + str(start) + "-" + str(end)) + return chr, start, end + + count = 0 + chain_count = ${merge_count} + l_chr, l_start, l_end = "", 0, 0 + lines_to_write = [] + with open("${intervals}") as f: + with open("out.intervals", "w") as f1: + for line in f.readlines(): + # initialization + if count == 0: + w_chr, w_start, w_end = parse_interval(line) + count = 1 + continue + # reached number to combine, so spit out and start over + if count == chain_count: + l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) + w_chr, w_start, w_end = parse_interval(line) + count = 1 + continue + + c_chr, c_start, c_end = parse_interval(line) + # if adjacent keep the chain going + if c_chr == w_chr and c_start == w_end + 1: + w_end = c_end + count += 1 + continue + # not adjacent, end here and start a new chain + else: + l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) + w_chr, w_start, w_end = parse_interval(line) + count = 1 + if l_char != w_chr or l_start != w_start or l_end != w_end: + add_interval(w_chr, w_start, w_end) + f1.writelines("\n".join(lines_to_write)) + CODE + } + + runtime { + cpus: 4 + requested_memory: 8000 + } + + output { + File output_intervals = "out.intervals" + } 
+} diff --git a/janis_assistant/tests/data/wdl/bwa.wdl b/janis_assistant/tests/data/wdl/bwa.wdl new file mode 100644 index 0000000..93cbcb0 --- /dev/null +++ b/janis_assistant/tests/data/wdl/bwa.wdl @@ -0,0 +1,113 @@ +version 1.0 + +# Copyright (c) 2017 Leiden University Medical Center +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +task Mem { + input { + File read1 + File? read2 + BwaIndex bwaIndex + String outputPrefix + Boolean sixtyFour = false + Boolean usePostalt = false + Int sortMemoryPerThreadGb = 2 + Int compressionLevel = 1 + + String? readgroup + Int? sortThreads + + Int threads = 4 + Int? memoryGb + Int timeMinutes = 1 + ceil(size([read1, read2], "G") * 220 / threads) + # Contains bwa 0.7.17 bwakit 0.7.17.dev1 and samtools 1.10. + String dockerImage = "quay.io/biocontainers/mulled-v2-ad317f19f5881324e963f6a6d464d696a2825ab6:c59b7a73c87a9fe81737d5d628e10a3b5807f453-0" + } + + # Samtools sort may block the pipe while it is writing data to disk. + # This can lead to cpu underutilization. + # 1 thread if threads is 1. For 2-4 threads 2 sort threads. 3 sort threads for 5-8 threads. + Int estimatedSortThreads = if threads == 1 then 1 else 1 + ceil(threads / 4.0) + Int totalSortThreads = select_first([sortThreads, estimatedSortThreads]) + # BWA needs slightly more memory than the size of the index files (~10%). Add a margin for safety here. + Int estimatedMemoryGb = 1 + ceil(size(bwaIndex.indexFiles, "G") * 1.2) + sortMemoryPerThreadGb * totalSortThreads + + # The bwa postalt script is out commented as soon as usePostalt = false. + # This hack was tested with bash, dash and ash. It seems that comments in between pipes work for all of them. + command { + set -e + mkdir -p "$(dirname ~{outputPrefix})" + bwa mem \ + -t ~{threads} \ + ~{"-R '" + readgroup}~{true="'" false="" defined(readgroup)} \ + ~{bwaIndex.fastaFile} \ + ~{read1} \ + ~{read2} \ + 2> ~{outputPrefix}.log.bwamem | \ + ~{true="" false="#" usePostalt} bwa-postalt.js -p ~{outputPrefix}.hla ~{bwaIndex.fastaFile}~{true=".64.alt" false=".alt" sixtyFour} | \ + samtools sort \ + ~{"-@ " + totalSortThreads} \ + -m ~{sortMemoryPerThreadGb}G \ + -l ~{compressionLevel} \ + - \ + -o ~{outputPrefix}.aln.bam + } + + output { + File outputBam = outputPrefix + ".aln.bam" + File? outputHla = outputPrefix + ".hla" + } + + runtime { + # One extra thread for bwa-postalt + samtools is not needed. + # These only use 5-10% of compute power and not always simultaneously. 
+ cpu: threads + memory: "~{select_first([memoryGb, estimatedMemoryGb])}G" + time_minutes: timeMinutes + docker: dockerImage + } + + parameter_meta { + # inputs + read1: {description: "The first-end fastq file.", category: "required"} + read2: {description: "The second-end fastq file.", category: "common"} + bwaIndex: {description: "The BWA index, including (optionally) a .alt file.", category: "required"} + outputPrefix: {description: "The prefix of the output files, including any parent directories.", category: "required"} + sixtyFour: {description: "Whether or not the index uses the '.64' suffixes.", category: "common"} + usePostalt: {description: "Whether to use the postalt script from bwa kit."} + sortMemoryPerThreadGb: {description: "The amount of memory for each sorting thread in gigabytes.", category: "advanced"} + compressionLevel: {description: "The compression level of the output BAM.", category: "advanced"} + readgroup: {description: "A readgroup identifier.", category: "common"} + sortThreads: {description: "The number of threads to use for sorting.", category: "advanced"} + threads: {description: "The number of threads to use for alignment.", category: "advanced"} + memoryGb: {description: "The amount of memory this job will use in gigabytes.", category: "advanced"} + timeMinutes: {description: "The maximum amount of time the job will run in minutes.", category: "advanced"} + dockerImage: {description: "The docker image used for this task. Changing this may result in errors which the developers may choose not to address.", category: "advanced"} + + # outputs + outputBam: {description: "The produced BAM file."} + outputHla: {description: "The produced HLA file."} + } +} + +struct BwaIndex { + File fastaFile + Array[File] indexFiles +} \ No newline at end of file diff --git a/janis_assistant/tests/test_translate.py b/janis_assistant/tests/test_translate.py new file mode 100644 index 0000000..f01487d --- /dev/null +++ b/janis_assistant/tests/test_translate.py @@ -0,0 +1,532 @@ + + +""" +To test translate functionality. +Tests CLI args and end-to-end translations. 
+""" + +import os +import unittest +import shutil +import regex as re +from typing import Optional + +from janis_core import settings +from janis_assistant.management.configuration import JanisConfiguration +from janis_assistant.main import ingest +from janis_assistant.main import translate +from janis_assistant.cli import process_args + +CWL_TESTDATA_DIR = os.path.join(os.getcwd(), 'tests/data/cwl') +WDL_TESTDATA_DIR = os.path.join(os.getcwd(), 'tests/data/wdl') +GALAXY_TESTDATA_DIR = os.path.join(os.getcwd(), 'tests/data/galaxy') +JANIS_TESTDATA_DIR = os.path.join(os.getcwd(), 'tests/data/janis') + + +# ------- HELPER FUNCS ------- # + +def _run_translate(filepath: str, srcfmt: str, destfmt: str, mode: Optional[str]=None) -> Optional[str]: + config = JanisConfiguration.initial_configuration(None) + internal = ingest(filepath, srcfmt) + return translate(config, internal, destfmt, mode=mode) + +def _get_file_lines(filepath: str) -> list[str]: + with open(filepath, 'r') as f: + text = f.read() + return _get_simplified_lines(text) + +def _get_process_input_lines(filepath: str) -> list[str]: + PATTERN = r'input:([\s\S]+)(?=output:)' + with open(filepath, 'r') as f: + text = f.read() + match = re.search(PATTERN, text) + assert(match) + return _get_simplified_lines(match.group(1)) + +def _get_script_lines(filepath: str) -> list[str]: + PATTERN = r'script:[\s\S]+"""([\s\S]+)(?=""")' + with open(filepath, 'r') as f: + text = f.read() + match = re.search(PATTERN, text) + assert(match) + return _get_simplified_lines(match.group(1)) + +def _get_simplified_lines(text: str) -> list[str]: + lines = text.split('\n') + lines = [ln.split('//')[0] for ln in lines] + lines = [ln.strip() for ln in lines] + lines = [ln for ln in lines if ln != ''] + return lines + +def _docker_running() -> bool: + import subprocess + try: + completed_process = subprocess.run(['docker', 'version'], shell=True, capture_output=True) + if completed_process.returncode == 0: + return True + else: + return False + except FileNotFoundError: + return False + +def _reset_global_settings() -> None: + settings.translate.MODE = 'regular' + settings.translate.SAFE_MODE = False + settings.translate.ALLOW_EMPTY_CONTAINER = True + settings.ingest.galaxy.GEN_IMAGES = False + settings.ingest.galaxy.DISABLE_IMAGE_CACHE = False + settings.ingest.cwl.INGEST_JAVASCRIPT_EXPRESSIONS = True + settings.ingest.cwl.REQUIRE_CWL_VERSION = False + settings.datatypes.ALLOW_UNPARSEABLE_DATATYPES = True + settings.graph.ALLOW_UNKNOWN_SOURCE = True + settings.graph.ALLOW_UNKNOWN_SCATTER_FIELDS = True + settings.graph.ALLOW_INCORRECT_NUMBER_OF_SOURCES = True + settings.graph.ALLOW_NON_ARRAY_SCATTER_INPUT = True + settings.graph.ALLOW_INCOMPATIBLE_TYPES = True + settings.validation.STRICT_IDENTIFIERS = False + settings.validation.VALIDATE_STRINGFORMATTERS = False + + + +### --------- CLI ---------- ### + +class TestCli(unittest.TestCase): + def setUp(self) -> None: + self.src = 'cwl' + self.dest = 'nextflow' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def test_basic(self) -> None: + filepath = f'{CWL_TESTDATA_DIR}/fastqc.cwl' + args = ['translate', '--from', self.src, '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # assert file exists + lines = _get_file_lines('translated/fastqc.nf') + self.assertGreater(len(lines), 0) + + @unittest.skip("need to test different assert for skeleton mode") + def test_modes_skeleton(self) -> None: + filepath = 
f'{CWL_TESTDATA_DIR}/subworkflow_test/main.cwl' + args = ['translate', '--mode', 'skeleton', '--from', self.src, '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('translated/main.nf') + self.assertGreater(len(lines), 0) + + # check optional_input_types.nf + input_lines = _get_process_input_lines('translated/modules/optional_input_types.nf') + self.assertEqual(len(input_lines), 5) + script_lines = _get_script_lines('translated/modules/optional_input_types.nf') + self.assertEqual(len(script_lines), 2) + + @unittest.skip("Need to further test different assert value") + def test_modes_regular(self) -> None: + filepath = f'{CWL_TESTDATA_DIR}/subworkflow_test/main.cwl' + args = ['translate', '--mode', 'regular', '--from', self.src, '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('translated/main.nf') + self.assertGreater(len(lines), 0) + + # check optional_input_types.nf + input_lines = _get_process_input_lines('translated/modules/optional_input_types.nf') + self.assertEqual(len(input_lines), 5) + script_lines = _get_script_lines('translated/modules/optional_input_types.nf') + self.assertEqual(len(script_lines), 7) + + def test_modes_extended1(self) -> None: + filepath = f'{CWL_TESTDATA_DIR}/subworkflow_test/main.cwl' + args = ['translate', '--mode', 'extended', '--from', self.src, '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('translated/main.nf') + self.assertGreater(len(lines), 0) + + # check optional_input_types.nf + input_lines = _get_process_input_lines('translated/modules/optional_input_types.nf') + self.assertEqual(len(input_lines), 6) + script_lines = _get_script_lines('translated/modules/optional_input_types.nf') + self.assertEqual(len(script_lines), 8) + + def test_modes_extended2(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/unicycler_assembly.ga' + args = ['translate', '--mode', 'extended', '--from', 'galaxy', '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('translated/main.nf') + self.assertGreater(len(lines), 0) + + # check optional_input_types.nf + input_lines = _get_process_input_lines('translated/modules/quast.nf') + self.assertEqual(len(input_lines), 41) + script_lines = _get_script_lines('translated/modules/quast.nf') + self.assertEqual(len(script_lines), 42) + + @unittest.skip("test require local docker installation") + #@unittest.skipUnless(_docker_running(), 'docker daemon must be running to test this') + def test_galaxy_build_images(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/hisat2_wf.ga' + args = ['translate', '--galaxy-build-images', '--from', 'galaxy', '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('translated/main.nf') + self.assertGreater(len(lines), 0) + + # check container correct + lines = _get_file_lines('translated/modules/hisat2.nf') + self.assertGreater(len(lines), 0) + self.assertIn('container "quay.io/ppp-janis-translate/hisat2:2.2.1"', lines) + + def test_galaxy_no_image_cache(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/fastqc-5ec9f6bceaee/rgFastQC.xml' + args = ['translate', '--galaxy-no-image-cache', '--from', 'galaxy', '--to', self.dest, filepath] + + # run translate via cli args + 
process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('translated/fastqc.nf') + self.assertGreater(len(lines), 0) + + def test_galaxy_no_wrapper_cache(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/fastqc-5ec9f6bceaee/rgFastQC.xml' + args = ['translate', '--galaxy-no-wrapper-cache', '--from', 'galaxy', '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('translated/fastqc.nf') + self.assertGreater(len(lines), 0) + + def test_output_dir(self) -> None: + if os.path.exists('out_fastqc'): + shutil.rmtree('out_fastqc') + + filepath = f'{GALAXY_TESTDATA_DIR}/fastqc-5ec9f6bceaee/rgFastQC.xml' + args = ['translate', '-o', 'out_fastqc', '--from', 'galaxy', '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('out_fastqc/fastqc.nf') + self.assertGreater(len(lines), 0) + + if os.path.exists('out_fastqc'): + shutil.rmtree('out_fastqc') + + def test_disallow_empty_container(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/fastqc-5ec9f6bceaee/rgFastQC.xml' + args = ['translate', '--disallow-empty-container', '--from', 'galaxy', '--to', self.dest, filepath] + + # run translate via cli args + process_args(sysargs=args) + + # check main.nf + lines = _get_file_lines('translated/fastqc.nf') + self.assertGreater(len(lines), 0) + + + +### --- FROM CWL --- ### + +class TestCwlToCwl(unittest.TestCase): + + def setUp(self) -> None: + self.src = 'cwl' + self.dest = 'cwl' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def test_fastqc_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/fastqc.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_fastqc2_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/fastqc2.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_gatk_haplotype_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/gatk_haplotype_tool.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_super_enhancer_wf(self): + filepath = f'{CWL_TESTDATA_DIR}/super_enhancer_wf.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + +class TestCwlToWdl(unittest.TestCase): + + def setUp(self) -> None: + self.src = 'cwl' + self.dest = 'wdl' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def test_fastqc_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/fastqc.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_fastqc2_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/fastqc2.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_gatk_haplotype_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/gatk_haplotype_tool.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_super_enhancer_wf(self): + filepath = f'{CWL_TESTDATA_DIR}/super_enhancer_wf.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + +class TestCwlToNextflow(unittest.TestCase): + + def setUp(self) -> None: + self.src = 'cwl' + self.dest = 'nextflow' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def test_fastqc_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/fastqc.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_fastqc2_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/fastqc2.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def 
test_gatk_haplotype_tool(self): + filepath = f'{CWL_TESTDATA_DIR}/gatk_haplotype_tool.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_super_enhancer_wf(self): + filepath = f'{CWL_TESTDATA_DIR}/super_enhancer_wf.cwl' + maintask = _run_translate(filepath, self.src, self.dest) + + + +### --- FROM GALAXY --- ### + +class TestGalaxyToNextflow(unittest.TestCase): + + def setUp(self) -> None: + self.src = 'galaxy' + self.dest = 'nextflow' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def test_fastqc_tool(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/fastqc-5ec9f6bceaee/rgFastQC.xml' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_abricate_wf(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/abricate_wf.ga' + maintask = _run_translate(filepath, self.src, self.dest) + + @unittest.skip("need to test different assert for this translation") + def test_unicycler_assembly_wf(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/unicycler_assembly.ga' + maintask = _run_translate(filepath, self.src, self.dest) + + +class TestGalaxyToCwl(unittest.TestCase): + + def setUp(self) -> None: + self.src = 'galaxy' + self.dest = 'cwl' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def test_fastqc_tool(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/fastqc-5ec9f6bceaee/rgFastQC.xml' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_abricate_wf(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/abricate_wf.ga' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_unicycler_assembly_wf(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/unicycler_assembly.ga' + maintask = _run_translate(filepath, self.src, self.dest) + + +class TestGalaxyToWdl(unittest.TestCase): + + def setUp(self) -> None: + self.src = 'galaxy' + self.dest = 'wdl' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def test_fastqc_tool(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/fastqc-5ec9f6bceaee/rgFastQC.xml' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_abricate_wf(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/abricate_wf.ga' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_unicycler_assembly_wf(self) -> None: + filepath = f'{GALAXY_TESTDATA_DIR}/unicycler_assembly.ga' + maintask = _run_translate(filepath, self.src, self.dest) + + + +### --- FROM JANIS --- ### + +class TestJanisToCWL(unittest.TestCase): + + def setUp(self) -> None: + self.src = 'janis' + self.dest = 'cwl' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def test_samtools_flagstat_tool(self) -> None: + filepath = f'{JANIS_TESTDATA_DIR}/samtools_flagstat_tool.py' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_gatk_haplotype_caller_tool(self) -> None: + filepath = f'{JANIS_TESTDATA_DIR}/gatk_haplotype_caller_tool.py' + maintask = _run_translate(filepath, self.src, self.dest) + + def test_germline_variant_caller_wf(self) -> None: + filepath = f'{JANIS_TESTDATA_DIR}/germline_variant_caller_wf.py' + maintask = _run_translate(filepath, self.src, self.dest, mode='extended') + + +class TestJanisToNextflow(unittest.TestCase): + + def setUp(self) -> None: + self.src = 'janis' + self.dest = 'nextflow' + if os.path.exists('translated'): + shutil.rmtree('translated') + _reset_global_settings() + + def 
+        filepath = f'{JANIS_TESTDATA_DIR}/samtools_flagstat_tool.py'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+    def test_gatk_haplotype_caller_tool(self) -> None:
+        filepath = f'{JANIS_TESTDATA_DIR}/gatk_haplotype_caller_tool.py'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+    def test_germline_variant_caller_wf(self) -> None:
+        filepath = f'{JANIS_TESTDATA_DIR}/germline_variant_caller_wf.py'
+        maintask = _run_translate(filepath, self.src, self.dest, mode='extended')
+
+
+class TestJanisToWdl(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.src = 'janis'
+        self.dest = 'wdl'
+        if os.path.exists('translated'):
+            shutil.rmtree('translated')
+        _reset_global_settings()
+
+    def test_samtools_flagstat_tool(self) -> None:
+        filepath = f'{JANIS_TESTDATA_DIR}/samtools_flagstat_tool.py'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+    def test_gatk_haplotype_caller_tool(self) -> None:
+        filepath = f'{JANIS_TESTDATA_DIR}/gatk_haplotype_caller_tool.py'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+    def test_germline_variant_caller_wf(self) -> None:
+        filepath = f'{JANIS_TESTDATA_DIR}/germline_variant_caller_wf.py'
+        maintask = _run_translate(filepath, self.src, self.dest, mode='extended')
+
+
+
+### --- FROM WDL --- ###
+
+class TestWdlToWdl(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.src = 'wdl'
+        self.dest = 'wdl'
+        if os.path.exists('translated'):
+            shutil.rmtree('translated')
+        _reset_global_settings()
+
+    @unittest.skip('TODO WDL ingest needs work')
+    def test_bwa_mem_tool(self) -> None:
+        filepath = f'{WDL_TESTDATA_DIR}/bwa.xml'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+    @unittest.skip('TODO WDL ingest needs work')
+    def test_gatk4_wf(self) -> None:
+        filepath = f'{WDL_TESTDATA_DIR}/Multisample_jointgt_GATK4.wdl'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+
+class TestWdlToCwl(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.src = 'wdl'
+        self.dest = 'cwl'
+        if os.path.exists('translated'):
+            shutil.rmtree('translated')
+        _reset_global_settings()
+
+    @unittest.skip('TODO WDL ingest needs work')
+    def test_bwa_mem_tool(self) -> None:
+        filepath = f'{WDL_TESTDATA_DIR}/bwa.xml'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+    @unittest.skip('TODO WDL ingest needs work')
+    def test_gatk4_wf(self) -> None:
+        filepath = f'{WDL_TESTDATA_DIR}/Multisample_jointgt_GATK4.wdl'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+
+class TestWdlToNextflow(unittest.TestCase):
+
+    def setUp(self) -> None:
+        self.src = 'wdl'
+        self.dest = 'nextflow'
+        if os.path.exists('translated'):
+            shutil.rmtree('translated')
+        _reset_global_settings()
+
+    @unittest.skip('TODO WDL ingest needs work')
+    def test_bwa_mem_tool(self) -> None:
+        filepath = f'{WDL_TESTDATA_DIR}/bwa.xml'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
+    @unittest.skip('TODO WDL ingest needs work')
+    def test_gatk4_wf(self) -> None:
+        filepath = f'{WDL_TESTDATA_DIR}/Multisample_jointgt_GATK4.wdl'
+        maintask = _run_translate(filepath, self.src, self.dest)
+
diff --git a/notes.txt b/notes.txt
new file mode 100644
index 0000000..4ac2b96
--- /dev/null
+++ b/notes.txt
@@ -0,0 +1,20 @@
+
+
+TODO
+- IlluminaGermlineVariantCaller
+ - "--mode regular" or "--mode skeleton" fails for translation
+ - can't find input in the unwrap functions
+- move to <__UNTRANSLATED_JS__>
+
+- docker permission issues
+ - badly broken at the moment; needs proper investigation
+ - can we run docker as non-root?
+ +- --galaxy-build-images + - doesn't work if running on singularity as docker isn't running + - "you have supplied "--build-galaxy-tool-images", but docker was not found" + - can we make this work with singularity? + - can we detect whether singularity is running & tell users to swap to docker? + +- get a pypi package ready for janis + - avoids singularity / docker issues \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 85e8371..4cde4b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,19 +1,82 @@ -[tool.black] -line-length = 88 -target_version = ['py37'] -include = '\.pyi?$' -exclude = ''' -( - /( - \.eggs # exclude a few common directories in the - | \.git # root of the project - | \.hg - | \.mypy_cache - | \.tox - | \.venv - | _build - | build - | dist - )/ -) -''' + +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools>=67.8.0", "wheel>=0.40.0"] + +[project] +name = "janis-pipelines.assistant" +version = "v0.13.0" +description = "Easier way to run workflows, configurable across environments" +readme = "README.md" +license = { file = "LICENSE" } +keywords = [ + "janis", + "workflows", + "assistant", +] + +authors = [ + { name = "Michael Franklin", email = "michael.franklin@petermac.org" }, + { name = "Grace Hall", email = "grace.hall1@unimelb.edu.au" }, + { name = "Richard Lupat", email = "Richard.Lupat@petermac.org" }, + { name = "Evan Thomas", email = "thomas.e@wehi.edu.au" }, +] +maintainers = [ + { name = "Grace Hall", email = "grace.hall1@unimelb.edu.au" }, + { name = "Richard Lupat", email = "Richard.Lupat@petermac.org" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering", +] +dependencies = [ + "janis-pipelines.core", + "janis-pipelines.bioinformatics", + "janis-pipelines.unix", + "janis-pipelines.templates >= 0.11.0", + "ruamel.yaml >= 0.12.4, <= 0.16.5", + "progressbar2", + "path", + "black", + "pre-commit", + "requests", + "python-dateutil", + "tabulate", + "cwltool", + "cwl-utils==0.15", + "blessed", + "regex", +] +requires-python = ">=3.10.5" + +[project.optional-dependencies] +gcs = ["google-cloud-storage"] +ci = [ + "codecov", + "coverage", + "requests_mock", + "nose_parameterized", + "keyring==21.4.0", + "setuptools", + "wheel", + "twine", +] + +[project.urls] +repository = "https://github.com/PMCC-BioinformaticsCore/janis-assistant" +documentation = "https://janis.readthedocs.io/en/latest/" + +[tool.setuptools.packages.find] +where = ["./"] +include = ["janis_assistant*"] +namespaces = false + +[project.scripts] +janis = "janis_assistant.cli:process_args" + +# NOTE +# The following line was not copied across from old setup.py (unknown purpose) +# entry_points={"janis.extension": ["assistant=janis_assistant"]} diff --git a/requirements.txt b/requirements.txt index 41fc7f7..fcb7607 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,16 @@ -blessed janis-pipelines.core +janis-pipelines.bioinformatics +janis-pipelines.unix janis-pipelines.templates +ruamel.yaml >= 0.12.4, <= 0.16.5 +progressbar2 +path black pre-commit requests python-dateutil -tabulate \ No newline at end of file +tabulate +cwltool +cwl-utils==0.15 +blessed +regex diff --git a/setup.py b/setup.py index 9123ffa..348be0a 100644 --- a/setup.py +++ b/setup.py @@ -1,65 +1 @@ -from setuptools import setup, find_packages - -# from setuptools.command.develop import develop -# from 
setuptools.command.install import install - -modules = ["janis_assistant." + p for p in sorted(find_packages("./janis_assistant"))] - -vsn = {} -with open("./janis_assistant/__meta__.py") as fp: - exec(fp.read(), vsn) -__version__ = vsn["__version__"] - -setup( - # legacy .runner - renamed to assistant - name="janis-pipelines.runner", - version=__version__, - description="Easier way to run workflows, configurable across environments", - long_description=open("./README.md").read(), - long_description_content_type="text/markdown", - author="Michael Franklin", - author_email="michael.franklin@petermac.org", - license="MIT", - keywords=["janis", "workflows", "assistant"], - entry_points={ - "console_scripts": ["janis=janis_assistant.cli:process_args"], - "janis.extension": ["assistant=janis_assistant"], - }, - install_requires=[ - "janis-pipelines.core>=0.11.0", - "janis-pipelines.templates>=0.11.0", - "requests", - "path", - "python-dateutil", - "progressbar2", - "tabulate", - "ruamel.yaml >= 0.12.4, <= 0.16.5", - "cwltool", - "blessed", - ], - extras_require={ - "gcs": ["google-cloud-storage"], - "ci": [ - "codecov", - "coverage", - "requests_mock", - "nose_parameterized", - "keyring==21.4.0", - "setuptools", - "wheel", - "twine", - ], - }, - packages=["janis_assistant"] + modules, - classifiers=[ - "Development Status :: 4 - Beta", - "Topic :: Scientific/Engineering", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Environment :: Console", - ], - cmdclass={ - # 'develop': PostDevelopCommand, - # 'install': PostInstallCommand - }, -) +import setuptools; setuptools.setup()
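
A note on the entry point that was left behind: the old setup.py registered both the console script and a "janis.extension" entry point, and the NOTE at the bottom of the new pyproject.toml records that only the console script was carried across (its purpose is unknown). If that hook turns out to still be needed, a PEP 621 sketch of the equivalent declaration would be:

    [project.entry-points."janis.extension"]
    assistant = "janis_assistant"

setuptools reads this table the same way it read the old entry_points dict, so it can sit alongside the existing [project.scripts] section with no further build configuration.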
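
On the notes.txt question about detecting whether singularity rather than docker is available before attempting --galaxy-build-images: a minimal sketch of such a check, assuming only that the docker and singularity CLIs are on PATH when installed (the helper name and return values are hypothetical, not part of the current codebase):

    import shutil
    import subprocess

    def available_container_engine() -> str | None:
        # Hypothetical helper: returns 'docker' when a reachable docker daemon
        # exists, 'singularity' when only the singularity CLI is present,
        # otherwise None.
        if shutil.which('docker'):
            # 'docker version' talks to the daemon, so it also catches the
            # "installed but daemon not running / no permission" case
            proc = subprocess.run(['docker', 'version'], capture_output=True)
            if proc.returncode == 0:
                return 'docker'
        if shutil.which('singularity'):
            return 'singularity'
        return None

Wired into the --galaxy-build-images path, a check like this would let the CLI suggest swapping to docker (or skipping image building) instead of failing with the current "docker was not found" message.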