Skip to content

Commit

Permalink
Merge pull request #22 from microbiomedata/19_lineage_scaffold
Browse files (browse the repository at this point in the history)
19 lineage scaffold
  • Loading branch information
aclum authored Jul 17, 2023
2 parents 69b7111 + 694d221 commit 26e609a
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 91 deletions.
27 changes: 8 additions & 19 deletions .github/workflows/create_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ on:
push:
branches:
- master
paths:
- 'version.txt'

jobs:
release:
Expand All @@ -12,30 +14,17 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v2

- name: Get latest release
id: latest_release
run: |
LATEST_RELEASE=$(curl --silent "https://api.github.com/repos/$GITHUB_REPOSITORY/releases/latest" | jq -r .tag_name)
echo "Latest release: $LATEST_RELEASE"
echo "LATEST_RELEASE=$LATEST_RELEASE" >> $GITHUB_OUTPUT
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Bump patch version
id: bump_version
- name: Read version
id: get_version
run: |
LATEST_VERSION=${{ steps.latest_release.outputs.LATEST_RELEASE }}
IFS='.' read -ra VERSION_PARTS <<< "$LATEST_VERSION"
PATCH_BUMP=$(( ${VERSION_PARTS[2]} + 1 ))
NEW_VERSION="${VERSION_PARTS[0]}.${VERSION_PARTS[1]}.$PATCH_BUMP"
echo "New version: $NEW_VERSION"
echo "NEW_VERSION=$NEW_VERSION" >> $GITHUB_OUTPUT
VERSION=$(cat version.txt)
echo "VERSION=${VERSION}" >> $GITHUB_ENV
- name: Create bundle zip
run: zip -r bundle.zip *.wdl

- name: Create Release
run: gh release create v${{ steps.bump_version.outputs.NEW_VERSION }} annotation_full.wdl bundle.zip
run: gh release create ${{ env.VERSION }} annotation_full.wdl bundle.zip
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
14 changes: 5 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM debian as buildbase
FROM debian:bullseye as buildbase

RUN apt-get -y update && apt-get -y install git gcc make wget time autoconf unzip curl

Expand Down Expand Up @@ -55,16 +55,14 @@ RUN \
gcc -std=gnu99 -O3 -fomit-frame-pointer -fstrict-aliasing -march=core2 -fopenmp -fPIC -msse2 -DHAVE_CONFIG_H -L../easel -L./impl_sse -L../libdivsufsort -L. -o hpc_hmmsearch hpc_hmmsearch.o -lhmmer -leasel -ldivsufsort -lm && \
cp hpc_hmmsearch /opt/omics/programs/hmmer/bin/ && \
/opt/omics/programs/hmmer/bin/hpc_hmmsearch -h
# Build last 1256
# Build last 1456
#
FROM buildbase as last

RUN apt-get -y install g++

RUN \
git clone --depth 1 --branch 1256 https://gitlab.com/mcfrith/last

RUN \
git clone --depth 1 --branch 1456 https://gitlab.com/mcfrith/last && \
cd last && \
make && \
make prefix=/opt/omics/programs/last install
Expand All @@ -89,9 +87,7 @@ FROM buildbase as img

RUN \
cd /opt && \
git clone https://code.jgi.doe.gov/img/img-pipelines/img-annotation-pipeline && \
cd img-annotation-pipeline && \
git reset --hard e6fe2f19f691180be0165cfef453d76e17d1f57c
git clone -b scaffold-lineage https://code.jgi.doe.gov/img/img-pipelines/img-annotation-pipeline

RUN \
cd /opt && \
Expand All @@ -108,7 +104,7 @@ RUN \
#chmod -R 755 omics && \
rm gms2_linux_64.v1.14_1.25_lic.tar.gz

RUN apt-get install -y openjdk-11-jdk
RUN apt-get update && apt-get install -y openjdk-11-jdk
# get CRT version 1.8.4
RUN \
wget https://code.jgi.doe.gov/img/img-pipelines/crt-cli-imgap-version/-/archive/main/crt-cli-imgap-version-main.zip && \
Expand Down
62 changes: 9 additions & 53 deletions annotation_full.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,13 @@ import "./functional-annotation.wdl" as fa

workflow annotation {
String proj
String resource
String informed_by
String? git_url="https://github.com/microbiomedata/mg_annotation/releases/tag/0.1"
String? url_root="https://data.microbiomedata.org/data/"
String input_file
String imgap_project_id
String database_location="/refdata/img/"
String imgap_project_type="metagenome"
String? gm_license="/refdata/licenses/.gmhmmp2_key"
Int additional_threads=16
# 5.1.14.1 -> sha256:e3e3fff75aeb3a6e321054d4bc9d8c8c925dcfb9245d60247ab29c3b24c4bc75
String container="microbiomedata/img-omics@sha256:5c7f95bbffb53e6b7ba6899705fd83ad3c8bb88046c476952a7b9ca53a93888f"
String container="microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71"

# structural annotation
Boolean sa_execute=true
Expand Down Expand Up @@ -72,6 +67,7 @@ workflow annotation {
ec_tsvs = f_annotate.ec_tsv,
phylo_tsvs = f_annotate.phylo_tsv,
last_blasttabs = f_annotate.last_blasttab,
lineage_tsvs = f_annotate.lineage_tsv,
proteins = s_annotate.proteins,
genes = s_annotate.genes,
ko_ec_gffs = f_annotate.ko_ec_gff,
Expand Down Expand Up @@ -144,15 +140,12 @@ workflow annotation {
input_file=stage.imgap_input_fasta,
proj=proj,
start=stage.start,
resource=resource,
url_root=url_root,
git_url=git_url,
informed_by=informed_by,
proteins_faa = merge_outputs.proteins_faa,
structural_gff = merge_outputs.structural_gff,
ko_ec_gff = merge_outputs.ko_ec_gff,
gene_phylogeny_tsv = merge_outputs.gene_phylogeny_tsv,
functional_gff = merge_outputs.functional_gff,
lineage_tsv = merge_outputs.lineage_tsv,
ko_tsv = merge_outputs.ko_tsv,
ec_tsv = merge_outputs.ec_tsv,
stats_tsv = final_stats.tsv,
Expand Down Expand Up @@ -201,7 +194,6 @@ workflow annotation {
# File? proteins_cath_funfam_domtblout = finish_ano.final_proteins_cath_funfam_domtblout
File? product_names_tsv = finish_ano.final_product_names_tsv
File? crt_crisprs = finish_ano.final_crt_crisprs
File? ano_objects = finish_ano.objects
File imgap_version = make_info_file.imgap_info
}
Expand Down Expand Up @@ -291,6 +283,7 @@ task merge_outputs {
Array[File?] ec_tsvs
Array[File?] phylo_tsvs
Array[File?] last_blasttabs
Array[File?] lineage_tsvs
Array[File?] proteins
Array[File?] genes
Array[File?] ko_ec_gffs
Expand Down Expand Up @@ -336,6 +329,7 @@ task merge_outputs {
cat ${sep=" " ec_tsvs} > "${project_id}_ec.tsv"
cat ${sep=" " phylo_tsvs} > "${project_id}_gene_phylogeny.tsv"
cat ${sep=" " last_blasttabs} > "${project_id}_proteins.img_nr.last.blasttab"
cat ${sep=" " lineage_tsvs} > "${project_id}.contigLin.assembled.tsv"
cat ${sep=" " proteins} > "${project_id}_proteins.faa"
cat ${sep=" " genes} > "${project_id}_genes.fna"
cat ${sep=" " ko_ec_gffs} > "${project_id}_ko_ec.gff"
Expand Down Expand Up @@ -378,6 +372,7 @@ task merge_outputs {
File ec_tsv = "${project_id}_ec.tsv"
File gene_phylogeny_tsv = "${project_id}_gene_phylogeny.tsv"
File last_blasttab = "${project_id}_proteins.img_nr.last.blasttab"
File lineage_tsv = "${project_id}.contigLin.assembled.tsv"
File proteins_faa = "${project_id}_proteins.faa"
File genes_fna = "${project_id}_genes.fna"
File ko_ec_gff = "${project_id}_ko_ec.gff"
Expand Down Expand Up @@ -553,10 +548,6 @@ task finish_ano {
String proj
String prefix=sub(proj, ":", "_")
String start
String informed_by
String resource
String url_root
String git_url
File input_file
File proteins_faa
File structural_gff
Expand All @@ -569,6 +560,7 @@ task finish_ano {
File smart_gff
File supfam_gff
File gene_phylogeny_tsv
File lineage_tsv
File cath_funfam_gff
File crt_gff
File genemark_gff
Expand Down Expand Up @@ -607,52 +599,15 @@ task finish_ano {
cat ${rfam_gff} | sed ${sed} > ${prefix}_rfam.gff
cat ${crt_crisprs} | sed ${sed} > ${prefix}_crt.crisprs
cat ${gene_phylogeny_tsv} | sed ${sed} > ${prefix}_gene_phylogeny.tsv
cat ${lineage_tsv} | sed ${sed} > ${prefix}.contigLin.assembled.tsv
cat ${product_names_tsv} | sed ${sed} > ${prefix}_product_names.tsv
cat ${ko_ec_gff} | sed ${sed} > ${prefix}_ko_ec.gff
cat ${stats_tsv} | sed ${sed} > ${prefix}_stats.tsv
cat ${stats_json} | sed ${sed} > ${prefix}_stats.json
nmdc gff2json ${prefix}_functional_annotation.gff -of features.json -oa annotations.json -ai ${informed_by}

/scripts/generate_object_json.py \
--type "nmdc:MetagenomeAnnotationActivity" \
--set metagenome_annotation_activity_set \
--part ${proj} \
-p "name=Annotation Activity for ${proj}" \
was_informed_by=${informed_by} \
started_at_time=${start} \
ended_at_time=$end \
execution_resource="${resource}" \
git_url=${git_url} \
version="v1.0.1-beta" \
--url ${url_root}${proj}/annotation/ \
--inputs ${input_file} \
--outputs \
${prefix}_proteins.faa "FASTA amino acid file for annotated proteins" "Annotation Amino Acid FASTA" "FASTA Amino Acid File for ${proj}" \
${prefix}_structural_annotation.gff "GFF3 format file with structural annotations" "Structural Annotation GFF" "Structural Annotation for ${proj}" \
${prefix}_functional_annotation.gff "GFF3 format file with functional annotations" "Functional Annotation GFF" "Functional Annotation for ${proj}" \
${prefix}_ko.tsv "Tab delimited file for KO annotation" "Annotation KEGG Orthology" "KEGG Orthology for ${proj}" \
${prefix}_ec.tsv "Tab delimited file for EC annotation" "Annotation Enzyme Commission" "EC Annotations for ${proj}" \
${prefix}_cog.gff "GFF3 format file with COGs" "Clusters of Orthologous Groups (COG) Annotation GFF" "COGs for ${proj}" \
${prefix}_pfam.gff "GFF3 format file with Pfam" "Pfam Annotation GFF" "Pfam Annotation for ${proj}" \
${prefix}_tigrfam.gff "GFF3 format file with TIGRfam" "TIGRFam Annotation GFF" "TIGRFam for ${proj}" \
${prefix}_smart.gff "GFF3 format file with SMART" "SMART Annotation GFF" "SMART Annotations for ${proj}" \
${prefix}_supfam.gff "GFF3 format file with SUPERFam" "SUPERFam Annotation GFF" "SUPERFam Annotations for ${proj}" \
${prefix}_cath_funfam.gff "GFF3 format file with CATH FunFams" "CATH FunFams (Functional Families) Annotation GFF" "CATH FunFams for ${proj}" \
${prefix}_crt.gff "GFF3 format file with CRT" "CRT Annotation GFF" "CRT Annotations for ${proj}" \
${prefix}_genemark.gff "GFF3 format file with Genemark" "Genemark Annotation GFF" "Genemark Annotations for ${proj}" \
${prefix}_prodigal.gff "GFF3 format file with Prodigal" "Prodigal Annotation GFF" "Prodigal Annotations ${proj}" \
${prefix}_trna.gff "GFF3 format file with TRNA" "TRNA Annotation GFF" "TRNA Annotations ${proj}" \
${prefix}_rfam.gff "GFF3 format file with RFAM" "RFAM Annotation GFF" "RFAM Annotations for ${proj}" \
${prefix}_ko_ec.gff "GFF3 format file with KO_EC" "KO_EC Annotation GFF" "KO_EC Annotations for ${proj}" \
${prefix}_product_names.tsv "Product names file" "Product Names" "Product names for ${proj}" \
${prefix}_gene_phylogeny.tsv "Gene Phylogeny file" "Gene Phylogeny" "Gene Phylogeny for ${proj}"\
${prefix}_crt.crisprs "Crispr Terms" "Crispr Terms" "Crispr Terms for ${proj}" \
${prefix}_stats.tsv "Annotation statistics report" "Annotation Statistics" "Annotation Stats for ${proj}"

}

output {
File objects = "objects.json"
File final_functional_gff = "${prefix}_functional_annotation.gff"
File final_structural_gff = "${prefix}_structural_annotation.gff"
File final_ko_tsv = "${prefix}_ko.tsv"
Expand All @@ -678,6 +633,7 @@ task finish_ano {
# File final_proteins_supfam_domtblout = "${prefix}_proteins.supfam.domtblout"
# File final_proteins_cath_funfam_domtblout = "${prefix}_proteins.cath_funfam.domtblout"
File final_product_names_tsv = "${prefix}_product_names.tsv"
File final_lineage_tsv = "${prefix}.contigLin.assembled.tsv"
File final_crt_crisprs = "${prefix}_crt.crisprs"
File final_tsv = "${prefix}_stats.tsv"

Expand Down
2 changes: 1 addition & 1 deletion annotation_mt_full.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ workflow annotation {
String database_location="/refdata/img/"
String imgap_project_type="metagenome"
Int additional_threads=16
String container="microbiomedata/img-omics@sha256:e3e3fff75aeb3a6e321054d4bc9d8c8c925dcfb9245d60247ab29c3b24c4bc75"
String container="microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71"
String bc_bin="/miniconda3/bin/bc"
# structural annotation
Boolean sa_execute=true
Expand Down
17 changes: 10 additions & 7 deletions functional-annotation.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ workflow f_annotate {
File input_fasta
String database_location
Boolean ko_ec_execute=true
String ko_ec_img_nr_db="${database_location}"+"/IMG-NR/20211118/img_nr"
String ko_ec_md5_mapping="${database_location}"+"/IMG-NR/20211118/md5Hash2Data.txt"
String ko_ec_taxon_to_phylo_mapping="${database_location}"+"/IMG-NR/20211118/taxonOid2Taxonomy.txt"
String ko_ec_img_nr_db="${database_location}"+"/IMG-NR/20230629/img_nr"
String ko_ec_md5_mapping="${database_location}"+"/IMG-NR/20230629/md5Hash2Data.txt"
String ko_ec_taxon_to_phylo_mapping="${database_location}"+"/IMG-NR/20230629/taxonOid2Taxonomy.txt"
String lastal_bin="/opt/omics/bin/lastal"
String selector_bin="/opt/omics/bin/functional_annotation/lastal_img_nr_ko_ec_gene_phylo_hit_selector.py"
Boolean smart_execute=true
Expand Down Expand Up @@ -41,8 +41,8 @@ workflow f_annotate {
String product_assign_bin="/opt/omics/bin/functional_annotation/assign_product_names_and_create_fa_gff.py"
String product_names_mapping_dir="${database_location}"+"/Product_Name_Mappings/latest"
String container
String hmm_container="microbiomedata/img-omics@sha256:e3e3fff75aeb3a6e321054d4bc9d8c8c925dcfb9245d60247ab29c3b24c4bc75"
String last_container="microbiomedata/img-omics@sha256:e3e3fff75aeb3a6e321054d4bc9d8c8c925dcfb9245d60247ab29c3b24c4bc75"
String hmm_container="microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71"
String last_container="microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71"

if(ko_ec_execute) {
call ko_ec {
Expand Down Expand Up @@ -171,6 +171,7 @@ workflow f_annotate {
File? phylo_tsv = ko_ec.phylo_tsv
File? ko_ec_gff = ko_ec.gff
File? last_blasttab = ko_ec.last_blasttab
File? lineage_tsv = ko_ec.lineage_tsv
File? cog_gff = cog.gff
File? pfam_gff = pfam.gff
File? tigrfam_gff = tigrfam.gff
Expand Down Expand Up @@ -226,7 +227,8 @@ task ko_ec {
${project_type} ${md5} ${phylo} \
${project_id}_ko.tsv ${project_id}_ec.tsv \
${project_id}_gene_phylogeny.tsv ${project_id}_ko_ec.gff \
${project_id}_proteins.img_nr.last.blasttab
${project_id}_proteins.img_nr.last.blasttab && \
python /opt/omics/bin/functional_annotation/create_scaffold_lineage.py ${project_id}_gene_phylogeny.tsv ${project_id}_scaffold_lineage.tsv

#get version information
lastal_version="`${lastal} -V`"
Expand All @@ -247,6 +249,7 @@ task ko_ec {
File ec_tsv = "${project_id}_ec.tsv"
File phylo_tsv = "${project_id}_gene_phylogeny.tsv"
File gff = "${project_id}_ko_ec.gff"
File lineage_tsv = "${project_id}_scaffold_lineage.tsv"
String lastal_ver = read_string(lastal_version_file)
String img_nr_db_ver = read_string(img_nr_db_version_file)
}
Expand Down Expand Up @@ -300,7 +303,6 @@ task smart {
}

task cog {

String project_id
File input_fasta
String cog_db
Expand Down Expand Up @@ -628,3 +630,4 @@ task product_name {
File tsv = "${project_id}_product_names.tsv"
}
}

2 changes: 1 addition & 1 deletion test-small.wdl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import "annotation_full.wdl" as awf

workflow test_small {
String container="microbiomedata/img-omics@sha256:e3e3fff75aeb3a6e321054d4bc9d8c8c925dcfb9245d60247ab29c3b24c4bc75"
String container="microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71"
String proj="Testsmall"
String database="/refdata/img/"
String url="https://portal.nersc.gov/project/m3408/test_data"
Expand Down
2 changes: 1 addition & 1 deletion trnascan.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ workflow trnascan {
String imgap_project_id
String imgap_project_type
Int additional_threads
String container = "microbiomedata/img-omics@sha256:e3e3fff75aeb3a6e321054d4bc9d8c8c925dcfb9245d60247ab29c3b24c4bc75"
String container = "microbiomedata/img-omics@sha256:d5f4306bf36a97d55a3710280b940b89d7d4aca76a343e75b0e250734bc82b71"

call trnascan_ba {
input:
Expand Down
1 change: 1 addition & 0 deletions version.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
v1.0.3

0 comments on commit 26e609a

Please sign in to comment.