Skip to content

Commit

Permalink
adding sge/uge profile
Browse files Browse the repository at this point in the history
  • Loading branch information
shukwong committed Nov 21, 2024
1 parent d1162d3 commit a0f81ec
Show file tree
Hide file tree
Showing 6 changed files with 453 additions and 0 deletions.
16 changes: 16 additions & 0 deletions workflow/profiles/sge/cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Cluster configuration for the SGE/UGE profile.
# NOTE(review): presumably read by sge-submit.py (not visible here) — confirm
# how each section is consumed before changing key names.

# Resource aliases: each key is followed by a list of alternative names.
# Presumably any of the listed names is treated as the key — TODO confirm.
__resources__:
v100:
- "gpu"
- "v100"
- "nvidia_gpu"

# Option aliases, same layout as the resource aliases above:
# a key followed by a list of alternative spellings.
__options__:
jc:
- "jc"
- "jclass"
- "job_class"

# Defaults applied when a rule does not override them (presumably per job —
# TODO confirm against the submit script).
__default__:
# NOTE(review): the trailing space inside the quotes is part of the value;
# it looks intentional (e.g. a slot count appended after it) — confirm.
pe: "def_slot "
# Per-rule log locations; rulename and jobid in braces are placeholders.
output: logs/{rulename}/{rulename}.{jobid}.out
error: logs/{rulename}/{rulename}.{jobid}.err
80 changes: 80 additions & 0 deletions workflow/profiles/sge/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
restart-times: 0
jobscript: sge-jobscript.sh
cluster: "sge-submit.py"
cluster-status: "sge-status.py"
cluster-cancel: "sge-cancel.py"
cluster-cancel-nargs: 20
max-jobs-per-second: 0.1
max-status-checks-per-second: 0.017
latency-wait: 180
local-cores: 1

configfile: TriosCompass_v2/config/bam_config.yaml
snakefile: TriosCompass_v2/workflow/Snakefile
verbose: True
skip-script-cleanup: True
reason: True
show-failed-logs: True
rerun-incomplete: True
keep-incomplete: True
keep-going: True
jobs: 500
use-conda: True
use-singularity: True

# Job resources
set-resources:
- bwa_index:mem_mb=16000
- fastp:mem_mb=40000
- fastqc:mem_mb=10000
- fastq_screen:mem_mb=50000
- fq2bam:mem_mb=40000
- fq2bam:runtime=8:00:00
- fq2bam:v100=1
- flagstat:mem_mb=80000
- collectwgsmetrics:runtime=24:00:00
- collectwgsmetrics:mem_mb=64000
- collectmultiplemetrics:mem_mb=8000
- collectmultiplemetrics:v100=1
- gatkhc_pb:mem_mb=8000
- gatkhc_pb:runtime=24:00:00
- gatkhc_pb:v100=1
- gatk_combine_gvcf:mem_mb=80000
- gatk_combine_gvcf:runtime=144:00:00
- gatk_cgp:mem_mb=40000
- gatk_cgp:runtime=24:00:00
- gatk_genotype_gvcf_pb:runtime=24:00:00
- gatk_genotype_gvcf_pb:mem_mb=80000
- gatk_genotype_gvcf_pb:v100=1
- deepvariant_pb:mem_mb=80000
- deepvariant_pb:runtime=24:00:00
- deepvariant_pb:v100=1
- glnexus_dv:mem_mb=80000
- glnexus_dv:runtime=24:00:00
- call_dnm_dv:mem_mb=20000
- call_dnm_dv:runtime=24:00:00
- call_dnm_hc:mem_mb=20000
- call_dnm_hc:runtime=24:00:00
- call_JIGV:mem_mb=60000
- replace_rg:mem_mb=60000
- replace_rg:s_vmem=60000
- replace_rg:runtime=24:00:00
- phase_child:mem_mb=40000
- phase_trios:mem_mb=40000
- split_bed_hipstr:mem_mb=10000
- hipstr:mem_mb=40000
- hipstr:runtime=200:00:00
- dumpstr_call_hipstr:mem_mb=40000
- dumpstr_locus:mem_mb=40000
- dumpstr_locus:runtime=20:00:00
- monstr:mem_mb=40000
- monstr:runtime=20:00:00
- merge_monstr:mem_mb=40000
- merge_monstr:runtime=20:00:00
- monstr_filter:mem_mb=20000
- hipstr_recall:mem_mb=10000

# For some reason, time values need quotes to be read by Snakemake
default-resources:
- mem_mb=2000
- runtime="10:00:00"
9 changes: 9 additions & 0 deletions workflow/profiles/sge/sge-cancel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env python3
import subprocess as sp
import shlex
import sys

def qdel_command(job_ids):
    """Return the ``qdel`` argument vector that cancels *job_ids*.

    Each job id is passed as its own argument. The command is built as a
    list so no string formatting / ``shlex`` round trip is involved and no
    stray separators (such as a trailing comma) end up glued to an id.

    Args:
        job_ids: Iterable of job ids (strings or ints).

    Returns:
        A list suitable for ``subprocess.check_call``.
    """
    return ["qdel", *(str(job_id) for job_id in job_ids)]


def main(argv=None):
    """Cancel the jobs named on the command line.

    Args:
        argv: Job ids to cancel; defaults to ``sys.argv[1:]``.

    Raises:
        subprocess.CalledProcessError: If ``qdel`` exits non-zero.
    """
    job_ids = sys.argv[1:] if argv is None else list(argv)
    if not job_ids:
        # Nothing to cancel: calling a bare `qdel` would only fail.
        return
    sp.check_call(qdel_command(job_ids))


if __name__ == "__main__":
    main()

7 changes: 7 additions & 0 deletions workflow/profiles/sge/sge-jobscript.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# properties = {properties}

# Abort the script on the first command that fails (non-zero exit status).
set -o errexit

# Template placeholder for the actual job command.
# NOTE(review): presumably substituted by Snakemake before submission - confirm.
{exec_job}
69 changes: 69 additions & 0 deletions workflow/profiles/sge/sge-status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
import re
import subprocess as sp
import shlex
import sys
import time
import logging

logger = logging.getLogger(__name__)
# Only errors are reported; warnings (e.g. qacct retries) stay silent.
logger.setLevel(logging.ERROR)

# Maximum number of qstat/qacct polling rounds before giving up.
STATUS_ATTEMPTS = 20

# Seconds to wait before re-polling when qacct has no usable record yet.
QACCT_RETRY_SECONDS = 30

# Matches the `exit_status` line of `qacct -j <jobid>` output.
EXIT_STATUS_RE = re.compile(r"exit_status\s*([0-9]+)")

# WARNING this currently has no support for task array jobs


def parse_qstat_states(qstat_output):
    """Map job id to its state column from ``qstat`` text output.

    The first two lines (column header and separator) are skipped. Lines
    that do not start with a numeric job id or have fewer than five
    columns are ignored instead of aborting the whole status check.

    Args:
        qstat_output: Decoded text printed by ``qstat``.

    Returns:
        Dict mapping ``int`` job id to the state string (fifth column).
    """
    states = {}
    for line in qstat_output.strip().splitlines()[2:]:
        fields = line.split()
        if len(fields) < 5 or not fields[0].isdigit():
            continue
        states[int(fields[0])] = fields[4]
    return states


def parse_exit_status(qacct_output):
    """Extract the ``exit_status`` value from ``qacct -j`` output.

    Args:
        qacct_output: Decoded text printed by ``qacct``.

    Returns:
        The exit status as ``int``, or ``None`` when no ``exit_status``
        line is present.
    """
    match = EXIT_STATUS_RE.search(qacct_output)
    return int(match.group(1)) if match else None


def check_status(jobid):
    """Return ``"running"``, ``"success"`` or ``"failed"`` for *jobid*.

    Each round first asks ``qstat`` for pending/running jobs. A job that
    is listed is ``"failed"`` when its state contains ``E`` and
    ``"running"`` otherwise. A job that is not listed is looked up with
    ``qacct``; exit status 0 means ``"success"``, anything else
    ``"failed"``. When ``qacct`` fails or reports no exit status the
    lookup is retried after a pause, and the job is reported ``"failed"``
    once the last attempt is used up. If ``qstat`` itself keeps failing,
    the job is reported as ``"running"``.

    Args:
        jobid: Integer job id.

    Returns:
        One of the three status strings above.
    """
    for attempt in range(STATUS_ATTEMPTS):
        # first try qstat to see if job is running
        # we can use `qstat -s pr -u "*"` to check for all running and pending jobs
        try:
            qstat_out = sp.check_output(["qstat", "-s", "pr"]).decode()
        except sp.CalledProcessError as err:
            logger.error("qstat process error")
            logger.error(err)
            continue

        state = parse_qstat_states(qstat_out).get(jobid)
        if state is not None:
            # "E" in the state column marks an unspecified error state
            return "failed" if "E" in state else "running"

        # A finished job no longer appears in qstat, so ask qacct, which
        # also reports the job's exit status (0 on success).
        exit_code = None
        try:
            qacct_out = sp.check_output(["qacct", "-j", str(jobid)]).decode()
            exit_code = parse_exit_status(qacct_out)
        except sp.CalledProcessError as err:
            logger.warning("qacct process error")
            logger.warning(err)

        if exit_code is not None:
            return "success" if exit_code == 0 else "failed"

        if attempt >= STATUS_ATTEMPTS - 1:
            return "failed"
        # qacct can be quite slow to update on large servers
        time.sleep(QACCT_RETRY_SECONDS)

    return "running"


def main():
    """Print the status of the job id given as the first CLI argument."""
    print(check_status(int(sys.argv[1])))


if __name__ == "__main__":
    main()
Loading

0 comments on commit a0f81ec

Please sign in to comment.