-
Notifications
You must be signed in to change notification settings - Fork 1
/
parallel_usearch_global.sh
113 lines (102 loc) · 3.11 KB
/
parallel_usearch_global.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env bash
set -e
parallel_usearch_global() {
#use all threads available except 2
maxthreads=${maxthreads:-$(($(nproc)-2))}
#number of threads to use for each parallel process of usearch_global
usearch_global_jobsize=${usearch_global_jobsize:-5}
local OPTIND
while getopts ":hi:o:d:t:a:" opt; do
case ${opt} in
h )
echo "Runs usearch11 -usearch_global in parallel by splitting the input data and process in smaller chunks. Much faster than just using more threads with one command."
echo "Options:"
echo " -h Display this help text and exit."
echo " -v Print version and exit."
echo " -i (required) Input file name."
echo " -o (required) Output file name."
echo " -d (required) Database file."
echo " -a Additional arguments passed on to the usearch11 -usearch_global command as one quoted string, fx: -a \"-maxrejects 8 -strand plus\""
echo " -t Max number of max_threads to use. (Default: all available except 2)"
exit 1
;;
i )
local input=$OPTARG
;;
o )
local output=$OPTARG
;;
d )
local database=$OPTARG
;;
t )
local maxthreads=$OPTARG
;;
a )
local usearch_args=$OPTARG
;;
\? )
echo "Invalid Option: -$OPTARG"
exit 1
;;
: )
echo "Option -$OPTARG requires an argument"
exit 1
;;
esac
done
#check required options
if [[ -z "$input" ]]
then
echo "option -i is required"
exit 1
fi
if [[ -z "$output" ]]
then
echo "option -o is required"
exit 1
fi
if [[ -z "$database" ]]
then
echo "option -d is required"
exit 1
fi
jobs=$(( maxthreads / usearch_global_jobsize ))
if [ ${jobs} -gt 1 ]
then
#create and/or clear a temporary folder for split input files
tmpsplitdir="${output}_tmpsplit"
rm -rf "${tmpsplitdir}"
mkdir -p "${tmpsplitdir}"
echo " - Splitting input file in ${jobs} to run in parallel"
#minus 1 job because of leftover seqs from equal split
usearch11 -fastx_split "${input}" \
-splits $((jobs - 1)) \
-outname "${tmpsplitdir}/[email protected]" \
-quiet
echo " - Running ${jobs} jobs using max ${usearch_global_jobsize} threads each ($(((jobs * usearch_global_jobsize))) total)"
find "${tmpsplitdir}" -type f -name 'seqs_*.fa' |\
parallel --progress usearch11 -usearch_global {} \
-db "${database}" \
$usearch_args \
-blast6out "{.}.b6" \
-threads "${usearch_global_jobsize}" \
-quiet
echo " - Combining output search tables into a single table"
find "${tmpsplitdir}" \
-type f \
-name '*.b6' \
-exec cat > "${tmpsplitdir}/combined.txt" {} +
sort -V "${tmpsplitdir}/combined.txt" > "$output"
rm -rf "${tmpsplitdir}"
else
echo " - Number of jobs is ${jobs}, exiting..."
exit 1
fi
}
#if sourced from another script, just load the function
#if not then run the function passing on arguments
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]
then
parallel_usearch_global "$*"
fi