-
Notifications
You must be signed in to change notification settings - Fork 1
/
findCopyFastq.sh
147 lines (139 loc) · 3.66 KB
/
findCopyFastq.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env bash
set -eu

# Built-in defaults; the option parser may overwrite input, fastq, output,
# samplesep and doCopy.
doCopy="yes"
samplesep="_"
output="fastq"
input="samples"
fastq="/space/sequences/Illumina/"

# nfind comes from the environment when set and non-empty, otherwise it is 2.
: "${nfind:=2}"

# Search backend: fdfind when it is available on PATH, plain find otherwise.
cmd="find"
if command -v fdfind > /dev/null; then
  cmd="fdfind"
fi
#default error message if bad usage
usageError() {
echo "Error: $1" 1>&2
echo ""
eval "bash $0 -h"
}
# Parse command-line options.
# The leading ':' in the optstring puts getopts in silent mode: an unknown
# option arrives as '?' and a missing argument as ':', both with the option
# letter in OPTARG, so this script prints its own error messages.
while getopts ":hi:f:o:s:d" opt
do
  case "${opt}" in
    h )
      echo "Find and copy fastq files. Reports for each sample how many files were found and copied."
      echo "Options:"
      echo " -h Display this help text and exit."
      echo -e " -i (Required) Path to file containing sample ID's to find and copy. One sample ID per line. \n (Default: ${input})"
      echo -e " -f (Required) Path to folder containing fastq files (will be searched recursively). \n (Default: ${fastq})"
      echo -e " -o (Required) Output folder to copy fastq files into. \n (Default: ${output})"
      echo -e " -d (flag) Don't copy the files, instead only report whether they are found or not."
      echo -e " -s Separator to append after sample name. \n (Default: ${samplesep})"
      echo
      echo -e "Additional options can be set by exporting environment variables before running the script:"
      echo -e " - nfind: Max number of files to find per sample ID. The search will be aborted after the first nfind hits. Useful for speeding up things if fx only the forward read (R1) is needed. (Default: ${nfind})"
      # Help that was explicitly requested is not an error, so report success.
      # The error paths below still exit with status 1 on their own.
      exit 0
      ;;
    i )
      input="$OPTARG"
      ;;
    f )
      fastq="$OPTARG"
      ;;
    o )
      output="$OPTARG"
      ;;
    s )
      samplesep="$OPTARG"
      ;;
    d )
      doCopy="no"
      ;;
    \? )
      usageError "Invalid Option: -$OPTARG"
      exit 1
      ;;
    : )
      usageError "Option -$OPTARG requires an argument"
      exit 1
      ;;
  esac
done
shift $((OPTIND -1)) #drop the parsed options, leaving only positional arguments
# check options
# The sample list must exist and be non-empty, and the search root must be
# a directory; otherwise report the problem, show the help text and exit 1.
if [ ! -s "$input" ]
then
  usageError "File '${input}' does not exist or is empty"
  exit 1
fi
if [ ! -d "$fastq" ]
then
  usageError "Directory '${fastq}' does not exist"
  exit 1
fi
mkdir -p "$output"
#clean samples file
# 1. tr:        turn carriage returns into newlines
# 2. sed '$a\': make sure the last line ends with a newline
# 3. sed:       strip all whitespace FIRST, then drop empty lines. In the
#               opposite order a whitespace-only line would survive as an
#               empty sample ID, and an empty ID turns the search pattern
#               into "*<separator>*", which matches unrelated files.
tr "\r" "\n" < "$input" |\
  sed -e '$a\' |\
  sed -e 's/[[:space:]]//g' -e '/^$/d' > "${output}/samples.txt"
# After cleaning there is exactly one sample ID per line.
nsamples=$(wc -l < "${output}/samples.txt")
echo "Searching for ${nsamples} sample(s) in $fastq (only first hits)..."
if [ "$doCopy" == "yes" ]
then
  echo "Copying files into $(realpath -m "$output")"
fi
# Look up every sample ID listed in "${output}/samples.txt" below "$fastq",
# print the hits for each one and, when doCopy is "yes", copy them into
# "$output". A summary of how many samples were not found is printed last.
i=0
notFound=0
while read -r sample
do
  i=$((i + 1))
  printf ' - (%s/%s) %s: ' "$i" "$nsamples" "$sample"
  #find the sample fastq file(s): one path per line, at most $nfind of them
  fileStatus=$(
    if [ "$cmd" == "fdfind" ]
    then
      #fdfind is just blazing
      $cmd -s \
        -g "*${sample}${samplesep}*" \
        -L \
        -t file -t symlink \
        -e gz -e fastq -e fq \
        --max-results "$nfind" \
        "$fastq"
    elif [ "$cmd" == "find" ]
    then
      #use head -n x to stop find from searching further after the first x hits
      #use "|| true" to avoid exiting when the find command doesn't
      #have permission to access some files/folders
      $cmd -L "$fastq" \
        -type f \
        -name "*${sample}${samplesep}*.f*q*" \
        2> /dev/null |\
        head -n "$nfind" \
        || true
    fi
  )
  if [ -z "$fileStatus" ]
  then
    echo "not found"
    notFound=$((notFound + 1))
  else
    # printf prints the paths verbatim; echo -e would interpret any
    # backslashes contained in a file name.
    printf 'found at: \n%s\n' "$fileStatus"
    if [ "$doCopy" == "yes" ]
    then
      #--max-results can't be used together with --exec, so separate cp command
      # Split the hits on newlines only and pass them as separate, quoted
      # arguments, so paths containing spaces or glob characters are copied
      # intact. (Paths containing newlines are still not supported.)
      files=()
      while IFS= read -r hit
      do
        files+=("$hit")
      done <<< "$fileStatus"
      cp -fv -t "$output" -- "${files[@]}"
    fi
    printf '\n\n'
  fi
done < "${output}/samples.txt"
echo
if [ "$notFound" != "0" ]
then
  echo "$notFound sample(s) couldn't be found"
else
  echo "All $nsamples samples were found"
fi