-
Notifications
You must be signed in to change notification settings - Fork 1
/
Main.hs
271 lines (259 loc) · 10.2 KB
/
Main.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
-- Diversity
-- By G.W. Schwartz
{- | Takes a fasta file and return the diversity of a certain order and
window length (to split into fragments) by position.
-}
-- Built-in
import Data.List
import Control.Monad (forever, unless)
import qualified Data.Map.Strict as Map
import qualified System.IO as IO
-- Cabal
import Data.Semigroup ((<>))
import Data.Fasta.String
import Pipes
import qualified Pipes.Prelude as P
import Options.Applicative
-- Local
import Math.Diversity.Types
import Math.Diversity.GenerateDiversity
import Math.Diversity.Print
-- Command line arguments
data Options = Options { inputLabel :: String
, inputOrder :: Double
, inputWindow :: Int
, inputFasta :: Maybe String
, inputSampleField :: Int
, inputCountField :: Int
, inputSubsampling :: String
, inputG :: Double
, fastBin :: Bool
, runs :: Int
, gapsFlag :: Bool
, removeNBool :: Bool
, wholeSeq :: Bool
, list :: Bool
, sample :: Bool
, rarefactionDF :: Bool
, std :: Bool
, outputRarefaction :: String
, outputRarefactionCurve :: String
, output :: String
}
-- Command line options
options :: Parser Options
options = Options
<$> strOption
( long "input-label"
<> short 'l'
<> metavar "LABEL"
<> value ""
<> help "The label for this particular dataset (to differentiate\
\ the file in batch analyses)" )
<*> option auto
( long "input-order"
<> short 'r'
<> metavar "[1] | INT"
<> value 1
<> help "The order of true diversity" )
<*> option auto
( long "input-window"
<> short 'w'
<> metavar "[1] | INT"
<> value 1
<> help "The length of the sliding window for generating fragments" )
<*> optional ( strOption
( long "input-fasta"
<> short 'i'
<> metavar "[Nothing] | FILE"
<> help "The fasta file containing the germlines and clones" )
)
<*> option auto
( long "input-sample-field"
<> short 'S'
<> metavar "INT"
<> value 1
<> help "The index for the sample ID in the header separated by '|'\
\ (1 indexed)" )
<*> option auto
( long "input-count-field"
<> short 'C'
<> metavar "INT"
<> value 0
<> help "The index for the number of this type in the header separated\
\ by '|' (1 indexed). Used if there are multiple copies\
\ of one entry, so a '4' in the header would indicate that\
\ this entity occurred 4 times. Defaults to 0, meaning that\
\ this field is ignored and count each sequence as occurring\
\ just once" )
<*> strOption
( long "input-subsampling"
<> short 'I'
<> metavar "INT INT (INT)"
<> value "1 1"
<> help "The start point, interval, and optional endpoint of\
\ subsamples in the rarefaction curve.\
\ For instance, '1 1 4' would be 1, 2, 3, 4\
\ '2 6 14' would be 2, 8, 14, ... Note: input is a string so\
\ use quotations around the entry and it always includes the\
\ number of subsamples overall in the result. Excluding the\
\ endpoint results in the number of samples the endpoint, so\
\ '1 1' would be 1, 2, 3, ..., N " )
<*> option auto
( long "input-g"
<> short 'g'
<> metavar "Double"
<> value 0.95
<> help "Used for calculating the number of individuals\
\ (or samples) needed before the proportion g of the total\
\ number of estimated species is reached.\
\ Sobs / Sest < g < 1" )
<*> switch
( long "fast-bin"
<> short 'f'
<> help "Whether to use a much faster, but approximated, binomial\
\ coefficient for the rarefaction analysis. This method\
\ results in NaNs for larger numbers, so in that case you\
\ you should use the slower, more accurate default method" )
<*> option auto
( long "input-runs"
<> short 'R'
<> metavar "INT"
<> value 0
<> help "The number of runs for empirical resampling rarefaction.\
\ This method does not compute the theoretical, it reports the\
\ actual median and median absolute deviation (MAD) values of\
\ this many runs. If this value is not 0, empirical\
\ rarefaction is automatically enabled (individual based only,\
\ not for sample based)" )
<*> switch
( long "keep-gaps"
<> short 'G'
<> help "Do not remove '.' and '-' characters from the analysis. This\
\ flag will thus treat these characters as additional entities\
\ rather than be ignored as artificial biological gaps in\
\ a sequence" )
<*> switch
( long "remove-N"
<> short 'n'
<> help "Remove 'N' and 'n' characters" )
<*> switch
( long "whole-sequence"
<> short 'a'
<> help "Ignore window length and only analyze the entire sequence for\
\ diversity and rarefaction curves." )
<*> switch
( long "list"
<> short 'L'
<> help "Analyze a diversity of species in a list separated by lines\
\ instead of a fasta file" )
<*> switch
( long "sample"
<> short 's'
<> help "Whether to use sample based rarefaction (requires sample ID\
\ field from input-sample-field)" )
<*> switch
( long "rarefaction-df"
<> short 'd'
<> help "Whether to output the rarefaction curve as a data frame" )
<*> switch
( long "std"
<> short 't'
<> help "Whether to output to stdout or to a file if no file is\
\ supplied. Must still put some string in -O, -c, or -o\
\ depending on which output is needed."
)
<*> strOption
( long "output-rarefaction"
<> short 'O'
<> metavar "FILE"
<> value ""
<> help "The csv file containing the rarefaction values (the percent\
\ of the rarefaction curve that is above 95% of the height of\
\ the rarefaction curve). Expects a string, so you need a\
\ string even with std" )
<*> strOption
( long "output-rarefaction-curve"
<> short 'c'
<> metavar "FILE"
<> value ""
<> help "The csv file containing the rarefaction curve. Expects a\
\ a string, so you need a string even with std" )
<*> strOption
( long "output"
<> short 'o'
<> metavar "FILE"
<> value ""
<> help "The csv file containing the diversities at each position.\
\ expects a string, so you need a string even with std" )
parseSampling :: (Num a, Read a) => String -> [a]
parseSampling = map read . parsing . words
where
parsing [x, y] = [x, y, "0"]
parsing x = x
pipesPositionMap :: Options -> IO PositionMap
pipesPositionMap opts = do
h <- case inputFasta opts of
Nothing -> return IO.stdin
(Just x) -> IO.openFile x IO.ReadMode
runEffect
$ P.fold (Map.unionWith (Map.unionWith (+))) Map.empty id
$ P.fromHandle h
>-> toFastaSequence (list opts) h
>-> P.map ( generatePositionMap
(gapsFlag opts)
(sample opts)
(inputSampleField opts)
(inputCountField opts)
(wholeSeq opts)
(inputWindow opts)
. removals )
where
toFastaSequence True _ = P.map ( \x -> FastaSequence { fastaHeader = ""
, fastaSeq = x } )
toFastaSequence False h = pipesFasta h
removals = if removeNBool opts then removeN else id
generateDiversity :: Options -> IO ()
generateDiversity opts = do
let contents = parseFasta
label = inputLabel opts
order = inputOrder opts
window = inputWindow opts
nFlag = removeNBool opts
whole = wholeSeq opts
[start, interval, end] = parseSampling . inputSubsampling $ opts
howToOutput x = if std opts then putStrLn else writeFile x
positionMap <- pipesPositionMap opts
unless (null . output $ opts)
$ howToOutput (output opts)
. printDiversity label order window
$ positionMap
unless (null . outputRarefaction $ opts)
$ do
s <- printRarefaction
(sample opts)
(inputG opts)
label
window
positionMap
howToOutput (outputRarefaction opts) s
unless (null . outputRarefactionCurve $ opts)
$ do
s <- printRarefactionCurve
(rarefactionDF opts)
(sample opts)
(fastBin opts)
(runs opts)
start
interval
end
label
window
positionMap
howToOutput (outputRarefactionCurve opts) s
main :: IO ()
main = execParser opts >>= generateDiversity
where
opts = info (helper <*> options)
( fullDesc
<> progDesc "Quantify the diversity of a population." )