forked from ExaScience/cl-elprep
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathelprep-utils.lisp
397 lines (382 loc) · 25 KB
/
elprep-utils.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
(in-package :elprep)
(in-simple-base-string-syntax)
(defun explain-flag (flag)
"Return a symbolic representation of the FLAG entry in a SAM file alignment line.
This is primarily for debugging purposes.
See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4."
(let ((result '()))
(macrolet ((test (&rest bits)
`(progn ,@(loop for bit in bits
for bitn = (symbol-name bit)
for bitk = (intern (subseq bitn 1 (1- (length bitn))) :keyword)
collect `(when (/= (logand flag ,bit) 0)
(push ,bitk result))))))
(test +supplementary+
+duplicate+
+qc-failed+
+secondary+
+last+
+first+
+next-reversed+
+reversed+
+next-unmapped+
+unmapped+
+proper+
+multiple+))
result))
(defun sam-alignment-differ (aln1 aln2)
"Return false if the two sam-alignments have the same mandatory fields.
Otherwise, return a symbol indicating which field differs."
(declare (sam-alignment aln1 aln2) #.*optimization*)
; check that all mandatory fields are =
(or (when (string/= (the base-string (sam-alignment-qname aln1)) (the base-string (sam-alignment-qname aln2))) 'qname)
(when (/= (the fixnum (sam-alignment-flag aln1)) (the fixnum (sam-alignment-flag aln2))) 'flag)
(when (string/= (the base-string (sam-alignment-rname aln1)) (the base-string (sam-alignment-rname aln2))) 'rname)
(when (/= (the int32 (sam-alignment-pos aln1)) (the int32 (sam-alignment-pos aln2))) 'pos)
(when (/= (sam-alignment-mapq aln1) (sam-alignment-mapq aln2)) 'mapq)
(when (string/= (the base-string (sam-alignment-cigar aln1)) (the base-string (sam-alignment-cigar aln2))) 'cigar)
(when (string/= (the base-string (sam-alignment-rnext aln1)) (the base-string (sam-alignment-rnext aln2))) 'rnext)
(when (string/= (the base-string (sam-alignment-qual aln1)) (the base-string (sam-alignment-qual aln2))) 'qual)))
(defun sam-alignment-same (aln1 aln2)
"Return true if the two sam-alignments have the same mandatory fields, false otherwise."
(declare (sam-alignment aln1 aln2) #.*optimization*)
(and (string= (the base-string (sam-alignment-qname aln1)) (the base-string (sam-alignment-qname aln2)))
(= (the fixnum (sam-alignment-flag aln1)) (the fixnum (sam-alignment-flag aln2)))
(string= (the base-string (sam-alignment-rname aln1)) (the base-string (sam-alignment-rname aln2)))
(= (the int32 (sam-alignment-pos aln1)) (the int32 (sam-alignment-pos aln2)))
(= (sam-alignment-mapq aln1) (sam-alignment-mapq aln2))
(string= (the base-string (sam-alignment-cigar aln1)) (the base-string (sam-alignment-cigar aln2)))
(string= (the base-string (sam-alignment-rnext aln1)) (the base-string (sam-alignment-rnext aln2)))
(string= (the base-string (sam-alignment-qual aln1)) (the base-string (sam-alignment-qual aln2)))))
(defun real-diffs (alns1 alns2)
"Return a list of sam-alignments in alns1 for which no alignments in alns2 exist that have the same mandatory fields."
(loop for aln1 in alns1
unless (find aln1 alns2 :test #'sam-alignment-same)
collect aln1))
(defun compare-sams (sam1-file sam2-file)
"Parse both SAM files, then compare the mandatory fields of all alignments one by one."
(let ((sam1 (make-sam))
(sam2 (make-sam))
(working-directory (get-working-directory)))
(run-pipeline (merge-pathnames sam1-file working-directory) sam1 :sorting-order :queryname)
(run-pipeline (merge-pathnames sam2-file working-directory) sam2 :sorting-order :queryname)
(format t "sam1:~s alns sam2:~s alns ~%" (length (sam-alignments sam1)) (length (sam-alignments sam2)))
(let ((differences1 nil)
(differences2 nil))
(loop for aln1 in (sam-alignments sam1) ; filter diffs
for aln2 in (sam-alignments sam2)
do (let ((d (sam-alignment-differ aln1 aln2)))
(when d
(push aln1 differences1)
(push aln2 differences2))))
(real-diffs differences1 differences2)))) ; sort slightly different order in elprep so get out real diffs
(defun verify-order-kept (sam-file)
"Assume the SAM file is sorted by coordinate order. Verify that this is stil the case."
(format t "verifying order kept ~%")
(let ((sam (make-sam))
(working-directory (get-working-directory)))
(run-pipeline (merge-pathnames sam-file working-directory) sam)
(let (pos rname qname ctr)
(do-sam-alignments (aln sam)
(if (null pos)
;; first alignment
(setq pos (sam-alignment-pos aln)
rname (sam-alignment-rname aln)
qname (sam-alignment-qname aln)
ctr 1)
;; remaining alignments
(let ((new-pos (sam-alignment-pos aln))
(new-rname (sam-alignment-rname aln))
(new-qname (sam-alignment-qname aln)))
(cond ((and (< new-pos pos) (string= rname new-rname ))
(format t "Not sorted: previous pos: ~s,~s,~s current pos: ~s,~s,~s. ~s reads were in the right order. ~%" qname rname pos new-qname new-rname new-pos ctr)
(return-from verify-order-kept nil))
(t
(incf ctr)
(setf rname new-rname)
(setf pos new-pos))))))
(return-from verify-order-kept t))))
(defun count-duplicates (sam-file)
"Return the number of alignments in the SAM file that are marked as duplicates."
(let ((sam (make-sam)))
(run-pipeline (merge-pathnames sam-file (get-working-directory)) sam)
(let ((count 0))
(do-sam-alignments (aln sam)
(when (sam-alignment-duplicate-p aln) (incf count)))
count)))
; code for splitting up sam files into chromosomes
(define-symbol-macro optional-data-tag "sr:i:1")
(defun split-file-per-chromosome (input output-path output-prefix output-extension)
"A function for splitting a sam file into : a file containing all unmapped reads, a file containing all pairs where reads map to different chromosomes, a file per chromosome containing all pairs where the reads map to that chromosome. There are no requirements on the input file for splitting."
(let ((files (directory input)))
(multiple-value-bind
(first-in first-program)
(open-sam (first files) :direction :input)
(let ((header (parse-sam-header first-in))
(splits-path (merge-pathnames (make-pathname :directory '(:relative "splits")) output-path))
(chroms-encountered (make-single-thread-hash-table :test #'buffer= :hash-function #'buffer-hash)) ; chr -> file name
(buf-unmapped (make-buffer "*")))
; create a directory for split files
(ensure-directories-exist splits-path)
; tag the header as one created with elPrep split
(setf (sam-header-user-tag header :|@sr|) (list (list :|co| "This file was created using elprep split.")))
; fill in a file for unmapped reads
(setf (gethash buf-unmapped chroms-encountered)
(multiple-value-bind
(file program) ; create a new headerless file
(open-sam (merge-pathnames splits-path (make-pathname :name (format nil "~a-unmapped" output-prefix) :type output-extension)) :direction :output)
(format-sam-header file header)
(cons file program)))
; create split files for each chromosome
(loop for sn-form in (sam-header-sq header)
do (assert (not (null sn-form)))
(let* ((chrom (getf sn-form :SN))
(buf-chrom (make-buffer chrom)))
(setf (gethash buf-chrom chroms-encountered)
(multiple-value-bind
(file program)
(open-sam (merge-pathnames splits-path (make-pathname :name (format nil "~a-~a" output-prefix chrom) :type output-extension)) :direction :output)
(format-sam-header file header)
(cons file program)))))
(unwind-protect
(with-open-sam (spread-reads-stream (merge-pathnames output-path (make-pathname :name (format nil "~a-spread" output-prefix) :type output-extension)) :direction :output)
(format-sam-header spread-reads-stream header)
(let ((buf-= (make-buffer "=")))
(let ((rname (make-buffer))
(rnext (make-buffer))
(aln-string (make-buffer)))
(flet ((process-file (in)
(loop do (read-line-into-buffer in aln-string)
until (buffer-emptyp aln-string)
do (progn (buffer-partition aln-string #\Tab 2 rname 6 rnext)
(let* ((file (car (gethash rname chroms-encountered))))
(cond ((or (or (buffer= buf-= rnext) (buffer= rname rnext)) ; read and mate map to the same chromosome;
(buffer= buf-unmapped rname)) ; read unmapped
(write-buffer aln-string file)
(write-newline file))
(t ; the read is part of a pair mapping to two different chromosomes
(write-buffer aln-string spread-reads-stream)
(write-newline spread-reads-stream)
; duplicate the info in the chromosome file so it can be used; mark the read as duplicate info
(write-buffer aln-string file)
(write-tab file)
(writestr file optional-data-tag)
(write-newline file))))))
(reinitialize-buffer rname)
(reinitialize-buffer rnext)
(reinitialize-buffer aln-string)))
(process-file first-in)
(close-sam first-in first-program)
(loop for in-file in (rest files)
do (with-open-sam (in in-file :direction :input)
(skip-sam-header in)
(process-file in)))))))
(loop for (file . program) being each hash-value of chroms-encountered
do (close-sam file program)))))))
(defun merge-sorted-files-split-per-chromosome (input-path output input-prefix input-extension header)
"A function for merging files that were split with elPrep and sorted in coordinate order."
; Extract the header to identify the files names.
; Assume that all files are sorted per cooordinate order, i.e. first sorted on refid entry according to sequence dictionary, then sorted on position entry.
; There is a file per chromosome in the sequence dictionary. These contain all reads that map to that chromosome.
; On top of that, there is a file that contains the unmapped (or *) reads and a file that contains the reads that map to different chromosomes.
; Merge these files in the order of the sequence dictionary. Put the unmapped reads as the last entries.
; When merging a particular chromosome file into the merged file, make sure that reads that map to different chromosomes are merged in correctly.
; So while mergin a particular chromosome file, pop and compare against reads in the file for reads that map to different chromosomes until the next chromosome
; is encountered on the refid position.
; when a file is empty, close it and remove it from the list of files to merge
; loop for identifying and opening the files to merge
; extract min/max from the header
; merge the files
(with-open-sam (spread-reads-file (merge-pathnames input-path (make-pathname :name (format nil "~a-spread" input-prefix) :type input-extension)) :direction :input)
(skip-sam-header spread-reads-file)
; merge loop
(with-open-sam (out output :direction :output)
(format-sam-header out header)
(let ((spread-read (make-buffer)) ; for storing entries from the spread-read file
(spread-read-refid (make-buffer))
(spread-read-pos (make-buffer))
(chromosome-read (make-buffer)) ; for storing reads from the chromsome file we are currently merging
(chromosome-read-refid (make-buffer))
(chromosome-read-pos (make-buffer))
(common-read-refid (make-buffer)))
; then merge the rest of the files
(loop for sn-form in (sam-header-sq header)
for chrom = (getf sn-form :SN)
for file-name = (merge-pathnames input-path (make-pathname :name (format nil "~a-~a" input-prefix chrom) :type input-extension))
when (probe-file file-name) do
(reinitialize-buffer common-read-refid)
(buffer-extend common-read-refid chrom)
(block chromosome-loop
(with-open-sam (file file-name :direction :input)
(skip-sam-header file)
(read-line-into-buffer file chromosome-read)
(when (buffer-emptyp chromosome-read) (return-from chromosome-loop))
(buffer-partition chromosome-read #\Tab 2 chromosome-read-refid 3 chromosome-read-pos)
(assert (buffer= chromosome-read-refid common-read-refid))
(when (buffer-emptyp spread-read) ; if the buffer is not empty, the current entry is potentially an entry for this file and it should not be overwritten
(read-line-into-buffer spread-reads-file spread-read)
(buffer-partition spread-read #\Tab 2 spread-read-refid 3 spread-read-pos))
(unless (buffer-emptyp spread-read)
(when (buffer= spread-read-refid chromosome-read-refid)
(let ((pos1 (buffer-parse-integer spread-read-pos))
(pos2 (buffer-parse-integer chromosome-read-pos)))
(loop do (cond ((< pos1 pos2)
(write-buffer spread-read out)
(write-newline out)
(read-line-into-buffer spread-reads-file spread-read)
(cond ((buffer-emptyp spread-read)
(loop-finish))
(t (buffer-partition spread-read #\Tab 2 spread-read-refid 3 spread-read-pos)
(setq pos1 (buffer-parse-integer spread-read-pos)))))
(t (write-buffer chromosome-read out)
(write-newline out)
(read-line-into-buffer file chromosome-read)
(cond ((buffer-emptyp chromosome-read)
(loop-finish))
(t (buffer-partition chromosome-read #\Tab 3 chromosome-read-pos)
(setq pos2 (buffer-parse-integer chromosome-read-pos))))))
while (buffer= chromosome-read-refid spread-read-refid)))))
; copy remaining reads in the file, if any
(when (not (buffer-emptyp chromosome-read))
(write-buffer chromosome-read out)
(write-newline out))
(copy-stream file out)))
; copy remaining reads in the spread file, if any, that are one the same chromosome as the file was
(when (not (buffer-emptyp spread-read))
(loop while (buffer= spread-read-refid common-read-refid)
do
(write-buffer spread-read out)
(write-newline out)
(read-line-into-buffer spread-reads-file spread-read)
(when (buffer-emptyp spread-read) (return))
(buffer-partition spread-read #\Tab 2 spread-read-refid)
finally
(buffer-partition spread-read #\Tab 3 spread-read-pos))))
; merge the remaining reads in the spread-reads file
(when (not (buffer-emptyp spread-read))
(write-buffer spread-read out)
(write-newline out))
(copy-stream spread-reads-file out))
; merge the unmapped reads
(with-open-sam (unmapped-file (merge-pathnames input-path (make-pathname :name (format nil "~a-unmapped" input-prefix) :type input-extension)) :direction :input)
(skip-sam-header unmapped-file)
(copy-stream unmapped-file out)))))
(defun merge-unsorted-files-split-per-chromosome (input-path output input-prefix input-extension header)
"A function for merging files that were split with elPrep and are unsorted"
; merge loop
(with-open-sam (out output :direction :output)
(format-sam-header out header)
; first merge the unmapped reads
(with-open-sam (unmapped-file (merge-pathnames input-path (make-pathname :name (format nil "~a-unmapped" input-prefix) :type input-extension)) :direction :input)
(skip-sam-header unmapped-file)
(copy-stream unmapped-file out))
; merge spread reads
(with-open-sam (spread-reads-file (merge-pathnames input-path (make-pathname :name (format nil "~a-spread" input-prefix) :type input-extension)) :direction :input)
(skip-sam-header spread-reads-file)
(copy-stream spread-reads-file out))
; merge the rest of the files
(loop for sn-form in (sam-header-sq header)
for chrom = (getf sn-form :SN)
for file-name = (merge-pathnames input-path (make-pathname :name (format nil "~a-~a" input-prefix chrom) :type input-extension))
when (probe-file file-name) do
(with-open-sam (file file-name :direction :input)
(skip-sam-header file)
(copy-stream file out)))))
(declaim (inline parse-sam-alignment-from-stream))
(defun parse-sam-alignment-from-stream (stream)
"Read a line from a stream and parse it as a SAM alignment line.
Return NIL if stream is at end of file.
See http://samtools.github.io/hts-specs/SAMv1.pdf - Section 1.4."
(let ((line (read-line stream nil)))
(when line (parse-sam-alignment line))))
(defun compare-sam-files (file1 file2 &optional (output "/dev/stdout"))
"A function for comparing two sam files. The input files must be sorted by coordinate order."
(labels ((get-alns (stream next-aln)
(loop with group-aln = (or next-aln (parse-sam-alignment-from-stream stream))
with alns = (list group-aln)
for aln = (parse-sam-alignment-from-stream stream)
while (and aln
(= (sam-alignment-pos group-aln)
(sam-alignment-pos aln))
(string= (sam-alignment-rname group-aln)
(sam-alignment-rname aln)))
do (push aln alns)
finally (return (values (sort alns (lambda (aln1 aln2)
(or (string< (sam-alignment-qname aln1)
(sam-alignment-qname aln2))
(when (string= (sam-alignment-qname aln1)
(sam-alignment-qname aln2))
(< (sam-alignment-flag aln1)
(sam-alignment-flag aln2))))))
aln))))
(plist-to-sorted-alist (plist)
(sort (loop for (key value) on plist by #'cddr collect (cons key value))
#'string< :key (lambda (object) (string (car object)))))
(compare-alns (out alns1 alns2)
(loop for aln1 in alns1
for aln2 in alns2
for difference = (cond ((string/= (sam-alignment-qname aln1)
(sam-alignment-qname aln2)) "qname (1)")
((/= (sam-alignment-flag aln1)
(sam-alignment-flag aln2)) "flag (2)")
((string/= (sam-alignment-rname aln1)
(sam-alignment-rname aln2)) "rname (3)")
((/= (sam-alignment-pos aln1)
(sam-alignment-pos aln2)) "pos (4)")
((/= (sam-alignment-mapq aln1)
(sam-alignment-mapq aln2)) "mapq (5)")
((string/= (sam-alignment-cigar aln1)
(sam-alignment-cigar aln2)) "cigar (6)")
((string/= (sam-alignment-rnext aln1)
(sam-alignment-rnext aln2)) "rnext (7)")
((/= (sam-alignment-pnext aln1)
(sam-alignment-pnext aln2)) "pnext (8)")
((/= (sam-alignment-tlen aln1)
(sam-alignment-tlen aln2)) "tlen (9)")
((string/= (sam-alignment-seq aln1)
(sam-alignment-seq aln2)) "seq (10)")
((string/= (sam-alignment-qual aln1)
(sam-alignment-qual aln2)) "qual (11)")
(t (let ((tags1 (plist-to-sorted-alist (sam-alignment-tags aln1)))
(tags2 (plist-to-sorted-alist (sam-alignment-tags aln2))))
(when (or (/= (length tags1) (length tags2))
(loop for (nil . val1) in tags1
for (nil . val2) in tags2
thereis (or (not (eq (type-of val1) (type-of val2)))
(etypecase val1
(character (char/= val1 val2))
(number (/= val1 val2))
(string (string/= val1 val2))
(array (not (equalp val1 val2)))))))
"optional tags"))))
when difference do
(format t "alignments differ for ~a entry: ~%" difference)
(format-sam-alignment *standard-output* aln1)
(format-sam-alignment *standard-output* aln2)
(format-sam-alignment out aln1)
(format-sam-alignment out aln2))))
(with-open-sam (in1 file1 :direction :input)
(skip-sam-header in1)
(with-open-sam (in2 file2 :direction :input)
(skip-sam-header in2)
(with-open-sam (out output :direction :output)
(loop
for prev-aln1 = nil
for prev-aln2 = nil
for alns1 = (multiple-value-bind (alns next) (get-alns in1 prev-aln1) (setf prev-aln1 next) alns)
for alns2 = (multiple-value-bind (alns next) (get-alns in2 prev-aln2) (setf prev-aln2 next) alns)
for l1 = (length alns1)
for l2 = (length alns2)
for index from 1
sum l1 into nr-of-reads-matched
while (or alns1 alns2) do
(if (= l1 l2)
(compare-alns out alns1 alns2)
(let ((pos (sam-alignment-pos (or (first alns1) (first alns2))))
(rname (sam-alignment-rname (or (first alns1) (first alns2)))))
(format t "Files contain an unequal number of read entries at the same position.~%")
(format t "File ~a has ~a reads at position ~a ~a.~%" file1 l1 rname pos)
(format t "File ~a has ~a reads at position ~a ~a.~%" file2 l2 rname pos)))
(when (zerop (mod index 1000000))
(format t "~a reads compared and matched.~%" nr-of-reads-matched))
finally (format t "~a reads compared and matched.~%" nr-of-reads-matched)))))))