Skip to content

Commit

Permalink
qualUpdate
Browse files Browse the repository at this point in the history
  • Loading branch information
telatin committed Mar 30, 2021
1 parent 53b7805 commit 32c5b25
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 46 deletions.
55 changes: 45 additions & 10 deletions docs/tools/2.13_usage_qual.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ scores
Options:
-m, --max INT Check the first INT reads [default: 2000]
-l, --maxlen INT Maximum read length [default: 300]
-p, --profile Print graphical average quality profile
-p, --profile Quality profile per position
-c, --colorbars Print graphical average quality profile
-v, --verbose Verbose output
--help Show this help
```
Expand All @@ -28,19 +29,53 @@ Check if a set of files is likely in Illumina 1.8 encoding:
```
seqfu qual data/primers/*
data/primers/16Snano_R1.fq.gz 40 71 Sanger;Illumina-1.8;
data/primers/16Snano_R2.fq.gz 35 71 Sanger;Illumina-1.8;
data/primers/16S_R1.fq.gz 40 71 Sanger;Illumina-1.8;
data/primers/16S_R2.fq.gz 35 71 Sanger;Illumina-1.8;
data/primers/artificial.fa.gz 0 0 Invalid Range
data/primers/artificial.fq.gz 73 73 Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;
data/primers/art_R1.fq.gz 73 73 Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;
data/primers/art_R2.fq.gz 73 73 Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;
data/primers/16Snano_R1.fq.gz 40.0 71.0 Sanger;Illumina-1.8; 66.37+/-8.63
data/primers/16Snano_R2.fq.gz 35.0 71.0 Sanger;Illumina-1.8; 65.05+/-9.54
data/primers/16S_R1.fq.gz 40.0 71.0 Sanger;Illumina-1.8; 66.26+/-8.62
data/primers/16S_R2.fq.gz 35.0 71.0 Sanger;Illumina-1.8; 64.72+/-9.72
data/primers/artificial.fa.gz 0.0 0.0 Invalid Range 0.00+/-0.00
data/primers/artificial.fq.gz 73.0 73.0 Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8; 73.00+/-0.00
data/primers/art_R1.fq.gz 73.0 73.0 Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8; 73.00+/-0.00
data/primers/art_R2.fq.gz 73.0 73.0 Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8; 73.00+/-0.00
```

The artifical datasets (`art*`) were designed to be compatible with most encodings,
while the `16S*` files are real Illumina 1.8 sequences.

## Output

For each file a 4 column string is printed:
* filename
* minimum quality value (no offset is used)
* maximum quality value (no offset)
* possible encoding
* Mean, StDev of the quality value (no offset)

## Per base statistics

With the `--profile` option tabular overview of the quality scores per
nucleotide position of the read is printed:

```text
#data/primers/16Snano_R1.fq.gz 40.0 71.0 Sanger;Illumina-1.8; 66.37+/-8.63
#Pos Min Max Mean StDev Skewness
0 27.0 34.0 33.95 0.50 -12.36
1 27.0 34.0 33.97 0.46 -14.78
2 11.0 34.0 33.73 2.28 -9.08
3 23.0 34.0 33.92 0.76 -13.09
4 31.0 34.0 33.99 0.20 -15.20
5 28.0 38.0 37.76 0.86 -7.20
6 10.0 38.0 36.26 3.98 -4.89
7 21.0 38.0 36.75 2.68 -3.88
8 10.0 38.0 36.06 4.29 -3.90
9 9.0 37.0 32.32 5.93 -2.26
10 8.0 37.0 31.14 4.25 -1.31
...
298 7.0 37.0 26.34 10.34 -0.58
299 7.0 37.0 25.62 11.23 -0.55
300 7.0 37.0 21.29 9.26 -0.09
```
## Graphical summary

With the `--profile` option a graphical (Unicode) colored histogram of the average
With the `--colorbar` option a graphical (Unicode) colored histogram of the average
quality per base position is printed after each file.
2 changes: 1 addition & 1 deletion docs/utilities/2.1_usage_primers.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ Usage: fu-primers [options] -1 <FOR> [-2 <REV>]
--pattern-R2 <tag-2> Tag in second pairs filenames [default: auto]
-v --verbose Verbose output
-h --help Show this help
```
```
72 changes: 38 additions & 34 deletions src/fastx_qual.nim
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,25 @@ import tables, strutils
from os import fileExists
import docopt
import ./seqfu_utils
import colorize

import strformat
import stats

proc rangesToTable(ranges: Table[string, tuple[rangeStart, rangeEnd: int]]): Table[int, string] =
echo "OK"

proc rangeToStr(min, max: int, t: Table[string, tuple[rangeStart, rangeEnd: int]]): string =
proc rangeToStr(min, max: float, t: Table[string, tuple[rangeStart, rangeEnd: int]]): string =
result = ""
for x,y in t:
if min >= y.rangeStart and max <= y.rangeEnd:
if min >= float(y.rangeStart) and max <= float(y.rangeEnd):
result &= x & ";"
if result == "":
result = "Invalid Range"

proc qualityProfile(sum,cnt: seq[int]): string =
for i, quality in sum:
if cnt[i] < 1:
proc qualityProfile(s: seq[RunningStat]): string =
for i, stats in s:
if stats.n == 0:
break
let avg = quality / cnt[i]
result &= qualToChar(int(avg))
result &= qualToChar(int(stats.mean))



Expand All @@ -43,8 +42,9 @@ scores
Options:
-m, --max INT Check the first INT reads [default: 2000]
-l, --maxlen INT Maximum read length [default: 300]
-p, --profile Print graphical average quality profile
-l, --maxlen INT Maximum read length [default: 1000]
-p, --profile Quality profile per position
-c, --colorbars Print graphical average quality profile
-v, --verbose Verbose output
--help Show this help
Expand All @@ -54,48 +54,52 @@ Options:
let
maxSeqs = parseInt($args["--max"])
maxLen = parseInt($args["--maxlen"])

comment = if args["--profile"]: "#"
else: ""
for file in @(args["<FASTQ>"]):
if args["--verbose"]:
stderr.writeLine("Parsing: ", file)

var count = 0
var min, max: int


if not fileExists(file):
stderr.writeLine("ERROR: File not found: ", file)
continue

try:
var
sumSeq = newSeq[int]()
cntSeq = newSeq[int]()

for i in 0 .. maxLen:
sumSeq.add(0)
cntSeq.add(0)

sttSeq = newSeq[RunningStat](maxLen + 1)
stats: RunningStat
for record in readfq(file):
count += 1
if count > maxSeqs:
break
for i, q in record.quality:
if count == 1:
min = q.ord
max = min
else:
if min > q.ord:
min = q.ord
if max < q.ord:
max = q.ord
cntSeq[i] += 1
sumSeq[i] += charToQual(q)

let encodingType = rangeToStr(min, max, ranges)
let
quality_ord = q.ord
quality_enc = charToQual(q)
sttSeq[i].push(quality_enc)
stats.push(quality_ord)
let encodingType = rangeToStr(stats.min, stats.max, ranges)

echo(file, "\t", min, "\t", max, "\t", encodingType)

echo(comment, file, "\t", stats.min, "\t", stats.max, "\t", encodingType, "\t", fmt"{stats.mean:.2f}+/-{stats.standardDeviationS:.2f}")
if args["--colorbars"]:
let profile = qualityProfile(sttSeq)
echo "#", qualToUnicode(profile, @[1, 10, 20, 25, 30, 35, 40], true)


if args["--profile"]:
let profile = qualityProfile(sumSeq, cntSeq)
echo qualToUnicode(profile, @[1, 10, 20, 25, 30, 35, 40], true)
echo("#Pos\tMin\tMax\tMean\tStDev\tSkewness")
var profString = ""
for pos, stats in sttSeq:
if stats.n == 0:
continue
profString &= fmt"{pos}" & "\t" & fmt"{stats.min:.1f}" & "\t" & fmt"{stats.max:.1f}" & "\t" & fmt"{stats.mean:.2f}" & "\t" & fmt"{stats.standardDeviationS:.2f}" & "\t" & fmt"{stats.skewness:.2f}" & "\n"
echo profString


except Exception as e:
stderr.writeLine("Error parsing ", file, ": ", e.msg)
2 changes: 1 addition & 1 deletion src/fastx_view.nim
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import ./seqfu_utils
import colorize


proc qualToUnicode(s: string, v: seq[int], color = false): string =
proc qualToUnicode*(s: string, v: seq[int], color = false): string =
# 1 2 3 4 5 6 7
# Low Mid Ok
for i in s:
Expand Down

0 comments on commit 32c5b25

Please sign in to comment.