qualUpdate

telatin · Mar 30, 2021 · 32c5b25 · 32c5b25
1 parent 53b7805
commit 32c5b25
Show file tree

Hide file tree

Showing 4 changed files with 85 additions and 46 deletions.
diff --git a/docs/tools/2.13_usage_qual.md b/docs/tools/2.13_usage_qual.md
@@ -16,7 +16,8 @@ scores
 Options:
   -m, --max INT          Check the first INT reads [default: 2000]
   -l, --maxlen INT       Maximum read length [default: 300]
-  -p, --profile          Print graphical average quality profile
+  -p, --profile          Quality profile per position
+  -c, --colorbars        Print graphical average quality profile
   -v, --verbose          Verbose output
   --help                 Show this help
 ```
@@ -28,19 +29,53 @@ Check if a set of files is likely in Illumina 1.8 encoding:
 ```
 seqfu qual data/primers/*
 
-data/primers/16Snano_R1.fq.gz   40      71      Sanger;Illumina-1.8;
-data/primers/16Snano_R2.fq.gz   35      71      Sanger;Illumina-1.8;
-data/primers/16S_R1.fq.gz       40      71      Sanger;Illumina-1.8;
-data/primers/16S_R2.fq.gz       35      71      Sanger;Illumina-1.8;
-data/primers/artificial.fa.gz   0       0       Invalid Range
-data/primers/artificial.fq.gz   73      73      Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;
-data/primers/art_R1.fq.gz       73      73      Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;
-data/primers/art_R2.fq.gz       73      73      Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;
+data/primers/16Snano_R1.fq.gz   40.0    71.0    Sanger;Illumina-1.8;    66.37+/-8.63
+data/primers/16Snano_R2.fq.gz   35.0    71.0    Sanger;Illumina-1.8;    65.05+/-9.54
+data/primers/16S_R1.fq.gz       40.0    71.0    Sanger;Illumina-1.8;    66.26+/-8.62
+data/primers/16S_R2.fq.gz       35.0    71.0    Sanger;Illumina-1.8;    64.72+/-9.72
+data/primers/artificial.fa.gz   0.0     0.0     Invalid Range   0.00+/-0.00
+data/primers/artificial.fq.gz   73.0    73.0    Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;   73.00+/-0.00
+data/primers/art_R1.fq.gz       73.0    73.0    Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;   73.00+/-0.00
+data/primers/art_R2.fq.gz       73.0    73.0    Illumina-1.3;Sanger;Illumina-1.5;Solexa;Illumina-1.8;   73.00+/-0.00
 ```
+
 The artifical datasets (`art*`) were designed to be compatible with most encodings,
 while the `16S*` files are real Illumina 1.8 sequences.
 
+## Output
+
+For each file a 4 column string is printed:
+* filename
+* minimum quality value (no offset is used)
+* maximum quality value (no offset)
+* possible encoding
+* Mean, StDev of the quality value (no offset)
+
+## Per base statistics
+
+With the `--profile` option tabular overview of the quality scores per
+nucleotide position of the read is printed:
+
+```text
+#data/primers/16Snano_R1.fq.gz  40.0    71.0    Sanger;Illumina-1.8;    66.37+/-8.63
+#Pos    Min     Max     Mean    StDev   Skewness
+0       27.0    34.0    33.95   0.50    -12.36
+1       27.0    34.0    33.97   0.46    -14.78
+2       11.0    34.0    33.73   2.28    -9.08
+3       23.0    34.0    33.92   0.76    -13.09
+4       31.0    34.0    33.99   0.20    -15.20
+5       28.0    38.0    37.76   0.86    -7.20
+6       10.0    38.0    36.26   3.98    -4.89
+7       21.0    38.0    36.75   2.68    -3.88
+8       10.0    38.0    36.06   4.29    -3.90
+9       9.0     37.0    32.32   5.93    -2.26
+10      8.0     37.0    31.14   4.25    -1.31
+...
+298     7.0     37.0    26.34   10.34   -0.58
+299     7.0     37.0    25.62   11.23   -0.55
+300     7.0     37.0    21.29   9.26    -0.09
+```
 ## Graphical summary
 
-With the `--profile` option a graphical (Unicode) colored histogram of the average
+With the `--colorbar` option a graphical (Unicode) colored histogram of the average
 quality per base position is printed after each file.
diff --git a/docs/utilities/2.1_usage_primers.md b/docs/utilities/2.1_usage_primers.md
@@ -25,4 +25,4 @@ Usage: fu-primers [options] -1 <FOR> [-2 <REV>]
     --pattern-R2 <tag-2>      Tag in second pairs filenames [default: auto]
     -v --verbose              Verbose output
     -h --help                 Show this help
-```
+```       
diff --git a/src/fastx_qual.nim b/src/fastx_qual.nim
@@ -3,26 +3,25 @@ import tables, strutils
 from os import fileExists
 import docopt
 import ./seqfu_utils
-import colorize
-
+import strformat
+import stats
 
 proc rangesToTable(ranges: Table[string, tuple[rangeStart, rangeEnd: int]]): Table[int, string] =
   echo "OK"
 
-proc rangeToStr(min, max: int, t: Table[string, tuple[rangeStart, rangeEnd: int]]): string =
+proc rangeToStr(min, max: float, t: Table[string, tuple[rangeStart, rangeEnd: int]]): string =
   result = ""
   for x,y in t:
-    if min >= y.rangeStart and max <= y.rangeEnd:
+    if min >= float(y.rangeStart) and max <= float(y.rangeEnd):
       result &= x & ";"
   if result == "":
     result = "Invalid Range"
 
-proc qualityProfile(sum,cnt: seq[int]): string =
-  for i, quality in sum:
-    if cnt[i] < 1:
+proc qualityProfile(s: seq[RunningStat]): string =
+  for i, stats in s:
+    if stats.n == 0:
       break
-    let avg = quality / cnt[i]
-    result &= qualToChar(int(avg))
+    result &= qualToChar(int(stats.mean))
 
 
 
@@ -43,8 +42,9 @@ scores
 
 Options:
   -m, --max INT          Check the first INT reads [default: 2000]
-  -l, --maxlen INT       Maximum read length [default: 300]
-  -p, --profile          Print graphical average quality profile
+  -l, --maxlen INT       Maximum read length [default: 1000]
+  -p, --profile          Quality profile per position
+  -c, --colorbars        Print graphical average quality profile
   -v, --verbose          Verbose output
   --help                 Show this help
 
@@ -54,48 +54,52 @@ Options:
     let
       maxSeqs = parseInt($args["--max"])
       maxLen  = parseInt($args["--maxlen"])
-
+      comment = if args["--profile"]: "#"
+                else: ""
     for file in @(args["<FASTQ>"]):
       if args["--verbose"]:
         stderr.writeLine("Parsing: ", file)
 
       var count = 0
-      var min, max: int 
+
 
       if not fileExists(file):
         stderr.writeLine("ERROR: File not found: ", file)
         continue
 
       try:
         var
-          sumSeq = newSeq[int]()
-          cntSeq = newSeq[int]() 
-
-        for i in 0 .. maxLen:
-          sumSeq.add(0)
-          cntSeq.add(0)   
 
+          sttSeq = newSeq[RunningStat](maxLen + 1)
+          stats: RunningStat
         for record in readfq(file):
           count += 1
           if count > maxSeqs:
             break
           for i, q in record.quality:
-            if count == 1:
-              min = q.ord
-              max = min
-            else:
-              if min > q.ord:
-                min = q.ord
-              if max < q.ord:
-                max = q.ord
-            cntSeq[i] += 1
-            sumSeq[i] += charToQual(q)
-
-        let encodingType = rangeToStr(min, max, ranges)
+            let 
+              quality_ord = q.ord
+              quality_enc = charToQual(q)
+            sttSeq[i].push(quality_enc)
+            stats.push(quality_ord)
+        let encodingType = rangeToStr(stats.min, stats.max, ranges)
 
-        echo(file, "\t", min, "\t", max, "\t", encodingType)
+
+        echo(comment, file, "\t", stats.min, "\t", stats.max, "\t", encodingType, "\t", fmt"{stats.mean:.2f}+/-{stats.standardDeviationS:.2f}")
+        if args["--colorbars"]:
+          let profile = qualityProfile(sttSeq)
+          echo "#",  qualToUnicode(profile, @[1, 10, 20, 25, 30, 35, 40], true)
+
+
         if args["--profile"]:
-          let profile = qualityProfile(sumSeq, cntSeq)
-          echo qualToUnicode(profile, @[1, 10, 20, 25, 30, 35, 40], true)
+          echo("#Pos\tMin\tMax\tMean\tStDev\tSkewness")
+          var profString = ""
+          for pos, stats in sttSeq:
+            if stats.n == 0:
+              continue
+            profString &= fmt"{pos}" & "\t" & fmt"{stats.min:.1f}" & "\t" & fmt"{stats.max:.1f}" & "\t" & fmt"{stats.mean:.2f}" & "\t" & fmt"{stats.standardDeviationS:.2f}" & "\t" & fmt"{stats.skewness:.2f}" & "\n"
+          echo profString
+
+
       except Exception as e:
         stderr.writeLine("Error parsing ", file, ": ", e.msg)
diff --git a/src/fastx_view.nim b/src/fastx_view.nim
@@ -6,7 +6,7 @@ import ./seqfu_utils
 import colorize
 
 
-proc qualToUnicode(s: string, v: seq[int], color = false): string =
+proc qualToUnicode*(s: string, v: seq[int], color = false): string =
   #  1   2   3   4   5   6   7
   #  Low     Mid     Ok
   for i in s: