update to v2.1.7

YuSugihara · Apr 7, 2020 · 1598551 · 1598551
1 parent dab6614
commit 1598551
Show file tree

Hide file tree

Showing 6 changed files with 156 additions and 56 deletions.
diff --git a/README.md b/README.md
@@ -74,7 +74,7 @@ $ mutmap -h
 usage: mutmap -r <FASTA> -c <BAM|FASTQ> -b <BAM|FASTQ>
               -n <INT> -o <OUT_DIR> [-T] [-e <DATABASE>]
 
-MutMap version 2.1.6
+MutMap version 2.1.7
 
 optional arguments:
   -h, --help         show this help message and exit
@@ -101,7 +101,7 @@ optional arguments:
   -d , --min-depth   Minimum depth of variants which will be used.
                      This cutoff will be applied in both of cultivar
                      and bulk. [8]
-  -N , --N-rep       Number of replicates for simulation to make
+  -N , --N-rep       Number of replicates for simulation to make 
                      null distribution. [5000]
   -T, --trim         Trim fastq using trimmomatic.
   -a , --adapter     FASTA of adapter sequences. This will be used
@@ -123,6 +123,11 @@ optional arguments:
   -Q , --min-BQ      Minimum base quality in mpileup. [18]
   -C , --adjust-MQ   "adjust-MQ" in mpileup. Default parameter
                      is suited for BWA. [50]
+  --species          Consider multiple test correction derived by
+                     Huang et al. (2019). Please spesify a species name.
+                     With this option. QTL-seq produces a theoretical threshold.
+                     Currently, Arabidopsis, Cucumber, Maize, Rapeseed,
+                     Rice, Tobacco, Tomato, Wheat, and Yeast are supported.
   -v, --version      show program's version number and exit
 ```
 
@@ -216,9 +221,9 @@ $ mutplot -h
 
 usage: mutplot -v <VCF> -o <OUT_DIR> -n <INT> [-w <INT>] [-s <INT>]
                [-D <INT>] [-d <INT>] [-N <INT>] [-m <FLOAT>]
-               [-S <INT>] [-e <DATABASE>] [--igv] [--corr] [--indel]
+               [-S <INT>] [-e <DATABASE>] [--igv] [--indel]
 
-MutPlot version 2.1.6
+MutPlot version 2.1.7
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -234,7 +239,7 @@ optional arguments:
   -d , --min-depth      Minimum depth of variants which will be used.
                         This cutoff will be applied in both of cultivar
                         and bulk. [8]
-  -N , --N-rep          Number of replicates for simulation to make
+  -N , --N-rep          Number of replicates for simulation to make 
                         null distribution. [5000]
   -m , --min-SNPindex   Cutoff of minimum SNP-index for clear results. [0.3]
   -S , --strand-bias    Filter spurious homo genotypes in cultivar using
@@ -245,10 +250,11 @@ optional arguments:
   -e , --snpEff         Predict causal variant using SnpEff. Please
                         check available databases in SnpEff.
   --igv                 Output IGV format file to check results on IGV.
-  --corr                Use the corrected threshold in Huang et al. (2019).
-                        Please spesify mu_alpha_2 in Huang et al. (2019).
-                        When you specify this option, p99 and p95 has the
-                        same value.
+  --species             Consider multiple test correction derived by
+                        Huang et al. (2019). Please spesify a species name.
+                        With this option. QTL-seq produces a theoretical threshold.
+                        Currently, Arabidopsis, Cucumber, Maize, Rapeseed,
+                        Rice, Tobacco, Tomato, Wheat, and Yeast are supported.
   --indel               Plot SNP-index with INDEL.
   --fig-width           Width allocated in chromosome figure. [7.5]
   --fig-height          Height allocated in chromosome figure. [4.0]
@@ -294,6 +300,8 @@ Inside of `OUT_DIR` is like below.
 |   `-- mutmap.vcf.gz.tbi
 |-- 40_mutmap
 |   |-- snp_index.tsv
+│   ├── snp_index.p95.tsv
+│   ├── snp_index.p99.tsv
 |   |-- sliding_window.tsv
 │   ├── sliding_window.p95.tsv
 │   ├── sliding_window.p99.tsv

diff --git a/mutmap/__init__.py b/mutmap/__init__.py
@@ -1 +1 @@
-__version__ = "2.1.6"
+__version__ = "2.1.7"
diff --git a/mutmap/mutmap.py b/mutmap/mutmap.py
@@ -107,38 +107,26 @@ def mpileup(self):
         mp.run()
 
     def mutplot(self):
-        if args.snpEff is None:
-            cmd = 'mutplot -v {0}/30_vcf/mutmap.vcf.gz \
-                           -n {1} \
-                           -w {2} \
-                           -s {3} \
-                           -N {4} \
-                           -D {5} \
-                           -d {6} \
-                           -o {0}/40_mutmap'.format(self.out,
-                                                    self.args.N_bulk,
-                                                    self.args.window,
-                                                    self.args.step,
-                                                    self.args.N_rep,
-                                                    self.args.max_depth,
-                                                    self.args.min_depth)
-        else:
-            cmd = 'mutplot -v {0}/30_vcf/mutmap.vcf.gz \
-                           -n {1} \
-                           -w {2} \
-                           -s {3} \
-                           -N {4} \
-                           -D {5} \
-                           -d {6} \
-                           -o {0}/40_mutmap \
-                           -e {7}'.format(self.out,
-                                          self.args.N_bulk,
-                                          self.args.window,
-                                          self.args.step,
-                                          self.args.N_rep,
-                                          self.args.max_depth,
-                                          self.args.min_depth,
-                                          self.args.snpEff)
+        cmd = 'mutplot -v {0}/30_vcf/mutmap.vcf.gz \
+                        -n {1} \
+                        -w {2} \
+                        -s {3} \
+                        -N {4} \
+                        -D {5} \
+                        -d {6} \
+                        -o {0}/40_mutmap'.format(self.out,
+                                                self.args.N_bulk,
+                                                self.args.window,
+                                                self.args.step,
+                                                self.args.N_rep,
+                                                self.args.max_depth,
+                                                self.args.min_depth)
+
+        if self.args.snpEff is not None:
+            cmd = cmd + ' -e {}'.format(self.args.snpEff)
+
+        if self.args.species is not None:
+            cmd = cmd + ' --species {}'.format(self.args.species)
 
         cmd = clean_cmd(cmd)
         p = sbp.Popen(cmd,

diff --git a/mutmap/mutplot.py b/mutmap/mutplot.py
@@ -45,6 +45,39 @@ def run_snpEff(self):
 
         self.args.vcf = '{}/mutmap.snpEff.vcf'.format(self.out)
 
+    def get_outlier_SNPindex(self):
+        if self.snpEff is None:
+            snp_index = pd.read_csv('{}/snp_index.tsv'.format(self.out),
+                                    sep='\t',
+                                    names=['CHROM',
+                                           'POSI',
+                                           'variant',
+                                           'depth',
+                                           'p99',
+                                           'p95',
+                                           'SNPindex'])
+        else:
+            snp_index = pd.read_csv('{}/snp_index.tsv'.format(self.out),
+                                    sep='\t',
+                                    names=['CHROM',
+                                           'POSI',
+                                           'variant',
+                                           'impact',
+                                           'depth',
+                                           'p99',
+                                           'p95',
+                                           'SNPindex'])
+
+        snp_index[abs(snp_index['p99']) <= \
+                  abs(snp_index['SNPindex'])].to_csv('{}/snp_index.p99.tsv'.format(self.out),
+                                                     sep='\t',
+                                                     index=False)
+
+        snp_index[abs(snp_index['p95']) <= \
+                  abs(snp_index['SNPindex'])].to_csv('{}/snp_index.p95.tsv'.format(self.out),
+                                                     sep='\t',
+                                                     index=False)
+
     def get_outlier_windows(self):
         sliding_window = pd.read_csv('{}/sliding_window.tsv'.format(self.out),
                                      sep='\t',
@@ -143,6 +176,7 @@ def run(self):
         pt = Plot(self.args)
         pt.run()
 
+        self.get_outlier_SNPindex()
         self.get_outlier_windows()
 
         if self.args.igv:

diff --git a/mutmap/params.py b/mutmap/params.py
@@ -202,6 +202,24 @@ def mutmap_options(self):
                                   'is suited for BWA. [50]'),
                             metavar='')
 
+        parser.add_argument('--species',
+                            action='store',
+                            choices=['Arabidopsis', 
+                                     'Cucumber', 
+                                     'Maize', 
+                                     'Rapeseed', 
+                                     'Rice', 
+                                     'Tobacco', 
+                                     'Tomato', 
+                                     'Wheat', 
+                                     'Yeast'],
+                            help=('Consider multiple test correction derived by\n'
+                                  'Huang et al. (2019). Please spesify a species name.\n'
+                                  'With this option. QTL-seq produces a theoretical threshold.\n'
+                                  'Currently, Arabidopsis, Cucumber, Maize, Rapeseed,\n'
+                                  'Rice, Tobacco, Tomato, Wheat, and Yeast are supported.'),
+                            metavar='')
+
         # set version
         parser.add_argument('-v',
                             '--version',
@@ -215,7 +233,7 @@ def mutplot_options(self):
                                          formatter_class=argparse.RawTextHelpFormatter)
         parser.usage = ('mutplot -v <VCF> -o <OUT_DIR> -n <INT> [-w <INT>] [-s <INT>]\n'
                         '               [-D <INT>] [-d <INT>] [-N <INT>] [-m <FLOAT>]\n'
-                        '               [-S <INT>] [-e <DATABASE>] [--igv] [--corr] [--indel]')
+                        '               [-S <INT>] [-e <DATABASE>] [--igv] [--indel]\n')
 
         # set options
         parser.add_argument('-v',
@@ -321,13 +339,22 @@ def mutplot_options(self):
                             default=False,
                             help='Output IGV format file to check results on IGV.')
 
-        parser.add_argument('--corr',
+        parser.add_argument('--species',
                             action='store',
-                            type=float,
-                            help=('Use the corrected threshold in Huang et al. (2019).\n'
-                                  'Please spesify mu_alpha_2 in Huang et al. (2019).\n'
-                                  'When you specify this option, p99 and p95 has the\n'
-                                  'same value.'),
+                            choices=['Arabidopsis', 
+                                     'Cucumber', 
+                                     'Maize', 
+                                     'Rapeseed', 
+                                     'Rice', 
+                                     'Tobacco', 
+                                     'Tomato', 
+                                     'Wheat', 
+                                     'Yeast'],
+                            help=('Consider multiple test correction derived by\n'
+                                  'Huang et al. (2019). Please spesify a species name.\n'
+                                  'With this option. QTL-seq produces a theoretical threshold.\n'
+                                  'Currently, Arabidopsis, Cucumber, Maize, Rapeseed,\n'
+                                  'Rice, Tobacco, Tomato, Wheat, and Yeast are supported.'),
                             metavar='')
 
         parser.add_argument('--indel',

diff --git a/mutmap/vcf2index.py b/mutmap/vcf2index.py
@@ -16,7 +16,7 @@ def __init__(self, args):
         self.out = args.out
         self.vcf = args.vcf
         self.snpEff = args.snpEff
-        self.corr = args.corr
+        self.species = args.species
         self.N_bulk = args.N_bulk
         self.N_replicates = args.N_rep
         self.min_SNPindex = args.min_SNPindex
@@ -25,9 +25,51 @@ def __init__(self, args):
         if self.snpEff is not None:
             self.ANN_re = re.compile(';ANN=(.*);*')
 
-        if self.corr is not None:
-            var_fix = 0.25/(2*self.N_bulk)
-            self.threshold = self.corr*(var_fix**(1/2)) + 0.5
+        if self.species is not None:
+            self.u99, self.u95 = self.correct_threshold()
+
+    def correct_threshold(self):
+        if self.species == 'Arabidopsis':
+            u99 = 3.82
+            u95 = 3.41
+
+        elif self.species == 'Cucumber':
+            u99 = 4.02
+            u95 = 3.62
+
+        elif self.species == 'Maize':
+            u99 = 4.11
+            u95 = 3.72
+
+        elif self.species == 'Rapeseed':
+            u99 = 4.16
+            u95 = 3.78
+
+        elif self.species == 'Rice':
+            u99 = 4.05
+            u95 = 3.65
+
+        elif self.species == 'Tobacco':
+            u99 = 4.22
+            u95 = 3.84
+
+        elif self.species == 'Tomato':
+            u99 = 4.04
+            u95 = 3.65
+
+        elif self.species == 'Wheat':
+            u99 = 4.21
+            u95 = 3.83
+
+        elif self.species == 'Yeast':
+            u99 = 4.31
+            u95 = 3.93
+
+        else:
+            print('You specified not supported species.', file=sys.stderr)
+            sys.exit(1)
+
+        return u99, u95
 
     def get_field(self):
         root, ext = os.path.splitext(self.vcf)
@@ -142,10 +184,11 @@ def calc_SNPindex(self, field_pos):
                 if record['type'] == 'keep':
                     variant = self.check_variant_type(REF, ALT)
 
-                    if self.corr is not None:
-                        p99, p95 = self.threshold, self.threshold
-                    else:
+                    if self.species is None:
                         p99, p95 = self.F2_simulation(record['bulk_depth'])
+                    else:
+                        p99 = self.u99*0.25/record['bulk_depth'] + 0.5
+                        p95 = self.u95*0.25/record['bulk_depth'] + 0.5
 
                     if self.snpEff is None:
                         snp_index.write(('{}\t{}\t{}\t{}\t'