Merge pull request #251 from sestaton/master

Add sunflower models
Gaius-Augustus · Jan 20, 2021 · 5e06af7 · 5e06af7
2 parents b4830c6 + 893e0b2
commit 5e06af7
Show file tree

Hide file tree

Showing 8 changed files with 13,063 additions and 0 deletions.
diff --git a/config/species/sunflower/sunflower_exon_probs.pbl b/config/species/sunflower/sunflower_exon_probs.pbl
diff --git a/config/species/sunflower/sunflower_igenic_probs.pbl b/config/species/sunflower/sunflower_igenic_probs.pbl
diff --git a/config/species/sunflower/sunflower_intron_probs.pbl b/config/species/sunflower/sunflower_intron_probs.pbl
diff --git a/config/species/sunflower/sunflower_metapars.cfg b/config/species/sunflower/sunflower_metapars.cfg
@@ -0,0 +1,48 @@
+# This file contains the list of meta parameters for the coding regions (CDS) which are subject to optimization. 
+# All other meta parameters are chosen as given in the species parameter file. The order 
+# of the parameters determines the order in the optimization process.
+# Basically, different values for these meta parameters are tried out and the ones
+# giving best performance in a cross-validation on the training set are chosen.
+# For each parameter the range of possible values is specified after the parameter
+# name and at least one white space.
+# 3 cases are possible for the range:
+# - an explicit list is given, e.g. protein	"on" "off"
+# - it is an integer range, e.g. window_size	"1"-"5"
+# - it is a range of floating point numbers, e.g. pseudocount	"0.3"_"1.8"
+#
+# 
+# Mario Stanke, 19.12.2006
+#
+
+/Constant/dss_end		"1"-"4"
+/Constant/dss_start		"1"-"3"
+/Constant/ass_start		"1"-"3"
+/Constant/ass_end		"0"-"4"
+/Constant/ass_upwindow_size	"1"-"50"
+/IntronModel/d                  "100"-"950"
+/IntronModel/ass_motif_memory	"0"-"3"
+/IntronModel/ass_motif_radius	"0"-"4"
+/ExonModel/tis_motif_memory	"0"-"3"
+/ExonModel/tis_motif_radius	"0"-"3"
+/Constant/trans_init_window	"0"-"25"
+/Constant/init_coding_len	"0"-"18"
+/ExonModel/patpseudocount	"0.5"_"5"
+/ExonModel/etpseudocount	"0"-"10"
+/ExonModel/etorder		"0"-"3"
+/Constant/intterm_coding_len	"0"-"13"
+/ExonModel/slope_of_bandwidth	"0.05"_"0.6"
+/ExonModel/minwindowcount	"1"-"15"
+/IGenicModel/patpseudocount	"0.5"_"7"
+/IntronModel/patpseudocount	"0.5"_"7"
+/IntronModel/slope_of_bandwidth	"0.05"_"0.6"
+/IntronModel/minwindowcount	"1"-"8"
+/IntronModel/asspseudocount	"0.0005"_"0.03"
+/IntronModel/dsspseudocount	"0.0002"_"0.04"
+/IntronModel/dssneighborfactor  "0.0001"_"0.01"
+/ExonModel/minPatSum		"100"_"600"
+/Constant/probNinCoding         "0.15"_".25"
+/Constant/decomp_num_steps	"1"-"5"
+# comment out the parameters that you do not want to be subject to optimization
+#/IGenicModel/k                  "4" "3" "5"
+#/IntronModel/k                  "4" "3" "5"
+#/ExonModel/k                    "4" "3" "5"
diff --git a/config/species/sunflower/sunflower_metapars.cgp.cfg b/config/species/sunflower/sunflower_metapars.cgp.cfg
@@ -0,0 +1,22 @@
+# This file contains the list of meta parameters of comparative gene prediction which are subject to optimization. 
+# All other meta parameters are chosen as given in the species parameter file. The order 
+# of the parameters determines the order in the optimization process.
+# Basically, different values for these meta parameters are tried out and the ones
+# giving best performance on the evaluation set are chosen.
+# For each parameter the range of possible values is specified after the parameter
+# name and at least one white space.
+# 3 cases are possible for the range:
+# - an explicit list is given, e.g. protein	"on" "off"
+# - it is an integer range, e.g. window_size	"1"-"5"
+# - it is a range of floating point numbers, e.g. pseudocount	"0.3"_"1.8"
+#
+# 
+# Stefanie Koenig, 9.4.2015
+#
+/CompPred/ec_thold	"-1.5"_"1.5"
+/CompPred/ic_thold	"-1.5"_"1.5"
+/CompPred/exon_gain     "0.00001"_"0.1"   # rate of exon gain
+/CompPred/exon_loss     "0.00001"_"0.1"   # rate of exon loss
+/CompPred/phylo_factor  "1"-"20"
+# comment out the parameters that you do not want to be subject to optimization
+#/CompPred/dd_factor    "1"-"100"
diff --git a/config/species/sunflower/sunflower_metapars.utr.cfg b/config/species/sunflower/sunflower_metapars.utr.cfg
@@ -0,0 +1,30 @@
+# This file contains the list of meta parameters for the Untranslated Regions (UTRs), which are subject to optimization. 
+# All other parameters are chosen as given in the species parameter file. The order 
+# of the parameters determines the order in the optimization process.
+# Basically, different values for these meta parameters are tried out and the ones
+# giving best performance in a cross-validation on the training set are chosen.
+# For each parameter the range of possible values is specified after the parameter
+# name and at least one white space.
+# 3 cases are possible for the range:
+# - an explicit list is given, e.g. protein	"on" "off"
+# - it is an integer range, e.g. window_size	"1"-"5"
+# - it is a range of floating point numbers, e.g. pseudocount	"0.3"_"1.8"
+#
+# 
+# Mario Stanke, 9.5.2008
+#
+
+/UtrModel/prob_polya            "0.0"_"0.99"
+/UtrModel/d_polya_cleavage_min  "6"-"14"
+/UtrModel/d_polya_cleavage_max  "17"-"27"
+/UtrModel/tss_start             "0"-"12"
+/UtrModel/tss_end               "0"-"8"
+/UtrModel/tts_motif_memory      "0"-"2"
+/UtrModel/utr5patternweight     "0.1"_"1.0"
+/UtrModel/utr3patternweight     "0.1"_"1.0"
+/UtrModel/patpseudocount        "1"_"3"
+/UtrModel/tssup_k               "0"-"2"
+/UtrModel/slope_of_bandwidth    "0.2"_"0.4"
+/UtrModel/minwindowcount        "1"-"4"
+#/UtrModel/k                     "2"-"4"
+
diff --git a/config/species/sunflower/sunflower_parameters.cfg b/config/species/sunflower/sunflower_parameters.cfg
@@ -0,0 +1,145 @@
+#
+# sunflower parameters. 
+# 
+# date : 19.12.2006
+#
+
+#
+# Properties for augustus
+#------------------------------------
+/augustus/verbosity 3     # 0-3, 0: only print the necessary
+maxDNAPieceSize    500000 # maximum segment that is predicted in one piece
+stopCodonExcludedFromCDS false # make this 'true' if the CDS does not include the stop codon (training and prediction)
+
+# gff output options:
+protein             on    # output predicted protein sequence
+codingseq           off   # output the coding sequence
+cds                 on    # output 'cds' as feature for exons
+start               on    # output start codons (translation start)
+stop                on    # output stop codons  (translation stop)
+introns             on    # output introns
+tss                 on   # output transcription start site
+tts                 on   # output transcription termination site
+print_utr           off   # output 5'UTR and 3'UTR lines in addition to exon lines
+
+checkExAcc          off   # internal parameter for extrinsic accuracy
+
+# alternative transcripts and posterior probabilities
+sample                      100   # the number of sampling iterations
+alternatives-from-sampling  false # output alternative suboptimal transcripts 
+alternatives-from-evidence  false # output alternative transcripts based on explicit evidence from hints
+minexonintronprob           0.08  # minimal posterior probability of all (coding) exons
+minmeanexonintronprob       0.4   # minimal geometric mean of the posterior probs of introns and exons
+maxtracks                   -1    # maximum number of reported transcripts per gene (-1: no limit)
+keep_viterbi                true  # set to true if all Viterbi transcripts should be reported
+uniqueCDS                   true  # don't report transcripts that differ only in the UTR
+UTR                         off   # predict untranslated regions
+
+#
+# 
+# The rest of the file contains mainly meta parameters used for training.
+#
+
+# global constants
+# ----------------------------
+
+/Constant/trans_init_window           20
+/Constant/ass_upwindow_size           50
+/Constant/ass_start                   1
+/Constant/ass_end                     2
+/Constant/dss_start                   2
+/Constant/dss_end                     3
+/Constant/init_coding_len	      4
+/Constant/intterm_coding_len	      5
+/Constant/tss_upwindow_size           45
+/Constant/decomp_num_at               1
+/Constant/decomp_num_gc               1
+/Constant/gc_range_min		      0.32   # This range has an effect only when decomp_num_steps>1. 
+/Constant/gc_range_max                0.73   # States the minimal and maximal percentage of c or g
+/Constant/decomp_num_steps            1      # I recommend keeping this to 1 for most species.
+/Constant/min_coding_len              201    # no gene with a coding sequence shorter than this is predicted
+/Constant/probNinCoding               0.225   # divide this by .25 to get a malus for making one masked letter part of the coding sequence
+/Constant/amberprob                   0.33   # Prob(stop codon = tag), if 0 tag is assumed to code for amino acid
+/Constant/ochreprob                   0.33   # Prob(stop codon = taa), if 0 taa is assumed to code for amino acid
+/Constant/opalprob                    0.34   # Prob(stop codon = tga), if 0 tga is assumed to code for amino acid
+/Constant/subopt_transcript_threshold 0.7
+/Constant/almost_identical_maxdiff    10
+
+# type of weighing, one of  1 = equalWeights, 2 = gcContentClasses, 3 = multiNormalKernel
+/BaseCount/weighingType    3
+# file with the weight matrix (only for multiNormalKernel type weighing)
+/BaseCount/weightMatrixFile   sunflower_weightmatrix.txt # change this to your species if at all necessary
+
+# Properties for IGenicModel
+# ----------------------------
+/IGenicModel/verbosity      0
+/IGenicModel/infile         sunflower_igenic_probs.pbl   # change this and the other five filenames *_probs.pbl below to your species
+/IGenicModel/outfile        sunflower_igenic_probs.pbl
+/IGenicModel/patpseudocount 5.0
+/IGenicModel/k              4        # order of the Markov chain for content model, keep equal to /ExonModel/k
+
+# Properties for ExonModel
+# ----------------------------
+/ExonModel/verbosity          3
+/ExonModel/infile             sunflower_exon_probs.pbl
+/ExonModel/outfile            sunflower_exon_probs.pbl
+/ExonModel/patpseudocount     0.5
+/ExonModel/minPatSum          233.3
+/ExonModel/k                  4       # order of the Markov chain for content model
+/ExonModel/etorder	      2
+/ExonModel/etpseudocount      3
+/ExonModel/exonlengthD        2000    # beyond this the distribution is geometric
+/ExonModel/maxexonlength      15000
+/ExonModel/slope_of_bandwidth 0.325
+/ExonModel/minwindowcount     10
+/ExonModel/tis_motif_memory   3
+/ExonModel/tis_motif_radius   3
+
+# Properties for IntronModel
+# ----------------------------
+/IntronModel/verbosity          0
+/IntronModel/infile             sunflower_intron_probs.pbl
+/IntronModel/outfile            sunflower_intron_probs.pbl
+/IntronModel/patpseudocount     5.0
+/IntronModel/k                  4     # order of the Markov chain for content model, keep equal to /ExonModel/k
+/IntronModel/slope_of_bandwidth 0.53125
+/IntronModel/minwindowcount     6
+/IntronModel/asspseudocount     0.00266
+/IntronModel/dsspseudocount     0.0005
+/IntronModel/dssneighborfactor  0.01
+#/IntronModel/splicefile         sunflower_splicefile.txt # this optional file contains additional windows around splice sites for training, uncomment if you have one
+/IntronModel/sf_with_motif	false           # if true the splice file is also used to train the branch point region
+/IntronModel/d                  525  # constraint: this must be larger than 4 + /Constant/dss_end + /Constant/ass_upwindow_size + /Constant/ass_start
+/IntronModel/ass_motif_memory   1
+/IntronModel/ass_motif_radius   0
+
+# Properties for UtrModel
+# ----------------------------
+/UtrModel/verbosity             3
+/UtrModel/infile                sunflower_utr_probs.pbl
+/UtrModel/outfile               sunflower_utr_probs.pbl
+/UtrModel/k                     4
+/UtrModel/utr5patternweight     0.5
+/UtrModel/utr3patternweight     0.5
+/UtrModel/patpseudocount        1
+/UtrModel/tssup_k               0
+/UtrModel/tssup_patpseudocount  1
+/UtrModel/slope_of_bandwidth    0.2375
+/UtrModel/minwindowcount        3
+/UtrModel/exonlengthD           800
+/UtrModel/maxexonlength         1800
+/UtrModel/max3singlelength      2000
+/UtrModel/max3termlength        1500
+/UtrModel/tss_start             8
+/UtrModel/tss_end               5
+/UtrModel/tata_start            2
+/UtrModel/tata_end              10
+/UtrModel/tata_pseudocount      2
+/UtrModel/d_tss_tata_min        26      # minimal distance between start of tata box (if existent) and tss 
+/UtrModel/d_tss_tata_max        37      # maximal distance between start of tata box (if existent) and tss
+/UtrModel/polyasig_consensus    aataaa  # polyadenylation signal training not fully automated yet
+/UtrModel/d_polyasig_cleavage   14      # the transcription end is predicted this many bases after the polyadenylation signal
+/UtrModel/d_polya_cleavage_min  7
+/UtrModel/d_polya_cleavage_max  19
+/UtrModel/prob_polya            0.4
+/UtrModel/tts_motif_memory      1
diff --git a/config/species/sunflower/sunflower_weightmatrix.txt b/config/species/sunflower/sunflower_weightmatrix.txt
@@ -0,0 +1,23 @@
+# 
+# This file contains a matrix used for weighing the training sequences
+# when given an input sequence. Let z = (da, dc, dg, dt) be the vector
+# containing the differences in the relative nucleotide frequencies of
+# two sequences, the input sequence and a training sequence.
+# Then the training sequence has weight proportional to 
+#
+# exp ( - z M z^t)
+#
+# with M being the matrix specified below.
+# If M is nonsingular, then (apart from two normalizing factors) M
+# is the inverse of the covariance matrix of a multinormal
+# distribution - the kernel for the estimation.
+
+
+# this matrix is gc-content only, i.e. 
+# weight = 10 * exp (-200 * (dc^2 + dg^2))
+# in particular weight <= 10
+0      0      0             0
+0      200    0             0
+0      0      200           0
+0      0      0             0
+