Write normalized spectracounts

instead of log2-transformed values. Put the default maps of KEGGCharter back into default_config.json.
iquasere · May 23, 2024 · dadfa2d · dadfa2d
1 parent 837336b
commit dadfa2d
Show file tree

Hide file tree

Showing 5 changed files with 262 additions and 9 deletions.
diff --git a/cicd/Dockerfile b/cicd/Dockerfile
@@ -6,10 +6,10 @@ VOLUME ["named_volume"]
 RUN git clone https://github.com/iquasere/MOSCA.git -b development \
 && bash MOSCA/cicd/install.bash --mosca_env "$(conda info --base)" \
 && conda clean --all \
-# all instructions ahead are only for MP spectra conversion with docker
+# all commands ahead are only for MP spectra conversion with docker
 && apt-get update \
 && apt-get install -y curl \
 && curl -fsSL https://get.docker.com -o get-docker.sh \
 && sh get-docker.sh
 
-CMD [ "python", "bin/mosca.py" ]
+CMD [ "mosca" ]
diff --git a/resources/default_config.json b/resources/default_config.json
@@ -76,7 +76,255 @@
   "keggcharter_taxa_level": "SPECIES",
   "keggcharter_number_of_taxa": 10,
   "keggcharter_maps": [
-    "00680"
+    "00010",
+    "00020",
+    "00030",
+    "00040",
+    "00051",
+    "00052",
+    "00053",
+    "00190",
+    "00195",
+    "00196",
+    "00500",
+    "00520",
+    "00562",
+    "00620",
+    "00630",
+    "00640",
+    "00650",
+    "00660",
+    "00680",
+    "00710",
+    "00720",
+    "00910",
+    "00920",
+    "00061",
+    "00062",
+    "00071",
+    "00073",
+    "00100",
+    "00120",
+    "00121",
+    "00140",
+    "00561",
+    "00564",
+    "00565",
+    "00600",
+    "00590",
+    "00591",
+    "00592",
+    "01040",
+    "00230",
+    "00240",
+    "00250",
+    "00260",
+    "00270",
+    "00280",
+    "00290",
+    "00310",
+    "00220",
+    "00330",
+    "00340",
+    "00350",
+    "00360",
+    "00380",
+    "00400",
+    "00410",
+    "00430",
+    "00440",
+    "00450",
+    "00460",
+    "00470",
+    "00480",
+    "00510",
+    "00513",
+    "00512",
+    "00515",
+    "00514",
+    "00532",
+    "00534",
+    "00533",
+    "00531",
+    "00563",
+    "00601",
+    "00603",
+    "00604",
+    "00540",
+    "00550",
+    "00511",
+    "00571",
+    "00572",
+    "00730",
+    "00740",
+    "00750",
+    "00760",
+    "00770",
+    "00780",
+    "00785",
+    "00790",
+    "00670",
+    "00830",
+    "00860",
+    "00130",
+    "00900",
+    "00902",
+    "00909",
+    "00904",
+    "00906",
+    "00905",
+    "00981",
+    "00908",
+    "00903",
+    "00907",
+    "01052",
+    "00522",
+    "01051",
+    "01059",
+    "01056",
+    "01057",
+    "00253",
+    "00523",
+    "01054",
+    "01053",
+    "01055",
+    "00940",
+    "00945",
+    "00941",
+    "00944",
+    "00942",
+    "00943",
+    "00901",
+    "00403",
+    "00950",
+    "00960",
+    "00996",
+    "00232",
+    "00965",
+    "00966",
+    "00402",
+    "00311",
+    "00332",
+    "00261",
+    "00331",
+    "00521",
+    "00524",
+    "00525",
+    "00401",
+    "00404",
+    "00405",
+    "00333",
+    "00254",
+    "00998",
+    "00999",
+    "00362",
+    "00627",
+    "00364",
+    "00625",
+    "00361",
+    "00623",
+    "00622",
+    "00633",
+    "00642",
+    "00643",
+    "00791",
+    "00930",
+    "00363",
+    "00621",
+    "00626",
+    "00624",
+    "00365",
+    "00984",
+    "00980",
+    "03020",
+    "03022",
+    "03040",
+    "03010",
+    "00970",
+    "03013",
+    "03015",
+    "03008",
+    "03060",
+    "04141",
+    "04130",
+    "04120",
+    "04122",
+    "03050",
+    "03018",
+    "03030",
+    "03410",
+    "03420",
+    "03430",
+    "03440",
+    "03450",
+    "03460",
+    "02010",
+    "02060",
+    "03070",
+    "02020",
+    "04014",
+    "04015",
+    "04010",
+    "04011",
+    "04012",
+    "04310",
+    "04330",
+    "04340",
+    "04350",
+    "04390",
+    "04392",
+    "04370",
+    "04371",
+    "04630",
+    "04064",
+    "04668",
+    "04066",
+    "04068",
+    "04020",
+    "04070",
+    "04072",
+    "04071",
+    "04024",
+    "04022",
+    "04151",
+    "04152",
+    "04150",
+    "04075",
+    "04080",
+    "04060",
+    "04061",
+    "04512",
+    "04514",
+    "04144",
+    "04145",
+    "04142",
+    "04146",
+    "04138",
+    "04136",
+    "04139",
+    "04110",
+    "04111",
+    "04112",
+    "04113",
+    "04114",
+    "04210",
+    "04215",
+    "04216",
+    "04217",
+    "04115",
+    "04218",
+    "04510",
+    "04520",
+    "04530",
+    "04540",
+    "04550",
+    "02024",
+    "05111",
+    "02025",
+    "02026",
+    "02030",
+    "02040",
+    "04810"
   ],
  "inside_container": true
 }
diff --git a/workflow/mosca.py b/workflow/mosca.py
@@ -9,7 +9,7 @@
 import pandas as pd
 import re
 
-__version__ = '2.3.0'
+__version__ = '2.3.1'
 
 parser = argparse.ArgumentParser(description="MOSCA's main script")
 parser.add_argument("-s", "--snakefile", default=f'{sys.path[0]}/Snakefile', help="Path to Snakefile")
@@ -53,14 +53,17 @@ def validate_exps(exps_data):
     reserved_words = [
         'if', 'else', 'repeat', 'while', 'function', 'for', 'in', 'next', 'break', 'TRUE', 'FALSE', 'NULL', 'Inf',
         'NaN', 'NA', 'NA_integer_', 'NA_real_', 'NA_complex_', 'NA_character_']
-    good_pattern = re.compile(r'^(?!^\d)(?!^\.\d)([\w.]+)$')    # don't start with a number, nor a decimal (dot followed by number)
+    good_pattern = re.compile(r'^(?!^\d)(?!^\.\d)([\w.]+)$')    # don't start with a number, nor a decimal (dot followed by number), and have no special characters (-, +, *, /, etc.)
+    if exps['Name'].duplicated().any():
+        sys.exit(f'ERROR: Multiple rows with same "Name" value: {",".join(exps["Name"].duplicated().any())}.')
     for name in exps['Name']:
         if not name:        # if name is None, or empty string, MOSCA should be able to build one that is fine
             continue
         if name in reserved_words:
             sys.exit(f'INVALID "NAME" in "experiments": {name} is a reserved R word.')
         if not bool(good_pattern.match(name)):
-            sys.exit(f'INVALID "NAME" in "experiments": {name} starts with a number or has a special character.')
+            sys.exit(f'INVALID "NAME" in "experiments": {name} starts with a number or has a special character.\n'
+                     f'Please use only letters, numbers, dots (.) and underscores (_).')
 
 
 def validate_config(config_data):

diff --git a/workflow/scripts/binning.py b/workflow/scripts/binning.py
@@ -12,6 +12,7 @@
 import pandas as pd
 import shutil
 import pathlib
+import snakemake
 
 
 class Binner:
@@ -74,7 +75,7 @@ def better_bin(self, table1, table2):
             return True
         if lq_bins1 < lq_bins2:
             return False
-        return True
+        return False
 
     def iterative_binning(self, contigs, output, threads=8, reads=None, reads2=None, markerset='40'):
         best_bin = 10

diff --git a/workflow/scripts/normalization.R b/workflow/scripts/normalization.R
@@ -28,13 +28,14 @@ normalize_gene_expression <- function(df, output_file, norm_method, imput_method
       imputed <- llsImpute(t(norm), correlation = "pearson", allVariables = allVariables)
       df <- t(completeObs(imputed))
     } else if (imput_method == "MIN") {
+      print("Imputing missing values with the minimum value.")
       df[is.na(df)] <- min(norm, na.rm=TRUE)
     }
   } else {
     stop("Error: normalization method must be either TMM, RLE, or VSN")
   }
-  print('Writing normalized results.')
-  write.table(df, file = output_file, sep = "\t", row.names = TRUE, col.names = TRUE)
+  print('Writing normalized results.')    # but first convert back to counts (normalized)
+  write.table(2^df, file = output_file, sep = "\t", row.names = TRUE, col.names = TRUE, quote=FALSE)
 }
 
 print("Reading data to normalize.")