Skip to content

Commit

Permalink
Write normalized spectracounts
Browse files (browse the repository at this point in the history)
instead of log2 transformed
Put the default maps of KEGGCharter back into default_config.json
  • Loading branch information
iquasere committed May 23, 2024
1 parent 837336b commit dadfa2d
Show file tree
Hide file tree
Showing 5 changed files with 262 additions and 9 deletions.
4 changes: 2 additions & 2 deletions cicd/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ VOLUME ["named_volume"]
RUN git clone https://github.com/iquasere/MOSCA.git -b development \
&& bash MOSCA/cicd/install.bash --mosca_env "$(conda info --base)" \
&& conda clean --all \
# all instructions ahead are only for MP spectra conversion with docker
# all commands ahead are only for MP spectra conversion with docker
&& apt-get update \
&& apt-get install -y curl \
&& curl -fsSL https://get.docker.com -o get-docker.sh \
&& sh get-docker.sh

CMD [ "python", "bin/mosca.py" ]
CMD [ "mosca" ]
250 changes: 249 additions & 1 deletion resources/default_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,255 @@
"keggcharter_taxa_level": "SPECIES",
"keggcharter_number_of_taxa": 10,
"keggcharter_maps": [
"00680"
"00010",
"00020",
"00030",
"00040",
"00051",
"00052",
"00053",
"00190",
"00195",
"00196",
"00500",
"00520",
"00562",
"00620",
"00630",
"00640",
"00650",
"00660",
"00680",
"00710",
"00720",
"00910",
"00920",
"00061",
"00062",
"00071",
"00073",
"00100",
"00120",
"00121",
"00140",
"00561",
"00564",
"00565",
"00600",
"00590",
"00591",
"00592",
"01040",
"00230",
"00240",
"00250",
"00260",
"00270",
"00280",
"00290",
"00310",
"00220",
"00330",
"00340",
"00350",
"00360",
"00380",
"00400",
"00410",
"00430",
"00440",
"00450",
"00460",
"00470",
"00480",
"00510",
"00513",
"00512",
"00515",
"00514",
"00532",
"00534",
"00533",
"00531",
"00563",
"00601",
"00603",
"00604",
"00540",
"00550",
"00511",
"00571",
"00572",
"00730",
"00740",
"00750",
"00760",
"00770",
"00780",
"00785",
"00790",
"00670",
"00830",
"00860",
"00130",
"00900",
"00902",
"00909",
"00904",
"00906",
"00905",
"00981",
"00908",
"00903",
"00907",
"01052",
"00522",
"01051",
"01059",
"01056",
"01057",
"00253",
"00523",
"01054",
"01053",
"01055",
"00940",
"00945",
"00941",
"00944",
"00942",
"00943",
"00901",
"00403",
"00950",
"00960",
"00996",
"00232",
"00965",
"00966",
"00402",
"00311",
"00332",
"00261",
"00331",
"00521",
"00524",
"00525",
"00401",
"00404",
"00405",
"00333",
"00254",
"00998",
"00999",
"00362",
"00627",
"00364",
"00625",
"00361",
"00623",
"00622",
"00633",
"00642",
"00643",
"00791",
"00930",
"00363",
"00621",
"00626",
"00624",
"00365",
"00984",
"00980",
"03020",
"03022",
"03040",
"03010",
"00970",
"03013",
"03015",
"03008",
"03060",
"04141",
"04130",
"04120",
"04122",
"03050",
"03018",
"03030",
"03410",
"03420",
"03430",
"03440",
"03450",
"03460",
"02010",
"02060",
"03070",
"02020",
"04014",
"04015",
"04010",
"04011",
"04012",
"04310",
"04330",
"04340",
"04350",
"04390",
"04392",
"04370",
"04371",
"04630",
"04064",
"04668",
"04066",
"04068",
"04020",
"04070",
"04072",
"04071",
"04024",
"04022",
"04151",
"04152",
"04150",
"04075",
"04080",
"04060",
"04061",
"04512",
"04514",
"04144",
"04145",
"04142",
"04146",
"04138",
"04136",
"04139",
"04110",
"04111",
"04112",
"04113",
"04114",
"04210",
"04215",
"04216",
"04217",
"04115",
"04218",
"04510",
"04520",
"04530",
"04540",
"04550",
"02024",
"05111",
"02025",
"02026",
"02030",
"02040",
"04810"
],
"inside_container": true
}
9 changes: 6 additions & 3 deletions workflow/mosca.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pandas as pd
import re

__version__ = '2.3.0'
__version__ = '2.3.1'

parser = argparse.ArgumentParser(description="MOSCA's main script")
parser.add_argument("-s", "--snakefile", default=f'{sys.path[0]}/Snakefile', help="Path to Snakefile")
Expand Down Expand Up @@ -53,14 +53,17 @@ def validate_exps(exps_data):
reserved_words = [
'if', 'else', 'repeat', 'while', 'function', 'for', 'in', 'next', 'break', 'TRUE', 'FALSE', 'NULL', 'Inf',
'NaN', 'NA', 'NA_integer_', 'NA_real_', 'NA_complex_', 'NA_character_']
good_pattern = re.compile(r'^(?!^\d)(?!^\.\d)([\w.]+)$') # don't start with a number, nor a decimal (dot followed by number)
good_pattern = re.compile(r'^(?!^\d)(?!^\.\d)([\w.]+)$') # don't start with a number, nor a decimal (dot followed by number), and have no special characters (-, +, *, /, etc.)
if exps['Name'].duplicated().any():
sys.exit(f'ERROR: Multiple rows with same "Name" value: {",".join(exps["Name"].duplicated().any())}.')
for name in exps['Name']:
if not name: # if name is None, or empty string, MOSCA should be able to build one that is fine
continue
if name in reserved_words:
sys.exit(f'INVALID "NAME" in "experiments": {name} is a reserved R word.')
if not bool(good_pattern.match(name)):
sys.exit(f'INVALID "NAME" in "experiments": {name} starts with a number or has a special character.')
sys.exit(f'INVALID "NAME" in "experiments": {name} starts with a number or has a special character.\n'
f'Please use only letters, numbers, dots (.) and underscores (_).')


def validate_config(config_data):
Expand Down
3 changes: 2 additions & 1 deletion workflow/scripts/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import pandas as pd
import shutil
import pathlib
import snakemake


class Binner:
Expand Down Expand Up @@ -74,7 +75,7 @@ def better_bin(self, table1, table2):
return True
if lq_bins1 < lq_bins2:
return False
return True
return False

def iterative_binning(self, contigs, output, threads=8, reads=None, reads2=None, markerset='40'):
best_bin = 10
Expand Down
5 changes: 3 additions & 2 deletions workflow/scripts/normalization.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ normalize_gene_expression <- function(df, output_file, norm_method, imput_method
imputed <- llsImpute(t(norm), correlation = "pearson", allVariables = allVariables)
df <- t(completeObs(imputed))
} else if (imput_method == "MIN") {
print("Imputing missing values with the minimum value.")
df[is.na(df)] <- min(norm, na.rm=TRUE)
}
} else {
stop("Error: normalization method must be either TMM, RLE, or VSN")
}
print('Writing normalized results.')
write.table(df, file = output_file, sep = "\t", row.names = TRUE, col.names = TRUE)
print('Writing normalized results.') # but first convert back to counts (normalized)
write.table(2^df, file = output_file, sep = "\t", row.names = TRUE, col.names = TRUE, quote=FALSE)
}

print("Reading data to normalize.")
Expand Down

0 comments on commit dadfa2d

Please sign in to comment.