Limma-Voom differential expression can now fit mixed linear models co…

…ntaining a random effect (e.g. nested design).
GuyTeichman · Nov 13, 2023 · 3adf0a7 · 3adf0a7
1 parent 6acb871
commit 3adf0a7
Show file tree

Hide file tree

Showing 10 changed files with 89 additions and 24 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -7,9 +7,8 @@ History
 
 Added
 *******
+* Limma-Voom differential expression can now fit mixed linear models containing a random effect (e.g. nested design).
 
-Changed
-********
 
 Fixed
 *******

diff --git a/rnalysis/data_files/r_templates/limma_install.R b/rnalysis/data_files/r_templates/limma_install.R
@@ -1,6 +1,20 @@
+pgks <- list("statmod")
+for (pkg in pgks) {
+    tryCatch(
+        tryCatch(
+        {install.packages(pkg)
+        require(pkg)},
+        warning = function(e) {
+        install.packages(pkg, type = "binary")},
+        error = function(e) {
+        install.packages(pkg, type = "binary")}),
+    error = function(e) {}
+    )
+}
 if (("limma" %in% rownames(installed.packages()) == FALSE) || (!require("limma", quietly = TRUE))) {
     if (!require("BiocManager", quietly = TRUE)) {
         install.packages("BiocManager")
         }
     BiocManager::install("limma",update=TRUE, ask=FALSE, force=TRUE)
 }
+
diff --git a/rnalysis/data_files/r_templates/limma_run_parametric.R b/rnalysis/data_files/r_templates/limma_run_parametric.R
@@ -7,5 +7,5 @@ colnames(design)[1] <- "Intercept"
 
 count_data <- read.table("$COUNT_MATRIX", header=TRUE, sep= ",", row.names = 1)
 voom_object <- voom(count_data, design, plot=FALSE, save.plot=TRUE)
-fit <- lmFit(voom_object, design)
 
+$RANDOM_EFFECT_FIT
diff --git a/rnalysis/filtering.py b/rnalysis/filtering.py
@@ -3069,15 +3069,16 @@ def _diff_exp_assertions(self, design_mat_df: pd.DataFrame):
                                                                     f"{sorted(self.columns)}"
 
         for factor in design_mat_df.columns:
-            assert parsing.slugify(factor) == factor, f"Invalid factor name '{factor}': contains invalid characters." \
-                                                      f" \nSuggested alternative name: '{parsing.slugify(factor)}'. "
+            assert generic.sanitize_variable_name(
+                factor) == factor, f"Invalid factor name '{factor}': contains invalid characters." \
+                                   f" \nSuggested alternative name: '{generic.sanitize_variable_name(factor)}'. "
 
     @readable_name('Run Limma-Voom differential expression')
     def differential_expression_limma_voom(self, design_matrix: Union[str, Path],
                                            comparisons: Iterable[Tuple[str, str, str]],
                                            r_installation_folder: Union[str, Path, Literal['auto']] = 'auto',
-                                           output_folder: Union[str, Path, None] = None
-                                           ) -> Tuple['DESeqFilter', ...]:
+                                           output_folder: Union[str, Path, None] = None,
+                                           random_effect: Union[str, None] = None) -> Tuple['DESeqFilter', ...]:
         """
         Run differential expression analysis on the count matrix using the \
         `Limma-Voom <https://doi.org/10.1186/gb-2014-15-2-r29>`_ pipeline. \
@@ -3107,6 +3108,9 @@ def differential_expression_limma_voom(self, design_matrix: Union[str, Path],
         as well as the log files and R script used to generate them, will be saved. \
         if output_folder is None, the results will not be saved to a specified directory.
         :type output_folder: str, Path, or None
+        :param random_effect: optionally, specify a single factor to model as a random effect. \
+        This is useful when your experimental design is nested. limma-voom can only treat one factor as a random effect.
+        :type random_effect: str or None
         :return: a tuple of DESeqFilter objects, one for each comparison
         """
         if output_folder is not None:
@@ -3128,7 +3132,7 @@ def differential_expression_limma_voom(self, design_matrix: Union[str, Path],
         io.save_table(design_mat_df, design_mat_path)
 
         r_output_dir = differential_expression.run_limma_analysis(data_path, design_mat_path, comparisons,
-                                                                  r_installation_folder)
+                                                                  r_installation_folder, random_effect)
         outputs = []
         for item in r_output_dir.iterdir():
             if not item.is_file():

diff --git a/rnalysis/utils/differential_expression.py b/rnalysis/utils/differential_expression.py
@@ -17,7 +17,7 @@ def install_limma(r_installation_folder: Union[str, Path, Literal['auto']] = 'au
 
 
 def create_limma_script(data_path: Union[str, Path], design_mat_path: Union[str, Path],
-                        comparisons: Iterable[Tuple[str, str, str]]):
+                        comparisons: Iterable[Tuple[str, str, str]], random_effect: Union[str, None]):
     cache_dir = io.get_todays_cache_dir().joinpath(hashlib.sha1(str(time.time_ns()).encode('utf-8')).hexdigest())
     if not cache_dir.exists():
         cache_dir.mkdir(parents=True)
@@ -36,7 +36,11 @@ def create_limma_script(data_path: Union[str, Path], design_mat_path: Union[str,
         factors_str = ''
         for factor in design_mat_df.columns:
             factor_name = generic.sanitize_variable_name(factor)
-            factor_names[factor] = factor_name
+
+            if factor == random_effect or factor_name == random_effect:
+                random_effect = factor_name
+            else:
+                factor_names[factor] = factor_name
 
             values = sorted(design_mat_df[factor].unique())
             values_str = ', '.join([f'"{val}"' for val in values])
@@ -45,10 +49,24 @@ def create_limma_script(data_path: Union[str, Path], design_mat_path: Union[str,
 
         formula = "~ " + " + ".join(factor_names.values())
 
+        if random_effect is None:
+            random_effect_fit_code = "fit <- lmFit(voom_object, design)"
+        else:
+            random_effect_fit_code = (f"cor <- duplicateCorrelation(voom_object, design, block={random_effect})\n"
+                                      "if (cor$consensus.correlation > 0) { "
+                                      "#only include random effect if the correlation is positive\n"
+                                      f"  fit <- lmFit(voom_object, design, block={random_effect}, "
+                                      f"correlation=cor$consensus.correlation)\n"
+                                      "}"
+                                      "else {"
+                                      "   fit <- lmFit(voom_object, design)"
+                                      "}")
+
         run_template = run_template.replace("$COUNT_MATRIX", Path(data_path).as_posix())
         run_template = run_template.replace("$DESIGN_MATRIX", (Path(design_mat_path).as_posix()))
         run_template = run_template.replace("$DEFINE_FACTORS", factors_str)
         run_template = run_template.replace("$FORMULA", formula)
+        run_template = run_template.replace("$RANDOM_EFFECT_FIT", random_effect_fit_code)
 
         outfile.write(run_template)
 
@@ -68,9 +86,10 @@ def create_limma_script(data_path: Union[str, Path], design_mat_path: Union[str,
 
 def run_limma_analysis(data_path: Union[str, Path], design_mat_path: Union[str, Path],
                        comparisons: Iterable[Tuple[str, str, str]],
-                       r_installation_folder: Union[str, Path, Literal['auto']] = 'auto'):
+                       r_installation_folder: Union[str, Path, Literal['auto']] = 'auto',
+                       random_effect: Union[str, None] = None):
     install_limma(r_installation_folder)
-    script_path = create_limma_script(data_path, design_mat_path, comparisons)
+    script_path = create_limma_script(data_path, design_mat_path, comparisons, random_effect)
     io.run_r_script(script_path, r_installation_folder)
     return script_path.parent
 

diff --git a/tests/test_differential_expression.py b/tests/test_differential_expression.py
@@ -1,4 +1,3 @@
-import platform
 import sys
 
 import numpy as np
@@ -11,18 +10,21 @@ def test_install_limma():
     install_limma()
 
 
-@pytest.mark.parametrize("data,design_matrix,comparisons,expected_path", [
+@pytest.mark.parametrize("data,design_matrix,comparisons,random_effect,expected_path", [
     ('tests/test_files/big_counted.csv', 'tests/test_files/test_design_matrix.csv', [('condition', 'cond2', 'cond1')],
-     'tests/test_files/limma_tests/case1/expected_limma_script_1.R'),
+     None, 'tests/test_files/limma_tests/case1/expected_limma_script_1.R'),
     ('counted.csv', 'tests/test_files/test_design_matrix.csv',
      [('condition', 'cond3', 'cond2'), ('replicate', 'rep2', 'rep1'), ('condition', 'cond1', 'cond2')],
-     'tests/test_files/limma_tests/case2/expected_limma_script_2.R')
+     None, 'tests/test_files/limma_tests/case2/expected_limma_script_2.R'),
+    ('tests/test_files/big_counted.csv', 'tests/test_files/test_design_matrix.csv', [('condition', 'cond2', 'cond1')],
+     'replicate', 'tests/test_files/limma_tests/case3/expected_limma_script_3.R'),
+
 ])
-def test_create_limma_script(data, design_matrix, comparisons, expected_path):
+def test_create_limma_script(data, design_matrix, comparisons, random_effect, expected_path):
     with open(expected_path) as f:
         expected = f.read()
 
-    out_path = create_limma_script(data, design_matrix, comparisons)
+    out_path = create_limma_script(data, design_matrix, comparisons, random_effect)
     assert Path(out_path).exists()
     with open(out_path) as f:
         out = f.read()
@@ -33,7 +35,8 @@ def test_create_limma_script(data, design_matrix, comparisons, expected_path):
 
 @pytest.mark.parametrize('comparisons,expected_paths', [
     (
-    [('replicate', 'rep2', 'rep3')], ['tests/test_files/limma_tests/case1/LimmaVoom_replicate_rep2_vs_rep3_truth.csv']),
+        [('replicate', 'rep2', 'rep3')],
+        ['tests/test_files/limma_tests/case1/LimmaVoom_replicate_rep2_vs_rep3_truth.csv']),
     ([('condition', 'cond2', 'cond1'), ('condition', 'cond3', 'cond2')],
      ['tests/test_files/limma_tests/case2/LimmaVoom_condition_cond2_vs_cond1_truth.csv',
       'tests/test_files/limma_tests/case2/LimmaVoom_condition_cond3_vs_cond2_truth.csv'])

diff --git a/tests/test_files/limma_tests/case1/expected_limma_script_1.R b/tests/test_files/limma_tests/case1/expected_limma_script_1.R
@@ -9,8 +9,8 @@ colnames(design)[1] <- "Intercept"
 
 count_data <- read.table("tests/test_files/big_counted.csv", header=TRUE, sep= ",", row.names = 1)
 voom_object <- voom(count_data, design, plot=FALSE, save.plot=TRUE)
-fit <- lmFit(voom_object, design)
 
+fit <- lmFit(voom_object, design)
 
 contrast <- makeContrasts("conditioncond2", levels = design)
 contrast_fit <- contrasts.fit(fit, contrast)

diff --git a/tests/test_files/limma_tests/case2/expected_limma_script_2.R b/tests/test_files/limma_tests/case2/expected_limma_script_2.R
@@ -9,8 +9,8 @@ colnames(design)[1] <- "Intercept"
 
 count_data <- read.table("counted.csv", header=TRUE, sep= ",", row.names = 1)
 voom_object <- voom(count_data, design, plot=FALSE, save.plot=TRUE)
-fit <- lmFit(voom_object, design)
 
+fit <- lmFit(voom_object, design)
 
 contrast <- makeContrasts("conditioncond3 - conditioncond2", levels = design)
 contrast_fit <- contrasts.fit(fit, contrast)

diff --git a/tests/test_files/limma_tests/case3/expected_limma_script_3.R b/tests/test_files/limma_tests/case3/expected_limma_script_3.R
@@ -0,0 +1,23 @@
+require("limma")
+
+design_matrix <- read.table("tests/test_files/test_design_matrix.csv", header=TRUE, sep= ",")
+condition <- factor(design_matrix$condition, levels=c("cond1", "cond2", "cond3"))
+replicate <- factor(design_matrix$replicate, levels=c("rep1", "rep2", "rep3"))
+
+design <- model.matrix(~ condition)
+colnames(design)[1] <- "Intercept"
+
+count_data <- read.table("tests/test_files/big_counted.csv", header=TRUE, sep= ",", row.names = 1)
+voom_object <- voom(count_data, design, plot=FALSE, save.plot=TRUE)
+
+cor <- duplicateCorrelation(voom_object, design, block=replicate)
+if (cor$consensus.correlation > 0) { #only include random effect if the correlation is positive
+  fit <- lmFit(voom_object, design, block=replicate, correlation=cor$consensus.correlation)
+}else {   fit <- lmFit(voom_object, design)}
+
+contrast <- makeContrasts("conditioncond2", levels = design)
+contrast_fit <- contrasts.fit(fit, contrast)
+contrast_bayes <- eBayes(contrast_fit)
+res <- topTable(contrast_bayes, n=Inf)
+res_ordered <- res[order(res$adj.P.Val),]
+write.csv(as.data.frame(res_ordered),file="*PLACEHOLDERPATH*/LimmaVoom_condition_cond2_vs_cond1.csv")
diff --git a/tests/test_filtering.py b/tests/test_filtering.py
@@ -2066,6 +2066,7 @@ def mock_run_analysis(data_path, design_mat_path, comps, r_installation_folder):
         unlink_tree(outdir)
 
 
+@pytest.mark.parametrize('random_effect', [None, 'replicate'])
 @pytest.mark.parametrize('comparisons,expected_paths,script_path', [
     ([('replicate', 'rep2', 'rep3')], ['tests/test_files/limma_tests/case1/LimmaVoom_replicate_rep2_vs_rep3_truth.csv'],
      'tests/test_files/limma_tests/case1/expected_limma_script_1.R'),
@@ -2074,24 +2075,26 @@ def mock_run_analysis(data_path, design_mat_path, comps, r_installation_folder):
       'tests/test_files/limma_tests/case2/LimmaVoom_condition_cond3_vs_cond2_truth.csv'],
      'tests/test_files/limma_tests/case2/expected_limma_script_2.R')
 ])
-def test_differential_expression_limma(monkeypatch, comparisons, expected_paths, script_path):
+def test_differential_expression_limma(monkeypatch, comparisons, expected_paths, script_path, random_effect):
     outdir = 'tests/test_files/limma_tests/outdir'
     sample_table_path = 'tests/test_files/test_design_matrix.csv'
     truth = parsing.data_to_tuple(
         [DESeqFilter(file, log2fc_col='logFC', padj_col='adj.P.Val') for file in expected_paths])
     c = CountFilter('tests/test_files/big_counted.csv')
 
-    def mock_run_analysis(data_path, design_mat_path, comps, r_installation_folder):
+    def mock_run_analysis(data_path, design_mat_path, comps, r_installation_folder, rand_eff):
         assert r_installation_folder == 'auto'
         assert comps == comparisons
+        assert rand_eff == random_effect
         assert CountFilter(data_path) == c
         assert io.load_table(design_mat_path, 0).equals(io.load_table(sample_table_path, 0))
 
         return Path(script_path).parent
 
     monkeypatch.setattr(differential_expression, 'run_limma_analysis', mock_run_analysis)
     try:
-        res = c.differential_expression_limma_voom(sample_table_path, comparisons, output_folder=outdir)
+        res = c.differential_expression_limma_voom(sample_table_path, comparisons, output_folder=outdir,
+                                                   random_effect=random_effect)
         assert sorted(res, key=lambda filter_obj: filter_obj.fname.name) == sorted(truth, key=lambda
             filter_obj: filter_obj.fname.name)
         with open(Path(outdir).joinpath(Path(script_path).name)) as outfile, open(script_path) as truthfile: