Merge branch 'rel-0.3.4'

DessimozLab · Aug 19, 2024 · 790a086 · 790a086
2 parents 62ae7cb + 95b531a
commit 790a086
Show file tree

Hide file tree

Showing 5 changed files with 36 additions and 24 deletions.
diff --git a/FastOMA/__init__.py b/FastOMA/__init__.py
@@ -1,3 +1,3 @@
 
 __packagename__ = "FastOMA"
-__version__ = "0.3.3"
+__version__ = "0.3.4"
diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py
@@ -254,7 +254,7 @@ def write_roothogs(orthoxml: Path, roothog_folder: Path, output_file_roothog_tsv
             for gene in group_members:
                 tsv.write(f"{group_name}\t{gene}\t{omamer_roothog}\n")
 
-            _write_group_fasta(fasta_format, group_members, group_name, id_transformer, meta, output_fasta_groups,
+            _write_group_fasta(fasta_format, group_members, group_name.replace(":", ""), id_transformer, meta, output_fasta_groups,
                                roothog_folder)
 
     logger.info("writing of %s done. created %d groups containing %d proteins in total",

diff --git a/FastOMA/zoo/hog/extract_groups.py b/FastOMA/zoo/hog/extract_groups.py
@@ -1,3 +1,5 @@
+import itertools
+
 from ..utils import auto_open
 import collections
 from time import time
@@ -72,8 +74,20 @@ def add_genome_genes(self, genome_node):
         self.genes.update(generef_2_xref)
         return True
 
+    def _count_genes(self, node):
+        count = 0
+        for gene in node.iter('{http://orthoXML.org/2011/}geneRef'):
+            count += 1
+        for og in node.iter('{http://orthoXML.org/2011/}orthologGroup'):
+            for n in og.text:
+                if isinstance(n, Gene):
+                    count += 1
+        return count
+
     def _collect_genes(self, node):
-        genes = set([]); to_rem = []
+        genes = set([])
+        if node.tag != "{http://orthoXML.org/2011/}orthologGroup":
+            raise RuntimeError("_collect_genes() only works for ortholog groups")
         for child in node.iter():
             if child == node:
                 continue
@@ -83,22 +97,15 @@ def _collect_genes(self, node):
                 except KeyError:
                     logger.info(f"ignoring gene(id={child.get('id')}), probably in skip set.")
                     pass
-                to_rem.append(child)
             elif child.tag == "{http://orthoXML.org/2011/}orthologGroup":
                 genes.update((n for n in child.text if isinstance(n, Gene)))
-                to_rem.append(child)
-        for c in to_rem:
-            try:
-                node.remove(c)
-            except ValueError as e:
-                # this is not a direct child of node. we ignore this potential
-                # memory-leak as the entire group will be deleted at latest once
-                # we reach the root orthologGroup node.
-                pass
         return genes
 
     def merge_children(self, node):
         genes = self._collect_genes(node)
+        for child in reversed(node):
+            if child.tag in ("{http://orthoXML.org/2011/}orthologGroup", "{http://orthoXML.org/2011/}geneRef", "{http://orthoXML.org/2011/}paralogGroup"):
+                node.remove(child)
         node.text = genes
 
     def get_group(self, node):
@@ -134,11 +141,11 @@ def handle_duplication_node(self, elem):
 
 class MarkerGroupExtractor(GroupExtractor):
     def handle_duplication_node(self, elem):
-        nr_children = [len(self._collect_genes(child)) for child in elem]
+        nr_children = [self._count_genes(child) for child in elem]
         max_pos = nr_children.index(max(nr_children))
-        for i, child in enumerate(elem):
-            if i != max_pos:
-                elem.remove(child)
+        to_rem = [c for i, c in enumerate(elem) if i != max_pos]
+        for child in to_rem:
+            elem.remove(child)
 
 
 def parse_orthoxml(fh, processor:GroupExtractor):
@@ -174,7 +181,10 @@ def fixtag(tag, ns=""):
                     processor.merge_children(elem)
                     if og_level == extract_at_depth:
                         logger.debug("dumping annotated group with {} genes".format(len(elem.text)))
-                        yield processor.get_group(elem)
+                        if len(elem.text) > 1:
+                            yield processor.get_group(elem)
+                        else:
+                            logger.debug("won't return group of less than two proteins")
                         elem.clear()
                         extract_at_depth = -1 if processor.target_clade is not None else 0
                 if og_level == 0:

diff --git a/README.md b/README.md
@@ -55,7 +55,7 @@ installed.
 ```bash
 nextflow run dessimozlab/FastOMA -profile docker  --input_folder /path/to/in_folder --output_folder /path/to/out_folder 
 ```
-You could also add specific version to be used by adding `-r v0.3.1` to the command line. 
+You can also select a specific version by adding `-r v0.3.4` to the command line. 
 Nextflow will automatically fetch the [dessimozlab/FastOMA](https://github.com/dessimozlab/FastOMA) repository and start 
 the `FastOMA.nf` workflow. The `-profile` argument must be used to specify the profile to use. We support `docker`, 
 `singularity` and `conda` which then automatically set up the necessary tools by downloading the required containers or creating 
@@ -88,10 +88,10 @@ There are four ways to run/install FastOMA detailed below:
 The FastOMA workflow can be run directly without any installation using nextflow's ability to fetch a workflow from github. A specific version can be selected by specifying the `-r` option to nextflow to select a specific version of FastOMA:
 
 ```bash
-nextflow run desimozlab/FastOMA -r v0.3.1 -profile conda 
+nextflow run dessimozlab/FastOMA -r v0.3.4 -profile conda 
 ```
 
-This will fetch version v0.3.1 from github and run the FastOMA workflow using the conda profile. See section [How to run fastOMA](#how-to-run-fastoma). 
+This will fetch version v0.3.4 from github and run the FastOMA workflow using the conda profile. See section [How to run fastOMA](#how-to-run-fastoma). 
 
 ### 2. Cloning the FastOMA repo and running from there
 
@@ -184,7 +184,7 @@ nextflow run FastOMA.nf -profile docker \
     --output_folder myresult/
 ```
 This will use the container that is tagged with the current commit id. Similarly, one could also use 
-`--container_version "0.3.1"` to use the container with version `dessimozlab/fastoma:0.3.1` from dockerhub. Check the latest version on the [DockerHub](https://hub.docker.com/r/dessimozlab/fastoma/tags).
+`--container_version "0.3.4"` to use the container with version `dessimozlab/fastoma:0.3.4` from dockerhub. Check the latest version on the [DockerHub](https://hub.docker.com/r/dessimozlab/fastoma/tags).
 
 ### Singularity
 Since Docker needs administrator privileges (root access), [Singularity](https://apptainer.org/index.html) (a.k.a. Apptainer) is a good alternative. It can be installed using [Conda](https://anaconda.org/conda-forge/singularity) with `conda install conda-forge::singularity`. However, on most academic HPC clusters, Singularity is already installed and can be loaded with `module load`.
@@ -449,6 +449,8 @@ Majidian, Sina, Yannis Nevers, Ali Yazdizadeh Kharrazi, Alex Warwick Vesztrocy,
 
 
 ## Change log
+- Update  v0.3.4: Fixing a bug in marker gene group extraction. Previously, more than one gene per species was possible
+- Update  v0.3.3: improvements for nextflow (selection of alternative versions) and updates on readme
 - Update  v0.3.1: splitting HOG and sampling
 - Update  v0.1.6: adding dynamic resources, additional and improved output
 - Update  v0.1.5: docker, add help, clean nextflow 

diff --git a/nextflow.config b/nextflow.config
@@ -8,12 +8,12 @@ manifest {
   nextflowVersion = ">=22.10.4"
   defaultBranch = "main"
   doi = "10.1101/2024.01.29.577392"
-  version = "0.3.3"
+  version = "0.3.4"
 }
 
 params {
   container_name = "dessimozlab/fastoma"
-  container_version = "0.3.3"
+  container_version = "0.3.4"
   omamer_db = "https://omabrowser.org/All/LUCA.h5"
   debug_enabled = false
   help = false