From f601900dd2bb270c57b1b0e3050b3775e57c4708 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Thu, 15 Aug 2024 08:43:45 +0200 Subject: [PATCH 1/5] [FIX] extract_marker does now properly work --- FastOMA/zoo/hog/extract_groups.py | 37 ++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/FastOMA/zoo/hog/extract_groups.py b/FastOMA/zoo/hog/extract_groups.py index 9a44a81..19db3e5 100644 --- a/FastOMA/zoo/hog/extract_groups.py +++ b/FastOMA/zoo/hog/extract_groups.py @@ -1,3 +1,5 @@ +import itertools + from ..utils import auto_open import collections from time import time @@ -72,8 +74,20 @@ def add_genome_genes(self, genome_node): self.genes.update(generef_2_xref) return True + def _count_genes(self, node): + count = 0 + for gene in node.iter('{http://orthoXML.org/2011/}geneRef'): + count += 1 + for og in node.iter('{http://orthoXML.org/2011/}orthologGroup'): + for n in og.text: + if isinstance(n, Gene): + count += 1 + return count + def _collect_genes(self, node): - genes = set([]); to_rem = [] + genes = set([]) + if node.tag != "{http://orthoXML.org/2011/}orthologGroup": + raise RuntimeError("_collect_genes() only works for ortholog groups") for child in node.iter(): if child == node: continue @@ -83,22 +97,15 @@ def _collect_genes(self, node): except KeyError: logger.info(f"ignoring gene(id={child.get('id')}), probably in skip set.") pass - to_rem.append(child) elif child.tag == "{http://orthoXML.org/2011/}orthologGroup": genes.update((n for n in child.text if isinstance(n, Gene))) - to_rem.append(child) - for c in to_rem: - try: - node.remove(c) - except ValueError as e: - # this is not a direct child of node. we ignore this potential - # memory-leak as the entire group will be deleted at latest once - # we reach the root orthologGroup node. 
- pass return genes def merge_children(self, node): genes = self._collect_genes(node) + for child in reversed(node): + if child.tag in ("{http://orthoXML.org/2011/}orthologGroup", "{http://orthoXML.org/2011/}geneRef", "{http://orthoXML.org/2011/}paralogGroup"): + node.remove(child) node.text = genes def get_group(self, node): @@ -134,11 +141,11 @@ def handle_duplication_node(self, elem): class MarkerGroupExtractor(GroupExtractor): def handle_duplication_node(self, elem): - nr_children = [len(self._collect_genes(child)) for child in elem] + nr_children = [self._count_genes(child) for child in elem] max_pos = nr_children.index(max(nr_children)) - for i, child in enumerate(elem): - if i != max_pos: - elem.remove(child) + to_rem = [c for i, c in enumerate(elem) if i != max_pos] + for child in to_rem: + elem.remove(child) def parse_orthoxml(fh, processor:GroupExtractor): From ee2a5b66cfddcd11269d0a96870df6c5ca1c7b3e Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Thu, 15 Aug 2024 08:49:43 +0200 Subject: [PATCH 2/5] [FIX] some filesystem do not allow filenames with ':'. Avoid this for roothog fasta files --- FastOMA/collect_subhogs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py index 0e97c01..6bcb3a5 100644 --- a/FastOMA/collect_subhogs.py +++ b/FastOMA/collect_subhogs.py @@ -254,7 +254,7 @@ def write_roothogs(orthoxml: Path, roothog_folder: Path, output_file_roothog_tsv for gene in group_members: tsv.write(f"{group_name}\t{gene}\t{omamer_roothog}\n") - _write_group_fasta(fasta_format, group_members, group_name, id_transformer, meta, output_fasta_groups, + _write_group_fasta(fasta_format, group_members, group_name.replace(":", ""), id_transformer, meta, output_fasta_groups, roothog_folder) logger.info("writing of %s done. 
created %d groups containing %d proteins in total", From 4b18d6fb9e2a1a85132145219e5590dbb79ade6f Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 19 Aug 2024 09:30:46 +0200 Subject: [PATCH 3/5] [FIX] only return groups with more than one gene. --- FastOMA/zoo/hog/extract_groups.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/FastOMA/zoo/hog/extract_groups.py b/FastOMA/zoo/hog/extract_groups.py index 19db3e5..cf4d16b 100644 --- a/FastOMA/zoo/hog/extract_groups.py +++ b/FastOMA/zoo/hog/extract_groups.py @@ -181,7 +181,10 @@ def fixtag(tag, ns=""): processor.merge_children(elem) if og_level == extract_at_depth: logger.debug("dumping annotated group with {} genes".format(len(elem.text))) - yield processor.get_group(elem) + if len(elem.text) > 1: + yield processor.get_group(elem) + else: + logger.debug("won't return group of less than two proteins") elem.clear() extract_at_depth = -1 if processor.target_clade is not None else 0 if og_level == 0: From 2898cba131847cb74e7701a637af363db9db0384 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 19 Aug 2024 09:40:48 +0200 Subject: [PATCH 4/5] bump version 0.3.3+dev -> 0.3.4 --- FastOMA/__init__.py | 2 +- README.md | 8 ++++---- nextflow.config | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/FastOMA/__init__.py b/FastOMA/__init__.py index 71aa787..4db05ee 100644 --- a/FastOMA/__init__.py +++ b/FastOMA/__init__.py @@ -1,3 +1,3 @@ __packagename__ = "FastOMA" -__version__ = "0.3.3+dev" +__version__ = "0.3.4" diff --git a/README.md b/README.md index 5bf03be..c02b6b2 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ installed. ```bash nextflow run dessimozlab/FastOMA -profile docker --input_folder /path/to/in_folder --output_folder /path/to/out_folder ``` -You could also add specific version to be used by adding `-r v0.3.1` to the command line. +You could also add specific version to be used by adding `-r v0.3.4` to the command line. 
Nextflow will automatically fetch the [dessimozlab/FastOMA](https://github.com/dessimozlab/FastOMA) repository and starts the `FastOMA.nf` workflow. The `-profile` argument must be used to specify the profile to use. We support `docker`, `singularity` and `conda` which then automatically set up the necessary tools by downloading the required containers or creating @@ -88,10 +88,10 @@ There are four ways to run/install FastOMA detailed below: The FastOMA workflow can be run directly without any installation using nextflow's ability to fetch a workflow from github. A specific version can be selected by specifying the `-r` option to nextflow to select a specific version of FastOMA: ```bash -nextflow run desimozlab/FastOMA -r v0.3.1 -profile conda +nextflow run dessimozlab/FastOMA -r v0.3.4 -profile conda ``` -This will fetch version v0.3.1 from github and run the FastOMA workflow using the conda profile. See section [How to run fastOMA](#how-to-run-fastoma). +This will fetch version v0.3.4 from github and run the FastOMA workflow using the conda profile. See section [How to run fastOMA](#how-to-run-fastoma). ### 2. Cloning the FastOMA repo and running from there @@ -184,7 +184,7 @@ nextflow run FastOMA.nf -profile docker \ --output_folder myresult/ ``` This will use the container that is tagged with the current commit id. Similarly, one could also use -`--container_version "0.3.1"` to use the container with version `dessimozlab/fastoma:0.3.1` from dockerhub. Check the latest version on the [DockerHub](https://hub.docker.com/r/dessimozlab/fastoma/tags). +`--container_version "0.3.4"` to use the container with version `dessimozlab/fastoma:0.3.4` from dockerhub. Check the latest version on the [DockerHub](https://hub.docker.com/r/dessimozlab/fastoma/tags). ### Singularity Since Docker needs administrator privileges (root access), [Singluarity](https://apptainer.org/index.html) (a.k.a Apptainer) is a good alternative. 
This can be installed using [Conda](https://anaconda.org/conda-forge/singularity) with `conda install conda-forge::singularity`. However, in most of the academic HPC cluster, singluarity is already installed and can be called with `module load`. diff --git a/nextflow.config b/nextflow.config index 10a68e2..bfabdca 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,12 +8,12 @@ manifest { nextflowVersion = ">=22.10.4" defaultBranch = "main" doi = "10.1101/2024.01.29.577392" - version = "0.4dev" + version = "0.3.4" } params { container_name = "dessimozlab/fastoma" - container_version = "dev" + container_version = "0.3.4" omamer_db = "https://omabrowser.org/All/LUCA.h5" debug_enabled = false help = false From 95b531a1e8275634aa9c51916b322999dc0e5187 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 19 Aug 2024 09:41:09 +0200 Subject: [PATCH 5/5] update change log --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c02b6b2..eeea04a 100644 --- a/README.md +++ b/README.md @@ -449,6 +449,8 @@ Majidian, Sina, Yannis Nevers, Ali Yazdizadeh Kharrazi, Alex Warwick Vesztrocy, ## Change log +- Update v0.3.4: Fixing a bug in marker gene group extraction. Previously, more than one gene per species was possible in a group. +- Update v0.3.3: improvements for nextflow (selection of alternative versions) and updates on readme - Update v0.3.1: spliting HOG and sampling - Update v0.1.6: adding dynamic resources, additional and improved output - Update v0.1.5: docker, add help, clean nextflow