From f3d664a4df4c62f7717769ac38851ad1629fd385 Mon Sep 17 00:00:00 2001 From: Gerry Tonkin-Hill Date: Tue, 24 Sep 2024 02:54:13 +0200 Subject: [PATCH] exposed cdhit length requirement in gene family clustering steps to address #299 --- panaroo/__main__.py | 7 +++++++ panaroo/clean_network.py | 3 +++ 2 files changed, 10 insertions(+) diff --git a/panaroo/__main__.py b/panaroo/__main__.py index 72ce87c..0e51864 100755 --- a/panaroo/__main__.py +++ b/panaroo/__main__.py @@ -114,6 +114,10 @@ def get_options(args): dest="len_dif_percent", help="length difference cutoff (default=0.98)", type=float) + matching.add_argument("--family_len_dif_percent", + dest="family_len_dif_percent", + help="length difference cutoff at the gene family level (default=0.0)", + type=float) matching.add_argument("--merge_paralogs", dest="merge_paralogs", help="don't split paralogs", @@ -386,6 +390,7 @@ def main(): outdir=temp_dir, dna_error_threshold=0.98, correct_mistranslations=True, + family_len_dif_percent=args.family_len_dif_percent, length_outlier_support_proportion=args. length_outlier_support_proportion, n_cpu=args.n_cpu, @@ -401,6 +406,7 @@ def main(): outdir=temp_dir, family_threshold=args.family_threshold, correct_mistranslations=False, + family_len_dif_percent=args.family_len_dif_percent, length_outlier_support_proportion=args. length_outlier_support_proportion, n_cpu=args.n_cpu, @@ -454,6 +460,7 @@ def main(): outdir=temp_dir, family_threshold=args.family_threshold, correct_mistranslations=False, + family_len_dif_percent=args.family_len_dif_percent, length_outlier_support_proportion=args. length_outlier_support_proportion, n_cpu=args.n_cpu, diff --git a/panaroo/clean_network.py b/panaroo/clean_network.py index a9312e2..0a52d07 100755 --- a/panaroo/clean_network.py +++ b/panaroo/clean_network.py @@ -92,6 +92,7 @@ def collapse_families(G, outdir, family_threshold=0.7, dna_error_threshold=0.99, + family_len_dif_percent=0, correct_mistranslations=False, length_outlier_support_proportion=0.01, n_cpu=1, @@ -113,6 +114,7 @@ def collapse_families(G, cdhit_clusters = iterative_cdhit(G, outdir, thresholds=threshold, + s=family_len_dif_percent, n_cpu=n_cpu, quiet=True, dna=True, @@ -124,6 +126,7 @@ def collapse_families(G, cdhit_clusters = iterative_cdhit(G, outdir, thresholds=threshold, + s=family_len_dif_percent, n_cpu=n_cpu, quiet=True, dna=False)