diff --git a/pyproject.toml b/pyproject.toml index 6ce4573..d0adcdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "HogProf" dynamic = ["entry-points"] -version = "0.0.8" +version = "0.0.9" authors = [ { name="Dave Moi", email="dmoi@unil.ch" }, ] diff --git a/src/HogProf/lshbuilder.py b/src/HogProf/lshbuilder.py index 803cb55..4cad58c 100755 --- a/src/HogProf/lshbuilder.py +++ b/src/HogProf/lshbuilder.py @@ -8,6 +8,8 @@ import time as t import pickle import xml.etree.cElementTree as ET +from ete3 import Phyloxml + from datasketch import MinHashLSHForest , WeightedMinHashGenerator from datetime import datetime @@ -71,8 +73,6 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving self.datetime = datetime self.fileglob = fileglob self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now()) - - if saving_name: self.saving_name= saving_name if self.saving_name[-1]!= '/': @@ -95,13 +95,27 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving self.tree_string , self.tree_ete3 = files_utils.get_tree(taxa=taxlist , outdir=self.saving_path) else: raise Exception( 'please specify either a list of taxa or a tree' ) - self.swap2taxcode = True - elif mastertree: - self.tree_ete3 = ete3.Tree(masterTree, format=1) + elif masterTree: + if 'xml' in masterTree.lower(): + project = Phyloxml() + project.build_from_file(masterTree) + trees = [t for t in project.get_phylogeny()] + self.tree_ete3 = [ n for n in trees[0] ][0] + print( self.tree_ete3 ) + + else: + + try: + self.tree_ete3 = ete3.Tree(masterTree, format=1 , quoted_node_names= True) + print( self.tree_ete3 ) + except: + self.tree_ete3 = ete3.Tree(masterTree, format=0) + with open(masterTree) as treein: self.tree_string = treein.read() - self.swap2taxcode = use_taxcodes + #self.tree_string = self.tree_ete3.write(format=0) + self.swap2taxcode = use_taxcodes self.taxaIndex, self.reverse = files_utils.generate_taxa_index(self.tree_ete3 , self.tax_filter, self.tax_mask) with open( self.saving_path + 'taxaIndex.pkl', 'wb') as taxout: taxout.write( pickle.dumps(self.taxaIndex)) @@ -118,7 +132,16 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving wmgout.write( pickle.dumps(wmg)) self.wmg = wmg print( 'configuring pyham functions') + + if self.swap2taxcode == True: + print('swapping ids') + else: + print('not swapping ids') + + + if self.h5OMA: + self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode ) else: self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode , orthoXML_as_string = False ) @@ -133,7 +156,6 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving self.n_groups = len(self.fileglob) else: raise Exception( 'please specify an input file' ) - self.hashes_path = self.saving_path + 'hashes.h5' self.lshpath = self.saving_path + 'newlsh.pkl' self.lshforestpath = self.saving_path + 'newlshforest.pkl' @@ -155,7 +177,7 @@ def generates_dataframes(self, size=100, minhog_size=10, maxhog_size=None ): if self.h5OMA: self.groups = self.h5OMA.root.OrthoXML.Index self.rows = len(self.groups) - for i, row in tqdm.tqdm(enumerate(self.groups)): + for i, row in enumerate(self.groups): if i > start: fam = row[0] ortho_fam = self.READ_ORTHO(fam) @@ -344,8 +366,10 @@ def mp_with_timeout(functypes, data_generator): for key in work_processes: for process in work_processes[key]: process.start() - for data in data_generator: + + for data in tqdm.tqdm(data_generator): q.put(data) + print('done spooling data') for key in work_processes: for i in range(2): @@ -452,12 +476,16 @@ def main(): else: duplonly = False - if args['taxcodes']: - taxcodes = args['taxcodes'] - else: + + + if args['taxcodes'] == 'True': + taxcodes = True + else: taxcodes = False + + print('taxcodes', taxcodes) - if args['verbose']: + if args['verbose'] == 'True': verbose = args['verbose'] else: verbose = False @@ -485,7 +513,6 @@ def main(): else: mastertree=None start = time.time() - if omafile: with open_file( omafile , mode="r") as h5_oma: lsh_builder = LSHBuilder(h5_oma = h5_oma, fileglob=orthoglob ,saving_name=dbname , numperm = nperm , diff --git a/src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc b/src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc index 0763262..3cd22ca 100644 Binary files a/src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc and b/src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc differ diff --git a/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc b/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc index 16199a6..db5f3f6 100644 Binary files a/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc and b/src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc differ diff --git a/src/HogProf/utils/hashutils.py b/src/HogProf/utils/hashutils.py index 5b340fd..8c97548 100755 --- a/src/HogProf/utils/hashutils.py +++ b/src/HogProf/utils/hashutils.py @@ -73,6 +73,8 @@ def hash_tree(tp , taxaIndex , treeweights , wmg , lossonly = False , duplonly = else: #throwaway vector... hog_matrix_weighted[0,0] = 1 + if np.sum(hog_matrix_weighted) == 0: + hog_matrix_weighted[0,0] = 1 weighted_hash = wmg.minhash(list(hog_matrix_weighted.flatten())) return hog_matrix_binary , weighted_hash diff --git a/src/HogProf/utils/pyhamutils.py b/src/HogProf/utils/pyhamutils.py index 2c1cdd2..6491312 100755 --- a/src/HogProf/utils/pyhamutils.py +++ b/src/HogProf/utils/pyhamutils.py @@ -33,14 +33,17 @@ def switch_name_ncbi_id(orthoxml , mapdict = None ): orthoxml = ET.tostring(root, encoding='unicode', method='xml') return orthoxml -def get_ham_treemap_from_row(row, tree , level = None , swap_ids = True , orthoXML_as_string = True): +def get_ham_treemap_from_row(row, tree , level = None , swap_ids = True , orthoXML_as_string = True ): fam, orthoxml = row if orthoxml: try: if swap_ids == True and orthoXML_as_string == True: orthoxml = switch_name_ncbi_id(orthoxml) - ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml", use_internal_name=True, orthoXML_as_string=orthoXML_as_string) - tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0]) + quoted = False + else: + quoted = True + ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml" , tree_format = 'newick_string' ,use_internal_name=True, orthoXML_as_string=orthoXML_as_string ) + tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0]) return tp.treemap except: print('error' , traceback.format_exc())