Skip to content

Commit

Permalink
toml update
Browse files Browse the repository at this point in the history
fixing taxfilter errors
  • Loading branch information
cactuskid committed Oct 23, 2023
1 parent ca394de commit a6f4536
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 18 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "HogProf"
dynamic = ["entry-points"]
version = "0.0.8"
version = "0.0.9"
authors = [
{ name="Dave Moi", email="[email protected]" },
]
Expand Down
55 changes: 41 additions & 14 deletions src/HogProf/lshbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import time as t
import pickle
import xml.etree.cElementTree as ET
from ete3 import Phyloxml


from datasketch import MinHashLSHForest , WeightedMinHashGenerator
from datetime import datetime
Expand Down Expand Up @@ -71,8 +73,6 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
self.datetime = datetime
self.fileglob = fileglob
self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now())


if saving_name:
self.saving_name= saving_name
if self.saving_name[-1]!= '/':
Expand All @@ -95,13 +95,27 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
self.tree_string , self.tree_ete3 = files_utils.get_tree(taxa=taxlist , outdir=self.saving_path)
else:
raise Exception( 'please specify either a list of taxa or a tree' )
self.swap2taxcode = True
elif mastertree:
self.tree_ete3 = ete3.Tree(masterTree, format=1)
elif masterTree:
if 'xml' in masterTree.lower():
project = Phyloxml()
project.build_from_file(masterTree)
trees = [t for t in project.get_phylogeny()]
self.tree_ete3 = [ n for n in trees[0] ][0]
print( self.tree_ete3 )

else:

try:
self.tree_ete3 = ete3.Tree(masterTree, format=1 , quoted_node_names= True)
print( self.tree_ete3 )
except:
self.tree_ete3 = ete3.Tree(masterTree, format=0)

with open(masterTree) as treein:
self.tree_string = treein.read()
self.swap2taxcode = use_taxcodes
#self.tree_string = self.tree_ete3.write(format=0)

self.swap2taxcode = use_taxcodes
self.taxaIndex, self.reverse = files_utils.generate_taxa_index(self.tree_ete3 , self.tax_filter, self.tax_mask)
with open( self.saving_path + 'taxaIndex.pkl', 'wb') as taxout:
taxout.write( pickle.dumps(self.taxaIndex))
Expand All @@ -118,7 +132,16 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
wmgout.write( pickle.dumps(wmg))
self.wmg = wmg
print( 'configuring pyham functions')

if self.swap2taxcode == True:
print('swapping ids')
else:
print('not swapping ids')



if self.h5OMA:

self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode )
else:
self.HAM_PIPELINE = functools.partial( pyhamutils.get_ham_treemap_from_row, tree=self.tree_string , swap_ids=self.swap2taxcode , orthoXML_as_string = False )
Expand All @@ -133,7 +156,6 @@ def __init__(self,h5_oma=None,fileglob = None, taxa=None,masterTree=None, saving
self.n_groups = len(self.fileglob)
else:
raise Exception( 'please specify an input file' )

self.hashes_path = self.saving_path + 'hashes.h5'
self.lshpath = self.saving_path + 'newlsh.pkl'
self.lshforestpath = self.saving_path + 'newlshforest.pkl'
Expand All @@ -155,7 +177,7 @@ def generates_dataframes(self, size=100, minhog_size=10, maxhog_size=None ):
if self.h5OMA:
self.groups = self.h5OMA.root.OrthoXML.Index
self.rows = len(self.groups)
for i, row in tqdm.tqdm(enumerate(self.groups)):
for i, row in enumerate(self.groups):
if i > start:
fam = row[0]
ortho_fam = self.READ_ORTHO(fam)
Expand Down Expand Up @@ -344,8 +366,10 @@ def mp_with_timeout(functypes, data_generator):
for key in work_processes:
for process in work_processes[key]:
process.start()
for data in data_generator:

for data in tqdm.tqdm(data_generator):
q.put(data)

print('done spooling data')
for key in work_processes:
for i in range(2):
Expand Down Expand Up @@ -452,12 +476,16 @@ def main():
else:
duplonly = False

if args['taxcodes']:
taxcodes = args['taxcodes']
else:


if args['taxcodes'] == 'True':
taxcodes = True
else:
taxcodes = False

print('taxcodes', taxcodes)

if args['verbose']:
if args['verbose'] == 'True':
verbose = args['verbose']
else:
verbose = False
Expand Down Expand Up @@ -485,7 +513,6 @@ def main():
else:
mastertree=None
start = time.time()

if omafile:
with open_file( omafile , mode="r") as h5_oma:
lsh_builder = LSHBuilder(h5_oma = h5_oma, fileglob=orthoglob ,saving_name=dbname , numperm = nperm ,
Expand Down
Binary file modified src/HogProf/utils/__pycache__/hashutils.cpython-310.pyc
Binary file not shown.
Binary file modified src/HogProf/utils/__pycache__/pyhamutils.cpython-310.pyc
Binary file not shown.
2 changes: 2 additions & 0 deletions src/HogProf/utils/hashutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ def hash_tree(tp , taxaIndex , treeweights , wmg , lossonly = False , duplonly =
else:
#throwaway vector...
hog_matrix_weighted[0,0] = 1
if np.sum(hog_matrix_weighted) == 0:
hog_matrix_weighted[0,0] = 1

weighted_hash = wmg.minhash(list(hog_matrix_weighted.flatten()))
return hog_matrix_binary , weighted_hash
Expand Down
9 changes: 6 additions & 3 deletions src/HogProf/utils/pyhamutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@ def switch_name_ncbi_id(orthoxml , mapdict = None ):

orthoxml = ET.tostring(root, encoding='unicode', method='xml')
return orthoxml
def get_ham_treemap_from_row(row, tree , level = None , swap_ids = True , orthoXML_as_string = True):
def get_ham_treemap_from_row(row, tree , level = None , swap_ids = True , orthoXML_as_string = True ):
fam, orthoxml = row
if orthoxml:
try:
if swap_ids == True and orthoXML_as_string == True:
orthoxml = switch_name_ncbi_id(orthoxml)
ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml", use_internal_name=True, orthoXML_as_string=orthoXML_as_string)
tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0])
quoted = False
else:
quoted = True
ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml" , tree_format = 'newick_string' ,use_internal_name=True, orthoXML_as_string=orthoXML_as_string )
tp = ham_obj.create_tree_profile(hog=ham_obj.get_list_top_level_hogs()[0])
return tp.treemap
except:
print('error' , traceback.format_exc())
Expand Down

0 comments on commit a6f4536

Please sign in to comment.