diff --git a/.gitignore b/.gitignore index a224e6b..9dad835 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ inputs/hg38/ results/ +tmp/ # Byte-compiled / optimized / DLL files __pycache__/ @@ -90,3 +91,7 @@ ENV/ # Rope project settings .ropeproject + +test.py + +data/ \ No newline at end of file diff --git a/attic_util/util.py b/attic_util/util.py index 4deb572..938282f 100644 --- a/attic_util/util.py +++ b/attic_util/util.py @@ -1,11 +1,18 @@ import random import string -import resource import logbook import arrow import numpy as np import os +importedResource = False + +try: + import resource + importedResource = True +except ImportError: + pass + def split_Xy(df, y_colname='label'): X = df.drop([y_colname], axis=1) y = df[y_colname] @@ -26,7 +33,10 @@ def random_str(N): return ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.ascii_uppercase + string.digits) for _ in range(N)) def memory_usage(): - return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1E6 + if(importedResource): + return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1E6 + else: + return 0 def estimate_bytes(filenames): return sum([os.stat(f).st_size for f in filenames]) diff --git a/dna2vec/multi_k_model.py b/dna2vec/multi_k_model.py index 1d1513c..2493844 100644 --- a/dna2vec/multi_k_model.py +++ b/dna2vec/multi_k_model.py @@ -4,9 +4,11 @@ import tempfile import numpy as np -from gensim.models import word2vec +# from gensim.models import word2vec from gensim import matutils +import gensim + class SingleKModel: def __init__(self, model): self.model = model @@ -14,7 +16,7 @@ def __init__(self, model): class MultiKModel: def __init__(self, filepath): - self.aggregate = word2vec.Word2Vec.load_word2vec_format(filepath, binary=False) + self.aggregate = gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=False) self.logger = logbook.Logger(self.__class__.__name__) vocab_lens = [len(vocab) for vocab in self.aggregate.vocab.keys()] @@ -25,6 +27,7 @@ def __init__(self, filepath): self.data = {} for k in range(self.k_low, self.k_high + 1): self.data[k] = self.separate_out_model(k) + print(len(self.data)) def model(self, k_len): """ @@ -50,10 +53,11 @@ def separate_out_model(self, k_len): self.logger.warn('Missing {}-mers: {} / {}'.format(k_len, len(vocabs), 4 ** k_len)) header_str = '{} {}'.format(len(vocabs), self.vec_dim) - with tempfile.NamedTemporaryFile(mode='w') as fptr: + with tempfile.NamedTemporaryFile(mode='w', delete=False) as fptr: print(header_str, file=fptr) for vocab in vocabs: vec_str = ' '.join("%f" % val for val in self.aggregate[vocab]) print('{} {}'.format(vocab, vec_str), file=fptr) fptr.flush() - return SingleKModel(word2vec.Word2Vec.load_word2vec_format(fptr.name, binary=False)) + open(fptr.name, "rb") + return SingleKModel(gensim.models.KeyedVectors.load_word2vec_format(fptr.name, binary=False)) diff --git a/requirements.txt b/requirements.txt index 14b5c6a..de84075 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,18 @@ -arrow==0.8.0 -biopython==1.68 -boto==2.46.1 +arrow==0.15.6 +biopython==1.76 bz2file==0.98 -ConfigArgParse==0.11.0 -gensim==0.13.2 -Logbook==1.0.0 -numpy==1.16 -pep8==1.7.0 -pluggy==0.4.0 -py==1.4.33 -pytest==3.0.7 -python-dateutil==2.6.0 -requests==2.20.0 -scipy==0.19.0 -six==1.10.0 -smart-open==1.5.1 -tox==2.7.0 -tox-pyenv==1.0.3 -virtualenv==15.1.0 +ConfigArgParse==1.2.3 +gensim==3.8.3 +Logbook==1.5.3 +numpy==1.18.1 +pep8==1.7.1 +pluggy==0.13.1 +py==1.8.1 +pytest==5.4.2 +python-dateutil==2.8.1 +requests==2.23.0 +six==1.14.0 +smart-open==2.0.0 +tox==3.15.0 +tox-pyenv==1.1.0 +virtualenv==20.0.20 diff --git a/scripts/train_dna2vec.py b/scripts/train_dna2vec.py index e2fe6a2..c2e9347 100755 --- a/scripts/train_dna2vec.py +++ b/scripts/train_dna2vec.py @@ -132,7 +132,7 @@ def main(): args.kmer_fragmenter)) out_txt_filename = '{}.txt'.format(out_fileroot) - with open(out_txt_filename, 'w') as summary_fptr: + with open(out_txt_filename, 'w+') as summary_fptr: with Tee(summary_fptr): logbook.StreamHandler(sys.stdout, level=log_level).push_application() redirect_logging()