Skip to content

Commit

Permalink
feat: implement normalization to avoid overly large embedding vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
felixnext committed Sep 23, 2019
1 parent b848a3b commit fd31d00
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
__status__ = "Package"
__copyright__ = "Copyright 2019"
__license__ = "MIT License"
__version__ = "0.1.2"
__version__ = "0.1.3"

# 01101100 00110000 00110000 01110000
__author__ = "Felix Geilert"
Expand Down
21 changes: 16 additions & 5 deletions sklearn_recommender/glove.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from zipfile import ZipFile
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

# TODO: store in user home dir
folder = os.path.dirname(os.path.realpath(__file__))
Expand Down Expand Up @@ -81,11 +82,17 @@ def load_vectors(self, name, dim):
embeddings_index = list(filter(lambda x: len(x[1]) == dim, embeddings_index))
return dict(embeddings_index)

def word_vector(self, word, normalize=True):
    '''Return the embedding for `word`, or a random fallback vector.

    Args:
      word: token to look up in the loaded embedding dict (`self.emb`).
      normalize: if True, scale the result to unit L2 norm (zero vectors
        are returned unchanged to avoid division by zero).

    Returns:
      A 1-D numpy array of length `self.emb_size`. Unknown words yield a
      random draw from N(self.emb_mean, self.emb_std) so downstream code
      always receives a vector of the expected size.
    '''
    vec = self.emb.get(word)
    if vec is None:
        # unknown word: sample noise matching the corpus statistics
        vec = np.random.normal(self.emb_mean, self.emb_std, (self.emb_size))
    if normalize:
        length = np.linalg.norm(vec)
        if length != 0:
            vec = vec / length
    return vec

def sent_vector(self, sent, use_rand=True):
'''Generates a single embedding vector.
Expand All @@ -109,11 +116,15 @@ def sent_vector(self, sent, use_rand=True):
else:
vec += wvec
vec_count += 1

# select the vector
vec = vec if vec is not None else np.random.normal(self.emb_mean, self.emb_std, (self.emb_size))
# normalize the vector
if vec is not None and vec_count > 0:
vec = vec / vec_count
norm = np.linalg.norm(vec)
if norm != 0:
vec = np.divide(vec, norm)
# if no word is found return random vector
return vec if vec is not None else np.random.normal(self.emb_mean, self.emb_std, (self.emb_size))
return vec

def sent_matrix(self, sent, max_feat, pad, dedub=False):
'''Generates a Matrix of single embeddings for the item.
Expand Down

0 comments on commit fd31d00

Please sign in to comment.