diff --git a/README.md b/README.md
index 1d1aa0d..2541cc1 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,53 @@
 # LSCDetection
-Data Sets and Models for Evaluation of Lexical Semantic Change Detection
+Data Sets and Models for Evaluation of Lexical Semantic Change Detection.
+
+If you use this software for academic research, [please cite this paper](#bibtex) and make sure you give appropriate credit to the software mentioned below, on which this repository strongly depends.
+
+The code heavily relies on [DISSECT](http://clic.cimec.unitn.it/composes/toolkit/introduction.html) (modules/composes). For aligning embeddings (SGNS/SVD/RI) we used [VecMap](https://github.com/artetxem/vecmap) (alignment/map_embeddings.py). We used the SGNS implementation provided by [gensim](https://github.com/rare-technologies/gensim).
+
+### Testsets
+
+In `testsets/` we provide the versions of the DURel and SURel test sets as used in the paper.
+
+### Usage Note
+
+The scripts should be run directly from the main directory. If you wish to run them from elsewhere, you may have to adjust the path added in `sys.path.append('./modules/')` in the scripts. All scripts can be run directly from the command line, e.g.:
+
+    python representations/count.py
+
+We recommend running the scripts with the Anaconda distribution of Python (Python 2.7.15); only VecMap requires Python 3. You will have to install some additional packages such as docopt and gensim. Packages that are not available from the Anaconda installer can be installed via EasyInstall or by running `pip install -r requirements.txt`.
+
+### Pipeline
+
+Under `scripts/` you will find an example of a full pipeline for the models on a small test corpus. Assuming you are working on a UNIX-based system, first make the scripts executable with
+
+    chmod 755 scripts/*.sh
+
+Then run any of
+
+    bash -e scripts/make_results_sim.sh
+    bash -e scripts/make_results_disp.sh
+    bash -e scripts/make_results_wi.sh
+
+The script `make_results_sim.sh` produces results for the similarity measures (Cosine Distance, Local Neighborhood Distance) for all vector space and alignment types except Word Injection. It first reads the gzipped test corpus in `corpora/test/corpus.txt.gz`, with each line in the following format:
+
+    year [tab] word1 word2 word3...
+
+It then produces model predictions for the targets in `testsets/test/targets.tsv`, writes them under `results/` and correlates the predictions with the gold rank in `testsets/test/gold.tsv`. Finally, it writes the Spearman correlation between each model prediction and the gold rank under `results/`.
+
+The scripts `make_results_disp.sh` and `make_results_wi.sh` do the same for the dispersion measures (Frequency, Types, Entropy Difference) and for the similarity measures with Word Injection.
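+
+For orientation, the correlation step at the end of the pipeline is also available as a standalone script (`evaluation/spearman.py`). The following is only a minimal sketch of what that step computes, assuming two tab-separated files whose rows are in the same order and whose second column holds the values to correlate (the prediction file name is a placeholder):
+
+    import codecs
+    from scipy.stats import spearmanr
+
+    # Read model predictions and gold values (second column of each file)
+    with codecs.open('results/model.csv', 'r', 'utf-8') as f_in:
+        pred = [float(line.strip().split('\t')[1]) for line in f_in]
+    with codecs.open('testsets/test/gold.tsv', 'r', 'utf-8') as f_in:
+        gold = [float(line.strip().split('\t')[1]) for line in f_in]
+
+    # Spearman correlation, omitting nan values as in evaluation/spearman.py
+    rho, p = spearmanr(pred, gold, nan_policy='omit')
+    print rho, p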
+ +BibTex +-------- + +``` +@inproceedings{Schlechtwegetal19, +title = {{A Wind of Change: Detecting and Evaluating Lexical Semantic Change across Times and Domains}}, +author = {Dominik Schlechtweg and Anna H\"{a}tty and Marco del Tredici and Sabine {Schulte im Walde}}, +booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", +year = "2019", +address = "Florence, Italy", +publisher = "Association for Computational Linguistics" +} +``` + diff --git a/alignment/ci_align.py b/alignment/ci_align.py new file mode 100644 index 0000000..dc82033 --- /dev/null +++ b/alignment/ci_align.py @@ -0,0 +1,83 @@ +import sys +sys.path.append('./modules/') + +from docopt import docopt +from dsm import load_pkl_files, save_pkl_files +from composes.semantic_space.space import Space +from composes.matrix.sparse_matrix import SparseMatrix +from scipy.sparse import linalg +import logging +import time + + +def main(): + """ + Align two sparse matrices by intersecting their columns. + """ + + # Get the arguments + args = docopt('''Align two sparse matrices by intersecting their columns. + + Usage: + ci_align.py [-l] + + = output path for aligned space 1 + = output path for aligned space 2 + = path to pickled space1 without suffix + = path to pickled space2 without suffix + + Options: + -l, --len normalize final vectors to unit length + + ''') + + is_len = args['--len'] + spacePrefix1 = args[''] + spacePrefix2 = args[''] + outPath1 = args[''] + outPath2 = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Get the two matrices as spaces and intersect their columns + space1 = load_pkl_files(spacePrefix1) + space2 = load_pkl_files(spacePrefix2) + id2row1 = space1.get_id2row() + id2row2 = space2.get_id2row() + id2column1 = space1.get_id2column() + id2column2 = space2.get_id2column() + column2id1 = space1.get_column2id() + column2id2 = space2.get_column2id() + intersected_columns = list(set(id2column1).intersection(id2column2)) + intersected_columns_id1 = [column2id1[item] for item in intersected_columns] + intersected_columns_id2 = [column2id2[item] for item in intersected_columns] + reduced_matrix1 = space1.get_cooccurrence_matrix()[:, intersected_columns_id1].get_mat() + reduced_matrix2 = space2.get_cooccurrence_matrix()[:, intersected_columns_id2].get_mat() + + if is_len: + # L2-normalize vectors + l2norm1 = linalg.norm(reduced_matrix1, axis=1, ord=2) + l2norm2 = linalg.norm(reduced_matrix2, axis=1, ord=2) + l2norm1[l2norm1==0.0] = 1.0 # Convert 0 values to 1 + l2norm2[l2norm2==0.0] = 1.0 # Convert 0 values to 1 + reduced_matrix1 /= l2norm1.reshape(len(l2norm1),1) + reduced_matrix2 /= l2norm2.reshape(len(l2norm2),1) + + # Make new spaces + reduced_space1 = Space(SparseMatrix(reduced_matrix1), id2row1, intersected_columns) + reduced_space2 = Space(SparseMatrix(reduced_matrix2), id2row2, intersected_columns) + + if reduced_space1.get_id2column()!=reduced_space2.get_id2column(): + sys.exit('Two spaces not properly aligned!') + + # Save the Space object in pickle format + save_pkl_files(reduced_space1, outPath1 + '.sm', save_in_one_file=True) + save_pkl_files(reduced_space2, outPath2 + '.sm', save_in_one_file=True) + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/alignment/map_embeddings.py b/alignment/map_embeddings.py new file mode 100644 index 
0000000..d04e7b8 --- /dev/null +++ b/alignment/map_embeddings.py @@ -0,0 +1,435 @@ +import sys +sys.path.append('./modules/') + +# Copyright (C) 2016-2018 Mikel Artetxe +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import embeddings +from cupy_utils import * + +import argparse +import collections +import numpy as np +import re +import sys +import time +import logging + + +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) +logging.info(__file__.upper()) +start_time = time.time() + + +def dropout(m, p): + if p <= 0.0: + return m + else: + xp = get_array_module(m) + mask = xp.random.rand(*m.shape) >= p + return m*mask + + +def topk_mean(m, k, inplace=False): # TODO Assuming that axis is 1 + xp = get_array_module(m) + n = m.shape[0] + ans = xp.zeros(n, dtype=m.dtype) + if k <= 0: + return ans + if not inplace: + m = xp.array(m) + ind0 = xp.arange(n) + ind1 = xp.empty(n, dtype=int) + minimum = m.min() + for i in range(k): + m.argmax(axis=1, out=ind1) + ans += m[ind0, ind1] + m[ind0, ind1] = minimum + return ans / k + + +def main(): + # Parse command line arguments + parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space') + parser.add_argument('src_input', help='the input source embeddings') + parser.add_argument('trg_input', help='the input target embeddings') + parser.add_argument('src_output', help='the output source embeddings') + parser.add_argument('trg_output', help='the output target embeddings') + parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') + parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)') + parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)') + parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory') + parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)') + + recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios') + recommended_type = recommended_group.add_mutually_exclusive_group() + recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary') + recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary') + recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words') + recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words') + recommended_type.add_argument('--acl2018', action='store_true', help='reproduce 
our ACL 2018 system') + recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system') + recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization') + recommended_type.add_argument('--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary') + recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system') + + init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments') + init_type = init_group.add_mutually_exclusive_group() + init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)') + init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary') + init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary') + init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') + init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization') + + mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments') + mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') + mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') + mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') + mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') + mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') + mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') + mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') + mapping_type = mapping_group.add_mutually_exclusive_group() + mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') + mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping') + + self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning') + self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning') + self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') + self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)') + self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction') + self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)') + 
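+    # The stochastic_* options below control dropout-based dictionary induction in the
+    # self-learning loop: the keep probability starts at --stochastic_initial and is
+    # multiplied by --stochastic_multiplier whenever there has been no improvement for
+    # --stochastic_interval iterations; once it reaches 1.0, the next stagnation ends
+    # training (see the training loop below).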
self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration') + self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)') + self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') + self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') + self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration') + self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') + args = parser.parse_args() + + if args.supervised is not None: + parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) + if args.semi_supervised is not None: + parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) + if args.identical: + parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) + if args.unsupervised or args.acl2018: + parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) + if args.aaai2018: + parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) + if args.acl2017: + parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) + if args.acl2017_seed: + parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) + if args.emnlp2016: + parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000) + args = parser.parse_args() + + # Check command line arguments + if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: + print('ERROR: De-whitening requires whitening first', file=sys.stderr) + sys.exit(-1) + + # Choose the right dtype for the desired precision + if args.precision == 'fp16': + dtype = 'float16' + elif args.precision == 'fp32': + dtype = 'float32' + elif args.precision == 'fp64': + dtype = 'float64' + + # Read input embeddings + srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') + trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape') + src_words, x = embeddings.read(srcfile, dtype=dtype) + trg_words, z = embeddings.read(trgfile, dtype=dtype) + + # NumPy/CuPy management + if args.cuda: + if 
not supports_cupy(): + print('ERROR: Install CuPy for CUDA support', file=sys.stderr) + sys.exit(-1) + xp = get_cupy() + x = xp.asarray(x) + z = xp.asarray(z) + else: + xp = np + xp.random.seed(args.seed) + + # Build word to index map + src_word2ind = {word: i for i, word in enumerate(src_words)} + trg_word2ind = {word: i for i, word in enumerate(trg_words)} + + #print(args.normalize) + #print(args.self_learning) + # STEP 0: Normalization + embeddings.normalize(x, args.normalize) + embeddings.normalize(z, args.normalize) + + # Build the seed dictionary + src_indices = [] + trg_indices = [] + if args.init_unsupervised: + sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab) + u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False) + xsim = (u*s).dot(u.T) + u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False) + zsim = (u*s).dot(u.T) + del u, s, vt + xsim.sort(axis=1) + zsim.sort(axis=1) + embeddings.normalize(xsim, args.normalize) + embeddings.normalize(zsim, args.normalize) + sim = xsim.dot(zsim.T) + if args.csls_neighborhood > 0: + knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood) + knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood) + sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2 + if args.direction == 'forward': + src_indices = xp.arange(sim_size) + trg_indices = sim.argmax(axis=1) + elif args.direction == 'backward': + src_indices = sim.argmax(axis=0) + trg_indices = xp.arange(sim_size) + elif args.direction == 'union': + src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0))) + trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size))) + del xsim, zsim, sim + elif args.init_numerals: + numeral_regex = re.compile('^[0-9]+$') + src_numerals = {word for word in src_words if numeral_regex.match(word) is not None} + trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None} + numerals = src_numerals.intersection(trg_numerals) + for word in numerals: + src_indices.append(src_word2ind[word]) + trg_indices.append(trg_word2ind[word]) + elif args.init_identical: + identical = set(src_words).intersection(set(trg_words)) + for word in identical: + src_indices.append(src_word2ind[word]) + trg_indices.append(trg_word2ind[word]) + else: + f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') + for line in f: + src, trg = line.split() + try: + src_ind = src_word2ind[src] + trg_ind = trg_word2ind[trg] + src_indices.append(src_ind) + trg_indices.append(trg_ind) + except KeyError: + print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr) + + # Read validation dictionary + if args.validation is not None: + f = open(args.validation, encoding=args.encoding, errors='surrogateescape') + validation = collections.defaultdict(set) + oov = set() + vocab = set() + for line in f: + src, trg = line.split() + try: + src_ind = src_word2ind[src] + trg_ind = trg_word2ind[trg] + validation[src_ind].add(trg_ind) + vocab.add(src) + except KeyError: + oov.add(src) + oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov + validation_coverage = len(validation) / (len(validation) + len(oov)) + + # Create log file + if args.log: + log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') + + # Allocate memory + xw = xp.empty_like(x) + zw = xp.empty_like(z) + src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], 
args.vocabulary_cutoff) + trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff) + simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype) + simbwd = xp.empty((args.batch_size, src_size), dtype=dtype) + if args.validation is not None: + simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype) + + best_sim_forward = xp.full(src_size, -100, dtype=dtype) + src_indices_forward = xp.arange(src_size) + trg_indices_forward = xp.zeros(src_size, dtype=int) + best_sim_backward = xp.full(trg_size, -100, dtype=dtype) + src_indices_backward = xp.zeros(trg_size, dtype=int) + trg_indices_backward = xp.arange(trg_size) + knn_sim_fwd = xp.zeros(src_size, dtype=dtype) + knn_sim_bwd = xp.zeros(trg_size, dtype=dtype) + + # Training loop + best_objective = objective = -100. + it = 1 + last_improvement = 0 + keep_prob = args.stochastic_initial + t = time.time() + end = not args.self_learning + while True: + + # Increase the keep probability if we have not improve in args.stochastic_interval iterations + if it - last_improvement > args.stochastic_interval: + if keep_prob >= 1.0: + end = True + keep_prob = min(1.0, args.stochastic_multiplier*keep_prob) + last_improvement = it + + # Update the embedding mapping + if args.orthogonal or not end: # orthogonal mapping + u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices])) + w = vt.T.dot(u.T) + x.dot(w, out=xw) + zw[:] = z + elif args.unconstrained: # unconstrained mapping + x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T) + w = x_pseudoinv.dot(z[trg_indices]) + x.dot(w, out=xw) + zw[:] = z + else: # advanced mapping + + # TODO xw.dot(wx2, out=xw) and alike not working + xw[:] = x + zw[:] = z + + # STEP 1: Whitening + def whitening_transformation(m): + u, s, vt = xp.linalg.svd(m, full_matrices=False) + return vt.T.dot(xp.diag(1/s)).dot(vt) + if args.whiten: + wx1 = whitening_transformation(xw[src_indices]) + wz1 = whitening_transformation(zw[trg_indices]) + xw = xw.dot(wx1) + zw = zw.dot(wz1) + + # STEP 2: Orthogonal mapping + wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices])) + wz2 = wz2_t.T + xw = xw.dot(wx2) + zw = zw.dot(wz2) + + # STEP 3: Re-weighting + xw *= s**args.src_reweight + zw *= s**args.trg_reweight + + # STEP 4: De-whitening + if args.src_dewhiten == 'src': + xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) + elif args.src_dewhiten == 'trg': + xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) + if args.trg_dewhiten == 'src': + zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) + elif args.trg_dewhiten == 'trg': + zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) + + # STEP 5: Dimensionality reduction + if args.dim_reduction > 0: + xw = xw[:, :args.dim_reduction] + zw = zw[:, :args.dim_reduction] + + # Self-learning + if end: + break + else: + # Update the training dictionary + if args.direction in ('forward', 'union'): + if args.csls_neighborhood > 0: + for i in range(0, trg_size, simbwd.shape[0]): + j = min(i + simbwd.shape[0], trg_size) + zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) + knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True) + for i in range(0, src_size, simfwd.shape[0]): + j = min(i + simfwd.shape[0], src_size) + xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) + simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j]) + simfwd[:j-i] -= knn_sim_bwd/2 # Equivalent to the real CSLS scores for NN + dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j]) + if 
args.direction in ('backward', 'union'): + if args.csls_neighborhood > 0: + for i in range(0, src_size, simfwd.shape[0]): + j = min(i + simfwd.shape[0], src_size) + xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) + knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True) + for i in range(0, trg_size, simbwd.shape[0]): + j = min(i + simbwd.shape[0], trg_size) + zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) + simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j]) + simbwd[:j-i] -= knn_sim_fwd/2 # Equivalent to the real CSLS scores for NN + dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j]) + if args.direction == 'forward': + src_indices = src_indices_forward + trg_indices = trg_indices_forward + elif args.direction == 'backward': + src_indices = src_indices_backward + trg_indices = trg_indices_backward + elif args.direction == 'union': + src_indices = xp.concatenate((src_indices_forward, src_indices_backward)) + trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward)) + + # Objective function evaluation + if args.direction == 'forward': + objective = xp.mean(best_sim_forward).tolist() + elif args.direction == 'backward': + objective = xp.mean(best_sim_backward).tolist() + elif args.direction == 'union': + objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2 + if objective - best_objective >= args.threshold: + last_improvement = it + best_objective = objective + + # Accuracy and similarity evaluation in validation + if args.validation is not None: + src = list(validation.keys()) + xw[src].dot(zw.T, out=simval) + nn = asnumpy(simval.argmax(axis=1)) + accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))]) + similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))]) + + # Logging + duration = time.time() - t + if args.verbose: + print(file=sys.stderr) + print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) + print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr) + print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr) + if args.validation is not None: + print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr) + print('\t- Val. accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr) + print('\t- Val. 
coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr) + sys.stderr.flush() + if args.log is not None: + val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format( + 100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else '' + print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log) + log.flush() + + t = time.time() + it += 1 + + # Write mapped embeddings + srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape') + trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape') + embeddings.write(src_words, xw, srcfile) + embeddings.write(trg_words, zw, trgfile) + srcfile.close() + trgfile.close() + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/alignment/sgns_vi.py b/alignment/sgns_vi.py new file mode 100644 index 0000000..588c074 --- /dev/null +++ b/alignment/sgns_vi.py @@ -0,0 +1,117 @@ +import sys +sys.path.append('./modules/') + +import codecs +from collections import defaultdict +import os +from os.path import basename +import zipfile +from docopt import docopt +import logging +import logging.config +import time +import gensim +from gensim.models.word2vec import Word2Vec +from gensim.models import KeyedVectors +from dsm import PathLineSentences_mod + + +def intersection_dic(t1, t2): + voc_t1 = [x for xs in t1 for x in xs] + voc_t2 = [x for xs in t2 for x in xs] + intersection = list(set(voc_t1) & set(voc_t2)) + return [[x] for x in intersection] # note: gensim wants list of iterables (i.e. list of lists) + +def main(): + """ + Make comparable embedding vector spaces with Skip-Gram with Negative Sampling as described in: + + Yoon Kim, Yi-I. Chiu, Kentaro Hanaki, Darshan Hegde, and Slav Petrov. 2014. Temporal analysis of language through neural language models. arXiv preprint arXiv:1405.3515. + + """ + + # Get the arguments + args = docopt("""Make comparable embedding vector spaces with Skip-Gram with Negative Sampling and Vector Initialization from corpus. + + Usage: + sgns_vi.py [-l] + + Arguments: + + = vectors on which model should be initialized + = the linear distance of context words to consider in each direction + = dimensionality of embeddings + = number of negative samples parameter (equivalent to shifting parameter for PPMI) + = threshold for subsampling + = number of occurrences for a word to be included in the vocabulary + = number of iterations + = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...' + = output path for vectors + = lower bound for time period + = upper bound for time period + + Options: + -l, --len normalize final vectors to unit length + + Note: + Initialization vectors should be non-length-normalized. 
+ + """) + + is_len = args['--len'] + initVectorsPath = args[''] + windowSize = int(args['']) + dim = int(args['']) + k = int(args['']) + if args['']=='None': + t = None + else: + t = float(args['']) + minCount = int(args['']) + itera = int(args['']) + corpDir = args[''] + outPath = args[''] + lowerBound = int(args['']) + upperBound = int(args['']) + + logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,}) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Initialize model + model = gensim.models.Word2Vec(sg=1, # skipgram + hs=0, # negative sampling + negative=k, # number of negative samples + sample=t, # threshold for subsampling, if None, no subsampling is performed + size=dim, window=windowSize, min_count=minCount, iter=itera, workers=20) + + # Receive vectors for initialization + initVectors = KeyedVectors.load_word2vec_format(initVectorsPath, binary=False) + + # Initialize vocabulary + vocab_initVectors = initVectors.vocab + + # Intersect vocabulary + vocab_sentences_t_2 = PathLineSentences_mod(corpDir, lowerBound=lowerBound, upperBound=upperBound) + vocab_intersect = intersection_dic([[token] for token in vocab_initVectors],vocab_sentences_t_2) + model.build_vocab(vocab_intersect) + + # Train + sentences = PathLineSentences_mod(corpDir, lowerBound=lowerBound, upperBound=upperBound) + model.intersect_word2vec_format(initVectorsPath, lockf=1) + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) + + if is_len: + # L2-normalize vectors + model.init_sims(replace=True) + + # Save the vectors and the model + model.wv.save_word2vec_format(outPath + '.w2v') + #model.save(outPath + '.model') + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/alignment/srv_align.py b/alignment/srv_align.py new file mode 100644 index 0000000..e217fb0 --- /dev/null +++ b/alignment/srv_align.py @@ -0,0 +1,201 @@ +import sys +sys.path.append('./modules/') + +import os +from docopt import docopt +from dsm import load_pkl_files, save_pkl_files +import logging +import time +import codecs +import numpy as np +from composes.semantic_space.space import Space +from composes.matrix.dense_matrix import DenseMatrix +from composes.matrix.sparse_matrix import SparseMatrix +from scipy.sparse import lil_matrix, csr_matrix, csc_matrix, hstack, vstack +from sklearn.random_projection import sparse_random_matrix + + +def main(): + """ + Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices as described in: + Pierpaolo Basile, Annalina Caputo and Giovanni Semeraro, 2014. Analysing Word Meaning over Time by Exploiting Temporal Random Indexing. + """ + + # Get the arguments + args = docopt('''Create two aligned low-dimensional vector spaces by sparse random indexing from two co-occurrence matrices. 
+ + Usage: + srv_align.py [-l] (-s | -a) + + = number negative samples, expressed as percentage of positive samples + = smoothing parameter for negative sampling + = number of non-zero values in each random vector + = number of dimensions for random vectors + = threshold for downsampling (if t=None, no subsampling is applied) + = output path for aligned space 1 + = output path for aligned space 2 + = path to pickled space without suffix + = path to pickled space without suffix + = output path for elemental space (context vectors) + + Options: + -l, --len normalize final vectors to unit length + -s, --see specify number of seeds manually + -a, --aut calculate number of seeds automatically as proposed in [1,2] + + References: + [1] Ping Li, T. Hastie and K. W. Church, 2006, + "Very Sparse Random Projections". + http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf + [2] D. Achlioptas, 2001, "Database-friendly random projections", + http://www.cs.ucsc.edu/~optas/papers/jl.pdf + + ''') + + is_len = args['--len'] + is_seeds = args['--see'] + if is_seeds: + seeds = int(args['']) + is_aut = args['--aut'] + dim = int(args['']) + if args['']=='None': + t = None + else: + t = float(args['']) + outPath1 = args[''] + outPath2 = args[''] + outPathElement = args[''] + spacePrefix1 = args[''] + spacePrefix2 = args[''] + + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Load input spaces + space1 = load_pkl_files(spacePrefix1) + space2 = load_pkl_files(spacePrefix2) + matrix1 = csc_matrix(space1.get_cooccurrence_matrix().get_mat()) + matrix2 = csc_matrix(space2.get_cooccurrence_matrix().get_mat()) + + # Get mappings between rows/columns and words + id2row1 = space1.get_id2row() + id2row2 = space2.get_id2row() + row2id_1 = space1.get_row2id() + row2id_2 = space2.get_row2id() + id2column1 = space1.get_id2column() + id2column2 = space2.get_id2column() + + # Get union of rows and columns in both spaces + unified_rows = sorted(list(set(id2row1).union(id2row2))) + unified_columns = sorted(list(set(id2column1).union(id2column2))) + columns_diff1 = list(set(unified_columns) - set(id2column1)) + columns_diff2 = list(set(unified_columns) - set(id2column2)) + + # Get mappings of indices of columns in original spaces to indices of columns in unified space + c2i = {w: i for i, w in enumerate(unified_columns)} + cj2i1 = {j: c2i[w] for j, w in enumerate(id2column1+columns_diff1)} + cj2i2 = {j: c2i[w] for j, w in enumerate(id2column2+columns_diff2)} + + if t!=None: + rows_diff1 = list(set(unified_rows) - set(id2row1)) + rows_diff2 = list(set(unified_rows) - set(id2row2)) + + r2i = {w: i for i, w in enumerate(unified_rows)} + rj2i1 = {j: r2i[w] for j, w in enumerate(id2row1+rows_diff1)} + rj2i2 = {j: r2i[w] for j, w in enumerate(id2row2+rows_diff2)} + + # Build spaces with unified COLUMNS + new_columns1 = csc_matrix((len(id2row1),len(columns_diff1))) # Get empty columns for additional context words + unified_matrix1 = hstack((matrix1,new_columns1))[:,sorted(cj2i1, key=cj2i1.get)] # First concatenate matrix and empty columns and then order columns according to unified_columns + + new_columns2 = csc_matrix((len(id2row2),len(columns_diff2))) + unified_matrix2 = hstack((matrix2,new_columns2))[:,sorted(cj2i2, key=cj2i2.get)] + + # Build spaces with unified ROWS + new_rows1 = csc_matrix((len(rows_diff1),len(unified_columns))) + final_unified_matrix1 = csc_matrix(vstack((unified_matrix1,new_rows1)))[sorted(rj2i1, 
key=rj2i1.get)] + + new_rows2 = csc_matrix((len(rows_diff2),len(unified_columns))) + final_unified_matrix2 = csc_matrix(vstack((unified_matrix2,new_rows2)))[sorted(rj2i2, key=rj2i2.get)] + + # Add up final unified matrices + common_unified_matrix = np.add(final_unified_matrix1,final_unified_matrix2) + + # Get number of total occurrences of any word + totalOcc = np.sum(common_unified_matrix) + + # Define function for downsampling + downsample = lambda f: np.sqrt(float(t)/f) if f>t else 1.0 + downsample = np.vectorize(downsample) + + # Get total normalized co-occurrence frequency of all contexts in both spaces + context_freqs = np.array(common_unified_matrix.sum(axis=0)/totalOcc)[0] + + + ## Generate ternary random vectors + if is_seeds: + elementalMatrix = lil_matrix((len(unified_columns),dim)) + # Generate base vector for random vectors + baseVector = np.zeros(dim) # Note: Make sure that number of seeds is not greater than dimensions + for i in range(0,seeds/2): + baseVector[i] = 1.0 + for i in range(seeds/2,seeds): + baseVector[i] = -1.0 + for i in range(len(unified_columns)): # To-do: make this more efficient by generating random indices for a whole array + np.random.shuffle(baseVector) + elementalMatrix[i] = baseVector + if is_aut: + elementalMatrix = sparse_random_matrix(dim,len(unified_columns)).T + + # Initialize target vectors + alignedMatrix1 = np.zeros((len(id2row1),dim)) + alignedMatrix2 = np.zeros((len(id2row2),dim)) + + + # Iterate over rows of space, find context words and update aligned matrix with low-dimensional random vectors of these context words + for (space,id2row,cj2i,alignedMatrix) in [(space1,id2row1,cj2i1,alignedMatrix1),(space2,id2row2,cj2i2,alignedMatrix2)]: + # Iterate over targets + for i, target in enumerate(id2row): + # Get co-occurrence values as matrix + m = space.get_row(target).get_mat() + # Get nonzero indexes + nonzeros = m.nonzero() + nonzeros = [cj2i[j] for j in nonzeros[1]] + data = m.data + pos_context_vectors = elementalMatrix[nonzeros] + if t!=None: + # Apply subsampling + rfs = context_freqs[nonzeros] + rfs = downsample(rfs) + data *= rfs + # Weight context vectors by occurrence frequency + pos_context_vectors = pos_context_vectors.multiply(data.reshape(-1,1)) + # Add up context vectors and store as row for target + alignedMatrix[i] = np.sum(pos_context_vectors, axis=0) + + if is_len: + # L2-normalize vectors + l2norm1 = np.linalg.norm(alignedMatrix1, axis=1, ord=2) + l2norm2 = np.linalg.norm(alignedMatrix2, axis=1, ord=2) + l2norm1[l2norm1==0.0] = 1.0 # Convert 0 values to 1 + l2norm2[l2norm2==0.0] = 1.0 # Convert 0 values to 1 + alignedMatrix1 /= l2norm1.reshape(len(l2norm1),1) + alignedMatrix2 /= l2norm2.reshape(len(l2norm2),1) + + # Make spaces + alignedSpace1 = Space(DenseMatrix(alignedMatrix1), id2row1, []) + alignedSpace2 = Space(DenseMatrix(alignedMatrix2), id2row2, []) + elementalSpace = Space(SparseMatrix(elementalMatrix), unified_columns, []) + + # Save the Space objects in pickle format + save_pkl_files(alignedSpace1, outPath1 + '.dm', save_in_one_file=False) + save_pkl_files(alignedSpace2, outPath2 + '.dm', save_in_one_file=False) + save_pkl_files(elementalSpace, outPathElement + '.dm', save_in_one_file=False) + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/alignment/wi.py b/alignment/wi.py new file mode 100644 index 0000000..7cf7590 --- /dev/null +++ b/alignment/wi.py @@ -0,0 +1,103 @@ +import sys +sys.path.append('./modules/') + +import codecs +import os 
+from docopt import docopt +import logging +import time +import re +import random +import numpy as np + + +def main(): + """ + Combine two corpora and shuffle. Seed words are substituted in first corpus. (Word Injection) + """ + + # Get the arguments + args = docopt("""Combine two corpora and shuffle. Seed words are substituted in first corpus. (Word Injection) + + + Usage: + wi.py + + Arguments: + + = first corpus + = second corpus + = lower bound for time period in first corpus + = upper bound for time period in first corpus + = lower bound for time period in second corpus + = upper bound for time period in second corpus + = target words (to substitute in one corpus) + = path+filename to target corpus (2 corpora combined, with substitution) + + """) + + corp1 = args[''] + corp2 = args[''] + lowerBound1 = int(args['']) + upperBound1 = int(args['']) + lowerBound2 = int(args['']) + upperBound2 = int(args['']) + targWords = args[''] + outFile = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # get seeds words + seedList = [] + for line in codecs.open(targWords, "r", 'utf-8'): + line = line.strip().split("\t")[0] + seedList.append(line) + + searchPat = re.compile(r'(\b(?:%s)\b)' % '|'.join(seedList), re.UNICODE) + + lineCt = 0 + wFile = codecs.open("tempOutFile.txt", "w", 'utf-8') + for line in codecs.open(corp1, "r", 'utf-8'): + date = int(line.split("\t")[0]) + if not lowerBound1 <= date <= upperBound1: # skip every sentence which is not in timeframe + continue + newLine = re.sub(searchPat, r"\1_", line) + wFile.write(newLine) + lineCt +=1 + for line in codecs.open(corp2, "r", 'utf-8'): + date = int(line.split("\t")[0]) + if not lowerBound2 <= date <= upperBound2: # skip every sentence which is not in timeframe + continue + wFile.write(line) + lineCt +=1 + print("Seed words substituted. Total number of lines: %d" % (lineCt)) + indList = list(range(lineCt)) + random.shuffle(indList) + sublists = np.array_split(indList, 5) + + # make sure that you do not append at the outFile form the last iteration + open(outFile, 'w').close() + wFile = codecs.open(outFile, "a", 'utf-8') + for nrSub, sublist in enumerate(sublists): + sublist = set(sublist) + print("Processing %d part ..." 
% (nrSub)) + smallLineList = [] + for nrL, line in enumerate(codecs.open("tempOutFile.txt", "r", 'utf-8')): + if nrL in sublist: + smallLineList.append(line) + random.shuffle(smallLineList) + for line in smallLineList: + wFile.write(line.strip("\n")+"\n") + + + os.remove("tempOutFile.txt") + + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() + diff --git a/corpora/test/corpus.txt.gz b/corpora/test/corpus.txt.gz new file mode 100644 index 0000000..f090cee Binary files /dev/null and b/corpora/test/corpus.txt.gz differ diff --git a/corpora/test_wi/corpus.txt.gz b/corpora/test_wi/corpus.txt.gz new file mode 100644 index 0000000..473b152 Binary files /dev/null and b/corpora/test_wi/corpus.txt.gz differ diff --git a/evaluation/spearman.py b/evaluation/spearman.py new file mode 100644 index 0000000..78fdc47 --- /dev/null +++ b/evaluation/spearman.py @@ -0,0 +1,75 @@ +import sys +sys.path.append('./modules/') + +import os +import random +import codecs +import numpy as np +from docopt import docopt +from scipy.stats import spearmanr +import logging +import time + + +def main(): + """ + Calculate spearman correlation coefficient for specified columns of two files. + """ + + # Get the arguments + args = docopt("""Calculate spearman correlation coefficient for specified columns of two files. + + + Usage: + spearman.py + + Arguments: + = path to file1 + = path to file2 + = name of file1 to print + = name of file2 to print + = target column in file1 + = target column in file2 + + Note: + Assumes tap-separated CSV files as input. Assumes that rows are in same order and columns have same length. Nan values are omitted. + + """) + + file1 = args[''] + file2 = args[''] + filename1 = args[''] + filename2 = args[''] + col1 = int(args['']) + col2 = int(args['']) + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Get data + with codecs.open(file1, 'r', 'utf-8') as f_in: + data1 = np.array([float(line.strip().split('\t')[col1]) for line in f_in]) + + with codecs.open(file2, 'r', 'utf-8') as f_in: + data2 = np.array([float(line.strip().split('\t')[col2]) for line in f_in]) + + # Check if there are non-number values + nan_list1 = [x for x in data1 if np.isnan(x)] + nan_list2 = [x for x in data2 if np.isnan(x)] + if len(nan_list1)>0 or len(nan_list2)>0: + print 'nan encountered!' + + # compute correlation + try: + rho, p = spearmanr(data1, data2, nan_policy='omit') + except ValueError as e: + logging.info(e) + rho, p = 'nan', 'nan' + + print filename1, filename2, rho, p + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/measures/cd.py b/measures/cd.py new file mode 100644 index 0000000..eba79dd --- /dev/null +++ b/measures/cd.py @@ -0,0 +1,99 @@ +import sys +sys.path.append('./modules/') + +import os +from os.path import basename +from docopt import docopt +from dsm import load_pkl_files +import logging +import time +import codecs +import numpy as np +from scipy import spatial +from scipy.sparse import csr_matrix +from composes.matrix.dense_matrix import DenseMatrix + +def main(): + """ + Compute cosine distance for target pairs from two vector spaces. + """ + + # Get the arguments + args = docopt("""Compute cosine distance for target pairs from two vector spaces. 
+ + Usage: + cd.py [(-f | -s)] [] + + = path to pickled space without suffix + = path to pickled space without suffix + = path to file with tab-separated word pairs + = output path for result file + + Options: + -f, --fst write only first target in output file + -s, --scd write only second target in output file + + Note: + Important: spaces must be already aligned (columns in same order)! + + """) + + is_fst = args['--fst'] + is_scd = args['--scd'] + spacePrefix1 = args[''] + spacePrefix2 = args[''] + testset = args[''] + outPath = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Load spaces + space1 = load_pkl_files(spacePrefix1) + space2 = load_pkl_files(spacePrefix2) + + if testset!=None: + # target vectors in first/second column are computed from space1/space2 + with codecs.open(testset, 'r', 'utf-8') as f_in: + targets = [(line.strip().split('\t')[0],line.strip().split('\t')[1]) for line in f_in] + else: + # If no test set is provided, compute values for all targets occurring in both spaces + target_intersection = set([target.decode('utf-8') for target in space1.get_row2id()]).intersection([target.decode('utf-8') for target in space2.get_row2id()]) + targets = zip(target_intersection,target_intersection) + + scores = {} + for i, (t1, t2) in enumerate(targets): + + # Get row vectors + try: + row1 = space1.get_row(t1.encode('utf8')) + row2 = space2.get_row(t2.encode('utf8')) + except KeyError: + scores[(t1, t2)] = 'nan' + continue + + # Convert to list + row_vector1 = csr_matrix(row1.get_mat()).toarray()[0].tolist() + row_vector2 = csr_matrix(row2.get_mat()).toarray()[0].tolist() + + # Compute cosine distance of vectors + distance = spatial.distance.cosine(row_vector1, row_vector2) + scores[(t1, t2)] = distance + + + with codecs.open(outPath +'.csv', 'w', 'utf-8') as f_out: + for (t1, t2) in targets: + if is_fst: # output only first target string + print >> f_out, '\t'.join((t1, str(float(scores[(t1, t2)])))) + elif is_scd: # output only second target string + print >> f_out, '\t'.join((t2, str(float(scores[(t1, t2)])))) + else: # standard outputs both target strings + print >> f_out, '\t'.join(('%s,%s' % (t1,t2), str(float(scores[(t1, t2)])))) + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + + +if __name__ == '__main__': + main() diff --git a/measures/entropy.py b/measures/entropy.py new file mode 100644 index 0000000..9331fc2 --- /dev/null +++ b/measures/entropy.py @@ -0,0 +1,90 @@ +import sys +sys.path.append('./modules/') + +import os +from os.path import basename +from docopt import docopt +from dsm import load_pkl_files +from scipy.stats import entropy +import logging +import time +import codecs +import numpy as np + + +def main(): + """ + Compute entropy for rows of targets from vector space. + """ + + # Get the arguments + args = docopt("""Compute entropy for rows of targets from vector space. 
+ + Usage: + entropy.py [-n] [] + + = path to pickled space without suffix + = output path for result file + = path to file with targets in first column + + Options: + -n, --nrm normalize values by log of number of types + + """) + + is_norm = args['--nrm'] + spacePrefix = args[''] + outPath = args[''] + testset = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + space = load_pkl_files(spacePrefix) + + if testset!=None: + # target vectors in first/second column are computed from space1/space2 + with codecs.open(testset, 'r', 'utf-8') as f_in: + targets = [line.strip().split('\t')[0] for line in f_in] + else: + # If no test set is provided, compute values for all targets + targets = [target.decode('utf-8') for target in space.get_row2id()] + + scores = {} + norms = {} + for i, v in enumerate(targets): + + try: + row = space.get_row(v.encode('utf8')) + except KeyError: + scores[v] = 'nan' + norms[v] = 'nan' + continue + + # Get all counts in row (non-zero elements) + counts = row.get_mat().data + + # Compute entropy of row + H = entropy(counts, base=2) + scores[v] = H + + if is_norm: + # Get number of non-zero elements in row + types = row.get_mat().getnnz() + norms[v] = np.log2(types) + + + with codecs.open(outPath+'.csv', 'w', 'utf-8') as f_out: + for word in targets: + if is_norm: + print >> f_out, '\t'.join((word, str(float(scores[word])/float(norms[word])))) + else: + print >> f_out, '\t'.join((word, str(float(scores[word])))) + + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/measures/freq.py b/measures/freq.py new file mode 100644 index 0000000..b1ea543 --- /dev/null +++ b/measures/freq.py @@ -0,0 +1,90 @@ +import sys +sys.path.append('./modules/') + +import codecs +from collections import defaultdict +import os +from dsm import PathLineSentences_mod +from docopt import docopt +import logging +import time + + +def main(): + """ + Get frequencies from corpus. + """ + + # Get the arguments + args = docopt("""Get frequencies from corpus. + + Usage: + freq.py [-o] [(-n )] [] + + Arguments: + + = path to zipped corpus directory + = output path for result file + = lower bound for time period + = upper bound for time period + = path to tab-separated file with targets in first column + = normalization constant + + Options: + -n, --nrm normalize values by normalization constant + + Note: + Outputs frequencies for all tokens in case no testset is provided. + + """) + + is_norm = args['--nrm'] + if is_norm: + normConst = float(args['']) + corpDir = args[''] + outPath = args[''] + lowerBound = int(args['']) + upperBound = int(args['']) + testset = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + + freqs = defaultdict(int) + + sentences = PathLineSentences_mod(corpDir, lowerBound=lowerBound, upperBound=upperBound) + + for sentence in sentences: + for word in sentence: + freqs[word] = freqs[word] + 1 + + + if testset!=None: + # Targets for which to output values. 
+ with codecs.open(testset, 'r', 'utf-8') as f_in: + targets = [line.strip().split('\t')[0] for line in f_in] + else: + # Rank the lemmas + freqs_ranked = sorted(freqs, key=lambda x: -(freqs[x])) + # If no test set is provided, compute values for all tokens + targets = freqs_ranked + + with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out: + for word in targets: + if word in freqs: + if is_norm: + freqs[word]=float(freqs[word])/normConst + print >> f_out, '\t'.join((word, str(float(freqs[word])))) + else: + print >> f_out, '\t'.join((word, 'nan')) + + + logging.info('total number of tokens: %d' % (sentences.corpusSize)) + logging.info('total number of types: %d' % (len(freqs.keys()))) + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/measures/lnd.py b/measures/lnd.py new file mode 100644 index 0000000..671c2a9 --- /dev/null +++ b/measures/lnd.py @@ -0,0 +1,103 @@ +import sys +sys.path.append('./modules/') + +import os +from os.path import basename +from docopt import docopt +from dsm import load_pkl_files +import codecs +import numpy as np +from scipy import spatial +from composes.similarity.cos import CosSimilarity +import logging +import time + + +def main(): + """ + Compute local neighborhood distance for target pairs from two vector spaces. + """ + + # Get the arguments + args = docopt("""Compute local neighborhood distance for target pairs from two vector spaces. + + Usage: + lnd.py [(-f | -s)] [] + + = path to pickled space without suffix + = path to pickled space without suffix + = path to file with tab-separated word pairs + = parameter k (k nearest neighbors) + = output path for result file + + Options: + -f, --fst write only first target in output file + -s, --scd write only second target in output file + + """) + + is_fst = args['--fst'] + is_scd = args['--scd'] + spacePrefix1 = args[''] + spacePrefix2 = args[''] + testset = args[''] + outPath = args[''] + k = int(args['']) + + logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,}) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Load spaces + space1 = load_pkl_files(spacePrefix1) + space2 = load_pkl_files(spacePrefix2) + + if testset!=None: + # target vectors in first/second column are computed from space1/space2 + with codecs.open(testset, 'r', 'utf8') as f_in: + targets = [(line.strip().split('\t')[0],line.strip().split('\t')[1]) for line in f_in] + else: + # If no test set is provided, compute values for all targets occurring in both spaces + target_intersection = set([target.decode('utf8') for target in space1.get_row2id()]).intersection([target.decode('utf8') for target in space2.get_row2id()]) + targets = zip(target_intersection,target_intersection) + + scores = {} + neighborUnionSizes = {} + for i, (t1, t2) in enumerate(targets): + + # Get nearest neighbors + try: + neighbors1 = space1.get_neighbours(t1.encode('utf8'), k, CosSimilarity()) + neighbors2 = space2.get_neighbours(t2.encode('utf8'), k, CosSimilarity()) + except KeyError: + scores[(t1, t2)] = 'nan' + neighborUnionSizes[(t1, t2)] = 'nan' + continue + + neighborUnion = list(set([a for (a,b) in neighbors1+neighbors2 if (a in space1.row2id and a in space2.row2id and not a in [t1.encode('utf8'),t2.encode('utf8')])])) + + simVec1 = [space1.get_sim(t1.encode('utf8'), n, CosSimilarity()) for n in neighborUnion] + simVec2 = [space2.get_sim(t2.encode('utf8'), n, 
CosSimilarity()) for n in neighborUnion] + + # Compute cosine distance of vectors + distance = spatial.distance.cosine(simVec1, simVec2) + scores[(t1, t2)] = distance + neighborUnionSizes[(t1, t2)] = len(neighborUnion) + + + with codecs.open(outPath +'.csv', 'w', 'utf-8') as f_out: + for (t1, t2) in targets: + if is_fst: # output only first target string + print >> f_out, '\t'.join((t1, str(float(scores[(t1, t2)])), str(neighborUnionSizes[(t1, t2)]))) + elif is_scd: # output only second target string + print >> f_out, '\t'.join((t2, str(float(scores[(t1, t2)])), str(neighborUnionSizes[(t1, t2)]))) + else: # standard outputs both target strings + print >> f_out, '\t'.join(('%s,%s' % (t1,t2), str(float(scores[(t1, t2)])), str(neighborUnionSizes[(t1, t2)]))) + + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/measures/subtract.py b/measures/subtract.py new file mode 100644 index 0000000..727dbe3 --- /dev/null +++ b/measures/subtract.py @@ -0,0 +1,73 @@ +import sys +sys.path.append('./modules/') + +import codecs +from docopt import docopt +import logging +import time + + +def main(): + """ + Subtract values in tab-separated CSV files. + """ + + # Get the arguments + args = docopt("""Subtract values in tab-separated CSV files. + + Usage: + subtract.py [-a] + + Arguments: + = target strings in first column + = strings in first column and values in second column + = strings in first column and values in second column + = output path for result file + + Options: + -a, --abs store absolute (always positive) instead of raw difference + + Note: + Assumes tap-separated CSV files as input. Appends nan if target is not present in valueFiles. + + """) + + targetFile = args[''] + valueFile1 = args[''] + valueFile2 = args[''] + outPath = args[''] + isAbsolute = args['--abs'] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Get targets + with codecs.open(targetFile, 'r', 'utf-8') as f_in: + targets = [line.strip().split('\t')[0] for line in f_in] + + # Get target-value map 1 + with codecs.open(valueFile1, 'r', 'utf-8') as f_in: + string2value1 = dict([( line.strip().split('\t')[0], float(line.strip().split('\t')[1]) ) for line in f_in]) + + # Get target-value map 2 + with codecs.open(valueFile2, 'r', 'utf-8') as f_in: + string2value2 = dict([( line.strip().split('\t')[0], float(line.strip().split('\t')[1]) ) for line in f_in]) + + # Print only targets to output file + with codecs.open(outPath+'.csv', 'w', 'utf-8') as f_out: + for string in targets: + try: + if isAbsolute: + print >> f_out, '\t'.join((string, str(abs(string2value2[string]-string2value1[string])))) + else: + print >> f_out, '\t'.join((string, str(string2value2[string]-string2value1[string]))) + except KeyError: + print >> f_out, '\t'.join((string, 'nan')) + + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/measures/transform.py b/measures/transform.py new file mode 100644 index 0000000..b145eb8 --- /dev/null +++ b/measures/transform.py @@ -0,0 +1,65 @@ +import sys +sys.path.append('./modules/') + +import codecs +from docopt import docopt +import logging +import time +import numpy as np + +def main(): + """ + Transform values from tab-separated CSV file by function specified as option. 
+ """ + + # Get the arguments + args = docopt("""Transform values from tab-separated CSV file by function specified as option. + + Usage: + transform.py -l + + Arguments: + = target strings in first column + = strings in first column and values in second column + = output path for result file + + Options: + -l, --log2 logarithmic transformation (base 2) + + Note: + Assumes tap-separated CSV files as input. Appends nan if target is not present in valueFile or normFile. + + """) + + targetFile = args[''] + valueFile = args[''] + outPath = args[''] + is_log2 = args['--log2'] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Get targets + with codecs.open(targetFile, 'r', 'utf-8') as f_in: + targets = [line.strip().split('\t')[0] for line in f_in] + + # Get target-value map + with codecs.open(valueFile, 'r', 'utf-8') as f_in: + string2value = dict([( line.strip().split('\t')[0], float(line.strip().split('\t')[1]) ) for line in f_in]) + + # Print only targets to output file + with codecs.open(outPath+'.csv', 'w', 'utf-8') as f_out: + for string in targets: + try: + if is_log2: + print >> f_out, '\t'.join((string, str(np.log2(string2value[string])))) + except KeyError: + print >> f_out, '\t'.join((string, 'nan')) + + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/measures/types.py b/measures/types.py new file mode 100644 index 0000000..a282680 --- /dev/null +++ b/measures/types.py @@ -0,0 +1,83 @@ +import sys +sys.path.append('./modules/') + +import os +from os.path import basename +from docopt import docopt +from dsm import load_pkl_files +import logging +import time +import codecs +import numpy as np + + +def main(): + """ + Compute number of context types for all rows of a vector space and save their scores. + """ + + # Get the arguments + args = docopt("""Compute number of context types for all rows of a vector space and save their scores. 
+ + Usage: + types.py [(-n )] [] + + = path to pickled space without suffix + = output path for result file + = path to file with targets in first column + = normalization constant + + Options: + -n, --nrm normalize values by normalization constant + + """) + + is_norm = args['--nrm'] + if is_norm: + normConst = float(args['']) + spacePrefix = args[''] + outPath = args[''] + testset = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + space = load_pkl_files(spacePrefix) + + if testset!=None: + # target vectors in first/second column are computed from space1/space2 + with codecs.open(testset, 'r', 'utf-8') as f_in: + targets = [line.strip().split('\t')[0] for line in f_in] + else: + # If no test set is provided, compute values for all targets + targets = [target.decode('utf-8') for target in space.get_row2id()] + + scores = {} + # Iterate over targets + for i, v in enumerate(targets): + + try: + row = space.get_row(v.encode('utf8')) + except KeyError: + scores[v] = 'nan' + continue + + # Get number of non-zero elements in row + types = row.get_mat().getnnz() + + scores[v] = types + + + with codecs.open(outPath+'.csv', 'w', 'utf-8') as f_out: + for word in targets: + if is_norm: + scores[word]=float(scores[word])/normConst + print >> f_out, '\t'.join((word, str(float(scores[word])))) + + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000..0519ecb --- /dev/null +++ b/modules/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/modules/__pycache__/cupy_utils.cpython-37.pyc b/modules/__pycache__/cupy_utils.cpython-37.pyc new file mode 100644 index 0000000..e6690e4 Binary files /dev/null and b/modules/__pycache__/cupy_utils.cpython-37.pyc differ diff --git a/modules/__pycache__/embeddings.cpython-37.pyc b/modules/__pycache__/embeddings.cpython-37.pyc new file mode 100644 index 0000000..fad4440 Binary files /dev/null and b/modules/__pycache__/embeddings.cpython-37.pyc differ diff --git a/modules/composes/__init__.py b/modules/composes/__init__.py new file mode 100755 index 0000000..914df52 --- /dev/null +++ b/modules/composes/__init__.py @@ -0,0 +1,12 @@ +import logging + +class NullHandler(logging.Handler): + """For python versions <= 2.6; same as `logging.NullHandler` in 2.7.""" + def emit(self, record): + pass + +logger = logging.getLogger(__name__) +if len(logger.handlers) == 0: # To ensure reload() doesn't add another one + logger.addHandler(NullHandler()) + +#logging.basicConfig(filename='composes.log', filemode='w+',level=logging.DEBUG, format = "") diff --git a/modules/composes/composition/__init__.py b/modules/composes/composition/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/composition/composition_model.py b/modules/composes/composition/composition_model.py new file mode 100755 index 0000000..2e87c0b --- /dev/null +++ b/modules/composes/composition/composition_model.py @@ -0,0 +1,259 @@ +''' +Created on Oct 5, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' +import time +import math +from warnings import warn +from composes.semantic_space.space import Space +from composes.matrix.dense_matrix import DenseMatrix +from composes.utils.gen_utils import assert_is_instance +from composes.utils.matrix_utils import resolve_type_conflict +from composes.utils.io_utils import 
create_parent_directories + +import logging +from composes.utils import log_utils as log + +logger = logging.getLogger(__name__) + +class CompositionModel(object): + """ + Parent class of the composition models. + """ + + _name = "no name" + + MAX_MEM_OVERHEAD = 0.2 + + """ + double, in interval [0,1] + maximum overhead allowed: MAX_MEM_OVERHEAD ratio of argument space memory + when composing + """ + composed_id2column = None + """ + List of strings, the column strings of the resulted composed space. + """ + + def __init__(self): + """ + Constructor + """ + + def train(self, train_data, arg_space, phrase_space): + """ + Trains a composition model and sets its learned parameters. + + Args: + train_data: list of string tuples. Each tuple contains 3 + string elements: (arg1, arg2, phrase). + + arg_space: argument space(s). Space object or a tuple of two + Space objects (e.g. my_space, or (my_space1, my_space2)). + If two spaces are provided, arg1 elements of train data are + interpreted in space1, and arg2 in space2. + + phrase space: phrase space, of type Space. + + Calls the specific training routine of the current composition + model. Training tuples which contain strings not found in their + respective spaces are ignored. + + The id2column attribute of the resulted composed space is set to + be equal to that of the phrase space given as an input. + """ + + start = time.time() + + arg1_space, arg2_space = self.extract_arg_spaces(arg_space) + arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(train_data, + (arg1_space.row2id, + arg2_space.row2id, + phrase_space.row2id) + ) + + + self._train(arg1_space, arg2_space, phrase_space, + arg1_list, arg2_list, phrase_list) + + self.composed_id2column = phrase_space.id2column + + log.print_composition_model_info(logger, self, 1, "\nTrained composition model:") + log.print_info(logger, 2, "With total data points:%s" % len(arg1_list)) + log.print_matrix_info(logger, arg1_space.cooccurrence_matrix, 3, + "Semantic space of argument 1:") + log.print_matrix_info(logger, arg2_space.cooccurrence_matrix, 3, + "Semantic space of argument 2:") + log.print_matrix_info(logger, phrase_space.cooccurrence_matrix, 3, + "Semantic space of phrases:") + log.print_time_info(logger, time.time(), start, 2) + + + def _train(self, arg1_space, arg2_space, phrase_space, arg1_list, arg2_list, phrase_list): + + arg1_mat = arg1_space.get_rows(arg1_list) + arg2_mat = arg2_space.get_rows(arg2_list) + phrase_mat = phrase_space.get_rows(phrase_list) + + [arg1_mat, arg2_mat, phrase_mat] = resolve_type_conflict([arg1_mat, + arg2_mat, + phrase_mat], + DenseMatrix) + + self._solve(arg1_mat, arg2_mat, phrase_mat) + + def compose(self, data, arg_space): + """ + Uses a composition model to compose elements. + + Args: + data: data to be composed. List of tuples, each containing 3 + strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the + elements to be composed and composed_phrase is the string associated + to their composition. + + arg_space: argument space(s). Space object or a tuple of two + Space objects (e.g. my_space, or (my_space1, my_space2)). + If two spaces are provided, arg1 elements of data are + interpreted in space1, and arg2 in space2. + + Returns: + composed space: a new object of type Space, containing the + phrases obtained through composition. 
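+
+        Example (an illustrative sketch only; the toy space, row labels and
+        weights below are made up and not part of the library):
+
+            >>> import numpy as np
+            >>> from composes.semantic_space.space import Space
+            >>> from composes.matrix.dense_matrix import DenseMatrix
+            >>> from composes.composition.weighted_additive import WeightedAdditive
+            >>> arg_space = Space(DenseMatrix(np.array([[1.0, 2.0],
+            ...                                         [3.0, 4.0]])),
+            ...                   ["red", "car"], [])
+            >>> model = WeightedAdditive(alpha=0.5, beta=0.5)
+            >>> composed = model.compose([("red", "car", "red_car")], arg_space)
+            >>> composed.get_row("red_car")    # 0.5*red + 0.5*car = [2.0, 3.0]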
+ + """ + start = time.time() + + arg1_space, arg2_space = self.extract_arg_spaces(arg_space) + arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data, + (arg1_space.row2id, + arg2_space.row2id, + None)) + + # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead + # the /3.0 is needed + # because the composing data needs 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector) + chunk_size = int(max(arg1_space.cooccurrence_matrix.shape[0],arg2_space.cooccurrence_matrix.shape[0],len(phrase_list)) + * self.MAX_MEM_OVERHEAD / 3.0) + 1 + + composed_mats = [] + for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))): + beg, end = i*chunk_size, min((i+1)*chunk_size, len(arg1_list)) + + arg1_mat = arg1_space.get_rows(arg1_list[beg:end]) + arg2_mat = arg2_space.get_rows(arg2_list[beg:end]) + + [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat], + DenseMatrix) + composed_mat = self._compose(arg1_mat, arg2_mat) + composed_mats.append(composed_mat) + + composed_phrase_mat = composed_mat.nary_vstack(composed_mats) + + if self.composed_id2column is None: + self.composed_id2column = self._build_id2column(arg1_space, arg2_space) + + log.print_name(logger, self, 1, "\nComposed with composition model:") + log.print_info(logger, 3, "Composed total data points:%s" % arg1_mat.shape[0]) + log.print_matrix_info(logger, composed_phrase_mat, 4, + "Resulted (composed) semantic space::") + log.print_time_info(logger, time.time(), start, 2) + + return Space(composed_phrase_mat, phrase_list, self.composed_id2column) + + @classmethod + def extract_arg_spaces(cls, arg_space): + """ + TO BE MOVED TO A UTILS MODULE! + """ + if not isinstance(arg_space, tuple): + arg1_space = arg_space + arg2_space = arg_space + else: + if len(arg_space) != 2: + raise ValueError("expected two spaces, received %d-ary tuple " + % len(arg_space)) + arg1_space, arg2_space = arg_space + + assert_is_instance(arg1_space, Space) + assert_is_instance(arg2_space, Space) + + cls._assert_space_match(arg1_space, arg2_space) + + return arg1_space, arg2_space + + @classmethod + def _assert_space_match(cls, arg1_space, arg2_space, phrase_space=None): + + if arg1_space.id2column != arg2_space.id2column: + raise ValueError("Argument spaces do not have identical columns!") + + if not phrase_space is None: + if arg1_space.id2column != phrase_space.id2column: + raise ValueError("Argument and phrase space do not have identical columns!") + + def _build_id2column(self, arg1_space, arg2_space): + return arg1_space.id2column + + + def valid_data_to_lists(self, data, (row2id1, row2id2, row2id3)): + """ + TO BE MOVED TO A UTILS MODULE! + """ + list1 = [] + list2 = [] + list3 = [] + + j = 0 + for i in xrange(len(data)): + sample = data[i] + + cond = True + + if not row2id1 is None: + cond = cond and sample[0] in row2id1 + + if not row2id2 is None: + cond = cond and sample[1] in row2id2 + + if not row2id3 is None: + cond = cond and sample[2] in row2id3 + + if cond: + list1.append(sample[0]) + list2.append(sample[1]) + list3.append(sample[2]) + j += 1 + + if i + 1 != j: + warn("%d (out of %d) lines are ignored because one of the elements is not found in its semantic space" + % ((i + 1) - j, (i + 1))) + + if not list1: + raise ValueError("No valid data found for training/composition!") + + return list1, list2, list3 + + def export(self, filename): + """ + Prints the parameters of the composition model to file. 
+ + Args: + filename: output filename, string + + Prints the parameters of the compositional model in an appropriate + format, specific to each model. + """ + create_parent_directories(filename) + self._export(filename) + + def get_name(self): + return self._name + + name = property(get_name) + """ + String, name of the composition model. + """ + + diff --git a/modules/composes/composition/dilation.py b/modules/composes/composition/dilation.py new file mode 100755 index 0000000..b99fce6 --- /dev/null +++ b/modules/composes/composition/dilation.py @@ -0,0 +1,91 @@ +''' +Created on Oct 15, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' +import numpy as np +from composition_model import CompositionModel +from composes.utils.num_utils import is_numeric +from composes.utils.py_matrix_utils import nonzero_invert + + +class Dilation(CompositionModel): + """ + Implements the dilation compositional model: + + :math:`\\vec{p} = (\\vec{u} \\cdot \\vec{u}) \\vec{v} + (\\lambda - 1) (\\vec{u} \\cdot \\vec{v}) \\vec{u}` + + where :math:`\\vec{p}` is the vector of the composed phrase, :math:`\\vec{u}, \\vec{v}` the vectors of the components + and :math:`\\lambda` is a scalar. + + """ + + + _name = "dilation" + + _lambda = 2 + + + def __init__(self, lambda_=None): + """ + Constructor. + + Args: + lambda_ : numeric, value of the lambda parameter. Optional. + """ + + if not lambda_ is None: + if not is_numeric(lambda_): + raise ValueError("Parameter not numeric: %s " %(type(lambda_))) + else: + self._lambda = lambda_ + + def _solve(self, arg1_mat, arg2_mat, phrase_mat): + + v1_row_norms = arg1_mat.norm(1) + v1_row_sqr_norms = np.multiply(v1_row_norms, v1_row_norms) + + v2_minus_p = arg2_mat.scale_rows(v1_row_sqr_norms) - phrase_mat + v1_dot_prod_v2_minus_p = arg1_mat.multiply(v2_minus_p).sum(1) + + v1_v2 = arg1_mat.multiply(arg2_mat).sum(1) + v1_v2_sqr = np.multiply(v1_v2, v1_v2) + + nom = np.multiply(v1_v2_sqr, v1_row_sqr_norms).sum() + denom = np.multiply(v1_v2, v1_dot_prod_v2_minus_p).sum() + + if nom != 0: + self._lambda = 1 - denom/nom + else: + self._lambda = 2 + + + def _compose(self, arg1_mat, arg2_mat): + # TO DO: this is inefficient here, we do 2 for s instead of one + # we do a for in get_rows in parent.compose() and a for here + # comp = ((self._lambda -1) * v1.multiply(v2).sum()/pow(v1.norm(),2)) * v1 + v2 + + v1_row_norms = arg1_mat.norm(1) + scale_factors1 = arg1_mat.multiply(arg2_mat).sum(1) + scale_factors2 = np.multiply(v1_row_norms, v1_row_norms) + + arg1_mat_scaled = arg1_mat.scale_rows(scale_factors1) + arg2_mat_scaled = arg2_mat.scale_rows(scale_factors2) + + #print "FACTORS u:", ((self._lambda -1)*scale_factors1).sum()/float(len(scale_factors1)) + #print "FACTORS v:", (scale_factors2).sum()/float(len(scale_factors2)) + + result = (self._lambda - 1) * arg1_mat_scaled + arg2_mat_scaled + + return result + + def get_lambda(self): + return self._lambda + """ + Lambda parameter. Default, set to lambda=2. 
+ """ + + + def _export(self, filename): + with open(filename, "w") as output_stream: + output_stream.write("lambda\t%f" % self._lambda) diff --git a/modules/composes/composition/full_additive.py b/modules/composes/composition/full_additive.py new file mode 100755 index 0000000..5961202 --- /dev/null +++ b/modules/composes/composition/full_additive.py @@ -0,0 +1,139 @@ +''' +Created on Oct 5, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from composition_model import CompositionModel +from composes.utils.gen_utils import assert_is_instance +from composes.utils.matrix_utils import is_array_or_matrix +from composes.utils.matrix_utils import padd_matrix +from composes.utils.matrix_utils import to_compatible_matrix_types +from composes.utils.regression_learner import LstsqRegressionLearner +from composes.utils.regression_learner import RegressionLearner +from composes.utils.matrix_utils import resolve_type_conflict +from composes.matrix.dense_matrix import DenseMatrix +from composes.exception.illegal_state_error import IllegalStateError + + +class FullAdditive(CompositionModel): + """ + Implements the full additive compositional model: + + :math:`\\vec{p} = A \\vec{u} + B \\vec{v}` + + where :math:`\\vec{p}` is the vector of the composed phrase, + :math:`\\vec{u}, \\vec{v}`, the vectors of the components + and :math:`A`, :math:`B` are two matrices. + + """ + _name = "full_additive" + _mat_a_t = None + _mat_b_t = None + + + def __init__(self, A=None, B=None, learner=LstsqRegressionLearner()): + #TODO here; very important, should be able to set the intercept + #when mat a and mat b are given , to true or false. now by default is + #is false + """ + Constructor. + + Args: + A= : matrix A, of matrix-like type (Matrix, ndarray, + numpy matrix, scipy matrix). Optional (parameters can be set + through training.) + + B= : matrix B, matrix-like type. Optional. + + learner= : regression learner object, of type RegressionLearner. + Optional, default LstsqRegressionLearner. 
+ """ + if A is not None and B is not None: + mat_a = A + mat_b = B + if not is_array_or_matrix(mat_a): + raise TypeError("expected matrix type, received: %s" + % type(mat_a)) + + if not is_array_or_matrix(mat_b): + raise TypeError("expected matrix type, received: %s" + % type(mat_b)) + + mat_a, mat_b = to_compatible_matrix_types(mat_a, mat_b) + self._mat_a_t = mat_a.transpose() + self._mat_b_t = mat_b.transpose() + self._has_intercept = False + + else: + self._regression_learner = learner + self._has_intercept = self._regression_learner.has_intercept() + + + def _solve(self, arg1_mat, arg2_mat, phrase_mat): + + self._has_intercept = self._regression_learner.has_intercept() + + result = self._regression_learner.train(arg1_mat.hstack(arg2_mat), phrase_mat) + + self._mat_a_t = result[0:arg1_mat.shape[1], :] + self._mat_b_t = result[arg1_mat.shape[1]:, :] + + + def _compose(self, arg1_mat, arg2_mat): + #NOTE when we get in this compose arg1 mat and arg2 mat have the same type + [mat_a_t, mat_b_t, arg1_mat] = resolve_type_conflict([self._mat_a_t, + self._mat_b_t, + arg1_mat], + type(arg1_mat)) + if self._has_intercept: + return arg1_mat * mat_a_t + padd_matrix(arg2_mat, 1) * mat_b_t + else: + return arg1_mat * mat_a_t + arg2_mat * mat_b_t + + def set_regression_learner(self, regression_learner): + assert_is_instance(regression_learner, RegressionLearner) + self._regression_learner = regression_learner + + def get_regression_learner(self): + return self._regression_learner + + regression_learner = property(get_regression_learner, set_regression_learner) + """ + Regression method to be used in training, of type RegressionLearner. + Default is LstsqRegressionLearner. + """ + + def _build_id2column(self, arg1_space, arg2_space): + return [] + + def _export(self, filename): + if self._mat_a_t is None or self._mat_b_t is None: + raise IllegalStateError("cannot export an untrained FullAdditive model.") + + with open(filename, "w") as output_stream: + output_stream.write("A\n") + output_stream.write(str(DenseMatrix(self._mat_a_t).mat.T)) + output_stream.write("\nB\n") + + if self._has_intercept: + output_stream.write(str(DenseMatrix(self._mat_b_t[:-1,]).mat.T)) + output_stream.write("\nIntercept\n") + output_stream.write(str(DenseMatrix(self._mat_b_t[-1,]).mat.T)) + else: + output_stream.write(str(DenseMatrix(self._mat_b_t).mat.T)) + + + def get_mat_a_t(self): + return self._mat_a_t + mat_a_t = property(get_mat_a_t) + """ + Transpose of matrix A parameter, of type Matrix. + """ + + def get_mat_b_t(self): + return self._mat_b_t + mat_b_t = property(get_mat_b_t) + """ + Transpose of matrix B parameter, of type Matrix. 
+ """ diff --git a/modules/composes/composition/lexical_function.py b/modules/composes/composition/lexical_function.py new file mode 100755 index 0000000..cb07b3b --- /dev/null +++ b/modules/composes/composition/lexical_function.py @@ -0,0 +1,288 @@ +''' +Created on Oct 11, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +import numpy as np +import time +from composition_model import CompositionModel +from composes.semantic_space.space import Space +from composes.utils.gen_utils import get_partitions +from composes.utils.regression_learner import LstsqRegressionLearner +from composes.utils.regression_learner import RegressionLearner +from composes.utils.matrix_utils import resolve_type_conflict +from composes.utils.matrix_utils import get_type_of_largest +from composes.utils.matrix_utils import padd_matrix +from composes.utils.num_utils import is_integer +from composes.utils.gen_utils import assert_is_instance +from composes.exception.illegal_state_error import IllegalStateError + +import logging +from composes.utils import log_utils as log + +logger = logging.getLogger(__name__) + + +class LexicalFunction(CompositionModel): + """ + Implements the lexical function compositional model. + + :math:`\\vec{p} = U \\vec{v}` + + where :math:`\\vec{p}` is the vector of the composed phrase, + :math:`U` is the matrix representation of the first component (the lexical function) + and :math:`\\vec{v}` is the vector representation of the second component + + """ + + _name = "lexical_function" + + def __init__(self, function_space=None, intercept=False, learner=None, min_samples=1): + """ + Constructor. + + Args: + function_space= : function space parameter, containing + the lexical functions, of type Space. Optional, can be set through + training. + + intercept= : True/False, True if the function space has intercept. + Optional, default False. When training is used, intercept is set + to the intercept value of the regression learner used. + + learner= : regression method of type RegressionLearner. Optional, + default LstsqRegressionLearner. + + min_samples= : minimum number of training samples required before a + LexicalFunction can be trained. Optional, default 1. + + """ + # assert_valid_kwargs(kwargs, ["function_space", "intercept", "learner"]) + + self.composed_id2column = [] + if learner and function_space: + raise ValueError("Cannot instantiate with both learner and function_space!") + + self._regression_learner = learner if learner else LstsqRegressionLearner() + self._function_space = function_space + self._has_intercept = intercept + self._MIN_SAMPLES = min_samples + + + def train(self, train_data, arg_space, phrase_space): + """ + Trains a lexical function composition model to learn a function + space and sets the function_space parameter. + + Args: + train_data: list of string tuples. Each tuple contains 3 + string elements: (function_word, arg, phrase). + + arg_space: argument space, of type Space. arg elements of + train data are interpreted in this space. + + phrase space: phrase space, of type Space. phrase elements of + the train data are interpreted in this space. + + Training tuples which contain strings not found in their + respective spaces are ignored. Function words containing less than + _MIN_SAMPLES training instances are ignored. For example, if + _MIN_SAMPLES=2 and function word "red" occurs in only one phrase, "red" + is ignored. + + The id2column attribute of the resulted composed space is set to + be equal to that of the phrase space given as an input. 
+ """ + + start = time.time() + + self._has_intercept = self._regression_learner.has_intercept() + + if not isinstance(arg_space, Space): + raise ValueError("expected one input spaces!") + + result_mats = [] + + train_data = sorted(train_data, key=lambda tup: tup[0]) + function_word_list, arg_list, phrase_list = self.valid_data_to_lists(train_data, + (None, + arg_space.row2id, + phrase_space.row2id)) + #partitions the sorted input data + keys, key_ranges = get_partitions(function_word_list, self._MIN_SAMPLES) + + if not keys: + raise ValueError("No valid training data found!") + + assert (len(arg_space.element_shape) == 1) + + if self._has_intercept: + new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0] + 1,) + else: + new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0],) + + for i in xrange(len(key_ranges)): + idx_beg, idx_end = key_ranges[i] + + print ("Training lexical function...%s with %d samples" + % (keys[i], idx_end - idx_beg)) + + arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end]) + phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end]) + + #convert them to the same type + matrix_type = get_type_of_largest([arg_mat, phrase_mat]) + [arg_mat, phrase_mat] = resolve_type_conflict([arg_mat, phrase_mat], + matrix_type) + + result_mat = self._regression_learner.train(arg_mat, phrase_mat).transpose() + + result_mat.reshape((1, np.prod(new_element_shape))) + + result_mats.append(result_mat) + + new_space_mat = arg_mat.nary_vstack(result_mats) + + self.composed_id2column = phrase_space.id2column + + self._function_space = Space(new_space_mat, keys, [], + element_shape=new_element_shape) + + log.print_composition_model_info(logger, self, 1, "\nTrained composition model:") + log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys)) + log.print_info(logger, 3, "With total data points:%s" % len(function_word_list)) + log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3, + "Semantic space of arguments:") + log.print_info(logger, 3, "Shape of lexical functions learned:%s" + % (new_element_shape,)) + log.print_matrix_info(logger, new_space_mat, 3, + "Semantic space of lexical functions:") + log.print_time_info(logger, time.time(), start, 2) + + def compose(self, data, arg_space): + """ + Uses a lexical function composition model to compose elements. + + Args: + data: data to be composed. List of tuples, each containing 3 + strings: (function_word, arg, composed_phrase). function_word and + arg are the elements to be composed and composed_phrase is the + string associated to their composition. function_word elements + are interpreted in self.function_space. + + arg_space: argument space, of type Space. arg elements of data are + interpreted in this space. + + Returns: + composed space: a new object of type Space, containing the + phrases obtained through composition. 
+ + """ + start = time.time() + + assert_is_instance(arg_space, Space) + arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data, + (self._function_space.row2id, + arg_space.row2id, + None)) + + composed_vec_list = [] + for i in xrange(len(arg1_list)): + arg1_vec = self._function_space.get_row(arg1_list[i]) + arg2_vec = arg_space.get_row(arg2_list[i]) + + matrix_type = get_type_of_largest([arg1_vec, arg2_vec]) + [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec], + matrix_type) + + composed_ph_vec = self._compose(arg1_vec, arg2_vec, + self._function_space.element_shape) + + composed_vec_list.append(composed_ph_vec) + + result_element_shape = self._function_space.element_shape[0:-1] + composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list) + + log.print_name(logger, self, 1, "\nComposed with composition model:") + log.print_info(logger, 3, "Composed total data points:%s" % len(arg1_list)) + log.print_info(logger, 3, "Functional shape of the resulted (composed) elements:%s" + % (result_element_shape,)) + log.print_matrix_info(logger, composed_ph_mat, 4, + "Resulted (composed) semantic space:") + log.print_time_info(logger, time.time(), start, 2) + + return Space(composed_ph_mat, phrase_list, self.composed_id2column, + element_shape=result_element_shape) + + + def _compose(self, function_arg_vec, arg_vec, function_arg_element_shape): + + new_shape = (np.prod(function_arg_element_shape[0:-1]), + function_arg_element_shape[-1]) + + function_arg_vec.reshape(new_shape) + + if self._has_intercept: + comp_el = function_arg_vec * padd_matrix(arg_vec.transpose(), 0) + else: + comp_el = function_arg_vec * arg_vec.transpose() + + return comp_el.transpose() + + @classmethod + def _assert_space_match(cls, arg1_space, arg2_space, phrase_space=None): + pass + + def set_regression_learner(self, regression_learner): + assert_is_instance(regression_learner, RegressionLearner) + self._regression_learner = regression_learner + + def get_regression_learner(self): + return self._regression_learner + + regression_learner = property(get_regression_learner, set_regression_learner) + """ + Regression method to be used in training, of type RegressionLearner. + Default is RidgeRegressionLearner(param=1). + """ + + def get_function_space(self): + return self._function_space + + function_space = property(get_function_space) + """ + Function space parameter, containing the lexical functions, of type Space. + Can be set through training or through initialization, default None. + """ + + def get_has_intercept(self): + return self._has_intercept + + has_intercept = property(get_has_intercept) + """ + Has intercept parameter, boolean. If True, then the function_space is + assumed to contain intercept. Can be set through training or through + initialization, default is assumed to be False. + """ + + def set_min_samples(self, min_samples): + if not is_integer(min_samples): + raise ValueError("expected %s min_samples value, received %s" + % ("integer", type(min_samples))) + self._MIN_SAMPLES = min_samples + + def get_min_samples(self): + return self._MIN_SAMPLES + + MIN_SAMPLES = property(get_min_samples, set_min_samples) + """ + Minimal number of samples for each training instance. Default 3. 
+ """ + + def _export(self, filename): + if self._function_space is None: + raise IllegalStateError("cannot export an untrained LexicalFunction model.") + self._function_space.export(filename, format="dm") + + diff --git a/modules/composes/composition/multiplicative.py b/modules/composes/composition/multiplicative.py new file mode 100755 index 0000000..c656ac1 --- /dev/null +++ b/modules/composes/composition/multiplicative.py @@ -0,0 +1,42 @@ +''' +Created on Oct 5, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from composition_model import CompositionModel +from composes.exception.illegal_state_error import IllegalOperationError + +class Multiplicative(CompositionModel): + """ + Implements the component-wise multiplication compositional model: + + :math:`\\vec{p} = \\vec{u} \\cdot \\vec{v}` + + where :math:`\\vec{p}` is the vector of the composed phrase and + :math:`\\vec{u}, \\vec{v}` are the vectors of the components. + + :math:`\\vec{u} \\cdot \\vec{v} = (u_1v_1,...,u_nv_n)` + """ + + _name = "multiplicative" + + def __init__(self): + """ + Constructor + """ + + def train(self): + """ + Current multiplicative model cannot be trained, it has no parameters. + """ + raise IllegalOperationError("Cannot train multiplicative model!") + + def _compose(self, arg1_mat, arg2_mat): + return arg1_mat.multiply(arg2_mat) + + def export(self, filename): + """ + Current multiplicative model cannot be exported, it has no parameters. + """ + raise IllegalOperationError("cannot export a Multiplicative model.") diff --git a/modules/composes/composition/weighted_additive.py b/modules/composes/composition/weighted_additive.py new file mode 100755 index 0000000..09bf9b9 --- /dev/null +++ b/modules/composes/composition/weighted_additive.py @@ -0,0 +1,143 @@ +''' +Created on Oct 5, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from composition_model import CompositionModel +from composes.matrix.dense_matrix import DenseMatrix +from composes.utils.num_utils import is_numeric +# from composes.utils.mem_utils import get_mem_usage +from composes.utils.matrix_utils import resolve_type_conflict +import numpy as np +import math + +class WeightedAdditive(CompositionModel): + """ + Implements weighted additive compositional model: + + :math:`\\vec{p} = \\alpha \\vec{u} + \\beta \\vec{v}` + + where :math:`\\vec{p}` is the vector of the composed phrase and + :math:`\\vec{u}, \\vec{v}` are the vectors of the components + + When :math:`\\alpha=\\beta=0.5` the model performs simple vector addition. + """ + + _name = "weighted_additive" + + """ + double, in interval [0,1] + maximum overhead allowed: MAX_MEM_OVERHEAD ratio of peripheral space memory + """ + MAX_MEM_OVERHEAD = 0.2 + + + def __init__(self, alpha=None, beta=None): + """ + Constructor. + + Args: + alpha: alpha parameter, numeric type. Optional, can be set through + training + beta: beta parameter, numeric type. Optional, can be set through + training. + + Raises: + TypeError if alpha or beta are not numeric. 
+ """ + self._alpha = 0.5 + self._beta = 0.5 + if not alpha is None: + if not is_numeric(alpha): + raise TypeError("Parameter not numeric: %s " % (type(alpha))) + else: + self._alpha = alpha + + if not beta is None: + if not is_numeric(beta): + raise TypeError("Parameter not numeric: %s " % (type(beta))) + else: + self._beta = beta + + if not alpha is None and beta is None: + self._beta = 1 - self._alpha + + + def _train(self, arg1_space, arg2_space, phrase_space, arg1_list, arg2_list, phrase_list): + + # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead + # the /3.0 is needed + # because the train data needs 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector) + chunk_size = int(phrase_space.cooccurrence_matrix.shape[0] * self.MAX_MEM_OVERHEAD / 3.0) + 1 + + arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr = (0, 0, 0, 0, 0) + + for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))): + beg, end = i*chunk_size, min((i+1)*chunk_size, len(arg1_list)) + + arg1_mat = arg1_space.get_rows(arg1_list[beg:end]) + arg2_mat = arg2_space.get_rows(arg2_list[beg:end]) + phrase_mat = phrase_space.get_rows(phrase_list[beg:end]) + + [arg1_mat, arg2_mat, phrase_mat] = resolve_type_conflict([arg1_mat, + arg2_mat, + phrase_mat], + DenseMatrix) + + res = self._process(arg1_mat, arg2_mat, phrase_mat) + arg1_arg2_dot += res[0] + arg1_phrase_dot += res[1] + arg2_phrase_dot += res[2] + arg1_norm_sqr += res[3] + arg2_norm_sqr += res[4] + + + self._solve(arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr) + + + def _process(self, arg1_mat, arg2_mat, phrase_mat): + + # debug here + # remove when done + # print "Using %s MB " % (get_mem_usage()) + + arg1_arg2_dot = arg1_mat.multiply(arg2_mat).sum() + arg1_phrase_dot = arg1_mat.multiply(phrase_mat).sum() + arg2_phrase_dot = arg2_mat.multiply(phrase_mat).sum() + + arg1_norm_sqr = pow(arg1_mat.norm(), 2) + arg2_norm_sqr = pow(arg2_mat.norm(), 2) + + return arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr + + def _solve(self, arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr): + + a = np.linalg.pinv(np.mat([[arg1_norm_sqr,arg1_arg2_dot], + [arg1_arg2_dot,arg2_norm_sqr]])) + a = a * np.mat([[arg1_phrase_dot],[arg2_phrase_dot]]) + self._alpha = a[0, 0] + self._beta = a[1, 0] + + + def _compose(self, arg1_mat, arg2_mat): + return self._alpha * arg1_mat + self._beta * arg2_mat + + def _export(self, filename): + with open(filename, "w") as output_stream: + output_stream.write("alpha\t%f\n" % self._alpha) + output_stream.write("beta\t%f" % self._beta) + + def get_alpha(self): + return self._alpha + alpha = property(get_alpha) + """ + Alpha parameter, default 0.5. + """ + + def get_beta(self): + return self._beta + beta = property(get_beta) + """ + Beta parameter, default 0.5. 
+ """ diff --git a/modules/composes/exception/__init__.py b/modules/composes/exception/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/exception/illegal_state_error.py b/modules/composes/exception/illegal_state_error.py new file mode 100755 index 0000000..5065f78 --- /dev/null +++ b/modules/composes/exception/illegal_state_error.py @@ -0,0 +1,18 @@ +''' +Created on Jun 15, 2012 + +@author: thenghia.pham +''' + +class IllegalStateError(Exception): + ''' + ''' + def __init__(self, msg): + self.__msg = msg + + +class IllegalOperationError(Exception): + ''' + ''' + def __init__(self, msg): + self.__msg = msg \ No newline at end of file diff --git a/modules/composes/exception/invalid_argument_error.py b/modules/composes/exception/invalid_argument_error.py new file mode 100755 index 0000000..0613344 --- /dev/null +++ b/modules/composes/exception/invalid_argument_error.py @@ -0,0 +1,6 @@ + +class InvalidArgumentError(Exception): + ''' + ''' + def __init__(self, msg): + self.__msg = msg \ No newline at end of file diff --git a/modules/composes/matrix/__init__.py b/modules/composes/matrix/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/matrix/dense_matrix.py b/modules/composes/matrix/dense_matrix.py new file mode 100755 index 0000000..ae55185 --- /dev/null +++ b/modules/composes/matrix/dense_matrix.py @@ -0,0 +1,362 @@ +''' +Created on Sep 17, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +import numpy as np +from warnings import warn +from scipy.sparse import issparse +from composes.utils.num_utils import is_numeric +from composes.matrix.matrix import Matrix + +class DenseMatrix(Matrix): + ''' + classdocs + ''' + + def __init__(self, data): + """ + Constructor, creates a DenseMatrix from a numpy matrix-like + object. + + Matrix-like objects (np.ndarray, np.matrix, scipy.sparse.matrix, + SparseMatrix) are converted into np.matrix. + + Params: + data: numpy matrix-like object or Matrix type + + Raises: + TypeError: if input data is not one of scipy.sparse/ + numpy.ndarray/numpy.matrix/Matrix + """ + + if issparse(data): + self.mat = data.todense() + elif isinstance(data, np.matrix): + if data.shape[0] == 0 or data.shape[1] == 0: + raise ValueError("cannot initialize empty matrix") + self.mat = data + elif isinstance(data, np.ndarray): + if len(data) == 0: + raise ValueError("cannot initialize empty matrix") + self.mat = np.matrix(data) + elif isinstance(data, Matrix): + # TODO: remove warning or remove import somehow fix this!! + # from composes.matrix.sparse_matrix import SparseMatrix + self.mat = data.to_dense_matrix().mat + else: + # TODO: raise suitable message + raise TypeError("expected matrix-like type, received %s" + % type(data)) + + def __str__(self): + return str(self.mat) + + def __getitem__(self, index): + result = self.mat[index] + if is_numeric(result): + return result + else: + return type(self)(result.copy()) + + def multiply(self, matrix_): + """ + Computes component-wise multiplication of two matrices. + + Args: + matrix_: a second matrix of type DenseMatrix + + Returns: + A DenseMatrix containing the cw multiplication of the two. + + Raises: + TypeError: if the argument is not of type DenseMatrix + ValueError: if the two matrices don t have the same shape. 
+ """ + + self._assert_same_type(matrix_) + if self.mat.shape != matrix_.mat.shape: + raise ValueError("inconsistent shapes: %s %s" + % (str(self.mat.shape), str(matrix_.mat.shape) )) + return DenseMatrix(np.multiply(self.mat, matrix_.mat)) + + def transpose(self): + """ + Transposes the current matrix. + + Returns: + DenseMatrix, a transpose of the current matrix. + + """ + return type(self)(self.mat.transpose().copy()) + + def reshape(self, new_shape): + """ + Reshapes current matrix. + + Overwrites the current matrix with a new matrix of the + given shape! + + Args: + shape: length 2 tuple or pair of integers + + Raises: + ValueError: if shape is not an integer pair or + if new shape is inconsistent with the total + size of the current matrix. + """ + + # TODO: change this is necessary to make a copy + self.mat = self.mat.reshape(new_shape) + + @staticmethod + def identity(size): + """ + Builds the identity matrix. + + Args: + size: integer, the result matrix is of shape size x size + + Returns: + Identity DenseMatrix. + """ + return DenseMatrix(np.eye(size, size, 0, np.double)) + + def vstack(self, matrix_): + """ + Vertical stack of two matrices. + + Args: + matrix_: a second matrix of type DenseMatrix + + Returns: + A DenseMatrix, vertical stack of the two matrices. + + Raises: + TypeError: if the argument is not of type DenseMatrix + + """ + self._assert_same_type(matrix_) + return DenseMatrix(np.vstack((self.mat, matrix_.mat))) + + def hstack(self, matrix_): + """ + Horizontal stack of two matrices. + + Args: + matrix_: a second matrix of type DenseMatrix + + Returns: + A DenseMatrix, horizontal stack of the two matrices. + + Raises: + TypeError: if the argument is not of type DenseMatrix + + """ + self._assert_same_type(matrix_) + return DenseMatrix(np.hstack((self.mat, matrix_.mat))) + + @classmethod + def nary_vstack(cls, mat_list): + """ + Class method, vertical stack of n matrices. + + Args: + mat_list: a list of matrices of type DenseMatrix + + Returns: + A DenseMatrix, vertical stack of the arguments. + + """ + np_mat_list = [matrix_.mat for matrix_ in mat_list] + return DenseMatrix(np.vstack(np_mat_list)) + + @classmethod + def nary_hstack(cls, mat_list): + """ + Class method, horizontal stack of n matrices. + + Args: + mat_list: a list of matrices of type DenseMatrix + + Returns: + A DenseMatrix, horizontal stack of the arguments. + + """ + + np_mat_list = [matrix_.mat for matrix_ in mat_list] + return DenseMatrix(np.hstack(np_mat_list)) + + + + def scale_rows(self, array_): + """ + Scales each row of the matrix by the values given in an array. + + Args: + array_: ndarray containing the values to scale by + + Returns: + A new DenseMatrix with scaled rows. + """ + self._assert_array(array_) + + x_dim = self.mat.shape[0] + if array_.shape in ((x_dim, 1), (x_dim,)): + if array_.shape == (x_dim,): + array_ = array_.reshape((x_dim, 1)) + return DenseMatrix(np.multiply(self.mat, array_)) + else: + raise ValueError("inconsistent shapes: %s %s" + % (str(self.mat.shape), str(array_.shape))) + + def scale_columns(self, array_): + """ + Scales each column of the matrix by the values given in an array. + + Args: + array_: ndarray containing the values to scale by + + Returns: + A new DenseMatrix with scaled columns. 
+ """ + self._assert_array(array_) + + y_dim = self.mat.shape[1] + if array_.shape in ((1, y_dim), (y_dim,)): + return DenseMatrix(np.multiply(self.mat, array_)) + else: + raise ValueError("inconsistent shapes: %s %s" + % (str(self.mat.shape), str(array_.shape))) + + def plog(self): + """ + Applies positive log to the matrix elements. + + Elements smaller than 1 (leading to not-defined log or negative log) + are set to 0. Log is applied on all other elements. + + Modifies the current matrix. + """ + + #this line uses 3 x size(mat) to run in the worst case + #(if we select the entire matrix - depends on the size of the selection) + self.mat[self.mat < 1.0] = 1 + self.mat = np.log(self.mat) + + + def assert_positive(self): + """ + Asserts that all values are larger or equal to 0. + + Raises: + ValueError if not all values are >= 0. + """ + if not np.all(self.mat >= 0): + raise ValueError("expected non-negative matrix") + + def get_non_negative(self): + """ + Turns negative entries to 0. + + Returns: + A new DenseMatrix matrix in which negative entries are set to 0. + + """ + mat_ = self.mat.copy() + # TODO: time against : mat_.data[mat_.data < 0] = 0 + mat_ = np.where(mat_ > 0, mat_, 0) + return DenseMatrix(mat_) + + def to_non_negative(self): + """ + Turns negative entries to 0. + + Modifies the current matrix: all negative entries are set to 0. + + """ + + self.mat = np.where(self.mat > 0, self.mat, 0) + + def to_ones(self): + """ + Turns strictly positive entries to 1 and negative entries to 0. + + Modifies the current matrix: all strictly positive entries are + set to 1, all negative entries are set to 0. + + """ + + self.mat = np.where(self.mat > 0, 1, 0) + + def remove_small_values(self, epsilon): + """ + Sets values smaller than an epsilon to 0. + + Args: + epsilon: scalar, threshold + Returns: + A DenseMatrix in which all values smaller than epsilon are + set to 0. + + """ + mat_ = self.mat.copy() + mat_ = np.where(mat_ > epsilon, mat_, 0) + return DenseMatrix(mat_) + + def is_mostly_positive(self): + """ + Checks if more than 50% of the non zero elements of a + matrix are positive. + + """ + return self.mat[self.mat > 0].size > self.mat.size/2 + + def all_close(self, matrix_): + """ + Checks of the values in two matrices are all_close. + + Args: + matrix_: input matrix of type DenseMatrix + + Returns: + bool: True if the elements are allclose (using np.allclose). + + """ + return np.allclose(self.mat, matrix_.mat) + + def norm(self, axis = None): + """ + Computes the norms on a certain axis or of the entire matrix. + + Args: + axis: 0/1 or None, if axis is None computes the norm of the + full matrix + Returns: + nd.array containing the norms on a given axis, or a scalar + if the axis is None. + + """ + if axis is None: + return np.linalg.norm(self.mat) + else: + return np.sqrt(self.multiply(self).sum(axis)) + + def to_sparse_matrix(self): + """ + Converts to SparseMatrix. + """ + from composes.matrix.sparse_matrix import SparseMatrix + return SparseMatrix(self.mat) + + def to_dense_matrix(self, copy = False): + """ + Returns a copy is copy=True, returns self otherwise. 
+ """ + + if (copy): + return self.copy() + else: + return self + diff --git a/modules/composes/matrix/linalg.py b/modules/composes/matrix/linalg.py new file mode 100755 index 0000000..b2155a5 --- /dev/null +++ b/modules/composes/matrix/linalg.py @@ -0,0 +1,406 @@ +''' +Created on Oct 4, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +import numpy as np +import logging +import scipy.linalg as splinalg +from sparsesvd import sparsesvd +from warnings import warn +from time import time +from math import sqrt +from composes.matrix.matrix import Matrix +from composes.matrix.dense_matrix import DenseMatrix +from composes.matrix.sparse_matrix import SparseMatrix +from composes.utils.matrix_utils import assert_same_shape +from composes.utils.matrix_utils import padd_matrix +import composes.utils.log_utils as log + +logger = logging.getLogger(__name__) + +class Linalg(object): + """ + Contains a set of liniar algebra utilities defined to work both with sparse and + with dense matrices as an input (i.e. with objects of type SparseMatrix/DenseMatrix). + + Implements: + svd, + nmf (LIN algorithm, add citation here!), + pinv, + ordinary least squares regression, + ridge regression + """ + + _NMF_ALPHA = 1.0 + _NMF_BETA = 0.1 + _NMF_MAX_ITER = 20 + _NMF_MAX_ITER_SUBPROB = 15 + _NMF_MIN_TOL = 0.001 + _NMF_TOL = _NMF_MIN_TOL + _NMF_TOL_DECREASE_FACTOR = 0.5 + _NMF_TIME_LIMIT = 36000 + + _SVD_TOL = 1e-12 + + @staticmethod + def svd(matrix_, reduced_dimension): + """ + Performs SVD decomposition. + + If the rank is smaller than the requested reduced dimension, + reduction to rank is performed. Dense SVD uses Linalg._SVD_TOL to decide + the rank of the matrix. + + + Args: + matrix_: input of type Matrix + reduced_dimension: int, the desired reduced dimension + + Returns: + U,S,V of the decomposition X = USV^T. U, V: Matrix type, + S: ndarray of singular values. + + """ + log.print_info(logger, 4, "In SVD..reducing to dim %d" % reduced_dimension) + log.print_matrix_info(logger, matrix_, 5, "Input matrix:") + + #TODO: IMPORTANT!! do the sign normalization COLUMN-wise!!!not + #for the full matrix at once!! + if reduced_dimension == 0: + raise ValueError("Cannot reduce to dimensionality 0.") + + if isinstance(matrix_, SparseMatrix): + result = Linalg._sparse_svd(matrix_, reduced_dimension) + elif isinstance(matrix_, DenseMatrix): + result = Linalg._dense_svd(matrix_, reduced_dimension) + else: + raise TypeError("expected Matrix type, received %s" % type(matrix_)) + + log.print_matrix_info(logger, result[0], 5, "Resulting matrix U:") + return result + + @staticmethod + def ridge_regression(matrix_a , matrix_b, lambda_, intercept=False): + #log.print_info(logger, "In Ridge regression..", 4) + #log.print_matrix_info(logger, matrix_a, 5, "Input matrix A:") + #log.print_matrix_info(logger, matrix_b, 5, "Input matrix B:") + """ + Performs Ridge Regression. + + This method use the general formula: + ... + to solve the problem: + :math:`X = argmin(||AX - B||_2 + \\lambda||X||_2)` + + Args: + matrix_a: input matrix A, of type Matrix + matrix_b: input matrix A, of type Matrix + lambda_: scalar, lambda parameter + intercept: bool. If True intercept is used. Optional, default False. 
+ + Returns: + solution X of type Matrix + + """ + + matrix_a._assert_same_type(matrix_b) + # TODO: check out where to define this assert + assert_same_shape(matrix_a, matrix_b, 0) + + matrix_type = type(matrix_a) + dim = matrix_a.shape[1] + + if intercept: + matrix_a = matrix_a.hstack(matrix_type(np.ones((matrix_a.shape[0], + 1)))) + lambda_diag = (lambda_ ) * matrix_type.identity(dim) + + if intercept: + lambda_diag = padd_matrix(padd_matrix(lambda_diag, 0, 0.0), 1, 0.0) + + matrix_a_t = matrix_a.transpose() + try: + tmp_mat = Linalg.pinv(((matrix_a_t * matrix_a) + lambda_diag)) + except np.linalg.LinAlgError: + print "Warning! LinAlgError" + tmp_mat = matrix_type.identity(lambda_diag.shape[0]) + + tmp_res = tmp_mat * matrix_a_t + result = tmp_res * matrix_b + + #S: used in generalized cross validation, page 244 7.52 (YZ also used it) + # S is defined in 7.31, page 232 + # instead of computing the matrix and then its trace, we can compute + # its trace directly + # NOTE when lambda = 0 we get out trace(S) = rank(matrix_a) + + dist = (matrix_a * result - matrix_b).norm() + S_trace = matrix_a_t.multiply(tmp_res).sum() + + return result, S_trace, dist + + @classmethod + def lstsq_regression(cls, matrix_a, matrix_b, intercept=False): + """ + Performs Least Squares Regression. + + Solves the problem: + + :math:`X = argmin(||AX - B||_2)` + + Args: + matrix_a: input matrix A, of type Matrix + matrix_b: input matrix A, of type Matrix + intercept: bool. If True intercept is used. Optional, False by default. + + Returns: + solution X of type Matrix + + """ + + matrix_a._assert_same_type(matrix_b) + # TODO: check out where to define this assert + assert_same_shape(matrix_a, matrix_b, 0) + + if intercept: + matrix_a = matrix_a.hstack(type(matrix_a)(np.ones((matrix_a.shape[0], + 1)))) + if isinstance(matrix_a, DenseMatrix): + result = Linalg._dense_lstsq_regression(matrix_a, matrix_b) + else: + result = Linalg._sparse_lstsq_regression(matrix_a, matrix_b) + + return result + + @staticmethod + def _dense_lstsq_regression(matrix_a , matrix_b): + return DenseMatrix(Linalg._numpy_lstsq_regression(matrix_a, matrix_b)) + #return DenseMatrix(Linalg._scipy_lstsq_regression(matrix_a, matrix_b)) + + @staticmethod + def _sparse_lstsq_regression(matrix_a , matrix_b, intercept=False): + return Linalg.ridge_regression(matrix_a, matrix_b, 0.0)[0] + #return SparseMatrix(Linalg._dense_lstsq_regression(DenseMatrix(matrix_a), + # DenseMatrix(matrix_b))) + + @staticmethod + def _numpy_lstsq_regression(matrix_a, matrix_b, rcond=-1): + return np.linalg.lstsq(matrix_a.mat, matrix_b.mat, rcond)[0] + + @staticmethod + def _scipy_lstsq_regression(matrix_a, matrix_b): + return splinalg.lstsq(matrix_a.mat, matrix_b.mat)[0] + + @staticmethod + def _sparse_svd(matrix_, reduced_dimension): + #svds from scipy.sparse.linalg + #RAISES ValueError if the rank is smaller than reduced_dimension + 1 + #TODO : fix this or replace with svdsparse + #??? eIGENVALUES ARE NOT SORTED!!!!!! + #IF EVER USE THIS; FIX THE PROBLEMS + #u, s, vt = svds(matrix_.mat, False, True) + """ + Patch + + Problem: sparsesvd sometimes returns fewer dimensions that requested. + It will be no longer needs when sparsesvd will allow + SVDLIBC parameters as an input (kappa parameter of SVDLIBC has to be + larger than the default. e.g. 1E-05 instead of 1E-06) + + Current fix: ask for more dimensions and remove the unnecessary ones. 
+ """ + + extra_dims = int(reduced_dimension/10) + + ut, s, vt = sparsesvd(matrix_.mat.tocsc(), reduced_dimension + extra_dims) + + u = SparseMatrix(ut.transpose()) + v = SparseMatrix(vt.transpose()) + + no_cols = min(u.shape[1], reduced_dimension) + u = u[:, 0:no_cols] + v = v[:, 0:no_cols] + + Linalg._check_reduced_dim(matrix_.shape[1], u.shape[1], reduced_dimension) + + if not u.is_mostly_positive(): + u = -u + v = -v + + return u, s[0:no_cols], v + + @staticmethod + def _dense_svd(matrix_, reduced_dimension): + + print "Running dense svd" + u, s, vt = np.linalg.svd(matrix_.mat, False, True) + rank = len(s[s > Linalg._SVD_TOL]) + + no_cols = min(u.shape[1], reduced_dimension, rank) + u = DenseMatrix(u[:,0:no_cols]) + s = s[0:no_cols] + v = DenseMatrix(vt[0:no_cols,:].transpose()) + + Linalg._check_reduced_dim(matrix_.shape[1], u.shape[1], reduced_dimension) + + if not u.is_mostly_positive(): + u = -u + v = -v + + return u, s, v + + @staticmethod + def _check_reduced_dim(no_columns, reduced_dim, requested_reduced_dim): + if requested_reduced_dim > no_columns: + warn("Number of columns smaller than the reduced dimensionality requested: %d < %d. Truncating to %d dimensions (rank)." % (no_columns, requested_reduced_dim, reduced_dim)) + elif reduced_dim != requested_reduced_dim: + warn("Returning %d dimensions instead of %d." % (reduced_dim, requested_reduced_dim)) + + @staticmethod + def _nmf_nlssubprob(v, w, w_t, h_init, tol, maxiter): + """ + h, grad: output solution and gradient + iteration: #iterations used + v, w: constant matrices + h_init: initial solution + tol: stopping tolerance + maxiter: limit of iterations + """ + h = h_init + w_t_v = w_t * v + w_t_w = w_t * w + + alpha = Linalg._NMF_ALPHA + beta = Linalg._NMF_BETA + + #sub_loop_time = time() + + for iteration in xrange(1, maxiter): + grad = w_t_w * h - w_t_v + + # search step size + for inner_iter in xrange(1, 20): + hn = h - alpha * grad + hn = hn.get_non_negative() + d = hn - h + gradd = grad.multiply(d).sum() + dQd = (w_t_w * d).multiply(d).sum() + suff_decr = 0.99 * gradd + 0.5 * dQd < 0 + if inner_iter == 1: + decr_alpha = not suff_decr + hp = h + if decr_alpha: + if suff_decr: + h = hn + break + else: + alpha = alpha * beta + else: + if not suff_decr or hp.all_close(hn): + h = hp + break + else: + alpha = alpha / beta + hp = hn + + return h, grad, iteration + + @staticmethod + def nmf(v, w_init, h_init): + """ + Performs Non-negative Matrix Factorization. + + It solves the problem: + :math:`W,H = argmin(||X - WH||_2)` such that W and H are non-negative matrices. + + Args: + w_init: initial value for matrix W, type Matrix + h_init: initial value for matrix H, type Matrix + + Returns: + W, H : where W, H solve the NMF problem stated above. 
+ + """ + + log.print_info(logger, 4, "In NMF..reducing to dim %d" % w_init.shape[1]) + log.print_matrix_info(logger, w_init, 5, "W init matrix:") + log.print_matrix_info(logger, h_init, 5, "H init matrix:") + + if not isinstance(v, Matrix): + raise TypeError("expected Matrix type, received %s" % type(v)) + w = w_init + h = h_init + init_time = time() + + wt = w.transpose() + ht = h.transpose() + vt = v.transpose() + gradW = (w * (h * ht)) - (v * ht) + gradH = ((wt * w) * h) - (wt * v) + + gradW_norm = gradW.norm() + gradH_norm = gradH.norm() + initgrad = sqrt(pow(gradW_norm, 2) + pow(gradH_norm, 2)) + + #print 'Init gradient norm %f' % initgrad + tolW = max(Linalg._NMF_MIN_TOL, Linalg._NMF_TOL) * initgrad + tolH = tolW + + #loop_time = init_time + for iteration in xrange(1, Linalg._NMF_MAX_ITER): + log.print_info(logger, 5, "Iteration: %d(%d)" % (iteration, Linalg._NMF_MAX_ITER)) + + if time() - init_time > Linalg._NMF_TIME_LIMIT: + break + + w, gradW, iterW = Linalg._nmf_nlssubprob(vt, h.transpose(), h, + w.transpose(), tolW, + Linalg._NMF_MAX_ITER_SUBPROB) + old_w = w + w = w.transpose() + gradW = gradW.transpose() + + if iterW == 1: + tolW = Linalg._NMF_TOL_DECREASE_FACTOR * tolW + + h, gradH, iterH = Linalg._nmf_nlssubprob(v, w, old_w, h, tolH, + Linalg._NMF_MAX_ITER_SUBPROB) + + if iterH == 1: + tolH = Linalg._NMF_TOL_DECREASE_FACTOR * tolH + + log.print_matrix_info(logger, w, 5, "Return W matrix:") + log.print_matrix_info(logger, h, 5, "Return H matrix:") + return w, h + + @staticmethod + def pinv(matrix_): + """ + Computes the pseudo-inverse of a matrix. + + Args: + matrix_: input matrix, of type Matrix + + Returns: + Pseudo-inverse of input matrix, of type Matrix + + Raises: + TypeError, if input is not of type Matrix + """ + if isinstance(matrix_, SparseMatrix): + return Linalg._sparse_pinv(matrix_) + elif isinstance(matrix_, DenseMatrix): + return Linalg._dense_pinv(matrix_) + else: + raise TypeError("expected Matrix type, received %s" % type(matrix_)) + + @staticmethod + def _dense_pinv(matrix_): + return DenseMatrix(np.linalg.pinv(matrix_.mat)) + + @staticmethod + def _sparse_pinv(matrix_): + # TODO: implement pinv + return SparseMatrix(np.linalg.pinv(matrix_.mat.todense())) diff --git a/modules/composes/matrix/matrix.py b/modules/composes/matrix/matrix.py new file mode 100755 index 0000000..d987204 --- /dev/null +++ b/modules/composes/matrix/matrix.py @@ -0,0 +1,152 @@ +''' +Created on Sep 17, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from composes.utils.num_utils import is_numeric +from composes.utils.py_matrix_utils import is_array + +class Matrix(object): + """ + Provides a common interface for matrix implementations. + + Provides a common interface for different matrix implementations + (sparse/dense). In vector space models, a matrix is used to encode + a set of entities such as words or phrases (rows) described in terms + of contextual features (columns). 
+ """ + + def __init__(self, *args, **kwargs): + raise NotImplementedError() + + + def __add__(self, matrix_): + ''' + operation''' + self._assert_same_type(matrix_) + return type(self)(self.mat + matrix_.mat) + + def __sub__(self, matrix_): + ''' - operation''' + self._assert_same_type(matrix_) + return type(self)(self.mat - matrix_.mat) + + def __neg__(self): + ''' - operation''' + return type(self)(-self.mat) + + def __mul__(self, factor): + ''' * operation''' + if is_numeric(factor): + return type(self)(self.mat * factor) + else: + self._assert_same_type(factor) + return type(self)(self.mat * factor.mat) + + def __div__(self, factor): + ''' / operation''' + if is_numeric(factor): + if factor == 0: + raise ZeroDivisionError("Division by zero") + else: + raise TypeError("expected numeric type, received %s" % (type(factor))) + return type(self)(self.mat / float(factor)) + + def __rmul__(self, factor): + ''' * operation''' + if is_numeric(factor): + return self.__mul__(factor) + raise TypeError("expected numeric type, received %s" % (type(factor))) + + + #TODO move all these asserts somewhere else + def _assert_same_type(self, operand): + if type(self) != type(operand): + raise TypeError("expected matrix of type %s, received %s" % + (type(self), type(operand))) + + def assert_same_shape(self, matrix_): + """ + Asserts that the matrix has the same shape as a second matrix. + + Args: + matrix_: A second matrix of type Matrix. + + Raises: + ValueError: If the current matrix and the argument matrix + do not have the same shape. + """ + + if self.mat.shape != matrix_.mat.shape: + raise ValueError("inconsistent shapes: %s %s" + % (str(self.mat.shape), str(matrix_.mat.shape) )) + + #TODO move all these asserts somewhere else + def _assert_array(self, operand): + if not is_array(operand): + raise TypeError("expected array, received %s" % (type(operand))) + + + def sum(self, axis=None): + #return type is dense matrix of shape (1, dimy) or (dimx,1) + #or a number if **kwargs is None + return self.mat.sum(axis) + + def sorted_permutation(self, norm_function, axis_): + """ + Computes the permutation resulted when sorting the matrix + on an axis, according to a function, in descending order. + + Sorts the rows or the columns (as given by axis) + of a matrix according to a norm_function and returns + the permutation of this as a np.array + + Args: + norm_function: One of sum/length. A function that + takes an axis as an argument (i.e. 0 or 1) and + returns an array of values (i.e. sum of all rows + if axis = 0 and norm_function = sum). + + axis_: axis value, one of 0/1 + + Returns: + perm_srtd: np.array containing the permutation of the + sorting + """ + + #norms = norm_function(axis=axis_) + + norms = norm_function(axis_).getA().flatten() + perm_srtd = sorted(range(len(norms)), key = norms.__getitem__, + reverse=True) + + return perm_srtd + + def get_mat(self): + return self._mat + + def set_mat(self, mat_): + self._mat = mat_ + + mat = property(get_mat, set_mat) + """ + Stores the actual matrix structure of the Matrix object. + Of type numpy.matrix for DenseMatrix, and scipy.sparse.csr_matrix + for SparseMatrix. + """ + + def get_shape(self): + return self.mat.shape + + shape = property(get_shape) + """ + Shape of the matrix, tuple with two elements. 
+ """ + + def copy(self): + return type(self)(self.mat.copy()) + + + + + diff --git a/modules/composes/matrix/sparse_matrix.py b/modules/composes/matrix/sparse_matrix.py new file mode 100755 index 0000000..563188b --- /dev/null +++ b/modules/composes/matrix/sparse_matrix.py @@ -0,0 +1,413 @@ +''' +Created on Sep 17, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +import numpy as np +from warnings import warn +from scipy.sparse import issparse +from scipy.sparse import vstack +from scipy.sparse import hstack +from scipy.sparse import csr_matrix +from scipy.sparse.sputils import isintlike +from composes.utils.num_utils import is_numeric +from composes.utils.num_utils import is_integer +from composes.matrix.matrix import Matrix +from composes.utils.py_matrix_utils import array_to_csr_diagonal +from scipy.sparse import identity + +class SparseMatrix(Matrix): + ''' + classdocs + ''' + + + def __init__(self, data): + """ + Constructor, creates a SparseMatrix from a numpy matrix-like + object. + + Matrix-like objects (np.ndarray, np.matrix, scipy.sparse.matrix, + DenseMatrix) are converted into scipy.csr_matrix. + + Args: + data: numpy matrix-like object or Matrix type + + Raises: + TypeError: if input data is not one of scipy.sparse/ + numpy.ndarray/numpy.matrix/Matrix + ValueError: if trying to initialize shape-0 matrix + """ + if issparse(data): + self.mat = data.tocsr() + + elif isinstance(data, np.matrix): + if data.shape[0] == 0 or data.shape[1] == 0: + raise ValueError("cannot initialize matrix with shape 0") + self.mat = csr_matrix(data) + + elif isinstance(data, np.ndarray): + if len(data) == 0: + raise ValueError("cannot initialize matrix with shape 0") + self.mat = csr_matrix(data) + + elif isinstance(data, Matrix): + self.mat = data.to_sparse_matrix().mat + else: + raise TypeError("expected scipy sparse matrix, received %s" + % (type(data))) + + def __str__(self): + return str(self.mat.todense()) + + def __getitem__(self, key): + """ + Overwrites csr_matrix m[i,:], m[i] operations which are faulty in + current scipy.sparse releases. + + """ + def __get_row(row): + start = self.mat.indptr[row] + end = self.mat.indptr[row + 1] + return SparseMatrix(csr_matrix((self.mat.data[start:end], + self.mat.indices[start:end], + [0, end - start]), + shape=(1, self.mat.shape[1]), + copy=True)) + + if isinstance(key, tuple): + row = key[0] + col = key[1] + if isintlike(row) and row >= 0 and isinstance(col, slice): + if col == slice(None, None, None): + return __get_row(row) + + if isintlike(key) and key >= 0: + return __get_row(key) + + result = self.mat[key] + if is_numeric(result): + return result + else: + return SparseMatrix(result) + + def reshape(self, new_shape): + """ + Reshapes current matrix. + + Overwrites the current matrix with a new matrix of the + given shape! + + Args: + shape: length 2 tuple or pair of integers + + Raises: + ValueError: if shape is not an integer pair or + if new shape is inconsistent with the total + size of the current matrix. + """ + + if not isinstance(new_shape, tuple) or len(new_shape) != 2: + raise ValueError("shape must be integer pair") + + no_rows, no_cols = self.mat.shape + new_no_rows, new_no_cols = new_shape + + if not is_integer(new_no_rows) or not is_integer(new_no_cols): + raise ValueError("shape must be integer pair") + if no_rows * no_cols != new_no_rows * new_no_cols: + raise ValueError("total size of new matrix must be unchanged.") + + #TODO: change here if we want a copy!! 
+ mat = self.mat.tocoo(copy=False) + + #upcast mat.row and mat.col + if no_rows * no_cols >= 2**31-1: + linear_pos = np.array(mat.row, dtype=np.int64) * no_cols + mat.col + else: + linear_pos = mat.row * no_cols + mat.col + + mat.row = linear_pos // new_no_cols + mat.col = linear_pos - (mat.row * new_no_cols) + + #NOTE: change here if we want a copy!! + self.mat = csr_matrix((mat.data, (mat.row, mat.col)), shape=new_shape) + + @staticmethod + def identity(size): + """ + Builds the identity matrix. + + Args: + size: integer, the result matrix is of shape size x size + + Returns: + Identity SparseMatrix. + """ + # TODO: should do system-wise + return SparseMatrix(identity(size, dtype = np.double, format = "csr")) + + def transpose(self): + """ + Transposes the current matrix. + + Returns: + SparseMatrix, a transpose of the current matrix. + + """ + return type(self)(self.mat.transpose()) + + def multiply(self, matrix_): + """ + Computes component-wise multiplication of two matrices. + + Args: + matrix_: a second matrix of type SparseMatrix + + Returns: + A SparseMatrix containing the cw multiplication of the two. + + Raises: + TypeError: if the argument is not of type SparseMatrix + ValueError: if the two matrices don t have the same shape. + """ + self._assert_same_type(matrix_) + if self.mat.shape != matrix_.mat.shape: + raise ValueError("inconsistent shapes: %s %s" + % (str(self.mat.shape), str(matrix_.mat.shape) )) + + return SparseMatrix(self.mat.multiply(matrix_.mat)) + + def vstack(self, matrix_): + """ + Vertical stack of two matrices. + + Args: + matrix_: a second matrix of type SparseMatrix + + Returns: + A SparseMatrix, vertical stack of the two matrices. + + Raises: + TypeError: if the argument is not of type SparseMatrix + + """ + self._assert_same_type(matrix_) + return SparseMatrix(vstack([self.mat, matrix_.mat], format = "csr")) + + + def hstack(self, matrix_): + """ + Horizontal stack of two matrices. + + Args: + matrix_: a second matrix of type SparseMatrix + + Returns: + A SparseMatrix, horizontal stack of the two matrices. + + Raises: + TypeError: if the argument is not of type SparseMatrix + + """ + self._assert_same_type(matrix_) + return SparseMatrix(hstack([self.mat, matrix_.mat], format = "csr")) + + + @classmethod + def nary_vstack(cls, mat_list): + """ + Class method, vertical stack of n matrices. + + Args: + mat_list: a list of matrices of type SparseMatrix + + Returns: + A SparseMatrix, vertical stack of the arguments. + + """ + np_mat_list = [matrix_.mat for matrix_ in mat_list] + return SparseMatrix(vstack(np_mat_list)) + + @classmethod + def nary_hstack(cls, mat_list): + """ + Class method, horizontal stack of n matrices. + + Args: + mat_list: a list of matrices of type SparseMatrix + + Returns: + A SparseMatrix, horizontal stack of the arguments. + + """ + + np_mat_list = [matrix_.mat for matrix_ in mat_list] + return SparseMatrix(hstack(np_mat_list)) + + def scale_rows(self, array_): + """ + Scales each row of the matrix by the values given in an array. + + Args: + array_: ndarray containing the values to scale by + + Returns: + A new SparseMatrix with scaled rows. + """ + + self._assert_array(array_) + + diag_matrix = array_to_csr_diagonal(array_) + return SparseMatrix(diag_matrix * self.mat) + + def scale_columns(self, array_): + """ + Scales each column of the matrix by the values given in an array. + + Args: + array_: ndarray containing the values to scale by + + Returns: + A new SparseMatrix with scaled columns. 
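`scale_rows` and `scale_columns` implement scaling as multiplication by a sparse diagonal matrix (built by `array_to_csr_diagonal`), so the result stays sparse. A rough plain-SciPy equivalent of the same trick, with toy data:

```
import numpy as np
from scipy.sparse import csr_matrix, diags

X = csr_matrix(np.array([[1., 2.],
                         [3., 4.]]))
weights = np.array([10., 0.1])

scaled_rows = diags(weights).dot(X)      # D * X scales the rows
scaled_cols = X.dot(diags(weights))      # X * D scales the columns
print(scaled_rows.toarray())
```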
+ """ + self._assert_array(array_) + + diag_matrix = array_to_csr_diagonal(array_) + return SparseMatrix(self.mat * diag_matrix) + + def plog(self): + """ + Applies positive log to the matrix elements. + + Elements smaller than 1 (leading to not-defined log or negative log) + are set to 0. Log is applied on all other elements. + + Modifies the current matrix. + """ + + self.mat.data[self.mat.data <= 1] = 1 + self.mat.data = np.log(self.mat.data) + self.mat.eliminate_zeros() + + def get_non_negative(self): + """ + Turns negative entries to 0. + + Returns: + A new SparseMatrix matrix in which negative entries are set to 0. + + """ + mat_ = self.mat.copy() + #TODO: time against : mat_.data[mat_.data < 0] = 0 + mat_.data = np.where(mat_.data > 0, mat_.data, 0) + mat_.eliminate_zeros() + return SparseMatrix(mat_) + + def to_non_negative(self): + """ + Turns negative entries to 0. + + Modifies the current matrix: all negative entries are set to 0. + + """ + self.mat.data.clip(0, out=self.mat.data) + self.mat.eliminate_zeros() + + def to_ones(self): + """ + Turns strictly positive entries to 1 and negative entries to 0. + + Modifies the current matrix: all strictly positive entries are + set to 1, all negative entries are set to 0. + + """ + self.mat.data = np.where(self.mat.data > 0, 1, 0) + self.mat.eliminate_zeros() + + def remove_small_values(self, epsilon): + """ + Sets values smaller than an epsilon to 0. + + Args: + epsilon: scalar, threshold + Returns: + A SparseMatrix in which all values smaller than epsilon are + set to 0. + + """ + mat_ = self.mat.copy() + mat_.data = np.where(mat_.data > epsilon, mat_.data, 0) + mat_.eliminate_zeros() + return SparseMatrix(mat_) + + def assert_positive(self): + """ + Asserts that all values are larger or equal to 0. + + Raises: + ValueError if not all values are >= 0. + """ + if not np.all(self.mat.data >= 0): + raise ValueError("expected non-negative matrix") + + def is_mostly_positive(self): + """ + Checks if more than 50% of the non zero elements of a + matrix are positive. + + """ + return self.mat.data[self.mat.data > 0].size > self.mat.data.size/2 + + def all_close(self, matrix_): + """ + Checks of the values in two matrices are all_close. + + Args: + matrix_: input matrix of type SparseMatrix + + Returns: + bool: True if the elements are allclose (using np.allclose). + + """ + diff = self.mat - matrix_.mat + return np.allclose(diff.data, np.zeros(len(diff.data))) + + def norm(self, axis = None): + """ + Computes the norms on a certain axis or of the entire matrix. + + Args: + axis: 0/1 or None, if axis is None computes the norm of the + full matrix + Returns: + nd.array containing the norms on a given axis, or a scalar + if the axis is None. + + """ + if axis is None: + return np.linalg.norm(self.mat.data) + else: + return np.sqrt(self.multiply(self).sum(axis)) + + def to_dense_matrix(self): + """ + Converts to DenseMatrix. + """ + from composes.matrix.dense_matrix import DenseMatrix + return DenseMatrix(self.mat) + + def to_sparse_matrix(self, copy = False): + """ + Returns a copy is copy=True, returns self otherwise. 
+ """ + if (copy): + return self.copy() + else: + return self + + + + + diff --git a/modules/composes/semantic_space/__init__.py b/modules/composes/semantic_space/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/semantic_space/operation.py b/modules/composes/semantic_space/operation.py new file mode 100755 index 0000000..a9f9dbc --- /dev/null +++ b/modules/composes/semantic_space/operation.py @@ -0,0 +1,248 @@ +''' +Created on Jun 6, 2012 + +@author: thenghia.pham +''' + +from composes.matrix.dense_matrix import DenseMatrix +from composes.exception.illegal_state_error import IllegalStateError +from composes.utils.matrix_utils import resolve_type_conflict +from warnings import warn + +class Operation(object): + """ + This class implements both the application, and the projection of a + transformation on a semantic space. + + An operation object can be used to apply or to project a specific + transformation on a semantic space. + After a transformation is applied, for example on a core space, the operation + class stores the information required to further project this same + operation onto a space peripheral to the core space. + """ + + def __init__(self): + """ + Constructor + """ + pass + + def _raise_projection_error(self, transformation): + raise IllegalStateError("Illegal projection of %s. Attempting\ + projection before application." + % (transformation)) + + def _raise_double_application_error(self, transformation): + raise IllegalStateError("Illegal application of %s. Attempting\ + double application." % (transformation)) + +class ScalingOperation(Operation): + """ + This class implements the application and the projection of scaling + transformations. + """ + + def __init__(self, scaling): + self.__scaling = scaling + self.__column_stats = None + + def apply(self, matrix_): + """ + Applies a scaling operation. + + Args: + matrix_: matrix on which the scaling is applied, of type Matrix + + Returns: + the scaled matrix + + The column statistics computed by the scaling transformation, if any, + is stored in the current operation object. For example, PPMI scaling + needs column sums in order to be projected on peripheral spaces, + while PLOG scaling does not require this. + + """ + + if not self.__column_stats is None: + self._raise_double_application_error(self.__scaling) + + result_matrix = self.__scaling.apply(matrix_) + + if self.__scaling.uses_column_stats: + self.__column_stats = self.__scaling.get_column_stats(matrix_) + + return result_matrix + + def project(self, matrix_): + """ + Projects a scaling operation. + + Args: + matrix_: matrix on which the scaling is projected, of type Matrix + + Returns: + the scaled matrix + + If the current operation object has column_stats, this structure is + used in the projection. + """ + + if self.__column_stats is None and self.__scaling.uses_column_stats: + self._raise_projection_error(self.__scaling) + + if self.__scaling.uses_column_stats: + return self.__scaling.apply(matrix_, self.__column_stats) + else: + return self.__scaling.apply(matrix_) + + def __str__(self): + return str(self.__scaling) + + +class DimensionalityReductionOperation(Operation): + """ + This class implements the application and the projection of dimensionality + reduction transformations. + """ + + def __init__(self, dim_reduction): + self.__dim_reduction = dim_reduction + self.__transmat = None + + def apply(self, matrix_): + """ + Applies a dim. reduction operation. 
+ + Args: + matrix_: matrix on which the reduction is applied, of type Matrix + + Returns: + the reduced matrix + + The transformation matrix obtained in the reduction (specific to each + reduction method) is stored in the operation object. This transformation + matrix is further used for projecting the dim. reduction method on + a space peripheral to the space on which it has been originally applied. + """ + + if not self.__transmat is None: + self._raise_double_application_error(self.__dim_reduction) + + res_mat, self.__transmat = self.__dim_reduction.apply(matrix_) + + return DenseMatrix(res_mat) + + def project(self, matrix_): + """ + Projects a dim. reduction operation. + + Args: + matrix_: matrix on which the reduction is projected, of type Matrix + + Returns: + the reduced matrix + + Uses the transformation matrix stored in the operation object to project + the dimensionality reduction method on a new space, peripheral to the + original one. + """ + + if self.__transmat is None: + self._raise_projection_error(self.__dim_reduction) + + if self.__dim_reduction.name == "nmf": + matrix_.assert_positive() + + if not isinstance(matrix_, type(self.__transmat)): + warn("WARNING: peripheral matrix type (dense/sparse) should be the same as the core space matrix type!!") + + [matrix_, transmat] = resolve_type_conflict([matrix_, self.__transmat], + type(matrix_)) + + result_mat = matrix_ * transmat + + if self.__dim_reduction.name == "nmf": + result_mat.to_non_negative() + + return DenseMatrix(result_mat) + + def __str__(self): + return str(self.__dim_reduction) + + +class FeatureSelectionOperation(Operation): + """ + This class implements the application and the projection of feature + selection transformations. + """ + + def __init__(self, feat_selection): + self.__feat_selection = feat_selection + self.__selected_columns = None + self.__original_columns = None + + def apply(self, matrix_): + """ + Applies a dim. feature selection operation. + + Args: + matrix_: matrix on which the reduction is applied, of type Matrix + + Returns: + the reduced matrix + + The columns selected are stored in the operation object. These are + further used for projecting the feature selection method on + a space peripheral to the original space on which it has been applied. + """ + + if not self.__selected_columns is None: + self._raise_double_application_error(self.__feat_selection) + + res_mat, self.__selected_columns = self.__feat_selection.apply(matrix_) + return res_mat + + def project(self, matrix_): + """ + Projects a feature selection operation. + + Args: + matrix_: matrix on which the selection is applied, of type Matrix + + Returns: + the reduced matrix + + Uses the information on selected columns stored in the operation + object to project the feature selection method on a new space, + peripheral to the original one. + """ + + if self.__selected_columns is None: + self._raise_projection_error(self.__dim_reduction) + + res_mat = matrix_[:, self.__selected_columns] + return res_mat + + + def __str__(self): + return str(self.__feat_selection) + + + def get_selected_columns(self): + return self.__selected_columns + + def get_original_columns(self): + return self.__original_columns + + def set_original_columns(self, original_columns): + self.__original_columns = original_columns + + selected_columns = property(get_selected_columns) + """ + List of integers, indices of the columns selected. 
+ """ + original_columns = property(get_original_columns, set_original_columns) + """ + List of strings, the id2column of the space before applying the + feature selection. + """ diff --git a/modules/composes/semantic_space/peripheral_space.py b/modules/composes/semantic_space/peripheral_space.py new file mode 100755 index 0000000..a553f5c --- /dev/null +++ b/modules/composes/semantic_space/peripheral_space.py @@ -0,0 +1,160 @@ +''' +Created on Sep 26, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from space import Space +from numpy import array +from composes.utils.space_utils import list2dict +from composes.utils.space_utils import assert_dict_match_list +from composes.utils.space_utils import assert_shape_consistent +from composes.utils.space_utils import add_items_to_dict +from composes.semantic_space.operation import FeatureSelectionOperation +from composes.semantic_space.operation import DimensionalityReductionOperation +from composes.utils.gen_utils import assert_is_instance +from composes.matrix.matrix import Matrix + +class PeripheralSpace(Space): + ''' + classdocs + ''' + + + def __init__(self, core_space, matrix_, id2row, row2id=None): + """ + Constructor. + + Args: + core_space: Space type, the core space that this is peripheral to. + matrix_: Matrix type, the data matrix of the space + id2row: list, the row elements + row2id: dictionary, maps row strings to ids. Optional, built from + id2row by default. + + Returns: + A peripheral semantic space (type PeripheralSpace) on which the + core space operations have been projected. Column indexing structures + and operations are taken over from the core space. + + Raises: + TypeError: if matrix_ or core_space are not of the correct type + ValueError: if element shape is not consistent with + the size of matrix rows + if the matrix and the provided row and column + indexing structures are not of consistent shapes. + """ + assert_is_instance(matrix_, Matrix) + assert_is_instance(core_space, Space) + assert_is_instance(id2row, list) + # TODO: assert it is not a peripheral space here! + + if row2id is None: + row2id = list2dict(id2row) + else: + assert_dict_match_list(row2id, id2row) + + column2id = core_space.column2id + id2column = core_space.id2column + + self._operations = list(core_space.operations) + self._row2id = row2id + self._id2row = id2row + self._column2id = column2id + self._id2column = id2column + + self._cooccurrence_matrix = self._project_core_operations(matrix_) + assert_shape_consistent(self.cooccurrence_matrix, self._id2row, + self._id2column, self._row2id, self._column2id) + + self._element_shape = (self._cooccurrence_matrix.shape[1],) + + + def _project_core_operations(self, matrix_): + + for operation in self._operations: + if isinstance(operation, DimensionalityReductionOperation): + self._id2column, self._column2id = [], {} + + if isinstance(operation, FeatureSelectionOperation): + if operation.original_columns: + self._id2column = list(array(operation.original_columns)[operation.selected_columns]) + self._column2id = list2dict(self._id2column) + else: + self._id2column, self._column2id = [],{} + + matrix_ = operation.project(matrix_) + return matrix_ + + + def add_rows(self, matrix_, id2row): + """ + Adds rows to a peripheral space. + + Args: + matrix_: Matrix type, the matrix of the elements to be added. + id2row: list, string identifiers of the rows to be added. + + Modifies the current space by appending the new rows. + All operations of the core space are projected to the new rows. 
+ + Raises: + ValueError: if attempting to add row strings which are already + in the space. + matrix of the new data is not consistent in shape + with the current data matrix. + """ + + try: + self._row2id = add_items_to_dict(self.row2id, id2row) + except ValueError: + raise ValueError("Found duplicate keys when appending rows to\ + peripheral space.") + + if matrix_.mat.shape[0] != len(id2row): + raise ValueError("Matrix shape inconsistent with no. of rows:%s %s" + % (matrix_.mat.shape, len(id2row))) + + self._id2row = self.id2row + id2row + matrix_ = self._project_core_operations(matrix_) + + self._cooccurrence_matrix = self._cooccurrence_matrix.vstack(matrix_) + assert_shape_consistent(self.cooccurrence_matrix, self.id2row, + self.id2column, self.row2id, self.column2id) + + @classmethod + def build(cls, core_space, **kwargs): + """ + Reads in data files and extracts the data to construct a semantic space. + + If the data is read in dense format and no columns are provided, + the column indexing structures are set to empty. + + Args: + data: file containing the counts + format: format on the input data file: one of sm/dm + rows: file containing the row elements. Optional, if not provided, + extracted from the data file. + cols: file containing the column elements + + Returns: + A semantic space build from the input data files. + + Raises: + ValueError: if one of data/format arguments is missing. + if cols is missing and format is "sm" + if the input columns provided are not consistent with + the shape of the matrix (for "dm" format) + + """ + + sp = Space.build(**kwargs) + + mat = sp._cooccurrence_matrix + id2row = sp.id2row + row2id = sp.row2id + return PeripheralSpace(core_space, mat, id2row, row2id) + + + + diff --git a/modules/composes/semantic_space/space.py b/modules/composes/semantic_space/space.py new file mode 100755 index 0000000..df29e04 --- /dev/null +++ b/modules/composes/semantic_space/space.py @@ -0,0 +1,649 @@ +''' +Created on Sep 21, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +import time +import logging +from numpy import array +from numpy import prod +from composes.utils.space_utils import list2dict +from composes.utils.space_utils import assert_dict_match_list +from composes.utils.space_utils import assert_shape_consistent +from composes.utils.gen_utils import assert_is_instance +from composes.utils.space_utils import add_items_to_dict +from composes.utils.matrix_utils import resolve_type_conflict +from composes.utils.matrix_utils import get_type_of_largest +from composes.matrix.matrix import Matrix +from composes.matrix.dense_matrix import DenseMatrix +from composes.matrix.sparse_matrix import SparseMatrix +from composes.semantic_space.operation import FeatureSelectionOperation +from composes.semantic_space.operation import DimensionalityReductionOperation +from composes.similarity.similarity import Similarity +from composes.transformation.scaling.scaling import Scaling +from composes.transformation.dim_reduction.dimensionality_reduction import DimensionalityReduction +from composes.transformation.feature_selection.feature_selection import FeatureSelection +from composes.exception.illegal_state_error import IllegalOperationError +from composes.utils import log_utils as log +from composes.utils.io_utils import read_sparse_space_data +from composes.utils.io_utils import extract_indexing_structs +from composes.utils.io_utils import read_dense_space_data +from composes.utils.io_utils import create_parent_directories +from composes.utils.io_utils import 
print_list +from composes.utils.io_utils import print_cooc_mat_dense_format +from composes.utils.io_utils import print_cooc_mat_sparse_format + + +logger = logging.getLogger(__name__) + + +class Space(object): + """ + This class implements semantic spaces. + + A semantic space describes a list of targets (words, phrases, etc.) + in terms of co-occurrence with contextual features. + + It contains a matrix storing (some type of) co-occurrence + strength values between targets and contextual features: by convention, + targets are rows and features are columns. The space also stores structures + that encode the mappings between the matrix row/column indices and the + associated target/context-feature strings. + + Transformations which rescale the matrix elements can be applied + to a semantic space. A semantic also space allows for similarity + computations between row elements of the space. + + """ + + def __init__(self, matrix_, id2row, id2column, row2id=None, column2id=None, + operations=[], element_shape=None): + """ + Constructor. + + Args: + matrix_: Matrix type, the data matrix of the space + id2row: list, the row elements + id2column: list, the column elements + row2id: dictionary, maps row strings to ids. Optional, built from + id2row by default. + column2id: dictionary, maps col strings to ids. Optional, built + from id2column by default + operations: list of operations already performed on the input + matrix, Optional, by default set to empty. + element_shape: tuple of int, the shape on row elements. Optional, + by default row elements are one-dimensional and element_shape is + (no_cols, ). Used in 3D composition. + + Returns: + A semantic space (type Space) + + Raises: + TypeError: if matrix_ is not of the correct type + ValueError: if element shape is not consistent with + the size of matrix rows + if the matrix and the provided row and column + indexing structures are not of consistent shapes. + + """ + assert_is_instance(matrix_, Matrix) + assert_is_instance(id2row, list) + assert_is_instance(id2column, list) + + if row2id is None: + row2id = list2dict(id2row) + else: + assert_dict_match_list(row2id, id2row) + + if column2id is None: + column2id = list2dict(id2column) + else: + assert_dict_match_list(column2id, id2column) + + assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id) + + self._cooccurrence_matrix = matrix_ + self._row2id = row2id + self._id2row = id2row + self._column2id = column2id + self._id2column = id2column + self._operations = operations + + if element_shape: + if prod(element_shape) != self._cooccurrence_matrix.shape[1]: + raise ValueError("Trying to assign invalid element shape:\ + element_shape: %s, matrix columns: %s" + % (str(element_shape), + str(self._cooccurrence_matrix.shape[1]))) + + # NOTE: watch out here, can cause bugs, if we change the dimension + # of a regular space and we do not create a new space + self._element_shape = element_shape + else: + self._element_shape = (self._cooccurrence_matrix.shape[1],) + + + def apply(self, transformation): + """ + Applies a transformation on the current space. + + All transformations affect the data matrix. If the transformation + reduces the dimensionality of the space, the column indexing + structures are also updated. The operation applied is appended + to the list of operations that the space holds. + + Args: + transformation: of type Scaling, DimensionalityReduction or + FeatureSelection + + Returns: + A new space on which the transformation has been applied. 
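Besides `Space.build` (shown further down), a `Space` can be constructed directly from a matrix plus the row and column string lists, which is convenient when the matrix has already been manipulated at the numpy/scipy level. A minimal sketch with toy data (assumes `modules/` is on the path):

```
import numpy as np
from composes.matrix.sparse_matrix import SparseMatrix
from composes.semantic_space.space import Space

counts = SparseMatrix(np.array([[2., 0., 1.],
                                [0., 3., 1.]]))
space = Space(counts, ["walk", "run"], ["leg", "arm", "fast"])
print(space.cooccurrence_matrix.shape)   # (2, 3)
print(space.id2row)                      # ['walk', 'run']
```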
+ + """ + start = time.time() + #TODO , FeatureSelection, DimReduction .. + assert_is_instance(transformation, (Scaling, DimensionalityReduction, + FeatureSelection)) + op = transformation.create_operation() + new_matrix = op.apply(self.cooccurrence_matrix) + + new_operations = list(self.operations) + new_operations.append(op) + + id2row, row2id = list(self.id2row), self.row2id.copy() + + + if isinstance(op, DimensionalityReductionOperation): + self.assert_1dim_element() + id2column, column2id = [], {} + elif isinstance(op, FeatureSelectionOperation): + self.assert_1dim_element() + op.original_columns = self.id2column + + if op.original_columns: + id2column = list(array(op.original_columns)[op.selected_columns]) + column2id = list2dict(id2column) + else: + id2column, column2id = [],{} + else: + id2column, column2id = list(self.id2column), self.column2id.copy() + + log.print_transformation_info(logger, transformation, 1, + "\nApplied transformation:") + log.print_matrix_info(logger, self.cooccurrence_matrix, 2, + "Original semantic space:") + log.print_matrix_info(logger, new_matrix, 2, "Resulted semantic space:") + log.print_time_info(logger, time.time(), start, 2) + + return Space(new_matrix, id2row, id2column, + row2id, column2id, operations = new_operations) + + def get_sim(self, word1, word2, similarity, space2=None): + """ + Computes the similarity between two targets in the semantic + space. + + If one of the two targets to be compared is not found, it returns 0.. + + Args: + word1: string + word2: string + similarity: of type Similarity, the similarity measure to be used + space2: Space type, Optional. If provided, word2 is interpreted in + this space, rather than the current space. Default, both words + are interpreted in the current space. + Returns: + scalar, similarity score + + """ + + assert_is_instance(similarity, Similarity) + + try: + v1 = self.get_row(word1) + except KeyError: + print "Row string %s not found, returning 0.0" % (word1) + return 0.0 + try: + if space2 is None: + v2 = self.get_row(word2) + else: + v2 = space2.get_row(word2) + except KeyError: + print "Row string %s not found, returning 0.0" % (word2) + return 0.0 + + [v1, v2] = resolve_type_conflict([v1, v2], DenseMatrix) + return similarity.get_sim(v1, v2) + + def get_sims(self, word_pair_list, similarity, space2=None): + """ + Computes the similarity between two LIST of targets in the semantic + space. + + If one of the two targets to be compared is not found, it returns 0.. + + Args: + word_pair_list: list of (string, string) tuples. Words to be compared. + similarity: of type Similarity, the similarity measure to be used + space2: Space type, Optional. If provided, the second word of the word pairs + is interpreted in this space, rather than the current space. + Default, both words are interpreted in the current space. + Returns: + list, list of similarity scores + + """ + sims = [] + + for word1, word2 in word_pair_list: + sims.append(self.get_sim(word1, word2, similarity, space2)) + + return sims + + def get_neighbours(self, word, no_neighbours, similarity, + space2=None): + """ + Computes the neighbours of a word in the semantic space. + + Args: + word: string, target word + no_neighbours: int, the number of neighbours desired + similarity: of type Similarity, the similarity measure to be used + space2: Space type, Optional. If provided, the neighbours are + retrieved from this space, rather than the current space. + Default, neighbours are retrieved from the current space. 
+ + Returns: + list of (neighbour_string, similarity_value) tuples. + + Raises: + KeyError: if the word is not found in the semantic space. + + """ + + start = time.time() + assert_is_instance(similarity, Similarity) + vector = self.get_row(word) + + if space2 is None: + id2row = self.id2row + sims_to_matrix = similarity.get_sims_to_matrix(vector, + self.cooccurrence_matrix) + else: + mat_type = type(space2.cooccurrence_matrix) + if not isinstance(vector, mat_type): + vector = mat_type(vector) + + sims_to_matrix = similarity.get_sims_to_matrix(vector, + space2.cooccurrence_matrix) + id2row = space2.id2row + + sorted_perm = sims_to_matrix.sorted_permutation(sims_to_matrix.sum, 1) + no_neighbours = min(no_neighbours, len(id2row)) + result = [] + + for count in range(no_neighbours): + i = sorted_perm[count] + result.append((id2row[i], sims_to_matrix[i,0])) + + log.print_info(logger, 1, "\nGetting neighbours of:%s" % (word)) + log.print_name(logger, similarity, 1, "Similarity:") + log.print_time_info(logger, time.time(), start, 2) + return result + + @classmethod + def vstack(cls, space1, space2): + """ + Classmethod. Stacks two semantic spaces. + + The rows in the two spaces are concatenated. + + Args: + space1, space2: spaces to be stacked, of type Space + + Returns: + Stacked space, type Space. + + Raises: + ValueError: if the spaces have different number of columns + or their columns are not identical + + """ + if space1.cooccurrence_matrix.shape[1] != space2.cooccurrence_matrix.shape[1]: + raise ValueError("Inconsistent shapes: %s, %s" + % (space1.cooccurrence_matrix.shape[1], + space2.cooccurrence_matrix.shape[1])) + + if space1.id2column != space2.id2column: + raise ValueError("Identical columns required") + + new_row2id = add_items_to_dict(space1.row2id.copy(), space2.id2row) + new_id2row = space1.id2row + space2.id2row + + matrix_type = get_type_of_largest([space1.cooccurrence_matrix, + space2.cooccurrence_matrix]) + [new_mat1, new_mat2] = resolve_type_conflict([space1.cooccurrence_matrix, + space2.cooccurrence_matrix], + matrix_type) + + new_mat = new_mat1.vstack(new_mat2) + + log.print_info(logger, 1, "\nVertical stack of two spaces") + log.print_matrix_info(logger, space1.cooccurrence_matrix, 2, + "Semantic space 1:") + log.print_matrix_info(logger, space2.cooccurrence_matrix, 2, + "Semantic space 2:") + log.print_matrix_info(logger, new_mat, 2, "Resulted semantic space:") + + return Space(new_mat, new_id2row, list(space1.id2column), new_row2id, + space1.column2id.copy(), operations=[]) + + def to_dense(self): + """ + Converts the matrix of the current space to DenseMatrix + """ + self._cooccurrence_matrix = DenseMatrix(self.cooccurrence_matrix) + + def to_sparse(self): + """ + Converts the matrix of the current space to SparseMatrix + """ + self._cooccurrence_matrix = SparseMatrix(self.cooccurrence_matrix) + + def get_row(self, word): + """ + Returns the row vector of a word. + + Args: + word: string + + Returns: Matrix type (of shape (1, no_cols)), the row of the word argument. + + Raises: + KeyError: if the word is not found in the space + """ + return self.cooccurrence_matrix[self.row2id[word],:] + + def get_rows(self, words): + """ + Returns the sub-matrix corresponding to a list of words. + + Args: + words: list of strings + + Returns: Matrix type (of shape (len(words), no_cols)), + the sub-matrix containing the words given as an input. 
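`Space.vstack` concatenates the rows of two spaces whose column lists are identical, keeping the column indexing of the first. A toy sketch:

```
import numpy as np
from composes.matrix.sparse_matrix import SparseMatrix
from composes.semantic_space.space import Space

s1 = Space(SparseMatrix(np.array([[1., 2.]])), ["walk"], ["c1", "c2"])
s2 = Space(SparseMatrix(np.array([[3., 4.]])), ["run"],  ["c1", "c2"])

merged = Space.vstack(s1, s2)
print(merged.id2row)           # ['walk', 'run']
print(merged.get_row("run"))   # the row vector contributed by s2
```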
+ + Raises: + KeyError: if one of words is not found in the space + """ + assert_is_instance(words, list) + row_ids = [] + for word in words: + row_ids.append(self.row2id[word]) + + return self.cooccurrence_matrix[row_ids,:] + + def set_cooccurrence_matrix(self, matrix_): + assert_is_instance(matrix_, Matrix) + assert_shape_consistent(matrix_, self.row2id, self.id2row, + self.column2id, self.id2column) + self._cooccurrence_matrix = matrix_ + + def get_cooccurrence_matrix(self): + return self._cooccurrence_matrix + + cooccurrence_matrix = property(get_cooccurrence_matrix) + """ + Co-occurrence matrix associated to the semantic space, of type Matrix. + + """ + def get_row2id(self): + return self._row2id + + row2id = property(get_row2id) + """ + Dictionary, maps row strings to integer ids. + """ + + def get_id2row(self): + return self._id2row + + id2row = property(get_id2row) + """ + List of strings, the row elements. + """ + def get_column2id(self): + return self._column2id + + column2id = property(get_column2id) + """ + Dictionary, maps column strings to integer ids. + """ + + def get_id2column(self): + return self._id2column + + id2column = property(get_id2column) + """ + List of strings, the column elements. + """ + + def get_element_shape(self): + return self._element_shape + + element_shape = property(get_element_shape) + """ + Shape of row elements, of type tuple. By default, in standard spaces, + element_shape=(no_cols,). + + Used in composition models which build + word representations which are matrices or higher order tensors, instead + of simple vectors. If the representation of a word is a matrix of shape + (2,2) for example, then element_shape=(2,2). The actual space matrix + stores each element as a linearized vector, just as in standard spaces. + """ + + def get_operations(self): + return self._operations + + operations = property(get_operations) + """ + List of operations which have been applied on the semantic space. List of + Operation type objects. + + The operations, together with their associated side information, are stored + because they may need to be projected on peripheral data. + """ + + def assert_1dim_element(self): + """ + Asserts that the elements of the space are one dimensional. + + """ + if len(self.element_shape) > 1: + raise IllegalOperationError("Operation not allowed on spaces with\ + element shape: %s" % self.element_shape) + + @classmethod + def build(cls, **kwargs): + """ + Reads in data files and extracts the data to construct a semantic space. + + If the data is read in dense format and no columns are provided, + the column indexing structures are set to empty. + + Args: + data: file containing the counts + format: format on the input data file: one of sm/dm + rows: file containing the row elements. Optional, if not provided, + extracted from the data file. + cols: file containing the column elements + + Returns: + A semantic space build from the input data files. + + Raises: + ValueError: if one of data/format arguments is missing. 
+ if cols is missing and format is "sm" + if the input columns provided are not consistent with + the shape of the matrix (for "dm" format) + + """ + start = time.time() + id2row = None + id2column = None + + if "data" in kwargs: + data_file = kwargs["data"] + else: + raise ValueError("Space data file needs to be specified") + + if "format" in kwargs: + format_ = kwargs["format"] + if not format_ in ["dm","sm"]: + raise ValueError("Unrecognized format: %s" % format_) + else: + raise ValueError("Format of input files needs to be specified") + + if "rows" in kwargs and not kwargs["rows"] is None: + [id2row], [row2id] = extract_indexing_structs(kwargs["rows"], [0]) + + if "cols" in kwargs and not kwargs["cols"] is None: + [id2column], [column2id] = extract_indexing_structs(kwargs["cols"], [0]) + elif format_ == "sm": + raise ValueError("Need to specify column file when input format is sm!") + + if format_ == "sm": + if id2row is None: + [id2row], [row2id] = extract_indexing_structs(data_file, [0]) + mat = read_sparse_space_data(data_file, row2id, column2id) + + else: + if id2row is None: + [id2row],[row2id] = extract_indexing_structs(data_file, [0]) + mat = read_dense_space_data(data_file, row2id) + + if id2column and len(id2column) != mat.shape[1]: + raise ValueError("Columns provided inconsistent with shape of input matrix!") + + if id2column is None: + id2column, column2id = [], {} + + log.print_matrix_info(logger, mat, 1, "Built semantic space:") + log.print_time_info(logger, time.time(), start, 2) + return Space(mat, id2row, id2column, row2id, column2id) + + def export(self, file_prefix, **kwargs): + """ + Exports the current space to disk. + If the space has no column information, it cannot be exported in + sparse format (sm). + + Args: + file_prefix: string, prefix of the files to be exported + format: string, one of dm/sm + + Prints: + - matrix in file_prefix. + - row elements in file_prefix. + - col elements in file_prefix. 
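`export` writes a space back to disk next to its row and column files; the `.rows` and `.cols` suffixes come from `_export_row_column` below, while the matrix file itself is produced by the io_utils printers (its exact suffix is not visible in this diff). A toy sketch with a hypothetical output prefix:

```
import numpy as np
from composes.matrix.sparse_matrix import SparseMatrix
from composes.semantic_space.space import Space

space = Space(SparseMatrix(np.array([[2., 1.]])), ["walk"], ["c1", "c2"])
space.export("output/toy_space", format="sm")
# -> output/toy_space.rows, output/toy_space.cols, plus the sparse matrix file
```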
+ + Raises: + ValueError: if the space has no column info and "sm" exporting + is attempted + NotImplementedError: the space matrix is dense and "sm" exporting + is attempted + + """ + + start = time.time() + create_parent_directories(file_prefix) + format_ = "dm" + if "format" in kwargs: + format_ = kwargs["format"] + if not format_ in ["dm","sm"]: + raise ValueError("Unrecognized format: %s" %format_) + elif format_ == "dm": + print_cooc_mat_dense_format(self.cooccurrence_matrix, + self.id2row, file_prefix) + else: + print_cooc_mat_sparse_format(self.cooccurrence_matrix, + self.id2row, + self.id2column, file_prefix) + self._export_row_column(file_prefix) + + log.print_matrix_info(logger, self.cooccurrence_matrix, 1, + "Printed semantic space:") + log.print_time_info(logger, time.time(), start, 2) + + def _export_row_column(self, file_prefix): + row_file = "%s.%s" %(file_prefix, "rows") + column_file = "%s.%s" %(file_prefix, "cols") + + if self.column2id: + print_list(self.id2column, column_file) + + print_list(self.id2row, row_file) + + + +""" +def build(cls, **kwargs): +FANCY BUILD + start = time.time() + id2row = None + id2column = None + + if "data" in kwargs: + data_file = kwargs["data"] + else: + raise ValueError("Space data file needs to be specified") + + if "format" in kwargs: + format_ = kwargs["format"] + if not format_ in ["dm","sm"]: + raise ValueError("Unrecognized format: %s" % format_) + else: + raise ValueError("Format of input files needs to be specified") + + if "rows" in kwargs and not kwargs["rows"] is None: + [id2row], [row2id] = extract_indexing_structs(kwargs["rows"], [0]) + + if "cols" in kwargs and not kwargs["cols"] is None: + [id2column], [column2id] = extract_indexing_structs(kwargs["cols"], [0]) + + if format_ == "sm": + if id2row is None and id2column is None: + ([id2row, id2column], + [row2id, column2id]) = extract_indexing_structs(data_file, [0, 1]) + if id2row is None: + [id2row], [row2id] = extract_indexing_structs(data_file, [0]) + if id2column is None: + [id2column], [column2id] = extract_indexing_structs(data_file, [1]) + + mat = read_sparse_space_data(data_file, row2id, column2id) + else: + if id2row is None: + [id2row],[row2id] = extract_indexing_structs(data_file, [0]) + if id2column is None: + id2column, column2id = [], {} + + mat = read_dense_space_data(data_file, row2id) + + if id2column and len(id2column) != mat.shape[1]: + raise ValueError("Columns provided inconsistent with shape of input matrix!") + + log.print_matrix_info(logger, mat, 1, "Built semantic space:") + log.print_time_info(logger, time.time(), start, 2) + return Space(mat, id2row, id2column, row2id, column2id) +FANCY BUILD + + + + Some transformations, such as weighings, only scale the values + in the space matrix, while others, such as dimensionality + reduction, or feature selection, alter the set of + contextual features. 
+""" diff --git a/modules/composes/similarity/__init__.py b/modules/composes/similarity/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/similarity/cos.py b/modules/composes/similarity/cos.py new file mode 100755 index 0000000..b4f6038 --- /dev/null +++ b/modules/composes/similarity/cos.py @@ -0,0 +1,39 @@ +""" +Created on Oct 2, 2012 + +@author: Georgiana Dinu, Pham The Nghia +""" +import numpy as np + +from composes.utils.py_matrix_utils import nonzero_invert + +from composes.similarity.similarity import Similarity +from composes.similarity.dot_prod import DotProdSimilarity + + +class CosSimilarity(Similarity): + """ + Computes the cosine similarity of two vectors. + + :math:`sim(\\vec{u},\\vec{v}) = \\frac{<\\vec{u},\\vec{v}>}{\\sqrt{||\\vec{u}||||\\vec{v}||}}` + + """ + + def _sim(self, v1, v2): + if v1.norm() == 0 or v2.norm() == 0: + return 0.0 + s = DotProdSimilarity()._sim(v1, v2) / np.double(v1.norm() * v2.norm()) + return s + + def _sims_to_matrix(self, vector, matrix_): + sims = DotProdSimilarity()._sims_to_matrix(vector, matrix_) + + vector_norm = vector.norm() + row_norms = vector_norm * matrix_.norm(1) + row_norms = nonzero_invert(row_norms) + + return sims.scale_rows(row_norms) + + + + diff --git a/modules/composes/similarity/dot_prod.py b/modules/composes/similarity/dot_prod.py new file mode 100755 index 0000000..9323e9a --- /dev/null +++ b/modules/composes/similarity/dot_prod.py @@ -0,0 +1,20 @@ +""" +Created on Oct 2, 2012 + +@author: Georgiana Dinu, Pham The Nghia +""" +from composes.similarity.similarity import Similarity + + +class DotProdSimilarity(Similarity): + """ + Computes the scalar product (dot product) of two vectors. + + :math:`sim(\\vec{u},\\vec{v}) = <\\vec{u},\\vec{v}> = \\sum_iu_iv_i` + + """ + def _sim(self, v1, v2): + return v1.multiply(v2).sum() + + def _sims_to_matrix(self, vector, matrix_): + return matrix_ * vector.transpose() diff --git a/modules/composes/similarity/euclidean.py b/modules/composes/similarity/euclidean.py new file mode 100755 index 0000000..1a307bf --- /dev/null +++ b/modules/composes/similarity/euclidean.py @@ -0,0 +1,18 @@ +""" +Created on Oct 2, 2012 + +@author: Georgiana Dinu, Pham The Nghia +""" +from composes.similarity.similarity import Similarity + + +class EuclideanSimilarity(Similarity): + """ + Computes the euclidean similarity of two vectors as the inverse of their + euclidean distance. + + :math:`sim(\\vec{u},\\vec{v}) = \\frac{1}{||\\vec{u}-\\vec{v}|| + 1}` + """ + + def _sim(self, v1, v2): + return 1 / (1 + (v1 - v2).norm()) diff --git a/modules/composes/similarity/lin.py b/modules/composes/similarity/lin.py new file mode 100755 index 0000000..604a804 --- /dev/null +++ b/modules/composes/similarity/lin.py @@ -0,0 +1,33 @@ +""" +Created on Oct 2, 2012 + +@author: Georgiana Dinu, Pham The Nghia +""" +import numpy as np + +from composes.similarity.similarity import Similarity + + +class LinSimilarity(Similarity): + """ + Computes the Lin similarity of two vectors. + + :math:`sim(\\vec{u},\\vec{v}) = \\frac{\\sum_{i \\in I}(u_i+v_i)}{\\sum_iu_i + \\sum_iv_i}` + + Where :math:`I=\\{i | u_i > 0 \\text{ and } v_i > 0\\}`, the set of components + on which both vectors are strictly positive. 
+ + """ + + def _sim(self, v1, v2): + + common = v1.multiply(v2) + common.to_ones() + denom = v1.sum() + v2.sum() + + if denom == 0: + return 0 + else: + return common.multiply(v1 + v2).sum() / np.double(denom) + + diff --git a/modules/composes/similarity/similarity.py b/modules/composes/similarity/similarity.py new file mode 100755 index 0000000..3d003fe --- /dev/null +++ b/modules/composes/similarity/similarity.py @@ -0,0 +1,46 @@ +""" +Created on Oct 2, 2012 + +@author: Georgiana Dinu, Pham The Nghia +""" +import numpy as np + +from composes.utils.matrix_utils import ( + assert_is_array_or_matrix, + to_compatible_matrix_types, +) + + +class Similarity(object): + + def get_sim(self, v1, v2): + + assert_is_array_or_matrix(v1) + assert_is_array_or_matrix(v2) + + # TODO: figure out where these asserts belong!! + v1, v2 = to_compatible_matrix_types(v1, v2) + v1.assert_same_shape(v2) + + return self._sim(v1, v2) + + def get_sims_to_matrix(self, vector, matrix_): + + assert_is_array_or_matrix(vector) + assert_is_array_or_matrix(matrix_) + + vector, matrix_ = to_compatible_matrix_types(vector, matrix_) + + if vector.shape[1] != matrix_.shape[1] or vector.shape[0] != 1: + raise ValueError( + 'Inconsistent shapes {0} and {1}'.format(vector.shape, matrix_.shape) + ) + + return self._sims_to_matrix(vector, matrix_) + + def _sims_to_matrix(self, vector, matrix_): + + result = np.zeros(shape=(matrix_.shape[0], 1)) + for i in range(matrix_.shape[0]): + result[i] = self._sim(vector, matrix_[i, :]) + return type(matrix_)(result) diff --git a/modules/composes/transformation/__init__.py b/modules/composes/transformation/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/transformation/dim_reduction/__init__.py b/modules/composes/transformation/dim_reduction/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/transformation/dim_reduction/dimensionality_reduction.py b/modules/composes/transformation/dim_reduction/dimensionality_reduction.py new file mode 100755 index 0000000..5b2776a --- /dev/null +++ b/modules/composes/transformation/dim_reduction/dimensionality_reduction.py @@ -0,0 +1,37 @@ +''' +Created on Sep 28, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' +from composes.semantic_space.operation import DimensionalityReductionOperation + +class DimensionalityReduction(object): + ''' + classdocs + ''' + + _name = "we are NOT stupid" + + def __init__(self, reduced_dimension): + ''' + Constructor + ''' + if reduced_dimension <= 0: + raise ValueError("Cannot reduce to non-positive dimensionality: %d" + % reduced_dimension) + self._reduced_dimension = reduced_dimension + + def create_operation(self): + return DimensionalityReductionOperation(self) + + def get_reduced_dimension(self): + return self._reduced_dimension + + def get_name(self): + return self._name + + def __str__(self): + return self._name + + name = property(get_name) + reduced_dimension = property(get_reduced_dimension) diff --git a/modules/composes/transformation/dim_reduction/nmf.py b/modules/composes/transformation/dim_reduction/nmf.py new file mode 100755 index 0000000..b7251db --- /dev/null +++ b/modules/composes/transformation/dim_reduction/nmf.py @@ -0,0 +1,136 @@ +''' +Created on Oct 1, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +import numpy as np +from dimensionality_reduction import DimensionalityReduction +from composes.matrix.linalg import Linalg +from math import sqrt + +class Nmf(DimensionalityReduction): + """ + Performs Non-negative Matrix 
Factorization to reduced dimension :math:`k`. + + Given an input non-negative matrix :math:`X`, it computes the decomposition: + + :math:`X \\approx WH` where W and H are non-negative matrices which minimize + :math:`||X-WH||_{2}` + + It returns the matrix W. + """ + + _name = "nmf" + + def __init__(self, reduced_dimension): + ''' + Constructor + ''' + super(Nmf, self).__init__(reduced_dimension) + + def apply(self, matrix_): + + matrix_.assert_positive() + #w_init, h_init = self.nndsvd_init(matrix_) + w_init, h_init = self.v_col_init(matrix_) + #w_init, h_init = self.random_init(matrix_) + w, h = Linalg.nmf(matrix_, w_init, h_init) + return w, Linalg.pinv(h) + + def random_init(self, matrix_): + + # TODO: implement the fancier but still fast init (from nimfa: v_col) + rndcol = np.random.random_integers(0, matrix_.shape[1] - 1, + self._reduced_dimension) + + rndrow = np.random.random_integers(0, matrix_.shape[0] - 1, + self._reduced_dimension) + + #otherwise we would have had to convert to DenseMatrix/SparseMatrix + #type(matrix_)(result) + w = matrix_[:, rndcol] + h = matrix_[rndrow, :] + + return w, h + + def v_col_init(self, matrix_): + w = np.zeros((matrix_.shape[0], self._reduced_dimension)) + h = np.zeros((self._reduced_dimension, matrix_.shape[1])) + + #in case there are less than 5 rows or columns + p_col = matrix_.shape[1]//5 + 1 + p_row = matrix_.shape[0]//5 + 1 + for i in range(self._reduced_dimension): + + rndcol = np.random.random_integers(0, matrix_.shape[1] - 1, + p_col) + + rndrow = np.random.random_integers(0, matrix_.shape[0] - 1, + p_row) + + w[:, i] = (matrix_[:, rndcol].sum(1)/float(p_col)).flatten() + h[i, :] = (matrix_[rndrow, :].sum(0)/float(p_row)).flatten() + + w = type(matrix_)(w) + h = type(matrix_)(h) + + return w, h + + def nndsvd_init(self,matrix_): + def matrix_abs(mat_): + mat_p = mat_.get_non_negative() + mat_n_abs = mat_p - mat_ + return mat_p + mat_n_abs + + def padd_zeros(matrix_, axis, thickness): + matrix_type = type(matrix_) + if axis == 0: + append_mat = matrix_type(np.zeros((thickness, matrix_.shape[1]))) + return matrix_.vstack(append_mat) + elif axis == 1: + append_mat = matrix_type(np.zeros((matrix_.shape[0], thickness))) + return matrix_.hstack(append_mat) + + u, s, v = Linalg.svd(matrix_, self._reduced_dimension); + + rank = u.shape[1] + w = [[]]*rank + h = [[]]*rank + + vt = v.transpose() + + w[0] = sqrt(s[0]) * matrix_abs(u[:,0]) + h[0] = sqrt(s[0]) * matrix_abs(vt[0,:]) + + for i in range(1,rank): + uu = u[:,i] + vv = vt[i,:] + uup = uu.get_non_negative() + uun = uup - uu + vvp = vv.get_non_negative() + vvn = vvp - vv + + n_uup = uup.norm() + n_uun = uun.norm() + n_vvp = vvp.norm() + n_vvn = vvn.norm() + + termp = n_uup * n_vvp; termn = n_uun * n_vvn + if (termp >= termn): + w[i] = sqrt(s[i] * termp) * uup / n_uup + h[i] = sqrt(s[i] * termp) * vvp / n_vvp + else: + w[i] = sqrt(s[i] * termn) * uun / n_uun + h[i] = sqrt(s[i] * termn) * vvn / n_vvn + + w = matrix_.nary_hstack(w) + h = matrix_.nary_vstack(h) + + w.remove_small_values(0.0000000001) + h.remove_small_values(0.0000000001) + + if (rank < self._reduced_dimension): + w = padd_zeros(w, 1, self._reduced_dimension - rank) + h = padd_zeros(h, 0, self._reduced_dimension - rank) + return w,h diff --git a/modules/composes/transformation/dim_reduction/svd.py b/modules/composes/transformation/dim_reduction/svd.py new file mode 100755 index 0000000..417a588 --- /dev/null +++ b/modules/composes/transformation/dim_reduction/svd.py @@ -0,0 +1,33 @@ +''' +Created on Sep 28, 2012 + +@author: 
Georgiana Dinu, Pham The Nghia +''' + +from dimensionality_reduction import DimensionalityReduction +from composes.matrix.linalg import Linalg + +class Svd(DimensionalityReduction): + """ + Performs truncated Singular Value Decomposition to a reduced dimension :math:`k`. + + Given an input matrix :math:`X`, it computes the decomposition: + + :math:`X = U \\Sigma V^{T}` + + It returns :math:`U \\Sigma` truncated to dimension :math:`min(k,rank(X))` + """ + + _name = "svd" + + def __init__(self, reduced_dimension): + ''' + Constructor + ''' + super(Svd, self).__init__(reduced_dimension) + + def apply(self, matrix_): + + u, s, v = Linalg.svd(matrix_, self._reduced_dimension) + return u.scale_columns(s), v + diff --git a/modules/composes/transformation/feature_selection/__init__.py b/modules/composes/transformation/feature_selection/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/transformation/feature_selection/feature_selection.py b/modules/composes/transformation/feature_selection/feature_selection.py new file mode 100755 index 0000000..2e9a86e --- /dev/null +++ b/modules/composes/transformation/feature_selection/feature_selection.py @@ -0,0 +1,27 @@ +''' +Created on Oct 5, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' +from composes.semantic_space.operation import FeatureSelectionOperation + +class FeatureSelection(object): + ''' + classdocs + ''' + + + def __init__(self, reduced_dimension): + + if reduced_dimension <= 0: + raise ValueError("Cannot reduce to non-positive dimensionality: %d" + % reduced_dimension) + self._reduced_dimension = reduced_dimension + + def create_operation(self): + return FeatureSelectionOperation(self) + + def get_reduced_dimension(self): + return self._reduced_dimension + + reduced_dimension = property(get_reduced_dimension) \ No newline at end of file diff --git a/modules/composes/transformation/feature_selection/top_feature_selection.py b/modules/composes/transformation/feature_selection/top_feature_selection.py new file mode 100755 index 0000000..1b42eb9 --- /dev/null +++ b/modules/composes/transformation/feature_selection/top_feature_selection.py @@ -0,0 +1,54 @@ +''' +Created on Oct 5, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' +from warnings import warn +from feature_selection import FeatureSelection + +class TopFeatureSelection(FeatureSelection): + """ + Sorts the columns of a space according to some criterion and returns a space + containing only the top :math:`k` ones. + + Available criteria: + + sum: Default. Ranks columns according to the sum on their elements. + + length: Ranks columns according to their vector length. 
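Both reductions are meant to be passed to `Space.apply`: `Svd(k)` replaces the columns by at most `k` latent dimensions (rows become U * Sigma), while `TopFeatureSelection(k, criterion=...)` keeps the `k` best original columns. A hedged sketch on a toy dense space (assumes `modules/` is on the path):

```
import numpy as np
from composes.matrix.dense_matrix import DenseMatrix
from composes.semantic_space.space import Space
from composes.transformation.dim_reduction.svd import Svd
from composes.transformation.feature_selection.top_feature_selection import TopFeatureSelection

space = Space(DenseMatrix(np.array([[1., 2., 0.],
                                    [0., 1., 4.]])),
              ["walk", "run"], ["c1", "c2", "c3"])

svd_space = space.apply(Svd(2))                   # 2 latent dimensions, columns dropped
top_space = space.apply(TopFeatureSelection(2))   # the 2 highest-sum original columns
print(svd_space.cooccurrence_matrix.shape)        # (2, 2)
print(top_space.id2column)                        # e.g. ['c3', 'c2']
```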
+ + """ + + _name = "top_feature_selection" + _valid_criteria = {"sum", "length"} + + def __init__(self, reduced_dimension, criterion='sum'): + ''' + Constructor + ''' + super(TopFeatureSelection, self).__init__(reduced_dimension) + + if criterion: + if criterion not in self._valid_criteria: + raise ValueError("Unrecognized criterion: %s" % criterion) + self.criterion = criterion + + def apply(self, matrix_): + + if self.criterion == "sum": + norm_function = matrix_.sum + else: + norm_function = matrix_.norm + + if self._reduced_dimension >= matrix_.shape[1]: + warn("Reduced dimension larger than number of columns!") + + no_columns = min(self._reduced_dimension, matrix_.shape[1]) + sorted_perm = matrix_.sorted_permutation(norm_function, 0) + + sorted_perm = sorted_perm[0:no_columns] + matrix_ = matrix_[:, sorted_perm] + + return matrix_, sorted_perm + + diff --git a/modules/composes/transformation/scaling/__init__.py b/modules/composes/transformation/scaling/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/transformation/scaling/epmi_weighting.py b/modules/composes/transformation/scaling/epmi_weighting.py new file mode 100755 index 0000000..e9bd407 --- /dev/null +++ b/modules/composes/transformation/scaling/epmi_weighting.py @@ -0,0 +1,52 @@ + +from scaling import Scaling +from composes.utils.py_matrix_utils import nonzero_invert + +class EpmiWeighting(Scaling): + """ + Exponential Point-wise Mutual Information. + + :math:`epmi(r,c) = \\frac{P(r,c)}{P(r)P(c)}` + + """ + + _name = 'epmi' + _uses_column_stats = True + + def apply(self, matrix_, column_marginal=None): + """ + Performs epmi weighting. + + Args: + matrix_ (Matrix): Input matrix + + column_marginal (np.ndarray): column marginals of the + core matrix if the matrix is a peripheral matrix + + Returns: + Matrix: the matrix after applying epmi. + + """ + + matrix_.assert_positive() + row_sum = matrix_.sum(axis = 1) + + if not column_marginal is None: + col_sum = column_marginal + else: + col_sum = matrix_.sum(axis = 0) + + total = col_sum.sum() + + row_sum = nonzero_invert(row_sum) + col_sum = nonzero_invert(col_sum) + col_sum = col_sum * total + + matrix_ = matrix_.scale_rows(row_sum) + matrix_ = matrix_.scale_columns(col_sum) + + return matrix_ + + def get_column_stats(self, matrix_): + return matrix_.sum(0) + diff --git a/modules/composes/transformation/scaling/normalization.py b/modules/composes/transformation/scaling/normalization.py new file mode 100755 index 0000000..13c5767 --- /dev/null +++ b/modules/composes/transformation/scaling/normalization.py @@ -0,0 +1,55 @@ +''' +Created on Oct 4, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' +from numpy import double +from warnings import warn +from scaling import Scaling + +class Normalization(Scaling): + """ + Normalizes the a space according to a some criterion. + + Available criteria: + + sum: Default. 
The result matrix :math:`X` will satisfy: :math:`\\sum_{i,j} X_{ij}=1` + + length: The result matrix :math:`X` will satisfy: :math:`\\sqrt{\\sum_{i,j} X_{ij}^2}=1` + + """ + _name = "row_normalization" + _valid_criteria = ["sum", "length"] + _uses_column_stats = True + + def __init__(self, criterion='sum'): + ''' + Constructor + ''' + if criterion: + if criterion not in self._valid_criteria: + raise ValueError("Unrecognized criterion: %s" % criterion) + self.criterion = criterion + + + def apply(self, matrix_, total=None): + + if total is None: + if self.criterion == "length": + total = matrix_.norm() + else: + total = matrix_.sum() + + if total == 0: + warn("Could not normalize: sum/length of matrix is 0.") + return matrix_ + + matrix_ = (1 / double(total)) * matrix_ + return matrix_ + + def get_column_stats(self, matrix_): + + if self.criterion == "length": + return matrix_.norm() + else: + return matrix_.sum() diff --git a/modules/composes/transformation/scaling/plmi_weighting.py b/modules/composes/transformation/scaling/plmi_weighting.py new file mode 100755 index 0000000..39f759b --- /dev/null +++ b/modules/composes/transformation/scaling/plmi_weighting.py @@ -0,0 +1,22 @@ + +from scaling import Scaling +from ppmi_weighting import PpmiWeighting + +class PlmiWeighting(Scaling): + """ + Positive Local Mutual Information. + + :math:`plmi(r,c)=ppmi(r,c)count(r,c)` + + """ + + _name = "plmi" + _uses_column_stats = True + + def apply(self, matrix_, column_marginal=None): + return matrix_.multiply(PpmiWeighting().apply(matrix_, + column_marginal)) + + + def get_column_stats(self, matrix_): + return matrix_.sum(0) \ No newline at end of file diff --git a/modules/composes/transformation/scaling/plog_weighting.py b/modules/composes/transformation/scaling/plog_weighting.py new file mode 100755 index 0000000..478102e --- /dev/null +++ b/modules/composes/transformation/scaling/plog_weighting.py @@ -0,0 +1,29 @@ + +from scaling import Scaling + +class PlogWeighting(Scaling): + """ + Positive Log Weighting + + :math:`plog(r,c)= log(r,c) \\text{ if } log(r,c) \\geq 0 \\text{ else } 0` + """ + + _name = "plog" + + def apply(self, matrix_): + ''' + Performs positive log weighting. + + Args: + matrix_ (Matrix): Input matrix + column_marginal (array): column marginals of the core matrix if the matrix is a peripheral matrix + + Returns: + Matrix: the matrix after applying plog + + ''' + matrix_ = matrix_.copy() + matrix_.plog() + return matrix_ + + diff --git a/modules/composes/transformation/scaling/ppmi_weighting.py b/modules/composes/transformation/scaling/ppmi_weighting.py new file mode 100755 index 0000000..b171a48 --- /dev/null +++ b/modules/composes/transformation/scaling/ppmi_weighting.py @@ -0,0 +1,30 @@ + +from scaling import Scaling +from epmi_weighting import EpmiWeighting + +class PpmiWeighting(Scaling): + """ + Positive Point-wise Mutual Information. 
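The weighting classes in this package fit together as follows; the sketch below is a NumPy-only illustration on made-up counts, not part of DISSECT. EPMI divides the joint probability by the product of the marginals, PPMI keeps the positive part of its logarithm (which is exactly what `PpmiWeighting` does by chaining `EpmiWeighting` with `plog`), and PLMI multiplies the PPMI value back by the raw count.

```python
# Illustrative NumPy sketch of the EPMI / PPMI / PLMI weighting schemes
# (toy counts, not part of DISSECT).
import numpy as np

counts = np.array([[10., 0., 2.],
                   [ 3., 5., 0.]])

total = counts.sum()
p_rc = counts / total                              # P(r,c)
p_r = counts.sum(axis=1, keepdims=True) / total    # P(r)
p_c = counts.sum(axis=0, keepdims=True) / total    # P(c)

epmi = p_rc / (p_r * p_c)                          # epmi(r,c) = P(r,c) / (P(r)P(c))

ppmi = np.zeros_like(epmi)
pos = epmi > 1.0                                   # pmi(r,c) > 0  <=>  epmi(r,c) > 1
ppmi[pos] = np.log(epmi[pos])                      # ppmi = max(log(epmi), 0)

plmi = ppmi * counts                               # plmi(r,c) = ppmi(r,c) * count(r,c)
print(ppmi.round(2))
```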
+ + + :math:`pmi(r,c) = log\\frac{P(r,c)}{P(r)P(c)}` + + :math:`ppmi(r,c)= pmi(r,c) \\text{ if } pmi(r,c)\\geq 0 \\text{ else } 0` + """ + + _name = "ppmi" + _uses_column_stats = True + + def apply(self, matrix_, column_marginal=None): + + matrix_ = EpmiWeighting().apply(matrix_, column_marginal) + matrix_.plog() + return matrix_ + + def get_column_stats(self, matrix_): + return matrix_.sum(0) + + """ + :math:`ppmi(r,c)=\\begin{cases}pmi(rc) & \\text{if }pmi(r,c)\\geq0 + 0 & \\text{otherwise}\\end{cases}` + """ \ No newline at end of file diff --git a/modules/composes/transformation/scaling/row_normalization.py b/modules/composes/transformation/scaling/row_normalization.py new file mode 100755 index 0000000..b6145d2 --- /dev/null +++ b/modules/composes/transformation/scaling/row_normalization.py @@ -0,0 +1,46 @@ +''' +Created on Oct 4, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from scaling import Scaling +from composes.utils.py_matrix_utils import nonzero_invert + +class RowNormalization(Scaling): + """ + Normalizes the rows of a space according to a some criterion. + + Available criteria: + + length: Default. Each row :math:`X_i` of the result matrix will satisfy: :math:`\\sqrt{\\sum_j X_{ij}^2}=1` + + + sum: Each row :math:`X_i` of the result matrix will satisfy: :math:`\\sum_j X_{ij}=1` + + """ + _name = "row_normalization" + _valid_criteria = ["sum", "length"] + + def __init__(self, criterion='length'): + ''' + Constructor + ''' + if criterion: + if criterion not in self._valid_criteria: + raise ValueError("Unrecognized criterion: %s" % criterion) + self.criterion = criterion + + + def apply(self, matrix_): + + if self.criterion == "length": + row_norms = matrix_.norm(axis=1) + else: + row_norms = matrix_.sum(axis=1) + + inv_row_norm = nonzero_invert(row_norms) + matrix_ = matrix_.scale_rows(inv_row_norm) + return matrix_ + + diff --git a/modules/composes/transformation/scaling/scaling.py b/modules/composes/transformation/scaling/scaling.py new file mode 100755 index 0000000..52765a7 --- /dev/null +++ b/modules/composes/transformation/scaling/scaling.py @@ -0,0 +1,29 @@ +''' +Created on Sep 20, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from composes.semantic_space.operation import ScalingOperation + +class Scaling(object): + ''' + classdocs + ''' + _name = "we are NOT stupid" + _uses_column_stats = False + + def get_name(self): + return self._name + + def get_uses_column_stats(self): + return self._uses_column_stats + + def create_operation(self): + return ScalingOperation(self) + + def __str__(self): + return self._name + + name = property(get_name) + uses_column_stats = property(get_uses_column_stats) \ No newline at end of file diff --git a/modules/composes/utils/__init__.py b/modules/composes/utils/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/modules/composes/utils/crossvalidation_utils.py b/modules/composes/utils/crossvalidation_utils.py new file mode 100755 index 0000000..c0a0432 --- /dev/null +++ b/modules/composes/utils/crossvalidation_utils.py @@ -0,0 +1,35 @@ +''' +Created on Oct 9, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from random import shuffle + +def get_split_indices(range_len, fold): + + if fold <= 0: + raise ValueError("Strictly positive number of folds required, received %s:" + % fold) + + indices_list = [] + if range_len < fold: + return get_split_indices(range_len, range_len) + + range_ = range(range_len) + shuffle(range_) + current_index = 0 + for i in range(fold): + if i < len(range_)%fold: + 
slice_length = range_len // fold + 1 + else: + slice_length = range_len // fold + + indices_list.append(range_[current_index:current_index + slice_length]) + current_index += slice_length + + return indices_list + +def get_submatrix_list(matrix_, indices_list): + return [matrix_[indices, :] for indices in indices_list] + diff --git a/modules/composes/utils/gen_utils.py b/modules/composes/utils/gen_utils.py new file mode 100755 index 0000000..877280a --- /dev/null +++ b/modules/composes/utils/gen_utils.py @@ -0,0 +1,29 @@ +''' +Created on May 21, 2013 + +@author: Georgiana Dinu, Pham The Nghia +''' +from composes.exception.invalid_argument_error import InvalidArgumentError + + +def assert_is_instance(object_, class_): + if not isinstance(object_, class_): + raise TypeError("expected %s, received %s" % (class_, type(object_))) + + +def get_partitions(sorted_list, min_samples): + prev_idx = 0 + range_list = [] + for i in range(1, len(sorted_list)): + if sorted_list[i] != sorted_list[i - 1]: + if i - prev_idx >= min_samples: + range_list.append((prev_idx, i)) + + prev_idx = i + + if len(sorted_list) - prev_idx >= min_samples: + range_list.append((prev_idx, len(sorted_list))) + + keys = [sorted_list[range_list[i][0]] for i in xrange(len(range_list))] + + return keys, range_list \ No newline at end of file diff --git a/modules/composes/utils/io_utils.py b/modules/composes/utils/io_utils.py new file mode 100755 index 0000000..69f7f4b --- /dev/null +++ b/modules/composes/utils/io_utils.py @@ -0,0 +1,272 @@ +''' +Created on Oct 16, 2012 + +@author: nghia +''' + +import numpy as np +try: + import cPickle as pickle +except ImportError: + import pickle +import os +import gzip as gzip +from warnings import warn +from scipy.sparse import csr_matrix +from composes.matrix.dense_matrix import DenseMatrix +from composes.matrix.sparse_matrix import SparseMatrix +from composes.utils.gen_utils import assert_is_instance +import struct + + +def save(object_, file_name): + create_parent_directories(file_name) + try: + with open(file_name, 'wb') as f: + pickle.dump(object_, f, 2) + except struct.error: + warn("object is too big, using pickle with protocol 0") + with open(file_name, 'wb') as f: + pickle.dump(object_, f, 0) + + +def load(file_name, data_type=None): + with open(file_name, 'rb') as f: + result = pickle.load(f) + + if not data_type is None: + assert_is_instance(result, data_type) + + return result + + +def create_directories(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def create_parent_directories(file_name): + parent_dir = os.path.dirname(file_name) + + if parent_dir != "" and not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + +def extract_indexing_structs(filename, field_list): + str2id = {} + id2str = [] + no_fields = len(field_list) + + str2id_list = [str2id.copy() for i in xrange(no_fields)] + id2str_list = [list(id2str) for i in xrange(no_fields)] + index_list = [0 for i in xrange(no_fields)] + max_field = max(field_list) + + if filename.endswith(".gz"): + input_stream = gzip.open(filename, "rb") + else: + input_stream = open(filename, "rb") + + for line in input_stream: + if line.strip() != "": + elements = line.strip().split() + if len(elements) <= max_field: + warn("Invalid input line:%s. 
Skipping it" % line.strip()) + else: + for field_idx, field in enumerate(field_list): + current_str = elements[field] + if not current_str in str2id_list[field_idx]: + str2id_list[field_idx][current_str] = index_list[field_idx] + id2str_list[field_idx].append(current_str) + index_list[field_idx] += 1 + + for id2str in id2str_list: + if not id2str: + raise ValueError("Found no valid data in file: %s!" % filename) + return (id2str_list, str2id_list) + + +def read_tuple_list(data_file, fields=None): + field_list = [] + result = [] + + if fields: + field_list = fields + + with open(data_file) as f: + for line in f: + line = line.strip() + if line != "": + elements = line.split() + if field_list: + try: + elements = np.array(elements)[field_list] + except IndexError: + raise IndexError("Cannot extract fields:%s from %s!" + % (field_list, data_file)) + + result.append(tuple(elements)) + + return result + + +def read_list(file_name, **kwargs): + field = None + result = [] + if "field" in kwargs: + field = kwargs["field"] + + with open(file_name) as f: + for line in f: + line = line.strip() + if line != "": + if not field is None: + try: + result.append(line.split()[field]) + except IndexError: + raise IndexError("Cannot extract field:%s from %s!" + % (field, file_name)) + else: + result.append(line) + return result + + +def read_sparse_space_data(matrix_file, row2id, column2id, dtype=np.double): + if matrix_file.endswith(".gz"): + f = gzip.open(matrix_file, "rb") + else: + f = open(matrix_file, "rb") + + no_lines = sum(1 for line in f if line.strip() != "") + f.close() + + row = np.zeros(no_lines, dtype=np.int32) + col = np.zeros(no_lines, dtype=np.int32) + + data = np.zeros(no_lines, dtype=dtype) + + if matrix_file.endswith(".gz"): + f = gzip.open(matrix_file, "rb") + else: + f = open(matrix_file, "rb") + + i = 0 + for line in f: + if line.strip() != "": + line_elements = line.strip().split() + if len(line_elements) >= 3: + [word1, word2, count] = line_elements[0:3] + if word1 in row2id and word2 in column2id: + row[i] = row2id[word1] + col[i] = column2id[word2] + data[i] = dtype(count) + i += 1 + if i % 1000000 == 0: + print "Progress...%d" % i + #if len(line_elements) > 3: + # warn("Invalid input line:%s. Expected 3 fields, ignoring additional ones!" % line.strip()) + else: + raise ValueError("Invalid row: %s, expected at least %d fields" + % (line.strip(), 3)) + + f.close() + # eliminate the extra zeros created when word1 or word2 is not row2id or col2id!! + data = data[0:i] + row = row[0:i] + col = col[0:i] + + m = SparseMatrix(csr_matrix((data, (row, col)), shape=(len(row2id), len(column2id)))) + if m.mat.nnz != i: + warn("Found 0-counts or duplicate row,column pairs. 
(Duplicate entries are summed up.)") + + return m + + +def read_dense_space_data(matrix_file, row2id, element_type=np.double): + #get number of rows and columns + if matrix_file.endswith(".gz"): + f = gzip.open(matrix_file, "rb") + else: + f = open(matrix_file, "rb") + + first_line = f.next() + no_cols = len(first_line.strip().split()) - 1 + if no_cols <= 0: + raise ValueError("Invalid row: %s, expected at least %d fields" % (first_line.strip(), 2)) + f.close() + + no_rows = len(row2id) + row_string_set = set([]) + + m = np.mat(np.zeros(shape=(no_rows, no_cols), dtype=element_type)) + + if matrix_file.endswith(".gz"): + f = gzip.open(matrix_file, "rb") + else: + f = open(matrix_file, "rb") + + for line in f: + if not line.strip() == "": + elements = line.strip().split() + if len(elements) != no_cols + 1: + raise ValueError("Invalid row: %s, expected %d fields" + % (line.strip(), no_cols + 1)) + word = elements[0] + if word in row2id: + i = row2id[word] + if word in row_string_set != 0: + warn("Found duplicate row: %s. Ignoring it." % word) + else: + m[i, :] = elements[1:] + row_string_set.add(word) + + f.close() + + return DenseMatrix(m) + + +def print_list(list_, file_name): + with open(file_name, 'w') as f: + for item in list_: + f.write(item + "\n") + + +def print_cooc_mat_sparse_format(matrix_, id2row, id2column, file_prefix): + matrix_file = "%s.%s" % (file_prefix, "sm") + if not id2column: + raise ValueError("Cannot print matrix with no column info in sparse format!") + + mat = matrix_.mat + with open(matrix_file, 'w') as f: + if isinstance(matrix_, SparseMatrix): + + data = mat.data + row_indices = mat.indptr + col_indices = mat.indices + + row_index = 0 + next_row = row_indices[1] + row = id2row[0] + for i in xrange(len(data)): + while i == next_row: + row_index += 1 + next_row = row_indices[row_index + 1] + row = id2row[row_index] + col = id2column[col_indices[i]] + f.write("%s\t%s\t%f\n" % (row, col, data[i])) + else: + for i in range(mat.shape[0]): + for j in range(mat.shape[1]): + if mat[i, j] != 0: + f.write("%s\t%s\t%f\n" % (id2row[i], id2column[j], mat[i, j])) + + +def print_cooc_mat_dense_format(matrix_, id2row, file_prefix): + matrix_file = "%s.%s" % (file_prefix, "dm") + + with open(matrix_file, 'w') as f: + for i, row in enumerate(id2row): + v = DenseMatrix(matrix_[i]).mat.flat + line = "\t".join([row] + [repr(v[j]) for j in range(len(v))]) + f.write("%s\n" % (line)) + diff --git a/modules/composes/utils/log_utils.py b/modules/composes/utils/log_utils.py new file mode 100755 index 0000000..94b3346 --- /dev/null +++ b/modules/composes/utils/log_utils.py @@ -0,0 +1,110 @@ +''' +Created on Oct 15, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from numpy import double +import logging +from composes.utils.io_utils import create_parent_directories + +def config_logging(file_name, level = logging.INFO, format_ =""): + if not file_name is None: + create_parent_directories(file_name) + logging.basicConfig(filename=file_name, level=level, format=format_) + logging.debug("start logging") + + +def get_ident(delim, ident_level): + return delim * ident_level + +def print_matrix_info(logger_, matrix_, ident_level, intro_string): + delim = " " + ident = get_ident(delim, ident_level) + logger_string = ident + intro_string + ident = ident + delim + + logger_string += ("\n%sMatrix type:%s" % (ident, type(matrix_).__name__)) + logger_string += ("\n%sMatrix shape:%sx%s" % (ident, matrix_.shape[0], + matrix_.shape[1])) + + if type(matrix_).__name__ == "SparseMatrix": + perc_nnz = 
100 * matrix_.mat.nnz/double(matrix_.shape[0]*matrix_.shape[1]) + logger_string += ("\n%sPerc. non-zero entries:%d" % (ident, perc_nnz)) + + logger_.info(logger_string) + + +def get_learner_info(learner, ident): + logger_string = "" + + if hasattr(learner, '_intercept'): + logger_string += ("\n%sUsing intercept:%s" % (ident, learner._intercept)) + + if hasattr(learner, '_crossvalidation'): + logger_string += ("\n%sUsing crossvalidation:%s" % (ident, learner._crossvalidation)) + + if learner._crossvalidation and hasattr(learner, '_folds'): + logger_string += ("\n%sUsing number of folds:%s" % (ident, learner._folds)) + + return logger_string + +def print_composition_model_info(logger_, model, ident_level, intro_string): + + delim = " " + ident = get_ident(delim, ident_level) + logger_string = ident + intro_string + ident = ident + delim + + logger_.info(logger_string) + + print_name(logger_, model, ident_level, "Composition model type:") + + logger_string = "" + if hasattr(model, '_regression_learner'): + logger_string += ("\n%sUsing regression:%s" % (ident, + type(model.regression_learner).__name__)) + logger_string += get_learner_info(model.regression_learner, ident + delim) + + logger_.info(logger_string) + +def print_transformation_info(logger_, trans, ident_level, intro_string): + delim = " " + ident = get_ident(delim, ident_level) + logger_string = ident + intro_string + ident = ident + delim + + logger_string += ("\n%sTransformation type:%s" % (ident, type(trans).__name__)) + + if hasattr(trans, '_reduced_dimension'): + logger_string += ("\n%sReduced dimension:%s" % (ident, trans.reduced_dimension)) + + + logger_.info(logger_string) + +def print_info(logger_, ident_level, text): + delim = " " + ident = get_ident(delim, ident_level) + logger_string = ident + "" + + logger_string += "\n%s%s" % (ident, text) + logger_.info(logger_string) + +def print_name(logger_, object_, ident_level, intro_string): + delim = " " + ident = get_ident(delim, ident_level) + logger_string = ident + intro_string + ident = ident + delim + + logger_string += ("\n%s%s" % (ident, type(object_).__name__)) + + logger_.info(logger_string) + +def print_time_info(logger_, end, beg, ident_level): + delim = " " + ident = get_ident(delim, ident_level) + logger_string = ident + logger_string += ("\n%sTiming:%s seconds" % (ident, end - beg)) + + logger_.info(logger_string) + diff --git a/modules/composes/utils/matrix_utils.py b/modules/composes/utils/matrix_utils.py new file mode 100755 index 0000000..3b5c9e6 --- /dev/null +++ b/modules/composes/utils/matrix_utils.py @@ -0,0 +1,103 @@ + +import numpy as np +from composes.matrix.sparse_matrix import SparseMatrix +from composes.matrix.dense_matrix import DenseMatrix +from composes.matrix.matrix import Matrix +from scipy.sparse import issparse +from py_matrix_utils import is_array +from warnings import warn + +def to_matrix(matrix_): + """ + Converts an array-like structure to a DenseMatrix/SparseMatrix + """ + if issparse(matrix_): + return SparseMatrix(matrix_) + else: + return DenseMatrix(matrix_) + +def is_array_or_matrix(data): + return is_array(data) or isinstance(data, Matrix) + + +def assert_is_array_or_matrix(data): + if not is_array_or_matrix(data): + raise TypeError("expected array-like or matrix, received %s" + % (type(data))) + +def padd_matrix(matrix_, axis, value=1): + matrix_type = type(matrix_) + if axis == 0: + append_mat = matrix_type(np.ones((1, matrix_.shape[1]))*value) + return matrix_.vstack(append_mat) + elif axis == 1: + append_mat = 
matrix_type(np.ones((matrix_.shape[0], 1))*value) + return matrix_.hstack(append_mat) + else: + raise ValueError("Invalid axis value:%s" % axis) + + +def assert_same_shape(matrix1, matrix2, axis=None): + + if axis is None: + if matrix1.shape != matrix2.shape: + raise ValueError("Inconsistent shapes") + else: + if not axis in [0, 1]: + raise ValueError("Invalid axis value: %s, expected 0 or 1." % axis) + if matrix1.shape[axis] != matrix2.shape[axis]: + raise ValueError("Inconsistent shapes") + + +def to_compatible_matrix_types(v1, v2): + + if isinstance(v1, Matrix) and isinstance(v2, Matrix): + v2 = type(v1)(v2) + elif not isinstance(v1, Matrix) and isinstance(v2, Matrix): + v1 = type(v2)(v1) + elif not isinstance(v2, Matrix) and isinstance(v1, Matrix): + v2 = type(v1)(v2) + else: + v1 = to_matrix(v1) + v2 = type(v1)(v2) + + return v1, v2 + + + +def get_type_of_largest(matrix_list): + max_dim = 0 + max_type = None + for matrix_ in matrix_list: + if matrix_.shape[0] * matrix_.shape[1] > max_dim: + max_type = type(matrix_) + max_dim = matrix_.shape[0] * matrix_.shape[1] + + return max_type + +def resolve_type_conflict(matrix_list, matrix_type): + new_matrix_list = [] + + if matrix_type_conflict(matrix_list): + warn("Efficiency warning: matrices should have the same dense/sparse type!") + for matrix_ in matrix_list: + new_matrix_list.append(matrix_type(matrix_)) + return new_matrix_list + + return list(matrix_list) + + +def matrix_type_conflict(matrix_list): + + if not matrix_list: + return False + + matrix_type = type(matrix_list[0]) + for matrix_ in matrix_list: + if not isinstance(matrix_, matrix_type): + return True + + return False + + + diff --git a/modules/composes/utils/mem_utils.py b/modules/composes/utils/mem_utils.py new file mode 100755 index 0000000..db1b474 --- /dev/null +++ b/modules/composes/utils/mem_utils.py @@ -0,0 +1,16 @@ +''' +Created on Sep 21, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +""" +Wrappers around psutil functions that display memory usage information. +""" +import numpy as np +from os import getpid +import psutil + +def get_mem_usage(): + p = psutil.Process(getpid()) + return p.get_memory_info()[0]/np.double(1024*1024) \ No newline at end of file diff --git a/modules/composes/utils/num_utils.py b/modules/composes/utils/num_utils.py new file mode 100755 index 0000000..c9cb215 --- /dev/null +++ b/modules/composes/utils/num_utils.py @@ -0,0 +1,15 @@ +''' +Created on Sep 18, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +from numbers import Number +from numbers import Integral +import numpy as np + +def is_numeric(operand): + return isinstance(operand, (Number, np.number)) + +def is_integer(operand): + return isinstance(operand, Integral) diff --git a/modules/composes/utils/py_matrix_utils.py b/modules/composes/utils/py_matrix_utils.py new file mode 100755 index 0000000..172e1d3 --- /dev/null +++ b/modules/composes/utils/py_matrix_utils.py @@ -0,0 +1,35 @@ +''' +Created on Sep 19, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' +import numpy as np +from scipy.sparse import spdiags + + +def array_to_csr_diagonal(array_): + #array_ can't be a sparse matrix, if it is dense, it has to be a row matrix + #(i.e. 
shape = (1, x)) + + flat_array = array_.flatten() + array_size = flat_array.size + csr_diag = spdiags(flat_array, [0], array_size, array_size, format = 'csr') + return csr_diag + +def is_array(operand): + return hasattr(operand, 'dtype') and hasattr(operand, 'shape') + + +def nonzero_invert(matrix_): + ''' + Performs 1/x for all x, non-zero elements of the matrix. + + Params: + matrix_: np.matrix + ''' + + matrix_ = matrix_.astype(np.double) + matrix_[matrix_ != 0] = np.array(1.0/matrix_[matrix_ != 0]).flatten() + return matrix_ + + diff --git a/modules/composes/utils/regression_learner.py b/modules/composes/utils/regression_learner.py new file mode 100755 index 0000000..fd7b641 --- /dev/null +++ b/modules/composes/utils/regression_learner.py @@ -0,0 +1,106 @@ +import numpy as np +from composes.matrix.linalg import Linalg + + +class RegressionLearner(object): + """ + Implements a set of regression methods. + + Supported regression methods are least squares regression and + ridge regression. Ridge regression can be used with generalized + cross validation. (Hastie, Tibshirani and Friedman, Second edition, + page 244) + """ + + + def __init__(self): + ''' + Constructor + ''' + + def has_intercept(self): + return self._intercept + + +class LstsqRegressionLearner(RegressionLearner): + """ + This class performs Least Squares Regression. + + It finds the matrix X which solves: + + :math:`X = argmin(||AX - B||_2)` + + It can be used with intercept or without (by default intercept=True). + + """ + + def __init__(self, intercept=True): + self._intercept = intercept + + def train(self, matrix_a, matrix_b): + return Linalg.lstsq_regression(matrix_a, matrix_b, self._intercept) + + +class RidgeRegressionLearner(RegressionLearner): + """ + This class performs Ridge Regression. + + It finds the matrix X which solves: + + :math:`X = argmin(||AX - B||_2 + \\lambda||X||_2)` + + It can be used with intercept or without (by default intercept=True). + Cross validation can be used with default :math:`\\lambda` range of + :math:`linspace(0, 5, 11)`. By default Generalized cross validation is performed. + If cross validation is set False it requires the input of a :math:`\\lambda` value. + + """ + + def __init__(self, intercept=True, param_range=None, crossvalidation=True, param=None): + self._intercept = intercept + self._param_range = param_range if param_range is not None else np.linspace(0.0, 5, 11) + + self._param = param + self._crossvalidation = crossvalidation + + if param: + self._crossvalidation = False + self._param = param + + if not self._crossvalidation and self._param is None: + raise ValueError("Cannot run (no-crossvalidation) RidgeRegression with no lambda value!") + + + def train(self, matrix_a, matrix_b): + """ + If cross validation is set to True, it performs generalized + cross validation. (Hastie, Tibshirani and Friedman, Second edition, + page 244). 
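For dense data, the generalized cross-validation criterion used by `RidgeRegressionLearner` can be written out directly. The following is an illustrative NumPy-only sketch (random toy data, no intercept column, same default lambda grid), not a drop-in replacement for `Linalg.ridge_regression`:

```python
# Illustrative GCV sketch for ridge regression (toy data, no intercept).
# GCV(lambda) = ||B - A X_lambda||^2 / (N * (1 - trace(S_lambda)/N)^2),
# where S_lambda = A (A^T A + lambda I)^-1 A^T is the hat matrix.
import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(50, 5)
B = A.dot(rng.randn(5, 1)) + 0.1 * rng.randn(50, 1)
N = A.shape[0]

best_param, best_gcv = None, np.inf
for lam in np.linspace(0.0, 5.0, 11):              # the default parameter range
    inv = np.linalg.inv(A.T.dot(A) + lam * np.eye(A.shape[1]))
    X = inv.dot(A.T).dot(B)                        # ridge solution for this lambda
    S_trace = np.trace(A.dot(inv).dot(A.T))        # trace of the hat matrix
    err = np.linalg.norm(B - A.dot(X))
    gcv = err ** 2 / (N * (1.0 - S_trace / N) ** 2)
    if gcv < best_gcv:
        best_gcv, best_param = gcv, lam

print(best_param)
```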
+ """ + + if not self._crossvalidation: + return Linalg.ridge_regression(matrix_a, matrix_b, self._param, + self._intercept)[0] + + else: + min_err_param = 0 + min_err = np.Inf + gcv_err = np.Inf + + N = matrix_a.shape[0] + for param in self._param_range: + + mat_x, S_trace, err1 = Linalg.ridge_regression(matrix_a, matrix_b, param, + self._intercept) + + nom = pow(1 - S_trace / N, 2) * N + if nom != 0: + gcv_err = (err1 * err1) / nom + + if gcv_err < min_err: + min_err = gcv_err + min_err_param = param + + #print "lambda:", min_err_param + return Linalg.ridge_regression(matrix_a, matrix_b, min_err_param, + self._intercept)[0] diff --git a/modules/composes/utils/scoring_utils.py b/modules/composes/utils/scoring_utils.py new file mode 100755 index 0000000..64dc787 --- /dev/null +++ b/modules/composes/utils/scoring_utils.py @@ -0,0 +1,64 @@ +''' +Created on Oct 17, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + +import numpy as np +from scipy import stats + + +def score(gold, prediction, method): + if len(gold) != len(prediction): + raise ValueError("The two arrays must have the same length!") + + gold = np.array(gold, dtype=np.double) + prediction = np.array(prediction, dtype=np.double) + + if method == "pearson": + return pearson(gold, prediction)[0] + elif method == "spearman": + return spearman(gold, prediction)[0] + elif method == "auc": + return auc(gold, prediction) + else: + raise NotImplementedError("Unknown scoring measure:%s" % method) + +def pearson(gold, prediction): + return stats.pearsonr(gold, prediction) + +def spearman(gold, prediction): + return stats.spearmanr(gold, prediction, None) + +def auc(gold, prediction): + + positive = float(gold[gold == 1].size) + negative = float(gold.size - positive) + + total_count = gold.size + point_set = np.empty(total_count, dtype = [('gold',float),('score',float)]) + for i in range(total_count): + if not gold[i] in (0,1): + raise ValueError("For evaluating AUC, gold scores are required to be 0 or 1.") + point_set[i]=(gold[i], prediction[i]) + + point_set.sort(order = 'score') + + xi = 1.0 + yi = 1.0 + xi_old = 1.0 + true_positive = positive + false_positive = negative + auc = 0 + + for i in range(total_count): + if (point_set[i][0] == 1): + true_positive -= 1 + yi = true_positive / positive + else: + false_positive -= 1 + xi = false_positive / negative + auc += (xi_old - xi) * yi + xi_old = xi + + return auc diff --git a/modules/composes/utils/space_utils.py b/modules/composes/utils/space_utils.py new file mode 100755 index 0000000..6cf36ae --- /dev/null +++ b/modules/composes/utils/space_utils.py @@ -0,0 +1,56 @@ +''' +Created on Sep 26, 2012 + +@author: Georgiana Dinu, Pham The Nghia +''' + + +def list2dict(list_): + return_dict = {} + + for idx, word in enumerate(list_): + if word in return_dict: + raise ValueError("duplicate string found in list: %s" % (word)) + return_dict[word] = idx + + return return_dict + +def add_items_to_dict(dict_, list_): + + no_els = len(dict_) + for idx, el in enumerate(list_): + if el in dict_: + raise ValueError("Found duplicate keys when appending elements to\ + dictionary.") + dict_[el] = no_els + idx + return dict_ + +def assert_dict_match_list(dict_, list_): + + match_err = ValueError("expected matching dictionary and list structures.") + + if not len(list_) == len(dict_): + raise match_err + for (k, v) in dict_.iteritems(): + if not list_[v] == k: + raise match_err + + +def assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id): + + no_rows = matrix_.mat.shape[0] + no_cols = 
matrix_.mat.shape[1] + + has_column_maps = column2id or id2column + + if not no_rows == len(id2row) or not no_rows == len(row2id): + raise ValueError("expected consistent shapes: %d %d %d" + % (no_rows, len(id2row), len(row2id))) + + if (has_column_maps and + (not no_cols == len(id2column) or not no_cols == len(column2id))): + raise ValueError("expected consistent shapes: %d %d %d" + % (no_cols, len(id2column), len(column2id))) + + + diff --git a/modules/cupy_utils.py b/modules/cupy_utils.py new file mode 100644 index 0000000..a0240d9 --- /dev/null +++ b/modules/cupy_utils.py @@ -0,0 +1,43 @@ +# Copyright (C) 2018 Mikel Artetxe +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import numpy + +try: + import cupy +except ImportError: + cupy = None + + +def supports_cupy(): + return cupy is not None + + +def get_cupy(): + return cupy + + +def get_array_module(x): + if cupy is not None: + return cupy.get_array_module(x) + else: + return numpy + + +def asnumpy(x): + if cupy is not None: + return cupy.asnumpy(x) + else: + return numpy.asarray(x) diff --git a/modules/dsm.py b/modules/dsm.py new file mode 100644 index 0000000..734483b --- /dev/null +++ b/modules/dsm.py @@ -0,0 +1,171 @@ +import sys +sys.path.append('../') + +import logging +import os +import itertools +from gensim import utils +try: + from gensim.models.word2vec_inner import MAX_WORDS_IN_BATCH +except ImportError: + # failed... fall back to plain numpy (20-80x slower training than the above) + MAX_WORDS_IN_BATCH = 10000 + +import gzip +import bz2 +import pickle +import numpy as np + +from composes.semantic_space.space import Space +from collections import defaultdict +from composes.utils import io_utils +from scipy.sparse import coo_matrix, csr_matrix +from composes.matrix.sparse_matrix import SparseMatrix +from composes.matrix.dense_matrix import DenseMatrix + + +# To-do: should be renamed and restructured +def save_pkl_files(dsm, dsm_prefix, save_in_one_file=False, save_as_w2v=False): + """ + Save semantic space (from DISSECT package) to different formats. 
+ :param dsm: the semantic space + :param dsm_prefix: the prefix for the output files + :param save_in_one_file: whether to save as one file (pkl or w2v) or separate files (npz for matrix and pkl for rows and columns) + :param save_as_w2v: given save_in_one_file=True, whether to save it in w2v format or pkl + """ + + # Save in a single file (for small spaces) + if save_in_one_file: + # only useful for dense spaces + if save_as_w2v: + rows = np.array(dsm.cooccurrence_matrix.get_mat()).astype(object) + id2row = np.array([word.decode('utf-8') for word in dsm.get_id2row()]) + r, d = rows.shape + id2row = id2row.reshape(-1,1) + rows = np.concatenate((id2row, rows), axis=1) + np.savetxt(dsm_prefix + '.w2v', rows, fmt=["%s"] + ['%.16g',]*d, delimiter=' ', newline='\n', header='%d %d' %(r, d), comments='', encoding='utf-8') + else: + io_utils.save(dsm, dsm_prefix + '.pkl') + + # Save in multiple files: npz for the matrix and pkl for the other data members of Space + else: + mat = coo_matrix(dsm.cooccurrence_matrix.get_mat()) + np.savez_compressed(dsm_prefix + '.npz', data=mat.data, row=mat.row, col=mat.col, shape=mat.shape) + + with open(dsm_prefix + '_row2id.pkl', 'wb') as f_out: + pickle.dump(dsm._row2id, f_out, 2) + + with open(dsm_prefix + '_id2row.pkl', 'wb') as f_out: + pickle.dump(dsm._id2row, f_out, 2) + + with open(dsm_prefix + '_column2id.pkl', 'wb') as f_out: + pickle.dump(dsm._column2id, f_out, 2) + + with open(dsm_prefix + '_id2column.pkl', 'wb') as f_out: + pickle.dump(dsm._id2column, f_out, 2) + + +def load_pkl_files(dsm_prefix): + """ + Load the space from either a single pkl file or numerous files. + :param dsm_prefix: the prefix of the input files (.pkl, .rows, .cols) + """ + + # Check whether there is a single pickle file for the Space object + if os.path.isfile(dsm_prefix + '.pkl'): + return io_utils.load(dsm_prefix + '.pkl') + + # Load the multiple files: npz for the matrix and pkl for the other data members of Space + if os.path.isfile(dsm_prefix + '.npz'): + with np.load(dsm_prefix + '.npz') as loader: + coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape']) + + cooccurrence_matrix = SparseMatrix(csr_matrix(coo)) + + with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in: + row2id = pickle.load(f_in) + + with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in: + id2row = pickle.load(f_in) + + with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in: + column2id = pickle.load(f_in) + + with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in: + id2column = pickle.load(f_in) + + return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id) + + if os.path.isfile(dsm_prefix + '.tsv'): + values = np.loadtxt(dsm_prefix + '.tsv', dtype=float, delimiter='\t', skiprows=0, comments='', encoding='utf-8') + targets = np.loadtxt(dsm_prefix + '.rows', dtype=str, skiprows=0, comments='', encoding='utf-8') + # Convert to space in sparse matrix format + return Space(SparseMatrix(values), list(targets), []) + + # If everything fails try to load it as single w2v file + space_array = np.loadtxt(dsm_prefix + '.w2v', dtype=object, delimiter=' ', skiprows=1, comments='', encoding='utf-8') + targets = space_array[:,0].flatten() + values = space_array[:,1:].astype(np.float) + # Convert to space and sparse matrix format + return Space(SparseMatrix(values), list(targets), []) + + +class PathLineSentences_mod(object): + """ + Simple format: date\tsentence = one line; words already preprocessed and separated by whitespace. 
+ Like LineSentence, but will process all files in a directory in alphabetical order by filename + """ + + def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None, lowerBound=-9999, upperBound=9999): + """ + `source` should be a path to a directory (as a string) where all files can be opened by the + LineSentence class. Each file will be read up to + `limit` lines (or no clipped if limit is None, the default). + + Example:: + + sentences = LineSentencePath_mod(os.getcwd() + '\\corpus\\') + + The files in the directory should be either text files, .bz2 files, or .gz files. + + """ + self.source = source + self.max_sentence_length = max_sentence_length + self.limit = limit + self.lowerBound = lowerBound + self.upperBound = upperBound + self.corpusSize = 0 + + + if os.path.isfile(self.source): + logging.warning('single file read, better to use models.word2vec.LineSentence') + self.input_files = [self.source] # force code compatibility with list of files + elif os.path.isdir(self.source): + self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path + logging.debug('reading directory ' + self.source) + self.input_files = os.listdir(self.source) + self.input_files = [self.source + file for file in self.input_files] # make full paths + self.input_files.sort() # makes sure it happens in filename order + else: # not a file or a directory, then we can't do anything with it + raise ValueError('input is neither a file nor a path') + + logging.info('files read into PathLineSentences_mod:' + '\n'.join(self.input_files)) + + def __iter__(self): + '''iterate through the files''' + for file_name in self.input_files: + if '.DS_Store' in file_name: + continue + logging.info('reading file ' + file_name) + with utils.smart_open(file_name) as fin: + for line in itertools.islice(fin, self.limit): + lineSplit = line.split("\t") + date, line = int(lineSplit[0]), utils.to_unicode(lineSplit[1]).split() # Get date and sentence + if not self.lowerBound <= date <= self.upperBound: # skip every sentence which is not in timeframe + continue + self.corpusSize+=len(line) + i = 0 + while i < len(line): + yield line[i:i + self.max_sentence_length] + i += self.max_sentence_length + diff --git a/modules/dsm.pyc b/modules/dsm.pyc new file mode 100644 index 0000000..79efef5 Binary files /dev/null and b/modules/dsm.pyc differ diff --git a/modules/embeddings.py b/modules/embeddings.py new file mode 100644 index 0000000..9a407a5 --- /dev/null +++ b/modules/embeddings.py @@ -0,0 +1,80 @@ +# Copyright (C) 2016-2018 Mikel Artetxe +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
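A short usage sketch for the space helpers defined in `modules/dsm.py` above (illustrative only: toy matrix, made-up words and output prefix; it assumes the script is run from the repository root, as recommended in the usage note):

```python
# Illustrative round trip with the save/load helpers from modules/dsm.py.
import sys
sys.path.append('./modules/')

import numpy as np
from scipy.sparse import csr_matrix
from composes.semantic_space.space import Space
from composes.matrix.sparse_matrix import SparseMatrix
from dsm import save_pkl_files, load_pkl_files

mat = csr_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))
space = Space(SparseMatrix(mat), ['word1', 'word2'], ['ctx1', 'ctx2'])

# Written as an .npz file (matrix) plus four .pkl files (row/column mappings)
save_pkl_files(space, 'toy_space', save_in_one_file=False)

space2 = load_pkl_files('toy_space')
print(space2.get_id2row())   # ['word1', 'word2']
```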
+ +from cupy_utils import * + +import numpy as np + + +def read(file, threshold=0, vocabulary=None, dtype='float'): + header = file.readline().split(' ') + count = int(header[0]) if threshold <= 0 else min(threshold, int(header[0])) + dim = int(header[1]) + words = [] + matrix = np.empty((count, dim), dtype=dtype) if vocabulary is None else [] + for i in range(count): + word, vec = file.readline().split(' ', 1) + if vocabulary is None: + words.append(word) + matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype) + elif word in vocabulary: + words.append(word) + matrix.append(np.fromstring(vec, sep=' ', dtype=dtype)) + return (words, matrix) if vocabulary is None else (words, np.array(matrix, dtype=dtype)) + + +def write(words, matrix, file): + m = asnumpy(matrix) + print('%d %d' % m.shape, file=file) + for i in range(len(words)): + print(words[i] + ' ' + ' '.join(['%.6g' % x for x in m[i]]), file=file) + + +def length_normalize(matrix): + xp = get_array_module(matrix) + norms = xp.sqrt(xp.sum(matrix**2, axis=1)) + norms[norms == 0] = 1 + matrix /= norms[:, xp.newaxis] + + +def mean_center(matrix): + xp = get_array_module(matrix) + avg = xp.mean(matrix, axis=0) + matrix -= avg + + +def length_normalize_dimensionwise(matrix): + xp = get_array_module(matrix) + norms = xp.sqrt(xp.sum(matrix**2, axis=0)) + norms[norms == 0] = 1 + matrix /= norms + + +def mean_center_embeddingwise(matrix): + xp = get_array_module(matrix) + avg = xp.mean(matrix, axis=1) + matrix -= avg[:, xp.newaxis] + + +def normalize(matrix, actions): + for action in actions: + if action == 'unit': + length_normalize(matrix) + elif action == 'center': + mean_center(matrix) + elif action == 'unitdim': + length_normalize_dimensionwise(matrix) + elif action == 'centeremb': + mean_center_embeddingwise(matrix) diff --git a/representations/count.py b/representations/count.py new file mode 100644 index 0000000..4063540 --- /dev/null +++ b/representations/count.py @@ -0,0 +1,100 @@ +import sys +sys.path.append('./modules/') + +from collections import defaultdict +from docopt import docopt +import logging +import time +import numpy as np +from dsm import save_pkl_files, PathLineSentences_mod +from scipy.sparse import dok_matrix, csr_matrix, linalg +from composes.semantic_space.space import Space +from composes.matrix.sparse_matrix import SparseMatrix + + +def main(): + """ + Make count-based vector space from corpus. + """ + + # Get the arguments + args = docopt("""Make count-based vector space from corpus. + + Usage: + count.py [-l] + + Arguments: + + = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...' 
+ = output path for vectors + = the linear distance of context words to consider in each direction + = lower bound for time period + = upper bound for time period + + Options: + -l, --len normalize final vectors to unit length + + """) + + is_len = args['--len'] + corpDir = args[''] + outPath = args[''] + windowSize = int(args['']) + lowerBound = int(args['']) + upperBound = int(args['']) + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Build vocabulary + logging.info("Building vocabulary") + sentences = PathLineSentences_mod(corpDir, lowerBound=lowerBound, upperBound=upperBound) + vocabulary = list(set([word for sentence in sentences for word in sentence if len(sentence)>1])) # Skip one-word sentences to avoid zero-vectors + w2i = {w: i for i, w in enumerate(vocabulary)} + + # Initialize co-occurrence matrix as dictionary + cooc_mat = defaultdict(lambda: 0) + + # Get counts from corpus + sentences = PathLineSentences_mod(corpDir, lowerBound=lowerBound, upperBound=upperBound) + logging.info("Counting context words") + for sentence in sentences: + for i, word in enumerate(sentence): + lowerWindowSize = max(i-windowSize, 0) + upperWindowSize = min(i+windowSize, len(sentence)) + window = sentence[lowerWindowSize:i] + sentence[i+1:upperWindowSize+1] + if len(window)==0: # Skip one-word sentences + continue + windex = w2i[word] + for contextWord in window: + cooc_mat[(windex,w2i[contextWord])] += 1 + + + # Convert dictionary to sparse matrix + logging.info("Converting dictionary to matrix") + cooc_mat_sparse = dok_matrix((len(vocabulary),len(vocabulary)), dtype=float) + try: + cooc_mat_sparse.update(cooc_mat) + except NotImplementedError: + cooc_mat_sparse._update(cooc_mat) + + if is_len: + # L2-normalize vectors + l2norm1 = linalg.norm(cooc_mat_sparse, axis=1, ord=2) + l2norm1[l2norm1==0.0] = 1.0 # Convert 0 values to 1 + cooc_mat_sparse /= l2norm1.reshape(len(l2norm1),1) + + # Make space + vocabulary = [v.encode('utf-8') for v in vocabulary] + countSpace = Space(SparseMatrix(cooc_mat_sparse), vocabulary, vocabulary) + + # Save the Space object in pickle format + save_pkl_files(countSpace, outPath, save_in_one_file=False) + + logging.info("Corpus has size %d" % sentences.corpusSize) + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/representations/ppmi.py b/representations/ppmi.py new file mode 100644 index 0000000..9652dec --- /dev/null +++ b/representations/ppmi.py @@ -0,0 +1,103 @@ +import sys +sys.path.append('./modules/') + +import numpy as np +from docopt import docopt +from scipy.sparse import csc_matrix, coo_matrix, linalg +from composes.utils import io_utils +from composes.semantic_space.space import Space +from composes.utils.py_matrix_utils import nonzero_invert +from composes.transformation.scaling.ppmi_weighting import PpmiWeighting +from composes.matrix.sparse_matrix import SparseMatrix +from dsm import save_pkl_files, load_pkl_files +import logging +import time + + +def main(): + """ + Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix. Smoothing is performed as described in + + Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3. + + """ + + # Get the arguments + args = docopt('''Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix and save it in pickle format. 
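The heart of `representations/count.py` above is the symmetric window count. Below is a stripped-down sketch on a single toy sentence (illustrative only; it keeps word pairs directly instead of the index-based `dok_matrix` used in the script):

```python
# Illustrative sketch of the windowed co-occurrence count in count.py
# (toy sentence; the real script maps words to indices and fills a dok_matrix).
from collections import defaultdict

sentence = ['the', 'cat', 'sat', 'on', 'the', 'mat']
windowSize = 2
cooc = defaultdict(int)

for i, word in enumerate(sentence):
    lowerWindowSize = max(i - windowSize, 0)
    upperWindowSize = min(i + windowSize, len(sentence))
    window = sentence[lowerWindowSize:i] + sentence[i + 1:upperWindowSize + 1]
    for contextWord in window:
        cooc[(word, contextWord)] += 1

print(cooc[('the', 'sat')])   # 2: 'sat' lies within two tokens of both occurrences of 'the'
```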
+ + Usage: + ppmi.py [-l] + + = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi) + = shifting parameter + = smoothing parameter + = output path for space + + Options: + -l, --len normalize final vectors to unit length + + ''') + + is_len = args['--len'] + dsm_prefix = args[''] + k = int(args['']) + alpha = float(args['']) + outPath = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Get space with sparse matrix + dsm = load_pkl_files(dsm_prefix) + id2row = dsm.get_id2row() + id2column = dsm.get_id2column() + + # Get probabilities + matrix_ = dsm.cooccurrence_matrix + + matrix_.assert_positive() + row_sum = matrix_.sum(axis = 1) + col_sum = matrix_.sum(axis = 0) + + # Compute smoothed P_alpha(c) + smooth_col_sum = np.power(col_sum, alpha) + col_sum = smooth_col_sum/smooth_col_sum.sum() + + # Compute P(w) + row_sum = nonzero_invert(row_sum) + col_sum = nonzero_invert(col_sum) + + # Apply epmi weighting (without log) + matrix_ = matrix_.scale_rows(row_sum) + matrix_ = matrix_.scale_columns(col_sum) + + # Apply log weighting + matrix_.mat.data = np.log(matrix_.mat.data) + + # Shift values + matrix_.mat.data -= np.log(k) + + # Eliminate negative counts + matrix_.mat.data[matrix_.mat.data <= 0] = 0.0 + + # Eliminate zero counts + matrix_.mat.eliminate_zeros() + + matrix_ = matrix_.get_mat() + + if is_len: + # L2-normalize vectors + l2norm1 = linalg.norm(matrix_, axis=1, ord=2) + l2norm1[l2norm1==0.0] = 1.0 # Convert 0 values to 1 + matrix_ /= l2norm1.reshape(len(l2norm1),1) + + dsm = Space(SparseMatrix(matrix_), id2row, id2column) + + # Save the Space object in pickle format + save_pkl_files(dsm, outPath + ".ppmi.sm", save_in_one_file=False) + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/representations/ri.py b/representations/ri.py new file mode 100644 index 0000000..89bc4bb --- /dev/null +++ b/representations/ri.py @@ -0,0 +1,151 @@ +import sys +sys.path.append('./modules/') + +import os +from os.path import basename +from docopt import docopt +from dsm import load_pkl_files, save_pkl_files +import logging +import time +import codecs +import numpy as np +from composes.semantic_space.space import Space +from composes.matrix.dense_matrix import DenseMatrix +from composes.matrix.sparse_matrix import SparseMatrix +from sklearn.random_projection import sparse_random_matrix +from scipy.sparse import lil_matrix, csr_matrix, csc_matrix + + +def main(): + """ + Create low-dimensional vector space by sparse random indexing from co-occurrence matrix. + """ + + # Get the arguments + args = docopt('''Create low-dimensional vector space by sparse random indexing from co-occurrence matrix. + + Usage: + reduce_matrix_ri.py [-l] (-s | -a) + + = number of non-zero values in each random vector + = number of dimensions for random vectors + = threshold for downsampling (if t=None, no subsampling is applied) + = output path for reduced space + = output path for elemental space (context vectors) + = path to pickled space without suffix + + Options: + -l, --len normalize final vectors to unit length + -s, --see specify number of seeds manually + -a, --aut calculate number of seeds automatically as proposed in [1,2] + + References: + [1] Ping Li, T. Hastie and K. W. Church, 2006, + "Very Sparse Random Projections". + http://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf + [2] D. 
Achlioptas, 2001, "Database-friendly random projections", + http://www.cs.ucsc.edu/~optas/papers/jl.pdf + + ''') + + is_len = args['--len'] + is_seeds = args['--see'] + if is_seeds: + seeds = int(args['']) + is_aut = args['--aut'] + dim = int(args['']) + if args['']=='None': + t = None + else: + t = float(args['']) + outPath = args[''] + outPathElement = args[''] + spacePrefix = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Load input space + space1 = load_pkl_files(spacePrefix) + matrix1 = space1.get_cooccurrence_matrix() + + # Get mappings between rows/columns and words + id2row1 = space1.get_id2row() + id2column1 = space1.get_id2column() + column2id1 = space1.get_column2id() + + ## Generate ternary random vectors + if is_seeds: + elementalMatrix = np.zeros((len(id2column1),dim)) + # Generate base vector for random vectors + baseVector = np.zeros(dim) # Note: Make sure that number of seeds is not greater than dimensions + for i in range(0,seeds/2): + baseVector[i] = 1.0 + for i in range(seeds/2,seeds): + baseVector[i] = -1.0 + for i in range(len(id2column1)): + np.random.shuffle(baseVector) + elementalMatrix[i] = baseVector + if is_aut: + elementalMatrix = sparse_random_matrix(dim,len(id2column1)).toarray().T + + elementalMatrix = csc_matrix(elementalMatrix) + # to-do: get rid of transformation into sparse matrices by initializing them as such + + # Initialize target vectors + reducedMatrix1 = np.zeros((len(id2row1),dim)) + + # Get number of total occurrences of any word + totalOcc = np.sum(matrix1.get_mat()) + + # Define function for downsampling + downsample = lambda f: np.sqrt(float(t)/f) if f>t else 1.0 + downsample = np.vectorize(downsample) + + # Get total normalized co-occurrence frequency of all contexts in space + context_freqs = np.array(matrix1.sum(axis=0))/totalOcc + + #to-do: matrix multiplication is done row-wise, do this matrix-wise + # Iterate over rows of space, find context words and update reduced matrix with low-dimensional random vectors of these context words + for (space,matrix,id2row,id2column,column2id,reducedMatrix) in [(space1,matrix1,id2row1,id2column1,column2id1,reducedMatrix1)]: + # Iterate over targets + for i, target in enumerate(id2row): + # Get co-occurrence values as matrix + m = space.get_row(target).get_mat() + # Get nonzero indexes and data + nonzeros = m.nonzero() + data = m.data + # Smooth context distribution + pos_context_vectors = elementalMatrix[nonzeros[1]] + if t!=None: + # Apply subsampling + rfs = context_freqs[0,nonzeros[1]] + rfs = downsample(rfs) + data *= rfs + data = csc_matrix(data) + # Weight context vectors by occurrence frequency + pos_context_vectors = pos_context_vectors.multiply(data.reshape(-1,1)) + pos_context_vectors = np.sum(pos_context_vectors, axis=0) + # Add up context vectors and store as row for target + reducedMatrix[i] = pos_context_vectors + + if is_len: + # L2-normalize vectors + l2norm1 = np.linalg.norm(reducedMatrix1, axis=1, ord=2) + l2norm1[l2norm1==0.0] = 1.0 # Convert 0 values to 1 + reducedMatrix1 /= l2norm1.reshape(len(l2norm1),1) + + # Make spaces + reducedSpace1 = Space(DenseMatrix(reducedMatrix1), id2row1, []) + elementalSpace = Space(SparseMatrix(elementalMatrix), id2column1, []) + + # Save the Space objects in pickle format + save_pkl_files(reducedSpace1, outPath + '.ri.dm', save_in_one_file=True, save_as_w2v=True) + save_pkl_files(elementalSpace, outPathElement + '.sm', 
save_in_one_file=False) + + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/representations/sgns.py b/representations/sgns.py new file mode 100644 index 0000000..e179f3b --- /dev/null +++ b/representations/sgns.py @@ -0,0 +1,95 @@ +import sys +sys.path.append('./modules/') + +import codecs +from collections import defaultdict +import os +from os.path import basename +import zipfile +from docopt import docopt +import logging +import logging.config +import time +import gensim +from dsm import PathLineSentences_mod + + + +def main(): + """ + Make embedding vector space with Negative Sampling from corpus. + """ + + # Get the arguments + args = docopt("""Make embedding vector space with Skip-Gram with Negative Sampling from corpus. + + Usage: + sgns.py [-l] + + Arguments: + + = the linear distance of context words to consider in each direction + = dimensionality of embeddings + = number of negative samples parameter (equivalent to shifting parameter for PPMI) + = threshold for subsampling + = number of occurrences for a word to be included in the vocabulary + = number of iterations + = path to corpus directory with zipped files, each sentence in form 'year\tword1 word2 word3...' + = output path for vectors + = lower bound for time period + = upper bound for time period + + Options: + -l, --len normalize final vectors to unit length + + """) + + is_len = args['--len'] + windowSize = int(args['']) + dim = int(args['']) + k = int(args['']) + if args['']=='None': + t = None + else: + t = float(args['']) + minCount = int(args['']) + itera = int(args['']) + corpDir = args[''] + outPath = args[''] + lowerBound = int(args['']) + upperBound = int(args['']) + + logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True,}) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Initialize model + model = gensim.models.Word2Vec(sg=1, # skipgram + hs=0, # negative sampling + negative=k, # number of negative samples + sample=t, # threshold for subsampling, if None, no subsampling is performed + size=dim, window=windowSize, min_count=minCount, iter=itera, workers=20) + + # Initialize vocabulary + vocab_sentences = PathLineSentences_mod(corpDir, lowerBound=lowerBound, upperBound=upperBound) + model.build_vocab(vocab_sentences) + + # Train + sentences = PathLineSentences_mod(corpDir, lowerBound=lowerBound, upperBound=upperBound) + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) + + if is_len: + # L2-normalize vectors + model.init_sims(replace=True) + + # Save the vectors and the model + model.wv.save_word2vec_format(outPath + '.w2v') + #model.save(outPath + '.model') + + logging.info("Corpus has size %d" % vocab_sentences.corpusSize) + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/representations/svd.py b/representations/svd.py new file mode 100644 index 0000000..6794135 --- /dev/null +++ b/representations/svd.py @@ -0,0 +1,84 @@ +import sys +sys.path.append('./modules/') + +import numpy as np +from docopt import docopt +from composes.utils import io_utils +from composes.semantic_space.space import Space +from composes.matrix.dense_matrix import DenseMatrix +from sklearn.utils.extmath import randomized_svd +from dsm import save_pkl_files, load_pkl_files +import logging +import time + + +def main(): + """ + Perform 
dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD as described in + + Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving distributional similarity with lessons learned from word embeddings. Trans. ACL, 3. + + """ + + # Get the arguments + args = docopt('''Perform dimensionality reduction on a (normally PPMI) matrix by applying truncated SVD and save it in pickle format. + + Usage: + svd.py [-l] + + = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.svd) + = dimensionality of low-dimensional output vectors + = eigenvalue weighting parameter + = output path for space + + Options: + -l, --len normalize final vectors to unit length + + ''') + + is_len = args['--len'] + dsm_prefix = args[''] + dim = int(args['']) + gamma = float(args['']) + outPath = args[''] + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + logging.info(__file__.upper()) + start_time = time.time() + + # Get space with sparse matrix + dsm = load_pkl_files(dsm_prefix) + + id2row = dsm.get_id2row() + + # Get matrix from space + matrix_ = dsm.get_cooccurrence_matrix() + + # Apply SVD + u, s, v = randomized_svd(matrix_.get_mat(), n_components=dim, n_iter=5, transpose=False) + + # Weight matrix + if gamma == 0.0: + matrix_ = u + elif gamma == 1.0: + #matrix_ = np.dot(u, np.diag(s)) # This is equivalent to the below formula (because s is a flattened diagonal matrix) + matrix_ = s * u + else: + #matrix_ = np.dot(u, np.power(np.diag(s), gamma)) # This is equivalent to the below formula + matrix_ = np.power(s, gamma) * u + + if is_len: + # L2-normalize vectors + l2norm1 = np.linalg.norm(matrix_, axis=1, ord=2) + l2norm1[l2norm1==0.0] = 1.0 # Convert 0 values to 1 + matrix_ /= l2norm1.reshape(len(l2norm1),1) + + dsm = Space(DenseMatrix(matrix_), id2row, []) + + # Save the Space object in pickle format + save_pkl_files(dsm, outPath + ".svd.dm", save_in_one_file=True, save_as_w2v=True) + logging.info("--- %s seconds ---" % (time.time() - start_time)) + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f911d47 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +# packages to install: +docopt==0.6.2 +gensim==3.7.3 diff --git a/scripts/make_results_disp.sh b/scripts/make_results_disp.sh new file mode 100644 index 0000000..025c9cb --- /dev/null +++ b/scripts/make_results_disp.sh @@ -0,0 +1,118 @@ +### THIS SCRIPT PRODUCES RESULTS FOR DISPERSION MEASURES (FD, TD, HD) ON COUNT SPACES ### + +### Define global parameters ### +# Test parameters +declare -a windowSizes=(1) +declare -a globalmatrixfolderprefix=matrices/test_disp +declare -a globalresultfolderprefix=results/test_disp +declare -a parameterfile=scripts/parameters_test.sh + +# DURel parameters +#declare -a windowSizes=(1) +#declare -a globalmatrixfolderprefix=matrices/durel_disp +#declare -a globalresultfolderprefix=results/durel_disp +#declare -a parameterfile=scripts/parameters_durel.sh + +# SURel parameters +#declare -a windowSizes=(1) +#declare -a globalmatrixfolderprefix=matrices/surel_disp +#declare -a globalresultfolderprefix=results/surel_disp +#declare -a parameterfile=scripts/parameters_surel.sh + + +# Get corpus- and testset-specific parameters +source $parameterfile + +# Overwrite any specific parameters here +declare -a ks=(1) +declare -a ts=(None) +declare -a iterations=(1) +declare -a dim=5 +declare -a matrixfolder1=$globalmatrixfolder1 +declare -a 
matrixfolder2=$globalmatrixfolder2 +declare -a matrixfoldercomb=$globalmatrixfoldercomb +matrixfolders=($globalmatrixfolder1 $globalmatrixfolder2) + +# Run model code +lowerBound=$lowerBound1 +upperBound=$upperBound1 +outfolder=$countmatrixfolder1 +source scripts/run_CNT.sh # Raw Count for first time period +lowerBound=$lowerBound2 +upperBound=$upperBound2 +outfolder=$countmatrixfolder2 +source scripts/run_CNT.sh # Raw Count for second time period + +# Get frequencies +lowerBound=$lowerBound1 +upperBound=$upperBound1 +outfolder=$freqresultfolder1 +source scripts/run_FREQ.sh # Raw token frequency in first time period +norm=$freqnorm1 +source scripts/run_NFREQ.sh # Normalized frequency +lowerBound=$lowerBound2 +upperBound=$upperBound2 +outfolder=$freqresultfolder2 +source scripts/run_FREQ.sh # Raw token frequency in second time period +norm=$freqnorm2 +source scripts/run_NFREQ.sh +infolder=$freqresultfolder1 +outfolder=$freqresultfolder1 +source scripts/run_TRSF.sh # Log transformation +infolder=$freqresultfolder2 +outfolder=$freqresultfolder2 +source scripts/run_TRSF.sh +# Subtract values +infolder1=$freqresultfolder1 +infolder2=$freqresultfolder2 +outfolder=$resultfolder +source scripts/run_SBTR.sh # Subtract frequencies (Frequency Difference) + +# Get types +matrixfolder=$countmatrixfolder1 +outfolder=$typesresultfolder1 +source scripts/run_TYPE.sh # Number of context types in first time period +norm=$typesnorm1 +source scripts/run_NTYPE.sh # Normalized number of context types +matrixfolder=$countmatrixfolder2 +outfolder=$typesresultfolder2 +source scripts/run_TYPE.sh # Number of context types in second time period +norm=$typesnorm2 +source scripts/run_NTYPE.sh +infolder=$typesresultfolder1 +outfolder=$typesresultfolder1 +source scripts/run_TRSF.sh # Log transformation +infolder=$typesresultfolder2 +outfolder=$typesresultfolder2 +source scripts/run_TRSF.sh +# Subtract values +infolder1=$typesresultfolder1 +infolder2=$typesresultfolder2 +outfolder=$resultfolder +source scripts/run_SBTR.sh # Subtract types (Type Difference) + +# Get entropies +matrixfolder=$countmatrixfolder1 +outfolder=$entropyresultfolder1 +source scripts/run_ENTR.sh # Entropy in first time period +source scripts/run_NENTR.sh # Normalized Entropy, by number of context types +matrixfolder=$countmatrixfolder2 +outfolder=$entropyresultfolder2 +source scripts/run_ENTR.sh # Entropy in second time period +source scripts/run_NENTR.sh +infolder=$entropyresultfolder1 +outfolder=$entropyresultfolder1 +source scripts/run_TRSF.sh # Log transformation +infolder=$entropyresultfolder2 +outfolder=$entropyresultfolder2 +source scripts/run_TRSF.sh +# Subtract values +infolder1=$entropyresultfolder1 +infolder2=$entropyresultfolder2 +outfolder=$resultfolder +source scripts/run_SBTR.sh # Subtract entropy (Entropy Difference) + +# Evaluate results +resultfolder=$resultfolder +outfolder=$globalresultfolder +source scripts/run_SPR.sh # Get Spearman correlation of measure predictions with gold scores diff --git a/scripts/make_results_sim.sh b/scripts/make_results_sim.sh new file mode 100644 index 0000000..c4c8bfc --- /dev/null +++ b/scripts/make_results_sim.sh @@ -0,0 +1,110 @@ +### THIS SCRIPT PRODUCES RESULTS FOR SIMILARITY MEASURES (CD, LND) ON ALL VECTOR SPACE AND ALIGNMENT TYPES EXCEPT WORD INJECTION ### + +## Define global parameters ## +# Test parameters +declare -a windowSizes=(1) # Window sizes for all models +declare -a globalmatrixfolderprefix=matrices/test_sim # parent folder for matrices +declare -a 
globalresultfolderprefix=results/test_sim # parent folder for results +declare -a parameterfile=scripts/parameters_test.sh # corpus- and testset-specific parameter specifications + +# DURel parameters +#declare -a windowSizes=(1) +#declare -a globalmatrixfolderprefix=matrices/durel_sim +#declare -a globalresultfolderprefix=results/durel_sim +#declare -a parameterfile=scripts/parameters_durel.sh + +# SURel parameters +#declare -a windowSizes=(1) +#declare -a globalmatrixfolderprefix=matrices/surel_sim +#declare -a globalresultfolderprefix=results/surel_sim +#declare -a parameterfile=scripts/parameters_surel.sh + + +# Get corpus- and testset-specific parameters +source $parameterfile + +# Overwrite any specific parameters here +declare -a ks=(1) +declare -a ts=(None) +declare -a iterations=(1) +declare -a dim=5 +declare -a matrixfolder1=$globalmatrixfolder1 +declare -a matrixfolder2=$globalmatrixfolder2 +declare -a matrixfoldercomb=$globalmatrixfoldercomb +matrixfolders=($globalmatrixfolder1 $globalmatrixfolder2) + +# Run model code +lowerBound=$lowerBound1 +upperBound=$upperBound1 +outfolder=$sgnsmatrixfolder1 +source scripts/run_SGNS.sh # Skip-Gram with Negative Sampling for first time period +lowerBound=$lowerBound2 +upperBound=$upperBound2 +outfolder=$sgnsmatrixfolder2 +source scripts/run_SGNS.sh # for second time period +infolder=$sgnsmatrixfolder1 +outfolder=$sgnsmatrixfolder2 +source scripts/run_SGNS_VI.sh # Skip-Gram with Negative Sampling aligned by Vector Initialization +lowerBound=$lowerBound1 +upperBound=$upperBound1 +outfolder=$countmatrixfolder1 +source scripts/run_CNT.sh # Raw Count +lowerBound=$lowerBound2 +upperBound=$upperBound2 +outfolder=$countmatrixfolder2 +source scripts/run_CNT.sh +matrixfolder=$countmatrixfolder1 +outfolder=$rimatrixfolder1 +source scripts/run_RI.sh # Random Indexing +matrixfolder=$countmatrixfolder2 +outfolder=$rimatrixfolder2 +source scripts/run_RI.sh +matrixfolder=$countmatrixfolder1 +outfolder=$ppmimatrixfolder1 +source scripts/run_PPMI.sh # PPMI weighting of count matrix +matrixfolder=$countmatrixfolder2 +outfolder=$ppmimatrixfolder2 +source scripts/run_PPMI.sh +matrixfolder=$ppmimatrixfolder1 +outfolder=$svdmatrixfolder1 +source scripts/run_SVD.sh # SVD on PPMI matrix +matrixfolder=$ppmimatrixfolder2 +outfolder=$svdmatrixfolder2 +source scripts/run_SVD.sh + +# Align matrices +outfolder1=$alignedmatrixfolder1 +outfolder2=$alignedmatrixfolder2 + +matrixfolder1=$countmatrixfolder1 +matrixfolder2=$countmatrixfolder2 +source scripts/run_CI.sh # Column Intersection alignment of count matrices +matrixfolder1=$countmatrixfolder1 +matrixfolder2=$countmatrixfolder2 +source scripts/run_SRV.sh # Shared Random Vector alignment + +# Align matrices +matrixfolder1=$ppmimatrixfolder1 +matrixfolder2=$ppmimatrixfolder2 +source scripts/run_CI.sh # Column Intersection alignment of PPMI matrices +matrixfolder1=$sgnsmatrixfolder1 +matrixfolder2=$sgnsmatrixfolder2 +source scripts/run_OP.sh # Orthogonal Procrustes alignment for SGNS +matrixfolder1=$rimatrixfolder1 +matrixfolder2=$rimatrixfolder2 +source scripts/run_OP.sh # Orthogonal Procrustes alignment for RI +matrixfolder1=$svdmatrixfolder1 +matrixfolder2=$svdmatrixfolder2 +source scripts/run_OP.sh # Orthogonal Procrustes alignment for SVD + +# Measure change scores from aligned matrices +matrixfolder1=$alignedmatrixfolder1 +matrixfolder2=$alignedmatrixfolder2 +outfolder=$resultfolder +source scripts/run_CD.sh # Cosine Distance +source scripts/run_LND.sh # Local Neighborhood Distance + +# Evaluate results 
+resultfolder=$resultfolder +outfolder=$globalresultfolder +source scripts/run_SPR.sh # Get Spearman correlation of measure predictions with gold scores diff --git a/scripts/make_results_wi.sh b/scripts/make_results_wi.sh new file mode 100644 index 0000000..b84b416 --- /dev/null +++ b/scripts/make_results_wi.sh @@ -0,0 +1,65 @@ +### THIS SCRIPT PRODUCES RESULTS FOR SIMILARITY MEASURES (CD, LND) ON ALL VECTOR SPACE TYPES WITH WORD INJECTION ### + +## Define global parameters ## +# Test parameters +declare -a windowSizes=(1) +declare -a globalmatrixfolderprefix=matrices/test_wi +declare -a globalresultfolderprefix=results/test_wi +declare -a parameterfile=scripts/parameters_test.sh + +# DURel parameters +#declare -a windowSizes=(1) +#declare -a globalmatrixfolderprefix=matrices/durel_wi +#declare -a globalresultfolderprefix=results/durel_wi +#declare -a parameterfile=scripts/parameters_durel.sh + +# SURel parameters +#declare -a windowSizes=(1) +#declare -a globalmatrixfolderprefix=matrices/surel_wi +#declare -a globalresultfolderprefix=results/surel_wi +#declare -a parameterfile=scripts/parameters_surel.sh + + +# Get corpus- and testset-specific parameters +source $parameterfile + +# Overwrite any specific parameters here +declare -a ks=(1) +declare -a ts=(None) +declare -a iterations=(1) +declare -a dim=5 +testset=$testsetwi + +declare -a matrixfolder=$globalmatrixfolderwi +matrixfolders=($sgnsmatrixfolderwi $countmatrixfolderwi $rimatrixfolderwi $ppmimatrixfolderwi $svdmatrixfolderwi) + +# Run model code +outfolder=$sgnsmatrixfolderwi +source scripts/run_SGNS_WI.sh # Skip-Gram with Negative Sampling for Word Injection +outfolder=$countmatrixfolderwi +source scripts/run_CNT_WI.sh # Raw Count +matrixfolder=$countmatrixfolderwi +outfolder=$rimatrixfolderwi +source scripts/run_RI.sh # Random Indexing +matrixfolder=$countmatrixfolderwi +outfolder=$ppmimatrixfolderwi +source scripts/run_PPMI.sh # PPMI +matrixfolder=$ppmimatrixfolderwi +outfolder=$svdmatrixfolderwi +source scripts/run_SVD.sh # SVD + +# Get Predictions +for matrixfolder in "${matrixfolders[@]}" +do + # Measure change scores from common Word Injection matrix + matrixfolder1=$matrixfolder + matrixfolder2=$matrixfolder + outfolder=$resultfolder + source scripts/run_CD.sh # Cosine Distance + source scripts/run_LND.sh # Local Neighborhood Distance +done + +# Evaluate predictions +resultfolder=$resultfolder +outfolder=$globalresultfolder +source scripts/run_SPR.sh # Get Spearman correlation of measure predictions with gold scores diff --git a/scripts/parameters_durel.sh b/scripts/parameters_durel.sh new file mode 100644 index 0000000..e70f606 --- /dev/null +++ b/scripts/parameters_durel.sh @@ -0,0 +1,88 @@ +shopt -s extglob # For more powerful regular expressions in shell + +### Define parameters ### +declare -a corpDir="corpora/test/" # directory for corpus files (all files in directory will be read) +declare -a wiCorpDir="corpora/test_wi/" # directory for word-injected corpus (only needed for Word Injection) +declare -a bounds1=(1750 1799) # lower and upper bound for first time period +declare -a bounds2=(1850 1899) # lower and upper bound for second time period +declare -a freqnorms=(26650530 40323497) # normalization constants for token frequency (total number of tokens in first and second time period) +declare -a typesnorms=(252437 796365) # normalization constants for number of context types (total number of types in first and second time period) +declare -a ks=(5 1) # values for shifting parameter k +declare -a ts=(0.001 None) # 
values for subsampling parameter t +declare -a iterations=(1 2 3 4 5) # list of iterations, each item is one iteration, for five iterations define: iterations=(1 2 3 4 5) +declare -a dim=300 # dimensionality of low-dimensional matrices (SVD/RI/SGNS) +declare -a testset="testsets/durel/targets.tsv" # target words for which change scores should be predicted (one target per line repeated twice with tab-separation, i.e., 'word\tword') +declare -a testsetwi="testsets/durel/targets_wi.tsv" # target words for Word Injection (one target per line, injected version in first column, non-injected version in second column, i.e., 'word_\tword') +declare -a goldscorefile="testsets/durel/gold.tsv" # file with gold scores for target words in same order as targets in testsets + + +### No changes needed after this line ### + +# Get time bounds for corpora +lowerBound1=${bounds1[0]} +upperBound1=${bounds1[1]} +lowerBound2=${bounds2[0]} +upperBound2=${bounds2[1]} + +# Get normalization constants for dispersion measures +declare -a freqnorm1=${freqnorms[0]} +declare -a freqnorm2=${freqnorms[1]} +declare -a typesnorm1=${typesnorms[0]} +declare -a typesnorm2=${typesnorms[1]} + + +## Make result folder structure +declare -a globalresultfolder=$globalresultfolderprefix/$(basename "$corpDir") +mkdir --parents $globalresultfolder +declare -a globalresultfolder=$globalresultfolderprefix/$(basename "$corpDir") +mkdir --parents $globalresultfolder +declare -a resultfolder=$globalresultfolder/$(basename "${testset%.*}") +mkdir --parents $resultfolder +# For dispersion measures +declare -a resultfolder1=$resultfolder/$lowerBound1-$upperBound1 +mkdir --parents $resultfolder1 +declare -a resultfolder2=$resultfolder/$lowerBound2-$upperBound2 +mkdir --parents $resultfolder2 +declare -a resultfolders=($resultfolder1:1 $resultfolder2:2) +declare -a measures=(entropy types freq) +for folder2suffix in "${resultfolders[@]}" +do + folder="$(cut -d':' -f1 <<<"$folder2suffix")" + suffix="$(cut -d':' -f2 <<<"$folder2suffix")" + for measure in "${measures[@]}" + do + declare -a $measure\resultfolder$suffix=$folder/$measure + mkdir --parents $( eval "echo $"$measure\resultfolder$suffix"" ) + done +done + + +# Make matrix folder structure +declare -a globalmatrixfolder=$globalmatrixfolderprefix/$(basename "$corpDir") +declare -a globalmatrixfolder1=$globalmatrixfolder/$lowerBound1-$upperBound1 +declare -a globalmatrixfolder2=$globalmatrixfolder/$lowerBound2-$upperBound2 +declare -a globalmatrixfolderwi=$globalmatrixfolder/wi +mkdir --parents $globalmatrixfolder +mkdir --parents $globalmatrixfolder1 +mkdir --parents $globalmatrixfolder2 +mkdir --parents $globalmatrixfolderwi + +declare -a matrixfolders=($globalmatrixfolder1:1 $globalmatrixfolder2:2 $globalmatrixfolderwi:wi) +for matrixfolder2suffix in "${matrixfolders[@]}" +do + matrixfolder="$(cut -d':' -f1 <<<"$matrixfolder2suffix")" + suffix="$(cut -d':' -f2 <<<"$matrixfolder2suffix")" + + declare -a countmatrixfolder$suffix=$matrixfolder/count + declare -a ppmimatrixfolder$suffix=$matrixfolder/ppmi + declare -a svdmatrixfolder$suffix=$matrixfolder/svd + declare -a rimatrixfolder$suffix=$matrixfolder/ri + declare -a sgnsmatrixfolder$suffix=$matrixfolder/sgns + declare -a alignedmatrixfolder$suffix=$matrixfolder/aligned + mkdir --parents $( eval "echo $"countmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"ppmimatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"svdmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"rimatrixfolder$suffix"" ) + mkdir --parents $( eval 
"echo $"sgnsmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"alignedmatrixfolder$suffix"" ) +done diff --git a/scripts/parameters_surel.sh b/scripts/parameters_surel.sh new file mode 100644 index 0000000..ea41b66 --- /dev/null +++ b/scripts/parameters_surel.sh @@ -0,0 +1,88 @@ +shopt -s extglob # For more powerful regular expressions in shell + +### Define parameters ### +declare -a corpDir="corpora/test/" # directory for corpus files (all files in directory will be read) +declare -a wiCorpDir="corpora/test_wi/" # directory for word-injected corpus (only needed for Word Injection) +declare -a bounds1=(2006 2006) # lower and upper bound for first time period +declare -a bounds2=(2020 2020) # lower and upper bound for second time period +declare -a freqnorms=(109731661 1049573) # normalization constants for token frequency (total number of tokens in first and second time period) +declare -a typesnorms=(2417171 49187) # normalization constants for number of context types (total number of types in first and second time period) +declare -a ks=(5 1) # values for shifting parameter k +declare -a ts=(0.001 None) # values for subsampling parameter t +declare -a iterations=(1 2 3 4 5) # list of iterations, each item is one iteration, for five iterations define: iterations=(1 2 3 4 5) +declare -a dim=300 # dimensionality of low-dimensional matrices (SVD/RI/SGNS) +declare -a testset="testsets/surel/targets.tsv" # target words for which change scores should be predicted (one target per line repeated twice with tab-separation, i.e., 'word\tword') +declare -a testsetwi="testsets/surel/targets_wi.tsv" # target words for Word Injection (one target per line, injected version in first column, non-injected version in second column, i.e., 'word_\tword') +declare -a goldscorefile="testsets/surel/gold.tsv" # file with gold scores for target words in same order as targets in testsets + + +### No changes needed after this line ### + +# Get time bounds for corpora +lowerBound1=${bounds1[0]} +upperBound1=${bounds1[1]} +lowerBound2=${bounds2[0]} +upperBound2=${bounds2[1]} + +# Get normalization constants for dispersion measures +declare -a freqnorm1=${freqnorms[0]} +declare -a freqnorm2=${freqnorms[1]} +declare -a typesnorm1=${typesnorms[0]} +declare -a typesnorm2=${typesnorms[1]} + + +## Make result folder structure +declare -a globalresultfolder=$globalresultfolderprefix/$(basename "$corpDir") +mkdir --parents $globalresultfolder +declare -a globalresultfolder=$globalresultfolderprefix/$(basename "$corpDir") +mkdir --parents $globalresultfolder +declare -a resultfolder=$globalresultfolder/$(basename "${testset%.*}") +mkdir --parents $resultfolder +# For dispersion measures +declare -a resultfolder1=$resultfolder/$lowerBound1-$upperBound1 +mkdir --parents $resultfolder1 +declare -a resultfolder2=$resultfolder/$lowerBound2-$upperBound2 +mkdir --parents $resultfolder2 +declare -a resultfolders=($resultfolder1:1 $resultfolder2:2) +declare -a measures=(entropy types freq) +for folder2suffix in "${resultfolders[@]}" +do + folder="$(cut -d':' -f1 <<<"$folder2suffix")" + suffix="$(cut -d':' -f2 <<<"$folder2suffix")" + for measure in "${measures[@]}" + do + declare -a $measure\resultfolder$suffix=$folder/$measure + mkdir --parents $( eval "echo $"$measure\resultfolder$suffix"" ) + done +done + + +# Make matrix folder structure +declare -a globalmatrixfolder=$globalmatrixfolderprefix/$(basename "$corpDir") +declare -a globalmatrixfolder1=$globalmatrixfolder/$lowerBound1-$upperBound1 +declare -a 
globalmatrixfolder2=$globalmatrixfolder/$lowerBound2-$upperBound2 +declare -a globalmatrixfolderwi=$globalmatrixfolder/wi +mkdir --parents $globalmatrixfolder +mkdir --parents $globalmatrixfolder1 +mkdir --parents $globalmatrixfolder2 +mkdir --parents $globalmatrixfolderwi + +declare -a matrixfolders=($globalmatrixfolder1:1 $globalmatrixfolder2:2 $globalmatrixfolderwi:wi) +for matrixfolder2suffix in "${matrixfolders[@]}" +do + matrixfolder="$(cut -d':' -f1 <<<"$matrixfolder2suffix")" + suffix="$(cut -d':' -f2 <<<"$matrixfolder2suffix")" + + declare -a countmatrixfolder$suffix=$matrixfolder/count + declare -a ppmimatrixfolder$suffix=$matrixfolder/ppmi + declare -a svdmatrixfolder$suffix=$matrixfolder/svd + declare -a rimatrixfolder$suffix=$matrixfolder/ri + declare -a sgnsmatrixfolder$suffix=$matrixfolder/sgns + declare -a alignedmatrixfolder$suffix=$matrixfolder/aligned + mkdir --parents $( eval "echo $"countmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"ppmimatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"svdmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"rimatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"sgnsmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"alignedmatrixfolder$suffix"" ) +done diff --git a/scripts/parameters_test.sh b/scripts/parameters_test.sh new file mode 100644 index 0000000..972379c --- /dev/null +++ b/scripts/parameters_test.sh @@ -0,0 +1,88 @@ +shopt -s extglob # For more powerful regular expressions in shell + +### Define parameters ### +declare -a corpDir="corpora/test/" # directory for corpus files (all files in directory will be read) +declare -a wiCorpDir="corpora/test_wi/" # directory for word-injected corpus (only needed for Word Injection) +declare -a bounds1=(1750 1799) # lower and upper bound for first time period +declare -a bounds2=(1850 1899) # lower and upper bound for second time period +declare -a freqnorms=(73314 110409) # normalization constants for token frequency (total number of tokens in first and second time period) +declare -a typesnorms=(9658 14177) # normalization constants for number of context types (total number of types in first and second time period) +declare -a ks=(1) # values for shifting parameter k +declare -a ts=(None) # values for subsampling parameter t +declare -a iterations=(1) # list of iterations, each item is one iteration, for five iterations define: iterations=(1 2 3 4 5) +declare -a dim=30 # dimensionality of low-dimensional matrices (SVD/RI/SGNS) +declare -a testset="testsets/test/targets.tsv" # target words for which change scores should be predicted (one target per line repeated twice with tab-separation, i.e., 'word\tword') +declare -a testsetwi="testsets/test/targets_wi.tsv" # target words for Word Injection (one target per line, injected version in first column, non-injected version in second column, i.e., 'word_\tword') +declare -a goldscorefile="testsets/test/gold.tsv" # file with gold scores for target words in same order as targets in testsets + + +### No changes needed after this line ### + +# Get time bounds for corpora +lowerBound1=${bounds1[0]} +upperBound1=${bounds1[1]} +lowerBound2=${bounds2[0]} +upperBound2=${bounds2[1]} + +# Get normalization constants for dispersion measures +declare -a freqnorm1=${freqnorms[0]} +declare -a freqnorm2=${freqnorms[1]} +declare -a typesnorm1=${typesnorms[0]} +declare -a typesnorm2=${typesnorms[1]} + + +## Make result folder structure +declare -a globalresultfolder=$globalresultfolderprefix/$(basename "$corpDir") +mkdir 
--parents $globalresultfolder +declare -a globalresultfolder=$globalresultfolderprefix/$(basename "$corpDir") +mkdir --parents $globalresultfolder +declare -a resultfolder=$globalresultfolder/$(basename "${testset%.*}") +mkdir --parents $resultfolder +# For dispersion measures +declare -a resultfolder1=$resultfolder/$lowerBound1-$upperBound1 +mkdir --parents $resultfolder1 +declare -a resultfolder2=$resultfolder/$lowerBound2-$upperBound2 +mkdir --parents $resultfolder2 +declare -a resultfolders=($resultfolder1:1 $resultfolder2:2) +declare -a measures=(entropy types freq) +for folder2suffix in "${resultfolders[@]}" +do + folder="$(cut -d':' -f1 <<<"$folder2suffix")" + suffix="$(cut -d':' -f2 <<<"$folder2suffix")" + for measure in "${measures[@]}" + do + declare -a $measure\resultfolder$suffix=$folder/$measure + mkdir --parents $( eval "echo $"$measure\resultfolder$suffix"" ) + done +done + + +# Make matrix folder structure +declare -a globalmatrixfolder=$globalmatrixfolderprefix/$(basename "$corpDir") +declare -a globalmatrixfolder1=$globalmatrixfolder/$lowerBound1-$upperBound1 +declare -a globalmatrixfolder2=$globalmatrixfolder/$lowerBound2-$upperBound2 +declare -a globalmatrixfolderwi=$globalmatrixfolder/wi +mkdir --parents $globalmatrixfolder +mkdir --parents $globalmatrixfolder1 +mkdir --parents $globalmatrixfolder2 +mkdir --parents $globalmatrixfolderwi + +declare -a matrixfolders=($globalmatrixfolder1:1 $globalmatrixfolder2:2 $globalmatrixfolderwi:wi) +for matrixfolder2suffix in "${matrixfolders[@]}" +do + matrixfolder="$(cut -d':' -f1 <<<"$matrixfolder2suffix")" + suffix="$(cut -d':' -f2 <<<"$matrixfolder2suffix")" + + declare -a countmatrixfolder$suffix=$matrixfolder/count + declare -a ppmimatrixfolder$suffix=$matrixfolder/ppmi + declare -a svdmatrixfolder$suffix=$matrixfolder/svd + declare -a rimatrixfolder$suffix=$matrixfolder/ri + declare -a sgnsmatrixfolder$suffix=$matrixfolder/sgns + declare -a alignedmatrixfolder$suffix=$matrixfolder/aligned + mkdir --parents $( eval "echo $"countmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"ppmimatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"svdmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"rimatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"sgnsmatrixfolder$suffix"" ) + mkdir --parents $( eval "echo $"alignedmatrixfolder$suffix"" ) +done diff --git a/scripts/run_CD.sh b/scripts/run_CD.sh new file mode 100644 index 0000000..9aada86 --- /dev/null +++ b/scripts/run_CD.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder1/!(*@(|row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python -u measures/cd.py -s "${matrix%.*}" $matrixfolder2/$(basename "${matrix%.*}") $outfolder/CD-$(basename "$testset")-$(basename "$matrix") $testset # cosine distance +done + diff --git a/scripts/run_CI.sh b/scripts/run_CI.sh new file mode 100644 index 0000000..12fd9da --- /dev/null +++ b/scripts/run_CI.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder1/!(*@(row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python -u alignment/ci_align.py $outfolder1/$(basename "${matrix%.*}")-CI $outfolder2/$(basename "${matrix%.*}")-CI "${matrix%.*}" $matrixfolder2/$(basename "${matrix%.*}") # align matrices by column intersection +done + diff --git a/scripts/run_CNT.sh b/scripts/run_CNT.sh new file mode 100644 index 0000000..d06f135 --- /dev/null +++ b/scripts/run_CNT.sh @@ -0,0 +1,5 @@ + +for windowSize in "${windowSizes[@]}" +do + python -u representations/count.py $windowSize $corpDir 
$outfolder/$(basename "$corpDir")-win$windowSize.count.sm $lowerBound $upperBound # Create count matrix +done diff --git a/scripts/run_CNT_WI.sh b/scripts/run_CNT_WI.sh new file mode 100644 index 0000000..a43feec --- /dev/null +++ b/scripts/run_CNT_WI.sh @@ -0,0 +1,5 @@ + +for windowSize in "${windowSizes[@]}" +do + python -u representations/count.py $windowSize $wiCorpDir $outfolder/$(basename "$wiCorpDir")-win$windowSize.count.sm 0000 9999 # construct count matrix for word-injected corpus +done diff --git a/scripts/run_ENTR.sh b/scripts/run_ENTR.sh new file mode 100644 index 0000000..6dc37eb --- /dev/null +++ b/scripts/run_ENTR.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder/!(*@(|row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python -u measures/entropy.py "${matrix%.*}" $outfolder/entropies-$(basename "$matrix") $testset # entropy +done + diff --git a/scripts/run_FREQ.sh b/scripts/run_FREQ.sh new file mode 100644 index 0000000..2741ca2 --- /dev/null +++ b/scripts/run_FREQ.sh @@ -0,0 +1,3 @@ + +python -u measures/freq.py $corpDir $outfolder/freq-$(basename "$corpDir") $lowerBound $upperBound $testset # token frequency + diff --git a/scripts/run_LND.sh b/scripts/run_LND.sh new file mode 100644 index 0000000..9803905 --- /dev/null +++ b/scripts/run_LND.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder1/!(*@(|row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python -u measures/lnd.py -s "${matrix%.*}" $matrixfolder2/$(basename "${matrix%.*}") 25 $outfolder/LND-$(basename "$testset")-$(basename "$matrix") $testset # local neighborhood distance +done + diff --git a/scripts/run_NENTR.sh b/scripts/run_NENTR.sh new file mode 100644 index 0000000..d769a11 --- /dev/null +++ b/scripts/run_NENTR.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder/!(*@(|row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python -u measures/entropy.py -n "${matrix%.*}" $outfolder/normalized-entropies-$(basename "$matrix") $testset # entropy normalized +done + diff --git a/scripts/run_NFREQ.sh b/scripts/run_NFREQ.sh new file mode 100644 index 0000000..0daaaf2 --- /dev/null +++ b/scripts/run_NFREQ.sh @@ -0,0 +1,3 @@ + +python -u measures/freq.py -n $norm $corpDir $outfolder/normalized-freq-$(basename "$corpDir") $lowerBound $upperBound $testset # token frequency normalized + diff --git a/scripts/run_NTYPE.sh b/scripts/run_NTYPE.sh new file mode 100644 index 0000000..1d0b0a4 --- /dev/null +++ b/scripts/run_NTYPE.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder/!(*@(|row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python -u measures/types.py -n $norm "${matrix%.*}" $outfolder/normalized-types-$(basename "$matrix") $testset # number of context types normalized +done + diff --git a/scripts/run_OP+.sh b/scripts/run_OP+.sh new file mode 100644 index 0000000..21f98b7 --- /dev/null +++ b/scripts/run_OP+.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder1/!(*@(row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python3 -u alignment/map_embeddings.py --normalize unit center unit --init_identical --whiten --src_reweight=0.5 --trg_reweight=0.5 --src_dewhiten='src' --trg_dewhiten='trg' $matrixfolder2/$(basename "$matrix") $matrix $outfolder2/$(basename "${matrix%.*}")-OP+.w2v $outfolder1/$(basename "${matrix%.*}")-OP+.w2v # align matrices by Orthogonal Procrustes plus additional pre- and post-processing steps +done + diff --git a/scripts/run_OP-.sh b/scripts/run_OP-.sh new file mode 100644 index 
0000000..d958edd --- /dev/null +++ b/scripts/run_OP-.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder1/!(*@(row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python3 -u alignment/map_embeddings.py --normalize unit --init_identical --orthogonal $matrixfolder2/$(basename "$matrix") $matrix $outfolder2/$(basename "${matrix%.*}")-OP-.w2v $outfolder1/$(basename "${matrix%.*}")-OP-.w2v # align matrices by Orthogonal Procrustes without centering +done + diff --git a/scripts/run_OP.sh b/scripts/run_OP.sh new file mode 100644 index 0000000..8431d64 --- /dev/null +++ b/scripts/run_OP.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder1/!(*@(row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python3 -u alignment/map_embeddings.py --normalize unit center --init_identical --orthogonal $matrixfolder2/$(basename "$matrix") $matrix $outfolder2/$(basename "${matrix%.*}")-OP.w2v $outfolder1/$(basename "${matrix%.*}")-OP.w2v # align matrices by Orthogonal Procrustes +done + diff --git a/scripts/run_PPMI.sh b/scripts/run_PPMI.sh new file mode 100644 index 0000000..9a421f7 --- /dev/null +++ b/scripts/run_PPMI.sh @@ -0,0 +1,10 @@ + +matrices=($matrixfolder/!(*@(row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + for k in "${ks[@]}" + do + python -u representations/ppmi.py "${matrix%.*}" $k 0.75 $outfolder/$(basename "${matrix%.*}")-k$k # weight matrix with PPMI + done +done diff --git a/scripts/run_RI.sh b/scripts/run_RI.sh new file mode 100644 index 0000000..25d07c1 --- /dev/null +++ b/scripts/run_RI.sh @@ -0,0 +1,15 @@ + +matrices=($matrixfolder/!(*@(row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + for iteration in "${iterations[@]}" + do + for t in "${ts[@]}" + do + python -u representations/ri.py -s 2 $dim $t $outfolder/$(basename "${matrix%.*}")-t$t-iter$iteration $outfolder/$(basename "${matrix%.*}")-t$t-iter$iteration-elemental-space "${matrix%.*}" # reduce matrix by random indexing + done + done +done + +rm $outfolder/*elemental-space* # delete random vectors after constructing the matrix diff --git a/scripts/run_SBTR.sh b/scripts/run_SBTR.sh new file mode 100644 index 0000000..6e49038 --- /dev/null +++ b/scripts/run_SBTR.sh @@ -0,0 +1,8 @@ + +resultfiles=($infolder1/*) + +for resultfile in "${resultfiles[@]}" +do + python -u measures/subtract.py -a $testset $resultfile $infolder2/$(basename "$resultfile") $outfolder/subtract-$(basename "${resultfile%.*}") # subtract values +done + diff --git a/scripts/run_SGNS.sh b/scripts/run_SGNS.sh new file mode 100644 index 0000000..fa61092 --- /dev/null +++ b/scripts/run_SGNS.sh @@ -0,0 +1,14 @@ + +for windowSize in "${windowSizes[@]}" +do + for k in "${ks[@]}" + do + for t in "${ts[@]}" + do + for iteration in "${iterations[@]}" + do + python -u representations/sgns.py $windowSize $dim $k $t 0 5 $corpDir $outfolder/$(basename "$corpDir")-win$windowSize-k$k-t$t-iter$iteration.sgns $lowerBound $upperBound # construct word2vec skip-gram embeddings + done + done + done +done diff --git a/scripts/run_SGNS_VI.sh b/scripts/run_SGNS_VI.sh new file mode 100644 index 0000000..c890f6b --- /dev/null +++ b/scripts/run_SGNS_VI.sh @@ -0,0 +1,15 @@ + +for windowSize in "${windowSizes[@]}" +do + for k in "${ks[@]}" + do + for t in "${ts[@]}" + do + for iteration in "${iterations[@]}" + do + python -u alignment/sgns_vi.py $infolder/$(basename "$corpDir")-win$windowSize-k$k-t$t-iter$iteration.sgns.w2v $windowSize $dim $k $t 0 5 $corpDir $outfolder/$(basename 
"$corpDir")-win$windowSize-k$k-t$t-iter$iteration\_vi.sgns $lowerBound2 $upperBound2 # construct word2vec skip-gram embeddings with vector initialization + scp $infolder/$(basename "$corpDir")-win$windowSize-k$k-t$t-iter$iteration.sgns.w2v $infolder/$(basename "$corpDir")-win$windowSize-k$k-t$t-iter$iteration\_vi.sgns.w2v # copy initialization vectors as matrix for first time period + done + done + done +done diff --git a/scripts/run_SGNS_WI.sh b/scripts/run_SGNS_WI.sh new file mode 100644 index 0000000..9963989 --- /dev/null +++ b/scripts/run_SGNS_WI.sh @@ -0,0 +1,14 @@ + +for windowSize in "${windowSizes[@]}" +do + for k in "${ks[@]}" + do + for t in "${ts[@]}" + do + for iteration in "${iterations[@]}" + do + python -u representations/sgns.py $windowSize $dim $k $t 0 5 $wiCorpDir $outfolder/$(basename "$wiCorpDir")-win$windowSize-k$k-t$t-iter$iteration 0000 9999 # construct word2vec skip-gram embeddings for word-injected corpus + done + done + done +done diff --git a/scripts/run_SPR.sh b/scripts/run_SPR.sh new file mode 100644 index 0000000..d741b27 --- /dev/null +++ b/scripts/run_SPR.sh @@ -0,0 +1,6 @@ + +for resultfile in $resultfolder/*.csv +do + declare -a resultfileshort=${resultfile#$(dirname "$(dirname "$resultfile")")/} + python -u evaluation/spearman.py $goldscorefile $resultfile $(basename "$goldscorefile") $resultfileshort 0 1 >> $outfolder/spearman_$(basename "$resultfolder").csv # evaluate results with Spearman correlation +done diff --git a/scripts/run_SRV.sh b/scripts/run_SRV.sh new file mode 100644 index 0000000..bc49475 --- /dev/null +++ b/scripts/run_SRV.sh @@ -0,0 +1,15 @@ + +matrices=($matrixfolder1/!(*@(row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + for iteration in "${iterations[@]}" + do + for t in "${ts[@]}" + do + python -u alignment/srv_align.py -s 2 $dim $t $outfolder1/$(basename "${matrix%.*}")-t$t-iter$iteration-SRV $outfolder2/$(basename "${matrix%.*}")-t$t-iter$iteration-SRV $outfolder1/$(basename "${matrix%.*}")-t$t-iter$iteration-elemental-space "${matrix%.*}" $matrixfolder2/$(basename "${matrix%.*}") # construct random indexing matrices from count matrices with shared random vectors + done + done +done + +rm $outfolder1/*elemental-space* # delete the shared random vectors after constructing the matrices diff --git a/scripts/run_SVD.sh b/scripts/run_SVD.sh new file mode 100644 index 0000000..2e7a2d6 --- /dev/null +++ b/scripts/run_SVD.sh @@ -0,0 +1,10 @@ + +matrices=($matrixfolder/!(*@(row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + for iteration in "${iterations[@]}" + do + python -u representations/svd.py "${matrix%.*}" $dim 0.0 $outfolder/$(basename "${matrix%.*}")-iter$iteration # reduce matrix by SVD + done +done diff --git a/scripts/run_TRSF.sh b/scripts/run_TRSF.sh new file mode 100644 index 0000000..9085d7c --- /dev/null +++ b/scripts/run_TRSF.sh @@ -0,0 +1,8 @@ + +resultfiles=($infolder/*) + +for resultfile in "${resultfiles[@]}" +do + python -u measures/transform.py --log2 $testset $resultfile $outfolder/transformed-$(basename "${resultfile%.*}") # log-transform values +done + diff --git a/scripts/run_TYPE.sh b/scripts/run_TYPE.sh new file mode 100644 index 0000000..7528827 --- /dev/null +++ b/scripts/run_TYPE.sh @@ -0,0 +1,8 @@ + +matrices=($matrixfolder/!(*@(|row2id*|id2row*|id2column*|column2id*))) + +for matrix in "${matrices[@]}" +do + python -u measures/types.py "${matrix%.*}" $outfolder/types-$(basename "$matrix") $testset # number of context types +done + diff --git 
a/testsets/durel/durel.tsv b/testsets/durel/durel.tsv new file mode 100644 index 0000000..c8a3433 --- /dev/null +++ b/testsets/durel/durel.tsv @@ -0,0 +1,20 @@ +Lexeme POS LSC frequency Dta18 frequency Dta19 +Vorwort NN -1.5825 85 273 +Donnerwetter NN -1.8375 100 89 +Presse NN -1.8825 193 1519 +Feine NN -1.93 112 84 +Anstalt NN -2.0725 425 911 +Feder NN -2.1403508772 1489 3022 +billig ADJ -2.4316666667 2073 1705 +Motiv NN -2.66 104 2551 +Anstellung NN -2.6789473684 53 499 +packen VV -2.7350877193 279 1057 +locker ADJ -2.84 454 769 +technisch ADJ -2.89 25 2177 +geharnischt ADJ -3 56 117 +Zufall NN -3.1125 2444 1618 +Bilanz NN -3.2 51 58 +englisch ADJ -3.3375 1921 7280 +Reichstag NN -3.4525 609 1781 +Museum NN -3.7325 414 1827 +Abend NN -3.79 4144 4372 diff --git a/testsets/durel/gold.tsv b/testsets/durel/gold.tsv new file mode 100644 index 0000000..2f6bcae --- /dev/null +++ b/testsets/durel/gold.tsv @@ -0,0 +1,19 @@ +-3.79 +-2.0725 +-2.6789473684 +-3.2 +-2.4316666667 +-1.8375 +-3.3375 +-2.1403508772 +-1.93 +-3 +-2.84 +-2.66 +-3.7325 +-2.7350877193 +-1.8825 +-3.4525 +-2.89 +-1.5825 +-3.1125 diff --git a/testsets/durel/targets.tsv b/testsets/durel/targets.tsv new file mode 100644 index 0000000..19803af --- /dev/null +++ b/testsets/durel/targets.tsv @@ -0,0 +1,19 @@ +Abend Abend +Anstalt Anstalt +Anstellung Anstellung +Bilanz Bilanz +billig billig +Donnerwetter Donnerwetter +englisch englisch +Feder Feder +Feine Feine +geharnischt geharnischt +locker locker +Motiv Motiv +Museum Museum +packen packen +Presse Presse +Reichstag Reichstag +technisch technisch +Vorwort Vorwort +Zufall Zufall diff --git a/testsets/durel/targets_wi.tsv b/testsets/durel/targets_wi.tsv new file mode 100644 index 0000000..89f8426 --- /dev/null +++ b/testsets/durel/targets_wi.tsv @@ -0,0 +1,19 @@ +Abend_ Abend +Anstalt_ Anstalt +Anstellung_ Anstellung +Bilanz_ Bilanz +billig_ billig +Donnerwetter_ Donnerwetter +englisch_ englisch +Feder_ Feder +Feine_ Feine +geharnischt_ geharnischt +locker_ locker +Motiv_ Motiv +Museum_ Museum +packen_ packen +Presse_ Presse +Reichstag_ Reichstag +technisch_ technisch +Vorwort_ Vorwort +Zufall_ Zufall diff --git a/testsets/surel/gold.tsv b/testsets/surel/gold.tsv new file mode 100644 index 0000000..ec24d45 --- /dev/null +++ b/testsets/surel/gold.tsv @@ -0,0 +1,21 @@ +-1.75 +-2.95 +-3.75 +-2.25 +-4 +-1.15 +-2.7 +-1.5294117647 +-3.4473684211 +-3.5 +-3.3333333333 +-3.1 +-3.55 +-3.7368421053 +-1.1 +-1.05 +-4 +-3.975 +-1.4166666667 +-1.05 +-2.65 diff --git a/testsets/surel/surel.tsv b/testsets/surel/surel.tsv new file mode 100644 index 0000000..81bf600 --- /dev/null +++ b/testsets/surel/surel.tsv @@ -0,0 +1,22 @@ +Lexeme POS LSC frequency SdeWaC frequency Cook +Schnee NN -1.05 2228 53 +Strudel NN -1.05 232 46 +schlagen VV -1.1 14693 309 +Gericht NN -1.15 13263 1071 +Schuß NN -1.4166666667 2153 117 +Hamburger NN -1.5294117647 5558 46 +abschrecken VV -1.75 730 170 +Form NN -2.25 36639 851 +trennen VV -2.65 5771 170 +Glas NN -2.7 3830 863 +Blech NN -2.95 409 145 +Prise NN -3.1 370 622 +Paprika NN -3.3333333333 377 453 +Mandel NN -3.4473684211 402 274 +Messer NN -3.5 1774 925 +Rum NN -3.55 244 181 +Salz NN -3.7368421053 3087 5806 +Eiweiß NN -3.75 1075 3037 +Schokolade NN -3.975 947 251 +Gemüse NN -4 2696 1224 +Schnittlauch NN -4 156 247 diff --git a/testsets/surel/targets.tsv b/testsets/surel/targets.tsv new file mode 100644 index 0000000..beb4f65 --- /dev/null +++ b/testsets/surel/targets.tsv @@ -0,0 +1,21 @@ +abschrecken abschrecken +Blech Blech +Eiweiß Eiweiß +Form Form +Gemüse Gemüse 
+Gericht Gericht +Glas Glas +Hamburger Hamburger +Mandel Mandel +Messer Messer +Paprika Paprika +Prise Prise +Rum Rum +Salz Salz +schlagen schlagen +Schnee Schnee +Schnittlauch Schnittlauch +Schokolade Schokolade +Schuß Schuß +Strudel Strudel +trennen trennen diff --git a/testsets/surel/targets_wi.tsv b/testsets/surel/targets_wi.tsv new file mode 100644 index 0000000..d58d7d2 --- /dev/null +++ b/testsets/surel/targets_wi.tsv @@ -0,0 +1,21 @@ +abschrecken_ abschrecken +Blech_ Blech +Eiweiß_ Eiweiß +Form_ Form +Gemüse_ Gemüse +Gericht_ Gericht +Glas_ Glas +Hamburger_ Hamburger +Mandel_ Mandel +Messer_ Messer +Paprika_ Paprika +Prise_ Prise +Rum_ Rum +Salz_ Salz +schlagen_ schlagen +Schnee_ Schnee +Schnittlauch_ Schnittlauch +Schokolade_ Schokolade +Schuß_ Schuß +Strudel_ Strudel +trennen_ trennen diff --git a/testsets/test/gold.tsv b/testsets/test/gold.tsv new file mode 100644 index 0000000..57c508f --- /dev/null +++ b/testsets/test/gold.tsv @@ -0,0 +1,4 @@ +1.1 +2.2 +0.5 +3.6 diff --git a/testsets/test/targets.tsv b/testsets/test/targets.tsv new file mode 100644 index 0000000..0ed987f --- /dev/null +++ b/testsets/test/targets.tsv @@ -0,0 +1,4 @@ +Gott Gott +und und +haben haben +göttlich göttlich diff --git a/testsets/test/targets_wi.tsv b/testsets/test/targets_wi.tsv new file mode 100644 index 0000000..48383c0 --- /dev/null +++ b/testsets/test/targets_wi.tsv @@ -0,0 +1,4 @@ +Gott_ Gott +und_ und +haben_ haben +göttlich_ göttlich
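
### Illustrative sketches

The snippets below are standalone sketches of individual steps from the files above; they are illustrations under stated assumptions, not replacements for the repository's scripts.

The ternary random vectors built in `representations/ri.py` (when the seed option `-s <seeds>` is used, as in `scripts/run_RI.sh` with `-s 2`) give each context word a vector with `seeds/2` entries set to +1, `seeds/2` entries set to -1, and all remaining entries 0, randomly permuted per context. A minimal sketch of that construction:

```python
import numpy as np

def ternary_random_vectors(n_contexts, dim, seeds):
    """One ternary random vector per context word, as in representations/ri.py."""
    assert seeds <= dim, "number of seeds must not be greater than the dimensionality"
    base = np.zeros(dim)
    base[:seeds // 2] = 1.0          # first half of the seeds: +1
    base[seeds // 2:seeds] = -1.0    # second half of the seeds: -1
    elemental = np.zeros((n_contexts, dim))
    for i in range(n_contexts):
        np.random.shuffle(base)      # random permutation of the +1/-1 pattern
        elemental[i] = base
    return elemental

# e.g. dim=5 and -s 2, the values used in the test pipeline
print(ternary_random_vectors(n_contexts=3, dim=5, seeds=2))
```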
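The subsampling parameter `<t>` in `representations/ri.py` damps frequent context words: when a context's relative frequency f exceeds t, its co-occurrence counts are weighted by sqrt(t/f), otherwise they are left unchanged. A toy calculation with t=0.001, one of the values in `ts=(0.001 None)`:

```python
import numpy as np

t = 0.001                                      # subsampling threshold
freqs = np.array([0.00005, 0.001, 0.01, 0.1])  # relative context frequencies
weights = np.where(freqs > t, np.sqrt(t / freqs), 1.0)
print(weights)  # [1. 1. ~0.316 ~0.1]: rare contexts untouched, frequent ones damped
```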
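The eigenvalue weighting step in `representations/svd.py` interpolates between keeping only the left singular vectors (`<gamma>` = 0, the setting used in `scripts/run_SVD.sh`) and scaling them by the full singular values (`<gamma>` = 1), following Levy et al. (2015). A minimal standalone sketch of that step:

```python
import numpy as np
from sklearn.utils.extmath import randomized_svd

def svd_reduce(matrix, dim, gamma):
    """Truncated SVD with eigenvalue weighting, mirroring representations/svd.py."""
    u, s, v = randomized_svd(matrix, n_components=dim, n_iter=5, transpose=False)
    if gamma == 0.0:
        return u                       # unweighted left singular vectors
    elif gamma == 1.0:
        return s * u                   # equivalent to np.dot(u, np.diag(s))
    else:
        return np.power(s, gamma) * u  # partial eigenvalue weighting

reduced = svd_reduce(np.random.rand(100, 50), dim=30, gamma=0.0)
print(reduced.shape)  # (100, 30)
```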
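`scripts/run_CD.sh` scores each target by the cosine distance between its vectors in the two aligned spaces. The sketch below illustrates the idea for spaces stored in word2vec text format (e.g., the `.w2v` files written by the Orthogonal Procrustes alignment); it is not the repository's `measures/cd.py`, and the paths in the usage comment are placeholders:

```python
import numpy as np
from gensim.models import KeyedVectors

def cosine_distance_scores(space1_path, space2_path, targets):
    """Cosine distance per target between two aligned word2vec spaces."""
    space1 = KeyedVectors.load_word2vec_format(space1_path)
    space2 = KeyedVectors.load_word2vec_format(space2_path)
    scores = {}
    for target in targets:
        v1, v2 = space1[target], space2[target]
        cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        scores[target] = 1.0 - cos_sim  # higher score = more change
    return scores

# hypothetical call with the test targets:
# cosine_distance_scores('space1-OP.w2v', 'space2-OP.w2v',
#                        ['Gott', 'und', 'haben', 'göttlich'])
```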
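`scripts/run_LND.sh` passes 25 as the number of neighbors to `measures/lnd.py`. One standard formulation of Local Neighborhood Distance (following Hamilton et al., 2016) compares, for each target, the vector of cosine similarities to its nearest neighbors across the two spaces; the sketch below implements that formulation and may differ in details from `measures/lnd.py`:

```python
import numpy as np
from gensim.models import KeyedVectors

def lnd_score(space1, space2, target, k=25):
    """Second-order (local neighborhood) cosine distance for one target."""
    # union of the k nearest neighbours of the target in both spaces
    neighbours = {w for w, _ in space1.most_similar(target, topn=k)}
    neighbours |= {w for w, _ in space2.most_similar(target, topn=k)}
    neighbours = [w for w in neighbours if w in space1.vocab and w in space2.vocab]
    # second-order vectors: similarity of the target to each shared neighbour
    s1 = np.array([space1.similarity(target, w) for w in neighbours])
    s2 = np.array([space2.similarity(target, w) for w in neighbours])
    cos_sim = np.dot(s1, s2) / (np.linalg.norm(s1) * np.linalg.norm(s2))
    return 1.0 - cos_sim
```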
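Finally, `scripts/run_SPR.sh` evaluates every result file against the gold scores with Spearman's rank correlation (`evaluation/spearman.py`). A minimal sketch of that comparison, using the gold values from `testsets/test/gold.tsv` and hypothetical model predictions in the same target order:

```python
from scipy.stats import spearmanr

gold = [1.1, 2.2, 0.5, 3.6]         # testsets/test/gold.tsv
predictions = [0.9, 2.5, 0.4, 3.0]  # hypothetical model scores, same target order
rho, p = spearmanr(gold, predictions)
print(rho, p)
```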