standalone_document_embedding_demo.py
#!/usr/bin/env python3
"""
Copyright 2019 Brian Thompson
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This is a standalone example of creating a document vector from sentence vectors
following https://aclanthology.org/2020.emnlp-main.483
"""
import numpy as np
from mcerp import PERT # pip install mcerp # see https://github.com/tisimst/mcerp/blob/master/mcerp/__init__.py
NUM_TIME_SLOTS = 16
PERT_G = 20
# PERT is very slow (50ms per distribution) so we cache a bank of PERT distributions
_num_banks = 100
_xx = np.linspace(start=0, stop=1, num=NUM_TIME_SLOTS)
PERT_BANKS = []
for _pp in np.linspace(0, 1, num=_num_banks):
    if _pp == 0.5:  # some special case that makes g do nothing
        _pp += 0.001
    pert = PERT(low=-0.001, peak=_pp, high=1.001, g=PERT_G, tag=None)
    _yy = pert.rv.pdf(_xx)
    _yy = _yy / sum(_yy)  # normalize
    PERT_BANKS.append(_yy)
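# Each entry of PERT_BANKS is a pdf over the NUM_TIME_SLOTS time slots (normalized to sum to 1),
# peaked near the corresponding _pp; it is used below as the positional weighting for a sentence
# located at relative position _pp in the document.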
np.set_printoptions(threshold=50, precision=5)
def build_doc_embedding(sent_vecs, sent_counts):
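    """Build a single document embedding from a document's sentence embeddings.

    Shapes below are inferred from this demo:
      sent_vecs: np.ndarray of shape (num_sentences, emb_dim), one embedding per sentence.
      sent_counts: array-like of length num_sentences, the corpus frequency of each sentence;
        more frequent sentences receive proportionally less weight.

    Returns a 1-D np.ndarray of length emb_dim * NUM_TIME_SLOTS, scaled to (approximately) unit L2 norm.
    """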
    # ensure sentence counts are >= 1
    sent_counts = np.clip(sent_counts, a_min=1, a_max=None)
    # scale each sent vec by 1/count
    sent_weights = 1.0 / np.array(sent_counts)
    scaled_sent_vecs = np.multiply(sent_vecs.T, sent_weights).T
    # equally space sentences
    sent_centers = np.linspace(0, 1, len(scaled_sent_vecs))
    # find weighting for each sentence, for each time slot
    sentence_loc_weights = np.zeros((len(sent_centers), NUM_TIME_SLOTS))
    for sent_ii, p in enumerate(sent_centers):
        bank_idx = int(p * (len(PERT_BANKS) - 1))  # find the nearest cached pert distribution
        sentence_loc_weights[sent_ii, :] = PERT_BANKS[bank_idx]
    # make each chunk vector
    doc_chunk_vec = np.matmul(scaled_sent_vecs.T, sentence_loc_weights).T
    # concatenate chunk vectors into a single vector for the full document
    doc_vec = doc_chunk_vec.flatten()
    # normalize document vector
    doc_vec = doc_vec / (np.linalg.norm(doc_vec) + 1e-5)
    return doc_vec
def demo():
    # Replace sent_vecs with LASER/LaBSE/etc. embeddings of each sentence in your document,
    # after projecting the sentence embeddings into a lower-dimensional space using something like PCA (see paper for details).
    sent_emb_size = 32  # Document embedding size will be sent_emb_size * NUM_TIME_SLOTS
    n_sents = 7
    sent_vecs = np.random.rand(n_sents, sent_emb_size) - 0.5
    # Replace sent_counts with the number of times each sentence has been seen in your corpus.
    sent_counts = np.random.randint(low=1, high=50, size=n_sents)
    doc_emb = build_doc_embedding(sent_vecs, sent_counts)
    print('Document Embedding:', doc_emb)
    print('Document Embedding Size:', doc_emb.shape)
if __name__ == '__main__':
    demo()
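# With the demo settings above, the printed embedding has shape (512,), i.e. sent_emb_size (32) * NUM_TIME_SLOTS (16);
# the exact values vary from run to run because sent_vecs and sent_counts are randomly generated.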