From fc42be0c8aa6246f224e1c3bdc0bb4b6a9d8491b Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Mon, 7 Nov 2022 09:37:07 +0000 Subject: [PATCH] Added Tutorial for NLP Processing using Gensim in Flyte Workflow (#911) * add tutorial for nlp Signed-off-by: Ryan Nazareth * add script and folder to cookbook Signed-off-by: Ryan Nazareth * add flytedeck Signed-off-by: Ryan Nazareth * pin pandas and profiling versions Signed-off-by: Ryan Nazareth * add docstring to script Signed-off-by: Ryan Nazareth * trigger ci Signed-off-by: Ryan Nazareth * typos Signed-off-by: Ryan Nazareth * typos Signed-off-by: Ryan Nazareth * add tutorial to panel and toc tree in rst Signed-off-by: Ryan Nazareth * add loads of descriptions Signed-off-by: Ryan Nazareth * typos and formatting Signed-off-by: Ryan Nazareth * correction to flytedeck description Signed-off-by: Ryan Nazareth * formatting and typos Signed-off-by: Ryan Nazareth * add requested changes to description and other bits Signed-off-by: Ryan Nazareth * formatting and add gitignore Signed-off-by: Ryan Nazareth * add typing to plotting task Signed-off-by: Ryan Nazareth * add in requested changes and add sklearn to requirements.in Signed-off-by: Ryan Nazareth * few more Signed-off-by: Ryan Nazareth * run pip-compile again to correct relative path to requirements-common.in Signed-off-by: Ryan Nazareth * bump resource for tasks that errored in flyte console Signed-off-by: Ryan Nazareth * whitespace Signed-off-by: Ryan Nazareth * switch from np to flyte supported types and model_ser.download Signed-off-by: Ryan Nazareth * add support for plotly and disable deck in task Signed-off-by: Ryan Nazareth * return output for word similarity and plotly layout size adjustment Signed-off-by: Ryan Nazareth * remove returned output from word sim task Signed-off-by: Ryan Nazareth * remove type and also output in wmd Signed-off-by: Ryan Nazareth * add workflow outputs and modify comments Signed-off-by: Ryan Nazareth * fix typing for return value word sim task Signed-off-by: Ryan Nazareth Signed-off-by: Ryan Nazareth Co-authored-by: Samhita Alla --- .../ml_training/nlp_processing/Dockerfile | 53 +++ .../ml_training/nlp_processing/Makefile | 3 + .../ml_training/nlp_processing/README.rst | 39 ++ .../ml_training/nlp_processing/__init__.py | 0 .../nlp_processing/requirements.in | 7 + .../nlp_processing/requirements.txt | 312 ++++++++++++++++ .../ml_training/nlp_processing/sandbox.config | 2 + .../nlp_processing/word2vec_and_lda.py | 338 ++++++++++++++++++ cookbook/docs/conf.py | 3 + cookbook/docs/ml_training.rst | 10 + 10 files changed, 767 insertions(+) create mode 100644 cookbook/case_studies/ml_training/nlp_processing/Dockerfile create mode 100644 cookbook/case_studies/ml_training/nlp_processing/Makefile create mode 100644 cookbook/case_studies/ml_training/nlp_processing/README.rst create mode 100644 cookbook/case_studies/ml_training/nlp_processing/__init__.py create mode 100644 cookbook/case_studies/ml_training/nlp_processing/requirements.in create mode 100644 cookbook/case_studies/ml_training/nlp_processing/requirements.txt create mode 100644 cookbook/case_studies/ml_training/nlp_processing/sandbox.config create mode 100644 cookbook/case_studies/ml_training/nlp_processing/word2vec_and_lda.py diff --git a/cookbook/case_studies/ml_training/nlp_processing/Dockerfile b/cookbook/case_studies/ml_training/nlp_processing/Dockerfile new file mode 100644 index 0000000000..b4d87cbb05 --- /dev/null +++ b/cookbook/case_studies/ml_training/nlp_processing/Dockerfile @@ -0,0 +1,53 @@ +FROM 
ubuntu:focal
+
+WORKDIR /root
+ENV VENV /opt/venv
+ENV LANG C.UTF-8
+ENV LC_ALL C.UTF-8
+ENV PYTHONPATH /root
+
+RUN : \
+    && apt-get update \
+    && apt install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN : \
+    && apt-get update \
+    && apt-get install -y python3.8 python3-pip python3-venv make build-essential libssl-dev curl vim
+
+# This is necessary for opencv to work
+RUN apt-get update && apt-get install -y libsm6 libxext6 libxrender-dev ffmpeg
+
+# Install the AWS cli separately to prevent issues with boto being written over
+RUN pip3 install awscli
+
+WORKDIR /opt
+RUN curl https://sdk.cloud.google.com > install.sh
+RUN bash /opt/install.sh --install-dir=/opt
+ENV PATH $PATH:/opt/google-cloud-sdk/bin
+WORKDIR /root
+
+# Virtual environment
+ENV VENV /opt/venv
+RUN python3 -m venv ${VENV}
+ENV PATH="${VENV}/bin:$PATH"
+
+# Install Python dependencies
+COPY nlp_processing/requirements.txt /root
+RUN ${VENV}/bin/pip install -r /root/requirements.txt
+
+# Copy the makefile targets to expose on the container. This makes it easier to register.
+COPY in_container.mk /root/Makefile
+COPY nlp_processing/sandbox.config /root
+
+# Copy the actual code
+COPY nlp_processing/ /root/nlp_processing/
+
+# Copy over the helper script that the SDK relies on
+RUN cp ${VENV}/bin/flytekit_venv /usr/local/bin/
+RUN chmod a+x /usr/local/bin/flytekit_venv
+
+# This tag is supplied by the build script and will be used to determine the version
+# when registering tasks, workflows, and launch plans
+ARG tag
+ENV FLYTE_INTERNAL_IMAGE $tag
diff --git a/cookbook/case_studies/ml_training/nlp_processing/Makefile b/cookbook/case_studies/ml_training/nlp_processing/Makefile
new file mode 100644
index 0000000000..799d03eaa5
--- /dev/null
+++ b/cookbook/case_studies/ml_training/nlp_processing/Makefile
@@ -0,0 +1,3 @@
+PREFIX=nlp_processing
+include ../../../common/common.mk
+include ../../../common/leaf.mk
diff --git a/cookbook/case_studies/ml_training/nlp_processing/README.rst b/cookbook/case_studies/ml_training/nlp_processing/README.rst
new file mode 100644
index 0000000000..686a9f1ee7
--- /dev/null
+++ b/cookbook/case_studies/ml_training/nlp_processing/README.rst
@@ -0,0 +1,39 @@
+NLP Processing
+--------------
+
+This tutorial will demonstrate how to process text data and generate word embeddings and visualizations
+as part of a Flyte workflow. It's an adaptation of the official Gensim `Word2Vec tutorial `__.
+
+
+About Gensim
+============
+
+Gensim is a popular open-source natural language processing (NLP) library used to process
+large corpora (which can be larger than RAM).
+It provides efficient multicore implementations of a number of algorithms, such as `Latent Semantic Analysis `__, `Latent Dirichlet Allocation (LDA) `__,
+and `Word2Vec deep learning `__, to perform complex tasks including understanding
+document relationships, topic modelling, learning word embeddings, and more.
+
+You can read more about Gensim `here `__.
+
+
+Data
+====
+
+The dataset used for this tutorial is the open-source `Lee Background Corpus `__
+that comes with the Gensim library.
+
+
+Step-by-Step Process
+====================
+
+The following steps outline the modelling process:
+
+- Generate a preprocessed (tokenized, stop words removed, lemmatized) corpus from the custom iterator.
+- Train the Word2Vec model on the preprocessed corpus.
+- Generate a bag of words from the corpus and train the LDA model.
+- Save the LDA and Word2Vec models to disk.
+- Deserialize the Word2Vec model, run word similarity queries, and compute Word Mover's Distance.
+- Reduce the dimensionality (using t-SNE) and plot the word embeddings.
+
+Let's dive into the code!
diff --git a/cookbook/case_studies/ml_training/nlp_processing/__init__.py b/cookbook/case_studies/ml_training/nlp_processing/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cookbook/case_studies/ml_training/nlp_processing/requirements.in b/cookbook/case_studies/ml_training/nlp_processing/requirements.in
new file mode 100644
index 0000000000..da66df9954
--- /dev/null
+++ b/cookbook/case_studies/ml_training/nlp_processing/requirements.in
@@ -0,0 +1,7 @@
+-r ../../../common/requirements-common.in
+numpy
+gensim
+nltk
+plotly
+pyemd
+scikit-learn
diff --git a/cookbook/case_studies/ml_training/nlp_processing/requirements.txt b/cookbook/case_studies/ml_training/nlp_processing/requirements.txt
new file mode 100644
index 0000000000..219540c82e
--- /dev/null
+++ b/cookbook/case_studies/ml_training/nlp_processing/requirements.txt
@@ -0,0 +1,312 @@
+#
+# This file is autogenerated by pip-compile with python 3.9
+# To update, run:
+#
+#    pip-compile requirements.in
+#
+arrow==1.2.2
+    # via jinja2-time
+attrs==22.1.0
+    # via visions
+binaryornot==0.4.4
+    # via cookiecutter
+certifi==2021.10.8
+    # via requests
+cffi==1.15.1
+    # via cryptography
+chardet==4.0.0
+    # via binaryornot
+charset-normalizer==2.0.12
+    # via requests
+click==8.1.2
+    # via
+    #   cookiecutter
+    #   flytekit
+    #   nltk
+cloudpickle==2.0.0
+    # via flytekit
+cookiecutter==2.1.1
+    # via flytekit
+croniter==1.3.4
+    # via flytekit
+cryptography==38.0.1
+    # via pyopenssl
+cycler==0.11.0
+    # via matplotlib
+dataclasses-json==0.5.7
+    # via flytekit
+decorator==5.1.1
+    # via retry
+deprecated==1.2.13
+    # via flytekit
+diskcache==5.4.0
+    # via flytekit
+docker==5.0.3
+    # via flytekit
+docker-image-py==0.1.12
+    # via flytekit
+docstring-parser==0.13
+    # via flytekit
+flyteidl==1.1.14
+    # via flytekit
+flytekit==1.1.1
+    # via
+    #   -r ../../../common/requirements-common.in
+    #   flytekitplugins-deck-standard
+flytekitplugins-deck-standard==1.1.1
+    # via -r ../../../common/requirements-common.in
+fonttools==4.32.0
+    # via matplotlib
+gensim==4.2.0
+    # via -r requirements.in
+googleapis-common-protos==1.56.0
+    # via
+    #   flyteidl
+    #   grpcio-status
+grpcio==1.44.0
+    # via
+    #   flytekit
+    #   grpcio-status
+grpcio-status==1.44.0
+    # via flytekit
+htmlmin==0.1.12
+    # via pandas-profiling
+idna==3.3
+    # via requests
+imagehash==4.3.1
+    # via visions
+importlib-metadata==4.11.3
+    # via
+    #   flytekit
+    #   keyring
+    #   markdown
+jinja2==3.1.1
+    # via
+    #   cookiecutter
+    #   jinja2-time
+    #   pandas-profiling
+jinja2-time==0.2.0
+    # via cookiecutter
+joblib==1.1.0
+    # via
+    #   nltk
+    #   pandas-profiling
+    #   phik
+    #   scikit-learn
+keyring==23.5.0
+    # via flytekit
+kiwisolver==1.4.2
+    # via matplotlib
+markdown==3.4.1
+    # via flytekitplugins-deck-standard
+markupsafe==2.1.1
+    # via jinja2
+marshmallow==3.15.0
+    # via
+    #   dataclasses-json
+    #   marshmallow-enum
+    #   marshmallow-jsonschema
+marshmallow-enum==1.5.1
+    # via dataclasses-json
+marshmallow-jsonschema==0.13.0
+    # via flytekit
+matplotlib==3.5.1
+    # via
+    #   -r ../../../common/requirements-common.in
+    #   missingno
+    #   pandas-profiling
+    #   phik
+    #   seaborn
+missingno==0.5.1
+    # via pandas-profiling
+multimethod==1.8
+    # via
+    #   pandas-profiling
+    #   visions
+mypy-extensions==0.4.3
+    # via typing-inspect
+natsort==8.1.0
+    # via flytekit
+networkx==2.8.7
+    # via visions
+nltk==3.7
+    # via -r requirements.in
+numpy==1.22.3 + # via + # -r requirements.in + # gensim + # imagehash + # matplotlib + # missingno + # pandas + # pandas-profiling + # patsy + # phik + # pyarrow + # pyemd + # scikit-learn + # scipy + # seaborn + # statsmodels + # visions +packaging==21.3 + # via + # marshmallow + # matplotlib + # statsmodels +pandas==1.4.2 + # via + # flytekit + # pandas-profiling + # phik + # seaborn + # statsmodels + # visions +pandas-profiling==3.3.0 + # via flytekitplugins-deck-standard +patsy==0.5.3 + # via statsmodels +phik==0.12.2 + # via pandas-profiling +pillow==9.1.0 + # via + # imagehash + # matplotlib + # visions +plotly==5.10.0 + # via + # -r requirements.in + # flytekitplugins-deck-standard +protobuf==3.20.0 + # via + # flyteidl + # flytekit + # googleapis-common-protos + # grpcio-status + # protoc-gen-swagger +protoc-gen-swagger==0.1.0 + # via flyteidl +py==1.11.0 + # via retry +pyarrow==6.0.1 + # via flytekit +pycparser==2.21 + # via cffi +pydantic==1.9.2 + # via pandas-profiling +pyemd==0.5.1 + # via -r requirements.in +pyopenssl==22.0.0 + # via flytekit +pyparsing==3.0.8 + # via + # matplotlib + # packaging +python-dateutil==2.8.2 + # via + # arrow + # croniter + # flytekit + # matplotlib + # pandas +python-json-logger==2.0.2 + # via flytekit +python-slugify==6.1.1 + # via cookiecutter +pytimeparse==1.1.8 + # via flytekit +pytz==2022.1 + # via + # flytekit + # pandas +pywavelets==1.3.0 + # via imagehash +pyyaml==6.0 + # via + # cookiecutter + # flytekit + # pandas-profiling +regex==2022.3.15 + # via + # docker-image-py + # nltk +requests==2.27.1 + # via + # cookiecutter + # docker + # flytekit + # pandas-profiling + # responses +responses==0.20.0 + # via flytekit +retry==0.9.2 + # via flytekit +scikit-learn==1.1.3 + # via -r requirements.in +scipy==1.8.0 + # via + # gensim + # imagehash + # missingno + # pandas-profiling + # phik + # scikit-learn + # seaborn + # statsmodels +seaborn==0.11.2 + # via + # missingno + # pandas-profiling +six==1.16.0 + # via + # grpcio + # patsy + # python-dateutil +smart-open==6.2.0 + # via gensim +sortedcontainers==2.4.0 + # via flytekit +statsd==3.3.0 + # via flytekit +statsmodels==0.13.2 + # via pandas-profiling +tangled-up-in-unicode==0.2.0 + # via + # pandas-profiling + # visions +tenacity==8.1.0 + # via plotly +text-unidecode==1.3 + # via python-slugify +threadpoolctl==3.1.0 + # via scikit-learn +tqdm==4.64.1 + # via + # nltk + # pandas-profiling +typing-extensions==4.1.1 + # via + # flytekit + # pydantic + # typing-inspect +typing-inspect==0.7.1 + # via dataclasses-json +urllib3==1.26.9 + # via + # flytekit + # requests + # responses +visions[type_image_path]==0.7.5 + # via pandas-profiling +websocket-client==1.4.1 + # via docker +wheel==0.37.1 + # via + # -r ../../../common/requirements-common.in + # flytekit +wrapt==1.14.0 + # via + # deprecated + # flytekit +zipp==3.8.0 + # via importlib-metadata diff --git a/cookbook/case_studies/ml_training/nlp_processing/sandbox.config b/cookbook/case_studies/ml_training/nlp_processing/sandbox.config new file mode 100644 index 0000000000..43ae3cfea5 --- /dev/null +++ b/cookbook/case_studies/ml_training/nlp_processing/sandbox.config @@ -0,0 +1,2 @@ +[sdk] +workflow_packages=nlp_processing diff --git a/cookbook/case_studies/ml_training/nlp_processing/word2vec_and_lda.py b/cookbook/case_studies/ml_training/nlp_processing/word2vec_and_lda.py new file mode 100644 index 0000000000..4004a1324a --- /dev/null +++ b/cookbook/case_studies/ml_training/nlp_processing/word2vec_and_lda.py @@ -0,0 +1,338 @@ +""" +.. 
_word2vec_and_lda:
+
+Word Embeddings and Topic Modelling with Gensim
+-----------------------------------------------
+
+This example creates six Flyte tasks that:
+
+1. Generate the sample dataset.
+2. Train the Word2Vec model.
+3. Train the LDA model and display the words per topic.
+4. Compute word similarities.
+5. Compute Word Mover's Distance.
+6. Reduce dimensions using t-SNE and generate a plot using Flyte Deck.
+
+"""
+
+# %%
+# First, we import the necessary libraries.
+import logging
+import os
+import random
+import typing
+from dataclasses import dataclass
+from typing import Dict, List
+
+import flytekit
+import gensim
+import nltk
+import numpy as np
+import plotly.graph_objects as go
+import plotly.io as io
+from dataclasses_json import dataclass_json
+from flytekit import Resources, task, workflow
+from flytekit.types.file import FlyteFile
+from gensim import utils
+from gensim.corpora import Dictionary
+from gensim.models import LdaModel, Word2Vec
+from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
+from gensim.test.utils import datapath
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import RegexpTokenizer
+from sklearn.manifold import TSNE
+
+logger = logging.getLogger(__file__)
+
+
+# %%
+# We define the output file type.
+MODELSER_NLP = typing.TypeVar("model")
+model_file = typing.NamedTuple("ModelFile", model=FlyteFile[MODELSER_NLP])
+
+# %%
+# Next, we define the path to the Lee corpus dataset (installed with Gensim).
+data_dir = os.path.join(gensim.__path__[0], "test", "test_data")
+lee_train_file = os.path.join(data_dir, "lee_background.cor")
+
+
+# %%
+# We declare ``NamedTuple``s which will be used as signatures of the Flyte task outputs.
+# The variable names and types correspond to the values of the unpacked tuples returned
+# from the corresponding Flyte task.
+plotdata = typing.NamedTuple(
+    "PlottingData",
+    x_values=List[float],
+    y_values=List[float],
+    labels=List[str],
+)
+
+
+workflow_outputs = typing.NamedTuple(
+    "WorkflowOutputs",
+    simwords=Dict[str, float],
+    distance=float,
+    topics=Dict[int, List[str]],
+)
+
+
+# %%
+# We sample sentences of similar contexts to compare using the trained model.
+SENTENCE_A = "Australian cricket captain has supported fast bowler"
+SENTENCE_B = "Fast bowler received support from cricket captain"
+
+
+# %%
+# Data Generation
+# ===============
+#
+# The data pre-processor implements the following steps:
+#
+# 1. Turns all words to lowercase and removes stopwords.
+# 2. Splits the document into tokens using a regular expression tokenizer from NLTK.
+# 3. Removes numeric single-character tokens as they do not tend to be useful, and the dataset contains a lot of them.
+# 4. Uses the WordNet lemmatizer from NLTK and returns a list of lemmatized tokens.
+def pre_processing(line: str) -> List[str]:
+    tokenizer = RegexpTokenizer(r"\w+")
+    tokens = tokenizer.tokenize(remove_stopwords(line.lower()))
+    lemmatizer = WordNetLemmatizer()
+    return [lemmatizer.lemmatize(token) for token in tokens]
+
+
+# %%
+# Now, we implement an iterator that calls the ``pre_processing`` function on each input sentence from the corpus
+# and yields the processed results.
+class MyCorpus:
+    """An iterator that yields sentences (lists of str)."""
+
+    def __init__(self, path):
+        self.corpus_path = datapath(path)
+
+    def __iter__(self):
+        for line in open(self.corpus_path):
+            yield pre_processing(line)
+
+
+# %%
+# We define a Flyte task to generate the processed corpus containing a list of tokenized sentence lists.
+@task
+def generate_processed_corpus() -> List[List[str]]:
+    # download the required packages from the nltk library
+    nltk.download("wordnet")
+    nltk.download("omw-1.4")
+    sentences_train = MyCorpus(lee_train_file)
+    train_corpus = list(sentences_train)
+    return train_corpus
+
+
+# %%
+# Hyperparameters
+# ===============
+#
+# Next, we create a dataclass comprising Word2Vec hyperparameters:
+#
+# - ``min_count``: Prunes the dictionary and removes low-frequency words.
+# - ``vector_size``: Number of dimensions (N) of the N-dimensional space that gensim Word2Vec maps the words onto.
+#   Bigger size values require more training data but can lead to better (more accurate) models.
+# - ``workers``: Number of worker threads used to parallelize and speed up training.
+# - ``compute_loss``: Toggles computation of the loss while training the Word2Vec model.
+@dataclass_json
+@dataclass
+class Word2VecModelHyperparams(object):
+    """
+    Hyperparameters that can be used while training the Word2Vec model.
+    """
+
+    vector_size: int = 200
+    min_count: int = 1
+    workers: int = 4
+    compute_loss: bool = True
+
+
+# %%
+# LDA needs a similar dataclass:
+#
+# - ``num_topics``: The number of topics to be extracted from the training corpus.
+# - ``alpha``: A-priori belief on document-topic distribution. In ``auto`` mode, the model learns this from the data.
+# - ``passes``: Controls how often the model is trained on the entire corpus, i.e., the number of epochs.
+# - ``chunksize``: Controls how many documents are processed at a time in the training algorithm. Increasing the
+#   chunk size speeds up training, at least as long as the chunk of documents easily fits into memory.
+# - ``update_every``: Number of documents to be iterated through for each update.
+# - ``random_state``: Seed for reproducibility.
+@dataclass_json
+@dataclass
+class LDAModelHyperparams(object):
+    """
+    Hyperparameters that can be used while training the LDA model.
+    """
+
+    num_topics: int = 5
+    alpha: str = "auto"
+    passes: int = 10
+    chunksize: int = 100
+    update_every: int = 1
+    random_state: int = 100
+
+
+# %%
+# Training
+# ========
+#
+# We initialize and train a Word2Vec model on the preprocessed corpus.
+@task
+def train_word2vec_model(
+    training_data: List[List[str]], hyperparams: Word2VecModelHyperparams
+) -> model_file:
+
+    model = Word2Vec(
+        training_data,
+        min_count=hyperparams.min_count,
+        workers=hyperparams.workers,
+        vector_size=hyperparams.vector_size,
+        compute_loss=hyperparams.compute_loss,
+    )
+    training_loss = model.get_latest_training_loss()
+    logger.info(f"training loss: {training_loss}")
+    out_path = os.path.join(
+        flytekit.current_context().working_directory, "word2vec.model"
+    )
+    model.save(out_path)
+    return (out_path,)
+
+
+# %%
+# Next, we transform the documents to a vectorized form and compute the frequency of each word to generate a
+# bag-of-words corpus for the LDA model to train on. We also create a mapping from word IDs to words to pass
+# as an input to the LDA model for training.
+@task
+def train_lda_model(
+    corpus: List[List[str]], hyperparams: LDAModelHyperparams
+) -> Dict[int, List[str]]:
+    id2word = Dictionary(corpus)
+    bow_corpus = [id2word.doc2bow(doc) for doc in corpus]
+    id_words = [[(id2word[id], count) for id, count in line] for line in bow_corpus]
+    logger.info(f"Sample of bag of words generated: {id_words[:2]}")
+    lda = LdaModel(
+        corpus=bow_corpus,
+        id2word=id2word,
+        num_topics=hyperparams.num_topics,
+        alpha=hyperparams.alpha,
+        passes=hyperparams.passes,
+        chunksize=hyperparams.chunksize,
+        update_every=hyperparams.update_every,
+        random_state=hyperparams.random_state,
+    )
+    return dict(lda.show_topics(num_words=5))
+
+
+# %%
+# Word Similarities
+# =================
+#
+# We deserialize the model from disk and compute the top 10 most similar
+# words to the given word in the corpus (we will use the word ``computer`` when
+# running the workflow to output similar words). Note that since the model is
+# trained on a small corpus, some of the relations might not be clear.
+@task(cache_version="1.0", cache=True, limits=Resources(mem="600Mi"))
+def word_similarities(
+    model_ser: FlyteFile[MODELSER_NLP], word: str
+) -> Dict[str, float]:
+    model = Word2Vec.load(model_ser.download())
+    wv = model.wv
+    logger.info(f"Word vector for {word}:{wv[word]}")
+    return dict(wv.most_similar(word, topn=10))
+
+
+# %%
+# Sentence Similarity
+# ===================
+#
+# We compute Word Mover's Distance (WMD) using the trained word embeddings.
+# This enables us to assess the distance between two documents in a meaningful way even when they have
+# no words in common.
+# WMD outputs a large value for two completely unrelated sentences and a small value for two closely related
+# sentences.
+# Since we chose two similar sentences for comparison, the Word Mover's Distance
+# should be small. You can try altering either the ``SENTENCE_A`` or ``SENTENCE_B`` variable to be dissimilar
+# to the other sentence, and check whether the computed value is larger.
+@task(cache_version="1.0", cache=True, limits=Resources(mem="600Mi"))
+def word_movers_distance(model_ser: FlyteFile[MODELSER_NLP]) -> float:
+    sentences = [SENTENCE_A, SENTENCE_B]
+    results = []
+    for sentence in sentences:
+        result = [w for w in utils.tokenize(sentence) if w not in STOPWORDS]
+        results.append(result)
+    model = Word2Vec.load(model_ser.download())
+    logger.info(f"Computing Word Mover's Distance for: {SENTENCE_A} and {SENTENCE_B}")
+    return model.wv.wmdistance(*results)
+
+
+# %%
+# Dimensionality Reduction and Plotting
+# =====================================
+#
+# The word embeddings learned by the model can be visualized after reducing their dimensionality to two with t-SNE.
+# This task can take a few minutes to complete.
+@task(cache_version="1.0", cache=True, limits=Resources(mem="1000Mi"))
+def dimensionality_reduction(model_ser: FlyteFile[MODELSER_NLP]) -> plotdata:
+    model = Word2Vec.load(model_ser.download())
+    num_dimensions = 2
+    vectors = np.asarray(model.wv.vectors)
+    labels = np.asarray(model.wv.index_to_key)
+    logger.info("Running dimensionality reduction using t-SNE")
+    tsne = TSNE(n_components=num_dimensions, random_state=0)
+    vectors = tsne.fit_transform(vectors)
+    x_vals = [float(v[0]) for v in vectors]
+    y_vals = [float(v[1]) for v in vectors]
+    labels = [str(label) for label in labels]
+    return x_vals, y_vals, labels
+
+
+@task(
+    cache_version="1.0", cache=True, limits=Resources(mem="600Mi"), disable_deck=False
+)
+def plot_with_plotly(x: List[float], y: List[float], labels: List[str]):
+    layout = go.Layout(height=600, width=800)
+    fig = go.Figure(
+        data=go.Scattergl(x=x, y=y, mode="markers", marker=dict(color="aqua")),
+        layout=layout,
+    )
+    indices = list(range(len(labels)))
+    selected_indices = random.sample(indices, 50)
+    for i in selected_indices:
+        fig.add_annotation(
+            text=labels[i],
+            x=x[i],
+            y=y[i],
+            showarrow=False,
+            font=dict(size=15, color="black", family="Sans Serif"),
+        )
+    logger.info("Generating the word embedding plot using Flyte Deck")
+    flytekit.Deck("Word Embeddings", io.to_html(fig, full_html=True))
+
+
+# %%
+# Running the Workflow
+# ====================
+#
+# Let's kick off the workflow! It returns the inference outputs of both Gensim models:
+# the similar words, the Word Mover's Distance, and the LDA topics.
+@workflow
+def nlp_workflow(target_word: str = "computer") -> workflow_outputs:
+    corpus = generate_processed_corpus()
+    model_wv = train_word2vec_model(
+        training_data=corpus, hyperparams=Word2VecModelHyperparams()
+    )
+    lda_topics = train_lda_model(corpus=corpus, hyperparams=LDAModelHyperparams())
+    similar_words = word_similarities(model_ser=model_wv.model, word=target_word)
+    distance = word_movers_distance(model_ser=model_wv.model)
+    axis_labels = dimensionality_reduction(model_ser=model_wv.model)
+    plot_with_plotly(
+        x=axis_labels.x_values, y=axis_labels.y_values, labels=axis_labels.labels
+    )
+    return similar_words, distance, lda_topics
+
+
+if __name__ == "__main__":
+    print(f"Running {__file__} main...")
+    print(nlp_workflow())
diff --git a/cookbook/docs/conf.py b/cookbook/docs/conf.py
index 57bf8eccd9..ab3800028f 100644
--- a/cookbook/docs/conf.py
+++ b/cookbook/docs/conf.py
@@ -149,6 +149,7 @@ class CustomSorter(FileNameSortKey):
     "house_price_predictor.py",
     "multiregion_house_price_predictor.py",
     "keras_spark_rossmann_estimator.py",
+    "word2vec_and_lda.py",
     ## Feature Engineering
     "pytorch_single_node_and_gpu.py",
     "pytorch_single_node_multi_gpu.py",
@@ -270,6 +271,7 @@ def __call__(self, filename):
     "../case_studies/ml_training/pima_diabetes",
     "../case_studies/ml_training/house_price_prediction",
     "../case_studies/ml_training/mnist_classifier",
+    "../case_studies/ml_training/nlp_processing",
     "../case_studies/ml_training/spark_horovod",
     "../case_studies/feature_engineering/eda",
     "../case_studies/feature_engineering/feast_integration",
@@ -311,6 +313,7 @@ def __call__(self, filename):
     "auto/case_studies/ml_training/pima_diabetes",
     "auto/case_studies/ml_training/house_price_prediction",
     "auto/case_studies/ml_training/mnist_classifier",
+    "auto/case_studies/ml_training/nlp_processing",
     "auto/case_studies/ml_training/spark_horovod",
     "auto/case_studies/feature_engineering/eda",
     "auto/case_studies/feature_engineering/feast_integration",
diff --git a/cookbook/docs/ml_training.rst 
b/cookbook/docs/ml_training.rst
index 0cef6fc700..3027b82a48 100644
--- a/cookbook/docs/ml_training.rst
+++ b/cookbook/docs/ml_training.rst
@@ -35,6 +35,15 @@ Understand how machine learning models can be trained from within Flyte, with an
 
     ---
 
+    .. link-button:: auto/case_studies/ml_training/nlp_processing/index
+       :type: ref
+       :text: NLP Processing with Gensim
+       :classes: btn-block stretched-link
+    ^^^^^^^^^^^^
+    Word embeddings and topic modelling on the Lee Background Corpus with Gensim
+
+    ---
+
     .. link-button:: auto/case_studies/ml_training/spark_horovod/index
        :type: ref
        :text: Forecast Sales Using Rossmann Store Sales Data with Horovod and Spark
@@ -51,6 +60,7 @@ Understand how machine learning models can be trained from within Flyte, with an
 
     auto/case_studies/ml_training/pima_diabetes/index
     auto/case_studies/ml_training/house_price_prediction/index
     auto/case_studies/ml_training/mnist_classifier/index
+    auto/case_studies/ml_training/nlp_processing/index
     auto/case_studies/ml_training/spark_horovod/index
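
To try the new example locally before registering it on a cluster, the workflow can be invoked as a
plain Python callable. The sketch below mirrors the module's ``__main__`` block, with the outputs
unpacked; it assumes the pinned environment from requirements.txt is installed and that NLTK can
reach the network to download its corpora:

    # Local smoke test for the new example (a sketch, assuming the pinned
    # requirements.txt environment and network access for nltk.download).
    from nlp_processing.word2vec_and_lda import nlp_workflow

    # Outside a Flyte cluster, the @workflow function executes eagerly and its
    # WorkflowOutputs NamedTuple unpacks like a regular tuple.
    simwords, distance, topics = nlp_workflow(target_word="computer")

    print("Top 10 words similar to 'computer':", simwords)
    print("Word Mover's Distance between the two sample sentences:", distance)
    print("LDA topics:", topics)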