From fc42be0c8aa6246f224e1c3bdc0bb4b6a9d8491b Mon Sep 17 00:00:00 2001 From: Ryan Nazareth Date: Mon, 7 Nov 2022 09:37:07 +0000 Subject: [PATCH] Added Tutorial for NLP Processing using Gensim in Flyte Workflow (#911) * add tutorial for nlp Signed-off-by: Ryan Nazareth * add script and folder to cookbook Signed-off-by: Ryan Nazareth * add flytedeck Signed-off-by: Ryan Nazareth * pin pandas and profiling versions Signed-off-by: Ryan Nazareth * add docstring to script Signed-off-by: Ryan Nazareth * trigger ci Signed-off-by: Ryan Nazareth * typos Signed-off-by: Ryan Nazareth * typos Signed-off-by: Ryan Nazareth * add tutorial to panel and toc tree in rst Signed-off-by: Ryan Nazareth * add loads of descriptions Signed-off-by: Ryan Nazareth * typos and formatting Signed-off-by: Ryan Nazareth * correction to flytedeck description Signed-off-by: Ryan Nazareth * formatting and typos Signed-off-by: Ryan Nazareth * add requested changes to description and other bits Signed-off-by: Ryan Nazareth * formatting and add gitignore Signed-off-by: Ryan Nazareth * add typing to plotting task Signed-off-by: Ryan Nazareth * add in requested changes and add sklearn to requirements.in Signed-off-by: Ryan Nazareth * few more Signed-off-by: Ryan Nazareth * run pip-compile again to correct relative path to requirements-common.in Signed-off-by: Ryan Nazareth * bump resource for tasks that errored in flyte console Signed-off-by: Ryan Nazareth * whitespace Signed-off-by: Ryan Nazareth * switch from np to flyte supported types and model_ser.download Signed-off-by: Ryan Nazareth * add support for plotly and disable deck in task Signed-off-by: Ryan Nazareth * return output for word similarity and plotly layout size adjustment Signed-off-by: Ryan Nazareth * remove returned output from word sim task Signed-off-by: Ryan Nazareth * remove type and also output in wmd Signed-off-by: Ryan Nazareth * add workflow outputs and modify comments Signed-off-by: Ryan Nazareth * fix typing for return value word sim task Signed-off-by: Ryan Nazareth Signed-off-by: Ryan Nazareth Co-authored-by: Samhita Alla --- .../ml_training/nlp_processing/Dockerfile | 53 +++ .../ml_training/nlp_processing/Makefile | 3 + .../ml_training/nlp_processing/README.rst | 39 ++ .../ml_training/nlp_processing/__init__.py | 0 .../nlp_processing/requirements.in | 7 + .../nlp_processing/requirements.txt | 312 ++++++++++++++++ .../ml_training/nlp_processing/sandbox.config | 2 + .../nlp_processing/word2vec_and_lda.py | 338 ++++++++++++++++++ cookbook/docs/conf.py | 3 + cookbook/docs/ml_training.rst | 10 + 10 files changed, 767 insertions(+) create mode 100644 cookbook/case_studies/ml_training/nlp_processing/Dockerfile create mode 100644 cookbook/case_studies/ml_training/nlp_processing/Makefile create mode 100644 cookbook/case_studies/ml_training/nlp_processing/README.rst create mode 100644 cookbook/case_studies/ml_training/nlp_processing/__init__.py create mode 100644 cookbook/case_studies/ml_training/nlp_processing/requirements.in create mode 100644 cookbook/case_studies/ml_training/nlp_processing/requirements.txt create mode 100644 cookbook/case_studies/ml_training/nlp_processing/sandbox.config create mode 100644 cookbook/case_studies/ml_training/nlp_processing/word2vec_and_lda.py diff --git a/cookbook/case_studies/ml_training/nlp_processing/Dockerfile b/cookbook/case_studies/ml_training/nlp_processing/Dockerfile new file mode 100644 index 0000000000..b4d87cbb05 --- /dev/null +++ b/cookbook/case_studies/ml_training/nlp_processing/Dockerfile @@ -0,0 +1,53 @@ +FROM 
ubuntu:focal
+
+WORKDIR /root
+ENV VENV /opt/venv
+ENV LANG C.UTF-8
+ENV LC_ALL C.UTF-8
+ENV PYTHONPATH /root
+
+RUN : \
+    && apt-get update \
+    && apt install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa
+
+RUN : \
+    && apt-get update \
+    && apt-get install -y python3.8 python3-pip python3-venv make build-essential libssl-dev curl vim
+
+# This is necessary for opencv to work
+RUN apt-get update && apt-get install -y libsm6 libxext6 libxrender-dev ffmpeg
+
+# Install the AWS cli separately to prevent issues with boto being written over
+RUN pip3 install awscli
+
+WORKDIR /opt
+RUN curl https://sdk.cloud.google.com > install.sh
+RUN bash /opt/install.sh --install-dir=/opt
+ENV PATH $PATH:/opt/google-cloud-sdk/bin
+WORKDIR /root
+
+# Virtual environment
+ENV VENV /opt/venv
+RUN python3 -m venv ${VENV}
+ENV PATH="${VENV}/bin:$PATH"
+
+# Install Python dependencies
+COPY nlp_processing/requirements.txt /root
+RUN ${VENV}/bin/pip install -r /root/requirements.txt
+
+# Copy the makefile targets to expose on the container. This makes it easier to register.
+COPY in_container.mk /root/Makefile
+COPY nlp_processing/sandbox.config /root
+
+# Copy the actual code
+COPY nlp_processing/ /root/nlp_processing/
+
+# Copy over the helper script that the SDK relies on
+RUN cp ${VENV}/bin/flytekit_venv /usr/local/bin/
+RUN chmod a+x /usr/local/bin/flytekit_venv
+
+# This tag is supplied by the build script and will be used to determine the version
+# when registering tasks, workflows, and launch plans
+ARG tag
+ENV FLYTE_INTERNAL_IMAGE $tag
diff --git a/cookbook/case_studies/ml_training/nlp_processing/Makefile b/cookbook/case_studies/ml_training/nlp_processing/Makefile
new file mode 100644
index 0000000000..799d03eaa5
--- /dev/null
+++ b/cookbook/case_studies/ml_training/nlp_processing/Makefile
@@ -0,0 +1,3 @@
+PREFIX=nlp_processing
+include ../../../common/common.mk
+include ../../../common/leaf.mk
diff --git a/cookbook/case_studies/ml_training/nlp_processing/README.rst b/cookbook/case_studies/ml_training/nlp_processing/README.rst
new file mode 100644
index 0000000000..686a9f1ee7
--- /dev/null
+++ b/cookbook/case_studies/ml_training/nlp_processing/README.rst
@@ -0,0 +1,39 @@
+NLP Processing
+--------------
+
+This tutorial will demonstrate how to process text data and generate word embeddings and visualizations
+as part of a Flyte workflow. It's an adaptation of the official Gensim `Word2Vec tutorial `__.
+
+
+About Gensim
+============
+
+Gensim is a popular open-source natural language processing (NLP) library used to process
+large corpora (which can be larger than RAM).
+It provides efficient multicore implementations of a number of algorithms, such as `Latent Semantic Analysis `__, `Latent Dirichlet Allocation (LDA) `__,
+and `Word2Vec deep learning `__, to perform complex tasks including understanding
+document relationships, topic modelling, learning word embeddings, and more.
+
+You can read more about Gensim `here `__.
+
+
+Data
+====
+
+The dataset used for this tutorial is the open-source `Lee Background Corpus `__
+that comes with the Gensim library.
+
+
+Step-by-Step Process
+====================
+
+The following steps outline the modelling process:
+
+- Generate a preprocessed (tokenized, stop words removed, lemmatized) corpus from the custom iterator.
+- Train the Word2Vec model on the preprocessed corpus.
+- Generate a bag of words from the corpus and train the LDA model.
+- Save the LDA and Word2Vec models to disk.
+- Deserialize the Word2Vec model, run word similarity queries, and compute Word Mover's Distance.
+- Reduce the dimensionality (using t-SNE) and plot the word embeddings.
+
+Let's dive into the code!
diff --git a/cookbook/case_studies/ml_training/nlp_processing/__init__.py b/cookbook/case_studies/ml_training/nlp_processing/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cookbook/case_studies/ml_training/nlp_processing/requirements.in b/cookbook/case_studies/ml_training/nlp_processing/requirements.in
new file mode 100644
index 0000000000..da66df9954
--- /dev/null
+++ b/cookbook/case_studies/ml_training/nlp_processing/requirements.in
@@ -0,0 +1,7 @@
+-r ../../../common/requirements-common.in
+numpy
+gensim
+nltk
+plotly
+pyemd
+scikit-learn
diff --git a/cookbook/case_studies/ml_training/nlp_processing/requirements.txt b/cookbook/case_studies/ml_training/nlp_processing/requirements.txt
new file mode 100644
index 0000000000..219540c82e
--- /dev/null
+++ b/cookbook/case_studies/ml_training/nlp_processing/requirements.txt
@@ -0,0 +1,312 @@
+#
+# This file is autogenerated by pip-compile with python 3.9
+# To update, run:
+#
+#    pip-compile requirements.in
+#
+arrow==1.2.2
+    # via jinja2-time
+attrs==22.1.0
+    # via visions
+binaryornot==0.4.4
+    # via cookiecutter
+certifi==2021.10.8
+    # via requests
+cffi==1.15.1
+    # via cryptography
+chardet==4.0.0
+    # via binaryornot
+charset-normalizer==2.0.12
+    # via requests
+click==8.1.2
+    # via
+    #   cookiecutter
+    #   flytekit
+    #   nltk
+cloudpickle==2.0.0
+    # via flytekit
+cookiecutter==2.1.1
+    # via flytekit
+croniter==1.3.4
+    # via flytekit
+cryptography==38.0.1
+    # via pyopenssl
+cycler==0.11.0
+    # via matplotlib
+dataclasses-json==0.5.7
+    # via flytekit
+decorator==5.1.1
+    # via retry
+deprecated==1.2.13
+    # via flytekit
+diskcache==5.4.0
+    # via flytekit
+docker==5.0.3
+    # via flytekit
+docker-image-py==0.1.12
+    # via flytekit
+docstring-parser==0.13
+    # via flytekit
+flyteidl==1.1.14
+    # via flytekit
+flytekit==1.1.1
+    # via
+    #   -r ../../../common/requirements-common.in
+    #   flytekitplugins-deck-standard
+flytekitplugins-deck-standard==1.1.1
+    # via -r ../../../common/requirements-common.in
+fonttools==4.32.0
+    # via matplotlib
+gensim==4.2.0
+    # via -r requirements.in
+googleapis-common-protos==1.56.0
+    # via
+    #   flyteidl
+    #   grpcio-status
+grpcio==1.44.0
+    # via
+    #   flytekit
+    #   grpcio-status
+grpcio-status==1.44.0
+    # via flytekit
+htmlmin==0.1.12
+    # via pandas-profiling
+idna==3.3
+    # via requests
+imagehash==4.3.1
+    # via visions
+importlib-metadata==4.11.3
+    # via
+    #   flytekit
+    #   keyring
+    #   markdown
+jinja2==3.1.1
+    # via
+    #   cookiecutter
+    #   jinja2-time
+    #   pandas-profiling
+jinja2-time==0.2.0
+    # via cookiecutter
+joblib==1.1.0
+    # via
+    #   nltk
+    #   pandas-profiling
+    #   phik
+    #   scikit-learn
+keyring==23.5.0
+    # via flytekit
+kiwisolver==1.4.2
+    # via matplotlib
+markdown==3.4.1
+    # via flytekitplugins-deck-standard
+markupsafe==2.1.1
+    # via jinja2
+marshmallow==3.15.0
+    # via
+    #   dataclasses-json
+    #   marshmallow-enum
+    #   marshmallow-jsonschema
+marshmallow-enum==1.5.1
+    # via dataclasses-json
+marshmallow-jsonschema==0.13.0
+    # via flytekit
+matplotlib==3.5.1
+    # via
+    #   -r ../../../common/requirements-common.in
+    #   missingno
+    #   pandas-profiling
+    #   phik
+    #   seaborn
+missingno==0.5.1
+    # via pandas-profiling
+multimethod==1.8
+    # via
+    #   pandas-profiling
+    #   visions
+mypy-extensions==0.4.3
+    # via typing-inspect
+natsort==8.1.0
+    # via flytekit
+networkx==2.8.7
+    # via visions
+nltk==3.7
+    # via -r requirements.in
+numpy==1.22.3 + # via + # -r requirements.in + # gensim + # imagehash + # matplotlib + # missingno + # pandas + # pandas-profiling + # patsy + # phik + # pyarrow + # pyemd + # scikit-learn + # scipy + # seaborn + # statsmodels + # visions +packaging==21.3 + # via + # marshmallow + # matplotlib + # statsmodels +pandas==1.4.2 + # via + # flytekit + # pandas-profiling + # phik + # seaborn + # statsmodels + # visions +pandas-profiling==3.3.0 + # via flytekitplugins-deck-standard +patsy==0.5.3 + # via statsmodels +phik==0.12.2 + # via pandas-profiling +pillow==9.1.0 + # via + # imagehash + # matplotlib + # visions +plotly==5.10.0 + # via + # -r requirements.in + # flytekitplugins-deck-standard +protobuf==3.20.0 + # via + # flyteidl + # flytekit + # googleapis-common-protos + # grpcio-status + # protoc-gen-swagger +protoc-gen-swagger==0.1.0 + # via flyteidl +py==1.11.0 + # via retry +pyarrow==6.0.1 + # via flytekit +pycparser==2.21 + # via cffi +pydantic==1.9.2 + # via pandas-profiling +pyemd==0.5.1 + # via -r requirements.in +pyopenssl==22.0.0 + # via flytekit +pyparsing==3.0.8 + # via + # matplotlib + # packaging +python-dateutil==2.8.2 + # via + # arrow + # croniter + # flytekit + # matplotlib + # pandas +python-json-logger==2.0.2 + # via flytekit +python-slugify==6.1.1 + # via cookiecutter +pytimeparse==1.1.8 + # via flytekit +pytz==2022.1 + # via + # flytekit + # pandas +pywavelets==1.3.0 + # via imagehash +pyyaml==6.0 + # via + # cookiecutter + # flytekit + # pandas-profiling +regex==2022.3.15 + # via + # docker-image-py + # nltk +requests==2.27.1 + # via + # cookiecutter + # docker + # flytekit + # pandas-profiling + # responses +responses==0.20.0 + # via flytekit +retry==0.9.2 + # via flytekit +scikit-learn==1.1.3 + # via -r requirements.in +scipy==1.8.0 + # via + # gensim + # imagehash + # missingno + # pandas-profiling + # phik + # scikit-learn + # seaborn + # statsmodels +seaborn==0.11.2 + # via + # missingno + # pandas-profiling +six==1.16.0 + # via + # grpcio + # patsy + # python-dateutil +smart-open==6.2.0 + # via gensim +sortedcontainers==2.4.0 + # via flytekit +statsd==3.3.0 + # via flytekit +statsmodels==0.13.2 + # via pandas-profiling +tangled-up-in-unicode==0.2.0 + # via + # pandas-profiling + # visions +tenacity==8.1.0 + # via plotly +text-unidecode==1.3 + # via python-slugify +threadpoolctl==3.1.0 + # via scikit-learn +tqdm==4.64.1 + # via + # nltk + # pandas-profiling +typing-extensions==4.1.1 + # via + # flytekit + # pydantic + # typing-inspect +typing-inspect==0.7.1 + # via dataclasses-json +urllib3==1.26.9 + # via + # flytekit + # requests + # responses +visions[type_image_path]==0.7.5 + # via pandas-profiling +websocket-client==1.4.1 + # via docker +wheel==0.37.1 + # via + # -r ../../../common/requirements-common.in + # flytekit +wrapt==1.14.0 + # via + # deprecated + # flytekit +zipp==3.8.0 + # via importlib-metadata diff --git a/cookbook/case_studies/ml_training/nlp_processing/sandbox.config b/cookbook/case_studies/ml_training/nlp_processing/sandbox.config new file mode 100644 index 0000000000..43ae3cfea5 --- /dev/null +++ b/cookbook/case_studies/ml_training/nlp_processing/sandbox.config @@ -0,0 +1,2 @@ +[sdk] +workflow_packages=nlp_processing diff --git a/cookbook/case_studies/ml_training/nlp_processing/word2vec_and_lda.py b/cookbook/case_studies/ml_training/nlp_processing/word2vec_and_lda.py new file mode 100644 index 0000000000..4004a1324a --- /dev/null +++ b/cookbook/case_studies/ml_training/nlp_processing/word2vec_and_lda.py @@ -0,0 +1,338 @@ +""" +.. 
_word2vec_and_lda:
+
+Word Embeddings and Topic Modelling with Gensim
+-----------------------------------------------
+
+This example creates six Flyte tasks that:
+
+1. Generate the sample dataset.
+2. Train the Word2Vec model.
+3. Train the LDA model and display the words per topic.
+4. Compute word similarities.
+5. Compute Word Mover's Distance.
+6. Reduce dimensions using t-SNE and generate a plot using Flyte Deck.
+
+"""
+
+# %%
+# First, we import the necessary libraries.
+import logging
+import os
+import random
+import typing
+from dataclasses import dataclass
+from typing import Dict, List
+
+import flytekit
+import gensim
+import nltk
+import numpy as np
+import plotly.graph_objects as go
+import plotly.io as io
+from dataclasses_json import dataclass_json
+from flytekit import Resources, task, workflow
+from flytekit.types.file import FlyteFile
+from gensim import utils
+from gensim.corpora import Dictionary
+from gensim.models import LdaModel, Word2Vec
+from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
+from gensim.test.utils import datapath
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import RegexpTokenizer
+from sklearn.manifold import TSNE
+
+logger = logging.getLogger(__file__)
+
+
+# %%
+# We define the output file type.
+MODELSER_NLP = typing.TypeVar("model")
+model_file = typing.NamedTuple("ModelFile", model=FlyteFile[MODELSER_NLP])
+
+# %%
+# Next, we define the path to the Lee corpus dataset (installed with Gensim).
+data_dir = os.path.join(gensim.__path__[0], "test", "test_data")
+lee_train_file = os.path.join(data_dir, "lee_background.cor")
+
+
+# %%
+# We declare ``NamedTuple``s which will be used as signatures of the Flyte task outputs.
+# The variable names and types correspond to the values of the unpacked tuples returned
+# from the corresponding Flyte task.
+plotdata = typing.NamedTuple(
+    "PlottingData",
+    x_values=List[float],
+    y_values=List[float],
+    labels=List[str],
+)
+
+
+workflow_outputs = typing.NamedTuple(
+    "WorkflowOutputs",
+    simwords=Dict[str, float],
+    distance=float,
+    topics=Dict[int, List[str]],
+)
+
+
+# %%
+# We sample sentences of similar contexts to compare using the trained model.
+SENTENCE_A = "Australian cricket captain has supported fast bowler"
+SENTENCE_B = "Fast bowler received support from cricket captain"
+
+
+# %%
+# Data Generation
+# ===============
+#
+# The data pre-processor implements the following steps:
+#
+# 1. Turns all words to lowercase and removes stopwords.
+# 2. Splits the document into tokens using a regular expression tokenizer from NLTK.
+# 3. Removes numeric single-character tokens as they do not tend to be useful, and the dataset contains a lot of them.
+# 4. Uses the WordNet lemmatizer from NLTK and returns a list of lemmatized tokens.
+def pre_processing(line: str) -> List[str]:
+    tokenizer = RegexpTokenizer(r"\w+")
+    tokens = tokenizer.tokenize(remove_stopwords(line.lower()))
+    lemmatizer = WordNetLemmatizer()
+    return [lemmatizer.lemmatize(token) for token in tokens]
+
+
+# %%
+# Now, we implement an iterator that calls the ``pre_processing`` function on each input sentence from the corpus
+# and yields the processed results.
+class MyCorpus:
+    """An iterator that yields sentences (lists of str)."""
+
+    def __init__(self, path):
+        self.corpus_path = datapath(path)
+
+    def __iter__(self):
+        for line in open(self.corpus_path):
+            yield pre_processing(line)
+
+
+# %%
+# We define a Flyte task to generate the processed corpus containing a list of tokenized sentence lists.
+@task
+def generate_processed_corpus() -> List[List[str]]:
+    # download the required packages from the nltk library
+    nltk.download("wordnet")
+    nltk.download("omw-1.4")
+    sentences_train = MyCorpus(lee_train_file)
+    train_corpus = list(sentences_train)
+    return train_corpus
+
+
+# %%
+# Hyperparameters
+# ===============
+#
+# Next, we create a dataclass comprising Word2Vec hyperparameters:
+#
+# - ``min_count``: Prunes the dictionary and removes low-frequency words.
+# - ``vector_size``: Number of dimensions (N) of the N-dimensional space that gensim Word2Vec maps the words onto.
+#   Bigger size values require more training data but can lead to better (more accurate) models.
+# - ``workers``: Number of worker threads used to parallelize and speed up training.
+# - ``compute_loss``: Toggles computation of the loss while training the Word2Vec model.
+@dataclass_json
+@dataclass
+class Word2VecModelHyperparams(object):
+    """
+    Hyperparameters that can be used while training the Word2Vec model.
+    """
+
+    vector_size: int = 200
+    min_count: int = 1
+    workers: int = 4
+    compute_loss: bool = True
+
+
+# %%
+# LDA needs a similar dataclass:
+#
+# - ``num_topics``: The number of topics to be extracted from the training corpus.
+# - ``alpha``: A-priori belief on document-topic distribution. In ``auto`` mode, the model learns this from the data.
+# - ``passes``: Controls how often the model is trained on the entire corpus, i.e., the number of epochs.
+# - ``chunksize``: Controls how many documents are processed at a time in the training algorithm. Increasing the
+#   chunk size speeds up training, at least as long as the chunk of documents easily fits into memory.
+# - ``update_every``: Number of documents to be iterated through for each update.
+# - ``random_state``: Seed for reproducibility.
+@dataclass_json
+@dataclass
+class LDAModelHyperparams(object):
+    """
+    Hyperparameters that can be used while training the LDA model.
+    """
+
+    num_topics: int = 5
+    alpha: str = "auto"
+    passes: int = 10
+    chunksize: int = 100
+    update_every: int = 1
+    random_state: int = 100
+
+
+# %%
+# Training
+# ========
+#
+# We initialize and train a Word2Vec model on the preprocessed corpus.
+@task
+def train_word2vec_model(
+    training_data: List[List[str]], hyperparams: Word2VecModelHyperparams
+) -> model_file:
+
+    model = Word2Vec(
+        training_data,
+        min_count=hyperparams.min_count,
+        workers=hyperparams.workers,
+        vector_size=hyperparams.vector_size,
+        compute_loss=hyperparams.compute_loss,
+    )
+    training_loss = model.get_latest_training_loss()
+    logger.info(f"training loss: {training_loss}")
+    out_path = os.path.join(
+        flytekit.current_context().working_directory, "word2vec.model"
+    )
+    model.save(out_path)
+    return (out_path,)
+
+
+# %%
+# Next, we transform the documents to a vectorized form and compute the frequency of each word to generate a
+# bag-of-words corpus for the LDA model to train on. We also create a mapping from word IDs to words to pass
+# as an input to the LDA model for training.
+@task
+def train_lda_model(
+    corpus: List[List[str]], hyperparams: LDAModelHyperparams
+) -> Dict[int, List[str]]:
+    id2word = Dictionary(corpus)
+    bow_corpus = [id2word.doc2bow(doc) for doc in corpus]
+    id_words = [[(id2word[id], count) for id, count in line] for line in bow_corpus]
+    logger.info(f"Sample of bag of words generated: {id_words[:2]}")
+    lda = LdaModel(
+        corpus=bow_corpus,
+        id2word=id2word,
+        num_topics=hyperparams.num_topics,
+        alpha=hyperparams.alpha,
+        passes=hyperparams.passes,
+        chunksize=hyperparams.chunksize,
+        update_every=hyperparams.update_every,
+        random_state=hyperparams.random_state,
+    )
+    return dict(lda.show_topics(num_words=5))
+
+
+# %%
+# Word Similarities
+# =================
+#
+# We deserialize the model from disk and compute the top 10 most similar
+# words to the given word in the corpus (we will use the word ``computer`` when
+# running the workflow to output similar words). Note that since the model is
+# trained on a small corpus, some of the relations might not be clear.
+@task(cache_version="1.0", cache=True, limits=Resources(mem="600Mi"))
+def word_similarities(
+    model_ser: FlyteFile[MODELSER_NLP], word: str
+) -> Dict[str, float]:
+    model = Word2Vec.load(model_ser.download())
+    wv = model.wv
+    logger.info(f"Word vector for {word}:{wv[word]}")
+    return dict(wv.most_similar(word, topn=10))
+
+
+# %%
+# Sentence Similarity
+# ===================
+#
+# We compute Word Mover's Distance (WMD) using the trained word embeddings.
+# This enables us to assess the distance between two documents in a meaningful way even when they have
+# no words in common.
+# WMD outputs a large value for two completely unrelated sentences and a small value for two closely related
+# sentences.
+# Since we chose two similar sentences for comparison, the Word Mover's Distance
+# should be small. You can try altering either the ``SENTENCE_A`` or ``SENTENCE_B`` variable to be dissimilar
+# to the other sentence, and check whether the computed value is larger.
+@task(cache_version="1.0", cache=True, limits=Resources(mem="600Mi"))
+def word_movers_distance(model_ser: FlyteFile[MODELSER_NLP]) -> float:
+    sentences = [SENTENCE_A, SENTENCE_B]
+    results = []
+    for sentence in sentences:
+        result = [w for w in utils.tokenize(sentence) if w not in STOPWORDS]
+        results.append(result)
+    model = Word2Vec.load(model_ser.download())
+    logger.info(f"Computing Word Mover's Distance for: {SENTENCE_A} and {SENTENCE_B}")
+    return model.wv.wmdistance(*results)
+
+
+# %%
+# Dimensionality Reduction and Plotting
+# =====================================
+#
+# The word embeddings learned by the model can be visualized after reducing their dimensionality to two with t-SNE.
+# This task can take a few minutes to complete.
+@task(cache_version="1.0", cache=True, limits=Resources(mem="1000Mi"))
+def dimensionality_reduction(model_ser: FlyteFile[MODELSER_NLP]) -> plotdata:
+    model = Word2Vec.load(model_ser.download())
+    num_dimensions = 2
+    vectors = np.asarray(model.wv.vectors)
+    labels = np.asarray(model.wv.index_to_key)
+    logger.info("Running dimensionality reduction using t-SNE")
+    tsne = TSNE(n_components=num_dimensions, random_state=0)
+    vectors = tsne.fit_transform(vectors)
+    x_vals = [float(v[0]) for v in vectors]
+    y_vals = [float(v[1]) for v in vectors]
+    labels = [str(label) for label in labels]
+    return x_vals, y_vals, labels
+
+
+@task(
+    cache_version="1.0", cache=True, limits=Resources(mem="600Mi"), disable_deck=False
+)
+def plot_with_plotly(x: List[float], y: List[float], labels: List[str]):
+    layout = go.Layout(height=600, width=800)
+    fig = go.Figure(
+        data=go.Scattergl(x=x, y=y, mode="markers", marker=dict(color="aqua")),
+        layout=layout,
+    )
+    indices = list(range(len(labels)))
+    selected_indices = random.sample(indices, 50)
+    for i in selected_indices:
+        fig.add_annotation(
+            text=labels[i],
+            x=x[i],
+            y=y[i],
+            showarrow=False,
+            font=dict(size=15, color="black", family="Sans Serif"),
+        )
+    logger.info("Generating the word embedding plot using Flyte Deck")
+    flytekit.Deck("Word Embeddings", io.to_html(fig, full_html=True))
+
+
+# %%
+# Running the Workflow
+# ====================
+#
+# Let's kick off the workflow! It returns the inference outputs of both Gensim models:
+# the similar words, the Word Mover's Distance, and the LDA topics.
+@workflow
+def nlp_workflow(target_word: str = "computer") -> workflow_outputs:
+    corpus = generate_processed_corpus()
+    model_wv = train_word2vec_model(
+        training_data=corpus, hyperparams=Word2VecModelHyperparams()
+    )
+    lda_topics = train_lda_model(corpus=corpus, hyperparams=LDAModelHyperparams())
+    similar_words = word_similarities(model_ser=model_wv.model, word=target_word)
+    distance = word_movers_distance(model_ser=model_wv.model)
+    axis_labels = dimensionality_reduction(model_ser=model_wv.model)
+    plot_with_plotly(
+        x=axis_labels.x_values, y=axis_labels.y_values, labels=axis_labels.labels
+    )
+    return similar_words, distance, lda_topics
+
+
+if __name__ == "__main__":
+    print(f"Running {__file__} main...")
+    print(nlp_workflow())
diff --git a/cookbook/docs/conf.py b/cookbook/docs/conf.py
index 57bf8eccd9..ab3800028f 100644
--- a/cookbook/docs/conf.py
+++ b/cookbook/docs/conf.py
@@ -149,6 +149,7 @@ class CustomSorter(FileNameSortKey):
     "house_price_predictor.py",
     "multiregion_house_price_predictor.py",
     "keras_spark_rossmann_estimator.py",
+    "word2vec_and_lda.py",
     ## Feature Engineering
     "pytorch_single_node_and_gpu.py",
     "pytorch_single_node_multi_gpu.py",
@@ -270,6 +271,7 @@ def __call__(self, filename):
     "../case_studies/ml_training/pima_diabetes",
     "../case_studies/ml_training/house_price_prediction",
     "../case_studies/ml_training/mnist_classifier",
+    "../case_studies/ml_training/nlp_processing",
     "../case_studies/ml_training/spark_horovod",
     "../case_studies/feature_engineering/eda",
     "../case_studies/feature_engineering/feast_integration",
@@ -311,6 +313,7 @@ def __call__(self, filename):
     "auto/case_studies/ml_training/pima_diabetes",
     "auto/case_studies/ml_training/house_price_prediction",
     "auto/case_studies/ml_training/mnist_classifier",
+    "auto/case_studies/ml_training/nlp_processing",
     "auto/case_studies/ml_training/spark_horovod",
     "auto/case_studies/feature_engineering/eda",
     "auto/case_studies/feature_engineering/feast_integration",
diff --git a/cookbook/docs/ml_training.rst 
b/cookbook/docs/ml_training.rst
index 0cef6fc700..3027b82a48 100644
--- a/cookbook/docs/ml_training.rst
+++ b/cookbook/docs/ml_training.rst
@@ -35,6 +35,15 @@ Understand how machine learning models can be trained from within Flyte, with an
 
     ---
 
+    .. link-button:: auto/case_studies/ml_training/nlp_processing/index
+       :type: ref
+       :text: NLP Processing with Gensim
+       :classes: btn-block stretched-link
+    ^^^^^^^^^^^^
+    Word embeddings and topic modelling on the Lee Background Corpus with Gensim
+
+    ---
+
     .. link-button:: auto/case_studies/ml_training/spark_horovod/index
        :type: ref
        :text: Forecast Sales Using Rossmann Store Sales Data with Horovod and Spark
@@ -51,6 +60,7 @@ Understand how machine learning models can be trained from within Flyte, with an
 
     auto/case_studies/ml_training/pima_diabetes/index
     auto/case_studies/ml_training/house_price_prediction/index
     auto/case_studies/ml_training/mnist_classifier/index
+    auto/case_studies/ml_training/nlp_processing/index
     auto/case_studies/ml_training/spark_horovod/index
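
To try the new example locally before registering it on a cluster, the workflow can be invoked as a
plain Python callable. The sketch below mirrors the module's ``__main__`` block, with the outputs
unpacked; it assumes the pinned environment from requirements.txt is installed and that NLTK can
reach the network to download its corpora:

    # Local smoke test for the new example (a sketch, assuming the pinned
    # requirements.txt environment and network access for nltk.download).
    from nlp_processing.word2vec_and_lda import nlp_workflow

    # Outside a Flyte cluster, the @workflow function executes eagerly and its
    # WorkflowOutputs NamedTuple unpacks like a regular tuple.
    simwords, distance, topics = nlp_workflow(target_word="computer")

    print("Top 10 words similar to 'computer':", simwords)
    print("Word Mover's Distance between the two sample sentences:", distance)
    print("LDA topics:", topics)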