Skip to content

Commit

Permalink
[CDC Data] Create Python venv in a single step (datacommonsorg#4762)
Browse files Browse the repository at this point in the history
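Previously, the importer requirements and the NL requirements were installed
into separate virtual envs in different build stages, and both were then copied
into the same /workspace in the runtime image. Conflicting deps between the two
requirement sets could therefore result in runtime errors.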
  • Loading branch information
hqpho authored and gmechali committed Dec 5, 2024
1 parent da9aa06 commit cd7e3f6
Showing 1 changed file with 28 additions and 44 deletions.
72 changes: 28 additions & 44 deletions build/cdc_data/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,47 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# #### Stage 1: Build env for data importer. ####
# NOTE(review): this page is a rendered diff without +/- markers; this stage
# header looks like the pre-change side — confirm against the raw Dockerfile.
FROM python:3.11.4-slim as data-importer

# Build-time-only settings picked up by pip from the environment of RUN steps:
# skip pip's self-version check and do not keep a download cache.
ARG PIP_DISABLE_PIP_VERSION_CHECK=1
ARG PIP_NO_CACHE_DIR=1
# #### Stage 1: Download base dc model from GCS. ####
# This stage only fetches the model; the runtime stage copies
# /tmp/datcom-nl-models out of it with COPY --from=model-downloader.
FROM google/cloud-sdk:slim as model-downloader

# Copy model.
# Creates the target directory, then recursively (-R) copies the model folder
# from the datcom-nl-models bucket; -m lets gsutil run the copy in parallel.
RUN mkdir -p /tmp/datcom-nl-models \
&& gsutil -m cp -R gs://datcom-nl-models/ft_final_v20230717230459.all-MiniLM-L6-v2/ /tmp/datcom-nl-models/


# #### Stage 2: Copy required files. ####
# Gathers sources and requirement files under /workspace so the runtime stage
# can pull them in with COPY --from=file-copier.
FROM python:3.11.4-slim as file-copier

WORKDIR /workspace

# Copy requirements.
# Copy simple importer requirements.
COPY import/simple/requirements.txt ./import/simple/requirements.txt

# Create a virtual env and install requirements.
# NOTE(review): the venv/pip lines below look like the pre-change side of the
# rendered diff; the same steps are listed again in the runtime stage — confirm
# against the raw Dockerfile which copy is current.
RUN python -m venv /workspace/venv
ENV PATH="/workspace/venv/bin:$PATH"
RUN pip3 install -r ./import/simple/requirements.txt

# Copy simple importer.
COPY import/simple/ ./import/simple/


# #### Stage 2: Build env for embeddings builder. ####
# NOTE(review): this stage header, its ARGs and its venv/pip steps look like
# the pre-change side of the rendered diff (the runtime stage lists the same
# install steps) — confirm against the raw Dockerfile.
FROM python:3.11.4-slim as embeddings-builder

# Build-time-only pip settings: no self-version check, no download cache.
ARG PIP_DISABLE_PIP_VERSION_CHECK=1
ARG PIP_NO_CACHE_DIR=1

WORKDIR /workspace

# Copy requirements.
# Copy embeddings builder requirements.
# Copy nl_requirements.txt since it is referenced by embeddings requirements.txt
COPY tools/nl/embeddings/requirements.txt ./tools/nl/embeddings/requirements.txt
COPY nl_requirements.txt ./nl_requirements.txt

# Create a virtual env and install requirements.
# Remove lancedb - it is not used by custom dc.
# The sed call deletes every line containing "lancedb" from nl_requirements.txt
# in place; torch 2.2.2 is then installed with the PyTorch CPU wheel index as
# an extra index before the embeddings requirements are installed.
RUN python -m venv ./venv
ENV PATH="/workspace/venv/bin:$PATH"
RUN sed -i'' '/lancedb/d' /workspace/nl_requirements.txt \
&& pip3 install torch==2.2.2 --extra-index-url https://download.pytorch.org/whl/cpu \
&& pip3 install -r ./tools/nl/embeddings/requirements.txt

# Copy the embeddings builder module.
COPY tools/nl/embeddings/. ./tools/nl/embeddings/
# Copy the shared module.
Expand All @@ -63,15 +47,7 @@ COPY nl_server/. /workspace/nl_server/
# Copy the contents of deploy/nl into /datacommons/nl/ (outside /workspace);
# the runtime stage later copies /datacommons/ from a build stage.
COPY deploy/nl/. /datacommons/nl/


# #### Stage 3: Download base dc model from GCS. ####
# NOTE(review): an identical model-downloader stage is listed earlier on this
# page as Stage 1; this copy looks like the pre-change position in the rendered
# diff — confirm against the raw Dockerfile.
FROM google/cloud-sdk:slim as model-downloader

# Copy model.
# Creates the target directory, then recursively copies the model folder from
# the datcom-nl-models bucket into it.
RUN mkdir -p /tmp/datcom-nl-models \
&& gsutil -m cp -R gs://datcom-nl-models/ft_final_v20230717230459.all-MiniLM-L6-v2/ /tmp/datcom-nl-models/


# #### Stage 4: Runtime env. ####
# #### Stage 3: Runtime env. ####
# NOTE(review): two stage-number headers are shown because this is a rendered
# diff; presumably only the "Stage 3" header remains after the change.
FROM python:3.11.4-slim as runner

# Build argument with no default value; it has to be supplied at build time.
ARG ENV
Expand All @@ -80,19 +56,27 @@ ENV ENV=${ENV}
WORKDIR /workspace

# Copy scripts, dependencies and files from the build stages.
# NOTE(review): COPY lines for data-importer/embeddings-builder and for
# file-copier are both listed because this is a rendered diff; presumably only
# the file-copier pair exists after the change — confirm against the raw file.
COPY --from=data-importer /workspace/ .
COPY --from=embeddings-builder /workspace/ .
COPY --from=embeddings-builder /datacommons/ /datacommons
COPY --from=file-copier /workspace/ .
COPY --from=file-copier /datacommons/ /datacommons
COPY --from=model-downloader /tmp/datcom-nl-models /tmp/datcom-nl-models

# Build-time-only pip settings: no self-version check, no download cache.
ARG PIP_DISABLE_PIP_VERSION_CHECK=1
ARG PIP_NO_CACHE_DIR=1

# Create a virtual env, add it to path, and install all requirements.
# Importer requirements and NL/embeddings requirements are installed into the
# same /workspace/venv, one after the other.
RUN python -m venv /workspace/venv
ENV PATH="/workspace/venv/bin:$PATH"
RUN pip3 install -r ./import/simple/requirements.txt
# Remove lancedb - it is not used by custom dc.
# The sed call deletes every line containing "lancedb" from nl_requirements.txt
# in place; torch 2.2.2 is installed with the PyTorch CPU wheel index as an
# extra index before the embeddings requirements.
RUN sed -i'' '/lancedb/d' /workspace/nl_requirements.txt \
&& pip3 install torch==2.2.2 --extra-index-url https://download.pytorch.org/whl/cpu \
&& pip3 install -r ./tools/nl/embeddings/requirements.txt

# Copy executable script.
COPY build/cdc_data/run.sh .

# Make script executable.
RUN chmod +x run.sh

# Add virtual env to the path.
# NOTE(review): PATH is already set to the same value right after the venv is
# created above; this second ENV looks like the pre-change side of the diff.
ENV PATH="/workspace/venv/bin:$PATH"

# Set the default command to run the script.
# NOTE(review): CMD is listed twice, most likely a diff rendering of an
# end-of-file newline change; Docker only honours the last CMD in a stage.
# Shell form runs the script via /bin/sh -c; the exec form CMD ["./run.sh"]
# would let run.sh receive stop signals directly — consider switching.
CMD ./run.sh
CMD ./run.sh

0 comments on commit cd7e3f6

Please sign in to comment.