From fed511544833c7458b3c10d544378971090d74da Mon Sep 17 00:00:00 2001
From: Hannah Pho
Date: Mon, 2 Dec 2024 13:01:49 -0500
Subject: [PATCH] [CDC Data] Create Python venv in a single step (#4762)

Previously, conflicting deps between importer requirements and NL requirements could result in runtime errors.
---
NOTE(review): the line structure of this patch was restored from a copy in
which every newline had been collapsed to a space. Hunk sizes and the diffstat
totals were re-counted and agree with the headers below. The leading
indentation of the "&& ..." continuation lines and the exact position of blank
context lines could not be recovered from that copy; confirm them against the
repository before applying (or apply with whitespace tolerance).

 build/cdc_data/Dockerfile | 72 +++++++++++++++------------------------
 1 file changed, 28 insertions(+), 44 deletions(-)

diff --git a/build/cdc_data/Dockerfile b/build/cdc_data/Dockerfile
index d5c4254b5e..2d8db8edad 100644
--- a/build/cdc_data/Dockerfile
+++ b/build/cdc_data/Dockerfile
@@ -12,47 +12,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# #### Stage 1: Build env for data importer. ####
-FROM python:3.11.4-slim as data-importer
 
-ARG PIP_DISABLE_PIP_VERSION_CHECK=1
-ARG PIP_NO_CACHE_DIR=1
+# #### Stage 1: Download base dc model from GCS. ####
+FROM google/cloud-sdk:slim as model-downloader
+
+# Copy model.
+RUN mkdir -p /tmp/datcom-nl-models \
+    && gsutil -m cp -R gs://datcom-nl-models/ft_final_v20230717230459.all-MiniLM-L6-v2/ /tmp/datcom-nl-models/
+
+
+# #### Stage 2: Copy required files. ####
+FROM python:3.11.4-slim as file-copier
 
 WORKDIR /workspace
 
-# Copy requirements.
+# Copy simple importer requirements.
 COPY import/simple/requirements.txt ./import/simple/requirements.txt
 
-# Create a virtual env and install requirements.
-RUN python -m venv /workspace/venv
-ENV PATH="/workspace/venv/bin:$PATH"
-RUN pip3 install -r ./import/simple/requirements.txt
-
 # Copy simple importer.
 COPY import/simple/ ./import/simple/
 
-
-# #### Stage 2: Build env for embeddings builder. ####
-FROM python:3.11.4-slim as embeddings-builder
-
-ARG PIP_DISABLE_PIP_VERSION_CHECK=1
-ARG PIP_NO_CACHE_DIR=1
-
-WORKDIR /workspace
-
-# Copy requirements.
+# Copy embeddings builder requirements.
 # Copy nl_requirements.txt since it is referenced by embeddings requirements.txt
 COPY tools/nl/embeddings/requirements.txt ./tools/nl/embeddings/requirements.txt
 COPY nl_requirements.txt ./nl_requirements.txt
 
-# Create a virtual env and install requirements.
-# Remove lancedb - it is not used by custom dc.
-RUN python -m venv ./venv
-ENV PATH="/workspace/venv/bin:$PATH"
-RUN sed -i'' '/lancedb/d' /workspace/nl_requirements.txt \
-    && pip3 install torch==2.2.2 --extra-index-url https://download.pytorch.org/whl/cpu \
-    && pip3 install -r ./tools/nl/embeddings/requirements.txt
-
 # Copy the embeddings builder module.
 COPY tools/nl/embeddings/. ./tools/nl/embeddings/
 # Copy the shared module.
@@ -63,15 +47,7 @@ COPY nl_server/. /workspace/nl_server/
 COPY deploy/nl/. /datacommons/nl/
 
 
-# #### Stage 3: Download base dc model from GCS. ####
-FROM google/cloud-sdk:slim as model-downloader
-
-# Copy model.
-RUN mkdir -p /tmp/datcom-nl-models \
-    && gsutil -m cp -R gs://datcom-nl-models/ft_final_v20230717230459.all-MiniLM-L6-v2/ /tmp/datcom-nl-models/
-
-
-# #### Stage 4: Runtime env. ####
+# #### Stage 3: Runtime env. ####
 FROM python:3.11.4-slim as runner
 
 ARG ENV
@@ -80,19 +56,27 @@ ENV ENV=${ENV}
 WORKDIR /workspace
 
 # Copy scripts, dependencies and files from the build stages.
-COPY --from=data-importer /workspace/ .
-COPY --from=embeddings-builder /workspace/ .
-COPY --from=embeddings-builder /datacommons/ /datacommons
+COPY --from=file-copier /workspace/ .
+COPY --from=file-copier /datacommons/ /datacommons
 COPY --from=model-downloader /tmp/datcom-nl-models /tmp/datcom-nl-models
 
+ARG PIP_DISABLE_PIP_VERSION_CHECK=1
+ARG PIP_NO_CACHE_DIR=1
+
+# Create a virtual env, add it to path, and install all requirements.
+RUN python -m venv /workspace/venv
+ENV PATH="/workspace/venv/bin:$PATH"
+RUN pip3 install -r ./import/simple/requirements.txt
+# Remove lancedb - it is not used by custom dc.
+RUN sed -i'' '/lancedb/d' /workspace/nl_requirements.txt \
+    && pip3 install torch==2.2.2 --extra-index-url https://download.pytorch.org/whl/cpu \
+    && pip3 install -r ./tools/nl/embeddings/requirements.txt
+
 # Copy executable script.
 COPY build/cdc_data/run.sh .
 
 # Make script executable.
 RUN chmod +x run.sh
 
-# Add virtual env to the path.
-ENV PATH="/workspace/venv/bin:$PATH"
-
 # Set the default command to run the script.
-CMD ./run.sh
\ No newline at end of file
+CMD ./run.sh