-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* initial * added .dockerignore * updated Dockerfile * updated * updates * updates * bug fixes * bugfix * bugfix * bug fixes * added metadata
- Loading branch information
Showing
24 changed files
with
293 additions
and
250 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
.git | ||
__pycache__ | ||
*.pyc | ||
.env | ||
.vscode | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,64 +1,86 @@ | ||
FROM ubuntu:22.04 | ||
# Stage 1: Build stage | ||
FROM python:3.11-slim AS builder | ||
|
||
ENV DEBIAN_FRONTEND=noninteractive \ | ||
HOME=/home/vectara \ | ||
HOME=/home/vectara \ | ||
XDG_RUNTIME_DIR=/tmp \ | ||
RAY_DEDUP_LOGS="0" \ | ||
CUDA_VISIBLE_DEVICES="" | ||
|
||
RUN sed 's/main$/main universe/' -i /etc/apt/sources.list | ||
CUDA_VISIBLE_DEVICES="" \ | ||
UV_SYSTEM_PYTHON=1 | ||
|
||
# Install build dependencies | ||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
build-essential \ | ||
libopenblas-dev \ | ||
unzip \ | ||
wget \ | ||
git \ | ||
curl \ | ||
wkhtmltopdf \ | ||
libssl-dev \ | ||
unixodbc \ | ||
poppler-utils \ | ||
tesseract-ocr \ | ||
libtesseract-dev \ | ||
xvfb \ | ||
python3-pip python3-dev \ | ||
libmagic1 \ | ||
libfontconfig fontconfig \ | ||
libjpeg-turbo8 \ | ||
fonts-noto-color-emoji unifont fonts-indic xfonts-75dpi \ | ||
&& apt-get purge -y \ | ||
&& apt-get clean \ | ||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* | ||
python3-dev \ | ||
&& rm -rf /var/lib/apt/lists/* /tmp/* | ||
|
||
RUN rm -f /usr/share/fonts/truetype/unifont/unifont_sample.ttf /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf | ||
ENV OMP_NUM_THREADS=4 | ||
|
||
# Install python packages | ||
# Install Python packages | ||
WORKDIR ${HOME} | ||
COPY requirements.txt requirements-extra.txt $HOME/ | ||
|
||
RUN pip install --no-cache-dir torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu \ | ||
&& pip install --no-cache-dir -r requirements.txt \ | ||
&& playwright install --with-deps firefox \ | ||
&& find /usr/local -type d \( -name test -o -name tests \) -exec rm -rf '{}' + \ | ||
&& find /usr/local -type f \( -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' + \ | ||
&& find /usr/local -type d \( -name '__pycache__' \) -exec rm -rf '{}' + \ | ||
&& find /usr/local -type d \( -name 'build' \) -exec rm -rf '{}' + \ | ||
&& rm -rf /root/.cache/* /tmp/* \ | ||
&& pip cache purge | ||
RUN pip install --no-cache-dir uv==0.5.6 | ||
RUN uv pip install --no-cache-dir torch==2.4.1 torchvision==0.19.1 --index-url https://download.pytorch.org/whl/cpu \ | ||
&& uv pip install --no-cache-dir -r requirements.txt | ||
|
||
# Install additional large packages for all-docs unstructured inference and PII detection | ||
ARG INSTALL_EXTRA=false | ||
RUN if [ "$INSTALL_EXTRA" = "true" ]; then \ | ||
pip3 install --no-cache-dir -r requirements-extra.txt && \ | ||
uv pip install --no-cache-dir -r requirements-extra.txt && \ | ||
python3 -m spacy download en_core_web_lg; \ | ||
fi | ||
|
||
# Clean up unnecessary files | ||
RUN find /usr/local -type d \( -name test -o -name tests \) -exec rm -rf '{}' + \ | ||
&& find /usr/local -type f \( -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' + \ | ||
&& find /usr/local -type d -name '__pycache__' -exec rm -rf '{}' + \ | ||
&& rm -rf /root/.cache/* /tmp/* | ||
|
||
# Clean up unnecessary files in site-packages | ||
RUN find /usr/local/lib/python3.11/site-packages \ | ||
-type d \( -name 'tests' -o -name 'test' -o -name 'examples' \) -exec rm -rf '{}' + \ | ||
&& find /usr/local/lib/python3.11/site-packages -type d -name '__pycache__' -exec rm -rf '{}' + \ | ||
&& find /usr/local/lib/python3.11/site-packages -type f -name '*.pyc' -exec rm -f '{}' + \ | ||
&& find /usr/local/lib/python3.11/site-packages -type f -name '*.pyo' -exec rm -f '{}' + | ||
|
||
# Stage 2: Final image | ||
FROM python:3.11-slim | ||
|
||
ENV DEBIAN_FRONTEND=noninteractive \ | ||
HOME=/home/vectara \ | ||
XDG_RUNTIME_DIR=/tmp \ | ||
RAY_DEDUP_LOGS="0" \ | ||
CUDA_VISIBLE_DEVICES="" | ||
|
||
# Install runtime dependencies | ||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
# libopenblas-dev \ | ||
tesseract-ocr \ | ||
# xvfb \ | ||
unixodbc poppler-utils libmagic1 libjpeg62-turbo \ | ||
libfontconfig fonts-noto-color-emoji unifont fonts-indic xfonts-75dpi \ | ||
&& rm -rf /var/lib/apt/lists/* | ||
|
||
# Copy Python packages and application code from the builder stage | ||
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages | ||
COPY --from=builder /usr/local/bin /usr/local/bin | ||
|
||
# Install Playwright browsers | ||
RUN playwright install --with-deps firefox \ | ||
&& rm -f /usr/local/bin/pwdebug \ | ||
&& rm -rf /var/lib/apt/lists/* /tmp/* /root/.cache/* | ||
|
||
# Set working directory | ||
WORKDIR ${HOME} | ||
|
||
# Copy application code | ||
COPY *.py $HOME/ | ||
COPY core/*.py $HOME/core/ | ||
COPY crawlers/ $HOME/crawlers/ | ||
|
||
#SHELL ["/bin/bash", "-c"] | ||
|
||
# Set entrypoint and command | ||
ENTRYPOINT ["/bin/bash", "-l", "-c"] | ||
CMD ["python3 ingest.py $CONFIG $PROFILE"] | ||
CMD ["python3 ingest.py $CONFIG $PROFILE"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,4 +11,3 @@ website_crawler: | |
pos_regex: [".*hbs.edu.*"] | ||
num_per_second: 1 | ||
pages_source: sitemap | ||
extraction: playwright |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,4 +19,3 @@ rss_crawler: | |
] | ||
days_past: 30 | ||
delay: 1 | ||
extraction: playwright # pdf or playwright |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,4 +19,3 @@ rss_crawler: | |
] | ||
days_past: 30 | ||
delay: 1 | ||
extraction: playwright # pdf or playwright |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,4 +38,3 @@ rss_crawler: | |
] | ||
days_past: 30 | ||
delay: 1 | ||
extraction: playwright # pdf or playwright |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,4 +16,3 @@ rss_crawler: | |
] | ||
days_past: 30 | ||
delay: 1 | ||
extraction: playwright # pdf or playwright |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,4 +17,3 @@ rss_crawler: | |
] | ||
days_past: 30 | ||
delay: 1 | ||
extraction: playwright # pdf or playwright |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.