Skip to content

Commit

Permalink
feat(Storage): Filesystem-based storage backend for files
Browse files Browse the repository at this point in the history
* chore(docker): Add .ruff_cache & .venv to ignore folder for docker

* chore(CI): Copy the .env.dist to .env for the tests

* feat(Storage): Implement a filesystem storage to store the datasets files in a mounted directory

* Fix datasets worker

* Manage proxy settings in base.py settings and docker image build args (#810)

This reduces the number of env vars that need to be passed and makes it
possible to manage secured connections.

Co-authored-by: Christophe Philemotte <[email protected]>

* fix: enforce file size to be an integer

* chore: Adapt code based on Nazar's comments

* chore: Do not silently handle the ValueError in case of wrong value

---------

Co-authored-by: Christophe Philemotte <[email protected]>
Co-authored-by: Christophe Philemotte <[email protected]>
Co-authored-by: nazarfil <[email protected]>
  • Loading branch information
4 people authored Sep 19, 2024
1 parent 057451b commit d9d7159
Show file tree
Hide file tree
Showing 68 changed files with 1,827 additions and 1,794 deletions.
3 changes: 2 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
.idea
.devcontainer
.env

.ruff_cache
.venv
Dockerfile
docker-compose.yml
.dockerignore
Expand Down
154 changes: 141 additions & 13 deletions .env.dist
Original file line number Diff line number Diff line change
@@ -1,23 +1,149 @@
DEBUG_LOGGING=false
DEBUG_TOOLBAR=false
# General
###################

DEBUG=true

# Required to run with default storage mode GCP
GCS_SERVICE_ACCOUNT_KEY=
# Settings module for Django in dev
DJANGO_SETTINGS_MODULE=config.settings.dev

# To run it in AWS mode or in LocalHosting mode set the variable to s3
# WORKSPACE_STORAGE_ENGINE=s3
WORKSPACE_DATASETS_BUCKET=
WORKSPACE_STORAGE_ENGINE_AWS_ACCESS_KEY_ID=
WORKSPACE_STORAGE_ENGINE_AWS_SECRET_ACCESS_KEY=
# Django debugging settings
DEBUG_LOGGING=true
DEBUG_TOOLBAR=false

# Not required
# Encryption settings
SECRET_KEY="))dodw9%n)7q86l-q1by4e-2z#vonph50!%ep7_je)_=x0m2v-"
ENCRYPTION_KEY="oT7DKt8zf0vsnbBcJ0R36SHkBzbjF2agFIK3hSAVvko="

# Email settings
EMAIL_HOST=
EMAIL_PORT=
EMAIL_HOST_USER=
EMAIL_USE_TLS=
EMAIL_HOST_PASSWORD=

# Required to run legacy part of the code
# Database settings for Django
DATABASE_HOST=db
DATABASE_PORT=5432
DATABASE_NAME=hexa-app
DATABASE_USER=hexa-app
DATABASE_PASSWORD=hexa-app
# Database settings for Postgres
POSTGRES_DB=hexa-app
POSTGRES_USER=hexa-app
POSTGRES_PASSWORD=hexa-app

# Networking
############

# To enable TLS/SSL directly on the app
# TLS="false"

# The hostname on which the services are published / bound
BASE_HOSTNAME=localhost
# The port number to access the backend
BASE_PORT=8000
# URL to use for the communication between pipelines, workers & the backend's API
# If not set, it falls back to BASE_HOSTNAME:BASE_PORT
INTERNAL_BASE_URL=http://app:8000

# NextJS Frontend
# If not set, it falls back to either PROXY_HOSTNAME_AND_PORT or
# BASE_HOSTNAME:FRONTEND_PORT
# NEW_FRONTEND_DOMAIN=http://localhost:3000

# Jupyter Hub
# If not set, it falls back to either PROXY_HOSTNAME_AND_PORT or
# BASE_HOSTNAME:JUPYTERHUB_PORT
# NOTEBOOKS_URL=http://localhost:8001

# The port number to access the frontend
FRONTEND_PORT=3000
# The port number to access the Jupyter hub
JUPYTERHUB_PORT=8001

# I'd put that directly in the compose manifest file
OPENHEXA_BACKEND_URL=http://app:8000

# If the app is behind a reverse proxy
# PROXY_HOSTNAME_AND_PORT=example.com
# If TLS/SSL is set up on a reverse proxy routing to the app
# TRUST_FORWARDED_PROTO="no"

# MixPanel
##########

# mixpanel analytics
MIXPANEL_TOKEN=


# Pipelines
############

DEFAULT_WORKSPACE_IMAGE=blsq/openhexa-base-environment:latest # Change this to the image of the workspace you want to use by default
PIPELINE_SCHEDULER_SPAWNER=docker # Change to kubernetes to use kubernetes spawner

# Kubernetes resources settings (used only in kubernetes spawner mode)
PIPELINE_DEFAULT_CONTAINER_CPU_LIMIT=2
PIPELINE_DEFAULT_CONTAINER_MEMORY_LIMIT=4G
PIPELINE_DEFAULT_CONTAINER_CPU_REQUEST=0.05
PIPELINE_DEFAULT_CONTAINER_MEMORY_REQUEST=100M


# Notebooks
############

NOTEBOOKS_HUB_URL=http://jupyterhub:8000/hub
HUB_API_TOKEN=cbb352d6a412e266d7494fb014dd699373645ec8d353e00c7aa9dc79ca87800d # Change this to the token of the jupyterhub service

# Workspaces
#############

# Workspaces' DB settings
WORKSPACES_DATABASE_HOST=db
WORKSPACES_DATABASE_PORT=5432
WORKSPACES_DATABASE_ROLE=hexa-app
WORKSPACES_DATABASE_DEFAULT_DB=hexa-app
WORKSPACES_DATABASE_PASSWORD=hexa-app
WORKSPACES_DATABASE_PROXY_HOST=db


# Workspace storage options
# --------------------------

# Add a prefix to the bucket name (may be useful to separate dev and prod workspaces inside a shared Google Cloud Project)
WORKSPACE_BUCKET_PREFIX=

# Local FS: Define the root location where the workspaces files will be stored
# Absolute path to the directory where the workspaces data will be stored
WORKSPACE_STORAGE_LOCATION=$WORKSPACE_STORAGE_LOCATION
# Uncomment to disable the check of the file size before uploading it to the workspace (only for local storage)
#DISABLE_UPLOAD_MAX_SIZE_CHECK=false

## GCP: Mandatory to run with GCS
WORKSPACE_STORAGE_BACKEND_GCS_SERVICE_ACCOUNT_KEY=
# The region where the buckets will be created
# WORKSPACE_BUCKET_REGION=

## AWS: To run it in AWS mode or in LocalHosting mode set the variable to s3
WORKSPACE_STORAGE_BACKEND_AWS_ENDPOINT_URL=
WORKSPACE_STORAGE_BACKEND_AWS_PUBLIC_ENDPOINT_URL=
WORKSPACE_STORAGE_BACKEND_AWS_SECRET_ACCESS_KEY=
WORKSPACE_STORAGE_BACKEND_AWS_ACCESS_KEY_ID=
WORKSPACE_STORAGE_BACKEND_AWS_BUCKET_REGION=
# The region where the buckets will be created
# WORKSPACE_BUCKET_REGION=

# Datasets
###########

# Bucket to store datasets for all workspaces
WORKSPACE_DATASETS_BUCKET=hexa-datasets
WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE=50

# Legacy
#########

# Required for the `connector_s3` django app to work
AWS_USERNAME=
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
Expand All @@ -27,5 +153,7 @@ AWS_USER_ARN=
AWS_APP_ROLE_ARN=
AWS_PERMISSIONS_BOUNDARY_POLICY_ARN=

# mixpanel analytics
MIXPANEL_TOKEN=
# Accessmod settings
ACCESSMOD_BUCKET_NAME=s3://hexa-demo-accessmod
ACCESSMOD_MANAGE_REQUESTS_URL=http://localhost:3000/admin/access-requests
ACCESSMOD_SET_PASSWORD_URL=http://localhost:3000/account/set-password
4 changes: 4 additions & 0 deletions .github/workflows/build_docker_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ jobs:
context: .
target: app
file: Dockerfile
build-args: |
DJANGO_SETTINGS_MODULE=config.settings.dev
cache-from: type=registry,ref=blsq/openhexa-app:buildcache
cache-to: type=registry,ref=blsq/openhexa-app:buildcache,mode=max
tags: |
Expand All @@ -85,6 +87,8 @@ jobs:
context: .
target: app
file: Dockerfile
build-args: |
DJANGO_SETTINGS_MODULE=config.settings.dev
cache-from: type=registry,ref=blsq/openhexa-app:buildcache
cache-to: type=registry,ref=blsq/openhexa-app:buildcache,mode=max
tags: |
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,14 @@ jobs:

- name: Create docker network
run: docker network create openhexa

- name: Copy .env file
run: cp .env.dist .env

- name: Build docker app image
env:
DOCKER_BUILDKIT: 1
run: docker compose build
run: docker compose build --build-arg DJANGO_SETTINGS_MODULE="config.settings.dev"

- name: Run Django tests
run: docker compose run -e DEBUG=false app coveraged-test
Expand All @@ -63,10 +66,12 @@ jobs:
- name: Build and push (on main)
uses: docker/build-push-action@v6
with:
push: true
push: ${{ github.event_name != 'pull_request' }}
context: .
target: app
file: Dockerfile
build-args: |
DJANGO_SETTINGS_MODULE=config.settings.dev
cache-from: type=registry,ref=blsq/openhexa-app:buildcache
cache-to: type=registry,ref=blsq/openhexa-app:buildcache,mode=max
tags: |
Expand Down
63 changes: 44 additions & 19 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.12-slim as deps
FROM python:3.12-slim AS deps

RUN \
--mount=type=cache,target=/var/cache/apt,sharing=locked \
Expand All @@ -8,33 +8,58 @@ RUN \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

RUN pip install --upgrade pip

# Set up work directory
RUN mkdir /code
WORKDIR /code

RUN \
--mount=type=cache,target=/root/.cache \
--mount=type=bind,source=requirements.txt,target=/code/requirements.txt \
pip install setuptools==68.0.0 && pip install -r requirements.txt
# Upgrade pip and install requirements
RUN --mount=type=cache,target=/root/.cache \
pip install --upgrade pip setuptools==68.0.0

# Install project dependencies from requirements.txt
COPY requirements.txt /code/

RUN --mount=type=cache,target=/root/.cache \
pip install -r requirements.txt && \
apt-get remove -y build-essential && \
apt-get autoremove -y

# Copy the rest of the application
COPY . /code/

ENV SECRET_KEY="collectstatic"
ENV DJANGO_SETTINGS_MODULE config.settings.production
ARG DJANGO_SETTINGS_MODULE

# Entry point
ARG WORKSPACE_STORAGE_LOCATION
ENV DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE}
ENV WORKSPACE_STORAGE_LOCATION=${WORKSPACE_STORAGE_LOCATION}
ENTRYPOINT ["/code/docker-entrypoint.sh"]
CMD start

Check warning on line 37 in Dockerfile

View workflow job for this annotation

GitHub Actions / Run test suite

JSON arguments recommended for ENTRYPOINT/CMD to prevent unintended behavior related to OS signals

JSONArgsRecommended: JSON arguments recommended for CMD to prevent unintended behavior related to OS signals More info: https://docs.docker.com/go/dockerfile/rule/json-args-recommended/

FROM deps as app
ENV DJANGO_SETTINGS_MODULE config.settings.production
FROM deps AS app
ARG DJANGO_SETTINGS_MODULE
ARG WORKSPACE_STORAGE_LOCATION
ENV DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE}
ENV WORKSPACE_STORAGE_LOCATION=${WORKSPACE_STORAGE_LOCATION}
RUN python manage.py collectstatic --noinput

# Stage used to run the pipelines scheduler and runner
FROM app as pipelines
ENV DJANGO_SETTINGS_MODULE config.settings.production
RUN mkdir -m 0755 -p /etc/apt/keyrings
RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
RUN echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/debian \
$(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
RUN apt-get update && apt-get install -y docker-ce-cli
FROM app AS pipelines
ARG DJANGO_SETTINGS_MODULE
ARG WORKSPACE_STORAGE_LOCATION
ENV DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE}
ENV WORKSPACE_STORAGE_LOCATION=${WORKSPACE_STORAGE_LOCATION}
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt-get update && \
apt-get install -y --no-install-recommends \
curl \
ca-certificates \
gnupg && \
mkdir -m 0755 -p /etc/apt/keyrings && \
curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/debian \
$(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && \
apt-get update && \
apt-get install -y --no-install-recommends docker-ce-cli && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
Loading

0 comments on commit d9d7159

Please sign in to comment.