feat(Storage): Filesystem-based storage backend for files

* chore(docker): Add .ruff_cache & .venv to ignore folder for docker * chore(CI): Copy the .env.dist to .env for the tests * feat(Storage): Implement a filesystem storage to store the datasets files in a mounted directory * Fix datasets worker * Manage proxy settings in base.py settings and docker image build args (#810) This allows to reduce the number of env vars to pass and manage secured connections. Co-authored-by: Christophe Philemotte <[email protected]> * fix: enforce file size to be an integer * chore: Adapt code based on Nazar's comments * chore: Do not silently handle the ValueError in case of wrong value --------- Co-authored-by: Christophe Philemotte <[email protected]> Co-authored-by: Christophe Philemotte <[email protected]> Co-authored-by: nazarfil <[email protected]>
BLSQ · Sep 19, 2024 · d9d7159 · d9d7159
1 parent 057451b
commit d9d7159
Show file tree

Hide file tree

Showing 68 changed files with 1,827 additions and 1,794 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -3,7 +3,8 @@
 .idea
 .devcontainer
 .env
-
+.ruff_cache
+.venv
 Dockerfile
 docker-compose.yml
 .dockerignore

diff --git a/.env.dist b/.env.dist
@@ -1,23 +1,149 @@
-DEBUG_LOGGING=false
-DEBUG_TOOLBAR=false
+# General
+###################
+
+DEBUG=true
 
-# Required to run with default storage mode GCP
-GCS_SERVICE_ACCOUNT_KEY=
+# Settings module for Django in dev
+DJANGO_SETTINGS_MODULE=config.settings.dev
 
-# To run it in AWS mode or in LocalHosting mode set the variable to s3
-# WORKSPACE_STORAGE_ENGINE=s3
-WORKSPACE_DATASETS_BUCKET=
-WORKSPACE_STORAGE_ENGINE_AWS_ACCESS_KEY_ID=
-WORKSPACE_STORAGE_ENGINE_AWS_SECRET_ACCESS_KEY=
+# Django debugging settings
+DEBUG_LOGGING=true
+DEBUG_TOOLBAR=false
 
-# Not required
+# Encryption settings
+SECRET_KEY="))dodw9%n)7q86l-q1by4e-2z#vonph50!%ep7_je)_=x0m2v-"
+ENCRYPTION_KEY="oT7DKt8zf0vsnbBcJ0R36SHkBzbjF2agFIK3hSAVvko="
+
+# Email settings
 EMAIL_HOST=
 EMAIL_PORT=
 EMAIL_HOST_USER=
 EMAIL_USE_TLS=
 EMAIL_HOST_PASSWORD=
 
-# Required to run legacy part of the code
+# Database settings for Django
+DATABASE_HOST=db
+DATABASE_PORT=5432
+DATABASE_NAME=hexa-app
+DATABASE_USER=hexa-app
+DATABASE_PASSWORD=hexa-app
+# Database settings for Postgres
+POSTGRES_DB=hexa-app
+POSTGRES_USER=hexa-app
+POSTGRES_PASSWORD=hexa-app
+
+# Networking
+############
+
+# To enable TLS/SSL directly on the app
+# TLS="false"
+
+# The hostname on which the services are published / bound
+BASE_HOSTNAME=localhost
+# The port number to access the backend
+BASE_PORT=8000
+# URL to use for the communication between pipelines, workers & the backend's API
+# If not set, it falls back to BASE_HOSTNAME:BASE_PORT
+INTERNAL_BASE_URL=http://app:8000
+
+# NextJS Frontend
+# If not set, it falls back to either PROXY_HOSTNAME_AND_PORT or
+# BASE_HOSTNAME:FRONTEND_PORT
+# NEW_FRONTEND_DOMAIN=http://localhost:3000
+
+# Jupyter Hub
+# If not set, it falls back to either PROXY_HOSTNAME_AND_PORT or
+# BASE_HOSTNAME:JUPYTERHUB_PORT
+# NOTEBOOKS_URL=http://localhost:8001
+
+# The port number to access the frontend
+FRONTEND_PORT=3000
+# The port number to access the Jupyter hub
+JUPYTERHUB_PORT=8001
+
+# NOTE: this could be set directly in the compose manifest file instead
+OPENHEXA_BACKEND_URL=http://app:8000
+
+# if it's behind a reverse proxy
+# PROXY_HOSTNAME_AND_PORT=example.com
+# If TLS/SSL is set up on a reverse proxy routing to the app
+# TRUST_FORWARDED_PROTO="no"
+
+# MixPanel
+##########
+
+# mixpanel analytics
+MIXPANEL_TOKEN=
+
+
+# Pipelines
+############
+
+DEFAULT_WORKSPACE_IMAGE=blsq/openhexa-base-environment:latest # Change this to the image of the workspace you want to use by default
+PIPELINE_SCHEDULER_SPAWNER=docker # Change to kubernetes to use kubernetes spawner
+
+# Kubernetes resources settings (used only in kubernetes spawner mode)
+PIPELINE_DEFAULT_CONTAINER_CPU_LIMIT=2
+PIPELINE_DEFAULT_CONTAINER_MEMORY_LIMIT=4G
+PIPELINE_DEFAULT_CONTAINER_CPU_REQUEST=0.05
+PIPELINE_DEFAULT_CONTAINER_MEMORY_REQUEST=100M
+
+
+# Notebooks
+############
+
+NOTEBOOKS_HUB_URL=http://jupyterhub:8000/hub
+HUB_API_TOKEN=cbb352d6a412e266d7494fb014dd699373645ec8d353e00c7aa9dc79ca87800d # Change this to the token of the jupyterhub service
+
+# Workspaces
+#############
+
+# Workspaces' DB settings
+WORKSPACES_DATABASE_HOST=db
+WORKSPACES_DATABASE_PORT=5432
+WORKSPACES_DATABASE_ROLE=hexa-app
+WORKSPACES_DATABASE_DEFAULT_DB=hexa-app
+WORKSPACES_DATABASE_PASSWORD=hexa-app
+WORKSPACES_DATABASE_PROXY_HOST=db
+
+
+# Workspace storage options
+# --------------------------
+
+# Add a prefix to the bucket name (may be useful to separate dev and prod workspaces inside a shared Google Cloud Project)
+WORKSPACE_BUCKET_PREFIX=
+
+# Local FS: Define the root location where the workspaces files will be stored
+# Absolute path to the directory where the workspaces data will be stored
+WORKSPACE_STORAGE_LOCATION=$WORKSPACE_STORAGE_LOCATION
+# Uncomment to disable the check of the file size before uploading it to the workspace (only for local storage)
+#DISABLE_UPLOAD_MAX_SIZE_CHECK=false
+
+## GCP: Mandatory to run with GCS
+WORKSPACE_STORAGE_BACKEND_GCS_SERVICE_ACCOUNT_KEY=
+# The region where the buckets will be created
+# WORKSPACE_BUCKET_REGION=
+
+## AWS: To run it in AWS mode or in LocalHosting mode set the variable to s3
+WORKSPACE_STORAGE_BACKEND_AWS_ENDPOINT_URL=
+WORKSPACE_STORAGE_BACKEND_AWS_PUBLIC_ENDPOINT_URL=
+WORKSPACE_STORAGE_BACKEND_AWS_SECRET_ACCESS_KEY=
+WORKSPACE_STORAGE_BACKEND_AWS_ACCESS_KEY_ID=
+WORKSPACE_STORAGE_BACKEND_AWS_BUCKET_REGION=
+# The region where the buckets will be created
+# WORKSPACE_BUCKET_REGION=
+
+# Datasets
+###########
+
+# Bucket to store datasets for all workspaces
+WORKSPACE_DATASETS_BUCKET=hexa-datasets
+WORKSPACE_DATASETS_FILE_SNAPSHOT_SIZE=50
+
+# Legacy
+#########
+
+# Required for the `connector_s3` django app to work
 AWS_USERNAME=
 AWS_ACCESS_KEY_ID=
 AWS_SECRET_ACCESS_KEY=
@@ -27,5 +153,7 @@ AWS_USER_ARN=
 AWS_APP_ROLE_ARN=
 AWS_PERMISSIONS_BOUNDARY_POLICY_ARN=
 
-# mixpanel analytics
-MIXPANEL_TOKEN=
+# Accessmod settings
+ACCESSMOD_BUCKET_NAME=s3://hexa-demo-accessmod
+ACCESSMOD_MANAGE_REQUESTS_URL=http://localhost:3000/admin/access-requests
+ACCESSMOD_SET_PASSWORD_URL=http://localhost:3000/account/set-password
diff --git a/.github/workflows/build_docker_image.yml b/.github/workflows/build_docker_image.yml
@@ -71,6 +71,8 @@ jobs:
           context: .
           target: app
           file: Dockerfile
+          build-args: |
+            DJANGO_SETTINGS_MODULE=config.settings.dev
           cache-from: type=registry,ref=blsq/openhexa-app:buildcache
           cache-to: type=registry,ref=blsq/openhexa-app:buildcache,mode=max
           tags: |
@@ -85,6 +87,8 @@ jobs:
           context: .
           target: app
           file: Dockerfile
+          build-args: |
+            DJANGO_SETTINGS_MODULE=config.settings.dev
           cache-from: type=registry,ref=blsq/openhexa-app:buildcache
           cache-to: type=registry,ref=blsq/openhexa-app:buildcache,mode=max
           tags: |

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -50,11 +50,14 @@ jobs:
 
       - name: Create docker network
         run: docker network create openhexa
+
+      - name: Copy .env file
+        run: cp .env.dist .env
 
       - name: Build docker app image
         env:
           DOCKER_BUILDKIT: 1
-        run: docker compose build
+        run: docker compose build --build-arg DJANGO_SETTINGS_MODULE="config.settings.dev"
 
       - name: Run Django tests
         run: docker compose run -e DEBUG=false app coveraged-test
@@ -63,10 +66,12 @@ jobs:
       - name: Build and push (on main)
         uses: docker/build-push-action@v6
         with:
-          push: true
+          push: ${{ github.event_name != 'pull_request' }} 
           context: .
           target: app
           file: Dockerfile
+          build-args: |
+            DJANGO_SETTINGS_MODULE=config.settings.dev
           cache-from: type=registry,ref=blsq/openhexa-app:buildcache
           cache-to: type=registry,ref=blsq/openhexa-app:buildcache,mode=max
           tags: |

diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.12-slim as deps
+FROM python:3.12-slim AS deps
 
 RUN \
   --mount=type=cache,target=/var/cache/apt,sharing=locked \
@@ -8,33 +8,58 @@ RUN \
   apt-get clean && \
   rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
-RUN pip install --upgrade pip
-
+# Set up work directory
 RUN mkdir /code
 WORKDIR /code
 
-RUN \
-  --mount=type=cache,target=/root/.cache \ 
-  --mount=type=bind,source=requirements.txt,target=/code/requirements.txt \
-  pip install setuptools==68.0.0 && pip install -r requirements.txt
+# Upgrade pip and install the pinned setuptools version
+RUN --mount=type=cache,target=/root/.cache \
+    pip install --upgrade pip setuptools==68.0.0
+
+# Install project dependencies from requirements.txt
+COPY requirements.txt /code/
 
+RUN --mount=type=cache,target=/root/.cache \
+    pip install -r requirements.txt  && \ 
+    apt-get remove -y build-essential && \
+    apt-get autoremove -y
+
+# Copy the rest of the application
 COPY . /code/
 
-ENV SECRET_KEY="collectstatic"
-ENV DJANGO_SETTINGS_MODULE config.settings.production
+ARG DJANGO_SETTINGS_MODULE
+
+# Runtime environment (populated from build args) and entry point
+ARG WORKSPACE_STORAGE_LOCATION
+ENV DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE}
+ENV WORKSPACE_STORAGE_LOCATION=${WORKSPACE_STORAGE_LOCATION}
 ENTRYPOINT ["/code/docker-entrypoint.sh"]
 CMD start
 
-FROM deps as app
-ENV DJANGO_SETTINGS_MODULE config.settings.production
+FROM deps AS app
+ARG DJANGO_SETTINGS_MODULE
+ARG WORKSPACE_STORAGE_LOCATION
+ENV DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE}
+ENV WORKSPACE_STORAGE_LOCATION=${WORKSPACE_STORAGE_LOCATION}
 RUN python manage.py collectstatic --noinput
 
 # Staged used to run the pipelines scheduler and runner
-FROM app as pipelines
-ENV DJANGO_SETTINGS_MODULE config.settings.production
-RUN mkdir -m 0755 -p /etc/apt/keyrings
-RUN curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
-RUN echo \
-  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/debian \
-  $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
-RUN apt-get update && apt-get install -y docker-ce-cli
+FROM app AS pipelines
+ARG DJANGO_SETTINGS_MODULE
+ARG WORKSPACE_STORAGE_LOCATION
+ENV DJANGO_SETTINGS_MODULE=${DJANGO_SETTINGS_MODULE}
+ENV WORKSPACE_STORAGE_LOCATION=${WORKSPACE_STORAGE_LOCATION}
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        ca-certificates \
+        gnupg && \
+    mkdir -m 0755 -p /etc/apt/keyrings && \
+    curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
+    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/debian \
+        $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends docker-ce-cli && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*