Commit 7064e93 (1 parent: ed4bc04)
Showing 185 changed files with 17,994 additions and 1 deletion.
@@ -0,0 +1,37 @@
#!/bin/bash -l

# Description
# This script checks that all tests are included in the matrix of the test step in ci-databricks.yml.
# It is used in the pipeline to ensure that all tests are included in the matrix.
# The script must be invoked with a filter matching the paths NOT included in the matrix.

# $@: (Optional) Can be set to specify a filter for running python tests at the specified path.
echo "Filter (paths): '$@'"

# Exit immediately with failure status if any command fails
set -e

cd source/settlement_report_python/tests/
# Enable extended globbing. E.g. see https://stackoverflow.com/questions/8525437/list-files-not-matching-a-pattern
shopt -s extglob

# The following runs pytest with the --collect-only flag to get the number of tests.
# 'grep' filters the output to the line(s) stating the number of tests collected. Multiple lines can be returned.
# 'awk' extracts the second column, which contains the number of tests.
# 'head' keeps only the first line of the output.
# Example output line returned by the grep filter: 'collected 10 items'
executed_test_count=$(coverage run --branch -m pytest $@ --collect-only | grep collected | awk '{print $2}' | head -n 1)

total_test_count=$(coverage run --branch -m pytest --collect-only | grep collected | awk '{print $2}' | head -n 1)

echo "Number of tests being executed: $executed_test_count"
echo "Total number of pytest tests: $total_test_count"

if [ "$total_test_count" == "$executed_test_count" ]; then
    echo "Not missing any tests."
else
    difference=$((total_test_count - executed_test_count))
    echo "Found $difference tests not executed. A folder is missing in the matrix."
    exit 1
fi
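For illustration, a hypothetical invocation of the script above (the script path and folder names are assumptions; the real values live in ci-databricks.yml, which is not shown in this diff):

    # With extglob enabled, '!(folder_a|folder_b)' matches every tests subfolder
    # except folder_a and folder_b (placeholder names for the matrix entries).
    .github/scripts/check_test_matrix.sh '!(folder_a|folder_b)'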
@@ -0,0 +1,55 @@
{
    "name": "Spark Dev",
    "build": {
        "dockerfile": "../.docker/Dockerfile",
        "args": {}
    },
    "customizations": {
        "vscode": {
            "extensions": [
                "matangover.mypy",
                "ms-python.flake8",
                "ms-dotnettools.dotnet-interactive-vscode",
                "ms-python.python",
                "ms-python.black-formatter",
                "littlefoxteam.vscode-python-test-adapter",
                "hbenl.vscode-test-explorer",
                "eamodio.gitlens",
                "ms-python.vscode-pylance",
                "HashiCorp.terraform",
                "christian-kohler.path-intellisense",
                "Gruntfuggly.todo-tree",
                "DavidAnson.vscode-markdownlint",
                "kevinglasson.cornflakes-linter",
                "KevinRose.vsc-python-indent",
                "sonarsource.sonarlint-vscode"
            ],
            // Set *default* container specific settings.json values on container create.
            "settings": {
                "terminal.integrated.shell.linux": "/bin/bash",
                "editor.formatOnSave": false,
                "[python]": {
                    "editor.formatOnSave": true
                },
                "python.formatting.provider": "black",
                "python.defaultInterpreterPath": "/opt/conda/bin/python",
                "python.languageServer": "Pylance",
                "markdownlint.config": {
                    "MD007": {
                        "indent": 4
                    }
                }
            }
        }
    },
    "containerEnv": {
        "GRANT_SUDO": "yes"
    },
    "forwardPorts": [
        5568
    ],
    "appPort": [
        "5568:5050"
    ],
    "containerUser": "root"
}
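As a sketch, the same definition can also be started outside VS Code with the Dev Containers CLI (assuming this file is .devcontainer/devcontainer.json, which the CLI picks up by convention, and that the @devcontainers/cli npm package is installed):

    # Build the image and start the dev container from the repository root.
    devcontainer up --workspace-folder .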
@@ -0,0 +1,15 @@
services:
  python-unit-test:
    image: ghcr.io/energinet-datahub/geh-settlement-report/python-unit-test:${IMAGE_TAG:-latest}
    volumes:
      # Forwards the local Docker socket to the container.
      - /var/run/docker.sock:/var/run/docker-host.sock
      # Update this to wherever you want VS Code to mount the folder of your project.
      - ..:/workspaces/geh-settlement-report:cached
      # Map to the Azure CLI token cache location (on Windows).
      - "${USERPROFILE}/.azure:/home/joyvan/.azure"
    environment:
      # Pass the environment variables from your shell straight through to your containers.
      # No warning is issued if the variable in the shell environment is not set.
      # See https://docs.docker.com/compose/environment-variables/set-environment-variables/#additional-information-1
      - AZURE_KEYVAULT_URL
@@ -0,0 +1,15 @@
services:
  python-unit-test:
    image: ghcr.io/energinet-datahub/geh-settlement-report/python-unit-test:${IMAGE_TAG:-latest}
    volumes:
      # Forwards the local Docker socket to the container.
      - /var/run/docker.sock:/var/run/docker-host.sock
      # Update this to wherever you want VS Code to mount the folder of your project.
      - ..:/workspaces/geh-settlement-report:cached
      # Map to the Azure CLI token cache location (on Linux).
      - "${HOME}/.azure:/home/joyvan/.azure"
    environment:
      # Pass the environment variables from your shell straight through to your containers.
      # No warning is issued if the variable in the shell environment is not set.
      # See https://docs.docker.com/compose/environment-variables/set-environment-variables/#additional-information-1
      - AZURE_KEYVAULT_URL
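The two compose files above differ only in where the Azure CLI token cache is mounted from (USERPROFILE on Windows, HOME on Linux). A minimal sketch of running the unit-test service (the compose file name is an assumption; it is not visible in this diff):

    # AZURE_KEYVAULT_URL is forwarded into the container because it is listed
    # without a value under 'environment:'; IMAGE_TAG falls back to 'latest'.
    export AZURE_KEYVAULT_URL="https://example-vault.vault.azure.net/"  # placeholder
    docker compose -f docker-compose.yml run --rm python-unit-test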
@@ -0,0 +1,38 @@
# This is a pip 'requirements.txt' file
# See https://pip.pypa.io/en/stable/reference/requirements-file-format/

#
# PYTHON TOOLS
#
black
build
coverage-threshold
flake8
mypy
pyspelling
pytest-xdist

#
# CODE DEPENDENCIES
# - Make sure any packages specified in setup.py are pinned to the same version here
#
databricks-cli==0.18
dataclasses-json==0.6.7
delta-spark==3.2.0
pyspark==3.5.1
dependency_injector==4.43.0
azure-identity==1.17.1
azure-keyvault-secrets==4.8.0
azure-monitor-opentelemetry==1.6.4
azure-core==1.32.0
azure-monitor-query==1.4.0
opengeh-spark-sql-migrations @ git+https://[email protected]/Energinet-DataHub/[email protected]#subdirectory=source/spark_sql_migrations
python-dateutil==2.8.2
types-python-dateutil==2.9.0.20241003
opengeh-telemetry @ git+https://[email protected]/Energinet-DataHub/[email protected]#subdirectory=source/telemetry

coverage==7.6.8
pytest==8.3.3
configargparse==1.7.0
pytest-mock==3.14.0
virtualenv==20.24.2
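To reproduce this environment locally, the file installs as usual (a sketch; it assumes the working directory contains the file, whose name is not shown in this diff):

    python -m pip install -r requirements.txt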
@@ -0,0 +1,56 @@
# Copyright 2020 Energinet DataHub A/S
#
# Licensed under the Apache License, Version 2.0 (the "License2");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The Spark version should follow the Spark version in Databricks.
# The Databricks version of Spark is controlled from dh3-infrastructure and uses the latest LTS (ATTOW - Spark v3.5.0).
# The pyspark-slim version should match the pyspark version in requirements.txt.
FROM ghcr.io/energinet-datahub/pyspark-slim:3.5.1-5

SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

RUN apt-get update; \
    # Install git as it is needed by Spark
    apt-get install --no-install-recommends -y git; \
    # curl is temporarily installed in order to download the Azure CLI (consider a multi-stage build instead)
    apt-get install --no-install-recommends -y curl; \
    # Install the Azure CLI, see https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt,
    # as it is needed by integration tests
    curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash; \
    # Clean up the apt cache to reduce image size
    apt-get remove -y curl; \
    rm -rf /var/lib/apt/lists/*

# This replaces the default Spark configuration in the Docker image with the one defined in the sibling file.
COPY spark-defaults.conf $SPARK_HOME/conf/

# Install python packages used in pyspark development (keep Spark-dependent packages aligned).
# The delta-spark version has to be compatible with the Spark version (https://docs.delta.io/latest/releases.html),
# for example: delta 2.2.x = spark 3.3.x.
COPY requirements.txt requirements.txt
RUN pip --no-cache-dir install -r requirements.txt

# Set misc environment variables required to properly run Spark.
# Note: the amount of memory used on the driver is adjusted here.
ENV PATH=$SPARK_HOME/bin:$HADOOP_HOME/bin:$PATH \
    PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip" \
    SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"

# Dynamically download the Spark dependencies declared in conf/spark-defaults.conf. Doing this at image build
# time saves time in the build pipeline, so that we don't need to download them on every build.
RUN spark-shell

# Make $HOME owned by root, which is the user used in the container.
# This is needed for e.g. commands that create files or folders in $HOME.
RUN sudo chown -R root:users $HOME
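A sketch of building the image locally, assuming the Dockerfile sits in a .docker/ folder alongside requirements.txt and spark-defaults.conf (as the "../.docker/Dockerfile" reference in devcontainer.json suggests; the tag is a placeholder):

    # The build context must contain requirements.txt and spark-defaults.conf,
    # because the Dockerfile COPYs them by relative path.
    docker build -t python-unit-test:local .docker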
@@ -0,0 +1,44 @@
#!/bin/bash -l

# Copyright 2020 Energinet DataHub A/S
#
# Licensed under the Apache License, Version 2.0 (the "License2");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# $1: Mandatory test folder path
# $2: (Optional) Can be set to specify a filter for running python tests by using 'keyword expressions'.
#     See the use of '-k' and 'keyword expressions' here: https://docs.pytest.org/en/7.4.x/how-to/usage.html#specifying-which-tests-to-run
echo "Tests folder path: '$1'"
echo "Filter (keyword expression): '$2'"

# Configure the Azure CLI to use the token cache, which must be mapped as a volume from the host machine.
export AZURE_CONFIG_DIR=/home/joyvan/.azure

# These env vars are important to ensure that the driver and worker nodes in Spark are aligned.
export PYSPARK_PYTHON=/opt/conda/bin/python
export PYSPARK_DRIVER_PYTHON=/opt/conda/bin/python

# Exit immediately with failure status if any command fails
set -e

# Enable extended globbing. E.g. see https://stackoverflow.com/questions/8525437/list-files-not-matching-a-pattern
shopt -s extglob

cd "$1"
coverage run --branch -m pytest -vv --junitxml=pytest-results.xml $2

# Create data for threshold evaluation
coverage json
# Create a human-friendly HTML report
coverage html

coverage-threshold --line-coverage-min 25
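A hypothetical invocation of the script above (the script name is an assumption). Note that $2 is expanded unquoted, so a value such as "-k smoke" word-splits into the two arguments pytest expects:

    # Run only the tests matching the keyword expression 'smoke' in the given folder.
    ./run_tests.sh source/settlement_report_python/tests "-k smoke"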
@@ -0,0 +1,38 @@
# This is a pip 'requirements.txt' file
# See https://pip.pypa.io/en/stable/reference/requirements-file-format/

#
# PYTHON TOOLS
#
black
build
coverage-threshold
flake8
mypy
pyspelling
pytest-xdist

#
# CODE DEPENDENCIES
# - Make sure any packages specified in setup.py are pinned to the same version here
#
databricks-cli==0.18
dataclasses-json==0.6.7
delta-spark==3.2.0
pyspark==3.5.1
dependency_injector==4.43.0
azure-identity==1.17.1
azure-keyvault-secrets==4.8.0
azure-monitor-opentelemetry==1.6.4
azure-core==1.32.0
azure-monitor-query==1.4.0
opengeh-spark-sql-migrations @ git+https://[email protected]/Energinet-DataHub/[email protected]#subdirectory=source/spark_sql_migrations
python-dateutil==2.8.2
types-python-dateutil==2.9.0.20241003
opengeh-telemetry @ git+https://[email protected]/Energinet-DataHub/[email protected]#subdirectory=source/telemetry

coverage==7.6.8
pytest==8.3.3
configargparse==1.7.0
pytest-mock==3.14.0
virtualenv==20.24.2
@@ -0,0 +1,15 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master                     spark://master:7077
# spark.eventLog.enabled           true
# spark.eventLog.dir               hdfs://namenode:8021/directory
# spark.serializer                 org.apache.spark.serializer.KryoSerializer
# spark.driver.memory              16g
# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

# Must match the delta-spark version pinned in requirements.txt
# (from Delta 3.x the Maven artifact is named delta-spark instead of delta-core).
spark.jars.packages io.delta:delta-spark_2.12:3.2.0

# spark.hadoop.fs.AbstractFileSystem.abfss.impl org.apache.hadoop.fs.azurebfs.Abfss
# spark.hadoop.fs.abfss.impl org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem
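For context, any default in this file can also be overridden per job at submit time (a sketch; job.py is a placeholder script name):

    # --conf overrides a single property from spark-defaults.conf for this run only.
    spark-submit --conf spark.driver.memory=4g job.py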