initial attempt
cenh-halfspace committed Dec 3, 2024
1 parent ed4bc04 commit 7064e93
Showing 185 changed files with 17,994 additions and 1 deletion.
37 changes: 37 additions & 0 deletions .devcontainer/check_test_count.sh
@@ -0,0 +1,37 @@
#!/bin/bash -l

# Description
# This script checks that all tests are included in the matrix of the test step in ci-databricks.yml.
# It is used in the pipeline to ensure that no test folder is missing from the matrix.
# The script must be invoked with a filter matching the paths NOT included in the matrix.

# $@: (Optional) Can be set to specify a filter for running python tests at the specified path.
echo "Filter (paths): '$@'"

# Exit immediately with failure status if any command fails
set -e

cd source/settlement_report_python/tests/
# Enable extended globbing. E.g. see https://stackoverflow.com/questions/8525437/list-files-not-matching-a-pattern
shopt -s extglob

# This script runs pytest with the --collect-only flag to get the number of tests.
# 'grep' filters the output to get the line with the number of tests collected. Multiple lines can be returned.
# 'awk' is used to get the second column of the output which contains the number of tests.
# 'head' is used to get the first line of the output which contains the number of tests.
# Example output line returned by the grep filter: 'collected 10 items'
executed_test_count=$(coverage run --branch -m pytest $@ --collect-only | grep collected | awk '{print $2}' | head -n 1)

total_test_count=$(coverage run --branch -m pytest --collect-only | grep collected | awk '{print $2}' | head -n 1)

echo "Number of tests being executed: $executed_test_count"
echo "Total number of pytest tests: $total_test_count"


if [ "$total_test_count" == "$executed_test_count" ]; then
echo "Not missing any tests."
else
difference=$((total_test_count - executed_test_count))
echo "Found $difference tests not executed. A folder is missing in the matrix."
exit 1
fi
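For reference, the counting pipeline used above can be reproduced on its own against the same tests folder; a minimal sketch, assuming it is run from the repository root:

# Sketch: count collected tests without executing them, mirroring the grep/awk/head pipeline above.
collected=$(python -m pytest source/settlement_report_python/tests/ --collect-only | grep collected | awk '{print $2}' | head -n 1)
echo "Collected tests: $collected"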
55 changes: 55 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,55 @@
{
"name": "Spark Dev",
"build": {
"dockerfile": "../.docker/Dockerfile",
"args": {}
},
"customizations": {
"vscode": {
"extensions": [
"matangover.mypy",
"ms-python.flake8",
"ms-dotnettools.dotnet-interactive-vscode",
"ms-python.python",
"ms-python.black-formatter",
"littlefoxteam.vscode-python-test-adapter",
"hbenl.vscode-test-explorer",
"eamodio.gitlens",
"ms-python.vscode-pylance",
"HashiCorp.terraform",
"christian-kohler.path-intellisense",
"Gruntfuggly.todo-tree",
"DavidAnson.vscode-markdownlint",
"kevinglasson.cornflakes-linter",
"KevinRose.vsc-python-indent",
"sonarsource.sonarlint-vscode"
],
// Set *default* container specific settings.json values on container create.
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"editor.formatOnSave": false,
"[python]": {
"editor.formatOnSave": true
},
"python.formatting.provider": "black",
"python.defaultInterpreterPath": "/opt/conda/bin/python",
"python.languageServer": "Pylance",
"markdownlint.config": {
"MD007": {
"indent": 4
}
}
}
}
},
"containerEnv": {
"GRANT_SUDO": "yes"
},
"forwardPorts": [
5568
],
"appPort": [
"5568:5050"
],
"containerUser": "root"
}
15 changes: 15 additions & 0 deletions .devcontainer/docker-compose-windows.yml
@@ -0,0 +1,15 @@
services:
python-unit-test:
image: ghcr.io/energinet-datahub/geh-settlement-report/python-unit-test:${IMAGE_TAG:-latest}
volumes:
# Forwards the local Docker socket to the container.
- /var/run/docker.sock:/var/run/docker-host.sock
# Update this to wherever you want VS Code to mount the folder of your project
- ..:/workspaces/geh-settlement-report:cached
# Map to Azure CLI token cache location (on Windows)
- "${USERPROFILE}/.azure:/home/joyvan/.azure"
environment:
# Pass the environment variables from your shell straight through to your containers.
# No warning is issued if the variable in the shell environment is not set.
# See https://docs.docker.com/compose/environment-variables/set-environment-variables/#additional-information-1
- AZURE_KEYVAULT_URL
15 changes: 15 additions & 0 deletions .devcontainer/docker-compose.yml
@@ -0,0 +1,15 @@
services:
python-unit-test:
image: ghcr.io/energinet-datahub/geh-settlement-report/python-unit-test:${IMAGE_TAG:-latest}
volumes:
# Forwards the local Docker socket to the container.
- /var/run/docker.sock:/var/run/docker-host.sock
# Update this to wherever you want VS Code to mount the folder of your project
- ..:/workspaces/geh-settlement-report:cached
# Map to Azure CLI token cache location (on Linux)
- "${HOME}/.azure:/home/joyvan/.azure"
environment:
# Pass the environment variables from your shell straight through to your containers.
# No warning is issued if the variable in the shell environment is not set.
# See https://docs.docker.com/compose/environment-variables/set-environment-variables/#additional-information-1
- AZURE_KEYVAULT_URL
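Because AZURE_KEYVAULT_URL is passed through from the invoking shell, a minimal sketch of starting the service with the variable set (the vault URL below is a placeholder):

# Sketch: export the variable on the host, then start the unit-test service.
export AZURE_KEYVAULT_URL="https://example-vault.vault.azure.net/"   # placeholder value
docker compose -f .devcontainer/docker-compose.yml up python-unit-test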
38 changes: 38 additions & 0 deletions .devcontainer/requirements.txt
@@ -0,0 +1,38 @@
# This is a pip 'requirements.txt' file
# See https://pip.pypa.io/en/stable/reference/requirements-file-format/

#
# PYTHON TOOLS
#
black
build
coverage-threshold
flake8
mypy
pyspelling
pytest-xdist

#
# CODE DEPENDENCIES
# - Make sure any packages specified in setup.py are pinned to the same version here
#
databricks-cli==0.18
dataclasses-json==0.6.7
delta-spark==3.2.0
pyspark==3.5.1
dependency_injector==4.43.0
azure-identity==1.17.1
azure-keyvault-secrets==4.8.0
azure-monitor-opentelemetry==1.6.4
azure-core==1.32.0
azure-monitor-query==1.4.0
opengeh-spark-sql-migrations @ git+https://[email protected]/Energinet-DataHub/[email protected]#subdirectory=source/spark_sql_migrations
python-dateutil==2.8.2
types-python-dateutil==2.9.0.20241003
opengeh-telemetry @ git+https://[email protected]/Energinet-DataHub/[email protected]#subdirectory=source/telemetry

coverage==7.6.8
pytest==8.3.3
configargparse==1.7.0
pytest-mock==3.14.0
virtualenv==20.24.2
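One way to catch drift between these pins and the versions declared in setup.py is to install both into the same environment and let pip verify the result; a sketch, assuming the package lives under source/settlement_report_python (inferred from the tests path used elsewhere in this commit):

# Sketch: install the pinned requirements plus the package, then check for conflicts.
pip install -r .devcontainer/requirements.txt
pip install -e ./source/settlement_report_python   # assumed package location
pip check   # reports any packages with incompatible version requirements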
56 changes: 56 additions & 0 deletions .docker/Dockerfile
@@ -0,0 +1,56 @@
# Copyright 2020 Energinet DataHub A/S
#
# Licensed under the Apache License, Version 2.0 (the "License2");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The Spark version should follow the Spark version in Databricks.
# The Databricks Spark version is controlled from dh3-infrastructure and uses the latest LTS (at the time of writing, Spark v3.5.0).
# The pyspark-slim version should match the pyspark version in requirements.txt.
FROM ghcr.io/energinet-datahub/pyspark-slim:3.5.1-5

SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

RUN apt-get update; \
# Install git as it is needed by spark
apt-get install --no-install-recommends -y git; \
# Curl is temporarily installed in order to download the Azure CLI (consider multi stage build instead)
apt-get install --no-install-recommends -y curl; \
# Install Azure CLI, see https://learn.microsoft.com/en-us/cli/azure/install-azure-cli-linux?pivots=apt
# as it is needed by integration tests
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash; \
# Cleanup apt cache to reduce image size
apt-get remove -y curl; \
rm -rf /var/lib/apt/lists/*

# This replaces the default Spark configuration in the Docker image with the settings defined in the sibling file
COPY spark-defaults.conf $SPARK_HOME/conf/

# Install Python packages used in pyspark development (keep Spark-dependent packages aligned)
# delta-spark version has to have compatibility with spark version (https://docs.delta.io/latest/releases.html)
# example (delta 2.2.x = spark 3.3.x)
COPY requirements.txt requirements.txt
RUN pip --no-cache-dir install -r requirements.txt

# Set misc environment variables required to properly run Spark.
# Note that the amount of memory used on the driver is adjusted here.
ENV PATH=$SPARK_HOME/bin:$HADOOP_HOME/bin:$PATH \
PYTHONPATH="${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip" \
SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"

# Download the Spark dependencies declared in conf/spark-defaults.conf by running spark-shell once at image-build time, so they do not have to be downloaded on every pipeline run.
RUN spark-shell

# Make $HOME owned by root, which is the user used in the container
# This is needed for e.g. commands that create files or folders in $HOME
RUN sudo chown -R root:users $HOME
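A quick way to confirm inside the built image that the installed Spark-related packages still match the pins in requirements.txt and the compatibility note above; a sketch, not part of the build:

# Sketch: print name and version of the Spark-related packages installed by pip.
pip show pyspark delta-spark | grep -E '^(Name|Version)'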
44 changes: 44 additions & 0 deletions .docker/entrypoint.sh
@@ -0,0 +1,44 @@
#!/bin/bash -l

# Copyright 2020 Energinet DataHub A/S
#
# Licensed under the Apache License, Version 2.0 (the "License2");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# $1: Mandatory test folder path
# $2: (Optional) Can be set to specify a filter for running python tests by using 'keyword expressions'.
# See use of '-k' and 'keyword expressions' here: https://docs.pytest.org/en/7.4.x/how-to/usage.html#specifying-which-tests-to-run
echo "Tests folder path: '$1'"
echo "Filter (paths): '$2'"

# Configure the Azure CLI to use the token cache, which must be mapped as a volume from the host machine
export AZURE_CONFIG_DIR=/home/joyvan/.azure

# These env vars are important to ensure that the Python interpreter used by the Spark driver and worker nodes is aligned
export PYSPARK_PYTHON=/opt/conda/bin/python
export PYSPARK_DRIVER_PYTHON=/opt/conda/bin/python

# Exit immediately with failure status if any command fails
set -e

# Enable extended globbing. E.g. see https://stackoverflow.com/questions/8525437/list-files-not-matching-a-pattern
shopt -s extglob

cd $1
coverage run --branch -m pytest -vv --junitxml=pytest-results.xml $2

# Create data for threshold evaluation
coverage json
# Create human reader friendly HTML report
coverage html

coverage-threshold --line-coverage-min 25
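A hypothetical invocation matching the argument contract in the header comments, run inside the dev container (the script points at the container's conda Python); the tests folder is the one referenced by check_test_count.sh, and the keyword expression is a placeholder:

# Sketch: run the tests under the given folder, filtered by a pytest keyword expression.
./.docker/entrypoint.sh source/settlement_report_python/tests "-k report"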
38 changes: 38 additions & 0 deletions .docker/requirements.txt
@@ -0,0 +1,38 @@
# This is a pip 'requirements.txt' file
# See https://pip.pypa.io/en/stable/reference/requirements-file-format/

#
# PYTHON TOOLS
#
black
build
coverage-threshold
flake8
mypy
pyspelling
pytest-xdist

#
# CODE DEPENDENCIES
# - Make sure any packages specified in setup.py are pinned to the same version here
#
databricks-cli==0.18
dataclasses-json==0.6.7
delta-spark==3.2.0
pyspark==3.5.1
dependency_injector==4.43.0
azure-identity==1.17.1
azure-keyvault-secrets==4.8.0
azure-monitor-opentelemetry==1.6.4
azure-core==1.32.0
azure-monitor-query==1.4.0
opengeh-spark-sql-migrations @ git+https://[email protected]/Energinet-DataHub/[email protected]#subdirectory=source/spark_sql_migrations
python-dateutil==2.8.2
types-python-dateutil==2.9.0.20241003
opengeh-telemetry @ git+https://[email protected]/Energinet-DataHub/[email protected]#subdirectory=source/telemetry

coverage==7.6.8
pytest==8.3.3
configargparse==1.7.0
pytest-mock==3.14.0
virtualenv==20.24.2
15 changes: 15 additions & 0 deletions .docker/spark-defaults.conf
@@ -0,0 +1,15 @@
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 16g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

spark.jars.packages io.delta:delta-spark_2.12:3.2.0

# spark.hadoop.fs.AbstractFileSystem.abfss.impl org.apache.hadoop.fs.azurebfs.Abfss
# spark.hadoop.fs.abfss.impl org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem
14 changes: 14 additions & 0 deletions .github/workflows/ci-orchestrator.yml
@@ -35,11 +35,25 @@ jobs:
changes:
uses: ./.github/workflows/detect-changes.yml

ci_docker:
needs: changes
uses: Energinet-DataHub/.github/.github/workflows/python-build-and-push-docker-image.yml@v13
with:
docker_changed: ${{ needs.changes.outputs.docker == 'true' }}
docker_changed_in_commit: ${{ needs.changes.outputs.docker_in_commit == 'true' }}

ci_dotnet:
needs: changes
if: ${{ needs.changes.outputs.dotnet == 'true' || needs.changes.outputs.db_migrations == 'true' }}
uses: ./.github/workflows/ci-dotnet.yml

ci_python:
needs: [changes, ci_docker]
if: ${{ needs.changes.outputs.settlement_report_job == 'true' }}
uses: ./.github/workflows/ci-python.yml
with:
image_tag: ${{ needs.ci_docker.outputs.image_tag }}

render_c4model_views:
needs: changes
if: ${{ needs.changes.outputs.render_c4model_views == 'true' }}