Refactor/sort out dependencies (#70)
This Pull Request:

- Integrates refactoring changes to clean up the dependencies and workflow files in the repo.
- This branch is pulled from the following branch / PR: #69
- Updates the method of pulling documents from s3.

---

Co-authored-by: Mark <[email protected]>
THOR300 and Mark authored Nov 21, 2023
1 parent 126ab82 commit 3995341
Showing 13 changed files with 851 additions and 839 deletions.
28 changes: 18 additions & 10 deletions .github/workflows/ci.yml
@@ -8,21 +8,29 @@ on:
   branches:
     - main

-# https://github.com/marketplace/actions/docker-layer-caching
 jobs:
-  build-start-test:
+  test:
     runs-on: ubuntu-latest

     steps:
-      - uses: actions/checkout@v2
+      - name: Setting up Github Actions
+        uses: actions/checkout@v4

       - name: Setting up Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.10'

-      - name: Build
-        run: make build
+      - name: Install Dependencies
+        run: |
+          python -m pip install "poetry==1.3.2" && poetry install --only integration-test

-      - name: Run Unit Tests
-        run: echo TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO
+      - name: Building the Image
+        run: make build

-      - name: Run Integration Tests
-        run: echo TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO-TODO
+      - name: Running Unit Tests of the Ingest Application
+        run: |
+          make test

       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@v1-node16
@@ -41,4 +49,4 @@ jobs:
         env:
           DOCKER_REGISTRY: ${{ secrets.DOCKER_REGISTRY }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
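Both workflows now install dependencies with `poetry install --only integration-test`, which only works if pyproject.toml defines a dependency group of that name. A minimal sketch of such a group, assuming the packages the integration tests visibly use (the versions are illustrative, not taken from this repo):

```toml
# Hypothetical pyproject.toml excerpt -- the group name must match the --only flag.
[tool.poetry.group.integration-test.dependencies]
pytest = "^7.0"        # drives `poetry run python -m pytest ... -m integration`
boto3 = "^1.26"        # bucket setup/teardown helpers
cloudpathlib = "^0.13" # S3Path handling in conftest.py
```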
20 changes: 12 additions & 8 deletions .github/workflows/integration-tests.yml
@@ -36,10 +36,11 @@ jobs:
           python-version: '3.10'

       - name: Installing Dependencies
-        run: pip install -r requirements_for_tests.txt
+        run: |
+          python -m pip install "poetry==1.3.2" && poetry install --only integration-test

       - name: Building the Image
-        run: make build_test
+        run: make build

       - name: Configuring AWS Credentials
         uses: aws-actions/configure-aws-credentials@v1
@@ -55,21 +56,21 @@ jobs:
       - name: Destroying Infrastructure
         run: |
-          python -m integration_tests.remove_test_buckets ${{ env.INGEST_DOCUMENT_BUCKET }} ${{ env.INGEST_PIPELINE_BUCKET }} ${{ secrets.UNIT_TESTS_AWS_REGION }}
+          poetry run python -m integration_tests.remove_test_buckets ${{ env.INGEST_DOCUMENT_BUCKET }} ${{ env.INGEST_PIPELINE_BUCKET }} ${{ secrets.UNIT_TESTS_AWS_REGION }}

       - name: Building s3 buckets and uploading test data
         run: |
-          python -m integration_tests.setup_test_buckets ${{ env.INGEST_DOCUMENT_BUCKET }} ${{ env.INGEST_PIPELINE_BUCKET }} ${{ secrets.UNIT_TESTS_AWS_REGION }}
-          python -m integration_tests.setup_execution_data_file ${{ env.INGEST_PIPELINE_BUCKET }} ${{ env.EXECUTION_DATA_PREFIX }}/${{ env.EXECUTION_DATA_FILE_NAME }} ${{ env.TEST_DATA_UPLOAD_PATH }}
+          poetry run python -m integration_tests.setup_test_buckets ${{ env.INGEST_DOCUMENT_BUCKET }} ${{ env.INGEST_PIPELINE_BUCKET }} ${{ secrets.UNIT_TESTS_AWS_REGION }}
+          poetry run python -m integration_tests.setup_execution_data_file ${{ env.INGEST_PIPELINE_BUCKET }} ${{ env.EXECUTION_DATA_PREFIX }}/${{ env.EXECUTION_DATA_FILE_NAME }} ${{ env.TEST_DATA_UPLOAD_PATH }}
           aws s3 sync integration_tests/data/pipeline_in s3://${{ env.INGEST_PIPELINE_BUCKET }}

       - name: Running the Ingest Stage
         run: |
-          docker run -e AWS_ACCESS_KEY_ID=${{ secrets.UNIT_TESTS_AWS_ACCESS_KEY_ID }} -e AWS_SECRET_ACCESS_KEY=${{ secrets.UNIT_TESTS_AWS_SECRET_ACCESS_KEY }} -e API_HOST="" -e MACHINE_USER_EMAIL="" -e MACHINE_USER_PASSWORD="" navigator-data-ingest-test --pipeline-bucket ${{ env.INGEST_PIPELINE_BUCKET }} --document-bucket ${{ env.INGEST_DOCUMENT_BUCKET }} --updates-file-name ${{ env.UPDATES_FILE_NAME }} --output-prefix ${{ env.INGEST_OUTPUT_PREFIX }} --embeddings-input-prefix ${{ env.EMBEDDINGS_INPUT_PREFIX }} --indexer-input-prefix ${{ env.INDEXER_INPUT_PREFIX }} --execution-id ${{ env.EXECUTION_DATA_ID }} --execution-data-prefix ${{ env.EXECUTION_DATA_PREFIX }}
+          docker run -e AWS_ACCESS_KEY_ID=${{ secrets.UNIT_TESTS_AWS_ACCESS_KEY_ID }} -e AWS_SECRET_ACCESS_KEY=${{ secrets.UNIT_TESTS_AWS_SECRET_ACCESS_KEY }} -e API_HOST="" -e MACHINE_USER_EMAIL="" -e MACHINE_USER_PASSWORD="" navigator-data-ingest --pipeline-bucket ${{ env.INGEST_PIPELINE_BUCKET }} --document-bucket ${{ env.INGEST_DOCUMENT_BUCKET }} --updates-file-name ${{ env.UPDATES_FILE_NAME }} --output-prefix ${{ env.INGEST_OUTPUT_PREFIX }} --embeddings-input-prefix ${{ env.EMBEDDINGS_INPUT_PREFIX }} --indexer-input-prefix ${{ env.INDEXER_INPUT_PREFIX }} --execution-id ${{ env.EXECUTION_DATA_ID }} --execution-data-prefix ${{ env.EXECUTION_DATA_PREFIX }}

       - name: Running Integration Tests on the Ingest Stage Output 🚀
         run: |
-          python -m pytest -vvv integration_tests/ -m integration
+          poetry run python -m pytest -vvv integration_tests/ -m integration
         env:
           INGEST_PIPELINE_BUCKET: ${{ env.INGEST_PIPELINE_BUCKET }}
           INGEST_OUTPUT_PREFIX: ${{ env.INGEST_OUTPUT_PREFIX }}
@@ -78,8 +79,11 @@ jobs:
           DOCUMENT_NAME_KEY: ${{ env.DOCUMENT_NAME_KEY }}
           PARSER_INPUT_EXPECTED_DATA_FILE_PATH: ${{ env.PARSER_INPUT_EXPECTED_DATA_FILE_PATH }}
           TEST_DATA_FILE_PATH: ${{ env.TEST_DATA_FILE_PATH }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.UNIT_TESTS_AWS_ACCESS_KEY_ID }}
+          AWS_DEFAULT_REGION: ${{ secrets.UNIT_TESTS_AWS_REGION }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.UNIT_TESTS_AWS_SECRET_ACCESS_KEY }}

       - name: Destroying Infrastructure
         if: always()
         run: |
-          python -m integration_tests.remove_test_buckets ${{ env.INGEST_DOCUMENT_BUCKET }} ${{ env.INGEST_PIPELINE_BUCKET }} ${{ secrets.UNIT_TESTS_AWS_REGION }}
+          poetry run python -m integration_tests.remove_test_buckets ${{ env.INGEST_DOCUMENT_BUCKET }} ${{ env.INGEST_PIPELINE_BUCKET }} ${{ secrets.UNIT_TESTS_AWS_REGION }}
32 changes: 0 additions & 32 deletions .github/workflows/unit-tests.yml

This file was deleted.

2 changes: 1 addition & 1 deletion Dockerfile
@@ -16,7 +16,7 @@ COPY src/navigator_data_ingest/main.py ./src/navigator_data_ingest/main.py

 # Install python dependencies using poetry
 RUN poetry config virtualenvs.create false
-RUN poetry install --without=dev
+RUN poetry install

 # Copy files to image
 COPY src ./src
4 changes: 2 additions & 2 deletions HOW_TO_UPDATE_TESTS.md
@@ -23,7 +23,7 @@ Effectively follow the first 3 steps of the integration-tests.yml github actions
 ### Build the docker image locally

 ```shell
-make build_test
+make build
 ```

 ### MAKE SURE YOU HAVE THE CORRECT AWS CREDENTIALS SET UP

@@ -85,7 +85,7 @@ If you are trying to figure out what the variables are look in the env var section
 Example:

 ```shell
-docker run -e AWS_ACCESS_KEY_ID=XXX -e AWS_SECRET_ACCESS_KEY=XXX -e API_HOST="" -e MACHINE_USER_EMAIL="" -e MACHINE_USER_PASSWORD="" navigator-data-ingest-test --pipeline-bucket pipbucket123123123 --document-bucket docbucket123123123 --updates-file-name new_and_updated_documents.json --output-prefix ingest_unit_test_parser_input --embeddings-input-prefix ingest_unit_test_embeddings_input --indexer-input-prefix ingest_unit_test_indexer_input --execution-id 123456 --execution-data-prefix execution_data
+docker run -e AWS_ACCESS_KEY_ID=XXX -e AWS_SECRET_ACCESS_KEY=XXX -e API_HOST="" -e MACHINE_USER_EMAIL="" -e MACHINE_USER_PASSWORD="" navigator-data-ingest --pipeline-bucket pipbucket123123123 --document-bucket docbucket123123123 --updates-file-name new_and_updated_documents.json --output-prefix ingest_unit_test_parser_input --embeddings-input-prefix ingest_unit_test_embeddings_input --indexer-input-prefix ingest_unit_test_indexer_input --execution-id 123456 --execution-data-prefix execution_data
 ```

 ### Sync Down Output
6 changes: 1 addition & 5 deletions Makefile
@@ -4,10 +4,6 @@ git_hooks:

 build:
 	docker build -t navigator-data-ingest .
-	docker build -t navigator-data-ingest-staging .
-
-build_test:
-	docker build -t navigator-data-ingest-test .

 test:
-	docker run --entrypoint pytest navigator-data-ingest-test ./src -vvv --log-cli-level=INFO -m 'not integration'
+	docker run --entrypoint pytest navigator-data-ingest ./src -vvv --log-cli-level=INFO -m 'not integration'
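With `build_test` and the staging tag removed, everything operates on the single `navigator-data-ingest` image; a typical local flow would presumably be:

```shell
# Assumes the Makefile above; both targets act on the one image tag.
make build   # docker build -t navigator-data-ingest .
make test    # unit tests only ('not integration' marker) inside the image
```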
32 changes: 24 additions & 8 deletions integration_tests/conftest.py
@@ -1,21 +1,37 @@
 import pytest
 from cloudpathlib import S3Path
 import os
+import boto3

+PIPELINE_BUCKET = os.environ.get("INGEST_PIPELINE_BUCKET")
+S3_CLIENT = boto3.client("s3")
+PAGINATOR = S3_CLIENT.get_paginator("list_objects")

-def get_bucket_files_with_suffix(bucket: S3Path, suffix: str) -> list[S3Path]:
-    """Get all the files in a bucket with a given suffix."""
-    bucket_files = []
-    for pattern in ["*", "*/*", "*/*/*", "*/*/*/*"]:
-        files = list(bucket.glob(pattern + suffix))
-        bucket_files.extend(set(files))
-    return bucket_files

+def get_bucket_files_with_suffix(bucket: str, suffix: str) -> list[S3Path]:
+    """Retrieve all the files in an s3 bucket with a given suffix."""
+    page_iterator = PAGINATOR.paginate(Bucket=bucket)
+
+    for page in page_iterator:
+        if "Contents" in page:
+            files_with_suffix = [
+                obj["Key"] for obj in page["Contents"] if obj["Key"].endswith(suffix)
+            ]
+
+            # Convert to s3 paths and return
+            return [
+                S3Path(os.path.join("s3://", bucket, file))
+                for file in files_with_suffix
+            ]
+    return []


 @pytest.fixture
 def bucket_path():
     """Get the bucket path."""
-    return S3Path(os.path.join("s3://", os.environ.get("INGEST_PIPELINE_BUCKET")))
+    if isinstance(PIPELINE_BUCKET, str) and len(PIPELINE_BUCKET) > 0:
+        return PIPELINE_BUCKET
+    raise ValueError(f"Invalid env var for PIPELINE_BUCKET: {str(PIPELINE_BUCKET)}")


 @pytest.fixture
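For comparison, a self-contained sketch of the same paginator technique that accumulates matches across every page of results (the committed helper returns after the first page that carries a "Contents" key); the bucket and suffix values here are illustrative:

```python
# Standalone sketch (not the committed code): suffix-filtered S3 listing
# using a boto3 paginator, collecting matches from all pages.
import os

import boto3
from cloudpathlib import S3Path


def list_keys_with_suffix(bucket: str, suffix: str) -> list[S3Path]:
    """Collect every object in `bucket` whose key ends with `suffix`."""
    paginator = boto3.client("s3").get_paginator("list_objects_v2")
    matches: list[S3Path] = []
    for page in paginator.paginate(Bucket=bucket):
        # Pages with no objects simply lack a "Contents" key.
        for obj in page.get("Contents", []):
            if obj["Key"].endswith(suffix):
                matches.append(S3Path(os.path.join("s3://", bucket, obj["Key"])))
    return matches


# Usage (hypothetical bucket): list_keys_with_suffix("pipbucket123123123", ".json")
```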
@@ -1,3 +1,3 @@
 {
-    "input_dir_path": "s3://pipbucket123123123/input/2022-11-01T21.53.26.945831"
+    "input_dir_path": "s3://pipbucket123123123/input/2022-11-01T21.53.26.945831"
 }
2 changes: 1 addition & 1 deletion integration_tests/setup_test_buckets.py
@@ -10,7 +10,7 @@ def build(
     pipeline_bucket_name: str,
     region: str,
 ) -> None:
-    """Setup integration_tests data and infrastructure for the integration integration_tests."""
+    """Setup integration_tests data and infrastructure for the integration tests."""
     s3_conn = boto3.client("s3", region_name=region)
     location = {"LocationConstraint": region}
2 changes: 1 addition & 1 deletion integration_tests/test_pipeline_bucket_state.py
@@ -21,7 +21,7 @@ def get_local_fp(file: S3Path) -> Path:

 def timestamped_file(file: S3Path) -> bool:
     """Check if a file is timestamped."""
-    return file.name.startswith("20")
+    return str(file.name).startswith("20")


 @pytest.mark.integration