Skip to content

Commit

Permalink
Added pytest
Browse files Browse the repository at this point in the history
  • Loading branch information
rtdurga committed Aug 1, 2024
1 parent fee4424 commit 1eb3137
Show file tree
Hide file tree
Showing 43 changed files with 1,362 additions and 0 deletions.
Empty file added modules/__init__.py
Empty file.
Empty file added modules/sagemaker/__init__.py
Empty file.
Empty file.
Empty file.
24 changes: 24 additions & 0 deletions modules/sagemaker/sagemaker-groundtruth/tests/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# the codecommit repo where the pipeline should pull its source from
repoName: "aiops-to-greengrass-cs"
# the branch to use
branchName: "main"
# S3 prefix where pipeline assets will be stored
pipelineAssetsPrefix: "pipeline/labeling"
# whether to use a private workteam for labeling
usePrivateWorkteamForLabeling: false
# whether to use a private workteam for verification
usePrivateWorkteamForVerification: false
# maximum number of labels per labeling job
maxLabelsPerLabelingJob: 200
# the arn of the private workteam for labeling (only used if usePrivateWorkteamForLabeling is true)
labelingJobPrivateWorkteamArn: "arn:aws:sagemaker:eu-west-1:0000000000000:workteam/private-crowd/GT1"
# the arn of the private workteam for verification (only used if usePrivateWorkteamForVerification is true)
verificationJobPrivateWorkteamArn: "arn:aws:sagemaker:eu-west-1:0000000000000:workteam/private-crowd/GT1"
# labeling pipeline schedule, triggering once a month on the 1st to keep cost to a minimum; feel free to change this
labelingPipelineSchedule: "cron(0 12 1 * ? *)"
# featureGroupName in SageMaker Feature Store, where features should be saved
featureGroupName: "tag-quality-inspection"
# modelPackageGroupName in model Registry
modelPackageGroupName: "TagQualityInspectionPackageGroup"
# modelPackageGroupDescription in model Registry
modelPackageGroupDescription: "Contains models for quality inspection of metal tags"
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"labels": [{"label": "scratch"}]}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"document-version":"2021-05-13","labels":[{"label":"Label correct"},{"label":"Incorrect label - missed object"},{"label":"Incorrect label - bounding box not accurate enough"}]}

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

Large diffs are not rendered by default.

Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Build the container image for the Lambda function whose handler lives in app.py.
# Base image: Python 3.11 (x86_64) Lambda image pulled from public.ecr.aws.
FROM public.ecr.aws/lambda/python:3.11-x86_64

# Install the function's dependencies using file requirements.txt
# from your project folder.

COPY requirements.txt .
# Dependencies are installed into LAMBDA_TASK_ROOT, the same directory app.py is copied to below.
RUN pip3 install -r requirements.txt --target "${LAMBDA_TASK_ROOT}"
# Create /opt/extensions if it does not exist.
# NOTE(review): nothing in this file places anything in that directory; presumably reserved for Lambda extensions — confirm it is needed.
RUN mkdir -p /opt/extensions
# Copy function code
COPY app.py ${LAMBDA_TASK_ROOT}

# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
CMD [ "app.handler" ]
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
from typing import List, Dict, Any, Tuple, Optional

from collections import namedtuple
from botocore.exceptions import ClientError
import sagemaker
from urllib.parse import urlparse
import boto3
import logging
from sagemaker.feature_store.feature_group import FeatureGroup
import os

# SQL template run through the feature group's Athena query (see get_existing_labels).
# The inner query numbers the rows of each source_ref, newest first (ordered by
# event_time, then Api_Invocation_Time, then write_time, all descending); the outer
# query keeps only the newest row per source_ref, and only when that row has
# status 'APPROVED' and is not flagged as deleted.
# "{table}" is substituted with str.format before the query is executed.
APPROVED_LABELS_QUERY = """
SELECT *
FROM
(SELECT *, row_number()
OVER (PARTITION BY source_ref
ORDER BY event_time desc, Api_Invocation_Time DESC, write_time DESC) AS row_number
FROM "{table}")
WHERE row_number = 1 AND status = 'APPROVED' AND NOT is_deleted
"""

# Use the root logger and make sure INFO-level messages are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# initialize clients
# These are created once when the module is imported and shared by the functions below.
s3 = boto3.resource("s3")  # resource API, used to list the objects of a bucket
s3_client = boto3.client("s3")  # NOTE(review): not referenced in the visible code — confirm it is still needed
sagemaker_session = sagemaker.Session()  # handed to FeatureGroup when querying labels
sagemaker_client = boto3.client("sagemaker")  # used to check whether a feature group exists

# initialize config from env variables
# Immutable record holding the settings this module works with.
LambdaConfig = namedtuple(
    typename="LambdaConfig",
    field_names=(
        # name of the SageMaker feature group that is checked and queried
        "feature_group_name",
        # name of the query-result column whose values are returned as labels
        "feature_name_s3uri",
        # S3 URL (bucket + prefix) under which the input images are listed
        "input_images_s3uri",
        # S3 location passed to the Athena query as its output location
        "query_results_s3uri",
    ),
)


def initialize_lambda_config() -> LambdaConfig:
    """Assemble the Lambda configuration from environment variables.

    Every setting is looked up in ``os.environ``; when a variable is not
    set, the hard-coded fallback listed next to it is used instead.

    Returns:
        A ``LambdaConfig`` with all four fields populated.
    """
    env = os.environ
    return LambdaConfig(
        feature_group_name=env.get("FEATURE_GROUP_NAME", "tag-quality-inspection"),
        feature_name_s3uri=env.get("FEATURE_NAME_S3URI", "source_ref"),
        input_images_s3uri=env.get(
            "INPUT_IMAGES_S3URI", "s3://aiopsbucket/pipeline/assets/images/"
        ),
        query_results_s3uri=env.get(
            "QUERY_RESULTS_S3URI", "s3://aiopsbucket/tmp/feature_store_query_results"
        ),
    )


# Resolved once when the module is imported; handler() and get_existing_labels()
# read this module-level value instead of re-reading the environment.
lambda_config = initialize_lambda_config()


def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
    """Lambda entry point: report which input images have no approved label.

    Lists the ``.jpg`` / ``.png`` objects under the configured input image
    location, fetches the labels already present in the feature group, and
    returns the images that are not among them.

    Args:
        event: Invocation payload; only logged, not otherwise read.
        context: Invocation context object; not used.

    Returns:
        A dict with ``missing_labels`` (list of ``s3://`` URLs of images
        without a label) and ``missing_labels_count`` (its length).
    """
    logger.info(
        f"check-missing-labels called with event {event} and lambda config {lambda_config}"
    )

    images_bucket, images_prefix = split_s3_url(lambda_config.input_images_s3uri)
    image_urls = get_list_of_files(
        bucket=images_bucket, prefix=images_prefix, file_types=[".jpg", ".png"]
    )

    labeled_urls = get_existing_labels(
        lambda_config.feature_group_name, lambda_config.query_results_s3uri
    )
    unlabeled_urls = get_images_without_labels(
        images=image_urls, existing_labels=labeled_urls
    )

    logger.info(
        f"Finished check-missing-labels lambda with {len(unlabeled_urls)} missing labels"
    )
    return {
        "missing_labels_count": len(unlabeled_urls),
        "missing_labels": unlabeled_urls,
    }


def split_s3_url(s3_url: str) -> Tuple[str, str]:
    """Split an ``s3://bucket/key`` URL into its bucket and key parts.

    Fragment parsing is disabled, so a ``#`` in the URL stays part of the key.

    Args:
        s3_url: URL of the form ``s3://<bucket>/<key>``.

    Returns:
        ``(bucket, key)``; the key has its first character (the leading
        ``/`` of the URL path) removed and is empty when the URL has no path.
    """
    parsed = urlparse(s3_url, allow_fragments=False)
    # Drop the first character of the path, i.e. the "/" that separates
    # the bucket from the key.
    return parsed.netloc, parsed.path[1:]


def get_list_of_files(
    bucket: str, prefix: str, file_types: Optional[List[str]] = None
) -> List[str]:
    """List the objects below an S3 prefix that have an allowed file type.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix used to filter the bucket's objects.
        file_types: Accepted key suffixes (e.g. ``[".jpg", ".png"]``).
            With ``None`` or an empty list no object is accepted.

    Returns:
        ``s3://<bucket>/<key>`` URLs of all matching objects.
    """
    logger.info(f"Getting list of files for bucket {bucket} and prefix {prefix}")

    matching_urls: List[str] = [
        f"s3://{obj.bucket_name}/{obj.key}"
        for obj in s3.Bucket(bucket).objects.filter(Prefix=prefix)
        if is_allowed_file_type(obj.key, file_types)
    ]

    logger.info(f"Found {len(matching_urls)} images")
    return matching_urls


def is_allowed_file_type(file: str, file_types: Optional[List[str]] = None) -> bool:
    """Tell whether a file name ends with one of the accepted suffixes.

    The comparison is case-sensitive.

    Args:
        file: File name or object key to check.
        file_types: Accepted suffixes (e.g. ``[".jpg", ".png"]``).

    Returns:
        ``True`` if ``file`` ends with any entry of ``file_types``;
        ``False`` otherwise, including when ``file_types`` is ``None``
        or empty.
    """
    if not file_types:
        return False
    # str.endswith accepts a tuple of suffixes and stops at the first match,
    # replacing the manual flag loop that kept scanning after a hit.
    return file.endswith(tuple(file_types))


def feature_group_exists(feature_group_name: str) -> bool:
    """Check whether a SageMaker feature group with the given name exists.

    Args:
        feature_group_name: Name of the feature group to look up.

    Returns:
        ``True`` if the describe call succeeds, ``False`` if the service
        answers with the ``ResourceNotFound`` error code.

    Raises:
        ClientError: For any other service error. Previously such errors
            were swallowed and reported as "exists", which hid the real
            cause from the caller.
    """
    try:
        sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
    except ClientError as error:
        if error.response["Error"]["Code"] == "ResourceNotFound":
            logger.info(f"No feature group found with name {feature_group_name}")
            return False
        # Anything else is not evidence that the group exists; surface it.
        raise
    return True


def get_existing_labels(feature_group_name: str, query_results_s3uri: str) -> Any:
    """Fetch the approved labels stored in a feature group.

    Runs ``APPROVED_LABELS_QUERY`` against the feature group's Athena table
    and returns the values of the column named by
    ``lambda_config.feature_name_s3uri``.

    Args:
        feature_group_name: Name of the feature group to query.
        query_results_s3uri: S3 location passed to the query as its
            output location.

    Returns:
        The selected column's values as a list; an empty list when the
        feature group does not exist.
    """
    if not feature_group_exists(feature_group_name):
        return []

    feature_group = FeatureGroup(
        name=feature_group_name, sagemaker_session=sagemaker_session
    )
    query = feature_group.athena_query()
    query_string = APPROVED_LABELS_QUERY.format(table=query.table_name)
    logger.debug(f"Running query {query_string} against FeatureGroup {feature_group}")
    query.run(query_string=query_string, output_location=query_results_s3uri)
    query.wait()

    # Extract the label column once and reuse it for both the log line and
    # the return value (it was previously converted to a list twice).
    labels = query.as_dataframe()[lambda_config.feature_name_s3uri].tolist()
    logger.info(f"Found {len(labels)} labels")
    return labels


def get_images_without_labels(
    images: List[str], existing_labels: List[str]
) -> List[str]:
    """Return the images that do not appear among the existing labels.

    Args:
        images: Image identifiers to check.
        existing_labels: Identifiers that already have a label.

    Returns:
        The entries of ``images`` that are absent from ``existing_labels``,
        in their original order (duplicates in ``images`` are kept).
    """
    # Build the lookup set once: membership tests become O(1) instead of
    # scanning the whole label list for every image.
    labeled = set(existing_labels)
    missing_labels = [image for image in images if image not in labeled]
    logger.info(
        f"images: {len(images)} , existing_labels: {len(existing_labels)}, missing_labels: {len(missing_labels)}"
    )
    return missing_labels
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
pandas==1.5.2
scikit-learn==1.1.3
boto3==1.28.52
sagemaker==2.187.0
Empty file.
Loading

0 comments on commit 1eb3137

Please sign in to comment.