diff --git a/CHANGELOG.md b/CHANGELOG.md
index 610dd8eb..8b2ba995 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - added `sagemaker-custom-kernel` module
 - added batch inference project template to `sagemaker-templates-service-catalog` module
 - added EFS removal policy to `mlflow-fargate` module
+- added `mwaa` module with an example DAG demonstrating MLOps in Airflow
 
 ### **Changed**
 
diff --git a/README.md b/README.md
index 7961d57d..15f1f1aa 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,13 @@ See deployment steps in the [Deployment Guide](DEPLOYMENT.md).
 |-----------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------|
 | [SageMaker JumpStart Foundation Model Endpoint Module](modules/fmops/sagemaker-jumpstart-fm-endpoint/README.md)  | Creates an endpoint for a SageMaker JumpStart Foundation Model. |
+
+### MWAA Modules
+
+| Type | Description |
+|-------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [Example DAG for MLOps](modules/examples/airflow-dags/README.md)         | Deploys a sample DAG to MWAA that demonstrates MLOps, using the MWAA module from IDF |
+
 ### Industry Data Framework (IDF) Modules
 
 The modules in this repository are compatible with [Industry Data Framework (IDF) Modules](https://github.com/awslabs/idf-modules) and can be used together within the same deployment. Refer to `examples/manifests` for examples.
diff --git a/data/mwaa/requirements/constraint-requirements.txt b/data/mwaa/requirements/constraint-requirements.txt
new file mode 100644
index 00000000..dd78068d
--- /dev/null
+++ b/data/mwaa/requirements/constraint-requirements.txt
@@ -0,0 +1,715 @@
+
+#
+# This constraints file was automatically generated on 2024-01-16T07:06:10.475440
+# via "eager-upgrade" mechanism of PIP. For the "v2-8-test" branch of Airflow.
+# This variant of constraints install uses the HEAD of the branch version for 'apache-airflow' but installs
+# the providers from PIP-released packages at the moment of the constraint generation.
+#
+# Those constraints are actually those that regular users use to install released version of Airflow.
+# We also use those constraints after "apache-airflow" is released and the constraints are tagged with
+# "constraints-X.Y.Z" tag to build the production image for that version.
+#
+# This constraints file is meant to be used only in the "apache-airflow" installation command and not
+# in all subsequent pip commands. By using a constraints.txt file, we ensure that solely the Airflow
+# installation step is reproducible. Subsequent pip commands may install packages that would have
+# been incompatible with the constraints used in Airflow reproducible installation step. Finally, pip
+# commands that might change the installed version of apache-airflow should include "apache-airflow==X.Y.Z"
+# in the list of install targets to prevent Airflow accidental upgrade or downgrade.
+#
+# Typical installation process of airflow for Python 3.8 is (with random selection of extras and custom
+# dependencies added), usually consists of two steps:
+#
+# 1. 
Reproducible installation of airflow with selected providers (note constraints are used): +# +# pip install "apache-airflow[celery,cncf.kubernetes,google,amazon,snowflake]==X.Y.Z" \ +# --constraint \ +# "https://raw.githubusercontent.com/apache/airflow/constraints-X.Y.Z/constraints-3.11.txt" +# +# 2. Installing own dependencies that are potentially not matching the constraints (note constraints are not +# used, and apache-airflow==X.Y.Z is used to make sure there is no accidental airflow upgrade/downgrade. +# +# pip install "apache-airflow==X.Y.Z" "snowflake-connector-python[pandas]=N.M.O" +# +Authlib==1.3.0 +Babel==2.14.0 +ConfigUpdater==3.2 +Deprecated==1.2.14 +Flask-AppBuilder==4.3.10 +Flask-Babel==2.0.0 +Flask-Bcrypt==1.0.1 +Flask-Caching==2.1.0 +Flask-JWT-Extended==4.6.0 +Flask-Limiter==3.5.0 +Flask-Login==0.6.3 +Flask-SQLAlchemy==2.5.1 +Flask-Session==0.5.0 +Flask-WTF==1.2.1 +Flask==2.2.5 +GitPython==3.1.41 +JPype1==1.5.0 +JayDeBeApi==1.2.3 +Jinja2==3.1.3 +Js2Py==0.74 +Mako==1.3.0 +Markdown==3.5.2 +MarkupSafe==2.1.3 +PyGithub==2.1.1 +PyHive==0.7.0 +PyJWT==2.8.0 +PyNaCl==1.5.0 +PyYAML==6.0.1 +Pygments==2.17.2 +SQLAlchemy-JSONField==1.0.2 +SQLAlchemy-Utils==0.41.1 +SQLAlchemy==1.4.51 +SecretStorage==3.3.3 +Sphinx==5.3.0 +WTForms==3.1.2 +Werkzeug==2.2.3 +adal==1.2.7 +adlfs==2023.12.0 +aiobotocore==2.11.2 +aiofiles==23.2.1 +aiohttp==3.9.1 +aioitertools==0.11.0 +aioresponses==0.7.6 +aiosignal==1.3.1 +alabaster==0.7.16 +alembic==1.13.1 +alibabacloud-adb20211201==1.2.6 +alibabacloud-tea==0.3.5 +alibabacloud_credentials==0.3.2 +alibabacloud_endpoint_util==0.0.3 +alibabacloud_gateway_spi==0.0.1 +alibabacloud_openapi_util==0.2.2 +alibabacloud_tea_openapi==0.3.8 +alibabacloud_tea_util==0.3.11 +alibabacloud_tea_xml==0.0.2 +aliyun-python-sdk-core==2.14.0 +aliyun-python-sdk-kms==2.16.2 +amqp==5.2.0 +analytics-python==1.2.9 +annotated-types==0.6.0 +anyascii==0.3.2 +anyio==4.2.0 +apache-airflow-providers-airbyte==3.5.1 +apache-airflow-providers-alibaba==2.7.1 +apache-airflow-providers-amazon==8.16.0 +apache-airflow-providers-apache-beam==5.5.0 +apache-airflow-providers-apache-cassandra==3.4.1 +apache-airflow-providers-apache-drill==2.6.0 +apache-airflow-providers-apache-druid==3.7.0 +apache-airflow-providers-apache-flink==1.3.0 +apache-airflow-providers-apache-hdfs==4.3.2 +apache-airflow-providers-apache-hive==6.4.1 +apache-airflow-providers-apache-impala==1.3.0 +apache-airflow-providers-apache-kafka==1.3.1 +apache-airflow-providers-apache-kylin==3.5.0 +apache-airflow-providers-apache-livy==3.7.1 +apache-airflow-providers-apache-pig==4.3.0 +apache-airflow-providers-apache-pinot==4.3.0 +apache-airflow-providers-apache-spark==4.7.0 +apache-airflow-providers-apprise==1.2.1 +apache-airflow-providers-arangodb==2.4.1 +apache-airflow-providers-asana==2.4.1 +apache-airflow-providers-atlassian-jira==2.5.0 +apache-airflow-providers-celery==3.5.1 +apache-airflow-providers-cloudant==3.4.1 +apache-airflow-providers-cncf-kubernetes==7.13.0 +apache-airflow-providers-cohere==1.1.1 +apache-airflow-providers-common-io==1.2.0 +apache-airflow-providers-common-sql==1.10.0 +apache-airflow-providers-databricks==6.0.0 +apache-airflow-providers-datadog==3.5.1 +apache-airflow-providers-dbt-cloud==3.5.1 +apache-airflow-providers-dingding==3.4.0 +apache-airflow-providers-discord==3.5.0 +apache-airflow-providers-docker==3.9.1 +apache-airflow-providers-elasticsearch==5.3.1 +apache-airflow-providers-exasol==4.4.1 +apache-airflow-providers-facebook==3.4.0 +apache-airflow-providers-ftp==3.7.0 +apache-airflow-providers-github==2.5.1 
+apache-airflow-providers-google==10.13.1 +apache-airflow-providers-grpc==3.4.1 +apache-airflow-providers-hashicorp==3.6.1 +apache-airflow-providers-http==4.8.0 +apache-airflow-providers-imap==3.5.0 +apache-airflow-providers-influxdb==2.4.0 +apache-airflow-providers-jdbc==4.2.1 +apache-airflow-providers-jenkins==3.5.1 +apache-airflow-providers-microsoft-azure==8.5.1 +apache-airflow-providers-microsoft-mssql==3.6.0 +apache-airflow-providers-microsoft-psrp==2.5.0 +apache-airflow-providers-microsoft-winrm==3.4.0 +apache-airflow-providers-mongo==3.5.0 +apache-airflow-providers-mysql==5.5.1 +apache-airflow-providers-neo4j==3.5.0 +apache-airflow-providers-odbc==4.4.0 +apache-airflow-providers-openai==1.1.0 +apache-airflow-providers-openfaas==3.4.0 +apache-airflow-providers-openlineage==1.4.0 +apache-airflow-providers-opensearch==1.1.1 +apache-airflow-providers-opsgenie==5.5.0 +apache-airflow-providers-oracle==3.9.1 +apache-airflow-providers-pagerduty==3.6.0 +apache-airflow-providers-papermill==3.6.0 +apache-airflow-providers-pgvector==1.1.0 +apache-airflow-providers-pinecone==1.1.1 +apache-airflow-providers-postgres==5.10.0 +apache-airflow-providers-presto==5.4.0 +apache-airflow-providers-redis==3.6.0 +apache-airflow-providers-salesforce==5.6.1 +apache-airflow-providers-samba==4.5.0 +apache-airflow-providers-segment==3.4.0 +apache-airflow-providers-sendgrid==3.4.0 +apache-airflow-providers-sftp==4.8.1 +apache-airflow-providers-singularity==3.4.0 +apache-airflow-providers-slack==8.5.1 +apache-airflow-providers-smtp==1.5.0 +apache-airflow-providers-snowflake==5.2.1 +apache-airflow-providers-sqlite==3.7.0 +apache-airflow-providers-ssh==3.10.0 +apache-airflow-providers-tableau==4.4.0 +apache-airflow-providers-tabular==1.4.1 +apache-airflow-providers-telegram==4.3.0 +apache-airflow-providers-trino==5.6.0 +apache-airflow-providers-vertica==3.7.0 +apache-airflow-providers-weaviate==1.3.0 +apache-airflow-providers-yandex==3.7.1 +apache-airflow-providers-zendesk==4.6.0 +apache-beam==2.53.0 +apispec==6.4.0 +apprise==1.7.1 +argcomplete==3.2.1 +asana==3.2.2 +asgiref==3.7.2 +asn1crypto==1.5.1 +astroid==2.15.8 +asttokens==2.4.1 +atlasclient==1.0.0 +atlassian-python-api==3.41.5 +attrs==23.2.0 +aws-sam-translator==1.83.0 +aws-xray-sdk==2.12.1 +azure-batch==14.1.0 +azure-common==1.1.28 +azure-core==1.29.6 +azure-cosmos==4.5.1 +azure-datalake-store==0.0.53 +azure-identity==1.15.0 +azure-keyvault-secrets==4.7.0 +azure-kusto-data==4.3.1 +azure-mgmt-containerinstance==10.1.0 +azure-mgmt-containerregistry==10.3.0 +azure-mgmt-core==1.4.0 +azure-mgmt-cosmosdb==9.4.0 +azure-mgmt-datafactory==4.0.0 +azure-mgmt-datalake-nspkg==3.0.1 +azure-mgmt-datalake-store==0.5.0 +azure-mgmt-nspkg==3.0.2 +azure-mgmt-resource==23.0.1 +azure-mgmt-storage==21.1.0 +azure-nspkg==3.0.2 +azure-servicebus==7.11.4 +azure-storage-blob==12.19.0 +azure-storage-file-datalake==12.14.0 +azure-storage-file-share==12.15.0 +azure-synapse-artifacts==0.18.0 +azure-synapse-spark==0.7.0 +backoff==2.2.1 +bcrypt==4.1.2 +beautifulsoup4==4.12.2 +billiard==4.2.0 +bitarray==2.9.2 +black==24.1a1 +blinker==1.7.0 +boto3==1.33.13 +botocore==1.33.13 +cachelib==0.9.0 +cachetools==5.3.2 +cassandra-driver==3.29.0 +cattrs==23.2.3 +celery==5.3.6 +certifi==2023.11.17 +cffi==1.16.0 +cfgv==3.4.0 +cfn-lint==0.83.8 +cgroupspy==0.2.2 +chardet==5.2.0 +charset-normalizer==3.3.2 +checksumdir==1.2.0 +ciso8601==2.3.1 +click-didyoumean==0.3.0 +click-plugins==1.1.1 +click-repl==0.3.0 +click==8.1.7 +clickclick==20.10.2 +cloudant==2.15.0 +cloudpickle==2.2.1 +cohere==4.44 +colorama==0.4.6 
+colorlog==4.8.0 +comm==0.2.1 +confluent-kafka==2.3.0 +connexion==2.14.2 +coverage==7.4.0 +crcmod==1.7 +cron-descriptor==1.4.0 +croniter==2.0.1 +cryptography==41.0.7 +curlify==2.2.1 +databricks-sql-connector==2.9.3 +datadog==0.47.0 +db-dtypes==1.2.0 +debugpy==1.8.0 +decorator==5.1.1 +defusedxml==0.7.1 +deltalake==0.15.1 +dill==0.3.8 +distlib==0.3.8 +distro==1.9.0 +dnspython==2.4.2 +docker==7.0.0 +docopt==0.6.2 +docutils==0.20.1 +duckdb==0.9.2 +ecdsa==0.18.0 +editables==0.5 +elastic-transport==8.11.0 +elasticsearch==8.11.1 +email-validator==1.3.1 +entrypoints==0.4 +eralchemy2==1.3.8 +et-xmlfile==1.1.0 +eventlet==0.34.3 +execnet==2.0.2 +executing==2.0.1 +facebook-business==18.0.5 +fastavro==1.9.3 +fasteners==0.19 +fastjsonschema==2.19.1 +filelock==3.13.1 +flower==2.0.1 +frozenlist==1.4.1 +fsspec==2023.12.2 +future==0.18.3 +gcloud-aio-auth==4.2.3 +gcloud-aio-bigquery==7.0.0 +gcloud-aio-storage==9.0.0 +gcsfs==2023.12.2.post1 +geomet==0.2.1.post1 +gevent==23.9.1 +gitdb==4.0.11 +google-ads==22.1.0 +google-analytics-admin==0.22.2 +google-api-core==2.15.0 +google-api-python-client==2.113.0 +google-auth-httplib2==0.2.0 +google-auth-oauthlib==1.2.0 +google-auth==2.26.2 +google-cloud-aiplatform==1.39.0 +google-cloud-appengine-logging==1.4.0 +google-cloud-audit-log==0.2.5 +google-cloud-automl==2.12.0 +google-cloud-batch==0.17.7 +google-cloud-bigquery-datatransfer==3.13.0 +google-cloud-bigquery-storage==2.24.0 +google-cloud-bigquery==3.16.0 +google-cloud-bigtable==2.22.0 +google-cloud-build==3.22.0 +google-cloud-compute==1.15.0 +google-cloud-container==2.37.0 +google-cloud-core==2.4.1 +google-cloud-datacatalog==3.17.2 +google-cloud-dataflow-client==0.8.6 +google-cloud-dataform==0.5.5 +google-cloud-dataplex==1.11.0 +google-cloud-dataproc-metastore==1.14.0 +google-cloud-dataproc==5.8.0 +google-cloud-dlp==3.14.0 +google-cloud-kms==2.20.0 +google-cloud-language==2.12.0 +google-cloud-logging==3.9.0 +google-cloud-memcache==1.8.0 +google-cloud-monitoring==2.18.0 +google-cloud-orchestration-airflow==1.10.0 +google-cloud-os-login==2.13.0 +google-cloud-pubsub==2.19.0 +google-cloud-redis==2.14.0 +google-cloud-resource-manager==1.11.0 +google-cloud-run==0.10.1 +google-cloud-secret-manager==2.17.0 +google-cloud-spanner==3.41.0 +google-cloud-speech==2.23.0 +google-cloud-storage-transfer==1.10.0 +google-cloud-storage==2.14.0 +google-cloud-tasks==2.15.0 +google-cloud-texttospeech==2.15.1 +google-cloud-translate==3.14.0 +google-cloud-videointelligence==2.12.0 +google-cloud-vision==3.5.0 +google-cloud-workflows==1.13.0 +google-crc32c==1.5.0 +google-re2==1.1 +google-resumable-media==2.7.0 +googleapis-common-protos==1.62.0 +graphql-core==3.2.3 +graphviz==0.20.1 +greenlet==3.0.3 +grpc-google-iam-v1==0.13.0 +grpcio-gcp==0.2.2 +grpcio-status==1.60.0 +grpcio==1.60.0 +gssapi==1.8.3 +gunicorn==21.2.0 +h11==0.14.0 +hatch==1.9.1 +hatchling==1.21.0 +hdfs==2.7.3 +hmsclient==0.1.1 +httpcore==0.16.3 +httplib2==0.22.0 +httpx==0.23.3 +humanize==4.9.0 +hvac==2.1.0 +hyperlink==21.0.0 +icdiff==2.0.7 +identify==2.5.33 +idna==3.6 +ijson==3.2.3 +imagesize==1.4.1 +importlib-metadata==6.11.0 +importlib-resources==6.1.1 +impyla==0.19.0 +incremental==22.10.0 +inflection==0.5.1 +influxdb-client==1.39.0 +iniconfig==2.0.0 +ipdb==0.13.13 +ipykernel==6.28.0 +ipython==8.20.0 +isodate==0.6.1 +itsdangerous==2.1.2 +jaraco.classes==3.3.0 +jedi==0.19.1 +jeepney==0.8.0 +jmespath==0.10.0 +jschema-to-python==1.2.3 +json-merge-patch==0.2 +jsondiff==2.0.0 +jsonpatch==1.33 +jsonpath-ng==1.6.1 +jsonpickle==3.0.2 +jsonpointer==2.4 +jsonschema-path==0.3.2 
+jsonschema-specifications==2023.12.1 +jsonschema==4.20.0 +junit-xml==1.9 +jupyter_client==8.6.0 +jupyter_core==5.7.1 +keyring==24.3.0 +kombu==5.3.5 +krb5==0.5.1 +kubernetes-asyncio==24.2.3 +kubernetes==23.6.0 +kylinpy==2.8.4 +lazy-object-proxy==1.10.0 +ldap3==2.9.1 +limits==3.7.0 +linkify-it-py==2.0.2 +lockfile==0.12.2 +loguru==0.7.2 +looker-sdk==23.20.1 +lxml==5.1.0 +lz4==4.3.3 +markdown-it-py==3.0.0 +marshmallow-oneofschema==3.0.1 +marshmallow-sqlalchemy==0.26.1 +marshmallow==3.20.2 +matplotlib-inline==0.1.6 +mdit-py-plugins==0.4.0 +mdurl==0.1.2 +mmhash3==3.0.1 +mongomock==4.1.2 +more-itertools==10.2.0 +moto==4.2.13 +mpmath==1.3.0 +msal-extensions==1.1.0 +msal==1.26.0 +msrest==0.7.1 +msrestazure==0.6.4 +multi_key_dict==2.0.3 +multidict==6.0.4 +mypy-boto3-appflow==1.34.0 +mypy-boto3-rds==1.34.6 +mypy-boto3-redshift-data==1.34.0 +mypy-boto3-s3==1.34.14 +mypy-extensions==1.0.0 +mypy==1.2.0 +mysql-connector-python==8.0.29 +mysqlclient==2.2.1 +nbclient==0.9.0 +nbformat==5.9.2 +neo4j==5.16.0 +nest-asyncio==1.5.9 +networkx==3.2.1 +nh3==0.2.15 +nodeenv==1.8.0 +numpy==1.24.4 +oauthlib==3.2.2 +objsize==0.6.1 +openai==1.7.2 +openapi-schema-validator==0.6.2 +openapi-spec-validator==0.7.1 +openlineage-integration-common==1.7.0 +openlineage-python==1.7.0 +openlineage_sql==1.7.0 +openpyxl==3.1.2 +opensearch-py==2.4.2 +opentelemetry-api==1.22.0 +opentelemetry-exporter-otlp-proto-common==1.22.0 +opentelemetry-exporter-otlp-proto-grpc==1.22.0 +opentelemetry-exporter-otlp-proto-http==1.22.0 +opentelemetry-exporter-otlp==1.22.0 +opentelemetry-exporter-prometheus==0.43b0 +opentelemetry-proto==1.22.0 +opentelemetry-sdk==1.22.0 +opentelemetry-semantic-conventions==0.43b0 +opsgenie-sdk==2.1.5 +oracledb==2.0.1 +ordered-set==4.1.0 +orjson==3.9.10 +oss2==2.18.4 +packaging==23.2 +pandas-gbq==0.20.0 +pandas-stubs==2.0.2.230605 +pandas==2.1.4 +papermill==2.5.0 +paramiko==3.4.0 +parso==0.8.3 +pathable==0.4.3 +pathspec==0.12.1 +pbr==6.0.0 +pdpyras==5.2.0 +pendulum==3.0.0 +pexpect==4.9.0 +pgvector==0.2.4 +pinecone-client==2.2.4 +pinotdb==5.1.2 +pipdeptree==2.13.2 +pipx==1.4.2 +pkginfo==1.9.6 +platformdirs==3.11.0 +pluggy==1.3.0 +ply==3.11 +plyvel==1.5.1 +portalocker==2.8.2 +pprintpp==0.4.0 +pre-commit==3.6.0 +presto-python-client==0.8.4 +prison==0.2.1 +prometheus-client==0.19.0 +prompt-toolkit==3.0.43 +proto-plus==1.23.0 +protobuf==4.25.2 +psutil==5.9.7 +psycopg2-binary==2.9.9 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pure-sasl==0.6.2 +py-partiql-parser==0.5.0 +py4j==0.10.9.7 +pyOpenSSL==23.3.0 +pyarrow-hotfix==0.6 +pyarrow==14.0.2 +pyasn1-modules==0.3.0 +pyasn1==0.5.1 +pycountry==23.12.11 +pycparser==2.21 +pycryptodome==3.20.0 +pydantic==2.5.3 +pydantic_core==2.14.6 +pydata-google-auth==1.8.2 +pydot==1.4.2 +pydruid==0.6.6 +pyenchant==3.2.2 +pyexasol==0.25.2 +pygraphviz==1.12 +pyiceberg==0.5.1 +pyjsparser==2.7.1 +pykerberos==1.2.4 +pymongo==4.6.1 +pymssql==2.2.11 +pyodbc==5.0.1 +pyparsing==3.1.1 +pypsrp==0.8.1 +pyspark==3.5.0 +pyspnego==0.10.2 +pytest-asyncio==0.23.3 +pytest-cov==4.1.0 +pytest-httpx==0.21.3 +pytest-icdiff==0.9 +pytest-instafail==0.5.0 +pytest-mock==3.12.0 +pytest-rerunfailures==13.0 +pytest-timeouts==1.2.1 +pytest-xdist==3.5.0 +pytest==7.4.4 +python-arango==7.9.0 +python-daemon==3.0.1 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +python-http-client==3.3.7 +python-jenkins==1.8.2 +python-jose==3.3.0 +python-ldap==3.4.4 +python-nvd3==0.15.0 +python-slugify==8.0.1 +python-telegram-bot==20.2 +python3-saml==1.16.0 +pytz==2023.3.post1 +pywinrm==0.4.3 +pyzmq==25.1.2 +reactivex==4.0.4 +readme-renderer==42.0 
+redis==4.6.0 +redshift-connector==2.0.918 +referencing==0.32.1 +regex==2023.12.25 +requests-file==1.5.1 +requests-kerberos==0.14.0 +requests-mock==1.11.0 +requests-ntlm==1.2.0 +requests-oauthlib==1.3.1 +requests-toolbelt==1.0.0 +requests==2.31.0 +responses==0.24.1 +restructuredtext_lint==1.4.0 +rfc3339-validator==0.1.4 +rfc3986==1.5.0 +rich-argparse==1.4.0 +rich-click==1.7.3 +rich==13.7.0 +rpds-py==0.17.1 +rsa==4.9 +ruff==0.1.11 +s3fs==2023.12.2 +s3transfer==0.8.2 +sarif-om==1.0.4 +scramp==1.4.4 +scrapbook==0.5.0 +semver==3.0.2 +sendgrid==6.11.0 +sentinels==1.0.0 +sentry-sdk==1.39.2 +setproctitle==1.3.3 +shapely==2.0.2 +shellingham==1.5.4 +simple-salesforce==1.12.5 +six==1.16.0 +slack_sdk==3.26.2 +smbprotocol==1.12.0 +smmap==5.0.1 +sniffio==1.3.0 +snowballstemmer==2.2.0 +snowflake-connector-python==3.6.0 +snowflake-sqlalchemy==1.5.1 +sortedcontainers==2.4.0 +soupsieve==2.5 +sphinx-airflow-theme==0.0.12 +sphinx-argparse==0.4.0 +sphinx-autoapi==2.1.1 +sphinx-copybutton==0.5.2 +sphinx-jinja==2.0.2 +sphinx-rtd-theme==2.0.0 +sphinx_design==0.5.0 +sphinxcontrib-applehelp==1.0.8 +sphinxcontrib-devhelp==1.0.6 +sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-httpdomain==1.8.1 +sphinxcontrib-jquery==4.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.7 +sphinxcontrib-redoc==1.6.0 +sphinxcontrib-serializinghtml==1.1.5 +sphinxcontrib-spelling==8.0.0 +spython==0.3.13 +sqlalchemy-bigquery==1.9.0 +sqlalchemy-redshift==0.8.14 +sqlalchemy-spanner==1.6.2 +sqlalchemy_drill==1.1.4 +sqlparse==0.4.4 +sshpubkeys==3.3.1 +sshtunnel==0.4.0 +stack-data==0.6.3 +starkbank-ecdsa==2.2.0 +statsd==4.0.1 +strictyaml==1.7.3 +sympy==1.12 +tableauserverclient==0.29 +tabulate==0.9.0 +tenacity==8.2.3 +termcolor==2.4.0 +text-unidecode==1.3 +thrift-sasl==0.4.3 +thrift==0.16.0 +time-machine==2.13.0 +tomli_w==1.0.0 +tomlkit==0.12.3 +tornado==6.4 +towncrier==23.11.0 +tqdm==4.66.1 +traitlets==5.14.1 +trino==0.327.0 +trove-classifiers==2024.1.8 +twine==4.0.2 +types-Deprecated==1.2.9.20240106 +types-Markdown==3.5.0.20240106 +types-PyMySQL==1.1.0.1 +types-PyYAML==6.0.12.12 +types-aiofiles==23.2.0.20240106 +types-certifi==2021.10.8.3 +types-croniter==2.0.0.20240106 +types-docutils==0.20.0.20240106 +types-paramiko==3.4.0.20240106 +types-protobuf==4.24.0.20240106 +types-pyOpenSSL==23.3.0.20240106 +types-python-dateutil==2.8.19.20240106 +types-python-slugify==8.0.0.3 +types-pytz==2023.3.1.1 +types-redis==4.6.0.20240106 +types-requests==2.31.0.20240106 +types-setuptools==69.0.0.20240115 +types-tabulate==0.9.0.20240106 +types-termcolor==1.1.6.2 +types-toml==0.10.8.7 +typing_extensions==4.9.0 +tzdata==2023.4 +tzlocal==5.2 +uc-micro-py==1.0.2 +unicodecsv==0.14.1 +universal-pathlib==0.1.4 +uritemplate==4.1.1 +urllib3==2.0.7 +userpath==1.9.1 +validators==0.22.0 +vertica-python==1.3.8 +vine==5.1.0 +virtualenv==20.25.0 +watchtower==3.0.1 +wcwidth==0.2.13 +weaviate-client==3.26.1 +websocket-client==1.7.0 +wrapt==1.16.0 +xmlsec==1.3.13 +xmltodict==0.13.0 +yamllint==1.33.0 +yandexcloud==0.253.0 +yarl==1.9.4 +zeep==4.2.1 +zenpy==2.0.45 +zipp==3.17.0 +zope.event==5.0 +zope.interface==6.1 +zstandard==0.22.0 diff --git a/data/mwaa/requirements/requirements.txt b/data/mwaa/requirements/requirements.txt new file mode 100644 index 00000000..24dcb8a4 --- /dev/null +++ b/data/mwaa/requirements/requirements.txt @@ -0,0 +1,22 @@ +--constraint https://raw.githubusercontent.com/awslabs/aiops-modules/main/data/mwaa/requirements/constraint-requirements.txt + +aiobotocore==2.11.2 +aioitertools==0.11.0 +cloudpickle==2.2.1 +contextlib2==21.6.0 +dill==0.3.8 
+docker==7.0.0
+google-pasta==0.2.0
+multiprocess==0.70.16
+numpy==1.24.4
+pandas==2.1.4
+pathos==0.3.2
+pox==0.3.4
+ppft==1.7.6.8
+s3fs==2023.12.2
+sagemaker==2.214.3
+schema==0.7.5
+scipy==1.13.0
+smdebug-rulesconfig==1.0.1
+tblib==3.0.0
+tqdm==4.66.1
diff --git a/manifests/deployment.yaml b/manifests/deployment.yaml
index 680dc0db..f6929a64 100644
--- a/manifests/deployment.yaml
+++ b/manifests/deployment.yaml
@@ -18,6 +18,10 @@ groups:
     path: manifests/fmops-modules.yaml
   - name: sagemaker-kernels
     path: manifests/kernels-modules.yaml
+  - name: mwaa
+    path: manifests/mwaa-modules.yaml
+  - name: mwaa-dags
+    path: manifests/mwaa-dag-modules.yaml
 targetAccountMappings:
   - alias: primary
     accountId:
@@ -26,4 +30,4 @@ targetAccountMappings:
     default: true
     regionMappings:
       - region: us-east-1
-        default: true
+        default: true
\ No newline at end of file
diff --git a/manifests/mwaa-dag-modules.yaml b/manifests/mwaa-dag-modules.yaml
new file mode 100644
index 00000000..21c56991
--- /dev/null
+++ b/manifests/mwaa-dag-modules.yaml
@@ -0,0 +1,27 @@
+name: dags
+path: modules/examples/airflow-dags/
+parameters:
+  - name: dag-bucket-name
+    valueFrom:
+      moduleMetadata:
+        group: mwaa
+        name: mwaa
+        key: DagBucketName
+  - name: dag-path
+    valueFrom:
+      moduleMetadata:
+        group: mwaa
+        name: mwaa
+        key: DagPath
+  - name: mwaa-exec-role-arn
+    valueFrom:
+      moduleMetadata:
+        group: mwaa
+        name: mwaa
+        key: MwaaExecRoleArn
+  # - name: bucket-policy-arn
+  #   valueFrom:
+  #     moduleMetadata:
+  #       group: optionals
+  #       name: datalake-buckets
+  #       key: FullAccessPolicyArn
\ No newline at end of file
diff --git a/manifests/mwaa-modules.yaml b/manifests/mwaa-modules.yaml
new file mode 100644
index 00000000..96ac8b2b
--- /dev/null
+++ b/manifests/mwaa-modules.yaml
@@ -0,0 +1,21 @@
+name: mwaa
+path: git::https://github.com/awslabs/idf-modules.git//modules/orchestration/mwaa?ref=release/1.4.1&depth=1
+dataFiles:
+  - filePath: data/mwaa/requirements/requirements.txt
+parameters:
+  - name: vpc-id
+    valueFrom:
+      moduleMetadata:
+        group: networking
+        name: networking
+        key: VpcId
+  - name: private-subnet-ids
+    valueFrom:
+      moduleMetadata:
+        group: networking
+        name: networking
+        key: PrivateSubnetIds
+  - name: airflow-version
+    value: "2.8.1"
+  - name: custom-requirements-path
+    value: data/mwaa/requirements/requirements.txt
\ No newline at end of file
diff --git a/modules/examples/airflow-dags/README.md b/modules/examples/airflow-dags/README.md
new file mode 100644
index 00000000..c70e65bf
--- /dev/null
+++ b/modules/examples/airflow-dags/README.md
@@ -0,0 +1,42 @@
+# Example DAG Module
+
+## Description
+
+This module demonstrates:
+
+- creating a CDK Stack with a Role dedicated to the DAGs
+  - within the Stack, grant the MWAA Execution Role permission to assume the created DAG Execution Role
+- creating DAGs on a shared MWAA Environment by utilizing Input Parameters
+  - within the DAG, demonstrate assuming the DAG Execution Role with service and data permissions specific to the DAG
+- exporting Metadata by setting the `MLOPS_MODULE_METADATA` env var on completion
+
+## Inputs/Outputs
+
+### Input Parameters
+
+These parameters are typically wired from the shared `mwaa` module's metadata; see the sample manifest declaration below.
+
+#### Required
+
+- `dag-bucket-name`: name of the Bucket configured in the shared MWAA Environment to store DAG artifacts
+- `dag-path`: name of the path in the Bucket configured in the shared MWAA Environment to store DAG artifacts
+- `mwaa-exec-role-arn`: ARN of the MWAA Execution Role
+
+#### Optional
+
+- `bucket-policy-arn`: ARN of an IAM Managed Policy to attach to the DAG Execution Role granting access to S3 Data Buckets
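+
+### Sample manifest declaration
+
+A minimal sketch of a manifest entry for this module, based on `manifests/mwaa-dag-modules.yaml` in this repository (the `group` and `name` values under `moduleMetadata` depend on how the `mwaa` module is declared in your deployment):
+
+```yaml
+name: dags
+path: modules/examples/airflow-dags/
+parameters:
+  - name: dag-bucket-name
+    valueFrom:
+      moduleMetadata:
+        group: mwaa
+        name: mwaa
+        key: DagBucketName
+  - name: dag-path
+    valueFrom:
+      moduleMetadata:
+        group: mwaa
+        name: mwaa
+        key: DagPath
+  - name: mwaa-exec-role-arn
+    valueFrom:
+      moduleMetadata:
+        group: mwaa
+        name: mwaa
+        key: MwaaExecRoleArn
+```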
+
+### Module Metadata Outputs
+
+- `DagRoleArn`: ARN of the DAG Execution Role created by the Stack
+- `MlOpsBucket`: Name of the Bucket used by the DAG
+- `SageMakerExecutionRole`: ARN of the SageMaker Execution Role created by the Stack
+
+#### Output Example
+
+```json
+{
+  "DagRoleArn": "arn::::",
+  "MlOpsBucket": "",
+  "SageMakerExecutionRole": "arn::::"
+}
+```
diff --git a/modules/examples/airflow-dags/app.py b/modules/examples/airflow-dags/app.py
new file mode 100644
index 00000000..adcdcd57
--- /dev/null
+++ b/modules/examples/airflow-dags/app.py
@@ -0,0 +1,48 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import aws_cdk
+from aws_cdk import App, CfnOutput
+from stack import DagResources
+
+project_name = os.getenv("SEEDFARMER_PROJECT_NAME", "")
+deployment_name = os.getenv("SEEDFARMER_DEPLOYMENT_NAME", "")
+module_name = os.getenv("SEEDFARMER_MODULE_NAME", "")
+app_prefix = f"{project_name}-{deployment_name}-{module_name}"
+
+mwaa_exec_role = os.getenv("SEEDFARMER_PARAMETER_MWAA_EXEC_ROLE_ARN", "")
+bucket_policy_arn = os.getenv("SEEDFARMER_PARAMETER_BUCKET_POLICY_ARN")
+permission_boundary_arn = os.getenv("SEEDFARMER_PERMISSION_BOUNDARY_ARN")
+
+app = App()
+
+stack = DagResources(
+    scope=app,
+    id=app_prefix,
+    project_name=project_name,
+    deployment_name=deployment_name,
+    module_name=module_name,
+    mwaa_exec_role=mwaa_exec_role,
+    bucket_policy_arn=bucket_policy_arn,
+    permission_boundary_arn=permission_boundary_arn,
+    env=aws_cdk.Environment(
+        account=os.environ["CDK_DEFAULT_ACCOUNT"],
+        region=os.environ["CDK_DEFAULT_REGION"],
+    ),
+)
+
+CfnOutput(
+    scope=stack,
+    id="metadata",
+    value=stack.to_json_string(
+        {
+            "DagRoleArn": stack.dag_role.role_arn,
+            "MlOpsBucket": stack.mlops_assets_bucket.bucket_name,
+            "SageMakerExecutionRole": stack.sagemaker_execution_role.role_arn,
+        }
+    ),
+)
+
+app.synth(force=True)
diff --git a/modules/examples/airflow-dags/coverage.ini b/modules/examples/airflow-dags/coverage.ini
new file mode 100644
index 00000000..c3878739
--- /dev/null
+++ b/modules/examples/airflow-dags/coverage.ini
@@ -0,0 +1,3 @@
+[run]
+omit =
+    tests/*
\ No newline at end of file
diff --git a/modules/examples/airflow-dags/dags/__init__.py b/modules/examples/airflow-dags/dags/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modules/examples/airflow-dags/dags/config.py b/modules/examples/airflow-dags/dags/config.py
new file mode 100644
index 00000000..d579ff61
--- /dev/null
+++ b/modules/examples/airflow-dags/dags/config.py
@@ -0,0 +1,3 @@
+MLOPS_DATA_BUCKET = "MLOPS_S3_BUCKET"
+SAGEMAKER_EXECUTION_ROLE = "SAGEMAKER_IAM_ROLE"
+DAG_EXECUTION_ROLE = "DAG_IAM_ROLE"
diff --git a/modules/examples/airflow-dags/dags/evaluation.py b/modules/examples/airflow-dags/dags/evaluation.py
new file mode 100644
index 00000000..5cee0f63
--- /dev/null
+++ b/modules/examples/airflow-dags/dags/evaluation.py
@@ -0,0 +1,39 @@
+import json
+import os
+import tarfile
+
+import pandas as pd
+
+from sklearn.externals import joblib
+from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
+
+if __name__ == "__main__":
+    model_path = os.path.join("/opt/ml/processing/model", "model.tar.gz")
+    print("Extracting model from path: {}".format(model_path))
+    with tarfile.open(model_path) as tar:
+        tar.extractall(path=".")
+    print("Loading model")
+    model = joblib.load("model.joblib")
+
+    print("Loading test input data")
+    test_features_data = os.path.join("/opt/ml/processing/test", "test_features.csv")
os.path.join("/opt/ml/processing/test", "test_features.csv") + test_labels_data = os.path.join("/opt/ml/processing/test", "test_labels.csv") + + X_test = pd.read_csv(test_features_data, header=None) + y_test = pd.read_csv(test_labels_data, header=None) + predictions = model.predict(X_test) + + print("Creating classification evaluation report") + report_dict = classification_report(y_test, predictions, output_dict=True) + report_dict["accuracy"] = accuracy_score(y_test, predictions) + report_dict["roc_auc"] = roc_auc_score(y_test, predictions) + + print("Classification report:\n{}".format(report_dict)) + + evaluation_output_path = os.path.join( + "/opt/ml/processing/evaluation", "evaluation.json" + ) + print("Saving classification report to {}".format(evaluation_output_path)) + + with open(evaluation_output_path, "w") as f: + f.write(json.dumps(report_dict)) diff --git a/modules/examples/airflow-dags/dags/mlops_dag.py b/modules/examples/airflow-dags/dags/mlops_dag.py new file mode 100644 index 00000000..a7867ae7 --- /dev/null +++ b/modules/examples/airflow-dags/dags/mlops_dag.py @@ -0,0 +1,170 @@ +""" +Airflow dag with Sagemaker Processing and Training Job +""" + +import os +import sys + +sys.path.append(os.path.dirname(__file__)) + +from datetime import datetime + +import boto3 +from airflow import DAG +from airflow.operators.python import PythonOperator +from config import MLOPS_DATA_BUCKET, SAGEMAKER_EXECUTION_ROLE, DAG_EXECUTION_ROLE +from sagemaker.processing import ProcessingInput, ProcessingOutput +from sagemaker.inputs import TrainingInput +from sagemaker.session import Session +from sagemaker.sklearn.estimator import SKLearn +from sagemaker.sklearn.processing import SKLearnProcessor + +default_args = { + "owner": "airflow", + "depends_on_past": False, + "start_date": datetime(2020, 1, 1), +} +dag = DAG("SciKitLearn_MLOps", default_args=default_args, schedule_interval=None) + +pre_processing_input = ( + f"s3://sagemaker-sample-data-{os.environ['AWS_REGION']}/processing/census" +) +test_data_s3_path = f"s3://{MLOPS_DATA_BUCKET}/processing/test" +train_data_s3_path = f"s3://{MLOPS_DATA_BUCKET}/processing/train" +model_path = f"s3://{MLOPS_DATA_BUCKET}/train/models/" +eval_output_s3_path = f"s3://{MLOPS_DATA_BUCKET}/eval/output" + + +def get_assume_role_session(role_arn): # type: ignore[no-untyped-def] + sts = boto3.client("sts") + response = sts.assume_role(RoleArn=role_arn, RoleSessionName="AssumeRoleSession1") + credentials = response["Credentials"] + return boto3.Session( + aws_access_key_id=credentials["AccessKeyId"], + aws_secret_access_key=credentials["SecretAccessKey"], + aws_session_token=credentials["SessionToken"], + ) + + +def pre_processing(): # type: ignore[no-untyped-def] + boto_session = get_assume_role_session(DAG_EXECUTION_ROLE) # type: ignore[no-untyped-call] + sess = Session( + boto_session=boto_session, + default_bucket=MLOPS_DATA_BUCKET, + ) + + sklearn_processor = SKLearnProcessor( + framework_version="0.20.0", + role=SAGEMAKER_EXECUTION_ROLE, + instance_type="ml.m5.xlarge", + instance_count=1, + sagemaker_session=sess, + ) + + processing_input = ProcessingInput( + source=pre_processing_input, + destination="/opt/ml/processing/input", + input_name="input", + ) + processing_train_output = ProcessingOutput( + source="/opt/ml/processing/train", + destination=train_data_s3_path, + output_name="train", + ) + processing_test_output = ProcessingOutput( + source="/opt/ml/processing/test", + destination=test_data_s3_path, + output_name="test", + ) + + return 
sklearn_processor.run( + code=os.path.join(os.path.dirname(__file__), "preprocessing.py"), + inputs=[processing_input], + outputs=[ + processing_train_output, + processing_test_output, + ], + arguments=["--train-test-split-ratio", "0.2"], + wait=True, + ) + + +def training(): # type: ignore[no-untyped-def] + boto_session = get_assume_role_session(DAG_EXECUTION_ROLE) # type: ignore[no-untyped-call] + sess = Session( + boto_session=boto_session, + default_bucket=MLOPS_DATA_BUCKET, + ) + sklearn = SKLearn( + entry_point=os.path.join(os.path.dirname(__file__), "train.py"), + framework_version="0.20.0", + instance_type="ml.m5.xlarge", + role=SAGEMAKER_EXECUTION_ROLE, + output_path=model_path, + sagemaker_session=sess, + ) + training_input = TrainingInput( + s3_data=train_data_s3_path, + ) + sklearn.fit(inputs=training_input, wait=True) + training_job_description = sklearn.jobs[-1].describe() + model_data_s3_uri = "{}{}/{}".format( + training_job_description["OutputDataConfig"]["S3OutputPath"], + training_job_description["TrainingJobName"], + "output/model.tar.gz", + ) + return model_data_s3_uri + + +def evaluation(model_path): # type: ignore[no-untyped-def] + boto_session = get_assume_role_session(DAG_EXECUTION_ROLE) # type: ignore[no-untyped-call] + sess = Session( + boto_session=boto_session, + default_bucket=MLOPS_DATA_BUCKET, + ) + + sklearn_processor = SKLearnProcessor( + framework_version="0.20.0", + role=SAGEMAKER_EXECUTION_ROLE, + instance_type="ml.m5.xlarge", + instance_count=1, + sagemaker_session=sess, + ) + sklearn_processor.run( + code=os.path.join(os.path.dirname(__file__), "evaluation.py"), + inputs=[ + ProcessingInput(source=model_path, destination="/opt/ml/processing/model"), + ProcessingInput( + source=test_data_s3_path, destination="/opt/ml/processing/test" + ), + ], + outputs=[ + ProcessingOutput( + output_name="evaluation", + destination=eval_output_s3_path, + source="/opt/ml/processing/evaluation", + ) + ], + ) + + +pre_processing_step = PythonOperator( + task_id="pre_processing", + python_callable=pre_processing, + dag=dag, +) + +training_step = PythonOperator( + task_id="training", + python_callable=training, + dag=dag, +) + +evaluation_step = PythonOperator( + task_id="evaluation", + python_callable=evaluation, + dag=dag, + op_args=["{{ task_instance.xcom_pull('training') }}"], +) + +pre_processing_step >> training_step >> evaluation_step diff --git a/modules/examples/airflow-dags/dags/preprocessing.py b/modules/examples/airflow-dags/dags/preprocessing.py new file mode 100644 index 00000000..292eb844 --- /dev/null +++ b/modules/examples/airflow-dags/dags/preprocessing.py @@ -0,0 +1,119 @@ +import argparse +import os +import warnings + +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import ( + StandardScaler, + OneHotEncoder, + KBinsDiscretizer, +) +from sklearn.compose import make_column_transformer + +from sklearn.exceptions import DataConversionWarning + +warnings.filterwarnings(action="ignore", category=DataConversionWarning) + + +columns = [ + "age", + "education", + "major industry code", + "class of worker", + "num persons worked for employer", + "capital gains", + "capital losses", + "dividends from stocks", + "income", +] +class_labels = [" - 50000.", " 50000+."] + + +def print_shape(df): # type: ignore[no-untyped-def] + negative_examples, positive_examples = np.bincount(df["income"]) + print( + "Data shape: {}, {} positive examples, {} negative examples".format( + df.shape, 
positive_examples, negative_examples + ) + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--train-test-split-ratio", type=float, default=0.3) + args, _ = parser.parse_known_args() + + print("Received arguments {}".format(args)) + + input_data_path = os.path.join("/opt/ml/processing/input", "census-income.csv") + + print("Reading input data from {}".format(input_data_path)) + df = pd.read_csv(input_data_path) + df = pd.DataFrame(data=df, columns=columns) + df.dropna(inplace=True) + df.drop_duplicates(inplace=True) + df.replace(class_labels, [0, 1], inplace=True) + + negative_examples, positive_examples = np.bincount(df["income"]) + print( + "Data after cleaning: {}, {} positive examples, {} negative examples".format( + df.shape, positive_examples, negative_examples + ) + ) + + split_ratio = args.train_test_split_ratio + print("Splitting data into train and test sets with ratio {}".format(split_ratio)) + X_train, X_test, y_train, y_test = train_test_split( + df.drop("income", axis=1), df["income"], test_size=split_ratio, random_state=0 + ) + + preprocess = make_column_transformer( + ( + ["age", "num persons worked for employer"], + KBinsDiscretizer(encode="onehot-dense", n_bins=10), + ), + ( + ["capital gains", "capital losses", "dividends from stocks"], + StandardScaler(), + ), + ( + ["education", "major industry code", "class of worker"], + OneHotEncoder(sparse=False), + ), + ) + print("Running preprocessing and feature engineering transformations") + train_features = preprocess.fit_transform(X_train) + test_features = preprocess.transform(X_test) + + print("Train data shape after preprocessing: {}".format(train_features.shape)) + print("Test data shape after preprocessing: {}".format(test_features.shape)) + + train_features_output_path = os.path.join( + "/opt/ml/processing/train", "train_features.csv" + ) + train_labels_output_path = os.path.join( + "/opt/ml/processing/train", "train_labels.csv" + ) + + test_features_output_path = os.path.join( + "/opt/ml/processing/test", "test_features.csv" + ) + test_labels_output_path = os.path.join("/opt/ml/processing/test", "test_labels.csv") + + print("Saving training features to {}".format(train_features_output_path)) + pd.DataFrame(train_features).to_csv( + train_features_output_path, header=False, index=False + ) + + print("Saving test features to {}".format(test_features_output_path)) + pd.DataFrame(test_features).to_csv( + test_features_output_path, header=False, index=False + ) + + print("Saving training labels to {}".format(train_labels_output_path)) + y_train.to_csv(train_labels_output_path, header=False, index=False) + + print("Saving test labels to {}".format(test_labels_output_path)) + y_test.to_csv(test_labels_output_path, header=False, index=False) diff --git a/modules/examples/airflow-dags/dags/train.py b/modules/examples/airflow-dags/dags/train.py new file mode 100644 index 00000000..2870a9c1 --- /dev/null +++ b/modules/examples/airflow-dags/dags/train.py @@ -0,0 +1,21 @@ +import os + +import pandas as pd +from sklearn.linear_model import LogisticRegression +from sklearn.externals import joblib + +if __name__ == "__main__": + print("Starting training") + training_data_directory = "/opt/ml/input/data/training" + train_features_data = os.path.join(training_data_directory, "train_features.csv") + train_labels_data = os.path.join(training_data_directory, "train_labels.csv") + print("Reading input data") + X_train = pd.read_csv(train_features_data, header=None) + y_train = 
pd.read_csv(train_labels_data, header=None)
+
+    model = LogisticRegression(class_weight="balanced", solver="lbfgs")
+    print("Training LR model")
+    model.fit(X_train, y_train)
+    model_output_directory = os.path.join("/opt/ml/model", "model.joblib")
+    print("Saving model to {}".format(model_output_directory))
+    joblib.dump(model, model_output_directory)
diff --git a/modules/examples/airflow-dags/deployspec.yaml b/modules/examples/airflow-dags/deployspec.yaml
new file mode 100644
index 00000000..a9bac2f3
--- /dev/null
+++ b/modules/examples/airflow-dags/deployspec.yaml
@@ -0,0 +1,45 @@
+publishGenericEnvVariables: true
+deploy:
+  phases:
+    install:
+      commands:
+        - npm install -g aws-cdk@2.137.0
+        - pip install -r requirements.txt
+    build:
+      commands:
+        - >
+          echo "SEEDFARMER_MODULE_METADATA: ${SEEDFARMER_MODULE_METADATA}"
+        - >
+          echo "SEEDFARMER_PARAMETER_DAG_BUCKET_NAME: ${SEEDFARMER_PARAMETER_DAG_BUCKET_NAME}"
+        - >
+          echo "SEEDFARMER_PARAMETER_DAG_PATH: ${SEEDFARMER_PARAMETER_DAG_PATH}"
+        - >
+          echo "SEEDFARMER_PARAMETER_MWAA_EXEC_ROLE_ARN: ${SEEDFARMER_PARAMETER_MWAA_EXEC_ROLE_ARN}"
+        - cdk deploy --require-approval never --progress events --app "python app.py" --outputs-file ./cdk-exports.json
+        - export SEEDFARMER_MODULE_METADATA=$(python -c "import json; file=open('cdk-exports.json'); print(json.load(file)['${SEEDFARMER_PROJECT_NAME}-${SEEDFARMER_DEPLOYMENT_NAME}-${SEEDFARMER_MODULE_NAME}']['metadata'])")
+        - export MLOPS_BUCKET=$(echo ${SEEDFARMER_MODULE_METADATA} | jq -r ".MlOpsBucket")
+        - export DAG_IAM_ROLE=$(echo ${SEEDFARMER_MODULE_METADATA} | jq -r ".DagRoleArn")
+        - export SAGEMAKER_IAM_ROLE=$(echo ${SEEDFARMER_MODULE_METADATA} | jq -r ".SageMakerExecutionRole")
+        - >
+          echo "MLOPS_BUCKET: ${MLOPS_BUCKET}"
+        - >
+          echo "DAG_IAM_ROLE: ${DAG_IAM_ROLE}"
+        - >
+          echo "SAGEMAKER_IAM_ROLE: ${SAGEMAKER_IAM_ROLE}"
+        - sed -i "s/MLOPS_S3_BUCKET/${MLOPS_BUCKET}/g" dags/config.py
+        - sed -i "s/AWS_REGION_NAME/${AWS_DEFAULT_REGION}/g" dags/config.py
+        - sed -i "s~DAG_IAM_ROLE~${DAG_IAM_ROLE}~g" dags/config.py
+        - sed -i "s~SAGEMAKER_IAM_ROLE~${SAGEMAKER_IAM_ROLE}~g" dags/config.py
+        - aws s3 cp --recursive dags/ s3://$SEEDFARMER_PARAMETER_DAG_BUCKET_NAME/$SEEDFARMER_PARAMETER_DAG_PATH/mlops/
+destroy:
+  phases:
+    install:
+      commands:
+        - npm install -g aws-cdk@2.137.0
+        - pip install -r requirements.txt
+    build:
+      commands:
+        - export MLOPS_BUCKET=$(echo ${SEEDFARMER_MODULE_METADATA} | jq -r ".MlOpsBucket")
+        - aws s3 rm --recursive s3://$SEEDFARMER_PARAMETER_DAG_BUCKET_NAME/$SEEDFARMER_PARAMETER_DAG_PATH/mlops/
+        - aws s3 rm --recursive s3://$MLOPS_BUCKET/
+        - cdk destroy --force --app "python app.py"
\ No newline at end of file
diff --git a/modules/examples/airflow-dags/modulestack.yaml b/modules/examples/airflow-dags/modulestack.yaml
new file mode 100644
index 00000000..90e677fd
--- /dev/null
+++ b/modules/examples/airflow-dags/modulestack.yaml
@@ -0,0 +1,39 @@
+AWSTemplateFormatVersion: 2010-09-09
+Description: This stack deploys Module-specific IAM permissions
+
+Parameters:
+  # DeploymentName:
+  #   Type: String
+  #   Description: The name of the deployment
+  # ModuleName:
+  #   Type: String
+  #   Description: The name of the Module
+  RoleName:
+    Type: String
+    Description: The name of the IAM Role
+  DagBucketName:
+    Type: String
+    Description: The name of the Bucket where MWAA DAG artifacts are deployed
+  DagPath:
+    Type: String
+    Description: The path in the Dag Bucket where MWAA DAG artifacts are deployed
+
+Resources:
+  Policy:
+    Type: 'AWS::IAM::Policy'
+    Properties:
+      PolicyDocument:
Statement: + - Action: + - 's3:Create*' + - 's3:Put*' + - 's3:Delete*' + - 's3:Get*' + - 's3:List*' + Effect: Allow + Resource: + - 'arn:aws:s3:::cdk*' + - !Sub "arn:aws:s3:::${DagBucketName}/${DagPath}/*" + Version: 2012-10-17 + PolicyName: "mlops-modulespecific-policy" + Roles: [!Ref RoleName] \ No newline at end of file diff --git a/modules/examples/airflow-dags/pyproject.toml b/modules/examples/airflow-dags/pyproject.toml new file mode 100644 index 00000000..d21eb12f --- /dev/null +++ b/modules/examples/airflow-dags/pyproject.toml @@ -0,0 +1,36 @@ +[tool.black] +line-length = 120 +target-version = ["py36", "py37", "py38"] +exclude = ''' +/( + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | \.env + | _build + | buck-out + | build + | dist + | codeseeder.out +)/ +''' + +[tool.isort] +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true +line_length = 120 +src_paths = ["example_dags"] +py_version = 36 +skip_gitignore = false + +[tool.pytest.ini_options] +addopts = "-v --cov=. --cov-report term --cov-config=coverage.ini" +pythonpath = [ + "." +] \ No newline at end of file diff --git a/modules/examples/airflow-dags/requirements.in b/modules/examples/airflow-dags/requirements.in new file mode 100644 index 00000000..9ec44d58 --- /dev/null +++ b/modules/examples/airflow-dags/requirements.in @@ -0,0 +1,4 @@ +aws-cdk-lib==2.137.0 +cdk-nag==2.28.89 +boto3~=1.34.84 +attrs==23.2.0 \ No newline at end of file diff --git a/modules/examples/airflow-dags/requirements.txt b/modules/examples/airflow-dags/requirements.txt new file mode 100644 index 00000000..65e4c2a7 --- /dev/null +++ b/modules/examples/airflow-dags/requirements.txt @@ -0,0 +1,82 @@ +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile --output-file=requirements.txt requirements.in +# +attrs==23.2.0 + # via + # -r requirements.in + # cattrs + # jsii +aws-cdk-asset-awscli-v1==2.2.202 + # via aws-cdk-lib +aws-cdk-asset-kubectl-v20==2.1.2 + # via aws-cdk-lib +aws-cdk-asset-node-proxy-agent-v6==2.0.3 + # via aws-cdk-lib +aws-cdk-lib==2.137.0 + # via + # -r requirements.in + # cdk-nag +boto3==1.34.84 + # via -r requirements.in +botocore==1.34.84 + # via + # boto3 + # s3transfer +cattrs==22.1.0 + # via jsii +cdk-nag==2.28.89 + # via -r requirements.in +constructs==10.1.37 + # via + # aws-cdk-lib + # cdk-nag +exceptiongroup==1.2.0 + # via cattrs +importlib-resources==6.4.0 + # via jsii +jmespath==1.0.0 + # via + # boto3 + # botocore +jsii==1.97.0 + # via + # aws-cdk-asset-awscli-v1 + # aws-cdk-asset-kubectl-v20 + # aws-cdk-asset-node-proxy-agent-v6 + # aws-cdk-lib + # cdk-nag + # constructs +publication==0.0.3 + # via + # aws-cdk-asset-awscli-v1 + # aws-cdk-asset-kubectl-v20 + # aws-cdk-asset-node-proxy-agent-v6 + # aws-cdk-lib + # cdk-nag + # constructs + # jsii +python-dateutil==2.8.2 + # via + # botocore + # jsii +s3transfer==0.10.1 + # via boto3 +six==1.16.0 + # via python-dateutil +typeguard==2.13.3 + # via + # aws-cdk-asset-awscli-v1 + # aws-cdk-asset-kubectl-v20 + # aws-cdk-asset-node-proxy-agent-v6 + # aws-cdk-lib + # cdk-nag + # jsii +typing-extensions==4.7.1 + # via jsii +urllib3==1.26.18 + # via botocore +zipp==3.18.1 + # via importlib-resources diff --git a/modules/examples/airflow-dags/setup.cfg b/modules/examples/airflow-dags/setup.cfg new file mode 100644 index 00000000..6f1278ad --- /dev/null +++ b/modules/examples/airflow-dags/setup.cfg @@ -0,0 +1,27 @@ +[metadata] 
+license_files =
+    LICENSE
+    NOTICE
+    VERSION
+
+[flake8]
+max-line-length = 120
+extend-ignore = E203, W503
+exclude =
+    .git,
+    __pycache__,
+    docs/source/conf.py,
+    old,
+    build,
+    dist,
+    .venv,
+    codeseeder.out,
+    bundle
+
+[mypy]
+python_version = 3.7
+strict = True
+ignore_missing_imports = True
+allow_untyped_decorators = True
+exclude =
+    codeseeder.out/|example_dags/
diff --git a/modules/examples/airflow-dags/stack.py b/modules/examples/airflow-dags/stack.py
new file mode 100755
index 00000000..13e99dfa
--- /dev/null
+++ b/modules/examples/airflow-dags/stack.py
@@ -0,0 +1,174 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import Any, Optional, cast
+
+import aws_cdk.aws_iam as aws_iam
+import cdk_nag
+import aws_cdk.aws_s3 as aws_s3
+from aws_cdk import Aspects, Stack, Tags, RemovalPolicy, Aws
+from cdk_nag import NagSuppressions, NagPackSuppression
+from constructs import Construct, IConstruct
+
+_logger: logging.Logger = logging.getLogger(__name__)
+
+
+class DagResources(Stack):
+    def __init__(
+        self,
+        scope: Construct,
+        id: str,
+        *,
+        project_name: str,
+        deployment_name: str,
+        module_name: str,
+        mwaa_exec_role: str,
+        bucket_policy_arn: Optional[str] = None,
+        permission_boundary_arn: Optional[str] = None,
+        **kwargs: Any,
+    ) -> None:
+        # MLOPS Env vars
+        self.deployment_name = deployment_name
+        self.module_name = module_name
+        self.mwaa_exec_role = mwaa_exec_role
+
+        super().__init__(
+            scope,
+            id,
+            description="This stack deploys Example DAG resources for MLOps",
+            **kwargs,
+        )
+        Tags.of(scope=cast(IConstruct, self)).add(
+            key="Deployment", value=f"mlops-{deployment_name}"
+        )
+        dep_mod = f"{project_name}-{deployment_name}-{module_name}"
+        account: str = Aws.ACCOUNT_ID
+        region: str = Aws.REGION
+
+        mlops_assets_bucket = aws_s3.Bucket(
+            self,
+            id="mlops-assets-bucket",
+            versioned=False,
+            bucket_name=f"{dep_mod}-{account}-{region}",
+            removal_policy=RemovalPolicy.DESTROY,
+            encryption=aws_s3.BucketEncryption.KMS_MANAGED,
+            block_public_access=aws_s3.BlockPublicAccess.BLOCK_ALL,
+            enforce_ssl=True,
+        )
+
+        self.mlops_assets_bucket = mlops_assets_bucket
+        # Create Dag IAM Role and policy
+        dag_statement = aws_iam.PolicyDocument(
+            statements=[
+                aws_iam.PolicyStatement(
+                    actions=["s3:List*", "s3:Get*", "s3:Put*"],
+                    effect=aws_iam.Effect.ALLOW,
+                    resources=[
+                        mlops_assets_bucket.bucket_arn,
+                        f"{mlops_assets_bucket.bucket_arn}/*",
+                    ],
+                )
+            ]
+        )
+
+        managed_policies = (
+            [
+                aws_iam.ManagedPolicy.from_managed_policy_arn(
+                    self, "bucket-policy", bucket_policy_arn
+                )
+            ]
+            if bucket_policy_arn
+            else []
+        )
+
+        # Role with Permission Boundary
+        r_name = f"mlops-{self.deployment_name}-{self.module_name}-dag-role"
+        dag_role = aws_iam.Role(
+            self,
+            f"dag-role-{self.deployment_name}-{self.module_name}",
+            assumed_by=aws_iam.ArnPrincipal(self.mwaa_exec_role),
+            inline_policies={"DagPolicyDocument": dag_statement},
+            managed_policies=managed_policies,
+            permissions_boundary=(
+                aws_iam.ManagedPolicy.from_managed_policy_arn(
+                    self,
+                    f"perm-boundary-{self.deployment_name}-{self.module_name}",
+                    permission_boundary_arn,
+                )
+                if permission_boundary_arn
+                else None
+            ),
+            role_name=r_name,
+            path="/",
+        )
+
+        dag_role.add_managed_policy(
+            aws_iam.ManagedPolicy.from_aws_managed_policy_name(
+                "AmazonSageMakerFullAccess"
+            )
+        )
+        dag_role.add_managed_policy(
+            aws_iam.ManagedPolicy.from_aws_managed_policy_name(
+                "CloudWatchLogsFullAccess"
+            )
+        )
+
+        # Define the SageMaker execution IAM role
+        sagemaker_execution_role = aws_iam.Role(
+            self,
+            "SageMakerExecutionRole",
+            assumed_by=aws_iam.ServicePrincipal("sagemaker.amazonaws.com"),
+            managed_policies=[
+                aws_iam.ManagedPolicy.from_aws_managed_policy_name(
+                    "AmazonSageMakerFullAccess"
+                )
+            ],
+            path="/",
+            role_name=f"SageMakerExecutionRole-{self.stack_name}",
+        )
+
+        # Add policy to allow access to S3 bucket
+        sagemaker_execution_role.add_to_policy(
+            aws_iam.PolicyStatement(
+                actions=["s3:*"],
+                resources=[
+                    mlops_assets_bucket.bucket_arn,
+                    f"{mlops_assets_bucket.bucket_arn}/*",
+                ],
+            )
+        )
+
+        dag_role.add_to_policy(
+            aws_iam.PolicyStatement(
+                actions=["iam:PassRole"], resources=[sagemaker_execution_role.role_arn]
+            )
+        )
+
+        self.dag_role = dag_role
+        self.sagemaker_execution_role = sagemaker_execution_role
+
+        Aspects.of(self).add(cdk_nag.AwsSolutionsChecks())
+
+        NagSuppressions.add_resource_suppressions(
+            self,
+            apply_to_children=True,
+            suppressions=[
+                NagPackSuppression(
+                    id="AwsSolutions-S1",
+                    reason="Logs are disabled for demo purposes",
+                ),
+                NagPackSuppression(
+                    id="AwsSolutions-S5",
+                    reason="No OAI needed - no one is accessing this data without explicit permissions",
+                ),
+                NagPackSuppression(
+                    id="AwsSolutions-IAM5",
+                    reason="Resource access restricted to MLOPS resources.",
+                ),
+                NagPackSuppression(
+                    id="AwsSolutions-IAM4",
+                    reason="Managed Policies are for service account roles only",
+                ),
+            ],
+        )
diff --git a/modules/examples/airflow-dags/tests/__init__.py b/modules/examples/airflow-dags/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modules/examples/airflow-dags/tests/test_app.py b/modules/examples/airflow-dags/tests/test_app.py
new file mode 100644
index 00000000..5ea88eaa
--- /dev/null
+++ b/modules/examples/airflow-dags/tests/test_app.py
@@ -0,0 +1,25 @@
+import os
+import sys
+
+import pytest
+
+
+@pytest.fixture(scope="function")
+def stack_defaults() -> None:
+    os.environ["SEEDFARMER_PROJECT_NAME"] = "test-project"
+    os.environ["SEEDFARMER_DEPLOYMENT_NAME"] = "test-deployment"
+    os.environ["SEEDFARMER_MODULE_NAME"] = "test-module"
+    os.environ["CDK_DEFAULT_ACCOUNT"] = "111111111111"
+    os.environ["CDK_DEFAULT_REGION"] = "us-east-1"
+
+    os.environ["SEEDFARMER_PARAMETER_MWAA_EXEC_ROLE_ARN"] = "vpc-12345"
+    os.environ["SEEDFARMER_PARAMETER_BUCKET_POLICY_ARN"] = "12345"
+    os.environ["SEEDFARMER_PERMISSION_BOUNDARY_ARN"] = "sagemaker-project"
+
+    # Unload the app import so that subsequent tests don't reuse
+    if "app" in sys.modules:
+        del sys.modules["app"]
+
+
+def test_app(stack_defaults):  # type: ignore[no-untyped-def]
+    import app  # noqa: F401
diff --git a/modules/examples/airflow-dags/tests/test_stack.py b/modules/examples/airflow-dags/tests/test_stack.py
new file mode 100644
index 00000000..9e83b9bf
--- /dev/null
+++ b/modules/examples/airflow-dags/tests/test_stack.py
@@ -0,0 +1,68 @@
+import os
+import sys
+
+import aws_cdk as cdk
+import cdk_nag
+import pytest
+from aws_cdk.assertions import Annotations, Match, Template
+
+
+@pytest.fixture(scope="function")
+def stack_defaults() -> None:
+    os.environ["CDK_DEFAULT_ACCOUNT"] = "111111111111"
+    os.environ["CDK_DEFAULT_REGION"] = "us-east-1"
+
+    # Unload the app import so that subsequent tests don't reuse
+    if "stack" in sys.modules:
+        del sys.modules["stack"]
+
+
+@pytest.fixture(scope="function")
+def stack_model_package_input() -> cdk.Stack:
+    import stack
+
+    app = cdk.App()
+
+    project_name = "test-project"
+    deployment_name = "test-deployment"
+    module_name = "test-module"
+
+    app_prefix
= f"{project_name}-{deployment_name}-{module_name}" + mwaa_exec_role = "arn:aws:iam::123456789012:role/mwaarole" + bucket_policy_arn = "arn:aws:iam::123456789012:policy/bucketPolicy" + permission_boundary_arn = "arn:aws:iam::123456789012:policy/boundary" + + return stack.DagResources( + scope=app, + id=app_prefix, + project_name=project_name, + deployment_name=deployment_name, + module_name=module_name, + mwaa_exec_role=mwaa_exec_role, + bucket_policy_arn=bucket_policy_arn, + permission_boundary_arn=permission_boundary_arn, + env=cdk.Environment( + account=os.environ["CDK_DEFAULT_ACCOUNT"], + region=os.environ["CDK_DEFAULT_REGION"], + ), + ) + + +@pytest.fixture(params=["stack_model_package_input"], scope="function") +def stack(request, stack_model_package_input) -> cdk.Stack: # type: ignore[no-untyped-def] + return request.getfixturevalue(request.param) # type: ignore[no-any-return] + + +def test_synthesize_stack(stack: cdk.Stack) -> None: + template = Template.from_stack(stack) + template.resource_count_is("AWS::S3::Bucket", 1) + + +def test_no_cdk_nag_errors(stack: cdk.Stack) -> None: + cdk.Aspects.of(stack).add(cdk_nag.AwsSolutionsChecks()) + + nag_errors = Annotations.from_stack(stack).find_error( + "*", + Match.string_like_regexp(r"AwsSolutions-.*"), + ) + assert not nag_errors, f"Found {len(nag_errors)} CDK nag errors" diff --git a/requirements-dev.txt b/requirements-dev.txt index b33b39b4..1f6c3f32 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ isort~=5.10.1 flake8~=4.0.1 pylint~=2.14.1 mypy~=0.961 -pip-tools~=6.6.2 +pip-tools~=7.4.1 python-dotenv~=0.21.0 cfn-lint~=0.86.2 check-manifest~=0.48