diff --git a/import-automation/executor/README.md b/import-automation/executor/README.md
index 9cf052b429..a96e6a4dbe 100644
--- a/import-automation/executor/README.md
+++ b/import-automation/executor/README.md
@@ -81,16 +81,16 @@ Run `./schedule_update_import.sh --help` for usage.
 To schedule an import to run as a cron job on the GCP Cloud Scheduler, do the following:
 
 ```
-Run `./schedule_update_import.sh -s <config_project_id> <import_path>`
+Run `./schedule_update_import.sh -s <gke_project_id> <import_path>`
 ```
 
-`<config_project_id>` is the GCP project id where the config file is stored, e.g. `datcom-import-automation`.
+`<gke_project_id>` is the GCP project id where the import executor is run from, e.g. `datcom-import-automation-prod`.
 
 `<import_path>` is the path to the import (relative to the root directory of the `data` repo), with the name of the import provided with a colon, e.g. `scripts/us_usda/quickstats:UsdaAgSurvey`.
 
 Example invocation:
 
 ```
-Run `./schedule_update_import.sh -s datcom-import-automation scripts/us_usda/quickstats:UsdaAgSurvey`
+Run `./schedule_update_import.sh -s datcom-import-automation-prod scripts/us_usda/quickstats:UsdaAgSurvey`
 ```
 
 The script will log the name of the Cloud Scheduler job and a URL for all the jobs on the scheduler. Please verify that all the job metadata was updated as expected.
@@ -106,16 +106,16 @@ Once the script runs to completion, the data directory's latest update is printed.
 To execute an update locally, do the following:
 
 ```
-Run `./schedule_update_import.sh -u <config_project_id> <import_path>`
+Run `./schedule_update_import.sh -u <gke_project_id> <import_path>`
 ```
 
-`<config_project_id>` is the GCP project id where the config file is stored, e.g. `datcom-import-automation`.
+`<gke_project_id>` is the GCP project id where the import executor is run from, e.g. `datcom-import-automation-prod`.
 
 `<import_path>` is the path to the import (relative to the root directory of the `data` repo), with the name of the import provided with a colon, e.g. `scripts/us_usda/quickstats:UsdaAgSurvey`.
 
 Example invocation:
 
 ```
-Run `./schedule_update_import.sh -u datcom-import-automation scripts/us_usda/quickstats:UsdaAgSurvey`
+Run `./schedule_update_import.sh -u datcom-import-automation-prod scripts/us_usda/quickstats:UsdaAgSurvey`
 ```
 
diff --git a/import-automation/executor/gke/configure_gke.sh b/import-automation/executor/gke/configure_gke.sh
index dea858b270..8cfa745c6d 100755
--- a/import-automation/executor/gke/configure_gke.sh
+++ b/import-automation/executor/gke/configure_gke.sh
@@ -63,13 +63,18 @@ kubectl annotate serviceaccount \
   --namespace import-automation \
   --overwrite \
   import-automation-ksa \
-  iam.gke.io/gcp-service-account=$PROJECT_ID@appspot.gserviceaccount.com
+  iam.gke.io/gcp-service-account=default-service-account@$PROJECT_ID.iam.gserviceaccount.com
 
 kubectl -n import-automation create secret generic import-automation-iap-secret \
   --from-literal=client_id=$OAUTH_CLIENT_ID \
   --from-literal=client_secret=$OAUTH_CLIENT_SECRET
 
-# Also set what identity will cloud scheduler call as by running:
+# Also set the identity that Cloud Scheduler will call as by running the command below.
+# Note also that the Cloud Build service account will need the
+# iam.serviceAccounts.actAs permission on the Scheduler's service account below.
+# This can be achieved by following the first answer here:
+# https://stackoverflow.com/questions/61334524/how-do-you-enable-iam-serviceaccounts-actas-permissions-on-a-sevice-account
+# The Cloud Build service account can be found on the Settings tab of the Cloud Build page.
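+# For example, a sketch of granting that permission with gcloud (the member
+# below assumes the default Cloud Build service account, which typically has
+# the form <project-number>@cloudbuild.gserviceaccount.com; the
+# roles/iam.serviceAccountUser role includes iam.serviceAccounts.actAs):
+#
+#   gcloud iam service-accounts add-iam-policy-binding \
+#     default-service-account@$PROJECT_ID.iam.gserviceaccount.com \
+#     --member="serviceAccount:<project-number>@cloudbuild.gserviceaccount.com" \
+#     --role="roles/iam.serviceAccountUser"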
 kubectl -n import-automation create configmap cluster-oauth-configmap \
-    --from-literal=cloud-scheduler-caller-sa=$PROJECT_ID@appspot.gserviceaccount.com \
+    --from-literal=cloud-scheduler-caller-sa=default-service-account@$PROJECT_ID.iam.gserviceaccount.com \
     --from-literal=cloud-scheduler-caller-oauth-audience=$OAUTH_CLIENT_ID
diff --git a/import-automation/executor/gke/deployment.yaml b/import-automation/executor/gke/deployment.yaml
index a346102b13..309dee7eee 100644
--- a/import-automation/executor/gke/deployment.yaml
+++ b/import-automation/executor/gke/deployment.yaml
@@ -51,10 +51,11 @@ spec:
         port: 8080
       type: HTTP
       requestPath: /healthz
-  iap:
-    enabled: true
-    oauthclientCredentials:
-      secretName: import-automation-iap-secret
+  # TODO: re-enable this once the deployments work e2e.
+  # iap:
+  #   enabled: true
+  #   oauthclientCredentials:
+  #     secretName: import-automation-iap-secret
 ---
 apiVersion: v1
 kind: Service
diff --git a/import-automation/executor/schedule_update_import.py b/import-automation/executor/schedule_update_import.py
index b033360844..b6b8a62737 100644
--- a/import-automation/executor/schedule_update_import.py
+++ b/import-automation/executor/schedule_update_import.py
@@ -37,7 +37,10 @@ _FLAGS = flags.FLAGS
 flags.DEFINE_string('mode', '', 'Options: update or schedule.')
-flags.DEFINE_string('config_project_id', '', 'GCS Project for the config file.')
+flags.DEFINE_string('gke_project_id', '',
+                    'GCP Project where import executor runs.')
+flags.DEFINE_string('config_project_id', 'datcom-204919',
+                    'GCS Project for the config file.')
 flags.DEFINE_string('config_bucket', 'import-automation-configs',
                     'GCS bucket name for the config file.')
 flags.DEFINE_string('config_filename', 'configs.json',
@@ -94,14 +97,14 @@ def _override_configs(filename: str,
 def _get_cloud_config(filename: str) -> Dict:
     logging.info('Getting cloud config.')
-    project_id = _FLAGS.config_project_id
+    config_project_id = _FLAGS.config_project_id
     bucket_name = _FLAGS.config_bucket
     logging.info(
-        f'\nProject ID: {project_id}\nBucket: {bucket_name}\nConfig Filename: {filename}'
+        f'\nProject ID: {config_project_id}\nBucket: {bucket_name}\nConfig Filename: {filename}'
     )
-    bucket = storage.Client(project_id).bucket(bucket_name,
-                                               user_project=project_id)
+    bucket = storage.Client(config_project_id).bucket(
+        bucket_name, user_project=config_project_id)
     blob = bucket.blob(filename)
     config_dict = json.loads(blob.download_as_string(client=None))
     return config_dict
@@ -261,8 +264,8 @@ def main(_):
     mode = _FLAGS.mode
     absolute_import_path = _FLAGS.absolute_import_path
 
-    if not _FLAGS.config_project_id:
-        raise Exception("Flag: config_project_if must be provided.")
+    if not _FLAGS.gke_project_id:
+        raise Exception("Flag: gke_project_id must be provided.")
     if not mode or (mode not in ['update', 'schedule']):
         raise Exception('Flag: mode must be set to \'update\' or \'schedule\'')
@@ -278,6 +281,7 @@ def main(_):
     repo_dir = cwd.split("data")[0] + "data"
     logging.info(f'{mode} called with the following:')
     logging.info(f'Config Project ID: {_FLAGS.config_project_id}')
+    logging.info(f'GKE (Import Executor) Project ID: {_FLAGS.gke_project_id}')
     logging.info(f'Import: {absolute_import_path}')
     logging.info(f'Repo root directory: {repo_dir}')
@@ -287,6 +291,9 @@ def main(_):
     config_dict = _get_cloud_config(_FLAGS.config_filename)
     cfg = configs.ExecutorConfig(**config_dict['configs'])
+
+    # Update the GCP project id to use with the configs.
+    cfg.gcp_project_id = _FLAGS.gke_project_id
+
     logging.info(
         f'Updating any config fields from local file: {_CONFIG_OVERRIDE_FILE}.')
     cfg = _override_configs(_CONFIG_OVERRIDE_FILE, cfg)
diff --git a/import-automation/executor/schedule_update_import.sh b/import-automation/executor/schedule_update_import.sh
index 9961405174..4ca0e24bb0 100755
--- a/import-automation/executor/schedule_update_import.sh
+++ b/import-automation/executor/schedule_update_import.sh
@@ -14,8 +14,8 @@
 # limitations under the License.
 
 function help {
-  echo "#Usage: -us <config_project_id> <import_path>"
-  echo "## <config_project_id> is the GCP project ID where the config file is located."
+  echo "#Usage: -us <gke_project_id> <import_path>"
+  echo "## <gke_project_id> is the GCP project ID that the import executor runs in."
   echo "## Update an import specified by <import_path>, e.g. scripts/us_usda/quickstats:UsdaAgSurvey"
   exit 1
 }
@@ -37,13 +37,13 @@ while getopts us OPTION; do
   esac
 done
 
-CONFIG_PROJECT_ID=$2
+GKE_PROJECT_ID=$2
 IMPORT_PATH=$3
 
 python3 -m venv .env
 . .env/bin/activate
 pip3 install --disable-pip-version-check -r requirements.txt
 
-python3 -m schedule_update_import --config_project_id=$CONFIG_PROJECT_ID --mode=$MODE --absolute_import_path=$IMPORT_PATH
+python3 -m schedule_update_import --gke_project_id=$GKE_PROJECT_ID --mode=$MODE --absolute_import_path=$IMPORT_PATH
 deactivate
\ No newline at end of file
diff --git a/scripts/world_bank/.gitignore b/scripts/world_bank/.gitignore
index eafbb11c23..984034d9f1 100644
--- a/scripts/world_bank/.gitignore
+++ b/scripts/world_bank/.gitignore
@@ -1 +1,2 @@
 preprocessed_source_csv
+download_indicators/*.csv
diff --git a/scripts/world_bank/wdi/download_indicators/wdi_download_indicators.py b/scripts/world_bank/wdi/download_indicators/wdi_download_indicators.py
index 24776559a5..cfd8f9aeec 100644
--- a/scripts/world_bank/wdi/download_indicators/wdi_download_indicators.py
+++ b/scripts/world_bank/wdi/download_indicators/wdi_download_indicators.py
@@ -9,7 +9,10 @@ import numpy as np
 import pandas as pd
 
-_OUT_PATH = flags.DEFINE_string('out_path', None, 'CNS path to write output.')
+# The output path should have a default filename.
+_OUT_DEFAULT_NAME = 'cleaned_wdi.csv'
+_OUT_PATH = flags.DEFINE_string('out_path', _OUT_DEFAULT_NAME,
+                                'CNS path to write output.')
 
 indicators = [
     'SP.POP.TOTL',
@@ -72,6 +75,7 @@ def DownloadAndParseCsvs() -> None:
     """
     dat = []
     for indicator in indicators:
+        print(f'DOWNLOADING: {indicator}....')
         resp = urllib.request.urlopen(
             f'http://api.worldbank.org/v2/country/all/indicator/{indicator}?source=2&downloadformat=csv'
         )
@@ -121,6 +125,8 @@ def DownloadAndParseCsvs() -> None:
             'unit',
         ],
     )
+    # Write to _OUT_PATH, which defaults to the output filename
+    # if no path is provided.
     with open(_OUT_PATH.value, 'w+') as f_out:
         out_df.to_csv(f_out, index=False)
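+
+# Example invocations (a sketch; with the new default, running with no
+# --out_path writes cleaned_wdi.csv to the current directory):
+#   python3 wdi_download_indicators.py
+#   python3 wdi_download_indicators.py --out_path=/tmp/wdi.csv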