Commit 6020c23: Merge branch 'master' into gpcc_

saanikaaa authored Feb 19, 2024
2 parents b9c8beb + 4e6c4a6 commit 6020c23
Showing 7 changed files with 45 additions and 25 deletions.
12 changes: 6 additions & 6 deletions import-automation/executor/README.md
@@ -81,16 +81,16 @@ Run `./schedule_update_import.sh --help` for usage.
To schedule an import to run as a cron job on the GCP Cloud Scheduler, do the following:

```
-Run `./schedule_update_import.sh -s <config_project_id> <path_to_import>`
+Run `./schedule_update_import.sh -s <gke_project_id> <path_to_import>`
```

-`<config_project_id>` is the GCP project id where the config file is stored, e.g. `datcom-import-automation`.
+`<gke_project_id>` is the GCP project id where the import executor runs, e.g. `datcom-import-automation-prod`.
`<path_to_import>` is the path to the import (relative to the root directory of the `data` repo), with the name of the import provided after a colon, e.g. `scripts/us_usda/quickstats:UsdaAgSurvey`.

Example invocation:

```
-Run `./schedule_update_import.sh -s datcom-import-automation scripts/us_usda/quickstats:UsdaAgSurvey`
+Run `./schedule_update_import.sh -s datcom-import-automation-prod scripts/us_usda/quickstats:UsdaAgSurvey`
```

The script will log the name of the Cloud Scheduler job and a url for all the jobs on the scheduler. Please verify that all the job metadata was updated as expected.
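To double-check the job from the CLI, you can also list the scheduler jobs directly; a minimal sketch, assuming the `datcom-import-automation-prod` project from the example above and a `us-central1` scheduler location (both placeholders, substitute your own):

```
# Sketch: list Cloud Scheduler jobs to verify the job metadata was updated.
gcloud scheduler jobs list --project=datcom-import-automation-prod --location=us-central1
```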
@@ -106,16 +106,16 @@ Once the script runs to completion, the data directory's latest update is printed.
To execute an update locally, do the following:

```
-Run `./schedule_update_import.sh -u <config_project_id> <path_to_import>`
+Run `./schedule_update_import.sh -u <gke_project_id> <path_to_import>`
```

-`<config_project_id>` is the GCP project id where the config file is stored, e.g. `datcom-import-automation`.
+`<gke_project_id>` is the GCP project id where the import executor runs, e.g. `datcom-import-automation-prod`.
`<path_to_import>` is the path to the import (relative to the root directory of the `data` repo), with the name of the import provided after a colon, e.g. `scripts/us_usda/quickstats:UsdaAgSurvey`.

Example invocation:

```
-Run `./schedule_update_import.sh -u datcom-import-automation scripts/us_usda/quickstats:UsdaAgSurvey`
+Run `./schedule_update_import.sh -u datcom-import-automation-prod scripts/us_usda/quickstats:UsdaAgSurvey`
```


11 changes: 8 additions & 3 deletions import-automation/executor/gke/configure_gke.sh
@@ -63,13 +63,18 @@ kubectl annotate serviceaccount \
  --namespace import-automation \
  --overwrite \
  import-automation-ksa \
-  iam.gke.io/gcp-service-account=$PROJECT_ID@appspot.gserviceaccount.com
+  iam.gke.io/gcp-service-account=default-service-account@$PROJECT_ID.iam.gserviceaccount.com

kubectl -n import-automation create secret generic import-automation-iap-secret \
  --from-literal=client_id=$OAUTH_CLIENT_ID \
  --from-literal=client_secret=$OAUTH_CLIENT_SECRET

-# Also set what identity will cloud scheduler call as by running:
+# Also set the identity that Cloud Scheduler will call as by running the command below.
+# Note that the Cloud Build service account needs the iam.serviceAccounts.actAs
+# permission on the Scheduler's caller service account below.
+# This can be achieved by following the first answer here:
+# https://stackoverflow.com/questions/61334524/how-do-you-enable-iam-serviceaccounts-actas-permissions-on-a-sevice-account
+# The Cloud Build service account can be found on the Settings tab of the Cloud Build page.
kubectl -n import-automation create configmap cluster-oauth-configmap \
-  --from-literal=cloud-scheduler-caller-sa=$PROJECT_ID@appspot.gserviceaccount.com \
+  --from-literal=cloud-scheduler-caller-sa=default-service-account@$PROJECT_ID.iam.gserviceaccount.com \
  --from-literal=cloud-scheduler-caller-oauth-audience=$OAUTH_CLIENT_ID
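The actAs note above boils down to granting the Cloud Build service account the Service Account User role (which bundles `iam.serviceAccounts.actAs`) on the scheduler caller's service account. A minimal sketch, assuming the caller SA from this script and the conventional `<project-number>@cloudbuild.gserviceaccount.com` Cloud Build account (an assumption; confirm it on the Cloud Build Settings tab as noted above):

```
# Sketch: let Cloud Build act as the scheduler's caller service account.
# The Cloud Build SA address below is an assumption; verify it in your project.
CLOUD_BUILD_SA="$(gcloud projects describe $PROJECT_ID --format='value(projectNumber)')@cloudbuild.gserviceaccount.com"

# roles/iam.serviceAccountUser includes the iam.serviceAccounts.actAs permission.
gcloud iam service-accounts add-iam-policy-binding \
  default-service-account@$PROJECT_ID.iam.gserviceaccount.com \
  --member="serviceAccount:$CLOUD_BUILD_SA" \
  --role="roles/iam.serviceAccountUser"
```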
9 changes: 5 additions & 4 deletions import-automation/executor/gke/deployment.yaml
@@ -51,10 +51,11 @@ spec:
    port: 8080
    type: HTTP
    requestPath: /healthz
-  iap:
-    enabled: true
-    oauthclientCredentials:
-      secretName: import-automation-iap-secret
+  # TODO: re-enable this once the deployments work e2e.
+  # iap:
+  #   enabled: true
+  #   oauthclientCredentials:
+  #     secretName: import-automation-iap-secret
---
apiVersion: v1
kind: Service
21 changes: 14 additions & 7 deletions import-automation/executor/schedule_update_import.py
@@ -37,7 +37,10 @@
_FLAGS = flags.FLAGS

flags.DEFINE_string('mode', '', 'Options: update or schedule.')
-flags.DEFINE_string('config_project_id', '', 'GCS Project for the config file.')
+flags.DEFINE_string('gke_project_id', '',
+                    'GCP Project where import executor runs.')
+flags.DEFINE_string('config_project_id', 'datcom-204919',
+                    'GCS Project for the config file.')
flags.DEFINE_string('config_bucket', 'import-automation-configs',
                    'GCS bucket name for the config file.')
flags.DEFINE_string('config_filename', 'configs.json',

@@ -94,14 +97,14 @@ def _override_configs(filename: str,

def _get_cloud_config(filename: str) -> Dict:
    logging.info('Getting cloud config.')
-    project_id = _FLAGS.config_project_id
+    config_project_id = _FLAGS.config_project_id
    bucket_name = _FLAGS.config_bucket
    logging.info(
-        f'\nProject ID: {project_id}\nBucket: {bucket_name}\nConfig Filename: {filename}'
+        f'\nProject ID: {config_project_id}\nBucket: {bucket_name}\nConfig Filename: {filename}'
    )

-    bucket = storage.Client(project_id).bucket(bucket_name,
-                                               user_project=project_id)
+    bucket = storage.Client(config_project_id).bucket(
+        bucket_name, user_project=config_project_id)
    blob = bucket.blob(filename)
    config_dict = json.loads(blob.download_as_string(client=None))
    return config_dict
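For reference, `_get_cloud_config` amounts to a requester-pays GCS download; an equivalent one-liner using the flag defaults above (project `datcom-204919`, bucket `import-automation-configs`, file `configs.json`):

```
# Sketch: fetch the same config file by hand; -u bills the request to the
# config project, mirroring user_project= in the code above.
gsutil -u datcom-204919 cp gs://import-automation-configs/configs.json .
```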
@@ -261,8 +264,8 @@ def main(_):
    mode = _FLAGS.mode
    absolute_import_path = _FLAGS.absolute_import_path

-    if not _FLAGS.config_project_id:
-        raise Exception("Flag: config_project_if must be provided.")
+    if not _FLAGS.gke_project_id:
+        raise Exception("Flag: gke_project_id must be provided.")

    if not mode or (mode not in ['update', 'schedule']):
        raise Exception('Flag: mode must be set to \'update\' or \'schedule\'')
@@ -278,6 +281,7 @@
    repo_dir = cwd.split("data")[0] + "data"
    logging.info(f'{mode} called with the following:')
    logging.info(f'Config Project ID: {_FLAGS.config_project_id}')
+    logging.info(f'GKE (Import Executor) Project ID: {_FLAGS.gke_project_id}')
    logging.info(f'Import: {absolute_import_path}')
    logging.info(f'Repo root directory: {repo_dir}')
@@ -287,6 +291,9 @@
    config_dict = _get_cloud_config(_FLAGS.config_filename)
    cfg = configs.ExecutorConfig(**config_dict['configs'])

+    # Update the GCP project id to use with the configs.
+    cfg.gcp_project_id = _FLAGS.gke_project_id
+
    logging.info(
        f'Updating any config fields from local file: {_CONFIG_OVERRIDE_FILE}.')
    cfg = _override_configs(_CONFIG_OVERRIDE_FILE, cfg)
8 changes: 4 additions & 4 deletions import-automation/executor/schedule_update_import.sh
@@ -14,8 +14,8 @@
# limitations under the License.

function help {
-  echo "#Usage: -us <config_project_id> <absolute_import_path>"
-  echo "## <config_project_id> is the GCP project ID where the config file is located."
+  echo "#Usage: -us <gke_project_id> <absolute_import_path>"
+  echo "## <gke_project_id> is the GCP project ID where the import executor runs."
  echo "## Update an import specified by <absolute_import_path>, e.g. scripts/us_usda/quickstats:UsdaAgSurvey"
  exit 1
}

@@ -37,13 +37,13 @@ while getopts us OPTION; do
  esac
done

-CONFIG_PROJECT_ID=$2
+GKE_PROJECT_ID=$2
IMPORT_PATH=$3

python3 -m venv .env
. .env/bin/activate
pip3 install --disable-pip-version-check -r requirements.txt

-python3 -m schedule_update_import --config_project_id=$CONFIG_PROJECT_ID --mode=$MODE --absolute_import_path=$IMPORT_PATH
+python3 -m schedule_update_import --gke_project_id=$GKE_PROJECT_ID --mode=$MODE --absolute_import_path=$IMPORT_PATH

deactivate
1 change: 1 addition & 0 deletions scripts/world_bank/.gitignore
@@ -1 +1,2 @@
preprocessed_source_csv
+download_indicators/*.csv
8 changes: 7 additions & 1 deletion (file name not shown)

@@ -9,7 +9,10 @@
import numpy as np
import pandas as pd

-_OUT_PATH = flags.DEFINE_string('out_path', None, 'CNS path to write output.')
+# The output path should have a default filename.
+_OUT_DEFAULT_NAME = 'cleaned_wdi.csv'
+_OUT_PATH = flags.DEFINE_string('out_path', _OUT_DEFAULT_NAME,
+                                'CNS path to write output.')

indicators = [
    'SP.POP.TOTL',
@@ -72,6 +75,7 @@ def DownloadAndParseCsvs() -> None:
"""
dat = []
for indicator in indicators:
print(f'DOWNLOADING: {indicator}....')
resp = urllib.request.urlopen(
f'http://api.worldbank.org/v2/country/all/indicator/{indicator}?source=2&downloadformat=csv'
)
@@ -121,6 +125,8 @@ def DownloadAndParseCsvs() -> None:
            'unit',
        ],
    )
+    # Write to _OUT_PATH, which defaults to the output filename
+    # if no path is provided.
    with open(_OUT_PATH.value, 'w+') as f_out:
        out_df.to_csv(f_out, index=False)
