Merge branch 'master' into us_nces_demographics
HarishC727 authored Feb 8, 2024
2 parents 7ff2c7b + c2086d9 commit f979c73
Showing 135 changed files with 38,022 additions and 21,656 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -10,4 +10,9 @@ __pycache__/
**/tmp/
dc_generated/
# Ignore the data folder under the un_census/enhanced_tmcf/ directory.
*/*/*/data/*
*/*/*/data/*
lib/
bin/
pyvenv.cfg
# Ignore updates to the local configs json file.
import-automation/executor/config_override.json
6 changes: 6 additions & 0 deletions .gitmodules
@@ -0,0 +1,6 @@
[submodule "scripts/un/sdg/sdg-dataset"]
path = scripts/un/sdg/sdg-dataset
url = https://code.officialstatistics.org/undata2/data-commons/sdg-dataset.git
[submodule "scripts/un/sdg/sssom-mappings"]
path = scripts/un/sdg/sssom-mappings
url = https://code.officialstatistics.org/undata2/sssom-mappings.git
7 changes: 7 additions & 0 deletions LICENSE-CC4
@@ -0,0 +1,7 @@
Copyright 2023 Google LLC

The non-source code materials in this project are licensed under:
Creative Commons - Attribution CC-BY 4.0

For the full license text, please visit:
https://creativecommons.org/licenses/by/4.0/legalcode
9 changes: 4 additions & 5 deletions README.md
@@ -113,7 +113,7 @@ Install requirements and setup a virtual environment to isolate python developme
python3 -m venv .env
source .env/bin/activate
pip3 install -r requirements.txt
pip3 install -r requirements_all.txt
```

##### Testing
@@ -153,8 +153,8 @@ you import modules and run tests, as below.

##### Guidelines

* Any additional package required must be specified in the requirements.txt
in the top-level folder. No other requirements.txt files are allowed.
* Any additional package required must be specified in the `requirements_all.txt`
file in the top-level folder. No other `requirements.txt` files are allowed.
* Code must be formatted according to the
  [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
  using the [yapf formatter](https://github.com/google/yapf).
@@ -249,7 +249,6 @@ To disable yapf for some lines,

For general questions or issues about importing data into Data Commons, please
open an issue on our [issues](https://github.com/datacommonsorg/data/issues)
page. For all other questions, please send an email to
`[email protected]`.
page. For all other questions, please [share feedback on this form](https://docs.google.com/forms/d/e/1FAIpQLScJTtNlIItT-uSPXI98WT6yNlavF-kf5JS0jMrCvJ9TPLmelg/viewform).

**Note** - This is not an officially supported Google product.
4 changes: 2 additions & 2 deletions docs/README.md
@@ -4,7 +4,7 @@ This document summarizes the steps involved in adding a dataset to Data Commons

## Prerequisites

* Ensure that the DC team ([email protected]) has approved the addition of the dataset.
* Ensure that the Data Commons team has approved the addition of the dataset. Please [suggest a dataset here](https://docs.google.com/forms/d/e/1FAIpQLSf_kZ13bmzXvgEbim0OXeAVsTQYsIhN8_o9ekdbjKoeFjfvRA/viewform).
* Review the following documents to get a background on the data model, format and workflow:
* [Summary of data model](https://schema.org/docs/datamodel.html) (DC inherits schema from schema.org)
* [How statistics is represented in DC](representing_statistics.md)
@@ -35,7 +35,7 @@ Once the entity and schema mapping have been finalized, you can now prepare the
* Data cleaning code (along with README) checked into [data repo](https://github.com/datacommonsorg/data)
* Validation results for the artifacts (from running [`dc-import`](https://github.com/datacommonsorg/import#using-import-tool) tool)

Note: you may also use the [DC Import Wizard](datacommons.org/import) to help generate artifacts for common dataset structures
Note: you may also use the [DC Import Wizard](https://datacommons.org/import) to help generate artifacts for common dataset structures

## Review by DC team

16 changes: 9 additions & 7 deletions import-automation/executor/Dockerfile
@@ -13,16 +13,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM gcr.io/google-appengine/python

RUN virtualenv /env -p python3.7
FROM python:3.11.4

ENV VIRTUAL_ENV /env
ENV PATH /env/bin:$PATH
RUN apt-get update \
&& apt-get -y upgrade \
&& apt-get -y autoremove \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /workspace

ADD requirements.txt /app/requirements.txt
RUN pip install -r /app/requirements.txt
ADD requirements.txt /workspace/requirements.txt
RUN pip install -r /workspace/requirements.txt

ADD . /app
COPY app/. /workspace/app/

CMD gunicorn --timeout 1800 --workers 5 -b :$PORT app.main:FLASK_APP
76 changes: 74 additions & 2 deletions import-automation/executor/README.md
@@ -47,13 +47,85 @@ Commons knowledge graph using the importer.

## Running locally

Authenticate with GCP first: `gcloud auth application-default login`

### Scheduling or Updating An Import Locally

You can schedule (on the GCP Cloud Scheduler) or execute an import job from your local machine.

Ensure this script is executed from the directory that contains `schedule_update_import.sh`, i.e. from `/data/import-automation/executor`. Configs (`<repo_root>/import-automation/executor/app/configs.py`) are loaded from GCS. To override any configs locally, set them in the file `<repo_root>/import-automation/executor/config_override.json`. Note that every overridden field must exist in `<repo_root>/import-automation/executor/app/configs.py`; otherwise the update will raise an Exception. The `user_script_args` config field can also be set in this file.
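
For illustration, a minimal `config_override.json` could look like the sketch below. The field names are taken from `app/configs.py` in this commit; the values are placeholders, not recommendations.

```
{
  "user_script_timeout": 7200,
  "user_script_args": ["--start_date=2020-01-01"]
}
```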

Note: any local changes to the `<repo_root>/import-automation/executor/config_override.json` file are ignored by git. This was done using:

```
git update-index --skip-worktree <repo_root>/import-automation/executor/config_override.json
```

To start tracking changes to this file, execute the following:
```
git update-index --no-skip-worktree <repo_root>/import-automation/executor/config_override.json
```

To get a list of files that are skipped when checking for changes, execute:

```
git ls-files -v . | grep ^S
```

### Usage

Run `./schedule_update_import.sh --help` for usage.


#### Schedule an Import:
To schedule an import to run as a cron job on the GCP Cloud Scheduler, do the following:

```
./schedule_update_import.sh -s <config_project_id> <path_to_import>
```

`<config_project_id>` is the GCP project ID where the config file is stored, e.g. `datcom-import-automation`.
`<path_to_import>` is the path to the import (relative to the root directory of the `data` repo), followed by a colon and the import name, e.g. `scripts/us_usda/quickstats:UsdaAgSurvey`.

Example invocation:

```
./schedule_update_import.sh -s datcom-import-automation scripts/us_usda/quickstats:UsdaAgSurvey
```

The script will log the name of the Cloud Scheduler job and a URL listing all the jobs on the scheduler. Please verify that all the job metadata was updated as expected.


#### Update an Import:
You can execute an import process locally. Note that this is not recommended for import scripts that take longer than a few minutes to execute, because all the processing is done locally. For all prod imports, the recommended path is to Schedule an Import.

Instead of downloading a fresh version of this repo from GitHub, this script uses the locally cloned current state of the repo by inferring the path to the `data` root directory. A side effect is that upon completion, the local repo may contain additional artifacts, e.g. the output CSV/TMCF files produced by the import. You may want to revert those files if they are not intended to be committed.

Once the script runs to completion, the data directory's latest update is printed (along with its location on GCS), which helps confirm whether the import actually produced new data. Note: it is a good idea to check the printed directory path to verify that all the expected import files are there.

To execute an update locally, do the following:

```
./schedule_update_import.sh -u <config_project_id> <path_to_import>
```

`<config_project_id>` is the GCP project ID where the config file is stored, e.g. `datcom-import-automation`.
`<path_to_import>` is the path to the import (relative to the root directory of the `data` repo), followed by a colon and the import name, e.g. `scripts/us_usda/quickstats:UsdaAgSurvey`.

Example invocation:

```
./schedule_update_import.sh -u datcom-import-automation scripts/us_usda/quickstats:UsdaAgSurvey
```


## Local Executor [should be deprecated soon]

```
PYTHONPATH=$(pwd) python app/main.py
```
## Local Executor
Run `. run_local_executor.sh --help` for usage.
32 changes: 16 additions & 16 deletions import-automation/executor/app/configs.py
@@ -11,16 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Configurations for the executor.
"""Configurations for the executor.
The app endpoints accept a configs field that allows customization of all the
configurations. See main.py.
"""

import dataclasses
import os
from typing import List
import dataclasses

from google.cloud import logging

@@ -32,9 +31,10 @@ def _production():
@dataclasses.dataclass
class ExecutorConfig:
"""Configurations for the executor."""

# ID of the Google Cloud project that hosts the executor. The project
# needs to enable App Engine and Cloud Scheduler.
gcp_project_id: str = 'google.com:datcom-data'
gcp_project_id: str = 'datcom-import-automation'
# ID of the Google Cloud project that stores generated CSVs and MCFs. The
# project needs to enable Cloud Storage and gives the service account the
# executor uses sufficient permissions to read and write the bucket below.
@@ -102,9 +102,13 @@ class ExecutorConfig:
# ID of the location where Cloud Scheduler is hosted.
scheduler_location: str = 'us-central1'
# Maximum time a user script can run for in seconds.
user_script_timeout: float = 600
user_script_timeout: float = 3600
# Arguments for the user script
user_script_args: List[str] = ()
# Environment variables for the user script
user_script_env: dict = None
# Maximum time venv creation can take in seconds.
venv_create_timeout: float = 600
venv_create_timeout: float = 3600
# Maximum time downloading a file can take in seconds.
file_download_timeout: float = 600
# Maximum time downloading the repo can take in seconds.
@@ -126,16 +130,12 @@ class ExecutorConfig:
def get_data_refresh_config(self):
"""Returns the config used for Cloud Scheduler data refresh jobs."""
fields = set([
'github_repo_name',
'github_repo_owner_username',
'github_auth_username',
'github_auth_access_token',
'dashboard_oauth_client_id',
'importer_oauth_client_id',
'email_account',
'email_token',
'gcs_project_id',
'storage_prod_bucket_name',
'github_repo_name', 'github_repo_owner_username',
'github_auth_username', 'github_auth_access_token',
'dashboard_oauth_client_id', 'importer_oauth_client_id',
'email_account', 'email_token', 'gcs_project_id',
'storage_prod_bucket_name', 'user_script_args', 'user_script_env',
'user_script_timeout'
])
return {
k: v for k, v in dataclasses.asdict(self).items() if k in fields
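
For context, here is a short, hypothetical usage sketch (not part of this commit) of how the allowlist above behaves; the import path is assumed from the repo layout and the values are placeholders.

```
# Hypothetical usage of get_data_refresh_config(); illustrative only.
from app.configs import ExecutorConfig  # import path assumed

config = ExecutorConfig(user_script_timeout=3600,
                        user_script_args=['--start_date=2020-01-01'])
refresh_config = config.get_data_refresh_config()

# Only allowlisted fields are kept, e.g. scheduler_location is dropped.
assert 'user_script_timeout' in refresh_config
assert 'scheduler_location' not in refresh_config
```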
27 changes: 22 additions & 5 deletions import-automation/executor/app/executor/cloud_scheduler.py
@@ -26,7 +26,8 @@
from google.protobuf import json_format
from google.api_core.exceptions import AlreadyExists, NotFound

GKE_SERVICE_DOMAIN = os.getenv('GKE_SERVICE_DOMAIN', 'import.datacommons.dev')
GKE_SERVICE_DOMAIN = os.getenv('GKE_SERVICE_DOMAIN',
'importautomation.datacommons.org')
GKE_CALLER_SERVICE_ACCOUNT = os.getenv('CLOUD_SCHEDULER_CALLER_SA')
GKE_OAUTH_AUDIENCE = os.getenv('CLOUD_SCHEDULER_CALLER_OAUTH_AUDIENCE')

@@ -53,9 +54,25 @@ def _base_job_request(absolute_import_name, schedule: str):
}


def http_job_request(absolute_import_name, schedule,
json_encoded_job_body: str) -> Dict:
def http_job_request(absolute_import_name,
schedule,
json_encoded_job_body: str,
gke_caller_service_account: str = "",
gke_oauth_audience: str = "") -> Dict:
"""Cloud Scheduler request that targets executors launched in GKE."""

# If the service account and oauth audience are provided as
# function args, use them. If not, look for them in the
# environment (GKE_CALLER_SERVICE_ACCOUNT and GKE_OAUTH_AUDIENCE
# are set to read from environment variables).
service_account = gke_caller_service_account
oauth_audience = gke_oauth_audience

if not service_account:
service_account = GKE_CALLER_SERVICE_ACCOUNT
if not oauth_audience:
oauth_audience = GKE_OAUTH_AUDIENCE

job = _base_job_request(absolute_import_name, schedule)
job['name'] = f'{job["name"]}_GKE'
job['http_target'] = {
@@ -66,8 +83,8 @@ def http_job_request(absolute_import_name, schedule,
},
'body': json_encoded_job_body,
'oidc_token': {
'service_account_email': GKE_CALLER_SERVICE_ACCOUNT,
'audience': GKE_OAUTH_AUDIENCE,
'service_account_email': service_account,
'audience': oauth_audience,
}
}
return job
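
As an aside, a hypothetical call site illustrating the new fallback behavior (the module path, schedule, job body, and account names below are placeholders or assumptions, not part of this commit):

```
# Hypothetical usage; illustrative only.
from app.executor import cloud_scheduler  # import path assumed

# Explicit arguments take precedence over the environment variables.
job = cloud_scheduler.http_job_request(
    'scripts/us_usda/quickstats:UsdaAgSurvey',
    '0 5 * * *',
    '<json-encoded job body>',
    gke_caller_service_account='<caller-sa>@<project>.iam.gserviceaccount.com',
    gke_oauth_audience='<oauth-audience>')

# With the keyword arguments omitted, the function falls back to the
# GKE_CALLER_SERVICE_ACCOUNT and GKE_OAUTH_AUDIENCE environment variables.
job = cloud_scheduler.http_job_request(
    'scripts/us_usda/quickstats:UsdaAgSurvey',
    '0 5 * * *',
    '<json-encoded job body>')
```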