From d4cc7d328585bb29d6cfdf83cbbd7a3229031e2a Mon Sep 17 00:00:00 2001 From: Hannah Pho Date: Mon, 21 Oct 2024 23:57:09 -0400 Subject: [PATCH 1/4] Document data management schema update mode --- custom_dc/custom_data.md | 88 ++++++++++++++++++------------------ custom_dc/data_cloud.md | 28 ++++++------ custom_dc/database_update.md | 68 ++++++++++++++++++++++++++++ custom_dc/faq.md | 8 ++-- custom_dc/troubleshooting.md | 6 +-- 5 files changed, 133 insertions(+), 65 deletions(-) create mode 100644 custom_dc/database_update.md diff --git a/custom_dc/custom_data.md b/custom_dc/custom_data.md index 1aa32e5ae..2ec31a12c 100644 --- a/custom_dc/custom_data.md +++ b/custom_dc/custom_data.md @@ -54,7 +54,7 @@ San Francisco,2023,300,300,200,50 San Jose,2023,400,400,300,0 ``` -The _ENTITY_ is an existing property in the Data Commons knowledge graph that is used to describe an entity, most commonly a place. The best way to think of the entity type is as a key that could be used to join to other data sets. The column heading can be expressed as any existing place-related property; see [Place types](/place_types.html) for a full list. It may also be any of the special DCID prefixes listed in [Special place names](#special-names). +The _ENTITY_ is an existing property in the Data Commons knowledge graph that is used to describe an entity, most commonly a place. The best way to think of the entity type is as a key that could be used to join to other data sets. The column heading can be expressed as any existing place-related property; see [Place types](/place_types.html) for a full list. It may also be any of the special DCID prefixes listed in [Special place names](#special-names). > **Note:** The type of the entities in a single file should be unique; do not mix multiple entity types in the same CSV file. For example, if you have observations for cities and counties, put all the city data in one CSV file and all the county data in another one. @@ -105,11 +105,11 @@ The config.json file specifies how the CSV contents should be mapped and resolve Here is the general spec for the JSON file:
-{  
-  "inputFiles": {  
-    "FILE_NAME1": {  
-      "entityType": "ENTITY_PROPERTY",  
-      "ignoreColumns": ["COLUMN1", "COLUMN2", ...],  
+{
+  "inputFiles": {
+    "FILE_NAME1": {
+      "entityType": "ENTITY_PROPERTY",
+      "ignoreColumns": ["COLUMN1", "COLUMN2", ...],
       "provenance": "NAME",
       "observationProperties": {
         "unit": "MEASUREMENT_UNIT",
@@ -117,37 +117,37 @@ Here is the general spec for the JSON file:
         "scalingFactor": "DENOMINATOR_VALUE",
         "measurementMethod": "METHOD"
       }
-    },  
-    "FILE_NAME2": {  
-     ...  
-    },  
- ...  
-  "variables": {  
-    "VARIABLE1": {"group": "GROUP_NAME1"},  
-    "VARIABLE2": {"group": "GROUP_NAME1"},  
-    "VARIABLE3": {  
-      "name": "DISPLAY_NAME",  
-      "description": "DESCRIPTION",  
-      "searchDescriptions": ["SENTENCE1", "SENTENCE2", ...],  
-      "group": "GROUP_NAME2",  
-      "properties": {  
-        "PROPERTY_NAME1":"VALUE",  
-        "PROPERTY_NAME2":"VALUE",  
-         …  
-           }  
-    },  
-  },   
-  "sources": {  
-    "SOURCE_NAME1": {  
-      "url": "URL",  
-      "provenances": {  
-        "PROVENANCE_NAME1": "URL",  
-        "PROVENANCE_NAME2": "URL",  
-        ...  
-      }  
-    }  
-  }  
-}  
+    },
+    "FILE_NAME2": {
+     ...
+    },
+ ...
+  "variables": {
+    "VARIABLE1": {"group": "GROUP_NAME1"},
+    "VARIABLE2": {"group": "GROUP_NAME1"},
+    "VARIABLE3": {
+      "name": "DISPLAY_NAME",
+      "description": "DESCRIPTION",
+      "searchDescriptions": ["SENTENCE1", "SENTENCE2", ...],
+      "group": "GROUP_NAME2",
+      "properties": {
+        "PROPERTY_NAME1": "VALUE",
+        "PROPERTY_NAME2": "VALUE",
+        …
+      }
+    },
+  },
+  "sources": {
+    "SOURCE_NAME1": {
+      "url": "URL",
+      "provenances": {
+        "PROVENANCE_NAME1": "URL",
+        "PROVENANCE_NAME2": "URL",
+        ...
+      }
+    }
+  }
+}
 
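As a concrete illustration, a minimal config.json might look like the following sketch. All file, variable, entity-type, and source values here are placeholders for illustration only; substitute your own.

```
{
  "inputFiles": {
    "average_annual_wage.csv": {
      "entityType": "Country",
      "provenance": "Example Provenance",
      "observationProperties": {
        "unit": "USD"
      }
    }
  },
  "variables": {
    "average_annual_wage": {
      "name": "Average annual wage",
      "group": "Wages"
    }
  },
  "sources": {
    "Example Source": {
      "url": "https://example.org",
      "provenances": {
        "Example Provenance": "https://example.org/annual-wages"
      }
    }
  }
}
```
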
Each section contains some required and optional fields, which are described in detail below. @@ -192,7 +192,7 @@ You must specify the provenance details under `sources`.`provenances`; this fiel - [`unit`](/glossary.html#unit): The unit of measurement used in the observations. This is a string representing a currency, area, weight, volume, etc. For example, `SquareFoot`, `USD`, `Barrel`, etc. - [`measurementPeriod`](/glossary.html#observation-period): The period of time in which the observations were recorded. This must be in ISO duration format, namely `P[0-9][Y|M|D|h|m|s]`. For example, `P1Y` is 1 year, `P3M` is 3 months, `P3h` is 3 hours. - [`measurementMethod`](/glossary.html#measurement-method): The method used to gather the observations. This can be a random string or an existing DCID of [`MeasurementMethodEnum`](https://datacommons.org/browser/MeasurementMethodEnum){: target="_blank"} type; for example, `EDA_Estimate` or `WorldBankEstimate`. -- [`scalingFactor`](/glossary.html#scaling-factor): An integer representing the denominator used in measurements involving ratios or percentages. For example, for percentages, the denominator would be `100`. +- [`scalingFactor`](/glossary.html#scaling-factor): An integer representing the denominator used in measurements involving ratios or percentages. For example, for percentages, the denominator would be `100`. Note that you cannot mix different property values in a single CSV file. If you have observations using different properties, you must put them in separate CSV files. @@ -204,7 +204,7 @@ The `variables` section is optional. You can use it to override names and associ `name` -: The display name of the variable, which will show up in the site's exploration tools. If not specified, the column name is used as the display name. +: The display name of the variable, which will show up in the site's exploration tools. If not specified, the column name is used as the display name. The name should be concise and precise; that is, the shortest possible name that allow humans to uniquely identify a given variable. The name is used to generate NL embeddings. `description` @@ -234,7 +234,7 @@ Each property is specified as a key:value pair. Here are some examples: You can have a multi-level group hierarchy by using `/` as a separator between each group. -`searchDescriptions` +`searchDescriptions` : An array of descriptions to be used for creating more NL embeddings for the variable. This is only needed if the variable `name` is not sufficient for generating embeddings. @@ -261,12 +261,12 @@ The `sources` section is optional. It encodes the sources and provenances associ The following procedures show you how to load and serve your custom data locally. -To load data in Google Cloud, see instead [Load data in Google Cloud](/custom_dc/deploy_cloud.html) for procedures. +To load data in Google Cloud, see instead [Load data in Google Cloud](/custom_dc/data_cloud.html) for procedures. ### Configure environment variables Edit the `env.list` file you created [previously](/custom_dc/quickstart.html#env-vars) as follows: -- Set the `INPUT_DIR` variable to the directory where your input files are stored. +- Set the `INPUT_DIR` variable to the directory where your input files are stored. - Set the `OUTPUT_DIR` variable to the directory where you would like the output files to be stored. This can be the same or different from the input directory. When you rerun the Docker data management container, it will create a `datacommons` subdirectory under this directory. 
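For example, these two lines in `env.list` might look like the following (the paths are hypothetical placeholders; point them at your own directories):

```
INPUT_DIR=/home/me/custom_dc/input
OUTPUT_DIR=/home/me/custom_dc/output
```
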
### Start the Docker containers with local custom data {#docker-data} @@ -303,7 +303,7 @@ If you need to troubleshoot custom data, it is helpful to inspect the contents o To do so, from a terminal window, open the database: -
  
+
 sqlite3 OUTPUT_DIRECTORY/datacommons/datacommons.db
 
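From the sqlite shell, you can then spot-check what was imported. For example, assuming the `observations` table created by the data management container (the table name is an assumption here, not shown above):

```
sqlite> .tables
sqlite> SELECT * FROM observations LIMIT 5;
```
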
@@ -328,3 +328,5 @@ country/BEL|average_annual_wage|2005|55662.21541|c/p/1 To exit the sqlite shell, press Ctrl-D. +### Database schema updates + diff --git a/custom_dc/data_cloud.md b/custom_dc/data_cloud.md index 824fbdb3b..3d50e4898 100644 --- a/custom_dc/data_cloud.md +++ b/custom_dc/data_cloud.md @@ -15,11 +15,11 @@ This page shows you how to store your custom data in Google Cloud, and create th ## Overview -Once you have tested locally, the next step is to get your data into the Google Cloud Platform. You upload your CSV and JSON files to [Google Cloud Storage](https://cloud.google.com/storage){: target="_blank"}, and run the Data Commons data management Docker container as a Cloud Run job. The job will transform and store the data in a [Google Cloud SQL](https://cloud.google.com/sql){: target="_blank"} database, and generate NL embeddings stored in Cloud Storage. +Once you have tested locally, the next step is to get your data into the Google Cloud Platform. You upload your CSV and JSON files to [Google Cloud Storage](https://cloud.google.com/storage){: target="_blank"}, and run the Data Commons data management Docker container as a Cloud Run job. The job will transform and store the data in a [Google Cloud SQL](https://cloud.google.com/sql){: target="_blank"} database, and generate NL embeddings stored in Cloud Storage. ![data management setup](/assets/images/custom_dc/customdc_setup3.png) -Alternatively, if you have a very large data set, you may find it faster to store your input files and run the data management container locally, and output the data to Google Cloud Storage. If you would like to use this approach, follow steps 1 to 3 of the one-time setup steps below and then skip to [Run the data management container locally](#run-local). +Alternatively, if you have a very large data set, you may find it faster to store your input files and run the data management container locally, and output the data to Google Cloud Storage. If you would like to use this approach, follow steps 1 to 3 of the one-time setup steps below and then skip to [Run the data management container locally](#run-local). ## Prerequisites @@ -42,10 +42,10 @@ This stores the CSV and JSON files that you will upload whenever your data chang 1. For the **Location type**, choose the same regional options as for Cloud SQL above. 1. When you have finished setting all the configuration options, click **Create**. 1. In the **Bucket Details** page, click **Create Folder** to create a new folder to hold your data and name it as desired. -1. Optionally, create separate folders to hold input and output files, or just use the same one as for the input. +1. Optionally, create separate folders to hold input and output files, or just use the same one as for the input. **Note:** If you plan to run the data management container locally, you only need to create a single folder to hold the output files. -1. Record the folder path(s) as gs://BUCKET_NAME/FOLDER_PATH for setting the `INPUT_DIR` and `OUTPUT_DIR` environment variables below. +1. Record the folder path(s) as gs://BUCKET_NAME/FOLDER_PATH for setting the `INPUT_DIR` and `OUTPUT_DIR` environment variables below. ### Step 3: Create a Google Cloud SQL instance @@ -64,7 +64,7 @@ This stores the data that will be served at run time. The Data Commons data mana 1. Select **Databases**. 1. Click **Create Database**. 1. Choose a name for the database or use the default, `datacommons`. -1. Click **Create**. +1. Click **Create**. 1. 
In the **Overview** page for the new instance, record the **Connection name** to set in environment variables in the next step. ### Step 4: Create a Google Cloud Run job @@ -93,13 +93,13 @@ Now set environment variables: 1. Add names and values for the following environment variables: - `USE_CLOUDSQL`: Set to `true`. - `DC_API_KEY`: Set to your API key. - - `INPUT_DIR`: Set to the Cloud Storage bucket and input folder that you created in step 2 above. + - `INPUT_DIR`: Set to the Cloud Storage bucket and input folder that you created in step 2 above. - `OUTPUT_DIR`: Set to the Cloud Storage bucket (and, optionally, output folder) that you created in step 2 above. If you didn't create a separate folder for output, specify the same folder as the `INPUT_DIR`. - `CLOUDSQL_INSTANCE`: Set to the full connection name of the instance you created in step 3 above. - `DB_USER`: Set to a user you configured when you created the instance in step 3, or to `root` if you didn't create a new user. - `DB_PASS`: Set to the user's or root password you configured when you created the instance in step 3. - `DB_NAME`: Only set this if you configured the database name to something other than `datacommons`. -1. When you finished, click **Done**. +1. When you have finished, click **Done**. ![Cloud Run job](/assets/images/custom_dc/gcp_screenshot3.png){: width="450" } @@ -116,7 +116,7 @@ Now set environment variables: > **Note:** Do not upload the local `datacommons` subdirectory or its files. -As you are iterating on changes to the source CSV and JSON files, you can re-upload them at any time, either overwriting existing files or creating new folders. To load them into Cloud SQL, you run the Cloud Run job you created above. +As you are iterating on changes to the source CSV and JSON files, you can re-upload them at any time, either overwriting existing files or creating new folders. To load them into Cloud SQL, you run the Cloud Run job you created above. ### Step 2: Start the data management Cloud Run job {#run-job} @@ -128,7 +128,7 @@ To run the job: 1. Go to [https://console.cloud.google.com/run/jobs](https://console.cloud.google.com/run/jobs){: target="_blank"} for your project. 1. From the list of jobs, click the link of the "datacommons-data" job you created above. -1. Click **Execute**. It will take several minutes for the job to run. You can click the **Logs** tab to view the progress. +1. Click **Execute**. It will take several minutes for the job to run. You can click the **Logs** tab to view the progress. When it completes, to verify that the data has been loaded correctly, see the next step. @@ -157,7 +157,7 @@ Before you proceed, ensure you have completed steps 1 to 3 of the [One-time setu ### Step 1: Set environment variables To run a local instance of the services container, you need to set all the environment variables in the `custom_dc/env.list` file. See [above](#set-vars) for the details, with the following differences: -- For the `INPUT_DIR`, specify the full local path where your CSV and JSON files are stored, as described in the [Quickstart](/custom_dc/quickstart.html#env-vars). +- For the `INPUT_DIR`, specify the full local path where your CSV and JSON files are stored, as described in the [Quickstart](/custom_dc/quickstart.html#env-vars). - Set `GOOGLE_CLOUD_PROJECT` to your GCP project name. 
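Putting it together, an `env.list` for this workflow might look something like the following sketch; every value below is a placeholder to replace with your own project, bucket, instance, and credentials:

```
USE_CLOUDSQL=true
DC_API_KEY=<your API key>
INPUT_DIR=/home/me/custom_dc/input
OUTPUT_DIR=gs://my-bucket/output
CLOUDSQL_INSTANCE=my-project:us-central1:my-instance
DB_USER=root
DB_PASS=<your password>
GOOGLE_CLOUD_PROJECT=my-project
```
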
### Step 2: Generate credentials for Google Cloud authentication {#gen-creds} @@ -170,13 +170,13 @@ Open a terminal window and run the following command: gcloud auth application-default login ``` -This opens a browser window that prompts you to enter credentials, sign in to Google Auth Library and allow Google Auth Library to access your account. Accept the prompts. When it has completed, a credential JSON file is created in +This opens a browser window that prompts you to enter credentials, sign in to Google Auth Library and allow Google Auth Library to access your account. Accept the prompts. When it has completed, a credential JSON file is created in `$HOME/.config/gcloud/application_default_credentials.json`. Use this in the command below to authenticate from the docker container. The first time you run it, may be prompted to specify a quota project for billing that will be used in the credentials file. If so, run this command: -
  
-gcloud auth application-default set-quota-project PROJECT_ID  
+
+gcloud auth application-default set-quota-project PROJECT_ID
 
If you are prompted to install the Cloud Resource Manager API, press `y` to accept. @@ -215,7 +215,7 @@ See the section [above](#gen-creds) for procedures. From the root directory of your repo, run the following command, assuming you are using a locally built image: -
  
+
 docker run -it \
 --env-file $PWD/custom_dc/env.list \
 -p 8080:8080 \
diff --git a/custom_dc/database_update.md b/custom_dc/database_update.md
new file mode 100644
index 000000000..54ee1d430
--- /dev/null
+++ b/custom_dc/database_update.md
@@ -0,0 +1,68 @@
+---
+layout: default
+title: Update your database schema
+nav_order: 9
+parent: Build your own Data Commons
+---
+
+{:.no_toc}
+# Update your database schema
+
+While starting Data Commons services, you may see an error that starts with `SQL schema check failed`. This means your database schema must be updated for compatibility with the latest Data Commons services.
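+
+For example, the services container log may include a message along these lines (the exact wording can vary by version):
+
+```
+SQL schema check failed. The following columns are missing: ...
+```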
+
+You can update your database by running a data management job with the environment variable `SCHEMA_UPDATE_ONLY` set to `true`. This will alter your database without modifying already-imported data.
+
+Running a data management job in the default mode will also update the database schema, but may take longer since it fully re-imports your custom data.
+
+Once your database is updated, starting Data Commons services should succeed.
+
+This page contains detailed instructions for passing `SCHEMA_UPDATE_ONLY` to the data management container using various workflows.
+
+* TOC
+{:toc}
+
+## Local data management job with local SQLite database
+
+Add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command for the data management container (the first command in [this doc section](/custom_dc/custom_data.html#docker-data)):
+
+
+docker run \
+--env-file $PWD/custom_dc/env.list \
+-v INPUT_DIRECTORY:INPUT_DIRECTORY \
+-v OUTPUT_DIRECTORY:OUTPUT_DIRECTORY \
+-e SCHEMA_UPDATE_ONLY=true \
+gcr.io/datcom-ci/datacommons-data:stable
+
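+
+When the job finishes, restart the services container as usual; the schema check should now pass. As a sketch, for the local SQLite setup that command is typically the following, assuming the standard services image and the same mounted directories (adjust to your own setup):
+
+```
+docker run -it \
+--env-file $PWD/custom_dc/env.list \
+-p 8080:8080 \
+-v INPUT_DIRECTORY:INPUT_DIRECTORY \
+-v OUTPUT_DIRECTORY:OUTPUT_DIRECTORY \
+gcr.io/datcom-ci/datacommons-services:stable
+```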
+ +## Cloud Run data management job + +Run your existing Cloud Run job with an environment variable override. + +1. Go to [https://console.cloud.google.com/run/jobs](https://console.cloud.google.com/run/jobs){: target="_blank"} for your project. +1. From the list of jobs, click the link of the "datacommons-data" job. This should be a job that uses the `stable` or `latest` version of the image hosted at gcr.io/datcom-ci/datacommons-data:stable. +1. Next to Execute, use the dropdown to find the option to **Execute with overrides**. +1. Use the **Add variable** button to set a variable with name `SCHEMA_UPDATE_ONLY` and value `true`. +1. Click **Execute**. +1. It should only take a few minutes for the job to run. You can click the **Logs** tab to view the progress. + + +## (Advanced) Local data management job with Cloud SQL + +If you followed [these instructions](/custom_dc/data_cloud.html#run-local) to load data from your local machine into a Cloud SQL database, add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command from the final step: + +
+docker run -it \
+--env-file $PWD/custom_dc/env.list \
+-p 8080:8080 \
+-e DEBUG=true \
+-e GOOGLE_APPLICATION_CREDENTIALS=/gcp/creds.json \
+-e SCHEMA_UPDATE_ONLY=true \
+-v $HOME/.config/gcloud/application_default_credentials.json:/gcp/creds.json:ro \
+-v INPUT_DIRECTORY:INPUT_DIRECTORY \
+-v OUTPUT_DIRECTORY:OUTPUT_DIRECTORY \
+[-v $PWD/server/templates/custom_dc/custom:/workspace/server/templates/custom_dc/custom \]
+[-v $PWD/static/custom_dc/custom:/workspace/static/custom_dc/custom \]
+IMAGE_NAME:IMAGE_TAG
+
+ +Substitute the `IMAGE_TAG` that matches the services container image which threw the schema check error. diff --git a/custom_dc/faq.md b/custom_dc/faq.md index 6f92695eb..d9d381f33 100644 --- a/custom_dc/faq.md +++ b/custom_dc/faq.md @@ -1,7 +1,7 @@ --- layout: default title: Frequently asked questions -nav_order: 9 +nav_order: 11 parent: Build your own Data Commons --- @@ -21,16 +21,16 @@ Please see [Send feedback](/custom_dc/index.html#feedback) for details. ### Can I restrict access to my custom instance? -Yes; there are many options for doing so. If you want an entirely private site with a non-public domain, you may consider using a Google Virtual Private Cloud to host your instance. If you want to have authentication and authorization controls on your site, there are also many other options. Please see [Restricting ingress for Cloud Run](https://cloud.google.com/run/docs/securing/ingress) for more information. +Yes; there are many options for doing so. If you want an entirely private site with a non-public domain, you may consider using a Google Virtual Private Cloud to host your instance. If you want to have authentication and authorization controls on your site, there are also many other options. Please see [Restricting ingress for Cloud Run](https://cloud.google.com/run/docs/securing/ingress) for more information. Note that you cannot apply fine-grained access restrictions, such as access to specific data or pages. Access is either all or nothing. If you want to be able to partition off data, you would need to create additional custom instances. ### Will my data or queries end up in base Data Commons? {#data-security} Your user queries, observations data, or property values are never transferred to base Data Commons. The NL model built from your custom data lives solely in your custom instance. The custom Data Commons instance does make API calls to the base Data Commons instance (as depicted in [this diagram](/custom_dc/index.html#system-overview)) only in the following instances: -- At data load time, API calls are made from the custom instance to the base instance to resolve entity names to [DCIDs](/glossary.html#dcid); for example, if your data refers to a particular country name, the custom instance will send an API request to look up its DCID. +- At data load time, API calls are made from the custom instance to the base instance to resolve entity names to [DCIDs](/glossary.html#dcid); for example, if your data refers to a particular country name, the custom instance will send an API request to look up its DCID. - At run time, when a user enters an NL query, the custom instance uses its local NL model to identify the relevant statistical variables. The custom instance then issues two requests for statistical variable observations: a SQL query to your custom SQL database and an API call to the base Data Commons database. These requests only include DCIDs and contain no information about the original query or context of the user request. The data is joined by entity DCIDs. -- At run time, when the website frontend renders a data visualization, it will also make the same two requests to get observations data. +- At run time, when the website frontend renders a data visualization, it will also make the same two requests to get observations data. 
## Natural language processing diff --git a/custom_dc/troubleshooting.md b/custom_dc/troubleshooting.md index 753bde067..776e528e2 100644 --- a/custom_dc/troubleshooting.md +++ b/custom_dc/troubleshooting.md @@ -1,7 +1,7 @@ --- layout: default title: Troubleshooting -nav_order: 9 +nav_order: 10 parent: Build your own Data Commons --- @@ -45,9 +45,7 @@ This indicates that you have not specified API keys in the environment file. Fol ### "SQL schema check failed" -This error indicates that there is a problem with the database schema. Check for the following additional error: - -- "The following columns are missing..." -- This indicates that there has been an update to the database schema. To remedy this, rerun the data management Docker container and then restart the services container. +This error indicates that there is a problem with the database schema. [Update your database schema](/custom_dc/database_update.html), then restart the services container. ## Local build errors From a3930815ca5a27156c2d8a2192a3f3a4bf2c0720 Mon Sep 17 00:00:00 2001 From: Hannah Pho Date: Mon, 21 Oct 2024 23:59:06 -0400 Subject: [PATCH 2/4] Open links in new tab --- custom_dc/database_update.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/custom_dc/database_update.md b/custom_dc/database_update.md index 54ee1d430..d4bcab3af 100644 --- a/custom_dc/database_update.md +++ b/custom_dc/database_update.md @@ -23,7 +23,7 @@ This page contains detailed instructions for passing `SCHEMA_UPDATE_ONLY` to the ## Local data management job with local SQLite database -Add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command for the data management container (the first command in [this doc section](/custom_dc/custom_data.html#docker-data)): +Add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command for the data management container (the first command in [this doc section](/custom_dc/custom_data.html#docker-data){: target="_blank"}):
 docker run \
@@ -48,7 +48,7 @@ Run your existing Cloud Run job with an environment variable override.
 
 ## (Advanced) Local data management job with Cloud SQL
 
-If you followed [these instructions](/custom_dc/data_cloud.html#run-local) to load data from your local machine into a Cloud SQL database, add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command from the final step:
+If you followed [these instructions](/custom_dc/data_cloud.html#run-local){: target="_blank"} to load data from your local machine into a Cloud SQL database, add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command from the final step:
 
 
 docker run -it \

From de61ea6b19ef7770057eb52aa6e935a075a4b250 Mon Sep 17 00:00:00 2001
From: Hannah Pho 
Date: Tue, 22 Oct 2024 00:01:42 -0400
Subject: [PATCH 3/4] Use the right command for local job w Cloud SQL

---
 custom_dc/database_update.md | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/custom_dc/database_update.md b/custom_dc/database_update.md
index d4bcab3af..5a5d36da7 100644
--- a/custom_dc/database_update.md
+++ b/custom_dc/database_update.md
@@ -51,18 +51,14 @@ Run your existing Cloud Run job with an environment variable override.
 If you followed [these instructions](/custom_dc/data_cloud.html#run-local){: target="_blank"} to load data from your local machine into a Cloud SQL database, add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command from the final step:
 
 
-docker run -it \
+docker run \
 --env-file $PWD/custom_dc/env.list \
--p 8080:8080 \
--e DEBUG=true \
--e GOOGLE_APPLICATION_CREDENTIALS=/gcp/creds.json \
--e SCHEMA_UPDATE_ONLY=true \
--v $HOME/.config/gcloud/application_default_credentials.json:/gcp/creds.json:ro \
 -v INPUT_DIRECTORY:INPUT_DIRECTORY \
 -v OUTPUT_DIRECTORY:OUTPUT_DIRECTORY \
-[-v $PWD/server/templates/custom_dc/custom:/workspace/server/templates/custom_dc/custom \]
-[-v $PWD/static/custom_dc/custom:/workspace/static/custom_dc/custom \]
-IMAGE_NAME:IMAGE_TAG
+-e GOOGLE_APPLICATION_CREDENTIALS=/gcp/creds.json \
+-v $HOME/.config/gcloud/application_default_credentials.json:/gcp/creds.json:ro \
+-e SCHEMA_UPDATE_ONLY=true \
+gcr.io/datcom-ci/datacommons-data:VERSION
 
-Substitute the `IMAGE_TAG` that matches the services container image which threw the schema check error. +Substitute the `VERSION` that matches the services container image which failed with a schema check error (typically either `stable` or `latest`). From 8ad34b45e99603887bcbdc6bf8562a3f919aae75 Mon Sep 17 00:00:00 2001 From: Hannah Pho Date: Fri, 25 Oct 2024 13:57:58 -0400 Subject: [PATCH 4/4] Change mode var name --- custom_dc/database_update.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/custom_dc/database_update.md b/custom_dc/database_update.md index 5a5d36da7..212f1bacc 100644 --- a/custom_dc/database_update.md +++ b/custom_dc/database_update.md @@ -10,27 +10,27 @@ parent: Build your own Data Commons While starting Data Commons services, you may see an error that starts with `SQL schema check failed`. This means your database schema must be updated for compatibility with the latest Data Commons services. -You can update your database by running a data management job with the environment variable `SCHEMA_UPDATE_ONLY` set to `true`. This will alter your database without modifying already-imported data. +You can update your database by running a data management job with the environment variable `DATA_RUN_MODE` set to `schemaupdate`. This will alter your database without modifying already-imported data. Running a data management job in the default mode will also update the database schema, but may take longer since it fully re-imports your custom data. Once your database is updated, starting Data Commons services should succeed. -This page contains detailed instructions for passing `SCHEMA_UPDATE_ONLY` to the data management container using various workflows. +This page contains detailed instructions for passing `DATA_RUN_MODE` to the data management container using various workflows. * TOC {:toc} ## Local data management job with local SQLite database -Add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command for the data management container (the first command in [this doc section](/custom_dc/custom_data.html#docker-data){: target="_blank"}): +Add `-e DATA_RUN_MODE=schemaupdate` to the Docker run command for the data management container (the first command in [this doc section](/custom_dc/custom_data.html#docker-data){: target="_blank"}):
 docker run \
 --env-file $PWD/custom_dc/env.list \
 -v INPUT_DIRECTORY:INPUT_DIRECTORY \
 -v OUTPUT_DIRECTORY:OUTPUT_DIRECTORY \
--e SCHEMA_UPDATE_ONLY=true \
+-e DATA_RUN_MODE=schemaupdate \
 gcr.io/datcom-ci/datacommons-data:stable
 
@@ -41,14 +41,14 @@ Run your existing Cloud Run job with an environment variable override. 1. Go to [https://console.cloud.google.com/run/jobs](https://console.cloud.google.com/run/jobs){: target="_blank"} for your project. 1. From the list of jobs, click the link of the "datacommons-data" job. This should be a job that uses the `stable` or `latest` version of the image hosted at gcr.io/datcom-ci/datacommons-data:stable. 1. Next to Execute, use the dropdown to find the option to **Execute with overrides**. -1. Use the **Add variable** button to set a variable with name `SCHEMA_UPDATE_ONLY` and value `true`. +1. Use the **Add variable** button to set a variable with name `DATA_RUN_MODE` and value `schemaupdate`. 1. Click **Execute**. 1. It should only take a few minutes for the job to run. You can click the **Logs** tab to view the progress. ## (Advanced) Local data management job with Cloud SQL -If you followed [these instructions](/custom_dc/data_cloud.html#run-local){: target="_blank"} to load data from your local machine into a Cloud SQL database, add `-e SCHEMA_UPDATE_ONLY=true` to the Docker run command from the final step: +If you followed [these instructions](/custom_dc/data_cloud.html#run-local){: target="_blank"} to load data from your local machine into a Cloud SQL database, add `-e DATA_RUN_MODE=schemaupdate` to the Docker run command from the final step:
 docker run \
@@ -57,7 +57,7 @@ docker run \
 -v OUTPUT_DIRECTORY:OUTPUT_DIRECTORY \
 -e GOOGLE_APPLICATION_CREDENTIALS=/gcp/creds.json \
 -v $HOME/.config/gcloud/application_default_credentials.json:/gcp/creds.json:ro \
--e SCHEMA_UPDATE_ONLY=true \
+-e DATA_RUN_MODE=schemaupdate \
 gcr.io/datcom-ci/datacommons-data:VERSION