Commit

lint fix
ajaits committed Dec 6, 2023
1 parent 02a392c commit 5a4dcca
Showing 1 changed file with 117 additions and 115 deletions.
232 changes: 117 additions & 115 deletions import-automation/executor/app/configs.py
@@ -25,130 +25,132 @@


def _production():
return 'EXECUTOR_PRODUCTION' in os.environ


@dataclasses.dataclass
class ExecutorConfig:
"""Configurations for the executor."""
"""Configurations for the executor."""

# ID of the Google Cloud project that hosts the executor. The project
# needs to enable App Engine and Cloud Scheduler.
gcp_project_id: str = 'google.com:datcom-data'
# ID of the Google Cloud project that stores generated CSVs and MCFs. The
# project needs to enable Cloud Storage and gives the service account the
# executor uses sufficient permissions to read and write the bucket below.
gcs_project_id: str = 'datcom-204919'
# Name of the Cloud Storage bucket to store the generated data files
# for importing to prod.
storage_prod_bucket_name: str = 'datcom-prod-imports'
# Name of the Cloud Storage bucket that the Data Commons importer
# outputs to.
storage_importer_bucket_name: str = 'resolved_mcf'
# Data Commons importer output prefix in the
# storage_importer_bucket_name bucket.
storage_importer_output_prefix: str = 'external_tables'
# Name of the Cloud Storage bucket to store the generated data files
# for importing to dev.
storage_dev_bucket_name: str = 'unresolved_mcf'
# Executor output prefix in the storage_dev_bucket_name bucket.
storage_executor_output_prefix: str = 'datcom-dev-imports'
# Name of the file that specifies the most recently generated data files
# of an import. These files are stored in the bucket at a level higher
# than the data files. For example, for importing scripts/us_fed:treasury
# to dev, the bucket structure is
# unresolved_mcf
# -- datcom-dev-imports
# ---- scripts
# ------ us_fed
# -------- treasury
# ---------- latest_version.txt
# ---------- 2020_07_15T12_07_17_365264_07_00
# ------------ data.csv
# ---------- 2020_07_14T12_07_12_552234_07_00
# ------------ data.csv
# The content of latest_version.txt would be a single line of
# '2020_07_15T12_07_17_365264_07_00'.
storage_version_filename: str = 'latest_version.txt'
# Types of inputs accepted by the Data Commons importer. These are
# also the accepted fields of an import_inputs value in the manifest.
import_input_types: List[str] = ('template_mcf', 'cleaned_csv', 'node_mcf')
# OAuth Client ID used to authenticate with the import progress dashboard,
# which is protected by Identity-Aware Proxy. This can be found by going
# to the Identity-Aware Proxy of the Google Cloud project that hosts
# the dashboard and clicking 'Edit OAuth Client'.
dashboard_oauth_client_id: str = ''
# OAuth Client ID used to authenticate with the proxy.
importer_oauth_client_id: str = ''
# Access token of the account used to authenticate with GitHub. This is not
# the account password. See
# https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token.
github_auth_access_token: str = ''
# Username of the account used to authenticate with GitHub.
github_auth_username: str = ''
# Name of the repository that contains all the imports.
# On commits, this is the repository to which the pull requests must be sent
# to trigger the executor. The source repositories of the pull requests
# are downloaded.
# On updates, this is the repository that is downloaded.
github_repo_name: str = 'data'
# Username of the owner of the repository that contains all the imports.
github_repo_owner_username: str = 'datacommonsorg'
# Manifest filename. The executor uses this name to identify manifests.
manifest_filename: str = 'manifest.json'
# Python module requirements filename. The executor installs the modules
# listed in these files before running the user scripts.
requirements_filename: str = 'requirements.txt'
# ID of the location where Cloud Scheduler is hosted.
scheduler_location: str = 'us-central1'
# Maximum time a user script can run, in seconds.
user_script_timeout: float = 3600
# Arguments for the user script
user_script_args: List[str] = ()
# Environment variables for the user script
user_script_env: dict = None
# Maximum time venv creation can take in seconds.
venv_create_timeout: float = 3600
# Maximum time downloading a file can take in seconds.
file_download_timeout: float = 600
# Maximum time downloading the repo can take in seconds.
repo_download_timeout: float = 600
# Email account used to send notification emails about import progress
email_account: str = ''
# The corresponding password, app password, or access token.
email_token: str = ''
# Maximum time a blocking call to the importer to
# perform an import can take in seconds.
importer_import_timeout: float = 20 * 60
# Maximum time a blocking call to the importer to
# delete an import can take in seconds.
importer_delete_timeout: float = 10 * 60
# Executor type depends on where the executor runs.
# Supports one of: "GKE", "GAE".
executor_type: str = 'GAE'

def get_data_refresh_config(self):
"""Returns the config used for Cloud Scheduler data refresh jobs."""
fields = set([
'github_repo_name',
'github_repo_owner_username',
'github_auth_username',
'github_auth_access_token',
'dashboard_oauth_client_id',
'importer_oauth_client_id',
'email_account',
'email_token',
'gcs_project_id',
'storage_prod_bucket_name',
])
return {
k: v for k, v in dataclasses.asdict(self).items() if k in fields
}
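
For context, here is a minimal usage sketch of this dataclass and of get_data_refresh_config(); the import path and the override values below are illustrative assumptions, not part of this commit.

from app import configs  # assumed import path for executor/app/configs.py

# Override a few fields; everything else keeps the defaults defined above.
config = configs.ExecutorConfig(
    github_auth_username='example-bot',          # illustrative value
    github_auth_access_token='<personal-access-token>',
    storage_prod_bucket_name='my-dev-imports',   # illustrative value
)

# Subset of fields forwarded to Cloud Scheduler data refresh jobs.
refresh_config = config.get_data_refresh_config()
assert 'storage_prod_bucket_name' in refresh_config
assert 'user_script_timeout' not in refresh_config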


def _setup_logging():
client = logging.Client(project=ExecutorConfig.gcp_project_id)
client.get_default_handler()
client.setup_logging()


if _production():
_setup_logging()
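
As a supplementary illustration of the latest_version.txt convention described in the storage_version_filename comments above, a helper along these lines could resolve the newest output prefix for an import. The function read_latest_version, the import_dir value, and the use of the google-cloud-storage client are assumptions for this sketch; only the bucket, prefix, and filename defaults come from this file.

from google.cloud import storage


def read_latest_version(config: ExecutorConfig, import_dir: str) -> str:
    """Illustrative helper: returns the GCS prefix of the newest generated files.

    import_dir would be e.g. 'scripts/us_fed/treasury', matching the example
    bucket layout in the comments above.
    """
    client = storage.Client(project=config.gcs_project_id)
    bucket = client.bucket(config.storage_dev_bucket_name)  # 'unresolved_mcf'
    blob = bucket.blob(f'{config.storage_executor_output_prefix}/{import_dir}/'
                       f'{config.storage_version_filename}')
    # latest_version.txt contains a single line such as
    # '2020_07_15T12_07_17_365264_07_00'.
    version = blob.download_as_text().strip()
    return f'{config.storage_executor_output_prefix}/{import_dir}/{version}'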
