google_cloud_storage_downloader.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Downloads files stored under a Google Cloud Storage bucket directory to a local directory."""

import click
import logging
import os

from google.api_core import page_iterator
from google.cloud import storage

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)


def download_data(files: page_iterator.HTTPIterator, folder: str) -> None:
"""Method takes a page_iterator containing files located within a Google Cloud Storage bucket and downloads them
to the file location specified by the folder variable.
Args:
files: An iterator containing file names to download from a GCS bucket.
folder: A string containing a local file path where the data will be downloaded to.
Returns:
None.
"""
    logging.info('File download started; wait for the job to complete.')

    # create the folder locally if it does not already exist
    os.makedirs(folder, exist_ok=True)

    for file in files:
        logging.info('GCS File: {}'.format(file.name))
        destination_uri = '{}/{}'.format(folder, file.name.split('/')[-1])
        # ensure the downloaded file carries a .csv extension
        if not destination_uri.endswith('.csv'):
            destination_uri += '.csv'
        file.download_to_filename(destination_uri)
        logging.info('Exported {} to {}'.format(file.name, destination_uri))

    return None

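
# A minimal sketch of calling download_data() directly, mirroring what main() does below; the
# bucket name, prefix, and service-account key path are illustrative placeholders, not real resources:
#
#   client = storage.Client.from_service_account_json('service_account.json')
#   blobs = client.get_bucket('my-bucket').list_blobs(prefix='OMOP2OBO_2020/')
#   download_data(blobs, 'resources/clinical_data')

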
@click.command()
@click.option('--bucket_name', prompt='The name of the GCS bucket')
@click.option('--file_name', prompt='The GCS bucket directory (prefix) to download data from')
@click.option('--auth_json', prompt='The filepath to service_account.json file')
def main(bucket_name: str, file_name: str, auth_json: str) -> None:
    """Connects to a GCS bucket and downloads all files under a directory to resources/clinical_data."""

    # EXAMPLE INPUT ARGUMENTS
    # bucket_name = 'sandbox-tc.appspot.com'
    # file_name = 'OMOP2OBO_2020/'
    # auth_json = 'resources/programming/google_api/sandbox-tc-43a70953c062.json'

    # connect to the GCS bucket
    storage_client = storage.Client.from_service_account_json(auth_json)
    bucket = storage_client.get_bucket(bucket_name)
    files = bucket.list_blobs(prefix=file_name)  # no delimiter is passed, so all blobs under the prefix are returned

    # download the data files
    download_data(files, 'resources/clinical_data')


if __name__ == '__main__':
main()
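
# A minimal example of invoking this script from the command line; the bucket name, prefix, and
# service-account key path shown here are illustrative placeholders, not real resources:
#
#   python google_cloud_storage_downloader.py \
#       --bucket_name my-project.appspot.com \
#       --file_name OMOP2OBO_2020/ \
#       --auth_json path/to/service_account.json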