automl beta #1575

Merged · 7 commits · Jul 20, 2018
297 changes: 297 additions & 0 deletions language/automl/automl_natural_language_dataset.py
@@ -0,0 +1,297 @@
#!/usr/bin/env python

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This application demonstrates how to perform basic operations on Dataset
with the Google AutoML Natural Language API.

For more information, see the tutorial page at
https://cloud.google.com/natural-language/automl/docs/
"""

import argparse
import os


def create_dataset(project_id, compute_region, dataset_name, multilabel=False):
"""Create a dataset."""
# [START automl_natural_language_create_dataset]
# TODO(developer): Uncomment and set the following variables
# project_id = 'PROJECT_ID_HERE'
# compute_region = 'COMPUTE_REGION_HERE'
# dataset_name = 'DATASET_NAME_HERE'
# multilabel = True for multilabel or False for multiclass

from google.cloud import automl_v1beta1 as automl

client = automl.AutoMlClient()

# A resource that represents Google Cloud Platform location.
project_location = client.location_path(project_id, compute_region)

# Classification type is assigned based on multilabel value.
classification_type = "MULTICLASS"
if multilabel:
classification_type = "MULTILABEL"
Review comment (Contributor): Nothing wrong with the lines above, a more compact way is of course:
    classification_type = "MULTILABEL" if multilabel else "MULTICLASS"

Reply (Contributor, author): doesn't really affect snippet quality or functionality. won't fix.

    # Specify the text classification type for the dataset.
    dataset_metadata = {"classification_type": classification_type}

    # Set dataset name and metadata.
    my_dataset = {
        "display_name": dataset_name,
        "text_classification_dataset_metadata": dataset_metadata,
    }

    # Create a dataset with the dataset metadata in the region.
    dataset = client.create_dataset(project_location, my_dataset)

    # Display the dataset information.
    print("Dataset name: {}".format(dataset.name))
    print("Dataset id: {}".format(dataset.name.split("/")[-1]))
    print("Dataset display name: {}".format(dataset.display_name))
    print("Text classification dataset metadata:")
    print("\t{}".format(dataset.text_classification_dataset_metadata))
    print("Dataset example count: {}".format(dataset.example_count))
    print("Dataset create time:")
    print("\tseconds: {}".format(dataset.create_time.seconds))
    print("\tnanos: {}".format(dataset.create_time.nanos))

    # [END automl_natural_language_create_dataset]


def list_datasets(project_id, compute_region, filter_):
"""List all datasets."""
# [START automl_natural_language_list_datasets]
# TODO(developer): Uncomment and set the following variables
# project_id = 'PROJECT_ID_HERE'
# compute_region = 'COMPUTE_REGION_HERE'
# filter_ = 'filter expression here'

from google.cloud import automl_v1beta1 as automl

client = automl.AutoMlClient()

# A resource that represents Google Cloud Platform location.
project_location = client.location_path(project_id, compute_region)

# List all the datasets available in the region by applying filter.
response = client.list_datasets(project_location, filter_)

print("List of datasets:")
for dataset in response:
# Display the dataset information.
print("Dataset name: {}".format(dataset.name))
print("Dataset id: {}".format(dataset.name.split("/")[-1]))
print("Dataset display name: {}".format(dataset.display_name))
print("Text classification dataset metadata:")
print("\t{}".format(dataset.text_classification_dataset_metadata))
print("Dataset example count: {}".format(dataset.example_count))
print("Dataset create time:")
print("\tseconds: {}".format(dataset.create_time.seconds))
print("\tnanos: {}".format(dataset.create_time.nanos))

# [END automl_natural_language_list_datasets]
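
# Usage sketch (project and region values are hypothetical): the CLI entry
# point below defaults "filter_" to "text_classification_dataset_metadata:*",
# which limits the listing to text classification datasets:
#   list_datasets("my-project", "us-central1",
#                 "text_classification_dataset_metadata:*")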


def get_dataset(project_id, compute_region, dataset_id):
"""Get the dataset."""
# [START automl_natural_language_get_dataset]
# TODO(developer): Uncomment and set the following variables
# project_id = 'PROJECT_ID_HERE'
# compute_region = 'COMPUTE_REGION_HERE'
# dataset_id = 'DATASET_ID_HERE'

from google.cloud import automl_v1beta1 as automl

client = automl.AutoMlClient()

# Get the full path of the dataset
dataset_full_id = client.dataset_path(
project_id, compute_region, dataset_id
)

# Get complete detail of the dataset.
dataset = client.get_dataset(dataset_full_id)

# Display the dataset information.
print("Dataset name: {}".format(dataset.name))
print("Dataset id: {}".format(dataset.name.split("/")[-1]))
print("Dataset display name: {}".format(dataset.display_name))
print("Text classification dataset metadata:")
print("\t{}".format(dataset.text_classification_dataset_metadata))
print("Dataset example count: {}".format(dataset.example_count))
print("Dataset create time:")
print("\tseconds: {}".format(dataset.create_time.seconds))
print("\tnanos: {}".format(dataset.create_time.nanos))

# [END automl_natural_language_get_dataset]


def import_data(project_id, compute_region, dataset_id, path):
"""Import labelled items."""
# [START automl_natural_language_import_data]
# TODO(developer): Uncomment and set the following variables
# project_id = 'PROJECT_ID_HERE'
# compute_region = 'COMPUTE_REGION_HERE'
# dataset_id = 'DATASET_ID_HERE'
# path = 'gs://path/to/file.csv'

from google.cloud import automl_v1beta1 as automl

client = automl.AutoMlClient()

# Get the full path of the dataset.
dataset_full_id = client.dataset_path(
project_id, compute_region, dataset_id
)

# Get the multiple Google Cloud Storage URIs.
input_uris = path.split(",")
input_config = {"gcs_source": {"input_uris": input_uris}}

# Import the dataset from the input URI.
response = client.import_data(dataset_full_id, input_config)

print("Processing import...")
# synchronous check of operation status.
print("Data imported. {}".format(response.result()))

# [END automl_natural_language_import_data]
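
# Usage sketch (project, region, bucket, and dataset ID values are
# hypothetical): because import_data splits "path" on commas, several CSV
# files can be imported in a single call:
#   import_data("my-project", "us-central1", "YOUR_DATASET_ID",
#               "gs://my-bucket/part1.csv,gs://my-bucket/part2.csv")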


def export_data(project_id, compute_region, dataset_id, output_uri):
"""Export a dataset to a Google Cloud Storage bucket."""
# [START automl_natural_language_export_data]
# TODO(developer): Uncomment and set the following variables
# project_id = 'PROJECT_ID_HERE'
# compute_region = 'COMPUTE_REGION_HERE'
# dataset_id = 'DATASET_ID_HERE'
# output_uri: 'gs://location/to/export/data'

from google.cloud import automl_v1beta1 as automl

client = automl.AutoMlClient()

# Get the full path of the dataset.
dataset_full_id = client.dataset_path(
project_id, compute_region, dataset_id
)

# Set the output URI
output_config = {"gcs_destination": {"output_uri_prefix": output_uri}}

# Export the data to the output URI.
response = client.export_data(dataset_full_id, output_config)

print("Processing export...")
# synchronous check of operation status.
print("Data exported. {}".format(response.result()))

# [END automl_natural_language_export_data]


def delete_dataset(project_id, compute_region, dataset_id):
"""Delete a dataset."""
# [START automl_natural_language_delete_dataset]
# TODO(developer): Uncomment and set the following variables
# project_id = 'PROJECT_ID_HERE'
# compute_region = 'COMPUTE_REGION_HERE'
# dataset_id = 'DATASET_ID_HERE'

from google.cloud import automl_v1beta1 as automl

client = automl.AutoMlClient()

# Get the full path of the dataset.
dataset_full_id = client.dataset_path(
project_id, compute_region, dataset_id
)

# Delete a dataset.
response = client.delete_dataset(dataset_full_id)

# synchronous check of operation status.
print("Dataset deleted. {}".format(response.result()))

# [END automl_natural_language_delete_dataset]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="command")

    create_dataset_parser = subparsers.add_parser(
        "create_dataset", help=create_dataset.__doc__
    )
    create_dataset_parser.add_argument("dataset_name")
    create_dataset_parser.add_argument(
        "multilabel", nargs="?", choices=["False", "True"], default="False"
    )

    list_datasets_parser = subparsers.add_parser(
        "list_datasets", help=list_datasets.__doc__
    )
    list_datasets_parser.add_argument(
        "filter_", nargs="?", default="text_classification_dataset_metadata:*"
    )

    get_dataset_parser = subparsers.add_parser(
        "get_dataset", help=get_dataset.__doc__
    )
    get_dataset_parser.add_argument("dataset_id")

    import_data_parser = subparsers.add_parser(
        "import_data", help=import_data.__doc__
    )
    import_data_parser.add_argument("dataset_id")
    import_data_parser.add_argument("path")

    export_data_parser = subparsers.add_parser(
        "export_data", help=export_data.__doc__
    )
    export_data_parser.add_argument("dataset_id")
    export_data_parser.add_argument("output_uri")

    delete_dataset_parser = subparsers.add_parser(
        "delete_dataset", help=delete_dataset.__doc__
    )
    delete_dataset_parser.add_argument("dataset_id")

    project_id = os.environ["PROJECT_ID"]
    compute_region = os.environ["REGION_NAME"]

    args = parser.parse_args()

    if args.command == "create_dataset":
        multilabel = True if args.multilabel == "True" else False
Review comment (Contributor): This line can be simplified to:
    multilabel = (args.multilabel == "True")

Reply (Contributor, author): doesn't affect snippet quality or functionality. won't fix.

        create_dataset(
            project_id, compute_region, args.dataset_name, multilabel
        )
    if args.command == "list_datasets":
        list_datasets(project_id, compute_region, args.filter_)
    if args.command == "get_dataset":
        get_dataset(project_id, compute_region, args.dataset_id)
    if args.command == "import_data":
        import_data(project_id, compute_region, args.dataset_id, args.path)
    if args.command == "export_data":
        export_data(
            project_id, compute_region, args.dataset_id, args.output_uri
        )
    if args.command == "delete_dataset":
        delete_dataset(project_id, compute_region, args.dataset_id)