- On January 1, 2020 this library will no longer support Python 2 on the latest released version. - Previously released library versions will continue to be available. For more information please + As of January 1, 2020 this library no longer supports Python 2 on the latest released version. + Library versions released prior to that date will continue to be available. For more information please visit Python 2 support on Google Cloud.
{% block body %} {% endblock %} diff --git a/packages/google-cloud-dlp/docs/conf.py b/packages/google-cloud-dlp/docs/conf.py index fc9991d1ec40..cc9cc3485b21 100644 --- a/packages/google-cloud-dlp/docs/conf.py +++ b/packages/google-cloud-dlp/docs/conf.py @@ -20,6 +20,10 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath("..")) +# For plugins that can not read conf.py. +# See also: https://github.com/docascode/sphinx-docfx-yaml/issues/85 +sys.path.insert(0, os.path.abspath(".")) + __version__ = "" # -- General configuration ------------------------------------------------ @@ -38,21 +42,18 @@ "sphinx.ext.napoleon", "sphinx.ext.todo", "sphinx.ext.viewcode", + "recommonmark", ] # autodoc/autosummary flags autoclass_content = "both" -autodoc_default_flags = ["members"] +autodoc_default_options = {"members": True} autosummary_generate = True # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] -# Allow markdown includes (so releases.md can include CHANGLEOG.md) -# http://www.sphinx-doc.org/en/master/markdown.html -source_parsers = {".md": "recommonmark.parser.CommonMarkParser"} - # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] @@ -93,7 +94,12 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ["_build"] +exclude_patterns = [ + "_build", + "samples/AUTHORING_GUIDE.md", + "samples/CONTRIBUTING.md", + "samples/snippets/README.rst", +] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -293,7 +299,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, "google-cloud-dlp", u"google-cloud-dlp Documentation", [author], 1) + (master_doc, "google-cloud-dlp", u"google-cloud-dlp Documentation", [author], 1,) ] # If true, show URL addresses after external links. @@ -334,7 +340,7 @@ intersphinx_mapping = { "python": ("http://python.readthedocs.org/en/latest/", None), "google-auth": ("https://google-auth.readthedocs.io/en/stable", None), - "google.api_core": ("https://googleapis.dev/python/google-api-core/latest/", None), + "google.api_core": ("https://googleapis.dev/python/google-api-core/latest/", None,), "grpc": ("https://grpc.io/grpc/python/", None), } diff --git a/packages/google-cloud-dlp/noxfile.py b/packages/google-cloud-dlp/noxfile.py index cfaff4be5040..e27f448fbad6 100644 --- a/packages/google-cloud-dlp/noxfile.py +++ b/packages/google-cloud-dlp/noxfile.py @@ -23,14 +23,15 @@ import nox -BLACK_VERSION = "black==19.3b0" +BLACK_VERSION = "black==19.10b0" BLACK_PATHS = ["docs", "google", "tests", "noxfile.py", "setup.py"] -if os.path.exists("samples"): - BLACK_PATHS.append("samples") +DEFAULT_PYTHON_VERSION = "3.8" +SYSTEM_TEST_PYTHON_VERSIONS = ["2.7", "3.8"] +UNIT_TEST_PYTHON_VERSIONS = ["2.7", "3.5", "3.6", "3.7", "3.8"] -@nox.session(python="3.7") +@nox.session(python=DEFAULT_PYTHON_VERSION) def lint(session): """Run linters. @@ -38,7 +39,9 @@ def lint(session): serious code quality issues. 
""" session.install("flake8", BLACK_VERSION) - session.run("black", "--check", *BLACK_PATHS) + session.run( + "black", "--check", *BLACK_PATHS, + ) session.run("flake8", "google", "tests") @@ -53,10 +56,12 @@ def blacken(session): check the state of the `gcp_ubuntu_config` we use for that Kokoro run. """ session.install(BLACK_VERSION) - session.run("black", *BLACK_PATHS) + session.run( + "black", *BLACK_PATHS, + ) -@nox.session(python="3.7") +@nox.session(python=DEFAULT_PYTHON_VERSION) def lint_setup_py(session): """Verify that setup.py is valid (including RST check).""" session.install("docutils", "pygments") @@ -84,17 +89,21 @@ def default(session): ) -@nox.session(python=["2.7", "3.5", "3.6", "3.7", "3.8"]) +@nox.session(python=UNIT_TEST_PYTHON_VERSIONS) def unit(session): """Run the unit test suite.""" default(session) -@nox.session(python=["2.7", "3.7"]) +@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) def system(session): """Run the system test suite.""" system_test_path = os.path.join("tests", "system.py") system_test_folder_path = os.path.join("tests", "system") + + # Check the value of `RUN_SYSTEM_TESTS` env var. It defaults to true. + if os.environ.get("RUN_SYSTEM_TESTS", "true") == "false": + session.skip("RUN_SYSTEM_TESTS is set to false, skipping") # Sanity check: Only run tests if the environment variable is set. if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", ""): session.skip("Credentials must be set via environment variable") @@ -110,7 +119,9 @@ def system(session): # Install all test dependencies, then install this package into the # virtualenv's dist-packages. - session.install("mock", "pytest", "google-cloud-testutils") + session.install( + "mock", "pytest", "google-cloud-testutils", + ) session.install("-e", "test_utils") session.install("-e", ".") @@ -121,7 +132,7 @@ def system(session): session.run("py.test", "--quiet", system_test_folder_path, *session.posargs) -@nox.session(python="3.7") +@nox.session(python=DEFAULT_PYTHON_VERSION) def cover(session): """Run the final coverage report. 
@@ -134,19 +145,52 @@ def cover(session): session.run("coverage", "erase") -@nox.session(python="3.7") +@nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs for this library.""" session.install("-e", ".") - session.install("sphinx<3.0.0", "alabaster", "recommonmark") + session.install("sphinx", "alabaster", "recommonmark") + + shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) + session.run( + "sphinx-build", + # "-W", # warnings as errors + "-T", # show full traceback on exception + "-N", # no colors + "-b", + "html", + "-d", + os.path.join("docs", "_build", "doctrees", ""), + os.path.join("docs", ""), + os.path.join("docs", "_build", "html", ""), + ) + + +@nox.session(python=DEFAULT_PYTHON_VERSION) +def docfx(session): + """Build the docfx yaml files for this library.""" + + session.install("-e", ".") + session.install("sphinx", "alabaster", "recommonmark", "sphinx-docfx-yaml") shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) session.run( "sphinx-build", - "-W", # warnings as errors "-T", # show full traceback on exception "-N", # no colors + "-D", + ( + "extensions=sphinx.ext.autodoc," + "sphinx.ext.autosummary," + "docfx_yaml.extension," + "sphinx.ext.intersphinx," + "sphinx.ext.coverage," + "sphinx.ext.napoleon," + "sphinx.ext.todo," + "sphinx.ext.viewcode," + "recommonmark" + ), "-b", "html", "-d", diff --git a/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md b/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md new file mode 100644 index 000000000000..55c97b32f4c1 --- /dev/null +++ b/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md @@ -0,0 +1 @@ +See https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/CONTRIBUTING.md b/packages/google-cloud-dlp/samples/CONTRIBUTING.md new file mode 100644 index 000000000000..34c882b6f1a3 --- /dev/null +++ b/packages/google-cloud-dlp/samples/CONTRIBUTING.md @@ -0,0 +1 @@ +See https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/CONTRIBUTING.md \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/snippets/README.rst b/packages/google-cloud-dlp/samples/snippets/README.rst new file mode 100644 index 000000000000..0b25cc7acde0 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/README.rst @@ -0,0 +1,405 @@ + +.. This file is automatically generated. Do not edit this file directly. + +Google Data Loss Prevention Python Samples +=============================================================================== + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/README.rst + + +This directory contains samples for Google Data Loss Prevention. `Google Data Loss Prevention`_ provides programmatic access to a powerful detection engine for personally identifiable information and other privacy-sensitive data in unstructured data streams. + + + + +.. _Google Data Loss Prevention: https://cloud.google.com/dlp/docs/ + + +Setup +------------------------------------------------------------------------------- + + + +Authentication +++++++++++++++ + +This sample requires you to have authentication setup. Refer to the +`Authentication Getting Started Guide`_ for instructions on setting up +credentials for applications. + +.. 
_Authentication Getting Started Guide: + https://cloud.google.com/docs/authentication/getting-started + + + + +Install Dependencies +++++++++++++++++++++ + +#. Clone python-docs-samples and change directory to the sample directory you want to use. + + .. code-block:: bash + + $ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git + +#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions. + + .. _Python Development Environment Setup Guide: + https://cloud.google.com/python/setup + +#. Create a virtualenv. Samples are compatible with Python 3.6+. + + .. code-block:: bash + + $ virtualenv env + $ source env/bin/activate + +#. Install the dependencies needed to run the samples. + + .. code-block:: bash + + $ pip install -r requirements.txt + +.. _pip: https://pip.pypa.io/ +.. _virtualenv: https://virtualenv.pypa.io/ + + + + + + +Samples +------------------------------------------------------------------------------- + + +Quickstart ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/quickstart.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python quickstart.py + + + + +Inspect Content ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/inspect_content.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python inspect_content.py + + + usage: inspect_content.py [-h] {string,table,file,gcs,datastore,bigquery} ... + + Sample app that uses the Data Loss Prevention API to inspect a string, a local + file or a file on Google Cloud Storage. + + positional arguments: + {string,table,file,gcs,datastore,bigquery} + Select how to submit content to the API. + string Inspect a string. + table Inspect a table. + file Inspect a local file. + gcs Inspect files on Google Cloud Storage. + datastore Inspect files on Google Datastore. + bigquery Inspect files on Google BigQuery. + + optional arguments: + -h, --help show this help message and exit + + + + + +Redact Content ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/redact.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python redact.py + + + usage: redact.py [-h] {info_types,all_text} ... + + Sample app that uses the Data Loss Prevent API to redact the contents of an + image file. + + positional arguments: + {info_types,all_text} + Select which content should be redacted. + info_types Redact specific infoTypes from an image. + all_text Redact all text from an image. The MIME type of the + file is inferred via the Python standard library's + mimetypes module. 
+ + optional arguments: + -h, --help show this help message and exit + + + + + +Metadata ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/metadata.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python metadata.py + + + usage: metadata.py [-h] [--language_code LANGUAGE_CODE] [--filter FILTER] + + Sample app that queries the Data Loss Prevention API for supported categories + and info types. + + optional arguments: + -h, --help show this help message and exit + --language_code LANGUAGE_CODE + The BCP-47 language code to use, e.g. 'en-US'. + --filter FILTER An optional filter to only return info types supported + by certain parts of the API. Defaults to + "supported_by=INSPECT". + + + + + +Jobs ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/jobs.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python jobs.py + + + usage: jobs.py [-h] {list,delete} ... + + Sample app to list and delete DLP jobs using the Data Loss Prevent API. + + positional arguments: + {list,delete} Select how to submit content to the API. + list List Data Loss Prevention API jobs corresponding to a given + filter. + delete Delete results of a Data Loss Prevention API job. + + optional arguments: + -h, --help show this help message and exit + + + + + +Templates ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/templates.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python templates.py + + + usage: templates.py [-h] {create,list,delete} ... + + Sample app that sets up Data Loss Prevention API inspect templates. + + positional arguments: + {create,list,delete} Select which action to perform. + create Create a template. + list List all templates. + delete Delete a template. + + optional arguments: + -h, --help show this help message and exit + + + + + +Triggers ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/triggers.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python triggers.py + + + usage: triggers.py [-h] {create,list,delete} ... + + Sample app that sets up Data Loss Prevention API automation triggers. + + positional arguments: + {create,list,delete} Select which action to perform. + create Create a trigger. + list List all triggers. + delete Delete a trigger. + + optional arguments: + -h, --help show this help message and exit + + + + + +Risk Analysis ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. 
image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/risk.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python risk.py + + + usage: risk.py [-h] {numerical,categorical,k_anonymity,l_diversity,k_map} ... + + Sample app that uses the Data Loss Prevent API to perform risk anaylsis. + + positional arguments: + {numerical,categorical,k_anonymity,l_diversity,k_map} + Select how to submit content to the API. + numerical + categorical + k_anonymity Computes the k-anonymity of a column set in a Google + BigQuerytable. + l_diversity Computes the l-diversity of a column set in a Google + BigQuerytable. + k_map Computes the k-map risk estimation of a column set in + a GoogleBigQuery table. + + optional arguments: + -h, --help show this help message and exit + + + + + +DeID ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/deid.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python deid.py + + + usage: deid.py [-h] + {deid_mask,deid_replace,deid_fpe,reid_fpe,deid_date_shift,replace_with_infotype} + ... + + Uses of the Data Loss Prevention API for deidentifying sensitive data. + + positional arguments: + {deid_mask,deid_replace,deid_fpe,reid_fpe,deid_date_shift,replace_with_infotype} + Select how to submit content to the API. + deid_mask Deidentify sensitive data in a string by masking it + with a character. + deid_replace Deidentify sensitive data in a string by replacing it + with another string. + deid_fpe Deidentify sensitive data in a string using Format + Preserving Encryption (FPE). + reid_fpe Reidentify sensitive data in a string using Format + Preserving Encryption (FPE). + deid_date_shift Deidentify dates in a CSV file by pseudorandomly + shifting them. + replace_with_infotype + Deidentify sensitive data in a string by replacing it + with the info type of the data. + + optional arguments: + -h, --help show this help message and exit + + + + + + + + + +The client library +------------------------------------------------------------------------------- + +This sample uses the `Google Cloud Client Library for Python`_. +You can read the documentation for more details on API usage and use GitHub +to `browse the source`_ and `report issues`_. + +.. _Google Cloud Client Library for Python: + https://googlecloudplatform.github.io/google-cloud-python/ +.. _browse the source: + https://github.com/GoogleCloudPlatform/google-cloud-python +.. _report issues: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues + + + +.. 
_Google Cloud SDK: https://cloud.google.com/sdk/
diff --git a/packages/google-cloud-dlp/samples/snippets/README.rst.in b/packages/google-cloud-dlp/samples/snippets/README.rst.in
new file mode 100644
index 000000000000..708e870fa08a
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/README.rst.in
@@ -0,0 +1,52 @@
+# This file is used to generate README.rst
+
+product:
+  name: Google Data Loss Prevention
+  short_name: Data Loss Prevention
+  url: https://cloud.google.com/dlp/docs/
+  description: >
+    `Google Data Loss Prevention`_ provides programmatic access to a powerful
+    detection engine for personally identifiable information and other
+    privacy-sensitive data in unstructured data streams.
+
+setup:
+- auth
+- install_deps
+
+required_api_url: https://console.cloud.google.com/apis/library/dlp.googleapis.com
+
+required_roles:
+- DLP Administrator
+- DLP API Service Agent
+
+samples:
+- name: Quickstart
+  file: quickstart.py
+- name: Inspect Content
+  file: inspect_content.py
+  show_help: true
+- name: Redact Content
+  file: redact.py
+  show_help: true
+- name: Metadata
+  file: metadata.py
+  show_help: true
+- name: Jobs
+  file: jobs.py
+  show_help: true
+- name: Templates
+  file: templates.py
+  show_help: true
+- name: Triggers
+  file: triggers.py
+  show_help: true
+- name: Risk Analysis
+  file: risk.py
+  show_help: true
+- name: DeID
+  file: deid.py
+  show_help: true
+
+cloud_client_library: true
+
+folder: dlp
diff --git a/packages/google-cloud-dlp/samples/snippets/custom_infotype.py b/packages/google-cloud-dlp/samples/snippets/custom_infotype.py
new file mode 100644
index 000000000000..565fed6994c6
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/custom_infotype.py
@@ -0,0 +1,302 @@
+# Copyright 2020 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Custom infoType snippets.
+
+This file contains sample code that uses the Data Loss Prevention API to create
+custom infoType detectors to refine scan results.
+"""
+
+
+# [START dlp_omit_name_if_also_email]
+def omit_name_if_also_email(
+    project,
+    content_string,
+):
+    """Matches PERSON_NAME and EMAIL_ADDRESS, but not both.
+
+    Uses the Data Loss Prevention API to omit matches on PERSON_NAME if the
+    EMAIL_ADDRESS detector also matches.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+
+    Returns:
+        A list of the info type names found in the API response.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a list of infoTypes for DLP to locate in `content_string`. See
+    # https://cloud.google.com/dlp/docs/concepts-infotypes for more information
+    # about supported infoTypes.
+    info_types_to_locate = [{"name": "PERSON_NAME"}, {"name": "EMAIL_ADDRESS"}]
+
+    # Construct the configuration dictionary that will only match on PERSON_NAME
+    # if the EMAIL_ADDRESS doesn't also match. This configuration helps reduce
+    # the total number of findings when there is a large overlap between different
+    # infoTypes.
+    inspect_config = {
+        "info_types":
+            info_types_to_locate,
+        "rule_set": [{
+            "info_types": [{
+                "name": "PERSON_NAME"
+            }],
+            "rules": [{
+                "exclusion_rule": {
+                    "exclude_info_types": {
+                        "info_types": [{
+                            "name": "EMAIL_ADDRESS"
+                        }]
+                    },
+                    "matching_type": "MATCHING_TYPE_PARTIAL_MATCH"
+                }
+            }]
+        }]
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.inspect_content(parent, inspect_config, item)
+
+    return [f.info_type.name for f in response.result.findings]
+
+
+# [END dlp_omit_name_if_also_email]
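
A quick usage sketch of the helper above, based on the accompanying test (``my-project`` is a placeholder for a project with the DLP API enabled):

.. code-block:: python

   import custom_infotype

   # PERSON_NAME matches that overlap an EMAIL_ADDRESS match are excluded,
   # so only "EMAIL_ADDRESS" is expected back for this input.
   info_types = custom_infotype.omit_name_if_also_email(
       "my-project", "alice@example.com"
   )
   print(info_types)  # ["EMAIL_ADDRESS"]
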
+
+
+# [START inspect_with_person_name_w_custom_hotword]
+def inspect_with_person_name_w_custom_hotword(
+    project,
+    content_string,
+    custom_hotword="patient"
+):
+    """Uses the Data Loss Prevention API to increase the likelihood of matches
+    on PERSON_NAME when the user-specified custom hotword is present. Only
+    includes findings with the increased likelihood by setting a minimum
+    likelihood threshold of VERY_LIKELY.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+        custom_hotword: The custom hotword used for likelihood boosting.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a rule set with the caller-provided hotword, with a likelihood
+    # boost to VERY_LIKELY when the hotword is present within the 50-character
+    # window preceding the PII finding.
+    hotword_rule = {
+        "hotword_regex": {"pattern": custom_hotword},
+        "likelihood_adjustment": {"fixed_likelihood": "VERY_LIKELY"},
+        "proximity": {"window_before": 50},
+    }
+
+    rule_set = [
+        {
+            "info_types": [{"name": "PERSON_NAME"}],
+            "rules": [{"hotword_rule": hotword_rule}],
+        }
+    ]
+
+    # Construct the configuration dictionary with the rule set and the
+    # minimum likelihood threshold.
+    inspect_config = {
+        "rule_set": rule_set,
+        "min_likelihood": "VERY_LIKELY",
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.inspect_content(parent, inspect_config, item)
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            try:
+                if finding.quote:
+                    print(f"Quote: {finding.quote}")
+            except AttributeError:
+                pass
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+# [END inspect_with_person_name_w_custom_hotword]
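
And a matching usage sketch for the hotword booster, mirroring the test suite (placeholder project id again):

.. code-block:: python

   import custom_infotype

   # "patient" appears within the 50-character window before "John Doe", so
   # the PERSON_NAME finding is boosted to VERY_LIKELY (printed as 5).
   custom_infotype.inspect_with_person_name_w_custom_hotword(
       "my-project", "patient's name is John Doe.", "patient"
   )
   # Expected output:
   #   Info type: PERSON_NAME
   #   Likelihood: 5
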
+
+
+# [START dlp_inspect_with_medical_record_number_custom_regex_detector]
+def inspect_with_medical_record_number_custom_regex_detector(
+    project,
+    content_string,
+):
+    """Uses the Data Loss Prevention API to analyze a string with a medical
+    record number custom regex detector.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a custom regex detector info type called "C_MRN",
+    # with ###-#-##### pattern, where each # represents a digit from 1 to 9.
+    # The detector has a detection likelihood of POSSIBLE.
+    custom_info_types = [
+        {
+            "info_type": {"name": "C_MRN"},
+            "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
+            "likelihood": "POSSIBLE",
+        }
+    ]
+
+    # Construct the configuration dictionary with the custom regex info type.
+    inspect_config = {
+        "custom_info_types": custom_info_types,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.inspect_content(parent, inspect_config, item)
+
+    # Print out the results.
+    if response.result.findings:
+        for finding in response.result.findings:
+            try:
+                if finding.quote:
+                    print(f"Quote: {finding.quote}")
+            except AttributeError:
+                pass
+            print(f"Info type: {finding.info_type.name}")
+            print(f"Likelihood: {finding.likelihood}")
+    else:
+        print("No findings.")
+
+# [END dlp_inspect_with_medical_record_number_custom_regex_detector]
+
+
+# [START dlp_inspect_with_medical_record_number_w_custom_hotwords]
+def inspect_with_medical_record_number_w_custom_hotwords(
+    project,
+    content_string,
+):
+    """Uses the Data Loss Prevention API to analyze a string with a medical
+    record number custom regex detector, with custom hotword rules to boost
+    finding certainty under some circumstances.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        content_string: The string to inspect.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a custom regex detector info type called "C_MRN",
+    # with ###-#-##### pattern, where each # represents a digit from 1 to 9.
+    # The detector has a detection likelihood of POSSIBLE.
+    custom_info_types = [
+        {
+            "info_type": {"name": "C_MRN"},
+            "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
+            "likelihood": "POSSIBLE",
+        }
+    ]
+
+    # Construct a rule set with hotwords "mrn" and "medical", with a likelihood
+    # boost to VERY_LIKELY when the hotwords are present within the 10-character
+    # window preceding the PII finding.
+    hotword_rule = {
+        "hotword_regex": {
+            "pattern": "(?i)(mrn|medical)(?-i)"
+        },
+        "likelihood_adjustment": {
+            "fixed_likelihood": "VERY_LIKELY"
+        },
+        "proximity": {
+            "window_before": 10
+        }
+    }
+
+    rule_set = [
+        {
+            "info_types": [{"name": "C_MRN"}],
+            "rules": [{"hotword_rule": hotword_rule}],
+        }
+    ]
+
+    # Construct the configuration dictionary with the custom regex info type.
+    inspect_config = {
+        "custom_info_types": custom_info_types,
+        "rule_set": rule_set,
+    }
+
+    # Construct the `item`.
+    item = {"value": content_string}
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.inspect_content(parent, inspect_config, item)
+
+    # Print out the results.
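+    # Note: `finding.quote` is only populated when the inspect config requests
+    # quotes (the `include_quote` flag); the AttributeError guard below keeps
+    # the sample from failing when no quote is attached to a finding.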
+ if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print(f"Quote: {finding.quote}") + except AttributeError: + pass + print(f"Info type: {finding.info_type.name}") + print(f"Likelihood: {finding.likelihood}") + else: + print("No findings.") + +# [END dlp_inspect_with_medical_record_number_w_custom_hotwords] diff --git a/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py b/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py new file mode 100644 index 000000000000..4a81df60adbc --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py @@ -0,0 +1,65 @@ +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import custom_infotype + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") + + +def test_omit_name_if_also_email(capsys): + info_types = custom_infotype.omit_name_if_also_email( + GCLOUD_PROJECT, "alice@example.com") + + # Ensure we found only EMAIL_ADDRESS, and not PERSON_NAME. + assert len(info_types) == 1 + assert info_types[0] == "EMAIL_ADDRESS" + + +def test_inspect_with_person_name_w_custom_hotword(capsys): + custom_infotype.inspect_with_person_name_w_custom_hotword( + GCLOUD_PROJECT, "patient's name is John Doe.", "patient") + + out, _ = capsys.readouterr() + assert "Info type: PERSON_NAME" in out + assert "Likelihood: 5" in out + + +def test_inspect_with_medical_record_number_custom_regex_detector(capsys): + custom_infotype.inspect_with_medical_record_number_custom_regex_detector( + GCLOUD_PROJECT, "Patients MRN 444-5-22222") + + out, _ = capsys.readouterr() + assert "Info type: C_MRN" in out + + +def test_inspect_with_medical_record_number_w_custom_hotwords_no_hotwords( + capsys): + custom_infotype.inspect_with_medical_record_number_w_custom_hotwords( + GCLOUD_PROJECT, "just a number 444-5-22222") + + out, _ = capsys.readouterr() + assert "Info type: C_MRN" in out + assert "Likelihood: 3" in out + + +def test_inspect_with_medical_record_number_w_custom_hotwords_has_hotwords( + capsys): + custom_infotype.inspect_with_medical_record_number_w_custom_hotwords( + GCLOUD_PROJECT, "Patients MRN 444-5-22222") + + out, _ = capsys.readouterr() + assert "Info type: C_MRN" in out + assert "Likelihood: 5" in out diff --git a/packages/google-cloud-dlp/samples/snippets/deid.py b/packages/google-cloud-dlp/samples/snippets/deid.py new file mode 100644 index 000000000000..70bd162385b6 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/deid.py @@ -0,0 +1,1073 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Uses of the Data Loss Prevention API for deidentifying sensitive data.""" + +from __future__ import print_function + +import argparse + + +# [START dlp_deidentify_masking] +def deidentify_with_mask( + project, input_str, info_types, masking_character=None, number_to_mask=0 +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by masking it with a character. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + masking_character: The character to mask matching sensitive data with. + number_to_mask: The maximum number of sensitive characters to mask in + a match. If omitted or set to zero, the API will default to no + maximum. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "character_mask_config": { + "masking_character": masking_character, + "number_to_mask": number_to_mask, + } + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. + print(response.item.value) + + +# [END dlp_deidentify_masking] + +# [START dlp_deidentify_redact] +def deidentify_with_redact( + project, + input_str, + info_types, +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by redacting matched input values. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + Returns: + None; the response from the API is printed to the terminal. + """ + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "redact_config": {} + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. + print(response.item.value) + + +# [END dlp_deidentify_redact] + +# [START dlp_deidentify_replace] +def deidentify_with_replace( + project, + input_str, + info_types, + replacement_str="REPLACEMENT_STR", +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by replacing matched input values with a value you specify. 
+ Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + replacement_str: The string to replace all values that match given + info types. + Returns: + None; the response from the API is printed to the terminal. + """ + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "replace_config": { + "new_value": { + "string_value": replacement_str, + } + } + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. + print(response.item.value) + +# [END dlp_deidentify_replace] + +# [START dlp_deidentify_fpe] + + +def deidentify_with_fpe( + project, + input_str, + info_types, + alphabet=None, + surrogate_type=None, + key_name=None, + wrapped_key=None, +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string using Format Preserving Encryption (FPE). + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + surrogate_type: The name of the surrogate custom info type to use. Only + necessary if you want to reverse the deidentification process. Can + be essentially any arbitrary string, as long as it doesn't appear + in your dataset otherwise. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key + should be encrypted using the Cloud KMS key specified by key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. 
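+    # (A wrapped key is typically created by generating a random AES key and
+    # encrypting it with the Cloud KMS key named by `key_name`; the
+    # base64-encoded ciphertext from KMS is what callers pass in here.)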
+    import base64
+
+    wrapped_key = base64.b64decode(wrapped_key)
+
+    # Construct FPE configuration dictionary
+    crypto_replace_ffx_fpe_config = {
+        "crypto_key": {
+            "kms_wrapped": {
+                "wrapped_key": wrapped_key,
+                "crypto_key_name": key_name,
+            }
+        },
+        "common_alphabet": alphabet,
+    }
+
+    # Add surrogate type
+    if surrogate_type:
+        crypto_replace_ffx_fpe_config["surrogate_info_type"] = {
+            "name": surrogate_type
+        }
+
+    # Construct inspect configuration dictionary
+    inspect_config = {
+        "info_types": [{"name": info_type} for info_type in info_types]
+    }
+
+    # Construct deidentify configuration dictionary
+    deidentify_config = {
+        "info_type_transformations": {
+            "transformations": [
+                {
+                    "primitive_transformation": {
+                        "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config
+                    }
+                }
+            ]
+        }
+    }
+
+    # Convert string to item
+    item = {"value": input_str}
+
+    # Call the API
+    response = dlp.deidentify_content(
+        parent,
+        inspect_config=inspect_config,
+        deidentify_config=deidentify_config,
+        item=item,
+    )
+
+    # Print results
+    print(response.item.value)
+
+
+# [END dlp_deidentify_fpe]
+
+
+# [START dlp_reidentify_fpe]
+def reidentify_with_fpe(
+    project,
+    input_str,
+    alphabet=None,
+    surrogate_type=None,
+    key_name=None,
+    wrapped_key=None,
+):
+    """Uses the Data Loss Prevention API to reidentify sensitive data in a
+    string that was encrypted by Format Preserving Encryption (FPE).
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to deidentify (will be treated as text).
+        alphabet: The set of characters to replace sensitive ones with. For
+            more information, see https://cloud.google.com/dlp/docs/reference/
+            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
+        surrogate_type: The name of the surrogate custom info type used
+            during the encryption process.
+        key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
+            AES-256 key. Example:
+            key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
+            keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
+        wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
+            should be encrypted using the Cloud KMS key specified by key_name.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # The wrapped key is base64-encoded, but the library expects a binary
+    # string, so decode it here.
+ import base64 + + wrapped_key = base64.b64decode(wrapped_key) + + # Construct Deidentify Config + reidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": { + "crypto_key": { + "kms_wrapped": { + "wrapped_key": wrapped_key, + "crypto_key_name": key_name, + } + }, + "common_alphabet": alphabet, + "surrogate_info_type": {"name": surrogate_type}, + } + } + } + ] + } + } + + inspect_config = { + "custom_info_types": [ + {"info_type": {"name": surrogate_type}, "surrogate_type": {}} + ] + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.reidentify_content( + parent, + inspect_config=inspect_config, + reidentify_config=reidentify_config, + item=item, + ) + + # Print results + print(response.item.value) + + +# [END dlp_reidentify_fpe] + + +# [START dlp_deidentify_free_text_with_fpe_using_surrogate] +def deidentify_free_text_with_fpe_using_surrogate( + project, + input_str, + alphabet="NUMERIC", + info_type="PHONE_NUMBER", + surrogate_type="PHONE_TOKEN", + unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==", +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string using Format Preserving Encryption (FPE). + The encryption is performed with an unwrapped key. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + info_type: The name of the info type to de-identify + surrogate_type: The name of the surrogate custom info type to use. Can + be essentially any arbitrary string, as long as it doesn't appear + in your dataset otherwise. + unwrapped_key: The base64-encoded AES-256 key to use. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # The unwrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. 
+    import base64
+
+    unwrapped_key = base64.b64decode(unwrapped_key)
+
+    # Construct de-identify config
+    transformation = {
+        "info_types": [{"name": info_type}],
+        "primitive_transformation": {
+            "crypto_replace_ffx_fpe_config": {
+                "crypto_key": {
+                    "unwrapped": {"key": unwrapped_key}
+                },
+                "common_alphabet": alphabet,
+                "surrogate_info_type": {"name": surrogate_type},
+            }
+        }
+    }
+
+    deidentify_config = {
+        "info_type_transformations": {
+            "transformations": [transformation]
+        }
+    }
+
+    # Construct the inspect config, trying to find all PII with a likelihood
+    # of UNLIKELY or higher
+    inspect_config = {
+        "info_types": [{"name": info_type}],
+        "min_likelihood": "UNLIKELY"
+    }
+
+    # Convert string to item
+    item = {"value": input_str}
+
+    # Call the API
+    response = dlp.deidentify_content(
+        parent,
+        inspect_config=inspect_config,
+        deidentify_config=deidentify_config,
+        item=item,
+    )
+
+    # Print results
+    print(response.item.value)
+
+
+# [END dlp_deidentify_free_text_with_fpe_using_surrogate]
+
+
+# [START dlp_reidentify_free_text_with_fpe_using_surrogate]
+def reidentify_free_text_with_fpe_using_surrogate(
+    project,
+    input_str,
+    alphabet="NUMERIC",
+    surrogate_type="PHONE_TOKEN",
+    unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==",
+):
+    """Uses the Data Loss Prevention API to reidentify sensitive data in a
+    string that was encrypted by Format Preserving Encryption (FPE) with a
+    surrogate type. The encryption is performed with an unwrapped key.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to deidentify (will be treated as text).
+        alphabet: The set of characters to replace sensitive ones with. For
+            more information, see https://cloud.google.com/dlp/docs/reference/
+            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
+        surrogate_type: The name of the surrogate custom info type used
+            during the encryption process.
+        unwrapped_key: The base64-encoded AES-256 key to use.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # The unwrapped key is base64-encoded, but the library expects a binary
+    # string, so decode it here.
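+    # (The default `unwrapped_key` above is simply the base64 encoding of the
+    # 16-byte placeholder b"abcdefghijklmnop"; use a securely generated key in
+    # real deployments.)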
+ import base64 + + unwrapped_key = base64.b64decode(unwrapped_key) + + # Construct Deidentify Config + transformation = { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": { + "crypto_key": { + "unwrapped": {"key": unwrapped_key} + }, + "common_alphabet": alphabet, + "surrogate_info_type": {"name": surrogate_type}, + } + } + } + + reidentify_config = { + "info_type_transformations": { + "transformations": [transformation] + } + } + + inspect_config = { + "custom_info_types": [ + {"info_type": {"name": surrogate_type}, "surrogate_type": {}} + ] + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.reidentify_content( + parent, + inspect_config=inspect_config, + reidentify_config=reidentify_config, + item=item, + ) + + # Print results + print(response.item.value) + + +# [END dlp_reidentify_free_text_with_fpe_using_surrogate] + + +# [START dlp_deidentify_date_shift] +def deidentify_with_date_shift( + project, + input_csv_file=None, + output_csv_file=None, + date_fields=None, + lower_bound_days=None, + upper_bound_days=None, + context_field_id=None, + wrapped_key=None, + key_name=None, +): + """Uses the Data Loss Prevention API to deidentify dates in a CSV file by + pseudorandomly shifting them. + Args: + project: The Google Cloud project id to use as a parent resource. + input_csv_file: The path to the CSV file to deidentify. The first row + of the file must specify column names, and all other rows must + contain valid values. + output_csv_file: The path to save the date-shifted CSV file. + date_fields: The list of (date) fields in the CSV file to date shift. + Example: ['birth_date', 'register_date'] + lower_bound_days: The maximum number of days to shift a date backward + upper_bound_days: The maximum number of days to shift a date forward + context_field_id: (Optional) The column to determine date shift amount + based on. If this is not specified, a random shift amount will be + used for every row. If this is specified, then 'wrappedKey' and + 'keyName' must also be set. Example: + contextFieldId = [{ 'name': 'user_id' }] + key_name: (Optional) The name of the Cloud KMS key used to encrypt + ('wrap') the AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use. + This key should be encrypted using the Cloud KMS key specified by + key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. 
+ parent = dlp.project_path(project) + + # Convert date field list to Protobuf type + def map_fields(field): + return {"name": field} + + if date_fields: + date_fields = map(map_fields, date_fields) + else: + date_fields = [] + + # Read and parse the CSV file + import csv + from datetime import datetime + + f = [] + with open(input_csv_file, "r") as csvfile: + reader = csv.reader(csvfile) + for row in reader: + f.append(row) + + # Helper function for converting CSV rows to Protobuf types + def map_headers(header): + return {"name": header} + + def map_data(value): + try: + date = datetime.strptime(value, "%m/%d/%Y") + return { + "date_value": { + "year": date.year, + "month": date.month, + "day": date.day, + } + } + except ValueError: + return {"string_value": value} + + def map_rows(row): + return {"values": map(map_data, row)} + + # Using the helper functions, convert CSV rows to protobuf-compatible + # dictionaries. + csv_headers = map(map_headers, f[0]) + csv_rows = map(map_rows, f[1:]) + + # Construct the table dict + table_item = {"table": {"headers": csv_headers, "rows": csv_rows}} + + # Construct date shift config + date_shift_config = { + "lower_bound_days": lower_bound_days, + "upper_bound_days": upper_bound_days, + } + + # If using a Cloud KMS key, add it to the date_shift_config. + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. + if context_field_id and key_name and wrapped_key: + import base64 + + date_shift_config["context"] = {"name": context_field_id} + date_shift_config["crypto_key"] = { + "kms_wrapped": { + "wrapped_key": base64.b64decode(wrapped_key), + "crypto_key_name": key_name, + } + } + elif context_field_id or key_name or wrapped_key: + raise ValueError( + """You must set either ALL or NONE of + [context_field_id, key_name, wrapped_key]!""" + ) + + # Construct Deidentify Config + deidentify_config = { + "record_transformations": { + "field_transformations": [ + { + "fields": date_fields, + "primitive_transformation": { + "date_shift_config": date_shift_config + }, + } + ] + } + } + + # Write to CSV helper methods + def write_header(header): + return header.name + + def write_data(data): + return data.string_value or "%s/%s/%s" % ( + data.date_value.month, + data.date_value.day, + data.date_value.year, + ) + + # Call the API + response = dlp.deidentify_content( + parent, deidentify_config=deidentify_config, item=table_item + ) + + # Write results to CSV file + with open(output_csv_file, "w") as csvfile: + write_file = csv.writer(csvfile, delimiter=",") + write_file.writerow(map(write_header, response.item.table.headers)) + for row in response.item.table.rows: + write_file.writerow(map(write_data, row.values)) + # Print status + print("Successfully saved date-shift output to {}".format(output_csv_file)) + + +# [END dlp_deidentify_date_shift] + + +# [START dlp_deidentify_replace_infotype] +def deidentify_with_replace_infotype(project, item, info_types): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by replacing it with the info type. + Args: + project: The Google Cloud project id to use as a parent resource. + item: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + Returns: + None; the response from the API is printed to the terminal. 
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Construct inspect configuration dictionary
+ inspect_config = {
+ "info_types": [{"name": info_type} for info_type in info_types]
+ }
+
+ # Construct deidentify configuration dictionary
+ deidentify_config = {
+ "info_type_transformations": {
+ "transformations": [
+ {
+ "primitive_transformation": {
+ "replace_with_info_type_config": {}
+ }
+ }
+ ]
+ }
+ }
+
+ # Call the API
+ response = dlp.deidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ deidentify_config=deidentify_config,
+ item={"value": item},
+ )
+
+ # Print out the results.
+ print(response.item.value)
+
+
+# [END dlp_deidentify_replace_infotype]
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description=__doc__)
+ subparsers = parser.add_subparsers(
+ dest="content", help="Select how to submit content to the API."
+ )
+ subparsers.required = True
+
+ mask_parser = subparsers.add_parser(
+ "deid_mask",
+ help="Deidentify sensitive data in a string by masking it with a "
+ "character.",
+ )
+ mask_parser.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ mask_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ mask_parser.add_argument("item", help="The string to deidentify.")
+ mask_parser.add_argument(
+ "-n",
+ "--number_to_mask",
+ type=int,
+ default=0,
+ help="The maximum number of sensitive characters to mask in a match. "
+ "If omitted from the request or set to 0, the API will mask any "
+ "matching characters.",
+ )
+ mask_parser.add_argument(
+ "-m",
+ "--masking_character",
+ help="The character to mask matching sensitive data with.",
+ )
+
+ replace_parser = subparsers.add_parser(
+ "deid_replace",
+ help="Deidentify sensitive data in a string by replacing it with "
+ "another string.",
+ )
+ replace_parser.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ replace_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ replace_parser.add_argument("item", help="The string to deidentify.")
+ replace_parser.add_argument("replacement_str", help="The string to "
+ "replace all matched values with.")
+
+ fpe_parser = subparsers.add_parser(
+ "deid_fpe",
+ help="Deidentify sensitive data in a string using Format Preserving "
+ "Encryption (FPE).",
+ )
+ fpe_parser.add_argument(
+ "--info_types",
+ action="append",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ fpe_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ fpe_parser.add_argument(
+ "item",
+ help="The string to deidentify. "
+ "Example: string = 'My SSN is 372819127'",
+ )
+ fpe_parser.add_argument(
+ "key_name",
+ help="The name of the Cloud KMS key used to encrypt ('wrap') the "
+ "AES-256 key. Example: "
+ "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+ "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+ )
+ fpe_parser.add_argument(
+ "wrapped_key",
+ help="The encrypted ('wrapped') AES-256 key to use. This key should "
+ "be encrypted using the Cloud KMS key specified by key_name.",
+ )
+ fpe_parser.add_argument(
+ "-a",
+ "--alphabet",
+ default="ALPHA_NUMERIC",
+ help="The set of characters to replace sensitive ones with. Commonly "
+ 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", '
+ '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", '
+ '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"',
+ )
+ fpe_parser.add_argument(
+ "-s",
+ "--surrogate_type",
+ help="The name of the surrogate custom info type to use. Only "
+ "necessary if you want to reverse the deidentification process. Can "
+ "be essentially any arbitrary string, as long as it doesn't appear "
+ "in your dataset otherwise.",
+ )
+
+ reid_parser = subparsers.add_parser(
+ "reid_fpe",
+ help="Reidentify sensitive data in a string using Format Preserving "
+ "Encryption (FPE).",
+ )
+ reid_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ reid_parser.add_argument(
+ "item",
+ help="The string to reidentify. "
+ "Example: string = 'My SSN is 372819127'",
+ )
+ reid_parser.add_argument(
+ "surrogate_type",
+ help="The name of the surrogate custom info type to use. Only "
+ "necessary if you want to reverse the deidentification process. Can "
+ "be essentially any arbitrary string, as long as it doesn't appear "
+ "in your dataset otherwise.",
+ )
+ reid_parser.add_argument(
+ "key_name",
+ help="The name of the Cloud KMS key used to encrypt ('wrap') the "
+ "AES-256 key. Example: "
+ "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+ "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+ )
+ reid_parser.add_argument(
+ "wrapped_key",
+ help="The encrypted ('wrapped') AES-256 key to use. This key should "
+ "be encrypted using the Cloud KMS key specified by key_name.",
+ )
+ reid_parser.add_argument(
+ "-a",
+ "--alphabet",
+ default="ALPHA_NUMERIC",
+ help="The set of characters to replace sensitive ones with. Commonly "
+ 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", '
+ '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", '
+ '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"',
+ )
+
+ date_shift_parser = subparsers.add_parser(
+ "deid_date_shift",
+ help="Deidentify dates in a CSV file by pseudorandomly shifting them.",
+ )
+ date_shift_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ date_shift_parser.add_argument(
+ "input_csv_file",
+ help="The path to the CSV file to deidentify. The first row of the "
+ "file must specify column names, and all other rows must contain "
+ "valid values.",
+ )
+ date_shift_parser.add_argument(
+ "output_csv_file", help="The path to save the date-shifted CSV file."
+ )
+ date_shift_parser.add_argument(
+ "lower_bound_days",
+ type=int,
+ help="The maximum number of days to shift a date backward",
+ )
+ date_shift_parser.add_argument(
+ "upper_bound_days",
+ type=int,
+ help="The maximum number of days to shift a date forward",
+ )
+ date_shift_parser.add_argument(
+ "date_fields",
+ nargs="+",
+ help="The list of date fields in the CSV file to date shift. Example: "
+ "['birth_date', 'register_date']",
+ )
+ date_shift_parser.add_argument(
+ "--context_field_id",
+ help="(Optional) The column to determine date shift amount based on. "
+ "If this is not specified, a random shift amount will be used for "
+ "every row. If this is specified, then '--wrapped_key' and "
+ "'--key_name' must also be set.",
+ )
+ date_shift_parser.add_argument(
+ "--key_name",
+ help="(Optional) The name of the Cloud KMS key used to encrypt "
+ "('wrap') the AES-256 key. Example: "
+ "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+ "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+ )
+ date_shift_parser.add_argument(
+ "--wrapped_key",
+ help="(Optional) The encrypted ('wrapped') AES-256 key to use. This "
+ "key should be encrypted using the Cloud KMS key specified by "
+ "key_name.",
+ )
+
+ replace_with_infotype_parser = subparsers.add_parser(
+ "replace_with_infotype",
+ help="Deidentify sensitive data in a string by replacing it with the "
+ "info type of the data."
+ )
+ replace_with_infotype_parser.add_argument(
+ "--info_types",
+ action="append",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ replace_with_infotype_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ replace_with_infotype_parser.add_argument(
+ "item",
+ help="The string to deidentify. "
+ "Example: 'My credit card is 4242 4242 4242 4242'", + ) + + args = parser.parse_args() + + if args.content == "deid_mask": + deidentify_with_mask( + args.project, + args.item, + args.info_types, + masking_character=args.masking_character, + number_to_mask=args.number_to_mask, + ) + elif args.content == "deid_replace": + deidentify_with_replace( + args.project, + args.item, + args.info_types, + replacement_str=args.replacement_str, + ) + elif args.content == "deid_fpe": + deidentify_with_fpe( + args.project, + args.item, + args.info_types, + alphabet=args.alphabet, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + surrogate_type=args.surrogate_type, + ) + elif args.content == "reid_fpe": + reidentify_with_fpe( + args.project, + args.item, + surrogate_type=args.surrogate_type, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + alphabet=args.alphabet, + ) + elif args.content == "deid_date_shift": + deidentify_with_date_shift( + args.project, + input_csv_file=args.input_csv_file, + output_csv_file=args.output_csv_file, + lower_bound_days=args.lower_bound_days, + upper_bound_days=args.upper_bound_days, + date_fields=args.date_fields, + context_field_id=args.context_field_id, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + ) + elif args.content == "replace_with_infotype": + deidentify_with_replace_infotype( + args.project, + item=args.item, + info_types=args.info_types, + ) diff --git a/packages/google-cloud-dlp/samples/snippets/deid_test.py b/packages/google-cloud-dlp/samples/snippets/deid_test.py new file mode 100644 index 000000000000..7d886c51e362 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/deid_test.py @@ -0,0 +1,257 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
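+#
+# These tests drive the sample functions in deid.py directly. As a rough
+# usage sketch (assumes GOOGLE_CLOUD_PROJECT points at a project with the
+# DLP API enabled; the project id below is a placeholder):
+#
+#     import deid
+#     deid.deidentify_with_mask(
+#         "my-project",
+#         "My SSN is 372819127",
+#         ["US_SOCIAL_SECURITY_NUMBER"],
+#         masking_character="#",
+#     )
+#     # prints: My SSN is #########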
+ +import os +import shutil +import tempfile + +import pytest + +import deid + +HARMFUL_STRING = "My SSN is 372819127" +HARMLESS_STRING = "My favorite color is blue" +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +UNWRAPPED_KEY = "YWJjZGVmZ2hpamtsbW5vcA==" +WRAPPED_KEY = ( + "CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy" + "uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL" + "rotx7Chxz/4z7SIpXFOBY61z0/U=" +) +KEY_NAME = ( + "projects/python-docs-samples-tests/locations/global/keyRings/" + "dlp-test/cryptoKeys/dlp-test" +) +SURROGATE_TYPE = "SSN_TOKEN" +CSV_FILE = os.path.join(os.path.dirname(__file__), "resources/dates.csv") +DATE_SHIFTED_AMOUNT = 30 +DATE_FIELDS = ["birth_date", "register_date"] +CSV_CONTEXT_FIELD = "name" + + +@pytest.fixture(scope="module") +def tempdir(): + tempdir = tempfile.mkdtemp() + yield tempdir + shutil.rmtree(tempdir) + + +def test_deidentify_with_mask(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"] + ) + + out, _ = capsys.readouterr() + assert "My SSN is *********" in out + + +def test_deidentify_with_mask_ignore_insensitive_data(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMLESS_STRING, ["US_SOCIAL_SECURITY_NUMBER"] + ) + + out, _ = capsys.readouterr() + assert HARMLESS_STRING in out + + +def test_deidentify_with_mask_masking_character_specified(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + masking_character="#", + ) + + out, _ = capsys.readouterr() + assert "My SSN is #########" in out + + +def test_deidentify_with_mask_masking_number_specified(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + number_to_mask=7, + ) + + out, _ = capsys.readouterr() + assert "My SSN is *******27" in out + + +def test_deidentify_with_redact(capsys): + deid.deidentify_with_redact( + GCLOUD_PROJECT, HARMFUL_STRING + "!", ["US_SOCIAL_SECURITY_NUMBER"] + ) + out, _ = capsys.readouterr() + assert "My SSN is !" 
in out + + +def test_deidentify_with_replace(capsys): + deid.deidentify_with_replace( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"], + replacement_str="REPLACEMENT_STR" + ) + + out, _ = capsys.readouterr() + assert "My SSN is REPLACEMENT_STR" in out + + +def test_deidentify_with_fpe(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + assert "My SSN is" in out + assert "372819127" not in out + + +def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + surrogate_type=SURROGATE_TYPE, + ) + + out, _ = capsys.readouterr() + assert "My SSN is SSN_TOKEN" in out + assert "372819127" not in out + + +def test_deidentify_with_fpe_ignores_insensitive_data(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMLESS_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + assert HARMLESS_STRING in out + + +def test_deidentify_with_date_shift(tempdir, capsys): + output_filepath = os.path.join(tempdir, "dates-shifted.csv") + + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS, + ) + + out, _ = capsys.readouterr() + + assert "Successful" in out + + +def test_deidentify_with_date_shift_using_context_field(tempdir, capsys): + output_filepath = os.path.join(tempdir, "dates-shifted.csv") + + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS, + context_field_id=CSV_CONTEXT_FIELD, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + + assert "Successful" in out + + +def test_reidentify_with_fpe(capsys): + labeled_fpe_string = "My SSN is SSN_TOKEN(9):731997681" + + deid.reidentify_with_fpe( + GCLOUD_PROJECT, + labeled_fpe_string, + surrogate_type=SURROGATE_TYPE, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + alphabet="NUMERIC", + ) + + out, _ = capsys.readouterr() + + assert "731997681" not in out + + +def test_deidentify_free_text_with_fpe_using_surrogate(capsys): + labeled_fpe_string = "My phone number is 4359916732" + + deid.deidentify_free_text_with_fpe_using_surrogate( + GCLOUD_PROJECT, + labeled_fpe_string, + info_type="PHONE_NUMBER", + surrogate_type="PHONE_TOKEN", + unwrapped_key=UNWRAPPED_KEY, + alphabet="NUMERIC", + ) + + out, _ = capsys.readouterr() + + assert "PHONE_TOKEN" in out + assert "My phone number is" in out + assert "4359916732" not in out + + +def test_reidentify_free_text_with_fpe_using_surrogate(capsys): + labeled_fpe_string = "My phone number is PHONE_TOKEN(10):9617256398" + + deid.reidentify_free_text_with_fpe_using_surrogate( + GCLOUD_PROJECT, + labeled_fpe_string, + surrogate_type="PHONE_TOKEN", + unwrapped_key=UNWRAPPED_KEY, + alphabet="NUMERIC", + ) + + out, _ = capsys.readouterr() + + assert "PHONE_TOKEN" not in out + assert "9617256398" not in out + assert "My phone number is" in out + + +def test_deidentify_with_replace_infotype(capsys): + url_to_redact = 
"https://cloud.google.com" + deid.deidentify_with_replace_infotype( + GCLOUD_PROJECT, + "My favorite site is " + url_to_redact, + ["URL"], + ) + + out, _ = capsys.readouterr() + + assert url_to_redact not in out + assert "My favorite site is [URL]" in out diff --git a/packages/google-cloud-dlp/samples/snippets/inspect_content.py b/packages/google-cloud-dlp/samples/snippets/inspect_content.py new file mode 100644 index 000000000000..fb2573e4bc8a --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/inspect_content.py @@ -0,0 +1,1424 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevention API to inspect a string, a +local file or a file on Google Cloud Storage.""" + +from __future__ import print_function + +import argparse +import json +import os + + +# [START dlp_inspect_string_basic] +def inspect_string_basic( + project, + content_string, + info_types=["PHONE_NUMBER"], +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Construct the configuration dictionary. + inspect_config = { + "info_types": info_types, + "include_quote": True, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + print("Quote: {}".format(finding.quote)) + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_string_basic] + + +# [START dlp_inspect_string] +def inspect_string( + project, + content_string, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + include_quote=True, +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. 
One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_string] + +# [START dlp_inspect_table] + + +def inspect_table( + project, + data, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + include_quote=True, +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + data: Json string representing table data. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. 
+ Example: + data = { + "header":[ + "email", + "phone number" + ], + "rows":[ + [ + "robertfrost@xyz.com", + "4232342345" + ], + [ + "johndoe@pqr.com", + "4253458383" + ] + ] + } + + >> $ python inspect_content.py table \ + '{"header": ["email", "phone number"], + "rows": [["robertfrost@xyz.com", "4232342345"], + ["johndoe@pqr.com", "4253458383"]]}' + >> Quote: robertfrost@xyz.com + Info type: EMAIL_ADDRESS + Likelihood: 4 + Quote: johndoe@pqr.com + Info type: EMAIL_ADDRESS + Likelihood: 4 + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct the `table`. For more details on the table schema, please see + # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table + headers = [{"name": val} for val in data["header"]] + rows = [] + for row in data["rows"]: + rows.append( + {"values": [{"string_value": cell_val} for cell_val in row]} + ) + + table = {} + table["headers"] = headers + table["rows"] = rows + item = {"table": table} + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_table] + +# [START dlp_inspect_file] + + +def inspect_file( + project, + filename, + info_types, + min_likelihood=None, + custom_dictionaries=None, + custom_regexes=None, + max_findings=None, + include_quote=True, + mime_type=None, +): + """Uses the Data Loss Prevention API to analyze a file for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + filename: The path to the file to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. 
+ max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + mime_type: The MIME type of the file. If not specified, the type is + inferred via the Python standard library's mimetypes module. + Returns: + None; the response from the API is printed to the terminal. + """ + + import mimetypes + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # If mime_type is not specified, guess it from the filename. + if mime_type is None: + mime_guess = mimetypes.MimeTypes().guess_type(filename) + mime_type = mime_guess[0] + + # Select the content type index from the list of supported types. + supported_content_types = { + None: 0, # "Unspecified" + "image/jpeg": 1, + "image/bmp": 2, + "image/png": 3, + "image/svg": 4, + "text/plain": 5, + } + content_type_index = supported_content_types.get(mime_type, 0) + + # Construct the item, containing the file's byte data. + with open(filename, mode="rb") as f: + item = {"byte_item": {"type": content_type_index, "data": f.read()}} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_file] + + +# [START dlp_inspect_gcs] +def inspect_gcs_file( + project, + bucket, + filename, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): + """Uses the Data Loss Prevention API to analyze a file on GCS. + Args: + project: The Google Cloud project id to use as a parent resource. + bucket: The name of the GCS bucket containing the file, as a string. + filename: The name of the file in the bucket, including the path, as a + string; e.g. 'images/myfile.png'. 
+ topic_id: The id of the Cloud Pub/Sub topic to which the API will + broadcast job completion. The topic must already exist. + subscription_id: The id of the Cloud Pub/Sub subscription to listen on + while waiting for job completion. The subscription must already + exist and be subscribed to the topic. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a storage_config containing the file's URL. + url = "gs://{}/{}".format(bucket, filename) + storage_config = {"cloud_storage_options": {"file_set": {"url": url}}} + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, + } + + operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) + print("Inspection operation started: {}".format(operation.name)) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + + # Set up a callback to acknowledge a message. 
This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) + else: + print("No findings.") + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + subscriber.subscribe(subscription_path, callback=callback) + finished = job_done.wait(timeout=timeout) + if not finished: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + + +# [END dlp_inspect_gcs] + + +# [START dlp_inspect_datastore] +def inspect_datastore( + project, + datastore_project, + kind, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + namespace_id=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): + """Uses the Data Loss Prevention API to analyze Datastore data. + Args: + project: The Google Cloud project id to use as a parent resource. + datastore_project: The Google Cloud project id of the target Datastore. + kind: The kind of the Datastore entity to inspect, e.g. 'Person'. + topic_id: The id of the Cloud Pub/Sub topic to which the API will + broadcast job completion. The topic must already exist. + subscription_id: The id of the Cloud Pub/Sub subscription to listen on + while waiting for job completion. The subscription must already + exist and be subscribed to the topic. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + namespace_id: The namespace of the Datastore document, if applicable. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. 
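+ # As an illustration (hypothetical inputs, not values used elsewhere in
+ # this sample): custom_dictionaries=["gary@example.com,jan@example.com"]
+ # and custom_regexes=["\\d{3}-\\d{3}-\\d{4}"] would make the two list
+ # comprehensions below produce:
+ #   [{"info_type": {"name": "CUSTOM_DICTIONARY_0"},
+ #     "dictionary": {"word_list": {"words": ["gary@example.com", "jan@example.com"]}}},
+ #    {"info_type": {"name": "CUSTOM_REGEX_0"},
+ #     "regex": {"pattern": "\\d{3}-\\d{3}-\\d{4}"}}]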
+ if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a storage_config containing the target Datastore info. + storage_config = { + "datastore_options": { + "partition_id": { + "project_id": datastore_project, + "namespace_id": namespace_id, + }, + "kind": {"name": kind}, + } + } + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, + } + + operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) + print("Inspection operation started: {}".format(operation.name)) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) + else: + print("No findings.") + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. + subscriber.subscribe(subscription_path, callback=callback) + + finished = job_done.wait(timeout=timeout) + if not finished: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." 
+ )
+
+
+# [END dlp_inspect_datastore]
+
+
+# [START dlp_inspect_bigquery]
+def inspect_bigquery(
+ project,
+ bigquery_project,
+ dataset_id,
+ table_id,
+ topic_id,
+ subscription_id,
+ info_types,
+ custom_dictionaries=None,
+ custom_regexes=None,
+ min_likelihood=None,
+ max_findings=None,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to analyze BigQuery data.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ bigquery_project: The Google Cloud project id of the target table.
+ dataset_id: The id of the target BigQuery dataset.
+ table_id: The id of the target BigQuery table.
+ topic_id: The id of the Cloud Pub/Sub topic to which the API will
+ broadcast job completion. The topic must already exist.
+ subscription_id: The id of the Cloud Pub/Sub subscription to listen on
+ while waiting for job completion. The subscription must already
+ exist and be subscribed to the topic.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ timeout: The number of seconds to wait for a response from the API.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # This sample also uses threading.Event() to wait for the job to finish.
+ import threading
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ if not info_types:
+ info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Prepare custom_info_types by parsing the dictionary word lists and
+ # regex patterns.
+ if custom_dictionaries is None:
+ custom_dictionaries = []
+ dictionaries = [
+ {
+ "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+ "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+ }
+ for i, custom_dict in enumerate(custom_dictionaries)
+ ]
+ if custom_regexes is None:
+ custom_regexes = []
+ regexes = [
+ {
+ "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+ "regex": {"pattern": custom_regex},
+ }
+ for i, custom_regex in enumerate(custom_regexes)
+ ]
+ custom_info_types = dictionaries + regexes
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "custom_info_types": custom_info_types,
+ "min_likelihood": min_likelihood,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # Construct a storage_config containing the target BigQuery info.
+ storage_config = {
+ "big_query_options": {
+ "table_reference": {
+ "project_id": bigquery_project,
+ "dataset_id": dataset_id,
+ "table_id": table_id,
+ }
+ }
+ }
+
+ # Convert the project id into full resource ids.
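+ # For example, with project "my-project" and topic_id "dlp-sample-topic"
+ # (both hypothetical), the calls below build:
+ #   topic  -> "projects/my-project/topics/dlp-sample-topic"
+ #   parent -> "projects/my-project/locations/global"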
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, + } + + operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) + print("Inspection operation started: {}".format(operation.name)) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) + else: + print("No findings.") + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. + subscriber.subscribe(subscription_path, callback=callback) + finished = job_done.wait(timeout=timeout) + if not finished: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + + +# [END dlp_inspect_bigquery] + + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + parser_string = subparsers.add_parser("string", help="Inspect a string.") + parser_string.add_argument("item", help="The string to inspect.") + parser_string.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_string.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_string.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. 
Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_string.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_string.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_string.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_string.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_table = subparsers.add_parser("table", help="Inspect a table.") + parser_table.add_argument( + "data", help="Json string representing a table.", type=json.loads + ) + parser_table.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_table.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_table.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_table.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_table.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_table.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_table.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_file = subparsers.add_parser("file", help="Inspect a local file.") + parser_file.add_argument( + "filename", help="The path to the file to inspect." + ) + parser_file.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_file.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_file.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. 
Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_file.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_file.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_file.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_file.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + parser_file.add_argument( + "--mime_type", + help="The MIME type of the file. If not specified, the type is " + "inferred via the Python standard library's mimetypes module.", + ) + + parser_gcs = subparsers.add_parser( + "gcs", help="Inspect files on Google Cloud Storage." + ) + parser_gcs.add_argument( + "bucket", help="The name of the GCS bucket containing the file." + ) + parser_gcs.add_argument( + "filename", + help="The name of the file in the bucket, including the path, e.g. " + '"images/myfile.png". Wildcards are permitted.', + ) + parser_gcs.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_gcs.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_gcs.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_gcs.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_gcs.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_gcs.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_gcs.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_gcs.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_gcs.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. 
The default is 300 seconds.", + default=300, + ) + + parser_datastore = subparsers.add_parser( + "datastore", help="Inspect files on Google Datastore." + ) + parser_datastore.add_argument( + "datastore_project", + help="The Google Cloud project id of the target Datastore.", + ) + parser_datastore.add_argument( + "kind", + help='The kind of the Datastore entity to inspect, e.g. "Person".', + ) + parser_datastore.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_datastore.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_datastore.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_datastore.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_datastore.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_datastore.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_datastore.add_argument( + "--namespace_id", help="The Datastore namespace to use, if applicable." + ) + parser_datastore.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_datastore.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_datastore.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) + + parser_bigquery = subparsers.add_parser( + "bigquery", help="Inspect files on Google BigQuery." + ) + parser_bigquery.add_argument( + "bigquery_project", + help="The Google Cloud project id of the target table.", + ) + parser_bigquery.add_argument( + "dataset_id", help="The ID of the target BigQuery dataset." + ) + parser_bigquery.add_argument( + "table_id", help="The ID of the target BigQuery table." + ) + parser_bigquery.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_bigquery.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. 
See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_bigquery.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_bigquery.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_bigquery.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_bigquery.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_bigquery.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_bigquery.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_bigquery.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) + + args = parser.parse_args() + + if args.content == "string": + inspect_string( + args.project, + args.item, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.content == "table": + inspect_table( + args.project, + args.data, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.content == "file": + inspect_file( + args.project, + args.filename, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + mime_type=args.mime_type, + ) + elif args.content == "gcs": + inspect_gcs_file( + args.project, + args.bucket, + args.filename, + args.topic_id, + args.subscription_id, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout, + ) + elif args.content == "datastore": + inspect_datastore( + args.project, + args.datastore_project, + args.kind, + args.topic_id, + args.subscription_id, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + namespace_id=args.namespace_id, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout, + ) + elif args.content == "bigquery": + inspect_bigquery( + args.project, + args.bigquery_project, + args.dataset_id, + args.table_id, + args.topic_id, + 
args.subscription_id,
+ args.info_types,
+ custom_dictionaries=args.custom_dictionaries,
+ custom_regexes=args.custom_regexes,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ timeout=args.timeout,
+ )
diff --git a/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py b/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py
new file mode 100644
index 000000000000..bdabda265c1b
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py
@@ -0,0 +1,467 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.bigquery
+import google.cloud.datastore
+import google.cloud.dlp_v2
+import google.cloud.exceptions
+import google.cloud.pubsub
+import google.cloud.storage
+import pytest
+
+import inspect_content
+
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"]
+TOPIC_ID = "dlp-test" + UNIQUE_STRING
+SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING
+DATASTORE_KIND = "DLP test kind"
+DATASTORE_NAME = "DLP test object" + UNIQUE_STRING
+BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING
+BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING
+
+TIMEOUT = 900 # 15 minutes
+
+
+@pytest.fixture(scope="module")
+def bucket():
+ # Creates a GCS bucket, uploads files required for the test, and tears down
+ # the entire bucket afterwards.
+
+ client = google.cloud.storage.Client()
+ try:
+ bucket = client.get_bucket(TEST_BUCKET_NAME)
+ except google.cloud.exceptions.NotFound:
+ bucket = client.create_bucket(TEST_BUCKET_NAME)
+
+ # Upload the blobs and keep track of them in a list.
+ blobs = []
+ for name in RESOURCE_FILE_NAMES:
+ path = os.path.join(RESOURCE_DIRECTORY, name)
+ blob = bucket.blob(name)
+ blob.upload_from_filename(path)
+ blobs.append(blob)
+
+ # Yield the object to the test; lines after this execute as a teardown.
+ yield bucket
+
+ # Delete the files.
+ for blob in blobs:
+ try:
+ blob.delete()
+ except google.cloud.exceptions.NotFound:
+ print("Issue during teardown, missing blob")
+
+ # Attempt to delete the bucket; this will only work if it is empty.
+ bucket.delete()
+
+
+@pytest.fixture(scope="module")
+def topic_id():
+ # Creates a pubsub topic, and tears it down.
+ publisher = google.cloud.pubsub.PublisherClient()
+ topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID)
+ try:
+ publisher.create_topic(topic_path)
+ except google.api_core.exceptions.AlreadyExists:
+ pass
+
+ yield TOPIC_ID
+
+ publisher.delete_topic(topic_path)
+
+
+@pytest.fixture(scope="module")
+def subscription_id(topic_id):
+ # Subscribes to a topic. 
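+ # Because this fixture depends on topic_id, pytest sets up the topic
+ # before the subscription and tears the subscription down first.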
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id)
+    subscription_path = subscriber.subscription_path(
+        GCLOUD_PROJECT, SUBSCRIPTION_ID)
+    try:
+        subscriber.create_subscription(subscription_path, topic_path)
+    except google.api_core.exceptions.AlreadyExists:
+        pass
+
+    yield SUBSCRIPTION_ID
+
+    subscriber.delete_subscription(subscription_path)
+
+
+@pytest.fixture(scope="module")
+def datastore_project():
+    # Adds test Datastore data, yields the project ID and then tears down.
+    datastore_client = google.cloud.datastore.Client()
+
+    kind = DATASTORE_KIND
+    name = DATASTORE_NAME
+    key = datastore_client.key(kind, name)
+    item = google.cloud.datastore.Entity(key=key)
+    item["payload"] = "My name is Gary Smith and my email is gary@example.com"
+
+    datastore_client.put(item)
+
+    yield GCLOUD_PROJECT
+
+    datastore_client.delete(key)
+
+
+@pytest.fixture(scope="module")
+def bigquery_project():
+    # Adds test BigQuery data, yields the project ID and then tears down.
+    bigquery_client = google.cloud.bigquery.Client()
+
+    dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID)
+    dataset = google.cloud.bigquery.Dataset(dataset_ref)
+    try:
+        dataset = bigquery_client.create_dataset(dataset)
+    except google.api_core.exceptions.Conflict:
+        dataset = bigquery_client.get_dataset(dataset)
+
+    table_ref = dataset_ref.table(BIGQUERY_TABLE_ID)
+    table = google.cloud.bigquery.Table(table_ref)
+
+    # A minimal two-column schema is enough for these tests.
+    table.schema = (
+        google.cloud.bigquery.SchemaField("Name", "STRING"),
+        google.cloud.bigquery.SchemaField("Comment", "STRING"),
+    )
+
+    try:
+        table = bigquery_client.create_table(table)
+    except google.api_core.exceptions.Conflict:
+        table = bigquery_client.get_table(table)
+
+    rows_to_insert = [(u"Gary Smith", u"My email is gary@example.com")]
+
+    bigquery_client.insert_rows(table, rows_to_insert)
+
+    yield GCLOUD_PROJECT
+
+    bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
+
+
+def test_inspect_string_basic(capsys):
+    test_string = "String with a phone number: 234-555-6789"
+
+    inspect_content.inspect_string_basic(GCLOUD_PROJECT, test_string)
+
+    out, _ = capsys.readouterr()
+    assert "Info type: PHONE_NUMBER" in out
+    assert "Quote: 234-555-6789" in out
+
+
+def test_inspect_string(capsys):
+    test_string = "My name is Gary Smith and my email is gary@example.com"
+
+    inspect_content.inspect_string(
+        GCLOUD_PROJECT,
+        test_string,
+        ["FIRST_NAME", "EMAIL_ADDRESS"],
+        include_quote=True,
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Info type: FIRST_NAME" in out
+    assert "Info type: EMAIL_ADDRESS" in out
+
+
+def test_inspect_table(capsys):
+    test_tabular_data = {
+        "header": ["email", "phone number"],
+        "rows": [
+            ["robertfrost@xyz.com", "4232342345"],
+            ["johndoe@pqr.com", "4253458383"],
+        ],
+    }
+
+    inspect_content.inspect_table(
+        GCLOUD_PROJECT,
+        test_tabular_data,
+        ["PHONE_NUMBER", "EMAIL_ADDRESS"],
+        include_quote=True,
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Info type: PHONE_NUMBER" in out
+    assert "Info type: EMAIL_ADDRESS" in out
+
+
+def test_inspect_string_with_custom_info_types(capsys):
+    test_string = "My name is Gary Smith and my email is gary@example.com"
+    dictionaries = ["Gary Smith"]
+    regexes = ["\\w+@\\w+\\.com"]
+
+    inspect_content.inspect_string(
+        GCLOUD_PROJECT,
+        test_string,
+        [],
+        custom_dictionaries=dictionaries,
+        custom_regexes=regexes,
+        include_quote=True,
+    )
+
+    out, _ = capsys.readouterr()
+    assert "Info type: 
CUSTOM_DICTIONARY_0" in out + assert "Info type: CUSTOM_REGEX_0" in out + + +def test_inspect_string_no_results(capsys): + test_string = "Nothing to see here" + + inspect_content.inspect_string( + GCLOUD_PROJECT, + test_string, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "No findings" in out + + +def test_inspect_file(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + + +def test_inspect_file_with_custom_info_types(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") + dictionaries = ["gary@somedomain.com"] + regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: CUSTOM_DICTIONARY_0" in out + assert "Info type: CUSTOM_REGEX_0" in out + + +def test_inspect_file_no_results(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "harmless.txt") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "No findings" in out + + +def test_inspect_image_file(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: PHONE_NUMBER" in out + + +def cancel_operation(out): + if "Inspection operation started" in out: + # Cancel the operation + operation_id = out.split( + "Inspection operation started: ")[1].split("\n")[0] + client = google.cloud.dlp_v2.DlpServiceClient() + client.cancel_dlp_job(operation_id) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "test.txt", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT + ) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file_with_custom_info_types( + bucket, topic_id, subscription_id, capsys): + try: + dictionaries = ["gary@somedomain.com"] + regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] + + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "test.txt", + topic_id, + subscription_id, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + timeout=TIMEOUT) + + out, _ = capsys.readouterr() + + assert "Info type: EMAIL_ADDRESS" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file_no_results( + bucket, topic_id, subscription_id, capsys): + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "harmless.txt", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT) + + out, _ = capsys.readouterr() + + assert "No findings" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_image_file(bucket, topic_id, 
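+        # cancel_operation (defined above) parses the operation id from the
+        # captured output and cancels the job so it is not left running
+        # after the assertion.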
subscription_id, capsys):
+    try:
+        inspect_content.inspect_gcs_file(
+            GCLOUD_PROJECT,
+            bucket.name,
+            "test.png",
+            topic_id,
+            subscription_id,
+            ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+            timeout=TIMEOUT)
+
+        out, _ = capsys.readouterr()
+        assert "Info type: EMAIL_ADDRESS" in out
+    finally:
+        cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys):
+    try:
+        inspect_content.inspect_gcs_file(
+            GCLOUD_PROJECT,
+            bucket.name,
+            "*",
+            topic_id,
+            subscription_id,
+            ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+            timeout=TIMEOUT)
+
+        out, _ = capsys.readouterr()
+
+        assert "Info type: EMAIL_ADDRESS" in out
+    finally:
+        cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_datastore(
+        datastore_project, topic_id, subscription_id, capsys):
+    try:
+        inspect_content.inspect_datastore(
+            GCLOUD_PROJECT,
+            datastore_project,
+            DATASTORE_KIND,
+            topic_id,
+            subscription_id,
+            ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+            timeout=TIMEOUT)
+
+        out, _ = capsys.readouterr()
+        assert "Info type: EMAIL_ADDRESS" in out
+    finally:
+        cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_datastore_no_results(
+        datastore_project, topic_id, subscription_id, capsys):
+    try:
+        inspect_content.inspect_datastore(
+            GCLOUD_PROJECT,
+            datastore_project,
+            DATASTORE_KIND,
+            topic_id,
+            subscription_id,
+            ["PHONE_NUMBER"],
+            timeout=TIMEOUT)
+
+        out, _ = capsys.readouterr()
+        assert "No findings" in out
+    finally:
+        cancel_operation(out)
+
+
+def test_inspect_bigquery(bigquery_project, topic_id, subscription_id, capsys):
+    try:
+        inspect_content.inspect_bigquery(
+            GCLOUD_PROJECT,
+            bigquery_project,
+            BIGQUERY_DATASET_ID,
+            BIGQUERY_TABLE_ID,
+            topic_id,
+            subscription_id,
+            ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+            timeout=1)
+
+        out, _ = capsys.readouterr()
+        assert "Inspection operation started" in out
+    finally:
+        cancel_operation(out)
diff --git a/packages/google-cloud-dlp/samples/snippets/jobs.py b/packages/google-cloud-dlp/samples/snippets/jobs.py
new file mode 100644
index 000000000000..a8ac0b43c5e0
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/jobs.py
@@ -0,0 +1,167 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app to list and delete DLP jobs using the Data Loss Prevention API."""
+
+from __future__ import print_function
+
+import argparse
+
+
+# [START dlp_list_jobs]
+def list_dlp_jobs(project, filter_string=None, job_type=None):
+    """Uses the Data Loss Prevention API to list DLP jobs that match the
+       specified filter in the request.
+    Args:
+        project: The project id to use as a parent resource.
+        filter: (Optional) Allows filtering.
+            Supported syntax:
+            * Filter expressions are made up of one or more restrictions.
+            * Restrictions can be combined by 'AND' or 'OR' logical operators.
+              A sequence of restrictions implicitly uses 'AND'.
+            * A restriction has the form of '<field> <operator> <value>'.
+ * Supported fields/values for inspect jobs: + - `state` - PENDING|RUNNING|CANCELED|FINISHED|FAILED + - `inspected_storage` - DATASTORE|CLOUD_STORAGE|BIGQUERY + - `trigger_name` - The resource name of the trigger that + created job. + * Supported fields for risk analysis jobs: + - `state` - RUNNING|CANCELED|FINISHED|FAILED + * The operator must be '=' or '!='. + Examples: + * inspected_storage = cloud_storage AND state = done + * inspected_storage = cloud_storage OR inspected_storage = bigquery + * inspected_storage = cloud_storage AND + (state = done OR state = canceled) + type: (Optional) The type of job. Defaults to 'INSPECT'. + Choices: + DLP_JOB_TYPE_UNSPECIFIED + INSPECT_JOB: The job inspected content for sensitive data. + RISK_ANALYSIS_JOB: The job executed a Risk Analysis computation. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Job type dictionary + job_type_to_int = { + "DLP_JOB_TYPE_UNSPECIFIED": + google.cloud.dlp.enums.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED, + "INSPECT_JOB": google.cloud.dlp.enums.DlpJobType.INSPECT_JOB, + "RISK_ANALYSIS_JOB": google.cloud.dlp.enums.DlpJobType.RISK_ANALYSIS_JOB, + } + # If job type is specified, convert job type to number through enums. + if job_type: + job_type = job_type_to_int[job_type] + + # Call the API to get a list of jobs. + response = dlp.list_dlp_jobs(parent, filter_=filter_string, type_=job_type) + + # Iterate over results. + for job in response: + print("Job: %s; status: %s" % (job.name, job.JobState.Name(job.state))) + + +# [END dlp_list_jobs] + + +# [START dlp_delete_job] +def delete_dlp_job(project, job_name): + """Uses the Data Loss Prevention API to delete a long-running DLP job. + Args: + project: The project id to use as a parent resource. + job_name: The name of the DlpJob resource to be deleted. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id and job name into a full resource id. + name = dlp.dlp_job_path(project, job_name) + + # Call the API to delete job. + dlp.delete_dlp_job(name) + + print("Successfully deleted %s" % job_name) + + +# [END dlp_delete_job] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + list_parser = subparsers.add_parser( + "list", + help="List Data Loss Prevention API jobs corresponding to a given " + "filter.", + ) + list_parser.add_argument( + "project", help="The project id to use as a parent resource." + ) + list_parser.add_argument( + "-f", + "--filter", + help="Filter expressions are made up of one or more restrictions.", + ) + list_parser.add_argument( + "-t", + "--type", + choices=[ + "DLP_JOB_TYPE_UNSPECIFIED", + "INSPECT_JOB", + "RISK_ANALYSIS_JOB", + ], + help='The type of job. API defaults to "INSPECT"', + ) + + delete_parser = subparsers.add_parser( + "delete", help="Delete results of a Data Loss Prevention API job." + ) + delete_parser.add_argument( + "project", help="The project id to use as a parent resource." 
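+        # Unlike the inspect/redact snippets, which default --project to the
+        # GOOGLE_CLOUD_PROJECT environment variable, this argument is
+        # positional and therefore required.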
+ ) + delete_parser.add_argument( + "job_name", + help="The name of the DlpJob resource to be deleted. " + "Example: X-#####", + ) + + args = parser.parse_args() + + if args.content == "list": + list_dlp_jobs( + args.project, filter_string=args.filter, job_type=args.type + ) + elif args.content == "delete": + delete_dlp_job(args.project, args.job_name) diff --git a/packages/google-cloud-dlp/samples/snippets/jobs_test.py b/packages/google-cloud-dlp/samples/snippets/jobs_test.py new file mode 100644 index 000000000000..89997bc5097c --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/jobs_test.py @@ -0,0 +1,89 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +import pytest + +import jobs + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +TEST_COLUMN_NAME = "zip_code" +TEST_TABLE_PROJECT_ID = "bigquery-public-data" +TEST_DATASET_ID = "san_francisco" +TEST_TABLE_ID = "bikeshare_trips" +test_job_id = "test-job-{}".format(uuid.uuid4()) + + +@pytest.fixture(scope="module") +def test_job_name(): + import google.cloud.dlp + + dlp = google.cloud.dlp_v2.DlpServiceClient() + + parent = dlp.project_path(GCLOUD_PROJECT) + + # Construct job request + risk_job = { + "privacy_metric": { + "categorical_stats_config": {"field": {"name": TEST_COLUMN_NAME}} + }, + "source_table": { + "project_id": TEST_TABLE_PROJECT_ID, + "dataset_id": TEST_DATASET_ID, + "table_id": TEST_TABLE_ID, + }, + } + + response = dlp.create_dlp_job(parent, risk_job=risk_job, job_id=test_job_id) + full_path = response.name + # API expects only job name, not full project path + job_name = full_path[full_path.rfind("/") + 1:] + yield job_name + + # clean up job if not deleted + try: + dlp.delete_dlp_job(full_path) + except google.api_core.exceptions.NotFound: + print("Issue during teardown, missing job") + + +def test_list_dlp_jobs(test_job_name, capsys): + jobs.list_dlp_jobs(GCLOUD_PROJECT) + + out, _ = capsys.readouterr() + assert test_job_name not in out + + +def test_list_dlp_jobs_with_filter(test_job_name, capsys): + jobs.list_dlp_jobs( + GCLOUD_PROJECT, + filter_string="state=RUNNING OR state=DONE", + job_type="RISK_ANALYSIS_JOB", + ) + + out, _ = capsys.readouterr() + assert test_job_name in out + + +def test_list_dlp_jobs_with_job_type(test_job_name, capsys): + jobs.list_dlp_jobs(GCLOUD_PROJECT, job_type="INSPECT_JOB") + + out, _ = capsys.readouterr() + assert test_job_name not in out # job created is a risk analysis job + + +def test_delete_dlp_job(test_job_name, capsys): + jobs.delete_dlp_job(GCLOUD_PROJECT, test_job_name) diff --git a/packages/google-cloud-dlp/samples/snippets/metadata.py b/packages/google-cloud-dlp/samples/snippets/metadata.py new file mode 100644 index 000000000000..7a65941d622a --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/metadata.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that queries the Data Loss Prevention API for supported +categories and info types.""" + +from __future__ import print_function + +import argparse + + +# [START dlp_list_info_types] +def list_info_types(language_code=None, result_filter=None): + """List types of sensitive information within a category. + Args: + language_code: The BCP-47 language code to use, e.g. 'en-US'. + filter: An optional filter to only return info types supported by + certain parts of the API. Defaults to "supported_by=INSPECT". + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Make the API call. + response = dlp.list_info_types(language_code, result_filter) + + # Print the results to the console. + print("Info types:") + for info_type in response.info_types: + print( + u"{name}: {display_name}".format( + name=info_type.name, display_name=info_type.display_name + ) + ) + + +# [END dlp_list_info_types] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--language_code", + help="The BCP-47 language code to use, e.g. 'en-US'.", + ) + parser.add_argument( + "--filter", + help="An optional filter to only return info types supported by " + 'certain parts of the API. Defaults to "supported_by=INSPECT".', + ) + + args = parser.parse_args() + + list_info_types( + language_code=args.language_code, result_filter=args.filter + ) diff --git a/packages/google-cloud-dlp/samples/snippets/metadata_test.py b/packages/google-cloud-dlp/samples/snippets/metadata_test.py new file mode 100644 index 000000000000..bde63fd3e8fb --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/metadata_test.py @@ -0,0 +1,22 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
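+
+# NOTE: metadata.list_info_types calls the live DLP API, so this test needs
+# valid application default credentials to run.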
+ +import metadata + + +def test_fetch_info_types(capsys): + metadata.list_info_types() + + out, _ = capsys.readouterr() + assert "EMAIL_ADDRESS" in out diff --git a/packages/google-cloud-dlp/samples/snippets/noxfile.py b/packages/google-cloud-dlp/samples/snippets/noxfile.py new file mode 100644 index 000000000000..ba55d7ce53ca --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/noxfile.py @@ -0,0 +1,224 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +from pathlib import Path +import sys + +import nox + + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +# Copy `noxfile_config.py` to your directory and modify it instead. + + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. + 'ignored_versions': ["2.7"], + + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + 'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT', + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + 'envs': {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append('.') + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars(): + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG['gcloud_project_env'] + # This should error out if not set. + ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG['envs']) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to tested samples. +ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] + +# Any default versions that should be ignored. +IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) +# +# Style Checks +# + + +def _determine_local_import_names(start_dir): + """Determines all import names that should be considered "local". 
+ + This is used when running the linter to insure that import order is + properly checked. + """ + file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] + return [ + basename + for basename, extension in file_ext_pairs + if extension == ".py" + or os.path.isdir(os.path.join(start_dir, basename)) + and basename not in ("__pycache__") + ] + + +# Linting with flake8. +# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--import-order-style=google", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session): + session.install("flake8", "flake8-import-order") + + local_names = _determine_local_import_names(".") + args = FLAKE8_COMMON_ARGS + [ + "--application-import-names", + ",".join(local_names), + "." + ] + session.run("flake8", *args) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests(session, post_install=None): + """Runs py.test for a particular project.""" + if os.path.exists("requirements.txt"): + session.install("-r", "requirements.txt") + + if os.path.exists("requirements-test.txt"): + session.install("-r", "requirements-test.txt") + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars() + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session): + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip("SKIPPED: {} tests are disabled for this sample.".format( + session.python + )) + + +# +# Readmegen +# + + +def _get_repo_root(): + """ Returns the root folder of the project. """ + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. 
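+    # Walk upward from the current working directory until a directory
+    # containing ".git" is found; that directory is taken to be the root.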
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session, path): + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/packages/google-cloud-dlp/samples/snippets/quickstart.py b/packages/google-cloud-dlp/samples/snippets/quickstart.py new file mode 100644 index 000000000000..ec929b45f541 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/quickstart.py @@ -0,0 +1,98 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that queries the Data Loss Prevention API for supported +categories and info types.""" + +from __future__ import print_function + +import argparse +import sys + + +def quickstart(project_id): + """Demonstrates use of the Data Loss Prevention API client library.""" + + # [START dlp_quickstart] + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp_client = google.cloud.dlp_v2.DlpServiceClient() + + # The string to inspect + content = "Robert Frost" + + # Construct the item to inspect. + item = {"value": content} + + # The info types to search for in the content. Required. + info_types = [{"name": "FIRST_NAME"}, {"name": "LAST_NAME"}] + + # The minimum likelihood to constitute a match. Optional. + min_likelihood = "LIKELIHOOD_UNSPECIFIED" + + # The maximum number of findings to report (0 = server maximum). Optional. + max_findings = 0 + + # Whether to include the matching string in the results. Optional. + include_quote = True + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # Convert the project id into a full resource id. + parent = dlp_client.project_path(project_id) + + # Call the API. + response = dlp_client.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + # Convert likelihood value to string respresentation. 
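+            # (Finding.DESCRIPTOR exposes the protobuf field metadata, and
+            # the enum_type lookup below maps the raw number to its label.)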
+            likelihood = (
+                google.cloud.dlp.types.Finding.DESCRIPTOR.fields_by_name[
+                    "likelihood"
+                ]
+                .enum_type.values_by_number[finding.likelihood]
+                .name
+            )
+            print("Likelihood: {}".format(likelihood))
+    else:
+        print("No findings.")
+    # [END dlp_quickstart]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "project_id", help="Enter your GCP project id.", type=str
+    )
+    args = parser.parse_args()
+    if len(sys.argv) == 1:
+        parser.print_usage()
+        sys.exit(1)
+    quickstart(args.project_id)
diff --git a/packages/google-cloud-dlp/samples/snippets/quickstart_test.py b/packages/google-cloud-dlp/samples/snippets/quickstart_test.py
new file mode 100644
index 000000000000..1814497c1660
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/quickstart_test.py
@@ -0,0 +1,37 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import google.cloud.dlp
+import mock
+
+import quickstart
+
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+
+
+def test_quickstart(capsys):
+    # Mock out project_path to use the test runner's project ID.
+    with mock.patch.object(
+        google.cloud.dlp.DlpServiceClient,
+        "project_path",
+        return_value="projects/{}".format(GCLOUD_PROJECT),
+    ):
+        quickstart.quickstart(GCLOUD_PROJECT)
+
+    out, _ = capsys.readouterr()
+    assert "FIRST_NAME" in out
+    assert "LAST_NAME" in out
diff --git a/packages/google-cloud-dlp/samples/snippets/redact.py b/packages/google-cloud-dlp/samples/snippets/redact.py
new file mode 100644
index 000000000000..8a1650a262db
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/redact.py
@@ -0,0 +1,255 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that uses the Data Loss Prevention API to redact the contents of
+an image file."""
+
+from __future__ import print_function
+
+import argparse
+
+# [START dlp_redact_image]
+import mimetypes
+
+# [END dlp_redact_image]
+import os
+
+
+# [START dlp_redact_image]
+
+
+def redact_image(
+    project,
+    filename,
+    output_filename,
+    info_types,
+    min_likelihood=None,
+    mime_type=None,
+):
+    """Uses the Data Loss Prevention API to redact protected data in an image.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        filename: The path to the file to inspect.
+        output_filename: The path to which the redacted image will be written.
+        info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + mime_type: The MIME type of the file. If not specified, the type is + inferred via the Python standard library's mimetypes module. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare image_redaction_configs, a list of dictionaries. Each dictionary + # contains an info_type and optionally the color used for the replacement. + # The color is omitted in this sample, so the default (black) will be used. + image_redaction_configs = [] + + if info_types is not None: + for info_type in info_types: + image_redaction_configs.append({"info_type": info_type}) + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "min_likelihood": min_likelihood, + "info_types": info_types, + } + + # If mime_type is not specified, guess it from the filename. + if mime_type is None: + mime_guess = mimetypes.MimeTypes().guess_type(filename) + mime_type = mime_guess[0] or "application/octet-stream" + + # Select the content type index from the list of supported types. + supported_content_types = { + None: 0, # "Unspecified" + "image/jpeg": 1, + "image/bmp": 2, + "image/png": 3, + "image/svg": 4, + "text/plain": 5, + } + content_type_index = supported_content_types.get(mime_type, 0) + + # Construct the byte_item, containing the file's byte data. + with open(filename, mode="rb") as f: + byte_item = {"type": content_type_index, "data": f.read()} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.redact_image( + parent, + inspect_config=inspect_config, + image_redaction_configs=image_redaction_configs, + byte_item=byte_item, + ) + + # Write out the results. + with open(output_filename, mode="wb") as f: + f.write(response.redacted_image) + print( + "Wrote {byte_count} to {filename}".format( + byte_count=len(response.redacted_image), filename=output_filename + ) + ) + + +# [END dlp_redact_image] + +# [START dlp_redact_image_all_text] + + +def redact_image_all_text( + project, + filename, + output_filename, +): + """Uses the Data Loss Prevention API to redact all text in an image. + + Args: + project: The Google Cloud project id to use as a parent resource. + filename: The path to the file to inspect. + output_filename: The path to which the redacted image will be written. + + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct the image_redaction_configs, indicating to DLP that all text in + # the input image should be redacted. + image_redaction_configs = [{ + "redact_all_text": True, + }] + + # Construct the byte_item, containing the file's byte data. + with open(filename, mode="rb") as f: + byte_item = {"type": "IMAGE", "data": f.read()} + + # Convert the project id into a full resource id. 
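+    # (project_path produces the "projects/<project-id>" string the API
+    # expects as the parent resource.)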
+ parent = dlp.project_path(project) + + # Call the API. + response = dlp.redact_image( + parent, + image_redaction_configs=image_redaction_configs, + byte_item=byte_item, + ) + + # Write out the results. + with open(output_filename, mode="wb") as f: + f.write(response.redacted_image) + + print("Wrote {byte_count} to {filename}".format( + byte_count=len(response.redacted_image), filename=output_filename)) + + +# [END dlp_redact_image_all_text] + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + common_args_parser = argparse.ArgumentParser(add_help=False) + common_args_parser.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + common_args_parser.add_argument( + "filename", help="The path to the file to inspect.") + common_args_parser.add_argument( + "output_filename", + help="The path to which the redacted image will be written.", + ) + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select which content should be redacted.") + subparsers.required = True + + info_types_parser = subparsers.add_parser( + "info_types", + help="Redact specific infoTypes from an image.", + parents=[common_args_parser], + ) + info_types_parser.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + info_types_parser.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + info_types_parser.add_argument( + "--mime_type", + help="The MIME type of the file. If not specified, the type is " + "inferred via the Python standard library's mimetypes module.", + ) + + all_text_parser = subparsers.add_parser( + "all_text", + help="Redact all text from an image. The MIME type of the file is " + "inferred via the Python standard library's mimetypes module.", + parents=[common_args_parser], + ) + + args = parser.parse_args() + + if args.content == "info_types": + redact_image( + args.project, + args.filename, + args.output_filename, + args.info_types, + min_likelihood=args.min_likelihood, + mime_type=args.mime_type, + ) + elif args.content == "all_text": + redact_image_all_text( + args.project, + args.filename, + args.output_filename, + ) diff --git a/packages/google-cloud-dlp/samples/snippets/redact_test.py b/packages/google-cloud-dlp/samples/snippets/redact_test.py new file mode 100644 index 000000000000..0cce514eb1a6 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/redact_test.py @@ -0,0 +1,60 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile + +import pytest + +import redact + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") + + +@pytest.fixture(scope="module") +def tempdir(): + tempdir = tempfile.mkdtemp() + yield tempdir + shutil.rmtree(tempdir) + + +def test_redact_image_file(tempdir, capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + output_filepath = os.path.join(tempdir, "redacted.png") + + redact.redact_image( + GCLOUD_PROJECT, + test_filepath, + output_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + ) + + out, _ = capsys.readouterr() + assert output_filepath in out + + +def test_redact_image_all_text(tempdir, capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + output_filepath = os.path.join(tempdir, "redacted.png") + + redact.redact_image_all_text( + GCLOUD_PROJECT, + test_filepath, + output_filepath, + ) + + out, _ = capsys.readouterr() + assert output_filepath in out diff --git a/packages/google-cloud-dlp/samples/snippets/requirements-test.txt b/packages/google-cloud-dlp/samples/snippets/requirements-test.txt new file mode 100644 index 000000000000..d0c01cc98c5f --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/requirements-test.txt @@ -0,0 +1,4 @@ +pytest==6.0.1 +flaky==3.7.0 +mock==4.0.2 + diff --git a/packages/google-cloud-dlp/samples/snippets/requirements.txt b/packages/google-cloud-dlp/samples/snippets/requirements.txt new file mode 100644 index 000000000000..08b72bbe1fdf --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/requirements.txt @@ -0,0 +1,5 @@ +google-cloud-dlp==1.0.0 +google-cloud-storage==1.30.0 +google-cloud-pubsub==1.7.0 +google-cloud-datastore==1.13.2 +google-cloud-bigquery==1.25.0 diff --git a/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt b/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt new file mode 100644 index 000000000000..2763cd0ab820 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt @@ -0,0 +1 @@ +My credit card number is 1234 5678 9012 3456, and my CVV is 789. \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/snippets/resources/dates.csv b/packages/google-cloud-dlp/samples/snippets/resources/dates.csv new file mode 100644 index 000000000000..056fccb328ea --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/resources/dates.csv @@ -0,0 +1,5 @@ +name,birth_date,register_date,credit_card +Ann,01/01/1970,07/21/1996,4532908762519852 +James,03/06/1988,04/09/2001,4301261899725540 +Dan,08/14/1945,11/15/2011,4620761856015295 +Laura,11/03/1992,01/04/2017,4564981067258901 \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt b/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt new file mode 100644 index 000000000000..5666de37ab23 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt @@ -0,0 +1 @@ +This file is mostly harmless. 
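The resource files above are the fixtures the tests upload and inspect. As an illustrative smoke check (not part of this change), the file-inspection snippet can be run directly against test.txt; this sketch assumes the snippets directory is the working directory and that GOOGLE_CLOUD_PROJECT and application default credentials are configured:

    import os

    import inspect_content

    # Inspect the local text fixture for the two info types it contains.
    inspect_content.inspect_file(
        os.environ["GOOGLE_CLOUD_PROJECT"],
        os.path.join("resources", "test.txt"),
        ["PHONE_NUMBER", "EMAIL_ADDRESS"],
        include_quote=True,
    )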
diff --git a/packages/google-cloud-dlp/samples/snippets/resources/test.png b/packages/google-cloud-dlp/samples/snippets/resources/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f32c825884261083b7d731676375303d49ca6f6
GIT binary patch
literal 21438
[binary PNG data omitted]
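test.png is the image fixture exercised by the image-inspection and redaction tests. As a hypothetical local run of the redaction snippet against it (again assuming the snippets directory is the working directory and credentials are configured):

    import os
    import tempfile

    import redact

    # Redact phone numbers and email addresses from the test image.
    output_path = os.path.join(tempfile.mkdtemp(), "redacted.png")
    redact.redact_image(
        os.environ["GOOGLE_CLOUD_PROJECT"],
        os.path.join("resources", "test.png"),
        output_path,
        ["PHONE_NUMBER", "EMAIL_ADDRESS"],
    )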
zxVvN!^*tYxMI&(JW75HR%f z!dG58_Z4Q!{Iv1m?N_Rv#6cAZS$>kf{LmjQp6l!xz8*3euAepZF7e!~)@RAC4*P2! zXHp7Q@%xYEs=`l4k8?NP=dJT-MbZ_g>o(2j=!>y5M$SmYh{Ycq{IlrI&-_w-O0~a2 zI2hGF9aD%FgN*KPiUs1RPc5X`7PG3jFbOi)YV!mY=5og?@zB`PQ(H)krn%0Dk_JRQOqzBh$Z^~#JvUs5?gsH6m;e=MJg zj)gWg^b(F}&Dgt!iGqlNs_qe+fDTUiQbRYOjT~%x%1TzZ-%04Qnf*=nKZ~VOG)V}u zHe!DuOLA?g{{|^>v$zF?1E(U}HGYYx@j5Agw7{sF!wQ z4wr6mIbisG7j|62703?(w~YEouD)^d+)W*nB{BWqiTY339I042YUD;=`l;ntsN2#Gj7lw8Z; z4`C`)M8l-xP*|opM*2?oDVdEM$IepnzpK{-2_hhxWgwSSv3FFMhGgC>NojG5AdHoJnLW_}GCo{oOi~*>$ z2qNgZ;jm7)RjNWDtf%XYhnNY#LWvTNjqMfl@&U5FgMg6&w}k({KmNCGb{vqo6h5RH zc13&2;tlbF!-)~xu$VC0f3F+8y`E3zqU|O5qu3i2CwW`%JsDo%DtxEV{{{5xpAT9z M)pS)Wl`TL24{iKN9{>OV literal 0 HcmV?d00001 diff --git a/packages/google-cloud-dlp/samples/snippets/resources/test.txt b/packages/google-cloud-dlp/samples/snippets/resources/test.txt new file mode 100644 index 000000000000..c2ee3815bc9b --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/resources/test.txt @@ -0,0 +1 @@ +My phone number is (223) 456-7890 and my email address is gary@somedomain.com. \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/snippets/risk.py b/packages/google-cloud-dlp/samples/snippets/risk.py new file mode 100644 index 000000000000..518f947eee6b --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/risk.py @@ -0,0 +1,947 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevent API to perform risk anaylsis.""" + +from __future__ import print_function + +import argparse + + +# [START dlp_numerical_stats] +def numerical_risk_analysis( + project, + table_project_id, + dataset_id, + table_id, + column_name, + topic_id, + subscription_id, + timeout=300, +): + """Uses the Data Loss Prevention API to compute risk metrics of a column + of numerical data in a Google BigQuery table. + Args: + project: The Google Cloud project id to use as a parent resource. + table_project_id: The Google Cloud project id where the BigQuery table + is stored. + dataset_id: The id of the dataset to inspect. + table_id: The id of the table to inspect. + column_name: The name of the column to compute risk metrics for. + topic_id: The name of the Pub/Sub topic to notify once the job + completes. + subscription_id: The name of the Pub/Sub subscription to use when + listening for job completion notifications. + timeout: The number of seconds to wait for a response from the API. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Location info of the BigQuery table. 
+ source_table = { + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, + } + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Configure risk analysis job + # Give the name of the numeric column to compute risk metrics for + risk_job = { + "privacy_metric": { + "numerical_stats_config": {"field": {"name": column_name}} + }, + "source_table": source_table, + "actions": actions, + } + + # Call API to start risk analysis job + operation = dlp.create_dlp_job(parent, risk_job=risk_job) + + def callback(message): + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + results = job.risk_details.numerical_stats_result + print( + "Value Range: [{}, {}]".format( + results.min_value.integer_value, + results.max_value.integer_value, + ) + ) + prev_value = None + for percent, result in enumerate(results.quantile_values): + value = result.integer_value + if prev_value != value: + print("Value at {}% quantile: {}".format(percent, value)) + prev_value = value + subscription.set_result(None) + else: + # This is not the message we're looking for. + message.drop() + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + subscription = subscriber.subscribe(subscription_path, callback) + + try: + subscription.result(timeout=timeout) + except TimeoutError: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + subscription.close() + + +# [END dlp_numerical_stats] + + +# [START dlp_categorical_stats] +def categorical_risk_analysis( + project, + table_project_id, + dataset_id, + table_id, + column_name, + topic_id, + subscription_id, + timeout=300, +): + """Uses the Data Loss Prevention API to compute risk metrics of a column + of categorical data in a Google BigQuery table. + Args: + project: The Google Cloud project id to use as a parent resource. + table_project_id: The Google Cloud project id where the BigQuery table + is stored. + dataset_id: The id of the dataset to inspect. + table_id: The id of the table to inspect. + column_name: The name of the column to compute risk metrics for. + topic_id: The name of the Pub/Sub topic to notify once the job + completes. + subscription_id: The name of the Pub/Sub subscription to use when + listening for job completion notifications. + timeout: The number of seconds to wait for a response from the API. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Location info of the BigQuery table. 
+    source_table = {
+        "project_id": table_project_id,
+        "dataset_id": dataset_id,
+        "table_id": table_id,
+    }
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    # Give the name of the categorical column to compute risk metrics for
+    risk_job = {
+        "privacy_metric": {
+            "categorical_stats_config": {"field": {"name": column_name}}
+        },
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
+            job = dlp.get_dlp_job(operation.name)
+            histogram_buckets = (
+                job.risk_details.categorical_stats_result.value_frequency_histogram_buckets  # noqa: E501
+            )
+            # Print bucket stats
+            for i, bucket in enumerate(histogram_buckets):
+                print("Bucket {}:".format(i))
+                print(
+                    "   Most common value occurs {} time(s)".format(
+                        bucket.value_frequency_upper_bound
+                    )
+                )
+                print(
+                    "   Least common value occurs {} time(s)".format(
+                        bucket.value_frequency_lower_bound
+                    )
+                )
+                print("   {} unique values total.".format(bucket.bucket_size))
+                for value in bucket.bucket_values:
+                    print(
+                        "   Value {} occurs {} time(s)".format(
+                            value.value.integer_value, value.count
+                        )
+                    )
+            subscription.set_result(None)
+        else:
+            # This is not the message we're looking for.
+            message.drop()
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+    subscription = subscriber.subscribe(subscription_path, callback)
+
+    try:
+        subscription.result(timeout=timeout)
+    except TimeoutError:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+        subscription.close()
+
+
+# [END dlp_categorical_stats]
+
+
+# [START dlp_k_anonymity]
+def k_anonymity_analysis(
+    project,
+    table_project_id,
+    dataset_id,
+    table_id,
+    topic_id,
+    subscription_id,
+    quasi_ids,
+    timeout=300,
+):
+    """Uses the Data Loss Prevention API to compute the k-anonymity of a
+       column set in a Google BigQuery table.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        table_project_id: The Google Cloud project id where the BigQuery table
+            is stored.
+        dataset_id: The id of the dataset to inspect.
+        table_id: The id of the table to inspect.
+        topic_id: The name of the Pub/Sub topic to notify once the job
+            completes.
+        subscription_id: The name of the Pub/Sub subscription to use when
+            listening for job completion notifications.
+        quasi_ids: A set of columns that form a composite key.
+        timeout: The number of seconds to wait for a response from the API.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # Create helper function for unpacking values
+    def get_values(obj):
+        return int(obj.integer_value)
+
+    # Instantiate a client.
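+    # Note: the versioned client lives in the google.cloud.dlp_v2 module;
+    # importing google.cloud.dlp above also makes that module accessible.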
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+    parent = dlp.location_path(project, 'global')
+
+    # Location info of the BigQuery table.
+    source_table = {
+        "project_id": table_project_id,
+        "dataset_id": dataset_id,
+        "table_id": table_id,
+    }
+
+    # Convert quasi id list to Protobuf type
+    def map_fields(field):
+        return {"name": field}
+
+    quasi_ids = map(map_fields, quasi_ids)
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    # Give the names of the quasi-identifier columns to compute risk metrics for
+    risk_job = {
+        "privacy_metric": {"k_anonymity_config": {"quasi_ids": quasi_ids}},
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
+            job = dlp.get_dlp_job(operation.name)
+            histogram_buckets = (
+                job.risk_details.k_anonymity_result.equivalence_class_histogram_buckets
+            )
+            # Print bucket stats
+            for i, bucket in enumerate(histogram_buckets):
+                print("Bucket {}:".format(i))
+                if bucket.equivalence_class_size_lower_bound:
+                    print(
+                        "   Bucket size range: [{}, {}]".format(
+                            bucket.equivalence_class_size_lower_bound,
+                            bucket.equivalence_class_size_upper_bound,
+                        )
+                    )
+                    for value_bucket in bucket.bucket_values:
+                        print(
+                            "   Quasi-ID values: {}".format(
+                                map(get_values, value_bucket.quasi_ids_values)
+                            )
+                        )
+                        print(
+                            "   Class size: {}".format(
+                                value_bucket.equivalence_class_size
+                            )
+                        )
+            subscription.set_result(None)
+        else:
+            # This is not the message we're looking for.
+            message.drop()
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+    subscription = subscriber.subscribe(subscription_path, callback)
+
+    try:
+        subscription.result(timeout=timeout)
+    except TimeoutError:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+        subscription.close()
+
+
+# [END dlp_k_anonymity]
+
+
+# [START dlp_l_diversity]
+def l_diversity_analysis(
+    project,
+    table_project_id,
+    dataset_id,
+    table_id,
+    topic_id,
+    subscription_id,
+    sensitive_attribute,
+    quasi_ids,
+    timeout=300,
+):
+    """Uses the Data Loss Prevention API to compute the l-diversity of a
+       column set in a Google BigQuery table.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        table_project_id: The Google Cloud project id where the BigQuery table
+            is stored.
+        dataset_id: The id of the dataset to inspect.
+        table_id: The id of the table to inspect.
+        topic_id: The name of the Pub/Sub topic to notify once the job
+            completes.
+        subscription_id: The name of the Pub/Sub subscription to use when
+            listening for job completion notifications.
+        sensitive_attribute: The column to measure l-diversity relative to.
+        quasi_ids: A set of columns that form a composite key.
+        timeout: The number of seconds to wait for a response from the API.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # Create helper function for unpacking values
+    def get_values(obj):
+        return int(obj.integer_value)
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+    parent = dlp.location_path(project, 'global')
+
+    # Location info of the BigQuery table.
+    source_table = {
+        "project_id": table_project_id,
+        "dataset_id": dataset_id,
+        "table_id": table_id,
+    }
+
+    # Convert quasi id list to Protobuf type
+    def map_fields(field):
+        return {"name": field}
+
+    quasi_ids = map(map_fields, quasi_ids)
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    # Give the names of the quasi-identifier columns and the sensitive column
+    # to compute risk metrics for
+    risk_job = {
+        "privacy_metric": {
+            "l_diversity_config": {
+                "quasi_ids": quasi_ids,
+                "sensitive_attribute": {"name": sensitive_attribute},
+            }
+        },
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
+            job = dlp.get_dlp_job(operation.name)
+            histogram_buckets = (
+                job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets  # noqa: E501
+            )
+            # Print bucket stats
+            for i, bucket in enumerate(histogram_buckets):
+                print("Bucket {}:".format(i))
+                print(
+                    "   Bucket size range: [{}, {}]".format(
+                        bucket.sensitive_value_frequency_lower_bound,
+                        bucket.sensitive_value_frequency_upper_bound,
+                    )
+                )
+                for value_bucket in bucket.bucket_values:
+                    print(
+                        "   Quasi-ID values: {}".format(
+                            map(get_values, value_bucket.quasi_ids_values)
+                        )
+                    )
+                    print(
+                        "   Class size: {}".format(
+                            value_bucket.equivalence_class_size
+                        )
+                    )
+                    for value in value_bucket.top_sensitive_values:
+                        print(
+                            (
+                                "   Sensitive value {} occurs {} time(s)".format(
+                                    value.value, value.count
+                                )
+                            )
+                        )
+            subscription.set_result(None)
+        else:
+            # This is not the message we're looking for.
+            message.drop()
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+    subscription = subscriber.subscribe(subscription_path, callback)
+
+    try:
+        subscription.result(timeout=timeout)
+    except TimeoutError:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+        subscription.close()
+
+
+# [END dlp_l_diversity]
+
+
+# [START dlp_k_map]
+def k_map_estimate_analysis(
+    project,
+    table_project_id,
+    dataset_id,
+    table_id,
+    topic_id,
+    subscription_id,
+    quasi_ids,
+    info_types,
+    region_code="US",
+    timeout=300,
+):
+    """Uses the Data Loss Prevention API to compute the k-map risk estimation
+       of a column set in a Google BigQuery table.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        table_project_id: The Google Cloud project id where the BigQuery table
+            is stored.
+        dataset_id: The id of the dataset to inspect.
+        table_id: The id of the table to inspect.
+        topic_id: The name of the Pub/Sub topic to notify once the job
+            completes.
+        subscription_id: The name of the Pub/Sub subscription to use when
+            listening for job completion notifications.
+        quasi_ids: A set of columns that form a composite key and optionally
+            their reidentification distributions.
+        info_types: Type of information of the quasi_id in order to provide a
+            statistical model of population.
+        region_code: The ISO 3166-1 region code that the data is representative
+            of. Can be omitted if using a region-specific infoType (such as
+            US_ZIP_5).
+        timeout: The number of seconds to wait for a response from the API.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # Create helper function for unpacking values
+    def get_values(obj):
+        return int(obj.integer_value)
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into full resource ids.
+    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+    parent = dlp.location_path(project, 'global')
+
+    # Location info of the BigQuery table.
+    source_table = {
+        "project_id": table_project_id,
+        "dataset_id": dataset_id,
+        "table_id": table_id,
+    }
+
+    # Check that numbers of quasi-ids and info types are equal
+    if len(quasi_ids) != len(info_types):
+        raise ValueError(
+            """Number of infoTypes and number of quasi-identifiers
+                must be equal!"""
+        )
+
+    # Convert quasi id list to Protobuf type
+    def map_fields(quasi_id, info_type):
+        return {"field": {"name": quasi_id}, "info_type": {"name": info_type}}
+
+    quasi_ids = map(map_fields, quasi_ids, info_types)
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    # Give the names of the quasi-identifier columns and their corresponding
+    # info types to compute risk metrics for
+    risk_job = {
+        "privacy_metric": {
+            "k_map_estimation_config": {
+                "quasi_ids": quasi_ids,
+                "region_code": region_code,
+            }
+        },
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
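+            # The k-map result is a histogram of estimated re-identifiability:
+            # each bucket covers a range of anonymity values and lists sample
+            # quasi-ID combinations whose estimated anonymity falls in that
+            # range.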
+ job = dlp.get_dlp_job(operation.name) + histogram_buckets = ( + job.risk_details.k_map_estimation_result.k_map_estimation_histogram + ) + # Print bucket stats + for i, bucket in enumerate(histogram_buckets): + print("Bucket {}:".format(i)) + print( + " Anonymity range: [{}, {}]".format( + bucket.min_anonymity, bucket.max_anonymity + ) + ) + print(" Size: {}".format(bucket.bucket_size)) + for value_bucket in bucket.bucket_values: + print( + " Values: {}".format( + map(get_values, value_bucket.quasi_ids_values) + ) + ) + print( + " Estimated k-map anonymity: {}".format( + value_bucket.estimated_anonymity + ) + ) + subscription.set_result(None) + else: + # This is not the message we're looking for. + message.drop() + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + subscription = subscriber.subscribe(subscription_path, callback) + + try: + subscription.result(timeout=timeout) + except TimeoutError: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + subscription.close() + + +# [END dlp_k_map] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + numerical_parser = subparsers.add_parser("numerical", help="") + numerical_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + numerical_parser.add_argument( + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) + numerical_parser.add_argument( + "dataset_id", help="The id of the dataset to inspect." + ) + numerical_parser.add_argument( + "table_id", help="The id of the table to inspect." + ) + numerical_parser.add_argument( + "column_name", + help="The name of the column to compute risk metrics for.", + ) + numerical_parser.add_argument( + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) + numerical_parser.add_argument( + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) + numerical_parser.add_argument( + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) + + categorical_parser = subparsers.add_parser("categorical", help="") + categorical_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + categorical_parser.add_argument( + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) + categorical_parser.add_argument( + "dataset_id", help="The id of the dataset to inspect." + ) + categorical_parser.add_argument( + "table_id", help="The id of the table to inspect." 
+ ) + categorical_parser.add_argument( + "column_name", + help="The name of the column to compute risk metrics for.", + ) + categorical_parser.add_argument( + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) + categorical_parser.add_argument( + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) + categorical_parser.add_argument( + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) + + k_anonymity_parser = subparsers.add_parser( + "k_anonymity", + help="Computes the k-anonymity of a column set in a Google BigQuery" + "table.", + ) + k_anonymity_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + k_anonymity_parser.add_argument( + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) + k_anonymity_parser.add_argument( + "dataset_id", help="The id of the dataset to inspect." + ) + k_anonymity_parser.add_argument( + "table_id", help="The id of the table to inspect." + ) + k_anonymity_parser.add_argument( + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) + k_anonymity_parser.add_argument( + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) + k_anonymity_parser.add_argument( + "quasi_ids", + nargs="+", + help="A set of columns that form a composite key.", + ) + k_anonymity_parser.add_argument( + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) + + l_diversity_parser = subparsers.add_parser( + "l_diversity", + help="Computes the l-diversity of a column set in a Google BigQuery" + "table.", + ) + l_diversity_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + l_diversity_parser.add_argument( + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) + l_diversity_parser.add_argument( + "dataset_id", help="The id of the dataset to inspect." + ) + l_diversity_parser.add_argument( + "table_id", help="The id of the table to inspect." + ) + l_diversity_parser.add_argument( + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) + l_diversity_parser.add_argument( + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) + l_diversity_parser.add_argument( + "sensitive_attribute", + help="The column to measure l-diversity relative to.", + ) + l_diversity_parser.add_argument( + "quasi_ids", + nargs="+", + help="A set of columns that form a composite key.", + ) + l_diversity_parser.add_argument( + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) + + k_map_parser = subparsers.add_parser( + "k_map", + help="Computes the k-map risk estimation of a column set in a Google" + "BigQuery table.", + ) + k_map_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + k_map_parser.add_argument( + "table_project_id", + help="The Google Cloud project id where the BigQuery table is stored.", + ) + k_map_parser.add_argument( + "dataset_id", help="The id of the dataset to inspect." + ) + k_map_parser.add_argument( + "table_id", help="The id of the table to inspect." 
+ ) + k_map_parser.add_argument( + "topic_id", + help="The name of the Pub/Sub topic to notify once the job completes.", + ) + k_map_parser.add_argument( + "subscription_id", + help="The name of the Pub/Sub subscription to use when listening for" + "job completion notifications.", + ) + k_map_parser.add_argument( + "quasi_ids", + nargs="+", + help="A set of columns that form a composite key.", + ) + k_map_parser.add_argument( + "-t", + "--info-types", + nargs="+", + help="Type of information of the quasi_id in order to provide a" + "statistical model of population.", + required=True, + ) + k_map_parser.add_argument( + "-r", + "--region-code", + default="US", + help="The ISO 3166-1 region code that the data is representative of.", + ) + k_map_parser.add_argument( + "--timeout", + type=int, + help="The number of seconds to wait for a response from the API.", + ) + + args = parser.parse_args() + + if args.content == "numerical": + numerical_risk_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.column_name, + args.topic_id, + args.subscription_id, + timeout=args.timeout, + ) + elif args.content == "categorical": + categorical_risk_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.column_name, + args.topic_id, + args.subscription_id, + timeout=args.timeout, + ) + elif args.content == "k_anonymity": + k_anonymity_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.topic_id, + args.subscription_id, + args.quasi_ids, + timeout=args.timeout, + ) + elif args.content == "l_diversity": + l_diversity_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.topic_id, + args.subscription_id, + args.sensitive_attribute, + args.quasi_ids, + timeout=args.timeout, + ) + elif args.content == "k_map": + k_map_estimate_analysis( + args.project, + args.table_project_id, + args.dataset_id, + args.table_id, + args.topic_id, + args.subscription_id, + args.quasi_ids, + args.info_types, + region_code=args.region_code, + timeout=args.timeout, + ) diff --git a/packages/google-cloud-dlp/samples/snippets/risk_test.py b/packages/google-cloud-dlp/samples/snippets/risk_test.py new file mode 100644 index 000000000000..25d9575d4b0f --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/risk_test.py @@ -0,0 +1,368 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
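+# Note: these tests create real BigQuery and Pub/Sub resources in the
+# project named by GOOGLE_CLOUD_PROJECT and run live DLP jobs, so they
+# require a billable project and suitable credentials.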
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.bigquery
+import google.cloud.pubsub
+import pytest
+
+import risk
+
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+GCLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
+TABLE_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
+TOPIC_ID = "dlp-test" + UNIQUE_STRING
+SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING
+UNIQUE_FIELD = "Name"
+REPEATED_FIELD = "Mystery"
+NUMERIC_FIELD = "Age"
+STRING_BOOLEAN_FIELD = "Gender"
+
+BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING
+BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING
+BIGQUERY_HARMFUL_TABLE_ID = "harmful" + UNIQUE_STRING
+
+TIMEOUT = 120  # 2 minutes
+
+
+# Create new custom topic/subscription.
+# We have occasionally seen every test in this file fail, seemingly because
+# the DLP service loses its connection to the topic; recreating the Pub/Sub
+# resources for each test module mitigates this.
+@pytest.fixture(scope="module")
+def topic_id():
+    # Creates a pubsub topic, and tears it down.
+    publisher = google.cloud.pubsub.PublisherClient()
+    topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID)
+    try:
+        publisher.create_topic(topic_path)
+    except google.api_core.exceptions.AlreadyExists:
+        pass
+
+    yield TOPIC_ID
+
+    publisher.delete_topic(topic_path)
+
+
+@pytest.fixture(scope="module")
+def subscription_id(topic_id):
+    # Subscribes to a topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id)
+    subscription_path = subscriber.subscription_path(
+        GCLOUD_PROJECT, SUBSCRIPTION_ID
+    )
+    try:
+        subscriber.create_subscription(subscription_path, topic_path)
+    except google.api_core.exceptions.AlreadyExists:
+        pass
+
+    yield SUBSCRIPTION_ID
+
+    subscriber.delete_subscription(subscription_path)
+
+
+@pytest.fixture(scope="module")
+def bigquery_project():
+    # Adds test BigQuery data, yields the project ID and then tears down.
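+    # The fixture creates two tables: a harmless one and a "harmful" one
+    # seeded with fake PII for the risk metrics to report on.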
+ + bigquery_client = google.cloud.bigquery.Client() + + dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID) + dataset = google.cloud.bigquery.Dataset(dataset_ref) + try: + dataset = bigquery_client.create_dataset(dataset) + except google.api_core.exceptions.Conflict: + dataset = bigquery_client.get_dataset(dataset) + table_ref = dataset_ref.table(BIGQUERY_TABLE_ID) + table = google.cloud.bigquery.Table(table_ref) + + harmful_table_ref = dataset_ref.table(BIGQUERY_HARMFUL_TABLE_ID) + harmful_table = google.cloud.bigquery.Table(harmful_table_ref) + + table.schema = ( + google.cloud.bigquery.SchemaField("Name", "STRING"), + google.cloud.bigquery.SchemaField("Comment", "STRING"), + ) + + harmful_table.schema = ( + google.cloud.bigquery.SchemaField("Name", "STRING", "REQUIRED"), + google.cloud.bigquery.SchemaField( + "TelephoneNumber", "STRING", "REQUIRED" + ), + google.cloud.bigquery.SchemaField("Mystery", "STRING", "REQUIRED"), + google.cloud.bigquery.SchemaField("Age", "INTEGER", "REQUIRED"), + google.cloud.bigquery.SchemaField("Gender", "STRING"), + google.cloud.bigquery.SchemaField("RegionCode", "STRING"), + ) + + try: + table = bigquery_client.create_table(table) + except google.api_core.exceptions.Conflict: + table = bigquery_client.get_table(table) + + try: + harmful_table = bigquery_client.create_table(harmful_table) + except google.api_core.exceptions.Conflict: + harmful_table = bigquery_client.get_table(harmful_table) + + rows_to_insert = [(u"Gary Smith", u"My email is gary@example.com")] + harmful_rows_to_insert = [ + ( + u"Gandalf", + u"(123) 456-7890", + "4231 5555 6781 9876", + 27, + "Male", + "US", + ), + ( + u"Dumbledore", + u"(313) 337-1337", + "6291 8765 1095 7629", + 27, + "Male", + "US", + ), + (u"Joe", u"(452) 123-1234", "3782 2288 1166 3030", 35, "Male", "US"), + (u"James", u"(567) 890-1234", "8291 3627 8250 1234", 19, "Male", "US"), + ( + u"Marie", + u"(452) 123-1234", + "8291 3627 8250 1234", + 35, + "Female", + "US", + ), + ( + u"Carrie", + u"(567) 890-1234", + "2253 5218 4251 4526", + 35, + "Female", + "US", + ), + ] + + bigquery_client.insert_rows(table, rows_to_insert) + bigquery_client.insert_rows(harmful_table, harmful_rows_to_insert) + yield GCLOUD_PROJECT + + bigquery_client.delete_dataset(dataset_ref, delete_contents=True) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_numerical_risk_analysis( + topic_id, subscription_id, bigquery_project, capsys +): + risk.numerical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + NUMERIC_FIELD, + topic_id, + subscription_id, + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Value Range:" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_categorical_risk_analysis_on_string_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.categorical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + UNIQUE_FIELD, + topic_id, + subscription_id, + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Most common value occurs" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_categorical_risk_analysis_on_number_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.categorical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + NUMERIC_FIELD, + topic_id, + subscription_id, + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Most common value occurs" in out 
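+# The k-anonymity, l-diversity and k-map tests below follow the same pattern:
+# run the sample against the harmful table and assert on the printed report.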
+ + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_anonymity_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_anonymity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_anonymity_analysis_multiple_fields( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_anonymity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, REPEATED_FIELD], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_l_diversity_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.l_diversity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + UNIQUE_FIELD, + [NUMERIC_FIELD], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Sensitive value" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_l_diversity_analysis_multiple_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.l_diversity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + UNIQUE_FIELD, + [NUMERIC_FIELD, REPEATED_FIELD], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Sensitive value" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_map_estimate_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD], + ["AGE"], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Anonymity range:" in out + assert "Size:" in out + assert "Values" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_map_estimate_analysis_multiple_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], + ["AGE", "GENDER"], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Anonymity range:" in out + assert "Size:" in out + assert "Values" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_map_estimate_analysis_quasi_ids_info_types_equal( + topic_id, subscription_id, bigquery_project +): + with pytest.raises(ValueError): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], + ["AGE"], + timeout=TIMEOUT, + ) diff --git a/packages/google-cloud-dlp/samples/snippets/templates.py b/packages/google-cloud-dlp/samples/snippets/templates.py new file mode 100644 index 000000000000..2d9f8137d5d1 --- /dev/null +++ 
b/packages/google-cloud-dlp/samples/snippets/templates.py @@ -0,0 +1,266 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that sets up Data Loss Prevention API inspect templates.""" + +from __future__ import print_function + +import argparse +import os +import time + + +# [START dlp_create_template] +def create_inspect_template( + project, + info_types, + template_id=None, + display_name=None, + min_likelihood=None, + max_findings=None, + include_quote=None, +): + """Creates a Data Loss Prevention API inspect template. + Args: + project: The Google Cloud project id to use as a parent resource. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + template_id: The id of the template. If omitted, an id will be randomly + generated. + display_name: The optional display name of the template. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + inspect_template = { + "inspect_config": inspect_config, + "display_name": display_name, + } + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.create_inspect_template( + parent, inspect_template=inspect_template, template_id=template_id + ) + + print("Successfully created template {}".format(response.name)) + + +# [END dlp_create_template] + + +# [START dlp_list_templates] +def list_inspect_templates(project): + """Lists all Data Loss Prevention API inspect templates. + Args: + project: The Google Cloud project id to use as a parent resource. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. 
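+    # list_inspect_templates returns an iterable of templates; iterating it
+    # pages through results transparently.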
+ response = dlp.list_inspect_templates(parent) + + # Define a helper function to convert the API's "seconds since the epoch" + # time format into a human-readable string. + def human_readable_time(timestamp): + return str(time.localtime(timestamp.seconds)) + + for template in response: + print("Template {}:".format(template.name)) + if template.display_name: + print(" Display Name: {}".format(template.display_name)) + print( + " Created: {}".format(human_readable_time(template.create_time)) + ) + print( + " Updated: {}".format(human_readable_time(template.update_time)) + ) + + config = template.inspect_config + print( + " InfoTypes: {}".format( + ", ".join([it.name for it in config.info_types]) + ) + ) + print(" Minimum likelihood: {}".format(config.min_likelihood)) + print(" Include quotes: {}".format(config.include_quote)) + print( + " Max findings per request: {}".format( + config.limits.max_findings_per_request + ) + ) + + +# [END dlp_list_templates] + + +# [START dlp_delete_template] +def delete_inspect_template(project, template_id): + """Deletes a Data Loss Prevention API template. + Args: + project: The id of the Google Cloud project which owns the template. + template_id: The id of the template to delete. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Combine the template id with the parent id. + template_resource = "{}/inspectTemplates/{}".format(parent, template_id) + + # Call the API. + dlp.delete_inspect_template(template_resource) + + print("Template {} successfully deleted.".format(template_resource)) + + +# [END dlp_delete_template] + + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="action", help="Select which action to perform." + ) + subparsers.required = True + + parser_create = subparsers.add_parser("create", help="Create a template.") + parser_create.add_argument( + "--template_id", + help="The id of the template. If omitted, an id will be randomly " + "generated", + ) + parser_create.add_argument( + "--display_name", help="The optional display name of the template." + ) + parser_create.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_create.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_create.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_create.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_create.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_list = subparsers.add_parser("list", help="List all templates.") + parser_list.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + parser_delete = subparsers.add_parser("delete", help="Delete a template.") + parser_delete.add_argument( + "template_id", help="The id of the template to delete." + ) + parser_delete.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + args = parser.parse_args() + + if args.action == "create": + create_inspect_template( + args.project, + args.info_types, + template_id=args.template_id, + display_name=args.display_name, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.action == "list": + list_inspect_templates(args.project) + elif args.action == "delete": + delete_inspect_template(args.project, args.template_id) diff --git a/packages/google-cloud-dlp/samples/snippets/templates_test.py b/packages/google-cloud-dlp/samples/snippets/templates_test.py new file mode 100644 index 000000000000..f8d22118bfcd --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/templates_test.py @@ -0,0 +1,60 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +import google.api_core.exceptions +import google.cloud.storage + +import templates + +UNIQUE_STRING = str(uuid.uuid4()).split("-")[0] +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +TEST_TEMPLATE_ID = "test-template" + UNIQUE_STRING + + +def test_create_list_and_delete_template(capsys): + try: + templates.create_inspect_template( + GCLOUD_PROJECT, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + template_id=TEST_TEMPLATE_ID, + ) + except google.api_core.exceptions.InvalidArgument: + # Template already exists, perhaps due to a previous interrupted test. + templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + # Try again and move on. 
+ templates.create_inspect_template( + GCLOUD_PROJECT, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + template_id=TEST_TEMPLATE_ID, + ) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + templates.list_inspect_templates(GCLOUD_PROJECT) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out diff --git a/packages/google-cloud-dlp/samples/snippets/triggers.py b/packages/google-cloud-dlp/samples/snippets/triggers.py new file mode 100644 index 000000000000..7548ab893db8 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/triggers.py @@ -0,0 +1,297 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that sets up Data Loss Prevention API automation triggers.""" + +from __future__ import print_function + +import argparse +import os +import time + + +# [START dlp_create_trigger] +def create_trigger( + project, + bucket, + scan_period_days, + info_types, + trigger_id=None, + display_name=None, + description=None, + min_likelihood=None, + max_findings=None, + auto_populate_timespan=False, +): + """Creates a scheduled Data Loss Prevention API inspect_content trigger. + Args: + project: The Google Cloud project id to use as a parent resource. + bucket: The name of the GCS bucket to scan. This sample scans all + files in the bucket using a wildcard. + scan_period_days: How often to repeat the scan, in days. + The minimum is 1 day. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + trigger_id: The id of the trigger. If omitted, an id will be randomly + generated. + display_name: The optional display name of the trigger. + description: The optional description of the trigger. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + auto_populate_timespan: Automatically populates time span config start + and end times in order to scan new content only. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a cloud_storage_options dictionary with the bucket's URL. 
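+    # The trailing wildcard matches every object in the bucket; a narrower
+    # pattern (for example gs://<bucket>/reports/*) could be used to limit
+    # the scan.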
+    url = "gs://{}/*".format(bucket)
+    storage_config = {
+        "cloud_storage_options": {"file_set": {"url": url}},
+        # Time-based configuration for each storage object.
+        "timespan_config": {
+            # Auto-populate start and end times in order to scan new objects
+            # only.
+            "enable_auto_population_of_timespan_config": auto_populate_timespan
+        },
+    }
+
+    # Construct the job definition.
+    job = {"inspect_config": inspect_config, "storage_config": storage_config}
+
+    # Construct the schedule definition:
+    schedule = {
+        "recurrence_period_duration": {
+            "seconds": scan_period_days * 60 * 60 * 24
+        }
+    }
+
+    # Construct the trigger definition.
+    job_trigger = {
+        "inspect_job": job,
+        "display_name": display_name,
+        "description": description,
+        "triggers": [{"schedule": schedule}],
+        "status": "HEALTHY",
+    }
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.create_job_trigger(
+        parent, job_trigger=job_trigger, trigger_id=trigger_id
+    )
+
+    print("Successfully created trigger {}".format(response.name))
+
+
+# [END dlp_create_trigger]
+
+
+# [START dlp_list_triggers]
+def list_triggers(project):
+    """Lists all Data Loss Prevention API triggers.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.list_job_triggers(parent)
+
+    # Define a helper function to convert the API's "seconds since the epoch"
+    # time format into a human-readable string.
+    def human_readable_time(timestamp):
+        return str(time.localtime(timestamp.seconds))
+
+    for trigger in response:
+        print("Trigger {}:".format(trigger.name))
+        print("  Created: {}".format(human_readable_time(trigger.create_time)))
+        print("  Updated: {}".format(human_readable_time(trigger.update_time)))
+        if trigger.display_name:
+            print("  Display Name: {}".format(trigger.display_name))
+        if trigger.description:
+            print("  Description: {}".format(trigger.description))
+        print("  Status: {}".format(trigger.status))
+        print("  Error count: {}".format(len(trigger.errors)))
+
+
+# [END dlp_list_triggers]
+
+
+# [START dlp_delete_trigger]
+def delete_trigger(project, trigger_id):
+    """Deletes a Data Loss Prevention API trigger.
+    Args:
+        project: The id of the Google Cloud project which owns the trigger.
+        trigger_id: The id of the trigger to delete.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Combine the trigger id with the parent id.
+    trigger_resource = "{}/jobTriggers/{}".format(parent, trigger_id)
+
+    # Call the API.
+    dlp.delete_job_trigger(trigger_resource)
+
+    print("Trigger {} successfully deleted.".format(trigger_resource))
+
+
+# [END dlp_delete_trigger]
+
+
+if __name__ == "__main__":
+    default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(
+        dest="action", help="Select which action to perform."
+ ) + subparsers.required = True + + parser_create = subparsers.add_parser("create", help="Create a trigger.") + parser_create.add_argument( + "bucket", help="The name of the GCS bucket containing the file." + ) + parser_create.add_argument( + "scan_period_days", + type=int, + help="How often to repeat the scan, in days. The minimum is 1 day.", + ) + parser_create.add_argument( + "--trigger_id", + help="The id of the trigger. If omitted, an id will be randomly " + "generated", + ) + parser_create.add_argument( + "--display_name", help="The optional display name of the trigger." + ) + parser_create.add_argument( + "--description", help="The optional description of the trigger." + ) + parser_create.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_create.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_create.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_create.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_create.add_argument( + "--auto_populate_timespan", + type=bool, + help="Limit scan to new content only.", + ) + + parser_list = subparsers.add_parser("list", help="List all triggers.") + parser_list.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + parser_delete = subparsers.add_parser("delete", help="Delete a trigger.") + parser_delete.add_argument( + "trigger_id", help="The id of the trigger to delete." + ) + parser_delete.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + args = parser.parse_args() + + if args.action == "create": + create_trigger( + args.project, + args.bucket, + args.scan_period_days, + args.info_types, + trigger_id=args.trigger_id, + display_name=args.display_name, + description=args.description, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + auto_populate_timespan=args.auto_populate_timespan, + ) + elif args.action == "list": + list_triggers(args.project) + elif args.action == "delete": + delete_trigger(args.project, args.trigger_id) diff --git a/packages/google-cloud-dlp/samples/snippets/triggers_test.py b/packages/google-cloud-dlp/samples/snippets/triggers_test.py new file mode 100644 index 000000000000..dc219d88c7a9 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/triggers_test.py @@ -0,0 +1,103 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/packages/google-cloud-dlp/samples/snippets/triggers_test.py b/packages/google-cloud-dlp/samples/snippets/triggers_test.py
new file mode 100644
index 000000000000..dc219d88c7a9
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/triggers_test.py
@@ -0,0 +1,103 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.storage
+
+import pytest
+
+import triggers
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"]
+TEST_TRIGGER_ID = "test-trigger" + UNIQUE_STRING
+
+
+@pytest.fixture(scope="module")
+def bucket():
+    # Creates a GCS bucket, uploads files required for the test, and tears down
+    # the entire bucket afterwards.
+
+    client = google.cloud.storage.Client()
+    try:
+        bucket = client.get_bucket(TEST_BUCKET_NAME)
+    except google.cloud.exceptions.NotFound:
+        bucket = client.create_bucket(TEST_BUCKET_NAME)
+
+    # Upload the blobs and keep track of them in a list.
+    blobs = []
+    for name in RESOURCE_FILE_NAMES:
+        path = os.path.join(RESOURCE_DIRECTORY, name)
+        blob = bucket.blob(name)
+        blob.upload_from_filename(path)
+        blobs.append(blob)
+
+    # Yield the object to the test; lines after this execute as a teardown.
+    yield bucket
+
+    # Delete the files.
+    for blob in blobs:
+        try:
+            blob.delete()
+        except google.cloud.exceptions.NotFound:
+            print("Issue during teardown, missing blob")
+
+    # Attempt to delete the bucket; this will only work if it is empty.
+    bucket.delete()
+
+
+def test_create_list_and_delete_trigger(bucket, capsys):
+    try:
+        triggers.create_trigger(
+            GCLOUD_PROJECT,
+            bucket.name,
+            7,
+            ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+            trigger_id=TEST_TRIGGER_ID,
+        )
+    except google.api_core.exceptions.InvalidArgument:
+        # Trigger already exists, perhaps due to a previous interrupted test.
+        triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID)
+
+        out, _ = capsys.readouterr()
+        assert TEST_TRIGGER_ID in out
+
+        # Try again and move on.
+        triggers.create_trigger(
+            GCLOUD_PROJECT,
+            bucket.name,
+            7,
+            ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+            trigger_id=TEST_TRIGGER_ID,
+            auto_populate_timespan=True,
+        )
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
+
+    triggers.list_triggers(GCLOUD_PROJECT)
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
+
+    triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID)
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
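To exercise just this new system test locally, one option is invoking pytest programmatically — a sketch, assuming the working directory is `samples/snippets` and both `GOOGLE_CLOUD_PROJECT` and application default credentials are configured:

```python
# Run only the trigger system test; equivalent to `pytest -q triggers_test.py`.
import sys

import pytest

sys.exit(pytest.main(["-q", "triggers_test.py"]))
```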
diff --git a/packages/google-cloud-dlp/scripts/decrypt-secrets.sh b/packages/google-cloud-dlp/scripts/decrypt-secrets.sh
new file mode 100755
index 000000000000..ff599eb2af25
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/decrypt-secrets.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+ROOT=$( dirname "$DIR" )
+
+# Work from the project root.
+cd "$ROOT"
+
+# Use SECRET_MANAGER_PROJECT if set, fallback to cloud-devrel-kokoro-resources.
+PROJECT_ID="${SECRET_MANAGER_PROJECT:-cloud-devrel-kokoro-resources}"
+
+gcloud secrets versions access latest --secret="python-docs-samples-test-env" \
+    --project="${PROJECT_ID}" > testing/test-env.sh
+gcloud secrets versions access latest \
+    --secret="python-docs-samples-service-account" \
+    --project="${PROJECT_ID}" > testing/service-account.json
+gcloud secrets versions access latest \
+    --secret="python-docs-samples-client-secrets" \
+    --project="${PROJECT_ID}" > testing/client-secrets.json
\ No newline at end of file
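Because the script honors `SECRET_MANAGER_PROJECT`, it can be pointed at an alternative secrets project. A sketch of driving it from Python — the project id shown is a placeholder, not a real resource:

```python
# Run the decryption helper with an overridden secrets project (placeholder).
import os
import subprocess

env = dict(os.environ, SECRET_MANAGER_PROJECT="my-secrets-project")
subprocess.run(["./scripts/decrypt-secrets.sh"], check=True, env=env)
```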
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py b/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py
new file mode 100644
index 000000000000..d309d6e97518
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+# Copyright 2016 Google Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generates READMEs using configuration defined in yaml."""
+
+import argparse
+import io
+import os
+import subprocess
+
+import jinja2
+import yaml
+
+
+jinja_env = jinja2.Environment(
+    trim_blocks=True,
+    loader=jinja2.FileSystemLoader(
+        os.path.abspath(os.path.join(os.path.dirname(__file__), 'templates'))))
+
+README_TMPL = jinja_env.get_template('README.tmpl.rst')
+
+
+def get_help(file):
+    return subprocess.check_output(['python', file, '--help']).decode()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('source')
+    parser.add_argument('--destination', default='README.rst')
+
+    args = parser.parse_args()
+
+    source = os.path.abspath(args.source)
+    root = os.path.dirname(source)
+    destination = os.path.join(root, args.destination)
+
+    jinja_env.globals['get_help'] = get_help
+
+    with io.open(source, 'r') as f:
+        config = yaml.safe_load(f)
+
+    # This allows get_help to execute in the right directory.
+    os.chdir(root)
+
+    output = README_TMPL.render(config)
+
+    with io.open(destination, 'w') as f:
+        f.write(output)
+
+
+if __name__ == '__main__':
+    main()
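Typical use renders a README next to a yaml config. A sketch of invoking the generator — the `README.rst.in` path is illustrative and assumes such a config exists there:

```python
# Generate samples/snippets/README.rst from a hypothetical yaml config.
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "scripts/readme-gen/readme_gen.py",
        "samples/snippets/README.rst.in",
    ],
    check=True,
)
```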
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst
new file mode 100644
index 000000000000..4fd239765b0a
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst
@@ -0,0 +1,87 @@
+{# The following line is a lie. BUT! Once jinja2 is done with it, it will
+   become truth! #}
+.. This file is automatically generated. Do not edit this file directly.
+
+{{product.name}} Python Samples
+===============================================================================
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+   :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor={{folder}}/README.rst
+
+
+This directory contains samples for {{product.name}}. {{product.description}}
+
+{{description}}
+
+.. _{{product.name}}: {{product.url}}
+
+{% if required_api_url %}
+To run the sample, you need to enable the API at: {{required_api_url}}
+{% endif %}
+
+{% if required_role %}
+To run the sample, you need to have the `{{required_role}}` role.
+{% endif %}
+
+{{other_required_steps}}
+
+{% if setup %}
+Setup
+-------------------------------------------------------------------------------
+
+{% for section in setup %}
+
+{% include section + '.tmpl.rst' %}
+
+{% endfor %}
+{% endif %}
+
+{% if samples %}
+Samples
+-------------------------------------------------------------------------------
+
+{% for sample in samples %}
+{{sample.name}}
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+{% if not sample.hide_cloudshell_button %}
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+   :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor={{folder}}/{{sample.file}},{{folder}}/README.rst
+{% endif %}
+
+
+{{sample.description}}
+
+To run this sample:
+
+.. code-block:: bash
+
+    $ python {{sample.file}}
+{% if sample.show_help %}
+
+    {{get_help(sample.file)|indent}}
+{% endif %}
+
+
+{% endfor %}
+{% endif %}
+
+{% if cloud_client_library %}
+
+The client library
+-------------------------------------------------------------------------------
+
+This sample uses the `Google Cloud Client Library for Python`_.
+You can read the documentation for more details on API usage and use GitHub
+to `browse the source`_ and `report issues`_.
+
+.. _Google Cloud Client Library for Python:
+    https://googlecloudplatform.github.io/google-cloud-python/
+.. _browse the source:
+    https://github.com/GoogleCloudPlatform/google-cloud-python
+.. _report issues:
+    https://github.com/GoogleCloudPlatform/google-cloud-python/issues
+
+{% endif %}
+
+.. _Google Cloud SDK: https://cloud.google.com/sdk/
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst
new file mode 100644
index 000000000000..1446b94a5e3a
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst
@@ -0,0 +1,9 @@
+Authentication
+++++++++++++++
+
+This sample requires you to have authentication set up. Refer to the
+`Authentication Getting Started Guide`_ for instructions on setting up
+credentials for applications.
+
+.. _Authentication Getting Started Guide:
+    https://cloud.google.com/docs/authentication/getting-started
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst
new file mode 100644
index 000000000000..11957ce2714a
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst
@@ -0,0 +1,14 @@
+Authentication
+++++++++++++++
+
+Authentication for this service is done via an `API Key`_. To obtain an API
+Key:
+
+1. Open the `Cloud Platform Console`_.
+2. Make sure that billing is enabled for your project.
+3. From the **Credentials** page, create a new **API Key** or use an existing
+   one for your project.
+
+.. _API Key:
+    https://developers.google.com/api-client-library/python/guide/aaa_apikeys
+.. _Cloud Platform Console: https://console.cloud.google.com/project?_
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst
new file mode 100644
index 000000000000..a0406dba8c84
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst
@@ -0,0 +1,29 @@
+Install Dependencies
+++++++++++++++++++++
+
+#. Clone python-docs-samples and change directory to the sample directory you want to use.
+
+    .. code-block:: bash
+
+        $ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git
+
+#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions.
+
+   .. _Python Development Environment Setup Guide:
+       https://cloud.google.com/python/setup
+
+#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+.
+
+    .. code-block:: bash
+
+        $ virtualenv env
+        $ source env/bin/activate
+
+#. Install the dependencies needed to run the samples.
+
+    .. code-block:: bash
+
+        $ pip install -r requirements.txt
+
+.. _pip: https://pip.pypa.io/
+.. _virtualenv: https://virtualenv.pypa.io/
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst
new file mode 100644
index 000000000000..5ea33d18c00c
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst
@@ -0,0 +1,35 @@
+Install PortAudio
++++++++++++++++++
+
+Install `PortAudio`_. This is required by the `PyAudio`_ library to stream
+audio from your computer's microphone. PyAudio depends on PortAudio for
+cross-platform compatibility, and is installed differently depending on the
+platform.
+
+* For Mac OS X, you can use `Homebrew`_::
+
+      brew install portaudio
+
+  **Note**: if you encounter an error when running `pip install` that indicates
+  it can't find `portaudio.h`, try running `pip install` with the following
+  flags::
+
+      pip install --global-option='build_ext' \
+          --global-option='-I/usr/local/include' \
+          --global-option='-L/usr/local/lib' \
+          pyaudio
+
+* For Debian / Ubuntu Linux::
+
+      apt-get install portaudio19-dev python-all-dev
+
+* Windows may work without having to install PortAudio explicitly (it will get
+  installed with PyAudio).
+
+For more details, see the `PyAudio installation`_ page.
+
+
+.. _PyAudio: https://people.csail.mit.edu/hubert/pyaudio/
+.. _PortAudio: http://www.portaudio.com/
+.. _PyAudio installation:
+    https://people.csail.mit.edu/hubert/pyaudio/#downloads
+.. _Homebrew: http://brew.sh
diff --git a/packages/google-cloud-dlp/synth.metadata b/packages/google-cloud-dlp/synth.metadata
index be2c13723c6f..0ebb8d417d79 100644
--- a/packages/google-cloud-dlp/synth.metadata
+++ b/packages/google-cloud-dlp/synth.metadata
@@ -4,22 +4,21 @@
       "git": {
         "name": ".",
         "remote": "https://github.com/googleapis/python-dlp.git",
-        "sha": "7973a441ae2226ce7c597cb5e7eebfa0e38cd94b"
+        "sha": "973bcc3783029e9b45b23fa13e52bcab4b6f2630"
       }
     },
     {
       "git": {
-        "name": "googleapis",
-        "remote": "https://github.com/googleapis/googleapis.git",
-        "sha": "dec3204175104cef49bf21d685d5517caaf0058f",
-        "internalRef": "312689208"
+        "name": "synthtool",
+        "remote": "https://github.com/googleapis/synthtool.git",
+        "sha": "5747555f7620113d9a2078a48f4c047a99d31b3e"
       }
     },
     {
       "git": {
         "name": "synthtool",
         "remote": "https://github.com/googleapis/synthtool.git",
-        "sha": "d2364eb80b840a36136c8ce12f1c6efabcc9600e"
+        "sha": "5747555f7620113d9a2078a48f4c047a99d31b3e"
       }
     }
   ],
diff --git a/packages/google-cloud-dlp/synth.py b/packages/google-cloud-dlp/synth.py
index a6daaa883338..802c4faa7c7a 100644
--- a/packages/google-cloud-dlp/synth.py
+++ b/packages/google-cloud-dlp/synth.py
@@ -16,6 +16,7 @@
 
 import synthtool as s
 import synthtool.gcp as gcp
+from synthtool.languages import python
 import logging
 
 logging.basicConfig(level=logging.DEBUG)
@@ -259,8 +260,17 @@
 # Add templated files
 # ----------------------------------------------------------------------------
 templated_files = common.py_library(
-    cov_level=73, system_test_dependencies=["test_utils"]
+    cov_level=73, system_test_dependencies=["test_utils"], samples=True
 )
 s.move(templated_files)
 
+# ----------------------------------------------------------------------------
+# Samples templates
+# ----------------------------------------------------------------------------
+python.py_samples()
+
+# Temporarily disable warnings due to
+# https://github.com/googleapis/gapic-generator-python/issues/525
+s.replace("noxfile.py", '[\"\']-W[\"\']', '# "-W"')
+
 s.shell.run(["nox", "-s", "blacken"], hide_output=False)
diff --git a/packages/google-cloud-dlp/testing/.gitignore b/packages/google-cloud-dlp/testing/.gitignore
new file mode 100644
index 000000000000..b05fbd630881
--- /dev/null
+++ b/packages/google-cloud-dlp/testing/.gitignore
@@ -0,0 +1,3 @@
+test-env.sh
+service-account.json
+client-secrets.json
\ No newline at end of file
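For context on the temporary `-W` suppression in synth.py: the `s.replace` call rewrites any quoted `-W` token in `noxfile.py` into a commented-out one. A self-contained re-creation of that substitution, using an illustrative input line rather than the real noxfile contents:

```python
# Demonstrates the regex used above to comment out the "-W" flag.
import re

line = '        "-W",  # warnings as errors (illustrative line, not the real file)'
print(re.sub('["\']-W["\']', '# "-W"', line))
# Prints:         # "-W",  # warnings as errors (illustrative line, not the real file)
```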