Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DLP code sample and test for person_name detector with hotwords #4081

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions dlp/custom_infotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,78 @@ def omit_name_if_also_email(

# [END dlp_omit_name_if_also_email]


# [START inspect_with_person_name_w_custom_hotword]
def inspect_with_person_name_w_custom_hotword(
    project,
    content_string,
    custom_hotword="patient"
):
    """Inspects a string for PERSON_NAME findings boosted by a custom hotword.

    Uses the Data Loss Prevention API to increase the likelihood of
    PERSON_NAME matches when the user-specified custom hotword is present.
    Only findings with the increased likelihood are included, by setting a
    minimum likelihood threshold of VERY_LIKELY.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        content_string: The string to inspect.
        custom_hotword: The custom hotword used for likelihood boosting. It
            is sent as the `pattern` of the rule's `hotword_regex`.

    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct a rule set with the caller-provided hotword, with a
    # likelihood boost to VERY_LIKELY when the hotword is present within the
    # 50-character window preceding the PII finding.
    hotword_rule = {
        "hotword_regex": {"pattern": custom_hotword},
        "likelihood_adjustment": {"fixed_likelihood": "VERY_LIKELY"},
        "proximity": {"window_before": 50},
    }

    rule_set = [
        {
            "info_types": [{"name": "PERSON_NAME"}],
            "rules": [{"hotword_rule": hotword_rule}],
        }
    ]

    # Construct the configuration dictionary with the hotword rule set.
    # `include_quote` is requested because the results loop below prints the
    # matched text; without it the quote branch would never produce output.
    inspect_config = {
        "rule_set": rule_set,
        "min_likelihood": "VERY_LIKELY",
        "include_quote": True,
    }

    # Construct the `item`.
    item = {"value": content_string}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(
        parent, inspect_config=inspect_config, item=item
    )

    # Print out the results.
    findings = response.result.findings
    if not findings:
        print("No findings.")
        return

    for finding in findings:
        # Best-effort read of the quote: tolerate a finding object without
        # the attribute instead of swallowing AttributeError from a try body.
        quote = getattr(finding, "quote", "")
        if quote:
            print(f"Quote: {quote}")
        print(f"Info type: {finding.info_type.name}")
        print(f"Likelihood: {finding.likelihood}")

# [END inspect_with_person_name_w_custom_hotword]


# [START dlp_inspect_with_medical_record_number_custom_regex_detector]
def inspect_with_medical_record_number_custom_regex_detector(
project,
Expand Down
9 changes: 9 additions & 0 deletions dlp/custom_infotype_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ def test_omit_name_if_also_email(capsys):
assert info_types[0] == "EMAIL_ADDRESS"


def test_inspect_with_person_name_w_custom_hotword(capsys):
    """Checks the hotword sample reports a PERSON_NAME finding.

    Runs the sample against a sentence containing the hotword "patient" and
    verifies that the printed output names the PERSON_NAME info type and
    reports a likelihood of 5.
    """
    content = "patient's name is John Doe."
    hotword = "patient"

    custom_infotype.inspect_with_person_name_w_custom_hotword(
        GCLOUD_PROJECT, content, hotword
    )

    printed = capsys.readouterr().out
    for expected in ("Info type: PERSON_NAME", "Likelihood: 5"):
        assert expected in printed


def test_inspect_with_medical_record_number_custom_regex_detector(capsys):
custom_infotype.inspect_with_medical_record_number_custom_regex_detector(
GCLOUD_PROJECT, "Patients MRN 444-5-22222")
Expand Down