Add custom info type samples to inspect_content.py
Use flags to indicate dictionary word lists and regex patterns, then parse them into custom info types.
mwdaub authored Jun 6, 2018
commit 8077cc7157d152379b7bb0fa884d72b66bd728d1
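
At a glance, the parsing pattern this commit adds to every sample is sketched below. The helper name is hypothetical (the diff inlines this logic in each function), but the structure matches the diff:

def build_custom_info_types(custom_dictionaries=None, custom_regexes=None):
    # Hypothetical helper condensed from the diff; each sample inlines this.
    custom_dictionaries = custom_dictionaries or []
    custom_regexes = custom_regexes or []
    # Each flag value is one comma-delimited word list -> one dictionary type.
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dictionaries[i].split(',')}
        }
    } for i in range(len(custom_dictionaries))]
    # Each flag value is one regex pattern -> one regex type.
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regexes[i]}
    } for i in range(len(custom_regexes))]
    return dictionaries + regexes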
169 changes: 165 additions & 4 deletions dlp/inspect_content.py
@@ -23,6 +23,7 @@

# [START dlp_inspect_string]
def inspect_string(project, content_string, info_types,
custom_dictionaries=None, custom_regexes=None,
min_likelihood=None, max_findings=None, include_quote=True):
"""Uses the Data Loss Prevention API to analyze strings for protected data.
Args:
@@ -50,10 +51,29 @@ def inspect_string(project, content_string, info_types,
# dictionaries (protos are also accepted).
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
Member:
Rather than using for i in range(len()) can we iterate directly over the values and use the enumerate() builtin to get the numbers? (Same throughout)

Contributor Author:
Done.
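
For reference, a minimal sketch of the enumerate()-based rewrite being requested; the "Done" presumably refers to a later commit not shown in this diff:

# Sketch only; assumes the follow-up commit, which this diff does not show.
dictionaries = [{
    'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
    'dictionary': {
        'word_list': {'words': custom_dict.split(',')}
    }
} for i, custom_dict in enumerate(custom_dictionaries)]
regexes = [{
    'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
    'regex': {'pattern': custom_regex}
} for i, custom_regex in enumerate(custom_regexes)]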

if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'include_quote': include_quote,
'limits': {'max_findings_per_request': max_findings},
@@ -85,6 +105,7 @@ def inspect_string(project, content_string, info_types,

# [START dlp_inspect_file]
def inspect_file(project, filename, info_types, min_likelihood=None,
custom_dictionaries=None, custom_regexes=None,
max_findings=None, include_quote=True, mime_type=None):
"""Uses the Data Loss Prevention API to analyze a file for protected data.
Args:
@@ -118,10 +139,29 @@ def inspect_file(project, filename, info_types, min_likelihood=None,
info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
@@ -168,8 +208,9 @@ def inspect_file(project, filename, info_types, min_likelihood=None,

# [START dlp_inspect_gcs]
def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
info_types, min_likelihood=None, max_findings=None,
timeout=300):
info_types, custom_dictionaries=None,
custom_regexes=None, min_likelihood=None,
max_findings=None, timeout=300):
"""Uses the Data Loss Prevention API to analyze a file on GCS.
Args:
project: The Google Cloud project id to use as a parent resource.
@@ -211,10 +252,29 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
@@ -293,8 +353,10 @@ def callback(message):

# [START dlp_inspect_datastore]
def inspect_datastore(project, datastore_project, kind,
topic_id, subscription_id, info_types, namespace_id=None,
min_likelihood=None, max_findings=None, timeout=300):
topic_id, subscription_id, info_types,
custom_dictionaries=None, custom_regexes=None,
namespace_id=None, min_likelihood=None,
max_findings=None, timeout=300):
"""Uses the Data Loss Prevention API to analyze Datastore data.
Args:
project: The Google Cloud project id to use as a parent resource.
@@ -336,10 +398,29 @@ def inspect_datastore(project, datastore_project, kind,
info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
@@ -424,6 +505,7 @@ def callback(message):
# [START dlp_inspect_bigquery]
def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
topic_id, subscription_id, info_types,
custom_dictionaries=None, custom_regexes=None,
min_likelihood=None, max_findings=None, timeout=300):
"""Uses the Data Loss Prevention API to analyze BigQuery data.
Args:
@@ -467,10 +549,29 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
@@ -571,6 +672,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_string.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)
parser_string.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
parser_string.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -600,6 +711,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_file.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)
parser_file.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
parser_file.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -648,6 +769,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_gcs.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)

Member:
Is this intended to be a single string of comma-delimited words, or multiple strings of one word each, or multiple comma-delimited strings?

Contributor Author:
It is intended to be multiple strings, each of which is a string of comma-delimited words; I have updated the help message to make this more clear. This is how the other code samples are written.

parser_gcs.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
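
To make the semantics from the review exchange above concrete, a small illustration (the flag values are made up):

# Hypothetical invocation: the flag is passed twice, each value one
# comma-delimited word list:
#   --custom_dictionaries 'gorilla,chimp' --custom_dictionaries 'lemur'
# With action='append', argparse collects:
custom_dictionaries = ['gorilla,chimp', 'lemur']
# which the parsing code turns into two custom info types:
#   CUSTOM_DICTIONARY_0 -> words ['gorilla', 'chimp']
#   CUSTOM_DICTIONARY_1 -> words ['lemur']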
parser_gcs.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -692,6 +823,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_datastore.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)
parser_datastore.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
parser_datastore.add_argument(
'--namespace_id',
help='The Datastore namespace to use, if applicable.')
@@ -742,6 +883,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_bigquery.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)
parser_bigquery.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
parser_bigquery.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -762,12 +913,16 @@ def callback(message):
if args.content == 'string':
inspect_string(
args.project, args.item, args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
include_quote=args.include_quote)
elif args.content == 'file':
inspect_file(
args.project, args.filename, args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
include_quote=args.include_quote,
@@ -777,6 +932,8 @@ def callback(message):
args.project, args.bucket, args.filename,
args.topic_id, args.subscription_id,
args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
timeout=args.timeout)
@@ -785,6 +942,8 @@ def callback(message):
args.project, args.datastore_project, args.kind,
args.topic_id, args.subscription_id,
args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
namespace_id=args.namespace_id,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
@@ -794,6 +953,8 @@ def callback(message):
args.project, args.bigquery_project, args.dataset_id,
args.table_id, args.topic_id, args.subscription_id,
args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
timeout=args.timeout)