Add custom info type samples to inspect_content.py
Use flags to indicate dictionary word lists and regex patterns, then parse them into custom info types.
mwdaub authored Jun 6, 2018
commit 8077cc7157d152379b7bb0fa884d72b66bd728d1
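
At a glance, the parsing pattern this commit adds to every sample is sketched below. The helper name is hypothetical (the diff inlines this logic in each function), but the structure matches the diff:

def build_custom_info_types(custom_dictionaries=None, custom_regexes=None):
    # Hypothetical helper condensed from the diff; each sample inlines this.
    custom_dictionaries = custom_dictionaries or []
    custom_regexes = custom_regexes or []
    # Each flag value is one comma-delimited word list -> one dictionary type.
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dictionaries[i].split(',')}
        }
    } for i in range(len(custom_dictionaries))]
    # Each flag value is one regex pattern -> one regex type.
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regexes[i]}
    } for i in range(len(custom_regexes))]
    return dictionaries + regexes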
169 changes: 165 additions & 4 deletions dlp/inspect_content.py
@@ -23,6 +23,7 @@

# [START dlp_inspect_string]
def inspect_string(project, content_string, info_types,
custom_dictionaries=None, custom_regexes=None,
min_likelihood=None, max_findings=None, include_quote=True):
"""Uses the Data Loss Prevention API to analyze strings for protected data.
Args:
@@ -50,10 +51,29 @@ def inspect_string(project, content_string, info_types,
# dictionaries (protos are also accepted).
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
Member:
Rather than using for i in range(len()) can we iterate directly over the values and use the enumerate() builtin to get the numbers? (Same throughout)

Contributor Author:
Done.
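
For reference, a minimal sketch of the enumerate()-based rewrite being requested; the "Done" presumably refers to a later commit not shown in this diff:

# Sketch only; assumes the follow-up commit, which this diff does not show.
dictionaries = [{
    'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
    'dictionary': {
        'word_list': {'words': custom_dict.split(',')}
    }
} for i, custom_dict in enumerate(custom_dictionaries)]
regexes = [{
    'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
    'regex': {'pattern': custom_regex}
} for i, custom_regex in enumerate(custom_regexes)]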

if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'include_quote': include_quote,
'limits': {'max_findings_per_request': max_findings},
@@ -85,6 +105,7 @@ def inspect_string(project, content_string, info_types,

# [START dlp_inspect_file]
def inspect_file(project, filename, info_types, min_likelihood=None,
custom_dictionaries=None, custom_regexes=None,
max_findings=None, include_quote=True, mime_type=None):
"""Uses the Data Loss Prevention API to analyze a file for protected data.
Args:
@@ -118,10 +139,29 @@ def inspect_file(project, filename, info_types, min_likelihood=None,
info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
@@ -168,8 +208,9 @@ def inspect_file(project, filename, info_types, min_likelihood=None,

# [START dlp_inspect_gcs]
def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
info_types, min_likelihood=None, max_findings=None,
timeout=300):
info_types, custom_dictionaries=None,
custom_regexes=None, min_likelihood=None,
max_findings=None, timeout=300):
"""Uses the Data Loss Prevention API to analyze a file on GCS.
Args:
project: The Google Cloud project id to use as a parent resource.
@@ -211,10 +252,29 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
@@ -293,8 +353,10 @@ def callback(message):

# [START dlp_inspect_datastore]
def inspect_datastore(project, datastore_project, kind,
topic_id, subscription_id, info_types, namespace_id=None,
min_likelihood=None, max_findings=None, timeout=300):
topic_id, subscription_id, info_types,
custom_dictionaries=None, custom_regexes=None,
namespace_id=None, min_likelihood=None,
max_findings=None, timeout=300):
"""Uses the Data Loss Prevention API to analyze Datastore data.
Args:
project: The Google Cloud project id to use as a parent resource.
@@ -336,10 +398,29 @@ def inspect_datastore(project, datastore_project, kind,
info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
@@ -424,6 +505,7 @@ def callback(message):
# [START dlp_inspect_bigquery]
def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
topic_id, subscription_id, info_types,
custom_dictionaries=None, custom_regexes=None,
min_likelihood=None, max_findings=None, timeout=300):
"""Uses the Data Loss Prevention API to analyze BigQuery data.
Args:
@@ -467,10 +549,29 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']
info_types = [{'name': info_type} for info_type in info_types]

# Prepare custom_info_types by parsing the dictionary word lists and
# regex patterns.
if custom_dictionaries is None:
custom_dictionaries = []
dictionaries = [{
'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
'dictionary': {
'word_list': {'words': custom_dictionaries[i].split(',')}
}
} for i in range(len(custom_dictionaries))]
if custom_regexes is None:
custom_regexes = []
regexes = [{
'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
'regex': {'pattern': custom_regexes[i]}
} for i in range(len(custom_regexes))]
custom_info_types = dictionaries + regexes

# Construct the configuration dictionary. Keys which are None may
# optionally be omitted entirely.
inspect_config = {
'info_types': info_types,
'custom_info_types': custom_info_types,
'min_likelihood': min_likelihood,
'limits': {'max_findings_per_request': max_findings},
}
@@ -571,6 +672,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_string.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)
parser_string.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
parser_string.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -600,6 +711,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_file.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)
parser_file.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
parser_file.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -648,6 +769,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_gcs.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)

Member:
Is this intended to be a single string of comma-delimited words, or multiple strings of one word each, or multiple comma-delimited strings?

Contributor Author:
It is intended to be multiple strings, each of which is a string of comma-delimited words; I have updated the help message to make this more clear. This is how the other code samples are written.

parser_gcs.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
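
To make the semantics from the review exchange above concrete, a small illustration (the flag values are made up):

# Hypothetical invocation: the flag is passed twice, each value one
# comma-delimited word list:
#   --custom_dictionaries 'gorilla,chimp' --custom_dictionaries 'lemur'
# With action='append', argparse collects:
custom_dictionaries = ['gorilla,chimp', 'lemur']
# which the parsing code turns into two custom info types:
#   CUSTOM_DICTIONARY_0 -> words ['gorilla', 'chimp']
#   CUSTOM_DICTIONARY_1 -> words ['lemur']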
parser_gcs.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -692,6 +823,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_datastore.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)
parser_datastore.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
parser_datastore.add_argument(
'--namespace_id',
help='The Datastore namespace to use, if applicable.')
@@ -742,6 +883,16 @@ def callback(message):
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
'If unspecified, the three above examples will be used.',
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
parser_bigquery.add_argument(
'--custom_dictionaries', action='append',
help='Strings representing comma-delimited lists of dictionary words'
' to search for as custom info types.',
default=None)
parser_bigquery.add_argument(
'--custom_regexes', action='append',
help='Strings representing regex patterns to search for as custom '
'info types.',
default=None)
parser_bigquery.add_argument(
'--min_likelihood',
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
@@ -762,12 +913,16 @@ def callback(message):
if args.content == 'string':
inspect_string(
args.project, args.item, args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
include_quote=args.include_quote)
elif args.content == 'file':
inspect_file(
args.project, args.filename, args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
include_quote=args.include_quote,
@@ -777,6 +932,8 @@ def callback(message):
args.project, args.bucket, args.filename,
args.topic_id, args.subscription_id,
args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
timeout=args.timeout)
@@ -785,6 +942,8 @@ def callback(message):
args.project, args.datastore_project, args.kind,
args.topic_id, args.subscription_id,
args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
namespace_id=args.namespace_id,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
@@ -794,6 +953,8 @@ def callback(message):
args.project, args.bigquery_project, args.dataset_id,
args.table_id, args.topic_id, args.subscription_id,
args.info_types,
custom_dictionaries=args.custom_dictionaries,
custom_regexes=args.custom_regexes,
min_likelihood=args.min_likelihood,
max_findings=args.max_findings,
timeout=args.timeout)