- 
                Notifications
    You must be signed in to change notification settings 
- Fork 6.6k
Add DLP code samples for custom info types #1524
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
8077cc7
              dcdec44
              f2919ee
              2ddc9ca
              bd24d49
              5ecc915
              c80a2d9
              f789d26
              019b5f7
              5258658
              47fc04f
              eb35add
              b4ffea6
              4f71b5b
              72a9152
              9384c13
              b640d19
              08b4cce
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
Use flags to indicate dictionary word lists and regex patterns, then parse them into custom info types.
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -23,6 +23,7 @@ | |
|  | ||
| # [START dlp_inspect_string] | ||
| def inspect_string(project, content_string, info_types, | ||
| custom_dictionaries=None, custom_regexes=None, | ||
| min_likelihood=None, max_findings=None, include_quote=True): | ||
| """Uses the Data Loss Prevention API to analyze strings for protected data. | ||
| Args: | ||
|  | @@ -50,10 +51,29 @@ def inspect_string(project, content_string, info_types, | |
| # dictionaries (protos are also accepted). | ||
| info_types = [{'name': info_type} for info_type in info_types] | ||
|  | ||
| # Prepare custom_info_types by parsing the dictionary word lists and | ||
| # regex patterns. | ||
| if custom_dictionaries is None: | ||
| custom_dictionaries = [] | ||
| dictionaries = [{ | ||
| 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, | ||
| 'dictionary': { | ||
| 'word_list': {'words': custom_dictionaries[i].split(',')} | ||
| } | ||
| } for i in range(len(custom_dictionaries))] | ||
| if custom_regexes is None: | ||
| custom_regexes = [] | ||
| regexes = [{ | ||
| 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, | ||
| 'regex': {'pattern': custom_regexes[i]} | ||
| } for i in range(len(custom_regexes))] | ||
| custom_info_types = dictionaries + regexes | ||
|  | ||
| # Construct the configuration dictionary. Keys which are None may | ||
| # optionally be omitted entirely. | ||
| inspect_config = { | ||
| 'info_types': info_types, | ||
| 'custom_info_types': custom_info_types, | ||
| 'min_likelihood': min_likelihood, | ||
| 'include_quote': include_quote, | ||
| 'limits': {'max_findings_per_request': max_findings}, | ||
|  | @@ -85,6 +105,7 @@ def inspect_string(project, content_string, info_types, | |
|  | ||
| # [START dlp_inspect_file] | ||
| def inspect_file(project, filename, info_types, min_likelihood=None, | ||
| custom_dictionaries=None, custom_regexes=None, | ||
| max_findings=None, include_quote=True, mime_type=None): | ||
| """Uses the Data Loss Prevention API to analyze a file for protected data. | ||
| Args: | ||
|  | @@ -118,10 +139,29 @@ def inspect_file(project, filename, info_types, min_likelihood=None, | |
| info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] | ||
| info_types = [{'name': info_type} for info_type in info_types] | ||
|  | ||
| # Prepare custom_info_types by parsing the dictionary word lists and | ||
| # regex patterns. | ||
| if custom_dictionaries is None: | ||
| custom_dictionaries = [] | ||
| dictionaries = [{ | ||
| 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, | ||
| 'dictionary': { | ||
| 'word_list': {'words': custom_dictionaries[i].split(',')} | ||
| } | ||
| } for i in range(len(custom_dictionaries))] | ||
| if custom_regexes is None: | ||
| custom_regexes = [] | ||
| regexes = [{ | ||
| 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, | ||
| 'regex': {'pattern': custom_regexes[i]} | ||
| } for i in range(len(custom_regexes))] | ||
| custom_info_types = dictionaries + regexes | ||
|  | ||
| # Construct the configuration dictionary. Keys which are None may | ||
| # optionally be omitted entirely. | ||
| inspect_config = { | ||
| 'info_types': info_types, | ||
| 'custom_info_types': custom_info_types, | ||
| 'min_likelihood': min_likelihood, | ||
| 'limits': {'max_findings_per_request': max_findings}, | ||
| } | ||
|  | @@ -168,8 +208,9 @@ def inspect_file(project, filename, info_types, min_likelihood=None, | |
|  | ||
| # [START dlp_inspect_gcs] | ||
| def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, | ||
| info_types, min_likelihood=None, max_findings=None, | ||
| timeout=300): | ||
| info_types, custom_dictionaries=None, | ||
| custom_regexes=None, min_likelihood=None, | ||
| max_findings=None, timeout=300): | ||
| """Uses the Data Loss Prevention API to analyze a file on GCS. | ||
| Args: | ||
| project: The Google Cloud project id to use as a parent resource. | ||
|  | @@ -211,10 +252,29 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id, | |
| info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] | ||
| info_types = [{'name': info_type} for info_type in info_types] | ||
|  | ||
| # Prepare custom_info_types by parsing the dictionary word lists and | ||
| # regex patterns. | ||
| if custom_dictionaries is None: | ||
| custom_dictionaries = [] | ||
| dictionaries = [{ | ||
| 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, | ||
| 'dictionary': { | ||
| 'word_list': {'words': custom_dictionaries[i].split(',')} | ||
| } | ||
| } for i in range(len(custom_dictionaries))] | ||
| if custom_regexes is None: | ||
| custom_regexes = [] | ||
| regexes = [{ | ||
| 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, | ||
| 'regex': {'pattern': custom_regexes[i]} | ||
| } for i in range(len(custom_regexes))] | ||
| custom_info_types = dictionaries + regexes | ||
|  | ||
| # Construct the configuration dictionary. Keys which are None may | ||
| # optionally be omitted entirely. | ||
| inspect_config = { | ||
| 'info_types': info_types, | ||
| 'custom_info_types': custom_info_types, | ||
| 'min_likelihood': min_likelihood, | ||
| 'limits': {'max_findings_per_request': max_findings}, | ||
| } | ||
|  | @@ -293,8 +353,10 @@ def callback(message): | |
|  | ||
| # [START dlp_inspect_datastore] | ||
| def inspect_datastore(project, datastore_project, kind, | ||
| topic_id, subscription_id, info_types, namespace_id=None, | ||
| min_likelihood=None, max_findings=None, timeout=300): | ||
| topic_id, subscription_id, info_types, | ||
| custom_dictionaries=None, custom_regexes=None, | ||
| namespace_id=None, min_likelihood=None, | ||
| max_findings=None, timeout=300): | ||
| """Uses the Data Loss Prevention API to analyze Datastore data. | ||
| Args: | ||
| project: The Google Cloud project id to use as a parent resource. | ||
|  | @@ -336,10 +398,29 @@ def inspect_datastore(project, datastore_project, kind, | |
| info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] | ||
| info_types = [{'name': info_type} for info_type in info_types] | ||
|  | ||
| # Prepare custom_info_types by parsing the dictionary word lists and | ||
| # regex patterns. | ||
| if custom_dictionaries is None: | ||
| custom_dictionaries = [] | ||
| dictionaries = [{ | ||
| 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, | ||
| 'dictionary': { | ||
| 'word_list': {'words': custom_dictionaries[i].split(',')} | ||
| } | ||
| } for i in range(len(custom_dictionaries))] | ||
| if custom_regexes is None: | ||
| custom_regexes = [] | ||
| regexes = [{ | ||
| 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, | ||
| 'regex': {'pattern': custom_regexes[i]} | ||
| } for i in range(len(custom_regexes))] | ||
| custom_info_types = dictionaries + regexes | ||
|  | ||
| # Construct the configuration dictionary. Keys which are None may | ||
| # optionally be omitted entirely. | ||
| inspect_config = { | ||
| 'info_types': info_types, | ||
| 'custom_info_types': custom_info_types, | ||
| 'min_likelihood': min_likelihood, | ||
| 'limits': {'max_findings_per_request': max_findings}, | ||
| } | ||
|  | @@ -424,6 +505,7 @@ def callback(message): | |
| # [START dlp_inspect_bigquery] | ||
| def inspect_bigquery(project, bigquery_project, dataset_id, table_id, | ||
| topic_id, subscription_id, info_types, | ||
| custom_dictionaries=None, custom_regexes=None, | ||
| min_likelihood=None, max_findings=None, timeout=300): | ||
| """Uses the Data Loss Prevention API to analyze BigQuery data. | ||
| Args: | ||
|  | @@ -467,10 +549,29 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id, | |
| info_types = ['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'] | ||
| info_types = [{'name': info_type} for info_type in info_types] | ||
|  | ||
| # Prepare custom_info_types by parsing the dictionary word lists and | ||
| # regex patterns. | ||
| if custom_dictionaries is None: | ||
| custom_dictionaries = [] | ||
| dictionaries = [{ | ||
| 'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)}, | ||
| 'dictionary': { | ||
| 'word_list': {'words': custom_dictionaries[i].split(',')} | ||
| } | ||
| } for i in range(len(custom_dictionaries))] | ||
| if custom_regexes is None: | ||
| custom_regexes = [] | ||
| regexes = [{ | ||
| 'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)}, | ||
| 'regex': {'pattern': custom_regexes[i]} | ||
| } for i in range(len(custom_regexes))] | ||
| custom_info_types = dictionaries + regexes | ||
|  | ||
| # Construct the configuration dictionary. Keys which are None may | ||
| # optionally be omitted entirely. | ||
| inspect_config = { | ||
| 'info_types': info_types, | ||
| 'custom_info_types': custom_info_types, | ||
| 'min_likelihood': min_likelihood, | ||
| 'limits': {'max_findings_per_request': max_findings}, | ||
| } | ||
|  | @@ -571,6 +672,16 @@ def callback(message): | |
| 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' | ||
| 'If unspecified, the three above examples will be used.', | ||
| default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) | ||
| parser_string.add_argument( | ||
| '--custom_dictionaries', action='append', | ||
| help='Strings representing comma-delimited lists of dictionary words' | ||
| ' to search for as custom info types.', | ||
| default=None) | ||
| parser_string.add_argument( | ||
| '--custom_regexes', action='append', | ||
| help='Strings representing regex patterns to search for as custom ' | ||
| ' info types.', | ||
| default=None) | ||
| parser_string.add_argument( | ||
| '--min_likelihood', | ||
| choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', | ||
|  | @@ -600,6 +711,16 @@ def callback(message): | |
| 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' | ||
| 'If unspecified, the three above examples will be used.', | ||
| default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) | ||
| parser_file.add_argument( | ||
| '--custom_dictionaries', action='append', | ||
| help='Strings representing comma-delimited lists of dictionary words' | ||
| ' to search for as custom info types.', | ||
| default=None) | ||
| parser_file.add_argument( | ||
| '--custom_regexes', action='append', | ||
| help='Strings representing regex patterns to search for as custom ' | ||
| ' info types.', | ||
| default=None) | ||
| parser_file.add_argument( | ||
| '--min_likelihood', | ||
| choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', | ||
|  | @@ -648,6 +769,16 @@ def callback(message): | |
| 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' | ||
| 'If unspecified, the three above examples will be used.', | ||
| default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) | ||
| parser_gcs.add_argument( | ||
| '--custom_dictionaries', action='append', | ||
| help='Strings representing comma-delimited lists of dictionary words' | ||
| ' to search for as custom info types.', | ||
| default=None) | ||

> **Reviewer:** Is this intended to be a single string of comma-delimited words, or multiple strings of one word each, or multiple comma-delimited strings?

> **Author:** It is intended to be multiple strings, each of which is a string of comma-delimited words; I have updated the help message to make this more clear. This is how the other code samples are written.
| parser_gcs.add_argument( | ||
| '--custom_regexes', action='append', | ||
| help='Strings representing regex patterns to search for as custom ' | ||
| ' info types.', | ||
| default=None) | ||
| parser_gcs.add_argument( | ||
| '--min_likelihood', | ||
| choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', | ||
|  | @@ -692,6 +823,16 @@ def callback(message): | |
| 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' | ||
| 'If unspecified, the three above examples will be used.', | ||
| default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) | ||
| parser_datastore.add_argument( | ||
| '--custom_dictionaries', action='append', | ||
| help='Strings representing comma-delimited lists of dictionary words' | ||
| ' to search for as custom info types.', | ||
| default=None) | ||
| parser_datastore.add_argument( | ||
| '--custom_regexes', action='append', | ||
| help='Strings representing regex patterns to search for as custom ' | ||
| ' info types.', | ||
| default=None) | ||
| parser_datastore.add_argument( | ||
| '--namespace_id', | ||
| help='The Datastore namespace to use, if applicable.') | ||
|  | @@ -742,6 +883,16 @@ def callback(message): | |
| 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' | ||
| 'If unspecified, the three above examples will be used.', | ||
| default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS']) | ||
| parser_bigquery.add_argument( | ||
| '--custom_dictionaries', action='append', | ||
| help='Strings representing comma-delimited lists of dictionary words' | ||
| ' to search for as custom info types.', | ||
| default=None) | ||
| parser_bigquery.add_argument( | ||
| '--custom_regexes', action='append', | ||
| help='Strings representing regex patterns to search for as custom ' | ||
| ' info types.', | ||
| default=None) | ||
| parser_bigquery.add_argument( | ||
| '--min_likelihood', | ||
| choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY', | ||
|  | @@ -762,12 +913,16 @@ def callback(message): | |
| if args.content == 'string': | ||
| inspect_string( | ||
| args.project, args.item, args.info_types, | ||
| custom_dictionaries=args.custom_dictionaries, | ||
| custom_regexes=args.custom_regexes, | ||
| min_likelihood=args.min_likelihood, | ||
| max_findings=args.max_findings, | ||
| include_quote=args.include_quote) | ||
| elif args.content == 'file': | ||
| inspect_file( | ||
| args.project, args.filename, args.info_types, | ||
| custom_dictionaries=args.custom_dictionaries, | ||
| custom_regexes=args.custom_regexes, | ||
| min_likelihood=args.min_likelihood, | ||
| max_findings=args.max_findings, | ||
| include_quote=args.include_quote, | ||
|  | @@ -777,6 +932,8 @@ def callback(message): | |
| args.project, args.bucket, args.filename, | ||
| args.topic_id, args.subscription_id, | ||
| args.info_types, | ||
| custom_dictionaries=args.custom_dictionaries, | ||
| custom_regexes=args.custom_regexes, | ||
| min_likelihood=args.min_likelihood, | ||
| max_findings=args.max_findings, | ||
| timeout=args.timeout) | ||
|  | @@ -785,6 +942,8 @@ def callback(message): | |
| args.project, args.datastore_project, args.kind, | ||
| args.topic_id, args.subscription_id, | ||
| args.info_types, | ||
| custom_dictionaries=args.custom_dictionaries, | ||
| custom_regexes=args.custom_regexes, | ||
| namespace_id=args.namespace_id, | ||
| min_likelihood=args.min_likelihood, | ||
| max_findings=args.max_findings, | ||
|  | @@ -794,6 +953,8 @@ def callback(message): | |
| args.project, args.bigquery_project, args.dataset_id, | ||
| args.table_id, args.topic_id, args.subscription_id, | ||
| args.info_types, | ||
| custom_dictionaries=args.custom_dictionaries, | ||
| custom_regexes=args.custom_regexes, | ||
| min_likelihood=args.min_likelihood, | ||
| max_findings=args.max_findings, | ||
| timeout=args.timeout) | ||
> **Reviewer:** Rather than using `for i in range(len(...))`, can we iterate directly over the values and use the `enumerate()` builtin to get the numbers? (Same throughout)

> **Author:** Done.