Skip to content

Commit

Permalink
feat(privacy.providers): add aws_comprehend as scrubbing provider (#476)
Browse files Browse the repository at this point in the history
* add aws_comprehend provider module

* fix typo

* add boto3 and botocore to poetry

* ran poetry update

* add the AWS Comprehend Class

* update the script

* create test module to test comprehend

* refactor name

* add full-fledged test module

* try to fix failing tests

* try to fix failing tests

* fixed pytest

* format test file

* fix all flake8 formatting issues

* add naming meaning for NER

* remove repeated code and make a TextMixin

* format

* change scrub enabled to false
 after testing visualization

* remove redundant methods

* add enum style provider names

addressing: #476 (comment)

* fix all flake8 errors

* add files for private_ai scrubbing provider

* change name

* add scrub text function in
PrivateAIScrubbingProvider

* try to skip tests if api key is incorrect or missing or the syntax used in code is incorrect

* pytest passes for test_private_ai

* add temp code for pdfs redaction

* remove the PRIVATE_AI provider code as it will be added in a separate PR

* remove more private_ai files

* ran black

---------

Co-authored-by: Richard Abrich <[email protected]>
  • Loading branch information
KrishPatel13 and abrichr authored Aug 29, 2023
1 parent e5d4955 commit 33fe244
Show file tree
Hide file tree
Showing 11 changed files with 583 additions and 212 deletions.
Binary file added assets/sample_llc_1.pdf
Binary file not shown.
6 changes: 3 additions & 3 deletions openadapt/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
# APP CONFIGURATIONS
"APP_DARK_MODE": False,
# SCRUBBING CONFIGURATIONS
"SCRUB_ENABLED": True,
"SCRUB_ENABLED": False,
"SCRUB_CHAR": "*",
"SCRUB_LANGUAGE": "en",
# TODO support lists in getenv_fallback
Expand All @@ -62,7 +62,7 @@
"nlp_engine_name": "spacy",
"models": [{"lang_code": "en", "model_name": "en_core_web_trf"}],
},
"SCRUB_IGNORE_ENTITIES": [
"SCRUB_PRESIDIO_IGNORE_ENTITIES": [
# 'US_PASSPORT',
# 'US_DRIVER_LICENSE',
# 'CRYPTO',
Expand Down Expand Up @@ -109,7 +109,7 @@
# Calculate and save the difference between 2 neighboring screenshots
"SAVE_SCREENSHOT_DIFF": False,
"SPACY_MODEL_NAME": "en_core_web_trf",
"SCRUB_PROVIDER_NAME": ["Presidio"],
"PRIVATE_AI_API_KEY": "<set your api key in .env>",
}

# each string in STOP_STRS should only contain strings
Expand Down
183 changes: 183 additions & 0 deletions openadapt/privacy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,189 @@ def scrub_mp4( # pylint: disable=too-many-arguments
raise NotImplementedError


class TextScrubbingMixin:
"""Mixin class to add scrub_text method."""

def scrub_text_all(self, text: str) -> str:
"""Scrub the text by replacing all characters with config.SCRUB_CHAR.
Args:
text (str): Text to be scrubbed
Returns:
str: Scrubbed text
"""
return config.SCRUB_CHAR * len(text) # pylint: disable=E1101

def scrub_dict(
self,
input_dict: dict,
list_keys: list = None,
scrub_all: bool = False,
force_scrub_children: bool = False,
) -> dict:
"""Scrub the dict of all PII/PHI using Presidio ANALYZER.TRF and Anonymizer.
Args:
input_dict (dict): A dict to be scrubbed
list_keys (list): List of keys to be scrubbed
scrub_all (bool): Whether to scrub all sub-fields/keys/values
of that particular key
force_scrub_children (bool): Whether to force scrub children
even if key is not present
Returns:
dict: The scrubbed dict with PII and PHI removed.
"""
if list_keys is None:
list_keys = config.SCRUB_KEYS_HTML # pylint: disable=E1101

scrubbed_dict = {}
for key, value in input_dict.items():
if self._should_scrub_text(key, value, list_keys, scrub_all):
scrubbed_text = self._scrub_text_item(value, key, force_scrub_children)
if key in ("text", "canonical_text") and self._is_scrubbed(
value, scrubbed_text
):
force_scrub_children = True
scrubbed_dict[key] = scrubbed_text
elif isinstance(value, list):
scrubbed_list = [
(
self._scrub_list_item(
item, key, list_keys, force_scrub_children
)
if self._should_scrub_list_item(item, key, list_keys)
else item
)
for item in value
]
scrubbed_dict[key] = scrubbed_list
force_scrub_children = False
elif isinstance(value, dict):
if isinstance(key, str) and key == "state":
scrubbed_dict[key] = self.scrub_dict(
value, list_keys, scrub_all=True
)
else:
scrubbed_dict[key] = self.scrub_dict(value, list_keys)
else:
scrubbed_dict[key] = value

return scrubbed_dict

def scrub_list_dicts(
self, input_list: list[dict], list_keys: list = None
) -> list[dict]:
"""Scrub list of dicts to remove PII/PHI.
Args:
input_list (list[dict]): A list of dicts to be scrubbed
list_keys (list): List of keys to be scrubbed
Returns:
list[dict]: The scrubbed list of dicts with PII and PHI removed.
"""
scrubbed_list_dicts = []
for input_dict in input_list:
scrubbed_list_dicts.append(self.scrub_dict(input_dict, list_keys))

return scrubbed_list_dicts

def _should_scrub_text(
self,
key: str,
value: str,
list_keys: list[str],
scrub_all: bool = False,
) -> bool:
"""Check if the key and value should be scrubbed and are of correct instance.
Args:
key (str): The key of the item.
value (str): The value of the item.
list_keys (list[str]): A list of keys that need to be scrubbed.
scrub_all (bool): Whether to scrub all sub-fields/keys/values
of that particular key.
Returns:
bool: True if the key and value should be scrubbed, False otherwise.
"""
return (
isinstance(value, str)
and isinstance(key, str)
and (key in list_keys or scrub_all)
)

def _is_scrubbed(self, old_text: str, new_text: str) -> bool:
"""Check if the text has been scrubbed.
Args:
old_text (str): The original text
new_text (str): The scrubbed text
Returns:
bool: True if the text has been scrubbed, False otherwise
"""
return old_text != new_text

def _scrub_text_item(
self, value: str, key: str, force_scrub_children: bool = False
) -> str:
"""Scrubs the value of a text item.
Args:
value (str): The value of the item
key (str): The key of the item
Returns:
str: The scrubbed value
"""
if key in ("text", "canonical_text"):
return self.scrub_text(value, is_separated=True)
if force_scrub_children:
return self.scrub_text_all(value)
return self.scrub_text(value)

def _should_scrub_list_item(
self, item: str, key: str, list_keys: list[str]
) -> bool:
"""Check if the key and item should be scrubbed and are of correct instance.
Args:
item (str): The value of the item
key (str): The key of the item
list_keys (list): A list of keys that are needed to be scrubbed
Returns:
bool: True if the key and value should be scrubbed, False otherwise
"""
return isinstance(item, (str)) and isinstance(key, str) and key in list_keys

def _scrub_list_item(
self,
item: str | dict,
key: str,
list_keys: list[str],
force_scrub_children: bool = False,
) -> str | dict:
"""Scrubs the value of a dict item.
Args:
item (str/dict): The value of the dict item
key (str): The key of the dict item
list_keys (list): A list of keys that are needed to be scrubbed
Returns:
dict/str: The scrubbed dict/value respectively
"""
if isinstance(item, dict):
return self.scrub_dict(
item, list_keys, force_scrub_children=force_scrub_children
)
return self._scrub_text_item(item, key)


class ScrubbingProviderFactory: # pylint: disable=too-few-public-methods
"""A Factory Class for Scrubbing Providers."""

Expand Down
7 changes: 7 additions & 0 deletions openadapt/privacy/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,10 @@
Module: __init__.py
"""


class ScrubProvider:  # pylint: disable=too-few-public-methods
    """Namespace holding the string names of the scrubbing providers."""

    # Name used for the Presidio provider.
    PRESIDIO = "PRESIDIO"
    # Name used for the AWS Comprehend provider.
    COMPREHEND = "COMPREHEND"
114 changes: 114 additions & 0 deletions openadapt/privacy/providers/aws_comprehend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""A Module for AWS Comprehend Scrubbing Provider Class."""

from typing import List

from botocore import client
from botocore.exceptions import ClientError
from loguru import logger
import boto3

from openadapt import config
from openadapt.privacy.base import Modality, ScrubbingProvider, TextScrubbingMixin
from openadapt.privacy.providers import ScrubProvider


# snippet-start:[python.example_code.comprehend.ComprehendDetect
class ComprehendDetect:
    """Encapsulates Comprehend detection functions."""

    def __init__(self, comprehend_client: "client.BaseClient") -> None:
        """Store the client used for every detection call.

        :param comprehend_client: A Boto3 Comprehend client.
        """
        self.comprehend_client = comprehend_client

    # snippet-end:[python.example_code.comprehend.ComprehendDetect]

    # snippet-start:[python.example_code.comprehend.DetectDominantLanguage]
    def detect_languages(self, text: str) -> List[dict]:
        """Detects languages used in a document.

        :param text: The document to inspect.
        :return: The "Languages" list of the Comprehend response.
        :raises ClientError: Re-raised after being logged when the call fails.
        """
        try:
            response = self.comprehend_client.detect_dominant_language(Text=text)
        except ClientError:
            logger.exception("Couldn't detect languages.")
            raise
        languages = response["Languages"]
        # loguru formats messages with str.format, so use "{}" rather than "%s".
        logger.info("Detected {} languages.", len(languages))
        return languages

    # snippet-end:[python.example_code.comprehend.DetectDominantLanguage]

    # snippet-start:[python.example_code.comprehend.DetectPiiEntities]
    def detect_pii(self, text: str, language_code: str) -> List[dict]:
        """Detects personally identifiable information (PII) in a document.

        PII can be things like names, account numbers, or addresses.

        :param text: The document to inspect.
        :param language_code: The language of the document.
        :return: The "Entities" list of the Comprehend response.
        :raises ClientError: Re-raised after being logged when the call fails.
        """
        try:
            response = self.comprehend_client.detect_pii_entities(
                Text=text, LanguageCode=language_code
            )
        except ClientError:
            logger.exception("Couldn't detect PII entities.")
            raise
        entities = response["Entities"]
        # loguru formats messages with str.format, so use "{}" rather than "%s".
        logger.info("Detected {} PII entities.", len(entities))
        return entities


# snippet-end:[python.example_code.comprehend.DetectPiiEntities]


class ComprehendScrubbingProvider(
    ScrubProvider, ScrubbingProvider, TextScrubbingMixin
):  # pylint: disable=abstract-method
    """A Class for AWS Comprehend Scrubbing Provider."""

    name: str = ScrubProvider.COMPREHEND
    capabilities: List[Modality] = [Modality.TEXT]

    def scrub_text(self, text: str, is_separated: bool = False) -> str:
        """Scrub the text of all PII/PHI using AWS Comprehend.

        Each detected entity is replaced by its entity type wrapped in
        config.ACTION_TEXT_NAME_PREFIX / config.ACTION_TEXT_NAME_SUFFIX.

        Args:
            text (str): Text to be scrubbed
            is_separated (bool): Whether the text is separated with special
                characters; accepted for interface compatibility and not
                used by this provider

        Returns:
            str: Scrubbed text (the input itself when it is empty or when no
                entity is detected)
        """
        if not text:  # nothing to send to the service
            return text

        # The boto3 service name is the lower-cased provider name.
        comp_detect = ComprehendDetect(
            boto3.client(self.name.lower())  # pylint: disable=E1101
        )

        languages = comp_detect.detect_languages(text)
        if languages:
            lang_code = languages[0]["LanguageCode"]
        else:
            # No language detected: fall back to the configured one instead
            # of failing with an IndexError.
            lang_code = config.SCRUB_LANGUAGE  # pylint: disable=E1101

        pii_entities = comp_detect.detect_pii(text, lang_code)
        if not pii_entities:  # no pii/phi detected
            return text

        scrubbed_text = text

        # Replace from the end of the text towards the start so that the
        # offsets of the entities still to be processed stay valid; sort
        # explicitly rather than relying on the order of the response.
        # ner = Named Entity Recognition
        for ner in sorted(
            pii_entities, key=lambda entity: entity["BeginOffset"], reverse=True
        ):
            scrubbed_text = (
                scrubbed_text[: ner["BeginOffset"]]
                + config.ACTION_TEXT_NAME_PREFIX  # pylint: disable=E1101
                + ner["Type"]
                + config.ACTION_TEXT_NAME_SUFFIX  # pylint: disable=E1101
                + scrubbed_text[ner["EndOffset"] :]  # noqa: E203
            )

        return scrubbed_text
Loading

0 comments on commit 33fe244

Please sign in to comment.