-
-
Notifications
You must be signed in to change notification settings - Fork 150
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(privacy.providers): add aws_comprehend as scrubbing provider (#476)
* add aws_comprehend provider module * fix typo * add boto3 and botocore to poetry * ran poetry update * add the AWS Comprehend Class * update the script * create test module to test comprhend * refactor name * add full fleged test module * try to fix failing tests * try to fix failing tests * fixed pytest * format test file * fix all flake 8 formattings * add naming meaning for NER * remove repeated code and make a TextMixin * format * change scrub enabled to false after testing visualization * remove redundant rmethods * add enum style provider names addressing: #476 (comment) * fix all flak8 errors * add files for private_ai scrubbing provider * change name * add scrub text function in PrivateAIScrubbingProvider * try to skip tests if api key is incorrect or missing or the syntax used in code is incorrect * pytest passes for test_private_ai * add temp code for pdfs redaction * remove the PRIVATE_AI provider code as it will be added in seprate pr * remove omre prvt_ai files * ran black --------- Co-authored-by: Richard Abrich <[email protected]>
- Loading branch information
1 parent
e5d4955
commit 33fe244
Showing
11 changed files
with
583 additions
and
212 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
"""A Module for AWS Comprehend Scrubbing Provider Class.""" | ||
|
||
from typing import List | ||
|
||
from botocore import client | ||
from botocore.exceptions import ClientError | ||
from loguru import logger | ||
import boto3 | ||
|
||
from openadapt import config | ||
from openadapt.privacy.base import Modality, ScrubbingProvider, TextScrubbingMixin | ||
from openadapt.privacy.providers import ScrubProvider | ||
|
||
|
||
# snippet-start:[python.example_code.comprehend.ComprehendDetect] | ||
class ComprehendDetect:
    """Encapsulates Comprehend detection functions.

    Each method issues one request through the wrapped client, logs the
    outcome, and re-raises any ``ClientError`` so the caller decides how to
    handle service failures.
    """

    def __init__(self, comprehend_client: client.BaseClient) -> None:
        """Store the client used for every detection call.

        :param comprehend_client: A Boto3 Comprehend client.
        """
        self.comprehend_client = comprehend_client

    # snippet-end:[python.example_code.comprehend.ComprehendDetect]

    # snippet-start:[python.example_code.comprehend.DetectDominantLanguage]
    def detect_languages(self, text: str) -> List[dict]:
        """Detects languages used in a document.

        :param text: The document to inspect.
        :return: The list of languages along with their confidence scores.
        :raises ClientError: If the Comprehend request fails; the error is
            logged with its traceback before being re-raised.
        """
        # Keep the try body limited to the call that can raise ClientError.
        try:
            response = self.comprehend_client.detect_dominant_language(Text=text)
        except ClientError:
            logger.exception("Couldn't detect languages.")
            raise

        languages = response["Languages"]
        # loguru formats messages with str.format, so use "{}" placeholders;
        # "%s" would be logged literally.
        logger.info("Detected {} languages.", len(languages))
        return languages

    # snippet-end:[python.example_code.comprehend.DetectDominantLanguage]

    # snippet-start:[python.example_code.comprehend.DetectPiiEntities]
    def detect_pii(self, text: str, language_code: str) -> List[dict]:
        """Detects personally identifiable information (PII) in a document.

        PII can be things like names, account numbers, or addresses.

        :param text: The document to inspect.
        :param language_code: The language of the document.
        :return: The list of PII entities along with their confidence scores.
        :raises ClientError: If the Comprehend request fails; the error is
            logged with its traceback before being re-raised.
        """
        # Keep the try body limited to the call that can raise ClientError.
        try:
            response = self.comprehend_client.detect_pii_entities(
                Text=text, LanguageCode=language_code
            )
        except ClientError:
            logger.exception("Couldn't detect PII entities.")
            raise

        entities = response["Entities"]
        # loguru formats messages with str.format, so use "{}" placeholders;
        # "%s" would be logged literally.
        logger.info("Detected {} PII entities.", len(entities))
        return entities
|
||
|
||
# snippet-end:[python.example_code.comprehend.DetectPiiEntities] | ||
|
||
|
||
class ComprehendScrubbingProvider(
    ScrubProvider, ScrubbingProvider, TextScrubbingMixin
):  # pylint: disable=abstract-method
    """A Class for AWS Comprehend Scrubbing Provider."""

    name: str = ScrubProvider.COMPREHEND
    capabilities: List[Modality] = [Modality.TEXT]

    def scrub_text(self, text: str, is_separated: bool = False) -> str:
        """Scrub the text of all PII/PHI using AWS Comprehend.

        Every detected entity span is replaced by its entity type wrapped in
        ``config.ACTION_TEXT_NAME_PREFIX`` / ``config.ACTION_TEXT_NAME_SUFFIX``.

        Args:
            text (str): Text to be scrubbed
            is_separated (bool): Whether the text is separated with special
                characters. Not used by this provider.

        Returns:
            str: Scrubbed text

        Raises:
            ClientError: Propagated from the Comprehend calls when a request
                fails.
        """
        if not text:  # empty text, nothing to send to the service
            return text

        # The lower-cased provider name is used as the boto3 service name.
        comp_detect = ComprehendDetect(
            boto3.client(self.name.lower())
        )  # pylint: disable=E1101

        languages = comp_detect.detect_languages(text)
        # NOTE(review): takes the first entry as the dominant language and
        # assumes the list is non-empty and ordered by score -- confirm
        # against the DetectDominantLanguage API reference.
        lang_code = languages[0]["LanguageCode"]

        pii_entities = comp_detect.detect_pii(text, lang_code)
        if not pii_entities:  # no pii/phi detected
            return text

        prefix = config.ACTION_TEXT_NAME_PREFIX  # pylint: disable=E1101
        suffix = config.ACTION_TEXT_NAME_SUFFIX  # pylint: disable=E1101

        # Replace from the end of the text towards the start so the offsets of
        # entities not yet processed stay valid. Sort explicitly instead of
        # relying on the order in which the service returns the entities.
        right_to_left = sorted(
            pii_entities,
            key=lambda entity: entity["BeginOffset"],
            reverse=True,
        )

        scrubbed_text = text
        for entity in right_to_left:
            scrubbed_text = (
                scrubbed_text[: entity["BeginOffset"]]
                + prefix
                + entity["Type"]
                + suffix
                + scrubbed_text[entity["EndOffset"] :]  # noqa: E203
            )

        return scrubbed_text
Oops, something went wrong.