Add scrub module

Add test_scrub module. Add scrubbing config in config.py. Add presidio and required packages to requirements.txt.
OpenAdaptAI · May 30, 2023 · ce67fd1 · ce67fd1
1 parent 2bbe036
commit ce67fd1
Show file tree

Hide file tree

Showing 6 changed files with 488 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -16,6 +16,7 @@ cache
 
 # VSCode
 .VSCode
+.vsCode
 
 # Generated performance charts
 performance

diff --git a/assets/test_scrub_image.png b/assets/test_scrub_image.png
diff --git a/openadapt/config.py b/openadapt/config.py
@@ -4,6 +4,10 @@
 
 from dotenv import load_dotenv
 from loguru import logger
+from presidio_analyzer import AnalyzerEngine
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer import AnonymizerEngine
+from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
 
 
 _DEFAULTS = {
@@ -43,3 +47,56 @@ def getenv_fallback(var_name):
     for key, val in locals().items():
         if not key.startswith("_") and key.isupper():
             logger.info(f"{key}={val}")
+
+
+# SCRUBBING CONFIGURATIONS
+#
+# NOTE(review): everything below is executed when this module is imported,
+# including the engine construction calls. Presumably this loads the spaCy
+# model named in SCRUB_CONFIG, which may be slow -- confirm.
+
+# NLP engine settings handed to Presidio's NlpEngineProvider.
+SCRUB_CONFIG = {
+    "nlp_engine_name": "spacy",
+    "models": [{"lang_code": "en", "model_name": "en_core_web_trf"}],
+}
+SCRUB_PROVIDER = NlpEngineProvider(nlp_configuration=SCRUB_CONFIG)
+NLP_ENGINE = SCRUB_PROVIDER.create_engine()
+# Shared analyzer used for text and, via IMAGE_REDACTOR, for images.
+ANALYZER = AnalyzerEngine(
+    nlp_engine=NLP_ENGINE,
+    supported_languages=["en"]
+)
+ANONYMIZER = AnonymizerEngine()
+IMAGE_REDACTOR = ImageRedactorEngine(ImageAnalyzerEngine(ANALYZER))
+# Entity types that are excluded from scrubbing. Only the uncommented names
+# are ignored; the commented-out names are listed for reference and are
+# still scrubbed. Uncomment a name to stop scrubbing that entity type.
+SCRUB_IGNORE_ENTITIES = [
+    # 'US_PASSPORT',
+    # 'US_DRIVER_LICENSE',
+    # 'CRYPTO',
+    # 'UK_NHS',
+    # 'PERSON',
+    # 'CREDIT_CARD',
+    # 'US_BANK_NUMBER',
+    # 'PHONE_NUMBER',
+    # 'US_ITIN',
+    # 'AU_ABN',
+    'DATE_TIME',
+    # 'NRP',
+    # 'SG_NRIC_FIN',
+    # 'AU_ACN',
+    # 'IP_ADDRESS',
+    # 'EMAIL_ADDRESS',
+    'URL',
+    # 'IBAN_CODE',
+    # 'AU_TFN',
+    # 'LOCATION',
+    # 'AU_MEDICARE',
+    # 'US_SSN',
+    # 'MEDICAL_LICENSE'
+]
+# Entity types that are actually scrubbed: everything the analyzer supports
+# minus the ignore list above.
+SCRUBBING_ENTITIES = [
+    entity
+    for entity in ANALYZER.get_supported_entities()
+    if entity not in SCRUB_IGNORE_ENTITIES
+]
+# Dict keys whose string values are scrubbed; used as the default
+# `list_keys` by `scrub.scrub_dict`.
+SCRUB_KEYS_HTML = [
+    'text',
+    'canonical_text',
+    'title',
+    'state'
+]
+# Default fill passed to the image redactor for redacted regions
+# (presumably an (R, G, B) tuple, i.e. red -- confirm).
+DEFAULT_SCRUB_FILL_COLOR = (255,0,0)
diff --git a/openadapt/scrub.py b/openadapt/scrub.py
@@ -0,0 +1,210 @@
+"""Module to scrub text of all PII/PHI.
+
+Usage:
+
+    $ python openadapt/scrub.py scrub_text str_arg
+    $ python openadapt/scrub.py scrub_image Image_arg
+    
+"""
+
+from io import BytesIO
+from mss import base
+from PIL import Image
+from presidio_anonymizer.entities import OperatorConfig
+import fire
+
+from openadapt import config, utils
+
+
+def scrub_text(text: str, is_hyphenated: bool = False) -> str:
+    """
+    Scrub the text of all PII/PHI using Presidio Analyzer and Anonymizer
+
+    Every recognized entity is overwritten with ``*`` characters.
+
+    Args:
+        text (str): Text to be scrubbed. ``None`` is passed through as-is.
+        is_hyphenated (bool): If True, all ``-`` characters are removed from
+            the text before it is analyzed.
+
+    Returns:
+        str: Scrubbed text, or ``None`` if ``text`` is ``None``
+    """
+
+    if text is None:
+        return None
+
+    if is_hyphenated:
+        text = text.replace('-', '')
+
+    analyzer_results = config.ANALYZER.analyze(
+        text=text, entities=config.SCRUBBING_ENTITIES, language="en"
+    )
+
+    # The anonymizer takes one operator per entity *type*, not per result.
+    # Remember the longest span seen for each type: the mask operator never
+    # masks more characters than the entity contains, so the longest span
+    # fully covers every result of that type. Using the span of whichever
+    # result happened to come last would leave longer entities of the same
+    # type only partially masked.
+    longest_span = {}
+    for entity in analyzer_results:
+        span = entity.end - entity.start
+        longest_span[entity.entity_type] = max(
+            span, longest_span.get(entity.entity_type, 0)
+        )
+
+        # TODO: remove this print statement after testing
+        print(
+            f"Recognized entity: {entity.entity_type} - start: {entity.start} end: {entity.end}"
+        )
+
+    operators = {
+        entity_type: OperatorConfig(
+            "mask",
+            {
+                "masking_char": "*",
+                "chars_to_mask": span,
+                "from_end": True,
+            },
+        )
+        for entity_type, span in longest_span.items()
+    }
+
+    anonymized_results = config.ANONYMIZER.anonymize(
+        text=text,
+        analyzer_results=analyzer_results,
+        operators=operators,
+    )
+
+    return anonymized_results.text
+
+
+def scrub_image(image: Image, fill_color=config.DEFAULT_SCRUB_FILL_COLOR) -> Image:
+    """
+    Redact all PII/PHI found in an image using Presidio Image Redactor
+
+    Args:
+        image (PIL.Image): The PIL.Image object to redact
+        fill_color: Fill passed to the redactor for the redacted regions.
+            Defaults to config.DEFAULT_SCRUB_FILL_COLOR.
+
+    Returns:
+        PIL.Image: The image returned by the redactor, with PII and PHI
+        covered.
+    """
+
+    # Only entity types listed in config.SCRUBBING_ENTITIES are redacted.
+    return config.IMAGE_REDACTOR.redact(
+        image,
+        fill=fill_color,
+        entities=config.SCRUBBING_ENTITIES,
+    )
+
+
+def scrub_screenshot(
+    screenshot: base.ScreenShot, fill_color=config.DEFAULT_SCRUB_FILL_COLOR
+) -> base.ScreenShot:
+    """
+    Scrub the screenshot of all PII/PHI using Presidio Image Redactor
+
+    Args:
+        screenshot (mss.base.ScreenShot): An mss.base.ScreenShot object to be scrubbed
+        fill_color: Fill passed to the redactor for the redacted regions.
+            Defaults to config.DEFAULT_SCRUB_FILL_COLOR.
+
+    Returns:
+        mss.base.ScreenShot: A new screenshot with the same position and size
+        as the input, with PII and PHI removed.
+    """
+
+    # Convert the MSS screenshot object (raw BGRA bytes) to a PIL Image
+    image = Image.frombytes("RGBA", screenshot.size, screenshot.bgra, "raw", "BGRA")
+
+    # Use the scrub_image function to scrub the image
+    redacted_image = scrub_image(image, fill_color)
+
+    # mss.base.ScreenShot interprets its raw buffer as 4-byte BGRA pixels, so
+    # the redacted image must be serialised in that layout. Packing it as
+    # 3-byte "RGB" would yield a buffer of the wrong size with swapped
+    # channels. convert("RGBA") guards against the redactor returning an
+    # image in a different mode.
+    raw_data = bytearray(redacted_image.convert("RGBA").tobytes("raw", "BGRA"))
+
+    # Prepare monitor information from the original screenshot
+    monitor_info = {
+        "left": screenshot.left,
+        "top": screenshot.top,
+        "width": screenshot.width,
+        "height": screenshot.height,
+    }
+
+    # Construct and return a new screenshot with the redacted image data
+    return base.ScreenShot(raw_data, monitor_info)
+
+
+def scrub_png_data(png_data: bytes, fill_color=config.DEFAULT_SCRUB_FILL_COLOR) -> bytes:
+    """
+    Redact all PII/PHI in PNG-encoded image bytes using Presidio Image Redactor
+
+    Args:
+        png_data (bytes): Encoded image data to be scrubbed
+        fill_color: Fill passed to the redactor for the redacted regions.
+            Defaults to config.DEFAULT_SCRUB_FILL_COLOR.
+
+    Returns:
+        bytes: The redacted image, re-encoded as PNG
+    """
+    # Decode the incoming bytes into a PIL image
+    source_image = Image.open(BytesIO(png_data))
+
+    # Cover every detected entity with the requested fill
+    scrubbed_image = config.IMAGE_REDACTOR.redact(
+        source_image,
+        fill=fill_color,
+        entities=config.SCRUBBING_ENTITIES,
+    )
+
+    # Re-encode as PNG in memory and hand back the raw bytes
+    png_buffer = BytesIO()
+    scrubbed_image.save(png_buffer, format='PNG')  # type: ignore
+    return png_buffer.getvalue()
+
+
+def scrub_dict(input_dict: dict, list_keys: list = None) -> dict:
+    """
+    Return a scrubbed copy of a dict, using Presidio Analyzer and Anonymizer.
+
+    String values are scrubbed only when their key is in ``list_keys``.
+    Nested dicts are scrubbed recursively. For list values, string items are
+    scrubbed when the list's key is in ``list_keys``, dict items are scrubbed
+    recursively, and any other item is kept as-is. All other values are
+    copied over unchanged.
+
+    Args:
+        input_dict (dict): A dict to be scrubbed
+        list_keys (list): Keys whose string values should be scrubbed.
+            Defaults to config.SCRUB_KEYS_HTML.
+
+    Returns:
+        dict: The scrubbed dict with PII and PHI removed.
+    """
+
+    if list_keys is None:
+        list_keys = config.SCRUB_KEYS_HTML
+
+    def _scrub_list_item(parent_key, element):
+        # Scrub a single element of a list stored under `parent_key`.
+        if isinstance(element, str) and parent_key in list_keys:
+            return scrub_text(element)
+        if isinstance(element, dict):
+            return scrub_dict(element, list_keys)
+        return element
+
+    result = {}
+    for name, content in input_dict.items():
+        if isinstance(content, str) and name in list_keys:
+            result[name] = scrub_text(content)
+        elif isinstance(content, list):
+            result[name] = [_scrub_list_item(name, element) for element in content]
+        elif isinstance(content, dict):
+            result[name] = scrub_dict(content, list_keys)
+        else:
+            result[name] = content
+
+    return result
+
+
+def scrub_list_dicts(input_list: list[dict], list_keys: list = None) -> list[dict]:
+    """
+    Apply scrub_dict to every dict in a list.
+
+    Args:
+        input_list (list[dict]): A list of dicts to be scrubbed
+        list_keys (list): Keys whose string values should be scrubbed;
+            forwarded unchanged to scrub_dict.
+
+    Returns:
+        list[dict]: A new list holding the scrubbed dicts, in input order.
+    """
+
+    return [scrub_dict(entry, list_keys) for entry in input_list]
+
+
+# Command-line entry point (see the usage examples in the module docstring).
+# NOTE(review): utils.get_functions presumably collects this module's
+# functions so that fire exposes each one as a sub-command -- confirm.
+if __name__ == "__main__":
+    fire.Fire(utils.get_functions(__name__))
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 alembic==1.8.1
 ascii_magic==2.3.0
--e git+https://github.com/abrichr/atomacos.git; sys_platform == 'darwin'
+git+https://github.com/abrichr/atomacos.git; sys_platform == 'darwin'
 bokeh==2.4.3
 clipboard==0.0.4
 deepdiff[optimize]==6.3.0
@@ -28,3 +28,7 @@ tiktoken==0.4.0
 torch==2.0.0
 tqdm==4.64.0
 transformers==4.28.1
+pytesseract==0.3.7
+presidio_analyzer
+presidio_anonymizer
+presidio_image_redactor
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,6 +16,7 @@ cache @@
     # VSCode
     .VSCode
+    .vsCode
     # Generated performance charts
     performance
@@ Expand Down @@