Skip to content

Commit

Permalink
Add scrub module
Browse files Browse the repository at this point in the history
Add test_scrub module
Add Scrubbing config in config.py
Add presidio and required packages in requirements.txt
  • Loading branch information
Krish Patel committed May 30, 2023
1 parent 2bbe036 commit ce67fd1
Show file tree
Hide file tree
Showing 6 changed files with 488 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ cache

# VSCode
.VSCode
.vscode

# Generated performance charts
performance
Expand Down
Binary file added assets/test_scrub_image.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
57 changes: 57 additions & 0 deletions openadapt/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

from dotenv import load_dotenv
from loguru import logger
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine


_DEFAULTS = {
Expand Down Expand Up @@ -43,3 +47,56 @@ def getenv_fallback(var_name):
for key, val in locals().items():
if not key.startswith("_") and key.isupper():
logger.info(f"{key}={val}")


# SCRUBBING CONFIGURATIONS

# NLP backend handed to Presidio: spaCy with the English transformer model.
SCRUB_CONFIG = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_trf"}],
}

# The engines below are built once, at import time, and shared by
# openadapt/scrub.py. Each one depends on the object created before it,
# so the statement order matters.
SCRUB_PROVIDER = NlpEngineProvider(nlp_configuration=SCRUB_CONFIG)
NLP_ENGINE = SCRUB_PROVIDER.create_engine()
ANALYZER = AnalyzerEngine(nlp_engine=NLP_ENGINE, supported_languages=["en"])
ANONYMIZER = AnonymizerEngine()
IMAGE_REDACTOR = ImageRedactorEngine(ImageAnalyzerEngine(ANALYZER))

# Entity types that are deliberately NOT scrubbed. The commented-out names
# are kept as a reference of other entity types; uncomment one to stop
# scrubbing it.
SCRUB_IGNORE_ENTITIES = [
    # "US_PASSPORT",
    # "US_DRIVER_LICENSE",
    # "CRYPTO",
    # "UK_NHS",
    # "PERSON",
    # "CREDIT_CARD",
    # "US_BANK_NUMBER",
    # "PHONE_NUMBER",
    # "US_ITIN",
    # "AU_ABN",
    "DATE_TIME",
    # "NRP",
    # "SG_NRIC_FIN",
    # "AU_ACN",
    # "IP_ADDRESS",
    # "EMAIL_ADDRESS",
    "URL",
    # "IBAN_CODE",
    # "AU_TFN",
    # "LOCATION",
    # "AU_MEDICARE",
    # "US_SSN",
    # "MEDICAL_LICENSE",
]

# Every entity type the analyzer supports, minus the ignored ones above.
SCRUBBING_ENTITIES = [
    entity_name
    for entity_name in ANALYZER.get_supported_entities()
    if entity_name not in SCRUB_IGNORE_ENTITIES
]

# Default dict keys whose string values scrub_dict() passes to scrub_text().
SCRUB_KEYS_HTML = ["text", "canonical_text", "title", "state"]

# Default fill passed to the image redactor for redacted regions.
DEFAULT_SCRUB_FILL_COLOR = (255, 0, 0)
210 changes: 210 additions & 0 deletions openadapt/scrub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
"""Module to scrub text of all PII/PHI.
Usage:
$ python openadapt/scrub.py scrub_text str_arg
$ python openadapt/scrub.py scrub_image Image_arg
"""

from io import BytesIO
from mss import base
from PIL import Image
from presidio_anonymizer.entities import OperatorConfig
import fire

from openadapt import config, utils


def scrub_text(text: str, is_hyphenated: bool = False) -> str:
    """
    Scrub the text of all PII/PHI using Presidio Analyzer and Anonymizer

    Args:
        text (str): Text to be scrubbed. None is passed through unchanged.
        is_hyphenated (bool): If True, every "-" is removed from the text
            before it is analyzed.

    Returns:
        str: Scrubbed text with detected entities masked by "*" characters,
            or None if ``text`` is None.
    """

    if text is None:
        return None

    if is_hyphenated:
        text = text.replace("-", "")

    analyzer_results = config.ANALYZER.analyze(
        text=text, entities=config.SCRUBBING_ENTITIES, language="en"
    )

    # Anonymizer operators are keyed by entity *type*, not by individual
    # finding. Sizing the mask from whichever finding happened to come last
    # would leave longer findings of the same type partly unmasked, so use
    # the longest span seen per type. This relies on Presidio's mask
    # operator capping `chars_to_mask` at the length of each entity.
    longest_span = {}
    for result in analyzer_results:
        span = result.end - result.start
        longest_span[result.entity_type] = max(
            longest_span.get(result.entity_type, 0), span
        )

    operators = {
        entity_type: OperatorConfig(
            "mask",
            {
                "masking_char": "*",
                "chars_to_mask": span,
                "from_end": True,
            },
        )
        for entity_type, span in longest_span.items()
    }

    anonymized_results = config.ANONYMIZER.anonymize(
        text=text,
        analyzer_results=analyzer_results,
        operators=operators,
    )

    return anonymized_results.text


def scrub_image(
    image: Image.Image, fill_color=config.DEFAULT_SCRUB_FILL_COLOR
) -> Image.Image:
    """
    Scrub the image of all PII/PHI using Presidio Image Redactor

    Args:
        image (PIL.Image.Image): The image to be scrubbed
        fill_color: Fill passed to the redactor for the redacted regions.
            Defaults to config.DEFAULT_SCRUB_FILL_COLOR.

    Returns:
        PIL.Image.Image: The redacted image returned by the redactor.
    """

    # Only the entity types listed in config.SCRUBBING_ENTITIES are redacted.
    return config.IMAGE_REDACTOR.redact(
        image, fill=fill_color, entities=config.SCRUBBING_ENTITIES
    )


def scrub_screenshot(
    screenshot: base.ScreenShot, fill_color=config.DEFAULT_SCRUB_FILL_COLOR
) -> base.ScreenShot:
    """
    Scrub the screenshot of all PII/PHI using Presidio Image Redactor

    Args:
        screenshot (mss.base.ScreenShot): The screenshot to be scrubbed
        fill_color: Fill passed to the redactor for the redacted regions.
            Defaults to config.DEFAULT_SCRUB_FILL_COLOR.

    Returns:
        mss.base.ScreenShot: A new screenshot with the same position and
            size as the input, built from the redacted pixel data.
    """

    # Convert the MSS screenshot (raw BGRA bytes) to a PIL Image
    image = Image.frombytes("RGBA", screenshot.size, screenshot.bgra, "raw", "BGRA")

    # Use the scrub_image function to scrub the image
    redacted_image = scrub_image(image, fill_color)

    # mss stores pixels as 4-byte BGRA, so serialise the redacted image the
    # same way; packing it as 3-byte RGB would give ScreenShot a buffer with
    # the wrong length and channel order. convert() guards against the
    # redactor handing back an image in a different mode.
    raw_data = bytearray(redacted_image.convert("RGBA").tobytes("raw", "BGRA"))

    # Prepare monitor information from the original screenshot
    monitor_info = {
        "left": screenshot.left,
        "top": screenshot.top,
        "width": screenshot.width,
        "height": screenshot.height,
    }

    # Construct and return a new screenshot with the redacted image data
    return base.ScreenShot(raw_data, monitor_info)


def scrub_png_data(png_data: bytes, fill_color=config.DEFAULT_SCRUB_FILL_COLOR) -> bytes:
    """
    Scrub PNG-encoded image bytes of all PII/PHI using Presidio Image Redactor

    Args:
        png_data (bytes): Encoded image data to be scrubbed
        fill_color: Fill passed to the redactor for the redacted regions.
            Defaults to config.DEFAULT_SCRUB_FILL_COLOR.

    Returns:
        bytes: The redacted image, re-encoded as PNG
    """
    # Decode the incoming bytes into a PIL image
    source_image = Image.open(BytesIO(png_data))

    # Redact the configured entity types using the requested fill
    scrubbed_image = config.IMAGE_REDACTOR.redact(
        source_image, fill=fill_color, entities=config.SCRUBBING_ENTITIES
    )

    # Re-encode as PNG in memory and hand back the raw bytes
    with BytesIO() as png_buffer:
        scrubbed_image.save(png_buffer, format="PNG")  # type: ignore
        return png_buffer.getvalue()


def scrub_dict(input_dict: dict, list_keys: list = None) -> dict:
    """
    Scrub the dict of all PII/PHI using Presidio Analyzer and Anonymizer.

    String values are scrubbed only when they sit under a key listed in
    ``list_keys``; nested dicts and lists (at any depth) are walked so that
    strings inside them are scrubbed by the same rule. All other values are
    copied through unchanged. The input dict is not modified.

    Args:
        input_dict (dict): A dict to be scrubbed
        list_keys (list): Keys whose string values are scrubbed. Defaults to
            config.SCRUB_KEYS_HTML when None.

    Returns:
        dict: A new dict with the selected string values scrubbed.
    """

    if list_keys is None:
        list_keys = config.SCRUB_KEYS_HTML

    def _scrub_value(key, value):
        """Scrub one value stored under `key`, recursing into containers."""
        if isinstance(value, str):
            return scrub_text(value) if key in list_keys else value
        if isinstance(value, dict):
            return scrub_dict(value, list_keys)
        if isinstance(value, list):
            # Items keep the key of the list they belong to, so strings in
            # lists nested inside lists are no longer skipped.
            return [_scrub_value(key, item) for item in value]
        return value

    return {key: _scrub_value(key, value) for key, value in input_dict.items()}


def scrub_list_dicts(input_list: list[dict], list_keys: list = None) -> list[dict]:
    """
    Scrub every dict in a list of all PII/PHI by applying scrub_dict to each.

    Args:
        input_list (list[dict]): A list of dicts to be scrubbed
        list_keys (list): Forwarded unchanged to scrub_dict for every element.

    Returns:
        list[dict]: A new list holding the scrubbed dicts, in input order.
    """

    return [scrub_dict(entry, list_keys) for entry in input_list]


if __name__ == "__main__":
    # Hand the functions collected by utils.get_functions for this module
    # to python-fire so they can be invoked from the command line.
    cli_functions = utils.get_functions(__name__)
    fire.Fire(cli_functions)
6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
alembic==1.8.1
ascii_magic==2.3.0
-e git+https://github.com/abrichr/atomacos.git; sys_platform == 'darwin'
git+https://github.com/abrichr/atomacos.git; sys_platform == 'darwin'
bokeh==2.4.3
clipboard==0.0.4
deepdiff[optimize]==6.3.0
Expand Down Expand Up @@ -28,3 +28,7 @@ tiktoken==0.4.0
torch==2.0.0
tqdm==4.64.0
transformers==4.28.1
pytesseract==0.3.7
presidio_analyzer
presidio_anonymizer
presidio_image_redactor
Loading

0 comments on commit ce67fd1

Please sign in to comment.