Skip to content

Commit

Permalink
Enabling allow list approach with all image redaction (#1145)
Browse files Browse the repository at this point in the history
* Enabling allow list approach

* Adding in empty list for allow_list in existing unit tests

* Added unit tests for newly introduced methods

* Adding unit test for allow list functionality

* Linting fixes

* Removing spaces in empty lines

* Fix integration test not accounting for empty space removal

* Updating notebook with more examples and adding ad_hoc_recognizers approach to standard image redactor engine as well

* Linting fixes

* Removing incomplete example code

* Fixing section header numbers

* Removing duplicate comment

---------

Co-authored-by: Omri Mendels <[email protected]>
  • Loading branch information
niwilso and omri374 authored Aug 23, 2023
1 parent 994074b commit 60e1f7d
Show file tree
Hide file tree
Showing 7 changed files with 784 additions and 64 deletions.
1 change: 1 addition & 0 deletions docs/samples/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
| Usage | Python Notebook | [Getting the identified entity value using a custom Operator](python/getting_entity_values.ipynb)|
| Usage | Python Notebook | [Anonymizing known values](https://github.com/microsoft/presidio/blob/main/docs/samples/python/Anonymizing%20known%20values.ipynb)
| Usage | Python Notebook | [Redacting text PII from DICOM images](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_dicom_image_redactor.ipynb)
| Usage | Python Notebook | [Using an allow list with image redaction](https://github.com/microsoft/presidio/blob/main/docs/samples/python/image_redaction_allow_list_approach.ipynb)
| Usage | Python Notebook | [Annotating PII in a PDF](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_pdf_annotation.ipynb)
| Usage | Python Notebook | [Integrating with external services](https://github.com/microsoft/presidio/blob/main/docs/samples/python/integrating_with_external_services.ipynb) |
| Usage | Python | [Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py) |
Expand Down
571 changes: 571 additions & 0 deletions docs/samples/python/image_redaction_allow_list_approach.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

from presidio_image_redactor import ImageRedactorEngine
from presidio_image_redactor import ImageAnalyzerEngine # noqa: F401
import presidio_analyzer # required for isinstance check which throws an error when trying to specify PatternRecognizer # noqa: E501
from presidio_analyzer import PatternRecognizer
from presidio_image_redactor.entities import ImageRecognizerResult

Expand Down Expand Up @@ -906,16 +905,7 @@ def _get_analyzer_results(
:return: Analyzer results.
"""
# Check the ad-hoc recognizers list
if isinstance(ad_hoc_recognizers, (list, type(None))):
if isinstance(ad_hoc_recognizers, list):
if len(ad_hoc_recognizers) >= 1:
are_recognizers = all(isinstance(x, presidio_analyzer.pattern_recognizer.PatternRecognizer) for x in ad_hoc_recognizers) # noqa: E501
if are_recognizers is False:
raise TypeError("All items in ad_hoc_recognizers list must be PatternRecognizer objects") # noqa: E501
else:
raise TypeError("ad_hoc_recognizers must be None or list of PatternRecognizer") # noqa: E501
else:
raise TypeError("ad_hoc_recognizers must be None or list of PatternRecognizer") # noqa: E501
self._check_ad_hoc_recognizer_list(ad_hoc_recognizers)

# Create custom recognizer using DICOM metadata
if use_metadata:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def analyze(
# Perform OCR
perform_ocr_kwargs, ocr_threshold = self._parse_ocr_kwargs(ocr_kwargs)
ocr_result = self.ocr.perform_ocr(image, **perform_ocr_kwargs)
ocr_result = self.remove_space_boxes(ocr_result)

# Apply OCR confidence threshold if it is passed in
if ocr_threshold:
Expand All @@ -53,8 +54,9 @@ def analyze(
analyzer_result = self.analyzer_engine.analyze(
text=text, language="en", **text_analyzer_kwargs
)
allow_list = self._check_for_allow_list(text_analyzer_kwargs)
bboxes = self.map_analyzer_results_to_bounding_boxes(
analyzer_result, ocr_result, text
analyzer_result, ocr_result, text, allow_list
)
return bboxes

Expand Down Expand Up @@ -83,9 +85,33 @@ def threshold_ocr_result(ocr_result: dict, ocr_threshold: float) -> dict:

return filtered_ocr_result

@staticmethod
def remove_space_boxes(ocr_result: dict) -> dict:
    """Drop OCR entries whose text is empty or made up only of whitespace.

    :param ocr_result: OCR results (raw or thresholded).
    :return: OCR results with empty words removed.
    """
    # Positions of the entries that actually carry text
    keep = [
        pos
        for pos, token in enumerate(ocr_result["text"])
        if token != "" and not token.isspace()
    ]

    # Rebuild every column of the OCR dict from the retained positions only
    return {
        field: [ocr_result[field][pos] for pos in keep]
        for field in ocr_result
    }

@staticmethod
def map_analyzer_results_to_bounding_boxes(
text_analyzer_results: List[RecognizerResult], ocr_result: dict, text: str
text_analyzer_results: List[RecognizerResult],
ocr_result: dict,
text: str,
allow_list: List[str],
) -> List[ImageRecognizerResult]:
"""Map extracted PII entities to image bounding boxes.
Expand All @@ -95,6 +121,7 @@ def map_analyzer_results_to_bounding_boxes(
:param text_analyzer_results: PII entities recognized by presidio analyzer
:param ocr_result: dict results with words and bboxes from OCR
:param text: text the results are based on
:param allow_list: List of words to not redact
:return: list of extracted entities with image bounding boxes
"""
Expand All @@ -117,40 +144,54 @@ def map_analyzer_results_to_bounding_boxes(
if (
max(pos, element.start) < min(element.end, pos + len(word))
) and ((text_element in word) or (word in text_element)):
bboxes.append(
ImageRecognizerResult(
element.entity_type,
element.start,
element.end,
element.score,
ocr_result["left"][index],
ocr_result["top"][index],
ocr_result["width"][index],
ocr_result["height"][index],
)
yes_make_bbox_for_word = (
(word is not None)
and (word != "")
and (word.isspace() is False)
and (word not in allow_list)
)
# Do not add bbox for standalone spaces / empty strings
if yes_make_bbox_for_word:
bboxes.append(
ImageRecognizerResult(
element.entity_type,
element.start,
element.end,
element.score,
ocr_result["left"][index],
ocr_result["top"][index],
ocr_result["width"][index],
ocr_result["height"][index],
)
)

# add bounding boxes for all words in ocr dict
# contained within the text of recognized entity
# based on relative position in the full text
while pos + len(word) < element.end:
prev_word = word
index, word = next(iter_ocr)
if word:
bboxes.append(
ImageRecognizerResult(
element.entity_type,
element.start,
element.end,
element.score,
ocr_result["left"][index],
ocr_result["top"][index],
ocr_result["width"][index],
ocr_result["height"][index],
)
# add bounding boxes for all words in ocr dict
# contained within the text of recognized entity
# based on relative position in the full text
while pos + len(word) < element.end:
prev_word = word
index, word = next(iter_ocr)
yes_make_bbox_for_word = (
(word is not None)
and (word != "")
and (word.isspace() is False)
and (word not in allow_list)
)
pos += len(prev_word) + 1
proc_indexes += 1
if yes_make_bbox_for_word:
bboxes.append(
ImageRecognizerResult(
element.entity_type,
element.start,
element.end,
element.score,
ocr_result["left"][index],
ocr_result["top"][index],
ocr_result["width"][index],
ocr_result["height"][index],
)
)
pos += len(prev_word) + 1
proc_indexes += 1

if proc_indexes == indexes:
break
Expand Down Expand Up @@ -179,3 +220,17 @@ def _parse_ocr_kwargs(ocr_kwargs: dict) -> Tuple[dict, float]:
ocr_kwargs = {}

return ocr_kwargs, ocr_threshold

@staticmethod
def _check_for_allow_list(text_analyzer_kwargs: dict) -> List[str]:
    """Extract the allow list from the text analyzer kwargs.

    :param text_analyzer_kwargs: Text analyzer kwargs (may be None).
    :return: The value stored under "allow_list", or an empty list when
        the kwargs are None or empty, the key is absent, or the stored
        value is None.
    """
    if not text_analyzer_kwargs:
        return []

    allow_list = text_analyzer_kwargs.get("allow_list")
    # An explicit allow_list=None must still yield a list, because the
    # result is used in membership tests (`word not in allow_list`).
    return [] if allow_list is None else allow_list
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typing import Union, Tuple, Optional
from typing import Union, Tuple, Optional, List

from PIL import Image, ImageDraw, ImageChops

from presidio_image_redactor import ImageAnalyzerEngine, BboxProcessor
import presidio_analyzer # required for isinstance check which throws an error when trying to specify PatternRecognizer # noqa: E501
from presidio_analyzer import PatternRecognizer


class ImageRedactorEngine:
Expand All @@ -24,6 +26,7 @@ def redact(
image: Image,
fill: Union[int, Tuple[int, int, int]] = (0, 0, 0),
ocr_kwargs: Optional[dict] = None,
ad_hoc_recognizers: Optional[List[PatternRecognizer]] = None,
**text_analyzer_kwargs,
) -> Image:
"""Redact method to redact the given image.
Expand All @@ -34,6 +37,8 @@ def redact(
:param fill: colour to fill the shape - int (0-255) for
grayscale or Tuple(R, G, B) for RGB.
:param ocr_kwargs: Additional params for OCR methods.
:param ad_hoc_recognizers: List of PatternRecognizer objects to use
for ad-hoc recognizer.
:param text_analyzer_kwargs: Additional values for the analyze method
in AnalyzerEngine.
Expand All @@ -42,9 +47,24 @@ def redact(

image = ImageChops.duplicate(image)

bboxes = self.image_analyzer_engine.analyze(
image, ocr_kwargs, **text_analyzer_kwargs
)
# Check the ad-hoc recognizers list
self._check_ad_hoc_recognizer_list(ad_hoc_recognizers)

# Detect PII
if ad_hoc_recognizers is None:
bboxes = self.image_analyzer_engine.analyze(
image,
ocr_kwargs=ocr_kwargs,
**text_analyzer_kwargs,
)
else:
bboxes = self.image_analyzer_engine.analyze(
image,
ocr_kwargs=ocr_kwargs,
ad_hoc_recognizers=ad_hoc_recognizers,
**text_analyzer_kwargs,
)

draw = ImageDraw.Draw(image)

for box in bboxes:
Expand All @@ -55,3 +75,23 @@ def redact(
draw.rectangle([x0, y0, x1, y1], fill=fill)

return image

@staticmethod
def _check_ad_hoc_recognizer_list(
    ad_hoc_recognizers: Optional[List[PatternRecognizer]] = None
):
    """Validate the ad-hoc recognizer argument, raising if it is unusable.

    :param ad_hoc_recognizers: List of PatternRecognizer objects to use
        for ad-hoc recognizer.
    :raises TypeError: If the argument is neither None nor a list, is an
        empty list, or contains any item that is not a PatternRecognizer.
    """
    # None means no ad-hoc recognizers were supplied; nothing to validate
    if ad_hoc_recognizers is None:
        return

    # A non-list value and an empty list are rejected with the same message
    if not isinstance(ad_hoc_recognizers, list) or len(ad_hoc_recognizers) < 1:
        raise TypeError("ad_hoc_recognizers must be None or list of PatternRecognizer")  # noqa: E501

    for candidate in ad_hoc_recognizers:
        if not isinstance(candidate, presidio_analyzer.pattern_recognizer.PatternRecognizer):  # noqa: E501
            raise TypeError("All items in ad_hoc_recognizers list must be PatternRecognizer objects")  # noqa: E501
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@ def test_given_image_without_text_then_no_entities_recognized(image_analyzer_eng
def __get_expected_ocr_test_image_analysis_results():
# fmt: off
return [
ImageRecognizerResult(entity_type="PERSON", start=31, end=44,
ImageRecognizerResult(entity_type="PERSON", start=27, end=40,
score=0.85, left=472, top=20, width=91, height=31),
ImageRecognizerResult(entity_type="PERSON", start=31, end=44,
ImageRecognizerResult(entity_type="PERSON", start=27, end=40,
score=0.85, left=576, top=20, width=147, height=31),
ImageRecognizerResult(entity_type="URL", start=295, end=320,
ImageRecognizerResult(entity_type="URL", start=286, end=311,
score=0.6, left=28, top=299, width=438, height=38),
ImageRecognizerResult(entity_type="PHONE_NUMBER", start=332, end=346,
ImageRecognizerResult(entity_type="PHONE_NUMBER", start=323, end=337,
score=0.4, left=666, top=298, width=88, height=40),
ImageRecognizerResult(entity_type="PHONE_NUMBER", start=332, end=346,
ImageRecognizerResult(entity_type="PHONE_NUMBER", start=323, end=337,
score=0.4, left=769, top=301, width=169, height=29),
ImageRecognizerResult(entity_type="EMAIL_ADDRESS", start=772, end=794,
ImageRecognizerResult(entity_type="EMAIL_ADDRESS", start=749, end=771,
score=1.0, left=27, top=912, width=458, height=39),
ImageRecognizerResult(entity_type="URL", start=781, end=794,
ImageRecognizerResult(entity_type="URL", start=758, end=771,
score=0.5, left=27, top=912, width=458, height=39),
]
# fmt: on
Loading

0 comments on commit 60e1f7d

Please sign in to comment.