Enabling allow list approach with all image redaction (#1145)

* Enabling allow list approach
* Adding in empty list for allow_list in existing unit tests
* Added unit tests for newly introduced methods
* Adding unit test for allow list functionality
* Linting fixes
* Removing spaces in empty lines
* Fix integration test not accounting for empty space removal
* Updating notebook with more examples and adding ad_hoc_recognizers approach to standard image redactor engine as well
* Linting fixes
* Removing incomplete example code
* Fixing section header numbers
* Removing duplicate comment

---------

Co-authored-by: Omri Mendels <[email protected]>
microsoft · Aug 23, 2023 · 60e1f7d · 60e1f7d
1 parent 994074b
commit 60e1f7d
Show file tree

Hide file tree

Showing 7 changed files with 784 additions and 64 deletions.
diff --git a/docs/samples/index.md b/docs/samples/index.md
@@ -9,6 +9,7 @@
 | Usage       | Python Notebook                       | [Getting the identified entity value using a custom Operator](python/getting_entity_values.ipynb)|
 | Usage       | Python Notebook                       | [Anonymizing known values](https://github.com/microsoft/presidio/blob/main/docs/samples/python/Anonymizing%20known%20values.ipynb)
 | Usage       | Python Notebook                       | [Redacting text PII from DICOM images](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_dicom_image_redactor.ipynb)
+| Usage       | Python Notebook                       | [Using an allow list with image redaction](https://github.com/microsoft/presidio/blob/main/docs/samples/python/image_redaction_allow_list_approach.ipynb)
 | Usage       | Python Notebook                       | [Annotating PII in a PDF](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_pdf_annotation.ipynb)
 | Usage       | Python Notebook                       | [Integrating with external services](https://github.com/microsoft/presidio/blob/main/docs/samples/python/integrating_with_external_services.ipynb) |
 | Usage       | Python                                | [Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py) |

diff --git a/docs/samples/python/image_redaction_allow_list_approach.ipynb b/docs/samples/python/image_redaction_allow_list_approach.ipynb
diff --git a/presidio-image-redactor/presidio_image_redactor/dicom_image_redactor_engine.py b/presidio-image-redactor/presidio_image_redactor/dicom_image_redactor_engine.py
@@ -16,7 +16,6 @@
 
 from presidio_image_redactor import ImageRedactorEngine
 from presidio_image_redactor import ImageAnalyzerEngine  # noqa: F401
-import presidio_analyzer  # required for isinstance check which throws an error when trying to specify PatternRecognizer  # noqa: E501
 from presidio_analyzer import PatternRecognizer
 from presidio_image_redactor.entities import ImageRecognizerResult
 
@@ -906,16 +905,7 @@ def _get_analyzer_results(
         :return: Analyzer results.
         """
         # Check the ad-hoc recognizers list
-        if isinstance(ad_hoc_recognizers, (list, type(None))):
-            if isinstance(ad_hoc_recognizers, list):
-                if len(ad_hoc_recognizers) >= 1:
-                    are_recognizers = all(isinstance(x, presidio_analyzer.pattern_recognizer.PatternRecognizer) for x in ad_hoc_recognizers)  # noqa: E501
-                    if are_recognizers is False:
-                        raise TypeError("All items in ad_hoc_recognizers list must be PatternRecognizer objects")  # noqa: E501
-                else:
-                    raise TypeError("ad_hoc_recognizers must be None or list of PatternRecognizer")  # noqa: E501
-        else:
-            raise TypeError("ad_hoc_recognizers must be None or list of PatternRecognizer")  # noqa: E501
+        self._check_ad_hoc_recognizer_list(ad_hoc_recognizers)
 
         # Create custom recognizer using DICOM metadata
         if use_metadata:

diff --git a/presidio-image-redactor/presidio_image_redactor/image_analyzer_engine.py b/presidio-image-redactor/presidio_image_redactor/image_analyzer_engine.py
@@ -42,6 +42,7 @@ def analyze(
         # Perform OCR
         perform_ocr_kwargs, ocr_threshold = self._parse_ocr_kwargs(ocr_kwargs)
         ocr_result = self.ocr.perform_ocr(image, **perform_ocr_kwargs)
+        ocr_result = self.remove_space_boxes(ocr_result)
 
         # Apply OCR confidence threshold if it is passed in
         if ocr_threshold:
@@ -53,8 +54,9 @@ def analyze(
         analyzer_result = self.analyzer_engine.analyze(
             text=text, language="en", **text_analyzer_kwargs
         )
+        allow_list = self._check_for_allow_list(text_analyzer_kwargs)
         bboxes = self.map_analyzer_results_to_bounding_boxes(
-            analyzer_result, ocr_result, text
+            analyzer_result, ocr_result, text, allow_list
         )
         return bboxes
 
@@ -83,9 +85,33 @@ def threshold_ocr_result(ocr_result: dict, ocr_threshold: float) -> dict:
 
         return filtered_ocr_result
 
+    @staticmethod
+    def remove_space_boxes(ocr_result: dict) -> dict:
+        """Drop OCR entries whose text is empty or whitespace-only.
+
+        :param ocr_result: OCR results (raw or thresholded).
+        :return: New dict with every key filtered to the kept entries.
+        """
+        # Positions of the entries that carry real text
+        keep = [
+            pos
+            for pos, word in enumerate(ocr_result["text"])
+            if not (word.isspace() or word == "")
+        ]
+
+        # Apply the same selection to every list so that the values of
+        # all keys stay aligned with each other
+        return {
+            name: [values[pos] for pos in keep]
+            for name, values in ocr_result.items()
+        }
+
     @staticmethod
     def map_analyzer_results_to_bounding_boxes(
-        text_analyzer_results: List[RecognizerResult], ocr_result: dict, text: str
+        text_analyzer_results: List[RecognizerResult],
+        ocr_result: dict,
+        text: str,
+        allow_list: List[str],
     ) -> List[ImageRecognizerResult]:
         """Map extracted PII entities to image bounding boxes.
 
@@ -95,6 +121,7 @@ def map_analyzer_results_to_bounding_boxes(
         :param text_analyzer_results: PII entities recognized by presidio analyzer
         :param ocr_result: dict results with words and bboxes from OCR
         :param text: text the results are based on
+        :param allow_list: List of words to not redact
 
         return: list of extracted entities with image bounding boxes
         """
@@ -117,40 +144,54 @@ def map_analyzer_results_to_bounding_boxes(
                     if (
                         max(pos, element.start) < min(element.end, pos + len(word))
                     ) and ((text_element in word) or (word in text_element)):
-                        bboxes.append(
-                            ImageRecognizerResult(
-                                element.entity_type,
-                                element.start,
-                                element.end,
-                                element.score,
-                                ocr_result["left"][index],
-                                ocr_result["top"][index],
-                                ocr_result["width"][index],
-                                ocr_result["height"][index],
-                            )
+                        yes_make_bbox_for_word = (
+                            (word is not None)
+                            and (word != "")
+                            and (word.isspace() is False)
+                            and (word not in allow_list)
                         )
+                        # Do not add bbox for standalone spaces / empty strings
+                        if yes_make_bbox_for_word:
+                            bboxes.append(
+                                ImageRecognizerResult(
+                                    element.entity_type,
+                                    element.start,
+                                    element.end,
+                                    element.score,
+                                    ocr_result["left"][index],
+                                    ocr_result["top"][index],
+                                    ocr_result["width"][index],
+                                    ocr_result["height"][index],
+                                )
+                            )
 
-                        # add bounding boxes for all words in ocr dict
-                        # contained within the text of recognized entity
-                        # based on relative position in the full text
-                        while pos + len(word) < element.end:
-                            prev_word = word
-                            index, word = next(iter_ocr)
-                            if word:
-                                bboxes.append(
-                                    ImageRecognizerResult(
-                                        element.entity_type,
-                                        element.start,
-                                        element.end,
-                                        element.score,
-                                        ocr_result["left"][index],
-                                        ocr_result["top"][index],
-                                        ocr_result["width"][index],
-                                        ocr_result["height"][index],
-                                    )
+                            # add bounding boxes for all words in ocr dict
+                            # contained within the text of recognized entity
+                            # based on relative position in the full text
+                            while pos + len(word) < element.end:
+                                prev_word = word
+                                index, word = next(iter_ocr)
+                                yes_make_bbox_for_word = (
+                                    (word is not None)
+                                    and (word != "")
+                                    and (word.isspace() is False)
+                                    and (word not in allow_list)
                                 )
-                            pos += len(prev_word) + 1
-                        proc_indexes += 1
+                                if yes_make_bbox_for_word:
+                                    bboxes.append(
+                                        ImageRecognizerResult(
+                                            element.entity_type,
+                                            element.start,
+                                            element.end,
+                                            element.score,
+                                            ocr_result["left"][index],
+                                            ocr_result["top"][index],
+                                            ocr_result["width"][index],
+                                            ocr_result["height"][index],
+                                        )
+                                    )
+                                pos += len(prev_word) + 1
+                            proc_indexes += 1
 
                 if proc_indexes == indexes:
                     break
@@ -179,3 +220,17 @@ def _parse_ocr_kwargs(ocr_kwargs: dict) -> Tuple[dict, float]:
             ocr_kwargs = {}
 
         return ocr_kwargs, ocr_threshold
+
+    @staticmethod
+    def _check_for_allow_list(text_analyzer_kwargs: dict) -> List[str]:
+        """Extract the allow list from the text analyzer kwargs.
+
+        :param text_analyzer_kwargs: Text analyzer kwargs (may be None).
+        :return: The allow list, or an empty list when it is absent or None.
+        """
+        if not text_analyzer_kwargs:
+            return []
+
+        # An explicit allow_list=None must not leak out: callers run
+        # `word not in allow_list`, which raises TypeError on None.
+        return text_analyzer_kwargs.get("allow_list") or []
diff --git a/presidio-image-redactor/presidio_image_redactor/image_redactor_engine.py b/presidio-image-redactor/presidio_image_redactor/image_redactor_engine.py
@@ -1,8 +1,10 @@
-from typing import Union, Tuple, Optional
+from typing import Union, Tuple, Optional, List
 
 from PIL import Image, ImageDraw, ImageChops
 
 from presidio_image_redactor import ImageAnalyzerEngine, BboxProcessor
+import presidio_analyzer  # required for isinstance check which throws an error when trying to specify PatternRecognizer  # noqa: E501
+from presidio_analyzer import PatternRecognizer
 
 
 class ImageRedactorEngine:
@@ -24,6 +26,7 @@ def redact(
         image: Image,
         fill: Union[int, Tuple[int, int, int]] = (0, 0, 0),
         ocr_kwargs: Optional[dict] = None,
+        ad_hoc_recognizers: Optional[List[PatternRecognizer]] = None,
         **text_analyzer_kwargs,
     ) -> Image:
         """Redact method to redact the given image.
@@ -34,6 +37,8 @@ def redact(
         :param fill: colour to fill the shape - int (0-255) for
         grayscale or Tuple(R, G, B) for RGB.
         :param ocr_kwargs: Additional params for OCR methods.
+        :param ad_hoc_recognizers: List of PatternRecognizer objects to use
+        for ad-hoc recognizer.
         :param text_analyzer_kwargs: Additional values for the analyze method
         in AnalyzerEngine.
 
@@ -42,9 +47,24 @@ def redact(
 
         image = ImageChops.duplicate(image)
 
-        bboxes = self.image_analyzer_engine.analyze(
-            image, ocr_kwargs, **text_analyzer_kwargs
-        )
+        # Check the ad-hoc recognizers list
+        self._check_ad_hoc_recognizer_list(ad_hoc_recognizers)
+
+        # Detect PII
+        if ad_hoc_recognizers is None:
+            bboxes = self.image_analyzer_engine.analyze(
+                image,
+                ocr_kwargs=ocr_kwargs,
+                **text_analyzer_kwargs,
+            )
+        else:
+            bboxes = self.image_analyzer_engine.analyze(
+                image,
+                ocr_kwargs=ocr_kwargs,
+                ad_hoc_recognizers=ad_hoc_recognizers,
+                **text_analyzer_kwargs,
+            )
+
         draw = ImageDraw.Draw(image)
 
         for box in bboxes:
@@ -55,3 +75,23 @@ def redact(
             draw.rectangle([x0, y0, x1, y1], fill=fill)
 
         return image
+
+    @staticmethod
+    def _check_ad_hoc_recognizer_list(
+        ad_hoc_recognizers: Optional[List[PatternRecognizer]] = None
+    ):
+        """Validate the ad-hoc recognizer argument.
+
+        :param ad_hoc_recognizers: None, or a non-empty list made up only
+        of PatternRecognizer objects.
+        :raises TypeError: If the value is anything else (wrong type,
+        empty list, or a list holding a non-PatternRecognizer item).
+        """
+        if ad_hoc_recognizers is None:
+            return
+        if not isinstance(ad_hoc_recognizers, list) or not ad_hoc_recognizers:
+            raise TypeError("ad_hoc_recognizers must be None or list of PatternRecognizer")  # noqa: E501
+        recognizer_cls = presidio_analyzer.pattern_recognizer.PatternRecognizer
+        for item in ad_hoc_recognizers:
+            if not isinstance(item, recognizer_cls):
+                raise TypeError("All items in ad_hoc_recognizers list must be PatternRecognizer objects")  # noqa: E501
diff --git a/presidio-image-redactor/tests/integration/test_image_analyzer_engine_integration.py b/presidio-image-redactor/tests/integration/test_image_analyzer_engine_integration.py
@@ -22,19 +22,19 @@ def test_given_image_without_text_then_no_entities_recognized(image_analyzer_eng
 def __get_expected_ocr_test_image_analysis_results():
     # fmt: off
     return [
-        ImageRecognizerResult(entity_type="PERSON", start=31, end=44,
+        ImageRecognizerResult(entity_type="PERSON", start=27, end=40,
                               score=0.85, left=472, top=20, width=91, height=31),
-        ImageRecognizerResult(entity_type="PERSON", start=31, end=44,
+        ImageRecognizerResult(entity_type="PERSON", start=27, end=40,
                               score=0.85, left=576, top=20, width=147, height=31),
-        ImageRecognizerResult(entity_type="URL", start=295, end=320,
+        ImageRecognizerResult(entity_type="URL", start=286, end=311,
                               score=0.6, left=28, top=299, width=438, height=38),
-        ImageRecognizerResult(entity_type="PHONE_NUMBER", start=332, end=346,
+        ImageRecognizerResult(entity_type="PHONE_NUMBER", start=323, end=337,
                               score=0.4, left=666, top=298, width=88, height=40),
-        ImageRecognizerResult(entity_type="PHONE_NUMBER", start=332, end=346,
+        ImageRecognizerResult(entity_type="PHONE_NUMBER", start=323, end=337,
                               score=0.4, left=769, top=301, width=169, height=29),
-        ImageRecognizerResult(entity_type="EMAIL_ADDRESS", start=772, end=794,
+        ImageRecognizerResult(entity_type="EMAIL_ADDRESS", start=749, end=771,
                               score=1.0, left=27, top=912, width=458, height=39),
-        ImageRecognizerResult(entity_type="URL", start=781, end=794,
+        ImageRecognizerResult(entity_type="URL", start=758, end=771,
                               score=0.5, left=27, top=912, width=458, height=39),
     ]
     # fmt: on