DICOM redactor improvement: Enable return of redacted bboxes (#1111)

* Enable return of bboxes used to redact pixels * Adding return_bboxes arg values into existing tests * Adding test for return_bbox==True condition * Adding test for _save_bbox_json() * Making argument name more clear * Creating separate method to return redacted image and bboxes * Linting fix * Removing Union return type * Commenting out DICOM verification engine intergration test to see if that is still the cause of unit test hangup * Renaming test and removing redundancy in unit test for dicom image redactor * Fixing duplication of call to a single file likely from main merges * Removing extra cases for redact() test * Changing mocked return type from None to an empty list * Commenting out full unit test for redact to see effect on PR build hangup * Reintroduce verify integration test and non-parameterized redact test * Commenting out threshold and expected length test to see impact on PR build hang-up * Undo comment out of image analyzer engine test * Commenting out all unit tests for dicom image redactor engine * Comment out unit test for redact() * Fixing typing * Commenting out exception test for redact_and_return_bbox * Updated how exceptions are handled for redact_and_return_bbox, return all unit tests * Adding IsADirectoryError exception type * Commenting out happy path test for redact_and_return_bbox * Commenting out compressed and icon_image_sequence DICOM test input images for redact_and_return_bbox happy path test * Commenting out the type assertions in happy path test for redact_and_return_bbox * Commenting out the call count assertions in happy path for redact_and_return_bbox * Update type assertion and comment out all mocking and mocking assertions for happy path test for redact_and_return_bbox * Commenting out all assertions in happy path test for redact_and_return_bbox * Replacing mocker.patch with mocker.patch.object for all mocked methods in happy path test for redact_and_return_bbox * Changing all mocker.patch.object calls into mocker.patch for happy path test for redact_and_return_bbox * Reintroduce assertions for happy path test for redact_and_return_bbox * Turning off assertions for call count again for happy path for redact_and_return_bbox * Making assertion for returned bbox type even more explicit for happy path test for redact_and_return_bbox * Turning off type assertions and turning on mock call count assertions for happy path test for redact_and_return_bbox * Replacing call count assertions with assert_called_once * Reintroducing type assertions and changing return_value to include some placeholder mock data instead of being empty dictionaries in list * Comment out the image type assertion * Turning on image type assertion and turning off bbox type assertions * Removing assertion for dict * Using isinstance instead of type == * Removing assertions for bbox type --------- Co-authored-by: Omri Mendels <[email protected]>
microsoft · Aug 2, 2023 · e323fed · e323fed
1 parent 67833d5
commit e323fed
Show file tree

Hide file tree

Showing 2 changed files with 247 additions and 66 deletions.
diff --git a/presidio-image-redactor/presidio_image_redactor/dicom_image_redactor_engine.py b/presidio-image-redactor/presidio_image_redactor/dicom_image_redactor_engine.py
@@ -9,9 +9,10 @@
 from pydicom.pixel_data_handlers.util import apply_voi_lut
 import PIL
 import png
+import json
 import numpy as np
 from matplotlib import pyplot as plt  # necessary import for PIL typing # noqa: F401
-from typing import Tuple, List, Union, Optional
+from typing import Tuple, List, Dict, Union, Optional
 
 from presidio_image_redactor import ImageRedactorEngine
 from presidio_image_redactor import ImageAnalyzerEngine  # noqa: F401
@@ -24,16 +25,16 @@ class DicomImageRedactorEngine(ImageRedactorEngine):
     :param image_analyzer_engine: Engine which performs OCR + PII detection.
     """
 
-    def redact(
+    def redact_and_return_bbox(
         self,
         image: pydicom.dataset.FileDataset,
         fill: str = "contrast",
         padding_width: int = 25,
         crop_ratio: float = 0.75,
         ocr_kwargs: Optional[dict] = None,
         **text_analyzer_kwargs,
-    ):
-        """Redact method to redact the given DICOM image.
+    ) -> Tuple[pydicom.dataset.FileDataset, List[Dict[str, int]]]:
+        """Redact method to redact the given DICOM image and return redacted bboxes.
 
         Please note, this method duplicates the image, creates a
         new instance and manipulates it.
@@ -54,8 +55,12 @@ def redact(
             raise TypeError("The provided image must be a loaded DICOM instance.")
         try:
             image.PixelData
-        except AttributeError:
-            raise AttributeError("Provided DICOM instance lacks pixel data.")
+        except AttributeError as e:
+            raise AttributeError(f"Provided DICOM instance lacks pixel data: {e}")
+        except PermissionError as e:
+            raise PermissionError(f"Unable to access pixel data (may not exist): {e}")
+        except IsADirectoryError as e:
+            raise IsADirectoryError(f"DICOM instance is a directory: {e}")
 
         instance = deepcopy(image)
 
@@ -93,6 +98,42 @@ def redact(
         )
         redacted_image = self._add_redact_box(instance, bboxes, crop_ratio, fill)
 
+        return redacted_image, bboxes
+
+    def redact(
+        self,
+        image: pydicom.dataset.FileDataset,
+        fill: str = "contrast",
+        padding_width: int = 25,
+        crop_ratio: float = 0.75,
+        ocr_kwargs: Optional[dict] = None,
+        **text_analyzer_kwargs,
+    ) -> pydicom.dataset.FileDataset:
+        """Redact method to redact the given DICOM image.
+
+        Please note, this method duplicates the image, creates a
+        new instance and manipulates it.
+
+        :param image: Loaded DICOM instance including pixel data and metadata.
+        :param fill: Fill setting to use for redaction box ("contrast" or "background").
+        :param padding_width: Padding width to use when running OCR.
+        :param crop_ratio: Portion of image to consider when selecting
+        most common pixel value as the background color value.
+        :param ocr_kwargs: Additional params for OCR methods.
+        :param text_analyzer_kwargs: Additional values for the analyze method
+        in AnalyzerEngine.
+
+        :return: DICOM instance with redacted pixel data.
+        """
+        redacted_image, _ = self.redact_and_return_bbox(
+            image=image,
+            fill=fill,
+            padding_width=padding_width,
+            crop_ratio=crop_ratio,
+            ocr_kwargs=ocr_kwargs,
+            **text_analyzer_kwargs
+        )
+
         return redacted_image
 
     def redact_from_file(
@@ -102,6 +143,7 @@ def redact_from_file(
         padding_width: int = 25,
         crop_ratio: float = 0.75,
         fill: str = "contrast",
+        save_bboxes: bool = False,
         ocr_kwargs: Optional[dict] = None,
         **text_analyzer_kwargs,
     ) -> None:
@@ -115,6 +157,7 @@ def redact_from_file(
         :param padding_width : Padding width to use when running OCR.
         :param fill: Color setting to use for redaction box
         ("contrast" or "background").
+        :param save_bboxes: True if we want to save boundings boxes.
         :param ocr_kwargs: Additional params for OCR methods.
         :param text_analyzer_kwargs: Additional values for the analyze method
         in AnalyzerEngine.
@@ -140,6 +183,7 @@ def redact_from_file(
             padding_width=padding_width,
             overwrite=True,
             dst_parent_dir=".",
+            save_bboxes=save_bboxes,
             ocr_kwargs=ocr_kwargs,
             **text_analyzer_kwargs,
         )
@@ -155,6 +199,7 @@ def redact_from_directory(
         padding_width: int = 25,
         crop_ratio: float = 0.75,
         fill: str = "contrast",
+        save_bboxes: bool = False,
         ocr_kwargs: Optional[dict] = None,
         **text_analyzer_kwargs,
     ) -> None:
@@ -170,6 +215,7 @@ def redact_from_directory(
         most common pixel value as the background color value.
         :param fill: Color setting to use for redaction box
         ("contrast" or "background").
+        :param save_bboxes: True if we want to save boundings boxes.
         :param ocr_kwargs: Additional params for OCR methods.
         :param text_analyzer_kwargs: Additional values for the analyze method
         in AnalyzerEngine.
@@ -195,6 +241,7 @@ def redact_from_directory(
             padding_width=padding_width,
             overwrite=True,
             dst_parent_dir=".",
+            save_bboxes=save_bboxes,
             ocr_kwargs=ocr_kwargs,
             **text_analyzer_kwargs,
         )
@@ -516,7 +563,7 @@ def _copy_files_for_processing(src_path: str, dst_parent_dir: str) -> Path:
         elif Path(src_path).is_file() is True:
             # Create the output dir manually if working with a single file
             os.makedirs(Path(dst_path).parent, exist_ok=True)
-            shutil.copy(src_path, dst_path)
+            shutil.copyfile(src_path, dst_path)
         else:
             raise FileNotFoundError(f"{src_path} does not exist")
 
@@ -811,6 +858,19 @@ def _add_redact_box(
 
         return redacted_instance
 
+    @staticmethod
+    def _save_bbox_json(output_dcm_path: str, bboxes: List[Dict[str, int]]) -> None:
+        """Save the redacted bounding box info as a json file.
+
+        :param output_dcm_path: Path to the redacted DICOM file.
+
+        :param bboxes: Bounding boxes used in redaction.
+        """
+        output_json_path = Path(output_dcm_path).with_suffix(".json")
+
+        with open(output_json_path, "w") as write_file:
+            json.dump(bboxes, write_file, indent=4)
+
     def _redact_single_dicom_image(
         self,
         dcm_path: str,
@@ -819,6 +879,7 @@ def _redact_single_dicom_image(
         padding_width: int,
         overwrite: bool,
         dst_parent_dir: str,
+        save_bboxes: bool,
         ocr_kwargs: Optional[dict] = None,
         **text_analyzer_kwargs,
     ) -> str:
@@ -833,6 +894,7 @@ def _redact_single_dicom_image(
         :param overwrite: Only set to True if you are providing the
         duplicated DICOM path in dcm_path.
         :param dst_parent_dir: String path to parent directory of where to store copies.
+        :param save_bboxes: True if we want to save boundings boxes.
         :param ocr_kwargs: Additional params for OCR methods.
         :param text_analyzer_kwargs: Additional values for the analyze method
         in AnalyzerEngine.
@@ -892,6 +954,10 @@ def _redact_single_dicom_image(
         )
         redacted_dicom_instance.save_as(dst_path)
 
+        # Save redacted bboxes
+        if save_bboxes:
+            self._save_bbox_json(dst_path, bboxes)
+
         return dst_path
 
     def _redact_multiple_dicom_images(
@@ -902,6 +968,7 @@ def _redact_multiple_dicom_images(
         padding_width: int,
         overwrite: bool,
         dst_parent_dir: str,
+        save_bboxes: bool,
         ocr_kwargs: Optional[dict] = None,
         **text_analyzer_kwargs,
     ) -> str:
@@ -916,6 +983,7 @@ def _redact_multiple_dicom_images(
         :param overwrite: Only set to True if you are providing
         the duplicated DICOM dir in dcm_dir.
         :param dst_parent_dir: String path to parent directory of where to store copies.
+        :param save_bboxes: True if we want to save boundings boxes.
         :param ocr_kwargs: Additional params for OCR methods.
         :param text_analyzer_kwargs: Additional values for the analyze method
         in AnalyzerEngine.
@@ -945,6 +1013,7 @@ def _redact_multiple_dicom_images(
                 padding_width,
                 overwrite,
                 dst_parent_dir,
+                save_bboxes,
                 ocr_kwargs=ocr_kwargs,
                 **text_analyzer_kwargs,
             )