microsoft · omri374 · Oct 4, 2023 · Sep 5, 2023 · Sep 5, 2023 · Sep 5, 2023
diff --git a/NOTICE b/NOTICE
@@ -3,6 +3,31 @@ Do Not Translate or Localize
 
 This project incorporates components from the projects listed below. The original copyright notices and the licenses under which Microsoft received such components are set forth below. Microsoft reserves all rights not expressly granted herein, whether by implication, estoppel or otherwise.
 
+*******
+opencv-python
+
+MIT License
+
+Copyright (c) Olli-Pekka Heinisuo
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
 
 *******
 spaCy

diff --git a/presidio-analyzer/install_nlp_models.py b/presidio-analyzer/install_nlp_models.py
@@ -4,7 +4,7 @@
 import logging
 from typing import Union, Dict
 
-import spacy
+from spacy.cli import download as spacy_download
 import yaml
 
 try:
@@ -53,7 +53,7 @@ def install_models(conf_file: str) -> None:
 
 def _download_model(engine_name: str, model_name: Union[str, Dict[str, str]]) -> None:
     if engine_name == "spacy":
-        spacy.cli.download(model_name)
+        spacy_download(model_name)
     elif engine_name == "stanza":
         if stanza:
             stanza.download(model_name)
@@ -84,7 +84,7 @@ def _install_transformers_spacy_models(model_name: Dict[str, str]) -> None:
 
     # download spacy model/pipeline
     logger.info(f"Installing spaCy model: {spacy_model}")
-    spacy.cli.download(spacy_model)
+    spacy_download(spacy_model)
 
     # download transformers model
     logger.info(f"Installing transformers model: {transformers_model}")

diff --git a/presidio-image-redactor/Dockerfile b/presidio-image-redactor/Dockerfile
@@ -10,6 +10,10 @@ RUN apt-get update \
   && rm -rf /var/lib/apt/lists/* \
   && tesseract -v
 
+RUN apt-get update \
+  && apt-get install -y ffmpeg libsm6 libxext6 \
+  && rm -rf /var/lib/apt/lists/*
+
 COPY ./Pipfile* /usr/bin/${NAME}/
 RUN pip install pipenv \
   && pipenv install --deploy \

diff --git a/presidio-image-redactor/Pipfile b/presidio-image-redactor/Pipfile
@@ -12,6 +12,7 @@ pydicom = ">=2.3.0,<3.0.0"
 pypng = ">=0.20220715.0,<1.0.0"
 python-gdcm = ">=3.0.22,<4.0.0"
 matplotlib = ">=3.6.2,<4.0.0"
+opencv-python = ">=4.8.0,<5.0.0"
 typing-extensions = "*"
 
 [dev-packages]

diff --git a/presidio-image-redactor/presidio_image_redactor/__init__.py b/presidio-image-redactor/presidio_image_redactor/__init__.py
@@ -4,11 +4,18 @@
 from .ocr import OCR
 from .tesseract_ocr import TesseractOCR
 from .bbox import BboxProcessor
+from .image_processing_engine import ImagePreprocessor
 from .image_analyzer_engine import ImageAnalyzerEngine
 from .image_redactor_engine import ImageRedactorEngine
 from .image_pii_verify_engine import ImagePiiVerifyEngine
 from .dicom_image_redactor_engine import DicomImageRedactorEngine
 from .dicom_image_pii_verify_engine import DicomImagePiiVerifyEngine
+from .image_processing_engine import (
+    ContrastSegmentedImageEnhancer,
+    BilateralFilter,
+    SegmentedAdaptiveThreshold,
+    ImageRescaling,
+)
 
 # Set up default logging (with NullHandler)
 logging.getLogger("presidio-image-redactor").addHandler(logging.NullHandler())
@@ -19,7 +26,12 @@
     "BboxProcessor",
     "ImageAnalyzerEngine",
     "ImageRedactorEngine",
+    "ImagePreprocessor",
     "ImagePiiVerifyEngine",
     "DicomImageRedactorEngine",
     "DicomImagePiiVerifyEngine",
+    "ContrastSegmentedImageEnhancer",
+    "BilateralFilter",
+    "SegmentedAdaptiveThreshold",
+    "ImageRescaling",
 ]
diff --git a/presidio-image-redactor/presidio_image_redactor/dicom_image_redactor_engine.py b/presidio-image-redactor/presidio_image_redactor/dicom_image_redactor_engine.py
@@ -7,7 +7,6 @@
 from PIL import Image, ImageOps
 import pydicom
 from pydicom.pixel_data_handlers.util import apply_voi_lut
-import PIL
 import png
 import json
 import numpy as np
@@ -85,17 +84,19 @@ def redact_and_return_bbox(
 
         # Detect PII
         analyzer_results = self._get_analyzer_results(
-            image, instance, use_metadata, ocr_kwargs, ad_hoc_recognizers,
-            **text_analyzer_kwargs
+            image,
+            instance,
+            use_metadata,
+            ocr_kwargs,
+            ad_hoc_recognizers,
+            **text_analyzer_kwargs,
         )
 
         # Redact all bounding boxes from DICOM file
         analyzer_bboxes = self.bbox_processor.get_bboxes_from_analyzer_results(
             analyzer_results
         )
-        bboxes = self.bbox_processor.remove_bbox_padding(
-            analyzer_bboxes, padding_width
-        )
+        bboxes = self.bbox_processor.remove_bbox_padding(analyzer_bboxes, padding_width)
         redacted_image = self._add_redact_box(instance, bboxes, crop_ratio, fill)
 
         return redacted_image, bboxes
@@ -135,7 +136,7 @@ def redact(
             crop_ratio=crop_ratio,
             ocr_kwargs=ocr_kwargs,
             ad_hoc_recognizers=ad_hoc_recognizers,
-            **text_analyzer_kwargs
+            **text_analyzer_kwargs,
         )
 
         return redacted_image
@@ -306,7 +307,7 @@ def _check_if_greyscale(instance: pydicom.dataset.FileDataset) -> bool:
             color_scale = instance.PhotometricInterpretation
         except AttributeError:
             color_scale = None
-        is_greyscale = (color_scale in ["MONOCHROME1", "MONOCHROME2"])
+        is_greyscale = color_scale in ["MONOCHROME1", "MONOCHROME2"]
 
         return is_greyscale
 
@@ -402,11 +403,11 @@ def _convert_dcm_to_png(cls, filepath: Path, output_dir: str = "temp_dir") -> tu
 
     @staticmethod
     def _get_bg_color(
-        image: PIL.PngImagePlugin.PngImageFile, is_greyscale: bool, invert: bool = False
+        image: Image.Image, is_greyscale: bool, invert: bool = False
     ) -> Union[int, Tuple[int, int, int]]:
         """Select most common color as background color.
 
-        :param image: Loaded PNG image.
+        :param image: Loaded PIL image.
-        :param colorscale: Colorscale of image (e.g., 'grayscale', 'RGB')
+        :param is_greyscale: Whether image is in grayscale or not.
         :param invert: TRUE if you want to get the inverse of the bg color.
 
@@ -521,17 +522,17 @@ def _get_most_common_pixel_value(
     @classmethod
     def _add_padding(
         cls,
-        image: PIL.PngImagePlugin.PngImageFile,
+        image: Image.Image,
         is_greyscale: bool,
         padding_width: int,
-    ) -> PIL.PngImagePlugin.PngImageFile:
+    ) -> Image.Image:
         """Add border to image using most common color.
 
-        :param image: Loaded PNG image.
+        :param image: Loaded PIL image.
         :param is_greyscale: Whether image is in grayscale or not.
         :param padding_width: Pixel width of padding (uniform).
 
-        :return: PNG image with padding.
+        :return: PIL image with padding.
         """
         # Check padding width value
         if padding_width <= 0:
@@ -667,12 +668,7 @@ def augment_word(word: str, case_sensitive: bool = False) -> list:
 
                 # Append iterations
                 word_list.extend(
-                    [
-                        text_no_separator,
-                        text_upper,
-                        text_lower,
-                        text_title
-                    ]
+                    [text_no_separator, text_upper, text_lower, text_title]
                 )
 
                 # Adding each term as a separate item in the list
@@ -681,7 +677,7 @@ def augment_word(word: str, case_sensitive: bool = False) -> list:
                         text_no_separator.split(" "),
                         text_upper.split(" "),
                         text_lower.split(" "),
-                        text_title.split(" ")
+                        text_title.split(" "),
                     ]
                 )
 
@@ -819,8 +815,9 @@ def _check_if_compressed(instance: pydicom.dataset.FileDataset) -> bool:
             number_of_frames = instance[0x0028, 0x0008].value
         except KeyError:
             number_of_frames = 1
-        expected_num_bytes = (rows * columns * number_of_frames
-                              * samples_per_pixel * (bits_allocated/8))
+        expected_num_bytes = (
+            rows * columns * number_of_frames * samples_per_pixel * (bits_allocated / 8)
+        )
 
         # Compare expected vs actual
         is_compressed = (int(expected_num_bytes)) > len(instance.PixelData)
@@ -829,7 +826,7 @@ def _check_if_compressed(instance: pydicom.dataset.FileDataset) -> bool:
 
     @staticmethod
     def _compress_pixel_data(
-        instance: pydicom.dataset.FileDataset
+        instance: pydicom.dataset.FileDataset,
     ) -> pydicom.dataset.FileDataset:
         """Recompress pixel data that was decompressed during redaction.
 
@@ -840,17 +837,17 @@ def _compress_pixel_data(
         compression_method = pydicom.uid.RLELossless
 
         # Temporarily change syntax to an "uncompressed" method
-        instance.file_meta.TransferSyntaxUID = pydicom.uid.UID('1.2.840.10008.1.2')
+        instance.file_meta.TransferSyntaxUID = pydicom.uid.UID("1.2.840.10008.1.2")
 
         # Compress and update syntax
-        instance.compress(compression_method, encoding_plugin='gdcm')
+        instance.compress(compression_method, encoding_plugin="gdcm")
         instance.file_meta.TransferSyntaxUID = compression_method
 
         return instance
 
     @staticmethod
     def _check_if_has_image_icon_sequence(
-        instance: pydicom.dataset.FileDataset
+        instance: pydicom.dataset.FileDataset,
     ) -> bool:
         """Check if there is an image icon sequence tag in the metadata.
 
@@ -927,12 +924,12 @@ def _add_redact_box(
 
     def _get_analyzer_results(
         self,
-        image: PIL.PngImagePlugin.PngImageFile,
+        image: Image.Image,
         instance: pydicom.dataset.FileDataset,
         use_metadata: bool,
         ocr_kwargs: Optional[dict],
         ad_hoc_recognizers: Optional[List[PatternRecognizer]],
-        **text_analyzer_kwargs
+        **text_analyzer_kwargs,
     ) -> List[ImageRecognizerResult]:
         """Analyze image with selected redaction approach.
 
@@ -953,12 +950,8 @@ def _get_analyzer_results(
 
         # Create custom recognizer using DICOM metadata
         if use_metadata:
-            original_metadata, is_name, is_patient = self._get_text_metadata(
-                instance
-            )
-            phi_list = self._make_phi_list(
-                original_metadata, is_name, is_patient
-            )
+            original_metadata, is_name, is_patient = self._get_text_metadata(instance)
+            phi_list = self._make_phi_list(original_metadata, is_name, is_patient)
             deny_list_recognizer = PatternRecognizer(
                 supported_entity="PERSON", deny_list=phi_list
             )
@@ -1064,17 +1057,19 @@ def _redact_single_dicom_image(
 
         # Detect PII
         analyzer_results = self._get_analyzer_results(
-            image, instance, use_metadata, ocr_kwargs, ad_hoc_recognizers,
-            **text_analyzer_kwargs
+            image,
+            instance,
+            use_metadata,
+            ocr_kwargs,
+            ad_hoc_recognizers,
+            **text_analyzer_kwargs,
         )
 
         # Redact all bounding boxes from DICOM file
         analyzer_bboxes = self.bbox_processor.get_bboxes_from_analyzer_results(
             analyzer_results
         )
-        bboxes = self.bbox_processor.remove_bbox_padding(
-            analyzer_bboxes, padding_width
-        )
+        bboxes = self.bbox_processor.remove_bbox_padding(analyzer_bboxes, padding_width)
         redacted_dicom_instance = self._add_redact_box(
             instance, bboxes, crop_ratio, fill
         )