Skip to content

Commit

Permalink
Updating verification engines to include latest updates to redactor engines (#1162)
Browse files Browse the repository at this point in the history

* Enabling use of ad-hoc recognizers in verifier

* Adding support to standard image verification engine as well

* Linting fix

* Removing redundant init

* Removing unused import
  • Loading branch information
niwilso authored Sep 6, 2023
1 parent 93934a9 commit 4e8490c
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,21 @@ def verify_dicom_instance(
instance: pydicom.dataset.FileDataset,
padding_width: int = 25,
display_image: bool = True,
use_metadata: bool = True,
ocr_kwargs: Optional[dict] = None,
ad_hoc_recognizers: Optional[List[PatternRecognizer]] = None,
**text_analyzer_kwargs,
) -> Tuple[Optional[PIL.Image.Image], dict, list]:
"""Verify PII on a single DICOM instance.
:param instance: Loaded DICOM instance including pixel data and metadata.
:param padding_width: Padding width to use when running OCR.
:param display_image: If the verification image is displayed and returned.
:param use_metadata: Whether to redact text in the image that
is present in the metadata.
:param ocr_kwargs: Additional params for OCR methods.
:param ad_hoc_recognizers: List of PatternRecognizer objects to use
for ad-hoc recognizer.
:param text_analyzer_kwargs: Additional values for the analyze method
in ImageAnalyzerEngine.
Expand All @@ -82,24 +88,17 @@ def verify_dicom_instance(
loaded_image = Image.open(png_filepath)
image = self._add_padding(loaded_image, is_greyscale, padding_width)

# Create custom recognizer using DICOM metadata
original_metadata, is_name, is_patient = self._get_text_metadata(instance_copy)
phi_list = self._make_phi_list(original_metadata, is_name, is_patient)
deny_list_recognizer = PatternRecognizer(
supported_entity="PERSON", deny_list=phi_list
)
# Get analyzer results
ocr_results = self.ocr_engine.perform_ocr(image)
analyzer_results = self.image_analyzer_engine.analyze(
image,
ocr_kwargs=ocr_kwargs,
ad_hoc_recognizers=[deny_list_recognizer],
**text_analyzer_kwargs,
analyzer_results = self._get_analyzer_results(
image, instance, use_metadata, ocr_kwargs, ad_hoc_recognizers,
**text_analyzer_kwargs
)

# Get image with verification boxes
verify_image = (
self.verify(
image, ad_hoc_recognizers=[deny_list_recognizer], **text_analyzer_kwargs
image, ad_hoc_recognizers=ad_hoc_recognizers, **text_analyzer_kwargs
)
if display_image
else None
Expand All @@ -114,7 +113,9 @@ def eval_dicom_instance(
padding_width: int = 25,
tolerance: int = 50,
display_image: bool = False,
use_metadata: bool = True,
ocr_kwargs: Optional[dict] = None,
ad_hoc_recognizers: Optional[List[PatternRecognizer]] = None,
**text_analyzer_kwargs,
) -> Tuple[Optional[PIL.Image.Image], dict]:
"""Evaluate performance for a single DICOM instance.
Expand All @@ -124,7 +125,11 @@ def eval_dicom_instance(
:param padding_width: Padding width to use when running OCR.
:param tolerance: Pixel distance tolerance for matching to ground truth.
:param display_image: If the verification image is displayed and returned.
:param use_metadata: Whether to redact text in the image that
is present in the metadata.
:param ocr_kwargs: Additional params for OCR methods.
:param ad_hoc_recognizers: List of PatternRecognizer objects to use
for ad-hoc recognizer.
:param text_analyzer_kwargs: Additional values for the analyze method
in ImageAnalyzerEngine.
Expand All @@ -135,7 +140,9 @@ def eval_dicom_instance(
instance,
padding_width,
display_image,
use_metadata,
ocr_kwargs=ocr_kwargs,
ad_hoc_recognizers=ad_hoc_recognizers,
**text_analyzer_kwargs,
)
formatted_ocr_results = self.bbox_processor.get_bboxes_from_ocr_results(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from PIL import Image, ImageChops
from presidio_image_redactor.image_analyzer_engine import ImageAnalyzerEngine
from presidio_image_redactor.image_redactor_engine import ImageRedactorEngine
from presidio_analyzer import PatternRecognizer
import matplotlib
import io
from matplotlib import pyplot as plt
from typing import Optional
from typing import Optional, List


def fig2img(fig):
Expand All @@ -16,16 +17,15 @@ def fig2img(fig):
return img


class ImagePiiVerifyEngine:
class ImagePiiVerifyEngine(ImageRedactorEngine):
"""ImagePiiVerifyEngine class only supporting Pii verification currently."""

def __init__(self, image_analyzer_engine: Optional[ImageAnalyzerEngine] = None):
if not image_analyzer_engine:
image_analyzer_engine = ImageAnalyzerEngine()
self.image_analyzer_engine = image_analyzer_engine

def verify(
self, image: Image, ocr_kwargs: Optional[dict] = None, **text_analyzer_kwargs
self,
image: Image,
ocr_kwargs: Optional[dict] = None,
ad_hoc_recognizers: Optional[List[PatternRecognizer]] = None,
**text_analyzer_kwargs
) -> Image:
"""Annotate image with the detected PII entities.
Expand All @@ -34,6 +34,8 @@ def verify(
:param image: PIL Image to be processed.
:param ocr_kwargs: Additional params for OCR methods.
:param ad_hoc_recognizers: List of PatternRecognizer objects to use
for ad-hoc recognizer.
:param text_analyzer_kwargs: Additional values for the analyze method
in ImageAnalyzerEngine.
Expand All @@ -42,9 +44,23 @@ def verify(

image = ImageChops.duplicate(image)
image_x, image_y = image.size
bboxes = self.image_analyzer_engine.analyze(
image, ocr_kwargs, **text_analyzer_kwargs
)

# Detect PII
self._check_ad_hoc_recognizer_list(ad_hoc_recognizers)
if ad_hoc_recognizers is None:
bboxes = self.image_analyzer_engine.analyze(
image,
ocr_kwargs=ocr_kwargs,
**text_analyzer_kwargs,
)
else:
bboxes = self.image_analyzer_engine.analyze(
image,
ocr_kwargs=ocr_kwargs,
ad_hoc_recognizers=ad_hoc_recognizers,
**text_analyzer_kwargs,
)

fig, ax = plt.subplots()
image_r = 70
fig.set_size_inches(image_x / image_r, image_y / image_r)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,21 +86,11 @@ def test_verify_dicom_instance_happy_path(
mock_add_padding = mocker.patch.object(
DicomImagePiiVerifyEngine, "_add_padding", return_value=None
)
mock_get_metadata = mocker.patch.object(
DicomImagePiiVerifyEngine, "_get_text_metadata", return_value=[None, None, None]
)
mock_make_phi_list = mocker.patch.object(
DicomImagePiiVerifyEngine, "_make_phi_list", return_value=None
)
mock_patternrecognizer = mocker.patch(
"presidio_image_redactor.dicom_image_pii_verify_engine.PatternRecognizer",
return_value=None,
)
mock_perform_ocr = mocker.patch.object(
TesseractOCR, "perform_ocr", return_value=None
)
mock_analyze = mocker.patch.object(
ImageAnalyzerEngine, "analyze", return_value=None
DicomImagePiiVerifyEngine, "_get_analyzer_results", return_value=None
)
mock_verify = mocker.patch.object(
DicomImagePiiVerifyEngine, "verify", return_value=None
Expand All @@ -115,9 +105,6 @@ def test_verify_dicom_instance_happy_path(
assert mock_save_pixel_array.call_count == 1
assert mock_image_open.call_count == 1
assert mock_add_padding.call_count == 1
assert mock_get_metadata.call_count == 1
assert mock_make_phi_list.call_count == 1
assert mock_patternrecognizer.call_count == 1
assert mock_perform_ocr.call_count == 1
assert mock_analyze.call_count == 1
assert mock_verify.call_count == 1
Expand Down

0 comments on commit 4e8490c

Please sign in to comment.