Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added image processing class to preprocess the image before running OCR #1166

Merged
merged 22 commits into from
Oct 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f186520
added image processing class to preprocess the image before running OCR
ayabel Sep 5, 2023
3d12c68
solved precommit
ayabel Sep 5, 2023
23f96da
solved precommit
ayabel Sep 5, 2023
53885d8
linting fix
ayabel Sep 5, 2023
b65bfe8
fixed linting error
ayabel Sep 5, 2023
c45d5c3
reverted changes of the notebook
ayabel Sep 6, 2023
188eca4
added docstrings and hints
ayabel Sep 6, 2023
7a5e7b7
added a call to the parent class
ayabel Sep 6, 2023
8b6d7ed
fixed typo
ayabel Sep 6, 2023
a01585e
added opencv-python to the list of 3rd party components
ayabel Sep 6, 2023
da3d088
Merge branch 'main' into ayabellicha/dicom/text-detection-improvements
ayabel Sep 11, 2023
8b75db4
added dependencies to the docker file
ayabel Sep 11, 2023
e24ce0b
Merge branch 'ayabellicha/dicom/text-detection-improvements' of https…
ayabel Sep 11, 2023
463e7bb
removed comments
ayabel Sep 11, 2023
6717b71
Merge branch 'main' into ayabellicha/dicom/text-detection-improvements
omri374 Sep 19, 2023
62b5b18
Merge branch 'main' into ayabellicha/dicom/text-detection-improvements
omri374 Sep 20, 2023
3e8119b
changed input type from PngImageFile to Image
ayabel Sep 28, 2023
7b4dbad
Merge branch 'ayabellicha/dicom/text-detection-improvements' of https…
ayabel Sep 28, 2023
080e664
fixed docstring
ayabel Sep 28, 2023
bd4ed3b
Merge branch 'main' into ayabellicha/dicom/text-detection-improvements
omri374 Oct 4, 2023
023228a
Update install_nlp_models.py
omri374 Oct 4, 2023
0c54a99
Update install_nlp_models.py
omri374 Oct 4, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,31 @@ Do Not Translate or Localize

This project incorporates components from the projects listed below. The original copyright notices and the licenses under which Microsoft received such components are set forth below. Microsoft reserves all rights not expressly granted herein, whether by implication, estoppel or otherwise.

*******
opencv-python

MIT License

Copyright (c) Olli-Pekka Heinisuo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


*******
spaCy
Expand Down
6 changes: 3 additions & 3 deletions presidio-analyzer/install_nlp_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
from typing import Union, Dict

import spacy
from spacy.cli import download as spacy_download
import yaml

try:
Expand Down Expand Up @@ -53,7 +53,7 @@ def install_models(conf_file: str) -> None:

def _download_model(engine_name: str, model_name: Union[str, Dict[str, str]]) -> None:
if engine_name == "spacy":
spacy.cli.download(model_name)
spacy_download(model_name)
elif engine_name == "stanza":
if stanza:
stanza.download(model_name)
Expand Down Expand Up @@ -84,7 +84,7 @@ def _install_transformers_spacy_models(model_name: Dict[str, str]) -> None:

# download spacy model/pipeline
logger.info(f"Installing spaCy model: {spacy_model}")
spacy.cli.download(spacy_model)
spacy_download(spacy_model)

# download transformers model
logger.info(f"Installing transformers model: {transformers_model}")
Expand Down
4 changes: 4 additions & 0 deletions presidio-image-redactor/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ RUN apt-get update \
&& rm -rf /var/lib/apt/lists/* \
&& tesseract -v

# System packages added alongside the opencv-python dependency
# (presumably the shared libraries cv2 needs at import time -- TODO confirm).
# The apt package lists are removed in the same layer, matching the
# preceding RUN instruction, so they are not baked into the image.
RUN apt-get update \
    && apt-get install ffmpeg libsm6 libxext6 -y \
    && rm -rf /var/lib/apt/lists/*


COPY ./Pipfile* /usr/bin/${NAME}/
RUN pip install pipenv \
&& pipenv install --deploy \
Expand Down
1 change: 1 addition & 0 deletions presidio-image-redactor/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pydicom = ">=2.3.0,<3.0.0"
pypng = ">=0.20220715.0,<1.0.0"
python-gdcm = ">=3.0.22,<4.0.0"
matplotlib = ">=3.6.2,<4.0.0"
opencv-python = ">=4.8.0"
typing-extensions = "*"

[dev-packages]
Expand Down
12 changes: 12 additions & 0 deletions presidio-image-redactor/presidio_image_redactor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,18 @@
from .ocr import OCR
from .tesseract_ocr import TesseractOCR
from .bbox import BboxProcessor
from .image_processing_engine import ImagePreprocessor
from .image_analyzer_engine import ImageAnalyzerEngine
from .image_redactor_engine import ImageRedactorEngine
from .image_pii_verify_engine import ImagePiiVerifyEngine
from .dicom_image_redactor_engine import DicomImageRedactorEngine
from .dicom_image_pii_verify_engine import DicomImagePiiVerifyEngine
from .image_processing_engine import (
ContrastSegmentedImageEnhancer,
BilateralFilter,
SegmentedAdaptiveThreshold,
ImageRescaling,
)

# Set up default logging (with NullHandler)
logging.getLogger("presidio-image-redactor").addHandler(logging.NullHandler())
Expand All @@ -19,7 +26,12 @@
"BboxProcessor",
"ImageAnalyzerEngine",
"ImageRedactorEngine",
"ImagePreprocessor",
"ImagePiiVerifyEngine",
"DicomImageRedactorEngine",
"DicomImagePiiVerifyEngine",
"ContrastSegmentedImageEnhancer",
"BilateralFilter",
"SegmentedAdaptiveThreshold",
"ImageRescaling",
]
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from PIL import Image, ImageOps
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import PIL
import png
import json
import numpy as np
Expand Down Expand Up @@ -85,17 +84,19 @@ def redact_and_return_bbox(

# Detect PII
analyzer_results = self._get_analyzer_results(
image, instance, use_metadata, ocr_kwargs, ad_hoc_recognizers,
**text_analyzer_kwargs
image,
instance,
use_metadata,
ocr_kwargs,
ad_hoc_recognizers,
**text_analyzer_kwargs,
)

# Redact all bounding boxes from DICOM file
analyzer_bboxes = self.bbox_processor.get_bboxes_from_analyzer_results(
analyzer_results
)
bboxes = self.bbox_processor.remove_bbox_padding(
analyzer_bboxes, padding_width
)
bboxes = self.bbox_processor.remove_bbox_padding(analyzer_bboxes, padding_width)
redacted_image = self._add_redact_box(instance, bboxes, crop_ratio, fill)

return redacted_image, bboxes
Expand Down Expand Up @@ -135,7 +136,7 @@ def redact(
crop_ratio=crop_ratio,
ocr_kwargs=ocr_kwargs,
ad_hoc_recognizers=ad_hoc_recognizers,
**text_analyzer_kwargs
**text_analyzer_kwargs,
)

return redacted_image
Expand Down Expand Up @@ -306,7 +307,7 @@ def _check_if_greyscale(instance: pydicom.dataset.FileDataset) -> bool:
color_scale = instance.PhotometricInterpretation
except AttributeError:
color_scale = None
is_greyscale = (color_scale in ["MONOCHROME1", "MONOCHROME2"])
is_greyscale = color_scale in ["MONOCHROME1", "MONOCHROME2"]

return is_greyscale

Expand Down Expand Up @@ -402,11 +403,11 @@ def _convert_dcm_to_png(cls, filepath: Path, output_dir: str = "temp_dir") -> tu

@staticmethod
def _get_bg_color(
image: PIL.PngImagePlugin.PngImageFile, is_greyscale: bool, invert: bool = False
image: Image.Image, is_greyscale: bool, invert: bool = False
) -> Union[int, Tuple[int, int, int]]:
"""Select most common color as background color.

:param image: Loaded PNG image.
:param image: Loaded PIL image.
:param is_greyscale: Whether image is in grayscale or not.
:param invert: TRUE if you want to get the inverse of the bg color.

Expand Down Expand Up @@ -521,17 +522,17 @@ def _get_most_common_pixel_value(
@classmethod
def _add_padding(
cls,
image: PIL.PngImagePlugin.PngImageFile,
image: Image.Image,
is_greyscale: bool,
padding_width: int,
) -> PIL.PngImagePlugin.PngImageFile:
) -> Image.Image:
"""Add border to image using most common color.

:param image: Loaded PNG image.
:param image: Loaded PIL image.
:param is_greyscale: Whether image is in grayscale or not.
:param padding_width: Pixel width of padding (uniform).

:return: PNG image with padding.
:return: PIL image with padding.
"""
# Check padding width value
if padding_width <= 0:
Expand Down Expand Up @@ -667,12 +668,7 @@ def augment_word(word: str, case_sensitive: bool = False) -> list:

# Append iterations
word_list.extend(
[
text_no_separator,
text_upper,
text_lower,
text_title
]
[text_no_separator, text_upper, text_lower, text_title]
)

# Adding each term as a separate item in the list
Expand All @@ -681,7 +677,7 @@ def augment_word(word: str, case_sensitive: bool = False) -> list:
text_no_separator.split(" "),
text_upper.split(" "),
text_lower.split(" "),
text_title.split(" ")
text_title.split(" "),
]
)

Expand Down Expand Up @@ -819,8 +815,9 @@ def _check_if_compressed(instance: pydicom.dataset.FileDataset) -> bool:
number_of_frames = instance[0x0028, 0x0008].value
except KeyError:
number_of_frames = 1
expected_num_bytes = (rows * columns * number_of_frames
* samples_per_pixel * (bits_allocated/8))
expected_num_bytes = (
rows * columns * number_of_frames * samples_per_pixel * (bits_allocated / 8)
)

# Compare expected vs actual
is_compressed = (int(expected_num_bytes)) > len(instance.PixelData)
Expand All @@ -829,7 +826,7 @@ def _check_if_compressed(instance: pydicom.dataset.FileDataset) -> bool:

@staticmethod
def _compress_pixel_data(
instance: pydicom.dataset.FileDataset
instance: pydicom.dataset.FileDataset,
) -> pydicom.dataset.FileDataset:
"""Recompress pixel data that was decompressed during redaction.

Expand All @@ -840,17 +837,17 @@ def _compress_pixel_data(
compression_method = pydicom.uid.RLELossless

# Temporarily change syntax to an "uncompressed" method
instance.file_meta.TransferSyntaxUID = pydicom.uid.UID('1.2.840.10008.1.2')
instance.file_meta.TransferSyntaxUID = pydicom.uid.UID("1.2.840.10008.1.2")

# Compress and update syntax
instance.compress(compression_method, encoding_plugin='gdcm')
instance.compress(compression_method, encoding_plugin="gdcm")
instance.file_meta.TransferSyntaxUID = compression_method

return instance

@staticmethod
def _check_if_has_image_icon_sequence(
instance: pydicom.dataset.FileDataset
instance: pydicom.dataset.FileDataset,
) -> bool:
"""Check if there is an image icon sequence tag in the metadata.

Expand Down Expand Up @@ -927,12 +924,12 @@ def _add_redact_box(

def _get_analyzer_results(
self,
image: PIL.PngImagePlugin.PngImageFile,
image: Image.Image,
instance: pydicom.dataset.FileDataset,
use_metadata: bool,
ocr_kwargs: Optional[dict],
ad_hoc_recognizers: Optional[List[PatternRecognizer]],
**text_analyzer_kwargs
**text_analyzer_kwargs,
) -> List[ImageRecognizerResult]:
"""Analyze image with selected redaction approach.

Expand All @@ -953,12 +950,8 @@ def _get_analyzer_results(

# Create custom recognizer using DICOM metadata
if use_metadata:
original_metadata, is_name, is_patient = self._get_text_metadata(
instance
)
phi_list = self._make_phi_list(
original_metadata, is_name, is_patient
)
original_metadata, is_name, is_patient = self._get_text_metadata(instance)
phi_list = self._make_phi_list(original_metadata, is_name, is_patient)
deny_list_recognizer = PatternRecognizer(
supported_entity="PERSON", deny_list=phi_list
)
Expand Down Expand Up @@ -1064,17 +1057,19 @@ def _redact_single_dicom_image(

# Detect PII
analyzer_results = self._get_analyzer_results(
image, instance, use_metadata, ocr_kwargs, ad_hoc_recognizers,
**text_analyzer_kwargs
image,
instance,
use_metadata,
ocr_kwargs,
ad_hoc_recognizers,
**text_analyzer_kwargs,
)

# Redact all bounding boxes from DICOM file
analyzer_bboxes = self.bbox_processor.get_bboxes_from_analyzer_results(
analyzer_results
)
bboxes = self.bbox_processor.remove_bbox_padding(
analyzer_bboxes, padding_width
)
bboxes = self.bbox_processor.remove_bbox_padding(analyzer_bboxes, padding_width)
redacted_dicom_instance = self._add_redact_box(
instance, bboxes, crop_ratio, fill
)
Expand Down
Loading