Skip to content

Commit

Permalink
Improve process names method in DICOM image redactor (#1150)
Browse files Browse the repository at this point in the history
* Adding method to augment words more thoroughly

* Adding unit test

* PR comments changes

* Linting fix

---------

Co-authored-by: Omri Mendels <[email protected]>
  • Loading branch information
niwilso and omri374 authored Aug 23, 2023
1 parent 60e1f7d commit 1a12771
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,74 @@ def _get_text_metadata(
return metadata_text, is_name, is_patient

@staticmethod
def _process_names(text_metadata: list, is_name: list) -> list:
def augment_word(word: str, case_sensitive: bool = False) -> list:
    """Apply multiple types of casing and spacing to the provided string.

    The separator characters ``^`` and ``-`` are replaced with spaces and
    runs of whitespace are collapsed to a single space. The resulting
    phrase, plus each of its space-separated terms, is returned. Unless
    ``case_sensitive`` is set, upper-case, lower-case and title-case
    variants of the phrase and of each term are included as well.

    :param word: String containing the word or term of interest.
    :param case_sensitive: True if we want to preserve casing.

    :return: List of unique, non-empty variants of the string. Whole
        phrases come first, followed by the individual terms; the order
        is deterministic. An empty or whitespace-only input yields ``[]``.
    """
    # Replace separator characters with spaces, then collapse whitespace
    normalized = " ".join(word.replace("^", " ").replace("-", " ").split())
    if not normalized:
        return []

    if case_sensitive:
        phrases = [normalized]
    else:
        phrases = [
            normalized,
            normalized.upper(),
            normalized.lower(),
            normalized.title(),
        ]

    # Whole phrases first, then every term of every phrase on its own
    candidates = phrases + [
        term for phrase in phrases for term in phrase.split(" ")
    ]

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # result is stable across runs (a set would not be); filter(None, ...)
    # guards against empty strings.
    return list(dict.fromkeys(filter(None, candidates)))

@classmethod
def _process_names(cls, text_metadata: list, is_name: list) -> list:
"""Process names to have multiple iterations in our PHI list.
:param metadata_text: List of all the instance's element values
Expand All @@ -647,30 +714,7 @@ def _process_names(text_metadata: list, is_name: list) -> list:
for i in range(0, len(text_metadata)):
if is_name[i] is True:
original_text = str(text_metadata[i])

# Replacing separator character with space
text_1 = original_text.replace("^", " ")

# Capitalize all characters in name
text_2 = text_1.upper()

# Lowercase all characters in name
text_3 = text_1.lower()

# Capitalize first letter in each name
text_4 = text_1.title()

# Append iterations
phi_list.append(text_1)
phi_list.append(text_2)
phi_list.append(text_3)
phi_list.append(text_4)

# Adding each name as a separate item in the list
phi_list = phi_list + text_1.split(" ")
phi_list = phi_list + text_2.split(" ")
phi_list = phi_list + text_3.split(" ")
phi_list = phi_list + text_4.split(" ")
phi_list += cls.augment_word(original_text)

return phi_list

Expand Down
86 changes: 86 additions & 0 deletions presidio-image-redactor/tests/test_dicom_image_redactor_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,92 @@ def test_get_text_metadata_happy_path(
assert type(test_metadata_text[idx_is_name[0]]) == str


# ------------------------------------------------------
# DicomImageRedactorEngine.augment_word()
# ------------------------------------------------------
# Every casing/spacing variant expected for the name "JOHN DOE", regardless
# of which separator ("^", "-" or both) appeared in the input.
_JOHN_DOE_ALL_CASINGS = [
    "JOHN",
    "DOE",
    "John",
    "Doe",
    "john",
    "doe",
    "JOHN DOE",
    "John Doe",
    "john doe",
]


@pytest.mark.parametrize(
    "word, case_sensitive, expected_list",
    [
        ("", False, []),
        (" ", True, []),
        ("JOHN^DOE", False, _JOHN_DOE_ALL_CASINGS),
        ("JOHN^DOE", True, ["JOHN", "DOE", "JOHN DOE"]),
        ("JOHN-DOE", False, _JOHN_DOE_ALL_CASINGS),
        ("JOHN^-DOE", False, _JOHN_DOE_ALL_CASINGS),
        (
            "City Hospital",
            False,
            [
                "City Hospital",
                "CITY HOSPITAL",
                "city hospital",
                "city",
                "hospital",
                "City",
                "Hospital",
                "CITY",
                "HOSPITAL",
            ],
        ),
        ("12345", False, ["12345"]),
    ],
)
def test_augment_word_happy_path(
    mock_engine: DicomImageRedactorEngine,
    word: str,
    case_sensitive: bool,
    expected_list: list,
):
    """Check that augment_word returns the expected variants of a term.

    Args:
        word (str): String to augment.
        case_sensitive (bool): True if casing matters.
        expected_list (list): List of expected output.
    """
    # Act
    augmented = mock_engine.augment_word(word, case_sensitive)

    # Assert: ordering is irrelevant, only the collection of variants matters
    assert set(augmented) == set(expected_list)

# ------------------------------------------------------
# DicomImageRedactorEngine._process_names()
# ------------------------------------------------------
Expand Down

0 comments on commit 1a12771

Please sign in to comment.