-
Notifications
You must be signed in to change notification settings - Fork 603
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add UK National Insurance Number Recognizer (#1446)
- Loading branch information
Showing
8 changed files
with
108 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
40 changes: 40 additions & 0 deletions
40
presidio-analyzer/presidio_analyzer/predefined_recognizers/uk_nino_recognizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
from typing import List, Optional | ||
|
||
from presidio_analyzer import Pattern, PatternRecognizer | ||
|
||
|
||
class UkNinoRecognizer(PatternRecognizer):
    """
    Recognizes UK National Insurance Number using regex.

    The pattern accepts two prefix letters, six digits and one suffix
    letter (A-D), with an optional single space between the groups.
    Prefix letters outside the allowed sets, and the prefixes
    BG, GB, NK, KN, NT, TN and ZZ, are rejected.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    # Notes on the regex:
    # * The optional leading space is kept so that match spans are unchanged
    #   for valid numbers that follow a word and a space.
    # * The negative lookahead is placed AFTER that optional space so it
    #   always inspects the two prefix letters themselves; placed before the
    #   space it would look at " n" in "my nt 99 88 77 a" and let the
    #   forbidden prefix through.
    # * The suffix class is exactly [a-dA-D]; a "{1}" written inside the
    #   brackets would add the literal characters "{", "1" and "}" to the set.
    PATTERNS = [
        Pattern(
            "NINO (medium)",
            r"\b ?(?!bg|gb|nk|kn|nt|tn|zz|BG|GB|NK|KN|NT|TN|ZZ)([a-ceghj-pr-tw-zA-CEGHJ-PR-TW-Z]{1}[a-ceghj-npr-tw-zA-CEGHJ-NPR-TW-Z]{1}) ?([0-9]{2}) ?([0-9]{2}) ?([0-9]{2}) ?([a-dA-D])\b",  # noqa: E501
            0.5,
        ),
    ]

    CONTEXT = ["national insurance", "ni number", "nino"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "UK_NINO",
    ):
        # Fall back to the class-level defaults when the caller passes
        # nothing (or an empty list).
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import pytest | ||
|
||
from presidio_analyzer.predefined_recognizers import UkNinoRecognizer | ||
from tests.assertions import assert_result_within_score_range | ||
|
||
|
||
@pytest.fixture(scope="module")
def recognizer():
    """Provide one UkNinoRecognizer instance, created once per test module."""
    nino_recognizer = UkNinoRecognizer()
    return nino_recognizer
|
||
|
||
@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names passed to the recognizer's analyze call."""
    requested_entities = ["UK_NINO"]
    return requested_entities
|
||
|
||
@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # fmt: off
        # Valid National Insurance Numbers
        ("AA 12 34 56 B", 1, ((0, 13),), ((0.5, 0.5),)),
        ("hh 01 02 03 d", 1, ((0, 13),), ((0.5, 0.5),)),
        ("tw987654a", 1, ((0, 9),), ((0.5, 0.5),)),
        ("nino: PR 123612C", 1, ((6, 16),), ((0.5, 0.5),)),
        # Expected span starts at index 36, the space before "YZ".
        ("Here is my National Insurance Number YZ 61 48 68 B", 1, ((36, 50),), ((0.5, 0.5),)),
        # Invalid National Insurance Numbers
        ("AA 12 34 56 H", 0, (), ()),
        ("FQ 00 00 00 C", 0, (), ()),
        ("BG123612A", 0, (), ()),
        ("nino: nt 99 88 77 a", 0, (), ()),
        ("This isn't a valid national insurance number UV 98 76 54 B", 0, (), ()),
        # fmt: on
    ],
)
def test_when_nino_in_text_then_all_uk_ninos_found(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check the number, span and score of UK_NINO results for each sample text."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len

    expectations = zip(found, expected_positions, expected_score_ranges)
    for result, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # The literal "max" stands for the max_score fixture value.
        high = max_score if high == "max" else high
        assert_result_within_score_range(
            result, entities[0], start, end, low, high
        )