Skip to content

Commit

Permalink
Add UK National Insurance Number Recognizer (#1446)
Browse files Browse the repository at this point in the history
  • Loading branch information
hhobson authored Sep 22, 2024
1 parent 9321e14 commit c54ce2b
Show file tree
Hide file tree
Showing 8 changed files with 108 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docs/analyzer/adding_recognizers.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ To add a recognizer to the list of pre-defined recognizers:

1. Clone the repo.
2. Create a file containing the new recognizer Python class.
3. Add the recognizer to the `recognizers_map` dict in the `RecognizerRegistry.load_predefined_recognizers` method. In this map, the key is the language the recognizer supports, and the value is the class itself. If your recognizer detects entities in multiple languages, add it to under the "ALL" key.
3. Add the recognizer to the `recognizers` list in the [`default_recognizers`](../../presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml) config. Details of the recognizer parameters are given [here](./recognizer_registry_provider.md#the-recognizer-parameters).
4. Optional: Update documentation (e.g., the [supported entities list](../supported_entities.md)).

### Azure AI Language recognizer
Expand Down
2 changes: 1 addition & 1 deletion docs/analyzer/recognizer_registry_provider.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ The recognizer list comprises of both the predefined and custom recognizers, for
deny_list_score: 1
```
The recognizer parameters:
### The recognizer parameters
- `supported_languages`: A list of supported languages that the analyzer will support. In case this field is missing, a recognizer will be created for each supported language provided to the `AnalyzerEngine`.
In addition to the language code, this field also contains a list of context words, which increases confidence in the detection in case it is found in the surroundings of a detected entity (as seen in the credit card example above).
Expand Down
1 change: 1 addition & 0 deletions docs/supported_entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
|Entity Type|Description|Detection Method|
|--- |--- |--- |
|UK_NHS|A UK NHS number is 10 digits.|Pattern match, context and checksum|
|UK_NINO|UK [National Insurance Number](https://en.wikipedia.org/wiki/National_Insurance_number) is a unique identifier used in the administration of National Insurance and tax.|Pattern match and context|

### Spain

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ recognizers:
- en
type: predefined

- name: UkNinoRecognizer
supported_languages:
- en
type: predefined

- name: SgFinRecognizer
supported_languages:
- en
Expand Down Expand Up @@ -163,4 +168,4 @@ recognizers:
type: predefined

- name: InVoterRecognizer
type: predefined
type: predefined
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from .spacy_recognizer import SpacyRecognizer
from .stanza_recognizer import StanzaRecognizer
from .uk_nhs_recognizer import NhsRecognizer
from .uk_nino_recognizer import UkNinoRecognizer
from .url_recognizer import UrlRecognizer
from .us_bank_recognizer import UsBankRecognizer
from .us_driver_license_recognizer import UsLicenseRecognizer
Expand Down Expand Up @@ -104,4 +105,5 @@
"InPassportRecognizer",
"FiPersonalIdentityCodeRecognizer",
"EsNieRecognizer",
"UkNinoRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class UkNinoRecognizer(PatternRecognizer):
    """
    Recognizes UK National Insurance Number using regex.

    The pattern accepts a two-letter prefix, three pairs of digits and a
    single suffix letter in the range A-D, with an optional single space
    between each group (e.g. ``AA 12 34 56 B`` or ``tw987654a``).

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "NINO (medium)",
            # Prefix: the first letter excludes d, f, i, q, u, v; the second
            # letter additionally excludes o. The negative lookahead rejects
            # the prefixes bg, gb, nk, kn, nt, tn and zz.
            # Suffix: exactly one of a-d / A-D. The class must not contain
            # "{1}": inside a set those characters are literals, which would
            # let "{", "1" or "}" pass as a valid suffix.
            # NOTE(review): the optional space right after the lookahead lets
            # a match begin on a space that follows a word, in which case the
            # lookahead inspects the space rather than the prefix. Left as-is
            # because changing it would shift the reported match offsets.
            r"\b(?!bg|gb|nk|kn|nt|tn|zz|BG|GB|NK|KN|NT|TN|ZZ) ?([a-ceghj-pr-tw-zA-CEGHJ-PR-TW-Z]{1}[a-ceghj-npr-tw-zA-CEGHJ-NPR-TW-Z]{1}) ?([0-9]{2}) ?([0-9]{2}) ?([0-9]{2}) ?([a-dA-D])\b",  # noqa: E501
            0.5,
        ),
    ]

    CONTEXT = ["national insurance", "ni number", "nino"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "UK_NINO",
    ):
        # Fall back to the class-level defaults when the caller passes
        # nothing (or an empty list) for patterns / context.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
4 changes: 2 additions & 2 deletions presidio-analyzer/tests/test_recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
registry = mock_recognizer_registry
registry.load_predefined_recognizers()
recognizers = registry.get_recognizers(language="en", all_fields=True)
# 1 custom recognizer in english + 26 predefined
assert len(recognizers) == 1 + 26
# 1 custom recognizer in english + 27 predefined
assert len(recognizers) == 1 + 27


def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):
Expand Down
55 changes: 55 additions & 0 deletions presidio-analyzer/tests/test_uk_nino_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest

from presidio_analyzer.predefined_recognizers import UkNinoRecognizer
from tests.assertions import assert_result_within_score_range


@pytest.fixture(scope="module")
def recognizer():
    """Provide a single UkNinoRecognizer shared by the tests in this module."""
    nino_recognizer = UkNinoRecognizer()
    return nino_recognizer


@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names requested from the recognizer."""
    requested_entities = ["UK_NINO"]
    return requested_entities


@pytest.mark.parametrize(
    "text, expected_len, expected_positions, expected_score_ranges",
    [
        # fmt: off
        # Valid National Insurance Numbers
        ("AA 12 34 56 B", 1, ((0, 13),), ((0.5, 0.5),), ),
        ("hh 01 02 03 d", 1, ((0, 13),), ((0.5, 0.5),), ),
        ("tw987654a", 1, ((0, 9),), ((0.5, 0.5),), ),
        ("nino: PR 123612C", 1, ((6, 16),), ((0.5, 0.5),), ),
        ("Here is my National Insurance Number YZ 61 48 68 B", 1, ((36, 50),), ((0.5, 0.5),), ),
        # Invalid National Insurance Numbers
        ("AA 12 34 56 H", 0, (), (), ),
        ("FQ 00 00 00 C", 0, (), (), ),
        ("BG123612A", 0, (), (), ),
        ("nino: nt 99 88 77 a", 0, (), (), ),
        ("This isn't a valid national insurance number UV 98 76 54 B", 0, (), (), ),
        # fmt: on
    ]
)
def test_when_nino_in_text_then_all_uk_ninos_found(
    text,
    expected_len,
    expected_positions,
    expected_score_ranges,
    recognizer,
    entities,
    max_score,
):
    """Check the number, span and score range of UK_NINO results for each text."""
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len

    # Pair every result with the span and score bounds expected for it.
    expectations = zip(results, expected_positions, expected_score_ranges)
    for result, span, score_range in expectations:
        start, end = span
        low, high = score_range
        # The literal "max" stands in for the max_score fixture value.
        high = max_score if high == "max" else high
        assert_result_within_score_range(result, entities[0], start, end, low, high)

0 comments on commit c54ce2b

Please sign in to comment.