Add UK National Insurance Number Recognizer (#1446)

microsoft · Sep 22, 2024 · c54ce2b · c54ce2b
1 parent 9321e14
commit c54ce2b
Show file tree

Hide file tree

Showing 8 changed files with 108 additions and 5 deletions.
diff --git a/docs/analyzer/adding_recognizers.md b/docs/analyzer/adding_recognizers.md
@@ -150,7 +150,7 @@ To add a recognizer to the list of pre-defined recognizers:
 
 1. Clone the repo.
 2. Create a file containing the new recognizer Python class.
-3. Add the recognizer to the `recognizers_map` dict in the `RecognizerRegistry.load_predefined_recognizers` method. In this map, the key is the language the recognizer supports, and the value is the class itself. If your recognizer detects entities in multiple languages, add it to under the "ALL" key.
+3. Add the recognizer to the `recognizers` list in the [`default_recognizers`](../../presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml) config. Details of the recognizer parameters are given [here](./recognizer_registry_provider.md#the-recognizer-parameters).
 4. Optional: Update documentation (e.g., the [supported entities list](../supported_entities.md)).
 
 ### Azure AI Language recognizer

diff --git a/docs/analyzer/recognizer_registry_provider.md b/docs/analyzer/recognizer_registry_provider.md
@@ -89,7 +89,7 @@ The recognizer list comprises of both the predefined and custom recognizers, for
     deny_list_score: 1
 ```
 
-The recognizer parameters:
+### The recognizer parameters
 
   - `supported_languages`: A list of supported languages that the analyzer will support. In case this field is missing, a recognizer will be created for each supported language provided to the `AnalyzerEngine`. 
   In addition to the language code, this field also contains a list of context words, which increases confidence in the detection in case it is found in the surroundings of a detected entity (as seen in the credit card example above).

diff --git a/docs/supported_entities.md b/docs/supported_entities.md
@@ -40,6 +40,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
 |Entity Type|Description|Detection Method|
 |--- |--- |--- |
 |UK_NHS|A UK NHS number is 10 digits.|Pattern match, context and checksum|
+|UK_NINO|A UK [National Insurance Number](https://en.wikipedia.org/wiki/National_Insurance_number) is a unique identifier used in the administration of National Insurance and tax.|Pattern match and context|
 
 ### Spain
 

diff --git a/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml b/presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml
@@ -53,6 +53,11 @@ recognizers:
     - en
     type: predefined
 
+  - name: UkNinoRecognizer
+    supported_languages:
+    - en
+    type: predefined
+
   - name: SgFinRecognizer
     supported_languages: 
     - en
@@ -163,4 +168,4 @@ recognizers:
     type: predefined
 
   - name: InVoterRecognizer
-    type: predefined
+    type: predefined
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
@@ -37,6 +37,7 @@
 from .spacy_recognizer import SpacyRecognizer
 from .stanza_recognizer import StanzaRecognizer
 from .uk_nhs_recognizer import NhsRecognizer
+from .uk_nino_recognizer import UkNinoRecognizer
 from .url_recognizer import UrlRecognizer
 from .us_bank_recognizer import UsBankRecognizer
 from .us_driver_license_recognizer import UsLicenseRecognizer
@@ -104,4 +105,5 @@
     "InPassportRecognizer",
     "FiPersonalIdentityCodeRecognizer",
     "EsNieRecognizer",
+    "UkNinoRecognizer",
 ]
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/uk_nino_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/uk_nino_recognizer.py
@@ -0,0 +1,40 @@
+from typing import List, Optional
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class UkNinoRecognizer(PatternRecognizer):
+    """
+    Recognizes UK National Insurance Number using regex.
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    """
+
+    PATTERNS = [
+        Pattern(
+            "NINO (medium)",
+            r"\b(?!bg|gb|nk|kn|nt|tn|zz|BG|GB|NK|KN|NT|TN|ZZ) ?([a-ceghj-pr-tw-zA-CEGHJ-PR-TW-Z]{1}[a-ceghj-npr-tw-zA-CEGHJ-NPR-TW-Z]{1}) ?([0-9]{2}) ?([0-9]{2}) ?([0-9]{2}) ?([a-dA-D{1}])\b",  # noqa: E501
+            0.5,
+        ),
+    ]
+
+    CONTEXT = ["national insurance", "ni number", "nino"]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "en",
+        supported_entity: str = "UK_NINO",
+    ):
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py
@@ -52,8 +52,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
     registry = mock_recognizer_registry
     registry.load_predefined_recognizers()
     recognizers = registry.get_recognizers(language="en", all_fields=True)
-    # 1 custom recognizer in english + 26 predefined
-    assert len(recognizers) == 1 + 26
+    # 1 custom recognizer in english + 27 predefined
+    assert len(recognizers) == 1 + 27
 
 
 def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):

diff --git a/presidio-analyzer/tests/test_uk_nino_recognizer.py b/presidio-analyzer/tests/test_uk_nino_recognizer.py
@@ -0,0 +1,55 @@
+import pytest
+
+from presidio_analyzer.predefined_recognizers import UkNinoRecognizer
+from tests.assertions import assert_result_within_score_range
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return UkNinoRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["UK_NINO"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions, expected_score_ranges",
+    [
+        # fmt: off
+        # Valid National Insurance Numbers
+        ("AA 12 34 56 B", 1, ((0, 13),), ((0.5, 0.5),), ),
+        ("hh 01 02 03 d", 1, ((0, 13),), ((0.5, 0.5),), ),
+        ("tw987654a", 1, ((0, 9),), ((0.5, 0.5),), ),
+        ("nino: PR 123612C", 1, ((6, 16),), ((0.5, 0.5),), ),
+        ("Here is my National Insurance Number YZ 61 48 68 B", 1, ((36, 50),), ((0.5, 0.5),), ),
+        # Invalid National Insurance Numbers
+        ("AA 12 34 56 H", 0, (), (), ),
+        ("FQ 00 00 00 C", 0, (), (), ),
+        ("BG123612A", 0, (), (), ),
+        ("nino: nt 99 88 77 a", 0, (), (), ),
+        ("This isn't a valid national insurance number UV 98 76 54 B", 0, (), (), ),
+        # fmt: on
+    ]
+)
+def test_when_nino_in_text_then_all_uk_ninos_found(
+    text,
+    expected_len,
+    expected_positions,
+    expected_score_ranges,
+    recognizer,
+    entities,
+    max_score,
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+
+    for res, (st_pos, fn_pos), (st_score, fn_score) in zip(
+        results, expected_positions, expected_score_ranges
+    ):
+        if fn_score == "max":
+            fn_score = max_score
+        assert_result_within_score_range(
+            res, entities[0], st_pos, fn_pos, st_score, fn_score
+        )