Skip to content

Commit

Permalink
Spanish NIE (Foreigners ID card) recognizer (#1359)
Browse files Browse the repository at this point in the history
  • Loading branch information
areyesfalcon authored Apr 24, 2024
1 parent f29e112 commit e64d8ec
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 6 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

### Added
#### Analyzer
Recognizer for Spanish Foreigners Identity Code (NIE, Número de Identidad de Extranjero).

## [Unreleased]

### Added
#### Analyzer
Recognizer for Finnish Personal Identity Codes (Henkilötunnus).
Expand Down
1 change: 1 addition & 0 deletions docs/supported_entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
|Entity Type|Description|Detection Method|
|--- |--- |--- |
|ES_NIF| A Spanish NIF number (Personal tax ID).|Pattern match, context and checksum|
|ES_NIE| A Spanish NIE number (Foreigners ID card).|Pattern match, context and checksum|

### Italy

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from .in_voter_recognizer import InVoterRecognizer
from .in_passport_recognizer import InPassportRecognizer
from .fi_personal_identity_code_recognizer import FiPersonalIdentityCodeRecognizer
from .es_nie_recognizer import EsNieRecognizer

NLP_RECOGNIZERS = {
"spacy": SpacyRecognizer,
Expand Down Expand Up @@ -89,4 +90,5 @@
"InVoterRecognizer",
"InPassportRecognizer",
"FiPersonalIdentityCodeRecognizer",
"EsNieRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from typing import List, Tuple, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class EsNieRecognizer(PatternRecognizer):
    """
    Recognize Spanish NIE numbers using a regex and the control-letter checksum.

    A NIE is made of a leading letter (X, Y or Z), a run of digits and a
    trailing control letter. The control letter is obtained by replacing the
    leading letter with 0, 1 or 2 respectively, reading the result as an
    integer and indexing a fixed 23-letter table with that integer modulo 23.

    Reference(s):
    https://es.wikipedia.org/wiki/N%C3%BAmero_de_identidad_de_extranjero
    https://www.interior.gob.es/opencms/ca/servicios-al-ciudadano/tramites-y-gestiones/dni/calculo-del-digito-de-control-del-nif-nie/

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
    for different strings to be used during pattern matching.
    This can allow a greater variety in input, for example by removing dashes
    or spaces. Defaults to stripping dashes and spaces.
    """

    PATTERNS = [
        Pattern(
            "NIE",
            r"\b[X-Z]?[0-9]?[0-9]{7}[-]?[A-Z]\b",
            0.5,
        ),
    ]

    CONTEXT = ["número de identificación de extranjero", "NIE"]

    # Control letters indexed by (numeric value of the NIE) % 23.
    CONTROL_LETTERS = "TRWAGMYFPDXBNJZSQVHLCKE"

    # Leading letters; each one maps to the digit equal to its index here.
    PREFIX_LETTERS = "XYZ"

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "es",
        supported_entity: str = "ES_NIE",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Set before super().__init__ so the attribute exists on the instance
        # regardless of what the parent constructor does.
        self.replacement_pairs = (
            replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
        )
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Validate the pattern by using the control character.

        :param pattern_text: The text matched by the regex pattern
        :return: True if the text is a structurally valid NIE whose control
        letter matches the checksum, False otherwise
        """
        pattern_text = self.__sanitize_value(pattern_text, self.replacement_pairs)

        # Check the size (8 or 9) first so the indexing below cannot fail
        # on an empty or too-short string.
        if not 8 <= len(pattern_text) <= 9:
            return False

        prefix = pattern_text[0]
        digits = pattern_text[1:-1]
        letter = pattern_text[-1]

        # The first character must be X, Y or Z and the middle part must be
        # plain ASCII digits (isdigit() alone also accepts characters such as
        # superscripts, which int() cannot parse).
        if prefix not in self.PREFIX_LETTERS:
            return False
        if not (digits.isascii() and digits.isdigit()):
            return False

        # Replace X/Y/Z with 0/1/2 and check the mod-23 control letter.
        number = int(str(self.PREFIX_LETTERS.index(prefix)) + digits)
        return letter == self.CONTROL_LETTERS[number % 23]

    @staticmethod
    def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str:
        """Apply every (search, replacement) pair to the text, in order."""
        for search_string, replacement_string in replacement_pairs:
            text = text.replace(search_string, replacement_string)
        return text
21 changes: 15 additions & 6 deletions presidio-analyzer/presidio_analyzer/recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import yaml

from presidio_analyzer import EntityRecognizer, PatternRecognizer
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, StanzaNlpEngine
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, \
StanzaNlpEngine
from presidio_analyzer.predefined_recognizers import (
CreditCardRecognizer,
CryptoRecognizer,
Expand All @@ -32,6 +33,7 @@
SgFinRecognizer,
SpacyRecognizer,
EsNifRecognizer,
EsNieRecognizer,
StanzaRecognizer,
AuAbnRecognizer,
AuAcnRecognizer,
Expand Down Expand Up @@ -68,7 +70,8 @@ class RecognizerRegistry:
def __init__(
self,
recognizers: Optional[Iterable[EntityRecognizer]] = None,
global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE,
global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE |
re.IGNORECASE,
):
if recognizers:
self.recognizers = recognizers
Expand All @@ -77,7 +80,8 @@ def __init__(
self.global_regex_flags = global_regex_flags

def load_predefined_recognizers(
self, languages: Optional[List[str]] = None, nlp_engine: NlpEngine = None
self, languages: Optional[List[str]] = None,
nlp_engine: NlpEngine = None
) -> None:
"""
Load the existing recognizers into memory.
Expand Down Expand Up @@ -110,7 +114,10 @@ def load_predefined_recognizers(
InVoterRecognizer,
InPassportRecognizer,
],
"es": [EsNifRecognizer],
"es": [
EsNifRecognizer,
EsNieRecognizer,
],
"it": [
ItDriverLicenseRecognizer,
ItFiscalCodeRecognizer,
Expand Down Expand Up @@ -183,7 +190,8 @@ def get_recognizers(
ad_hoc_recognizers: Optional[List[EntityRecognizer]] = None,
) -> List[EntityRecognizer]:
"""
Return a list of recognizers which supports the specified name and language.
Return a list of recognizers which supports the specified name and\
language.
:param entities: the requested entities
:param language: the requested language
Expand Down Expand Up @@ -348,7 +356,8 @@ def get_supported_entities(

supported_entities = []
for language in languages:
recognizers = self.get_recognizers(language=language, all_fields=True)
recognizers = self.get_recognizers(language=language,
all_fields=True)

for recognizer in recognizers:
supported_entities.extend(recognizer.get_supported_entities())
Expand Down
45 changes: 45 additions & 0 deletions presidio-analyzer/tests/test_es_nie_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import EsNieRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Build the EsNieRecognizer shared by the tests in this module."""
    nie_recognizer = EsNieRecognizer()
    return nie_recognizer


@pytest.fixture(scope="module")
def entities():
    """Provide the entity names passed to the recognizer's analyze call."""
    requested_entities = ["ES_NIE"]
    return requested_entities


@pytest.mark.parametrize(
    "text, expected_len, expected_positions",
    [
        # Inputs with a correct control letter: one result expected.
        ("Z8078221M", 1, ((0, 9),)),
        ("X9613851N", 1, ((0, 9),)),
        ("Y8063915Z", 1, ((0, 9),)),
        ("Y8063915-Z", 1, ((0, 10),)),
        ("Mi NIE es X9613851N", 1, ((10, 19),)),
        ("Z8078221M en mi NIE", 1, ((0, 9),)),
        (
            "Mi Número de identificación de extranjero es Y8063915-Z",
            1,
            ((45, 55),),
        ),
        # Inputs expected to produce no result.
        ("Y8063915Q", 0, ()),
        ("Y806391Q", 0, ()),
        ("58063915Q", 0, ()),
        ("W8063915Q", 0, ()),
    ],
)
def test_when_all_es_nie_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """Check result count, entity type, span and score for each example."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len

    entity_name = entities[0]
    for result, span in zip(found, expected_positions):
        start, end = span
        assert_result(result, entity_name, start, end, max_score)

0 comments on commit e64d8ec

Please sign in to comment.