Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New Predefined Recognizer for Indian Passport #1350 #1351

Merged
merged 20 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/supported_entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
| IN_PAN | The Indian Permanent Account Number (PAN) is a unique 10 character alphanumeric identifier issued to all business and individual entities registered as Tax Payers. | Pattern match, context |
| IN_AADHAAR | Indian government issued unique 12 digit individual identity number | Pattern match, context, and checksum |
| IN_VEHICLE_REGISTRATION | Indian government issued transport (govt, personal, diplomatic, defence) vehicle registration number | Pattern match, context, and checksum |
| IN_PASSPORT | Indian government issued eight character alphanumeric passport number | Pattern match, context |

### Finland
| FieldType | Description | Detection Method |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
from .in_aadhaar_recognizer import InAadhaarRecognizer
from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer
from .sg_uen_recognizer import SgUenRecognizer

from .in_passport_recognizer import InPassportRecognizer
from .fi_personal_identity_code_recognizer import FiPersonalIdentityCodeRecognizer

NLP_RECOGNIZERS = {
Expand Down Expand Up @@ -84,5 +86,7 @@
"InAadhaarRecognizer",
"InVehicleRegistrationRecognizer",
"SgUenRecognizer",

"InPassportRecognizer",
"FiPersonalIdentityCodeRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from typing import Optional, List

from presidio_analyzer import Pattern, PatternRecognizer


class InPassportRecognizer(PatternRecognizer):
    """
    Recognizes Indian Passport Number.

    Indian Passport Number is an eight-character alphanumeric identifier:
    one letter followed by seven digits. The pattern used here additionally
    requires the first and the last digit to be non-zero.

    Reference:
    https://www.bajajallianz.com/blog/travel-insurance-articles/where-is-passport-number-in-indian-passport.html

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    # One letter + [1-9] + five digits + [1-9] = eight characters in total.
    # The previous pattern ([0-9]{2} followed by [0-9]{4}) consumed nine
    # characters and therefore never matched a real eight-character number.
    PATTERNS = [
        Pattern(
            "PASSPORT",
            r"\b[A-Z][1-9][0-9]{5}[1-9]\b",
            0.1,
        ),
    ]

    CONTEXT = [
        "passport",
        "indian passport",
        "passport number",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "IN_PASSPORT",
    ):
        # Fall back to the class-level defaults when the caller passes
        # nothing (or an empty list).
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
PlPeselRecognizer,
InAadhaarRecognizer,
InVehicleRegistrationRecognizer,
InPassportRecognizer,
)

logger = logging.getLogger("presidio-analyzer")
Expand Down Expand Up @@ -105,6 +106,7 @@ def load_predefined_recognizers(
InPanRecognizer,
InAadhaarRecognizer,
InVehicleRegistrationRecognizer,
InPassportRecognizer,
],
"es": [EsNifRecognizer],
"it": [
Expand Down
6 changes: 5 additions & 1 deletion presidio-analyzer/tests/data/context_sentences_tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,8 @@ Typical tax filing identifier is known as PAN in India also known as permanent a

#Verify IN PAN mixed case
IN_PAN
my PAN number is DJPMS1234Z
my PAN number is DJPMS1234Z

#Verify IN PASSPORT context words
IN_PASSPORT
my passport number is T1234567. Indian Passport number is of 8 characters long, always starting with a capital letter.
5 changes: 4 additions & 1 deletion presidio-analyzer/tests/test_context_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
UsSsnRecognizer,
SgFinRecognizer,
InPanRecognizer,
InPassportRecognizer,

)
from presidio_analyzer.nlp_engine import NlpArtifacts
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
Expand All @@ -34,6 +36,7 @@ def recognizers_map():
"US_PASSPORT": UsPassportRecognizer(),
"FIN": SgFinRecognizer(),
"IN_PAN": InPanRecognizer(),
"IN_PASSPORT": InPassportRecognizer(),
}
return rec_map

Expand Down Expand Up @@ -68,7 +71,7 @@ def dataset(recognizers_map):

test_items.append((item, recognizer, [entity_type]))
# Currently we have 32 sentences, this is a sanity check
if not len(test_items) == 31:
if not len(test_items) == 32:
raise ValueError(f"expected 31 context sentences but found {len(test_items)}")

yield test_items
Expand Down
48 changes: 48 additions & 0 deletions presidio-analyzer/tests/test_in_passport_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pytest
from presidio_analyzer.predefined_recognizers import InPassportRecognizer
# from tests import assert_result
from tests.assertions import assert_result

@pytest.fixture(scope="module")
def recognizer():
    """Provide a single InPassportRecognizer shared by this module's tests."""
    passport_recognizer = InPassportRecognizer()
    return passport_recognizer

@pytest.fixture(scope="module")
def entities():
    """Provide the entity types requested from the recognizer under test."""
    requested_entities = ["IN_PASSPORT"]
    return requested_entities

@pytest.mark.parametrize(
    "text, expected_len, expected_position, expected_score",
    [
        # fmt: off
        # Valid passport numbers: one letter followed by seven digits.
        # 0.1 is the score declared on the recognizer's pattern; analyze() is
        # called below without NLP artifacts, so presumably no context boost
        # is applied -- confirm against PatternRecognizer.
        ("A3456781", 1, (0, 8), 0.1),
        ("B3097651", 1, (0, 8), 0.1),
        ("C3590543", 1, (0, 8), 0.1),
        ("my passport number is T3569075", 1, (22, 30), 0.1),
        ("passport number: J6932157", 1, (17, 25), 0.1),

        # Invalid passport numbers
        ("b0097650", 0, (), 0),    # first digit is zero
        ("A3456780", 0, (), 0),    # last digit is zero
        ("A345678", 0, (), 0),     # too short (seven characters)
        ("A34567812", 0, (), 0),   # too long (nine characters)
        # NOTE(review): the earlier lowercase sample ("t3569075") is not
        # asserted as invalid here; whether lowercase letters match depends
        # on the regex flags PatternRecognizer applies, which is not visible
        # from this file -- confirm before re-adding it.
        # fmt: on
    ],
)
def test_when_all_passport_numers_then_succeed(
    text,
    expected_len,
    expected_position,
    expected_score,
    recognizer,
    entities,
):
    """Check the number of matches and, for a match, its span and score."""
    results = recognizer.analyze(text, entities)

    # Always compare the match count; guarding everything behind
    # `if results:` would let a recognizer that matches nothing pass.
    assert len(results) == expected_len

    if results:
        assert_result(
            results[0],
            entities[0],
            expected_position[0],
            expected_position[1],
            expected_score,
        )
2 changes: 1 addition & 1 deletion presidio-analyzer/tests/test_recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
registry.load_predefined_recognizers()
recognizers = registry.get_recognizers(language="en", all_fields=True)
# 1 custom recognizer in english + 25 predefined
assert len(recognizers) == 1 + 24
assert len(recognizers) == 1 + 25


def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):
Expand Down
Loading