Skip to content

Commit

Permalink
feat: Add new recognizer for IN_VOTER #1344 (#1345)
Browse files Browse the repository at this point in the history
  • Loading branch information
kjdeveloper8 authored Apr 16, 2024
1 parent 5ea004d commit 41e0202
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/supported_entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ For more information, refer to the [adding new recognizers documentation](analyz
| IN_PAN | The Indian Permanent Account Number (PAN) is a unique 10-character alphanumeric identifier issued to all business and individual entities registered as taxpayers. | Pattern match, context |
| IN_AADHAAR | Indian government issued unique 12 digit individual identity number | Pattern match, context, and checksum |
| IN_VEHICLE_REGISTRATION | Indian government issued transport (govt, personal, diplomatic, defence) vehicle registration number | Pattern match, context, and checksum |
| IN_VOTER | 10-character alphanumeric voter ID (EPIC number) issued by the Election Commission of India to Indian citizens aged 18 or above | Pattern match, context |
| IN_PASSPORT | Indian Passport Number | Pattern match, Context |

### Finland
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from .in_aadhaar_recognizer import InAadhaarRecognizer
from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer
from .sg_uen_recognizer import SgUenRecognizer

from .in_voter_recognizer import InVoterRecognizer
from .in_passport_recognizer import InPassportRecognizer
from .fi_personal_identity_code_recognizer import FiPersonalIdentityCodeRecognizer

Expand Down Expand Up @@ -86,7 +86,7 @@
"InAadhaarRecognizer",
"InVehicleRegistrationRecognizer",
"SgUenRecognizer",

"InVoterRecognizer",
"InPassportRecognizer",
"FiPersonalIdentityCodeRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from typing import Optional, List
from presidio_analyzer import Pattern, PatternRecognizer


class InVoterRecognizer(PatternRecognizer):
    """
    Recognize Indian Voter/Election Id (EPIC) numbers using regex patterns.

    The Elector's Photo Identity Card (EPIC) number, or voter id, is a
    ten character alpha-numeric code (three letters followed by seven
    digits) issued by the Election Commission of India to adult domiciles
    who have reached the age of 18.
    Ref: https://en.wikipedia.org/wiki/Voter_ID_(India)

    :param patterns: List of patterns to be used by this recognizer.
        When omitted or empty, the class-level ``PATTERNS`` are used.
    :param context: List of context words to increase confidence in detection.
        When omitted or empty, the class-level ``CONTEXT`` words are used.
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        # Stricter form: the second letter must belong to a restricted set;
        # a match here gets the higher score.
        Pattern(
            name="VOTER",
            regex=r"\b([A-Za-z]{1}[ABCDGHJKMNPRSYabcdghjkmnprsy]{1}[A-Za-z]{1}([0-9]){7})\b",
            score=0.4,
        ),
        # Generic form: any three letters followed by seven digits,
        # reported with a lower score.
        Pattern(
            name="VOTER",
            regex=r"\b([A-Za-z]){3}([0-9]){7}\b",
            score=0.3,
        ),
    ]

    CONTEXT = [
        "voter",
        "epic",
        "elector photo identity card",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "IN_VOTER",
    ):
        # A missing or empty argument falls back to the class-level default.
        super().__init__(
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
            supported_entity=supported_entity,
        )
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
PlPeselRecognizer,
InAadhaarRecognizer,
InVehicleRegistrationRecognizer,
InVoterRecognizer,
InPassportRecognizer,
)

Expand Down Expand Up @@ -106,6 +107,7 @@ def load_predefined_recognizers(
InPanRecognizer,
InAadhaarRecognizer,
InVehicleRegistrationRecognizer,
InVoterRecognizer,
InPassportRecognizer,
],
"es": [EsNifRecognizer],
Expand Down
52 changes: 52 additions & 0 deletions presidio-analyzer/tests/test_in_voter_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import InVoterRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Provide one default-configured InVoterRecognizer per test module."""
    voter_recognizer = InVoterRecognizer()
    return voter_recognizer


@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names requested from the recognizer."""
    requested_entities = ["IN_VOTER"]
    return requested_entities


@pytest.mark.parametrize(
    "text, expected_len, expected_position, expected_score",
    [
        # fmt: off
        # valid
        ("KSD1287349", 1, (0, 10), 0.4),
        ("my voter: DBJ2289013", 1, (10, 20), 0.4),
        ("uzb2345117", 1, (0, 10), 0.3),
        ("this MUP5632811", 1, (5, 15), 0.3),
        ("You can vote with your CPJ4467918 number", 1, (23, 33), 0.4),
        # invalid
        ("zxdf8923q1", 0, (), ()),
        ("A8923571WZ", 0, (), ()),
        # fmt: on
    ],
)
def test_when_voter_in_text_then_all_voter_found(
    text,
    expected_len,
    expected_position,
    expected_score,
    recognizer,
    entities,
):
    """
    Check the number, span and score of IN_VOTER results for each sample text.

    :param text: Input text passed to the recognizer
    :param expected_len: Expected number of results
    :param expected_position: (start, end) of the first result, or () if none
    :param expected_score: Expected score of the first result, or () if none
    :param recognizer: The InVoterRecognizer fixture
    :param entities: The entity names fixture
    """
    results = recognizer.analyze(text, entities)

    assert len(results) == expected_len
    # Invalid samples carry empty position/score tuples, so only inspect the
    # first result when something was actually detected.
    if results:
        assert_result(
            results[0],
            entities[0],
            expected_position[0],
            expected_position[1],
            expected_score,
        )
4 changes: 2 additions & 2 deletions presidio-analyzer/tests/test_recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
registry = mock_recognizer_registry
registry.load_predefined_recognizers()
recognizers = registry.get_recognizers(language="en", all_fields=True)
# 1 custom recognizer in english + 24 predefined
assert len(recognizers) == 1 + 25
# 1 custom recognizer in english + 26 predefined
assert len(recognizers) == 1 + 26


def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):
Expand Down

0 comments on commit 41e0202

Please sign in to comment.