Skip to content

Commit

Permalink
Added Finnish Personal Identity Code Recognizer. (#1349)
Browse files Browse the repository at this point in the history
  • Loading branch information
honderr authored Apr 9, 2024
1 parent 5bc4b67 commit c7fa825
Show file tree
Hide file tree
Showing 5 changed files with 241 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

### Added
#### Analyzer
Recognizer for Finnish Personal Identity Codes (Henkilötunnus).


## [2.2.353] - March 31st 2024

Expand Down
5 changes: 5 additions & 0 deletions docs/supported_entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ For more information, refer to the [adding new recognizers documentation](analyz
| IN_AADHAAR | Indian government issued unique 12 digit individual identity number | Pattern match, context, and checksum |
| IN_VEHICLE_REGISTRATION | Indian government issued transport (govt, personal, diplomatic, defence) vehicle registration number | Pattern match, context, and checksum |

### Finland
| FieldType | Description | Detection Method |
|------------|---------------------------------------------------------------------------------------------------------|------------------------------------------|
| FI_PERSONAL_IDENTITY_CODE | The Finnish Personal Identity Code (Henkilötunnus) is a unique 11-character individual identity number. | Pattern match, context, and custom logic (date validation and checksum) |

## Adding a custom PII entity

See [this documentation](analyzer/adding_recognizers.md) for instructions on how to add a new Recognizer for a new type of PII entity.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from .in_aadhaar_recognizer import InAadhaarRecognizer
from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer
from .sg_uen_recognizer import SgUenRecognizer
from .fi_personal_identity_code_recognizer import FiPersonalIdentityCodeRecognizer

NLP_RECOGNIZERS = {
"spacy": SpacyRecognizer,
Expand Down Expand Up @@ -83,4 +84,5 @@
"InAadhaarRecognizer",
"InVehicleRegistrationRecognizer",
"SgUenRecognizer",
"FiPersonalIdentityCodeRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from datetime import datetime
from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class FiPersonalIdentityCodeRecognizer(PatternRecognizer):
    """
    Recognizes and validates Finnish Personal Identity Codes (Henkilötunnus).

    A code is 11 characters long: six digits for the date of birth (DDMMYY),
    one century sign (separator), a three digit individual number and one
    control character computed from the nine digits.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    # NOTE: the hyphen is placed first in the separator character class so it
    # is a literal "-". Written as "[+-A...]" it would form the range "+".."A",
    # which also matches digits and punctuation such as "." or ":".
    PATTERNS = [
        Pattern(
            "Finnish Personal Identity Code (Medium)",
            r"\b(\d{6})([-+ABCDEFYXWVU])(\d{3})([0123456789ABCDEFHJKLMNPRSTUVWXY])\b",
            0.5,
        ),
        Pattern(
            "Finnish Personal Identity Code (Very Weak)",
            r"(\d{6})([-+ABCDEFYXWVU])(\d{3})([0123456789ABCDEFHJKLMNPRSTUVWXY])",
            0.1,
        ),
    ]
    CONTEXT = [
        "hetu",
        "henkilötunnus",
        "personbeteckningen",
        "personal identity code",
    ]

    # Control character lookup table; the index is the nine digit number
    # (date of birth + individual number) modulo 31.
    _CONTROL_CHARACTERS = "0123456789ABCDEFHJKLMNPRSTUVWXY"

    # Century of birth implied by the separator (century sign).
    _CENTURY_BY_SEPARATOR = {
        **dict.fromkeys("+", 1800),
        **dict.fromkeys("-YXWVU", 1900),
        **dict.fromkeys("ABCDEF", 2000),
    }

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "fi",
        supported_entity: str = "FI_PERSONAL_IDENTITY_CODE",
    ):
        # Fall back to the class-level defaults when nothing (or an empty
        # list) is supplied.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the pattern by using the date of birth and the control character.

        :param pattern_text: The candidate text matched by one of the patterns
        :return: True if the text is an 11 character code with a known
            century sign, an existing calendar date and a matching control
            character, False otherwise
        """
        # More information on the validation logic from:
        # https://dvv.fi/en/personal-identity-code
        # Under "How is the control character for a personal identity code calculated?".
        if len(pattern_text) != 11:
            return False

        date_part = pattern_text[0:6]
        individual_number = pattern_text[7:10]
        digits = date_part + individual_number
        # Custom patterns may let non-numeric text through; reject it here
        # instead of letting int() raise.
        if not digits.isdecimal():
            return False

        # The separator decides the century. It is looked up
        # case-insensitively so that a lowercase separator alone does not
        # reject a match; the control character comparison below stays
        # case-sensitive.
        century = self._CENTURY_BY_SEPARATOR.get(pattern_text[6].upper())
        if century is None:
            return False

        try:
            # Checking if we do not have invalid dates e.g. 310211. Using the
            # real century makes 29 February checks correct (1900 and 1800
            # are not leap years, 2000 is).
            datetime(
                century + int(date_part[4:6]),
                int(date_part[2:4]),
                int(date_part[0:2]),
            )
        except ValueError:
            return False

        control_character = pattern_text[-1]
        return self._CONTROL_CHARACTERS[int(digits) % 31] == control_character
164 changes: 164 additions & 0 deletions presidio-analyzer/tests/test_fi_personal_identity_code_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import FiPersonalIdentityCodeRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Build the recognizer under test, shared across this module."""
    fi_recognizer = FiPersonalIdentityCodeRecognizer()
    return fi_recognizer


@pytest.fixture(scope="module")
def entities():
    """Supply the entity names handed to the recognizer's ``analyze`` call."""
    entity_names = ["FI_PERSONAL_IDENTITY_CODE"]
    return entity_names


@pytest.mark.parametrize(
    "text, expected_len, expected_positions",
    [
        # Texts containing a valid code: one result at the given (start, end).
        ("010594Y9032", 1, ((0, 11),)),
        ("My personal identity code is: 010594Y9032. Thank you.", 1, ((30, 41),)),
        ("010594Y9021", 1, ((0, 11),)),
        ("020594X903P", 1, ((0, 11),)),
        ("020594X903P is my hetu.", 1, ((0, 11),)),
        ("020594X902N", 1, ((0, 11),)),
        ("Here's my henkilötunnus 020594X902N.", 1, ((24, 35),)),
        ("030594W903B", 1, ((0, 11),)),
        ("My finnish id code is030594W903B.", 1, ((21, 32),)),
        ("030694W9024", 1, ((0, 11),)),
        ("040594V9030", 1, ((0, 11),)),
        ("040594V902Y", 1, ((0, 11),)),
        ("050594U903M", 1, ((0, 11),)),
        ("050594U902L", 1, ((0, 11),)),
        ("010516B903X", 1, ((0, 11),)),
        ("010516B902W", 1, ((0, 11),)),
        ("020516C903K", 1, ((0, 11),)),
        ("020516C902J", 1, ((0, 11),)),
        ("030516D9037", 1, ((0, 11),)),
        ("030516D9026", 1, ((0, 11),)),
        ("010501E9032", 1, ((0, 11),)),
        ("020502E902X", 1, ((0, 11),)),
        ("020503F9037", 1, ((0, 11),)),
        ("020504A902E", 1, ((0, 11),)),
        ("020504B904H", 1, ((0, 11),)),
        # Invalid codes: no result is expected.
        ("111111-111A", 0, ()),
        ("111111+110G", 0, ()),
        ("311190-1111", 0, ()),
        ("310289-211C", 0, ()),
        ("012245A110G", 0, ()),
        ("010324A110G", 0, ()),
    ],
)
def test_when_all_finnish_personal_identity_code_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """Check result count and positions for valid and invalid Finnish codes."""
    findings = recognizer.analyze(text, entities)
    assert len(findings) == expected_len

    target_entity = entities[0]
    # Each finding is compared against its expected (start, end) span.
    for finding, span in zip(findings, expected_positions):
        start, end = span
        assert_result(finding, target_entity, start, end, max_score)

0 comments on commit c7fa825

Please sign in to comment.