Skip to content

Commit

Permalink
feat: Add Bech32 and Bech32m Bitcoin Address Validation in Crypto Rec…
Browse files Browse the repository at this point in the history
…ognizer and expand tests (#1307)

* feat: Support 'M' prefix in SG_NRIC_FIN Recognizer and expand tests

- Updated SG_NRIC_FIN Recognizer to include 'M' prefix for validating NRIC numbers issued to foreigners from 2022 onwards.
- Added new test cases to ensure comprehensive coverage and validation accuracy for all supported NRIC prefixes (S, T, F, G, M).

* test: Add NRIC/FIN recognition test with surrounding text

- Introduced a new test case to validate the recognition of valid NRIC/FIN numbers when embedded within a sentence

* docs: Update FIN number pattern to include 'M'

* feat: Add Bech32 and Bech32m address validation support for Bitcoin

- Extended the CryptoRecognizer with the capability to validate Bech32 and Bech32m Bitcoin addresses
- Added new test cases have been added to cover Bech32 and Bech32m address validation

---------

Co-authored-by: Omri Mendels <[email protected]>
  • Loading branch information
miltonsim and omri374 authored Feb 20, 2024
1 parent 45c418d commit a8d2c90
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,19 @@

from presidio_analyzer import Pattern, PatternRecognizer

# Copied from:
# This class includes references to addresses validation algorithms.
# The original implementation of the P2PKH and P2SH address validation
# algorithm can be found at:
# http://rosettacode.org/wiki/Bitcoin/address_validation#Python
# The Bech32/Bech32m address validation algorithm by Pieter Wuille can be found
# at: https://github.com/sipa/bech32/blob/master/ref/python/segwit_addr.py

# Constants for encoding types
BECH32 = 1
BECH32M = 2

CHARSET = "qpzry9x8gf2tvdw0s3jn54khce6mua7l"
BECH32M_CONST = 0x2bc830a3


class CryptoRecognizer(PatternRecognizer):
Expand All @@ -17,7 +28,7 @@ class CryptoRecognizer(PatternRecognizer):
"""

PATTERNS = [
Pattern("Crypto (Medium)", r"\b[13][a-km-zA-HJ-NP-Z1-9]{26,33}\b", 0.5),
Pattern("Crypto (Medium)", r"(bc1|[13])[a-zA-HJ-NP-Z0-9]{25,59}", 0.5),
]

CONTEXT = ["wallet", "btc", "bitcoin", "crypto"]
Expand All @@ -38,12 +49,26 @@ def __init__(
supported_language=supported_language,
)

def validate_result(self, pattern_text: str) -> bool: # noqa D102
try:
bcbytes = self.__decode_base58(pattern_text, 25)
return bcbytes[-4:] == sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]
except ValueError:
return False
def validate_result(self, pattern_text: str) -> bool:
"""Validate the Bitcoin address using checksum.
:param pattern_text: The cryptocurrency address to validate.
:return: True if the address is valid according to its respective
format, False otherwise.
"""
if pattern_text.startswith("1") or pattern_text.startswith("3"):
# P2PKH or P2SH address validation
try:
bcbytes = self.__decode_base58(pattern_text, 25)
checksum = sha256(sha256(bcbytes[:-4]).digest()).digest()[:4]
return bcbytes[-4:] == checksum
except ValueError:
return False
elif pattern_text.startswith("bc1"):
# Bech32 or Bech32m address validation
if CryptoRecognizer.validate_bech32_address(pattern_text)[0]:
return True
return False

@staticmethod
def __decode_base58(bc: str, length: int) -> bytes:
Expand All @@ -52,3 +77,60 @@ def __decode_base58(bc: str, length: int) -> bytes:
for char in bc:
n = n * 58 + digits58.index(char)
return n.to_bytes(length, "big")

@staticmethod
def bech32_polymod(values):
"""Compute the Bech32 checksum."""
generator = [0x3b6a57b2, 0x26508e6d, 0x1ea119fa, 0x3d4233dd,
0x2a1462b3]
chk = 1
for value in values:
top = chk >> 25
chk = (chk & 0x1ffffff) << 5 ^ value
for i in range(5):
chk ^= generator[i] if ((top >> i) & 1) else 0
return chk

@staticmethod
def bech32_hrp_expand(hrp):
"""Expand the HRP into values for checksum computation."""
return [ord(x) >> 5 for x in hrp] + [0] + [ord(x) & 31 for x in hrp]

@staticmethod
def bech32_verify_checksum(hrp, data):
"""Verify a checksum given HRP and converted data characters."""
const = CryptoRecognizer.bech32_polymod(
CryptoRecognizer.bech32_hrp_expand(hrp) + data
)
if const == 1:
return BECH32
if const == BECH32M_CONST:
return BECH32M
return None

@staticmethod
def bech32_decode(bech):
"""Validate a Bech32/Bech32m string, and determine HRP and data."""
if ((any(ord(x) < 33 or ord(x) > 126 for x in bech)) or
(bech.lower() != bech and bech.upper() != bech)):
return (None, None, None)
bech = bech.lower()
pos = bech.rfind('1')
if pos < 1 or pos + 7 > len(bech) or len(bech) > 90:
return (None, None, None)
if not all(x in CHARSET for x in bech[pos+1:]):
return (None, None, None)
hrp = bech[:pos]
data = [CHARSET.find(x) for x in bech[pos+1:]]
spec = CryptoRecognizer.bech32_verify_checksum(hrp, data)
if spec is None:
return (None, None, None)
return (hrp, data[:-6], spec)

@staticmethod
def validate_bech32_address(address):
"""Validate a Bech32 or Bech32m address."""
hrp, data, spec = CryptoRecognizer.bech32_decode(address)
if hrp is not None and data is not None:
return True, spec
return False, None
19 changes: 18 additions & 1 deletion presidio-analyzer/tests/test_crypto_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,27 @@ def entities():
"text, expected_len, expected_positions",
[
# fmt: off
("16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ", 1, ((0, 34),),),
## Match
# Test with valid P2PKH address that starts with 1 and 33 base58 characters
("16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ", 1, ((0, 34),),),
# Test with valid P2SH address that starts with 3 and 33 base58 characters
("3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy", 1, ((0, 34),),),
# Test with valid Bech32 address that starts with bc1 and 39 base58 characters
("bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq", 1, ((0, 42),),),
# Test with valid Bech32m address that starts with bc1 and 59 base58 characters
("bc1p5d7rjq7g6rdk2yhzks9smlaqtedr4dekq08ge8ztwac72sfr9rusxg3297", 1, ((0, 62),),),
# Test with multiple valid addresses
("16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ 3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy", 2, ((0, 34), (35, 69)),),
# Test with valid address in a sentence
("my wallet address is: 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ", 1, ((22, 56),),),
## No match
# Test with invalid address
("16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ2", 0, ()),
# Test with invalid address in a sentence
("my wallet address is: 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ2", 0, ()),
# Test with empty string
("", 0, (),),
# fmt: on
],
)
Expand Down

0 comments on commit a8d2c90

Please sign in to comment.