Skip to content

Commit

Permalink
added batching support (#1449)
Browse files Browse the repository at this point in the history
  • Loading branch information
roeybc authored Sep 10, 2024
1 parent 1bf22ed commit 4aeb56b
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 3 deletions.
5 changes: 4 additions & 1 deletion presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,17 @@ def analyze_iterator(
self,
texts: Iterable[Union[str, bool, float, int]],
language: str,
batch_size: Optional[int] = None,
**kwargs,
) -> List[List[RecognizerResult]]:
"""
Analyze an iterable of strings.
:param texts: A list containing strings to be analyzed.
:param language: Input language
:param batch_size: Batch size to process in a single iteration
:param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
(default value depends on the nlp engine implementation)
"""

# validate types
Expand All @@ -43,7 +46,7 @@ def analyze_iterator(
# Process the texts as batch for improved performance
nlp_artifacts_batch: Iterator[Tuple[str, NlpArtifacts]] = (
self.analyzer_engine.nlp_engine.process_batch(
texts=texts, language=language
texts=texts, language=language, batch_size=batch_size
)
)

Expand Down
3 changes: 2 additions & 1 deletion presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Iterable, Iterator, List, Tuple
from typing import Iterable, Iterator, List, Optional, Tuple

from presidio_analyzer.nlp_engine import NlpArtifacts

Expand Down Expand Up @@ -29,6 +29,7 @@ def process_batch(
self,
texts: Iterable[str],
language: str,
batch_size: Optional[int] = None,
**kwargs, # noqa ANN003
) -> Iterator[Tuple[str, NlpArtifacts]]:
"""Execute the NLP pipeline on a batch of texts.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def process_batch(
self,
texts: Union[List[str], List[Tuple[str, object]]],
language: str,
batch_size: Optional[int] = None,
as_tuples: bool = False,
) -> Iterator[Optional[NlpArtifacts]]:
"""Execute the NLP pipeline on a batch of texts using spacy pipe.
Expand All @@ -120,7 +121,9 @@ def process_batch(
raise ValueError("NLP engine is not loaded. Consider calling .load()")

texts = (str(text) for text in texts)
docs = self.nlp[language].pipe(texts, as_tuples=as_tuples)
docs = self.nlp[language].pipe(texts,
as_tuples=as_tuples,
batch_size=batch_size)
for doc in docs:
yield doc.text, self._doc_to_nlp_artifact(doc, language)

Expand Down
12 changes: 12 additions & 0 deletions presidio-analyzer/tests/test_batch_analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,15 @@ def test_analyze_dict_with_nones_returns_empty_result(batch_analyzer_engine_simp
assert len(res) == len(input_list)
for r in res:
assert not r

def test_batch_analyze_iterator_returns_list_of_recognizer_results(
    batch_analyzer_engine_simple
):
    """Verify analyze_iterator with batch_size returns one result list per input text."""
    texts = [
        "My name is David",
        "Call me at 2352351232",
        "I was born at 1/5/1922",
    ]
    # Only the middle text contains a detectable entity (a phone number).
    phone_result = RecognizerResult(
        entity_type="PHONE_NUMBER", start=11, end=21, score=0.4
    )
    expected_output = [[], [phone_result], []]

    results = batch_analyzer_engine_simple.analyze_iterator(
        texts=texts, language="en", batch_size=2
    )

    assert len(results) == len(expected_output)
    for actual, expected in zip(results, expected_output):
        assert actual == expected

0 comments on commit 4aeb56b

Please sign in to comment.