Skip to content

Commit

Permalink
added batching support (#1449)
Browse files Browse the repository at this point in the history
  • Loading branch information
roeybc authored Sep 10, 2024
1 parent 1bf22ed commit 4aeb56b
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 3 deletions.
5 changes: 4 additions & 1 deletion presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,17 @@ def analyze_iterator(
self,
texts: Iterable[Union[str, bool, float, int]],
language: str,
batch_size: Optional[int] = None,
**kwargs,
) -> List[List[RecognizerResult]]:
"""
Analyze an iterable of strings.
:param texts: A list containing strings to be analyzed.
:param language: Input language
:param batch_size: Batch size to process in a single iteration
:param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
(default value depends on the nlp engine implementation)
"""

# validate types
Expand All @@ -43,7 +46,7 @@ def analyze_iterator(
# Process the texts as batch for improved performance
nlp_artifacts_batch: Iterator[Tuple[str, NlpArtifacts]] = (
self.analyzer_engine.nlp_engine.process_batch(
texts=texts, language=language
texts=texts, language=language, batch_size=batch_size
)
)

Expand Down
3 changes: 2 additions & 1 deletion presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
from typing import Iterable, Iterator, List, Tuple
from typing import Iterable, Iterator, List, Optional, Tuple

from presidio_analyzer.nlp_engine import NlpArtifacts

Expand Down Expand Up @@ -29,6 +29,7 @@ def process_batch(
self,
texts: Iterable[str],
language: str,
batch_size: Optional[int] = None,
**kwargs, # noqa ANN003
) -> Iterator[Tuple[str, NlpArtifacts]]:
"""Execute the NLP pipeline on a batch of texts.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def process_batch(
self,
texts: Union[List[str], List[Tuple[str, object]]],
language: str,
batch_size: Optional[int] = None,
as_tuples: bool = False,
) -> Iterator[Optional[NlpArtifacts]]:
"""Execute the NLP pipeline on a batch of texts using spacy pipe.
Expand All @@ -120,7 +121,9 @@ def process_batch(
raise ValueError("NLP engine is not loaded. Consider calling .load()")

texts = (str(text) for text in texts)
docs = self.nlp[language].pipe(texts, as_tuples=as_tuples)
docs = self.nlp[language].pipe(texts,
as_tuples=as_tuples,
batch_size=batch_size)
for doc in docs:
yield doc.text, self._doc_to_nlp_artifact(doc, language)

Expand Down
12 changes: 12 additions & 0 deletions presidio-analyzer/tests/test_batch_analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,15 @@ def test_analyze_dict_with_nones_returns_empty_result(batch_analyzer_engine_simp
assert len(res) == len(input_list)
for r in res:
assert not r

def test_batch_analyze_iterator_returns_list_of_recognizer_results(
    batch_analyzer_engine_simple
):
    """Verify analyze_iterator with batch_size returns one result list per input text."""
    texts = [
        "My name is David",
        "Call me at 2352351232",
        "I was born at 1/5/1922",
    ]
    # Only the middle text contains a detectable entity (a phone number).
    phone_result = RecognizerResult(
        entity_type="PHONE_NUMBER", start=11, end=21, score=0.4
    )
    expected_output = [[], [phone_result], []]

    results = batch_analyzer_engine_simple.analyze_iterator(
        texts=texts, language="en", batch_size=2
    )

    assert len(results) == len(expected_output)
    for actual, expected in zip(results, expected_output):
        assert actual == expected

0 comments on commit 4aeb56b

Please sign in to comment.