-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathfixed_text_service.py
More file actions
291 lines (248 loc) · 12 KB
/
fixed_text_service.py
File metadata and controls
291 lines (248 loc) · 12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""Text processing service for PII annotation.
Provides synchronous and asynchronous methods for annotating text with personally identifiable information (PII) using SpaCy or regex patterns. Supports chunking long texts and batch processing.
"""
import asyncio
import logging
from typing import Dict, List, Union

from datafog.processing.text_processing.regex_annotator.regex_annotator import (
    RegexAnnotator,
    Span,
)
from datafog.processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
class TextService:
"""
Service for annotating text with PII entities.
This service provides methods to detect and annotate personally identifiable information (PII)
in text using different annotation engines. It supports chunking long texts for efficient processing
and combining annotations from multiple chunks.
"""
def __init__(self, text_chunk_length: int = 1000, engine: str = "auto"):
"""
Initialize the TextService with specified chunk length and annotation engine.
Args:
text_chunk_length: Maximum length of text chunks for processing. Default is 1000 characters.
engine: The annotation engine to use. Options are:
- "regex": Use only the RegexAnnotator for pattern-based entity detection
- "spacy": Use only the SpacyPIIAnnotator for NLP-based entity detection
- "auto": (Default) Try RegexAnnotator first and fall back to SpacyPIIAnnotator if no entities are found
Raises:
AssertionError: If an invalid engine type is provided
"""
assert engine in {"regex", "spacy", "auto"}, "Invalid engine"
self.engine = engine
self.spacy_annotator = SpacyPIIAnnotator.create()
self.regex_annotator = RegexAnnotator()
self.text_chunk_length = text_chunk_length
def _chunk_text(self, text: str) -> List[str]:
"""Split the text into chunks of specified length."""
return [
text[i : i + self.text_chunk_length]
for i in range(0, len(text), self.text_chunk_length)
]
def _combine_annotations(
self, annotations: List[Dict[str, List[str]]]
) -> Dict[str, List[str]]:
"""Combine annotations from multiple chunks."""
combined: Dict[str, List[str]] = {}
for annotation in annotations:
for key, value in annotation.items():
if key not in combined:
combined[key] = []
combined[key].extend(value)
return combined
def _annotate_with_engine(
self, text: str, structured: bool = False
) -> Union[Dict[str, List[str]], List[Span]]:
"""
Annotate text using the selected engine based on the engine parameter.
This method implements the engine selection logic:
- For "regex" mode: Uses only the RegexAnnotator
- For "spacy" mode: Uses only the SpacyPIIAnnotator
- For "auto" mode: Tries RegexAnnotator first and falls back to SpacyPIIAnnotator if no entities are found
Args:
text: The text to annotate
structured: If True, return structured output (list of Span objects)
Returns:
If structured=False: Dictionary of annotations by entity type where keys are entity types (e.g., "EMAIL", "PERSON", "ORG")
and values are lists of detected entities of that type
If structured=True: List of Span objects with entity information
"""
if structured:
# Handle structured output mode
if self.engine == "regex":
_, annotation_result = self.regex_annotator.annotate_with_spans(text)
return annotation_result.spans
elif self.engine == "spacy":
# For spaCy, we need to convert the dictionary format to spans
spacy_dict = self.spacy_annotator.annotate(text)
spacy_spans: List[Span] = []
for label, entities in spacy_dict.items():
for entity in entities:
# Find the start and end positions of the entity in the text
start = text.find(entity)
if start >= 0:
end = start + len(entity)
span = Span(start=start, end=end, label=label, text=entity)
spacy_spans.append(span)
return spacy_spans
else: # "auto" mode
# Try regex first
_, annotation_result = self.regex_annotator.annotate_with_spans(text)
if annotation_result.spans:
return annotation_result.spans
# If regex found nothing, fall back to spaCy
spacy_dict = self.spacy_annotator.annotate(text)
auto_spans: List[Span] = []
for label, entities in spacy_dict.items():
for entity in entities:
# Find the start and end positions of the entity in the text
start = text.find(entity)
if start >= 0:
end = start + len(entity)
span = Span(start=start, end=end, label=label, text=entity)
auto_spans.append(span)
return auto_spans
else:
# Handle legacy dictionary output mode
if self.engine == "regex":
return self.regex_annotator.annotate(text)
elif self.engine == "spacy":
return self.spacy_annotator.annotate(text)
else: # auto mode
# Try regex first
regex_dict = self.regex_annotator.annotate(text)
# Check if any entities were found
has_entities = any(
len(entities) > 0 for entities in regex_dict.values()
)
# If regex found entities, return those results
if has_entities:
return regex_dict
# Otherwise, fall back to spaCy
return self.spacy_annotator.annotate(text)
def annotate_text_sync(
self, text: str, structured: bool = False
) -> Union[Dict[str, List[str]], List[Span]]:
"""
Synchronously annotate a text string.
Args:
text: The text to annotate
structured: If True, return structured output (list of Span objects)
Returns:
If structured=False: Dictionary mapping entity types to lists of strings
If structured=True: List of Span objects with entity information
"""
if not text:
return [] if structured else {}
chunks = self._chunk_text(text)
if structured:
# Handle structured output mode
all_spans: List[Span] = []
chunk_offset = 0 # Track the offset for each chunk in the original text
for chunk in chunks:
# Process each chunk and get spans
chunk_spans = self._annotate_with_engine(chunk, structured=True)
if not isinstance(chunk_spans, list):
continue # Skip if not a list of spans
# Adjust span positions based on chunk offset in the original text
for span in chunk_spans:
if not isinstance(span, Span):
continue # Skip if not a Span object
span.start += chunk_offset
span.end += chunk_offset
# Verify the span text matches the text at the adjusted position
if span.start < len(text) and span.end <= len(text):
span.text = text[span.start : span.end]
all_spans.append(span)
# Update offset for the next chunk
chunk_offset += len(chunk)
print(f"Done processing {text.split()[0]}")
return all_spans
else:
# Handle legacy dictionary output mode
annotations: List[Dict[str, List[str]]] = []
for chunk in chunks:
res = self._annotate_with_engine(chunk)
if isinstance(res, dict):
annotations.append(res)
combined = self._combine_annotations(annotations)
print(f"Done processing {text.split()[0]}")
return combined
def batch_annotate_text_sync(
self, texts: List[str], structured: bool = False
) -> Dict[str, Union[Dict[str, List[str]], List[Span]]]:
"""
Synchronously annotate a list of text input.
Args:
texts: List of text strings to annotate
structured: If True, return structured output (list of Span objects) for each text
Returns:
Dictionary mapping each input text to its annotation result
"""
results = [
self.annotate_text_sync(text, structured=structured) for text in texts
]
return dict(zip(texts, results, strict=True))
async def annotate_text_async(
self, text: str, structured: bool = False
) -> Union[Dict[str, List[str]], List[Span]]:
"""
Asynchronously annotate a text string.
Args:
text: The text to annotate
structured: If True, return structured output (list of Span objects)
Returns:
If structured=False: Dictionary mapping entity types to lists of strings
If structured=True: List of Span objects with entity information
"""
if not text:
return [] if structured else {}
chunks = self._chunk_text(text)
if structured:
# Handle structured output mode asynchronously
all_spans: List[Span] = []
chunk_offset = 0 # Track the offset for each chunk in the original text
for chunk in chunks:
# We can't easily parallelize this due to the need to track offsets sequentially
# In a production environment, you might want a more sophisticated approach
chunk_spans = self._annotate_with_engine(chunk, structured=True)
if not isinstance(chunk_spans, list):
continue # Skip if not a list of spans
# Adjust span positions based on chunk offset in the original text
for span in chunk_spans:
if not isinstance(span, Span):
continue # Skip if not a Span object
span.start += chunk_offset
span.end += chunk_offset
# Verify the span text matches the text at the adjusted position
if span.start < len(text) and span.end <= len(text):
span.text = text[span.start : span.end]
all_spans.append(span)
# Update offset for the next chunk
chunk_offset += len(chunk)
return all_spans
else:
# Handle legacy dictionary output mode asynchronously
tasks = [
asyncio.to_thread(self._annotate_with_engine, chunk) for chunk in chunks
]
results = await asyncio.gather(*tasks)
annotations: List[Dict[str, List[str]]] = [
r for r in results if isinstance(r, dict)
]
return self._combine_annotations(annotations)
async def batch_annotate_text_async(
self, texts: List[str], structured: bool = False
) -> Dict[str, Union[Dict[str, List[str]], List[Span]]]:
"""
Asynchronously annotate a list of text input.
Args:
texts: List of text strings to annotate
structured: If True, return structured output (list of Span objects) for each text
Returns:
Dictionary mapping each input text to its annotation result
"""
tasks = [
self.annotate_text_async(text, structured=structured) for text in texts
]
results = await asyncio.gather(*tasks)
return dict(zip(texts, results, strict=True))