-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcore.py
More file actions
340 lines (289 loc) · 11.6 KB
/
core.py
File metadata and controls
340 lines (289 loc) · 11.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# -*- coding: utf-8 -*-
"""Main module."""
from collections import Counter
from typing import Union
from pandas import DataFrame, Series
from math import log
from tqdm import tqdm
from networkx import DiGraph
from collections import namedtuple
# Lightweight result record: `index` identifies a record, `score` is the
# similarity value attached to it.
ScoredElement = namedtuple('ScoredElement', ('index', 'score'))

# Alias for the pandas containers used in the type annotations of this module.
PandasObj = Union[DataFrame, Series]
class Disamby(object):
    """
    Class for disambiguation fitting, scoring and ranking of potential matches

    A `Disamby` instance stores the pre-processing pipeline applied to the
    strings for a given field as well as the computed frequencies from the
    entire corpus of strings to match against.

    `Disamby` can be instantiated either with no arguments, or with a list of
    strings, a pandas.Series or a pandas.DataFrame. Passing data triggers an
    immediate call to the `fit` method, whose doc explains the parameters.

    Examples
    --------
    >>> import pandas as pd
    >>> import disamby.preprocessors as pre
    >>> df = pd.DataFrame(
    ...     {'a': ['Luca Georger', 'Luca Geroger', 'Adrian Sulzer'],
    ...      'b': ['Mira, 34, Augsburg', 'Miri, 34, Augsburg', 'Milano, 34']
    ...      }, index=['L1', 'L2', 'O1']
    ... )
    >>> pipeline = [
    ...     pre.normalize_whitespace,
    ...     pre.remove_punctuation,
    ...     pre.trigram
    ... ]
    >>> dis = Disamby(df, pipeline)
    >>> dis.disambiguated_sets(threshold=0.5, verbose=False)
    [{'L2', 'L1'}, {'O1'}]
    """

    def __init__(self, data: PandasObj=None, preprocessors: list=None, field: str=None):
        """
        Create the (empty) lookup tables and optionally fit `data` right away.

        Parameters
        ----------
        data : pandas.DataFrame, pandas.Series or list of strings (optional)
            when given, `fit(data, preprocessors, field)` is called
        preprocessors : list (optional)
            pipeline of functions, required whenever `data` is given
        field : str (optional)
            name of the field, forwarded to `fit`

        Raises
        ------
        ValueError
            if `data` is given without `preprocessors`
        """
        # All tables below are keyed by field name.
        self.field_freq = {}              # field -> Counter of token occurrences
        self.preprocessors = {}           # field -> pipeline used to fit the field
        self.records = {}                 # field -> {record index: raw string}
        self._processed_token_cache = {}  # field -> {raw string: set of tokens}
        self._most_common = {}            # field -> count of the most frequent token
        self._token_to_instance = {}      # field -> {token: {(index, raw string)}}
        if data is not None:
            if preprocessors is None:
                raise ValueError("Preprocessor not provided")
            self.fit(data, preprocessors, field)

    def fit(self, data: PandasObj, preprocessors: list, field: str=None):
        """
        Computes the frequencies of the terms by field.

        Fitting a field that was fitted before replaces everything stored for
        that field.

        Parameters
        ----------
        data : pandas.DataFrame, pandas.Series or list of strings
            if a dataframe is given every column is fitted as its own field,
            named after the column (`field` is ignored in that case)
        preprocessors : list
            list of functions to apply in that order
            note the first function must accept a string, the other functions
            must be such that a pipeline is possible; the result is a tuple of
            strings.
        field : str
            string identifying which field this data belongs to; defaults to
            the name of the series, mandatory for a plain list

        Raises
        ------
        KeyError
            if the dataframe has duplicated column names, or if `data` has no
            `name` attribute and `field` was not provided
        ValueError
            if the first item of a field is not a `str`

        Examples
        --------
        >>> import pandas as pd
        >>> from disamby.preprocessors import split_words
        >>> df = pd.DataFrame(
        ...     {'a': ['Luca Georger', 'Luke Geroge', 'Adrian Sulzer'],
        ...      'b': ['Mira, 34, Augsburg', 'Miri, 32', 'Milano, 34']
        ...      })
        >>> dis = Disamby()
        >>> prep = [split_words]
        >>> dis.fit(df, prep)
        """
        # Only the attribute access decides between "table" and "single
        # field"; keep the try body minimal so nothing else is swallowed.
        try:
            columns = data.columns
        except AttributeError:
            self._fit_field(data, preprocessors=preprocessors, field=field)
            return None
        if len(set(columns)) != len(columns):
            raise KeyError("Some columns have identical names")
        for col in columns:
            self._fit_field(data[col], preprocessors=preprocessors)

    def find(self, idx, threshold=0.0, weights: dict=None, **kwargs) -> list:
        """
        returns the list of scored instances which have a score above the
        threshold. Note that strings which do not share any token are omitted
        since their score is 0 by default.

        Parameters
        ----------
        idx
            index of the record to find
        threshold : float
            minimum total (weighted) score a candidate needs to be returned
        weights : dict
            maps every fitted field to the weight of its score in the total;
            defaults to equal weights summing to 1
        kwargs :
            `smoother` and `offset`, see `id_potential`

        Returns
        -------
        list
            list of `ScoredElement(index, score)`
        """
        fields = self.fields
        if weights is None:
            weights = {f: 1 / len(fields) for f in fields}

        scored_candidates = {}
        for field in fields:
            token_cache = self._processed_token_cache[field]
            token_index = self._token_to_instance[field]
            own_tokens = token_cache[self.records[field][idx]]

            # every record sharing at least one token with the query
            candidates = set()
            for token in own_tokens:
                candidates |= token_index[token]
            if not candidates:
                continue

            # The token weights depend only on the query, so compute them once
            # per field instead of once per candidate (what `score` would do).
            id_weights = self.id_potential(own_tokens, field, **kwargs)
            field_weight = weights[field]
            for candidate_idx, candidate_term in candidates:
                score = sum(id_weights.get(tok, 0)
                            for tok in token_cache[candidate_term])
                per_field = scored_candidates.setdefault(candidate_idx, {})
                per_field[field] = score * field_weight

        # compute weighted score
        final_candidates = []
        for candidate_idx, scores in scored_candidates.items():
            total_score = sum(scores.values())
            if total_score >= threshold:
                final_candidates.append(ScoredElement(candidate_idx, total_score))
        return final_candidates

    def _fit_field(self, data: PandasObj, preprocessors: list=None, field: str=None):
        """
        Tokenizes one field and stores its records, tokens and frequencies.

        The lookup tables are built locally and only stored on the instance
        once the whole field was processed, so a failure does not leave a
        half-registered field behind. Existing data of `field` is replaced.

        Parameters
        ----------
        data : pandas.Series or list of strings
        preprocessors : list
            pipeline handed to `pre_process` for every string
        field : str
            name of the field, defaults to `data.name`

        Raises
        ------
        KeyError
            if `field` is not given and `data` has no `name` attribute
        ValueError
            if the first item of `data` is not a `str`
        """
        if field is None:
            try:
                field = data.name
            except AttributeError:  # was not a pandas.Series
                raise KeyError("The provided data are not a pandas Series, "
                               "if the data is a list you need to provide the"
                               "`field` argument.")
        if not hasattr(data, 'iloc'):
            # plain sequences (e.g. a list of strings) get a positional index
            data = Series(data)
        # only the first item is inspected; empty data has nothing to check
        if len(data) and not isinstance(data.iloc[0], str):
            raise ValueError('type of field/column "%s" is not `str`' % field)

        token_cache = {}
        token_index = {}
        records = {}
        counter = Counter()
        for i, name in data.items():
            norm_tokens = self.pre_process(name, preprocessors)
            token_cache[name] = norm_tokens
            # tokens are a set: each token counts once per record
            counter.update(norm_tokens)
            records[i] = name
            for token in norm_tokens:
                token_index.setdefault(token, set()).add((i, name))

        top = counter.most_common(1)
        self.preprocessors[field] = preprocessors
        self._processed_token_cache[field] = token_cache
        self._token_to_instance[field] = token_index
        self.records[field] = records
        self._most_common[field] = top[0][1] if top else 0
        self.field_freq[field] = counter

    def _tokens_for(self, term: str, field: str) -> set:
        """Tokens of `term`: from the cache of `field` if present, else computed."""
        cached = self._processed_token_cache.get(field, {}).get(term)
        if cached is not None:
            return cached
        return self.pre_process(term, self.preprocessors[field])

    def score(self, term: str, other_term: str, field: str,
              smoother=None, offset=0) -> float:
        """
        Computes the score between the two strings using the frequency data

        Parameters
        ----------
        term : str
            term to search for
        other_term : str
            the other term to compare to
        field : str
            the name of the column to which this term belongs
        smoother : str (optional)
            one of {None, 'offset', 'log'}
        offset : int
            offset to add to count only needed for smoothers 'log' and 'offset'

        Returns
        -------
        float
            sum of the normalized weights of the tokens of `term` which also
            occur in `other_term`

        Raises
        ------
        KeyError
            if a term is not cached and `field` has no preprocessors, or if
            `smoother` is unknown

        Notes
        -----
        The score is not commutative (i.e. score(A,B) != score(B,A))
        """
        own_parts = self._tokens_for(term, field)
        other_parts = self._tokens_for(other_term, field)
        # get list of potential scores
        weights = self.id_potential(own_parts, field, smoother, offset)
        return sum(weights.get(tok, 0) for tok in other_parts)

    def id_potential(self, term: Union[tuple, str], field: str,
                     smoother: str = None, offset=0) -> dict:
        """
        Computes the weights of the words based on the observed frequency
        and normalized.

        Parameters
        ----------
        term : str, tuple
            term to look for or a tuple of proper tokens
        field : str
            field the word falls under
        smoother : str (optional)
            one of {None, 'offset', 'log'}
        offset : int
            offset to add to count only needed for smoothers 'log' and 'offset'

        Returns
        -------
        dict
            maps every token to its weight; the weights sum to 1, unless every
            raw weight is 0 in which case all weights are 0

        Raises
        ------
        KeyError
            if `smoother` is not a known smoother or `field` was not fitted
        """
        if isinstance(term, str):
            words = self.pre_process(term, self.preprocessors[field])
        else:
            words = term

        smoothers = {
            None: self._smooth_none,
            'offset': self._smooth_offset,
            'log': self._smooth_log
        }
        if smoother not in smoothers:
            raise KeyError(
                'Chosen smother `{}` is not one of {}'.format(
                    smoother, smoothers.keys())
            )

        counter = self.field_freq[field]
        s_fun = smoothers[smoother]
        max_occ = self._most_common[field]
        id_potentials = {
            word: s_fun(counter[word], offset, max_occ)
            for word in words
        }
        total_weight = sum(id_potentials.values())
        if total_weight == 0:
            # e.g. 'log' smoother with an offset clamping every count to 1:
            # nothing to normalize, avoid dividing by zero
            return {w: 0.0 for w in id_potentials}
        return {w: idp / total_weight for w, idp in id_potentials.items()}

    def alias_graph(self, threshold=0.7, verbose=True, weights=None, **kwargs) -> DiGraph:
        """
        This function creates the directed network connecting an instance to
        another through a directed edge if the target instance has a
        similarity score above the threshold.

        Every record is a node of the graph, also when it has no edge at all.

        Parameters
        ----------
        threshold : float
            between 0 and 1
        verbose : whether to show the progressbar
        weights : dict
            weight of every field, see `find`
        kwargs :
            arguments to pass to the score function (i.e. offset, smoother)

        Returns
        -------
        DiGraph
            edges carry the similarity as the `score` attribute; the graph is
            empty if nothing was fitted yet
        """
        alias_graph = DiGraph()
        if not self.records:
            return alias_graph

        # the record indices are taken from the first fitted field
        a_field = next(iter(self.records))
        indices = self.records[a_field]
        # register all records up front so that those without any match are
        # not lost from the graph
        alias_graph.add_nodes_from(indices)

        for idx in (tqdm(indices) if verbose else indices):
            targets = self.find(idx, threshold=threshold, weights=weights, **kwargs)
            for target in targets:
                alias_graph.add_edge(idx, target.index, score=target.score)
        return alias_graph

    def disambiguated_sets(self, threshold=0.7, verbose=True, weights=None, **kwargs):
        """
        Groups the records into sets of mutually matching instances.

        Parameters
        ----------
        threshold, verbose, weights, kwargs :
            forwarded to `alias_graph`

        Returns
        -------
        list
            the strongly connected components of the alias graph, each one a
            set of record indices
        """
        from networkx import strongly_connected_components
        alias_graph = self.alias_graph(threshold=threshold, verbose=verbose,
                                       weights=weights, **kwargs)
        return list(strongly_connected_components(alias_graph))

    @staticmethod
    def pre_process(base_name, functions: list):
        """apply every function consecutively to base_name, return the set of results"""
        norm_name = base_name
        for f in functions:
            norm_name = f(norm_name)
        return set(norm_name)

    @property
    def fields(self):
        """names of the fitted fields"""
        return self.field_freq.keys()

    @staticmethod
    def _smooth_none(occurrences, *args):
        """inverse of the number of occurrences (at least 1), no smoothing"""
        return 1 / max(occurrences, 1)

    @staticmethod
    def _smooth_offset(occurrences, offset, *args):
        """inverse of the occurrences shifted by `offset` (at least 1)"""
        return 1 / max(occurrences + offset, 1)

    @staticmethod
    def _smooth_log(occ, offset, max_occ):
        """log of the ratio between the shifted maximal count (+1) and the shifted count"""
        max_offset = max(max_occ + offset + 1, 1)
        word_offset = max(occ + offset, 1)
        return log(max_offset / word_offset)