-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_disamby.py
More file actions
84 lines (62 loc) · 2.32 KB
/
test_disamby.py
File metadata and controls
84 lines (62 loc) · 2.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `disamby` package."""
import pytest
import pandas as pd
from disamby import Disamby
import disamby.preprocessors as prep
# Preprocessors shared by every test in this module and handed to ``Disamby``.
# The last entry wraps ``prep.ngram`` so that it is always called with a
# second argument of 4 (presumably the n-gram length -- confirm in
# ``disamby.preprocessors``).
pipeline = [
    prep.normalize_whitespace,
    prep.remove_punctuation,
    prep.compact_abbreviations,
    lambda text: prep.ngram(text, 4),
]
@pytest.fixture
def disamby_fitted_instance(fake_names):
    """Return a ``Disamby`` instance fitted on 90 generated names.

    The names are obtained from the ``fake_names`` fixture, wrapped in a
    pandas ``Series`` and fitted under the field name ``'streets'`` using the
    module-level ``pipeline``.
    """
    series = pd.Series(fake_names(90))
    fitted = Disamby()
    fitted.fit(series, pipeline, 'streets')
    return fitted
def test_frequency_counter(disamby_fitted_instance):
    """Check the fitted field is registered and has a populated counter."""
    fitted = disamby_fitted_instance
    assert 'streets' in fitted.fields

    # most_common(1) yields a single (key, count) pair; only the count matters.
    top_entry = fitted.field_freq['streets'].most_common(1)[0]
    top_count = top_entry[1]
    assert top_count >= 1
def test_identification_potential(disamby_fitted_instance):
    """Check the values returned by ``id_potential`` sum to roughly one."""
    tokens = ('st', 'street', 'suite')
    potentials = disamby_fitted_instance.id_potential(tokens, 'streets')
    total = sum(potentials.values())
    assert total == pytest.approx(1)
@pytest.mark.parametrize(
    'smoother,offset,expected',
    [
        (None, 0, 0),
        ('offset', 1000, 0),
        ('log', 10000, 0),
    ],
)
def test_scoring(smoother, offset, expected, disamby_fitted_instance):
    """Check ``score`` for two fixed strings under each smoother setting.

    Every parametrized case expects a score within 0.01 of ``expected``.
    """
    options = {'smoother': smoother, 'offset': offset}
    result = disamby_fitted_instance.score(
        'David Heights', 'Rebecca Shoal Suite', 'streets', **options
    )
    assert result == pytest.approx(expected, abs=.01)
def test_instant_instantiation(company_df):
    """Check that passing data to the constructor fits the instance.

    A 500-row frame from the ``company_df`` fixture is handed straight to
    ``Disamby`` together with the module-level ``pipeline``; afterwards the
    ``'address'`` frequency counter must be populated.
    """
    df = company_df(500)
    dis = Disamby(df, pipeline)
    most_common = dis.field_freq['address'].most_common()
    # Assert on the result so the test also fails when fitting silently
    # produced nothing, not only when one of the calls above raises.
    assert most_common, "no frequencies were recorded for 'address'"
    assert most_common[0][1] >= 1
def test_find(company_df):
    """Check ``find`` for an existing record key with a zero threshold.

    The smallest key of the ``'address'`` records is looked up with weights
    of 0.2 for ``'name'`` and 0.8 for ``'address'``; the best score among the
    results must be approximately 1.
    """
    df = company_df(100)
    dis = Disamby(df, pipeline)
    # min() selects the same element as sorted(...)[0] without sorting (and
    # copying) the whole key collection.
    term = min(dis.records['address'].keys())
    results = dis.find(term, threshold=0, weights={'name': .2, 'address': .8})
    # NOTE(review): 31 presumably depends on the deterministic data produced
    # by the ``company_df`` fixture -- confirm before changing that fixture.
    assert len(results) == 31
    score_of_searched = max(x.score for x in results)
    assert score_of_searched == pytest.approx(1)
def test_disambiguated_tests(company_df):
    """Check the largest set from ``disambiguated_sets`` has two members.

    Uses a 200-row frame, a threshold of 0.7 and weights of 0.99 for
    ``'name'`` and 0.01 for ``'address'``.
    """
    frame = company_df(200)
    disambiguator = Disamby(frame, preprocessors=pipeline)
    field_weights = {'name': .99, 'address': .01}
    groups = disambiguator.disambiguated_sets(
        verbose=True, threshold=.7, weights=field_weights
    )
    largest_group = max(map(len, groups))
    assert largest_group == 2
def test_two_identical_columns(company_df):
    """Check that a frame with a duplicated column makes ``Disamby`` raise.

    The ``'name'`` column is selected twice and the resulting frame is passed
    to the constructor, which is expected to raise ``KeyError``.
    """
    df = company_df(20)
    # Build the duplicated frame outside the ``raises`` block so that only a
    # KeyError coming from ``Disamby`` itself can satisfy the test.
    duplicated = df[['name', 'name']]
    with pytest.raises(KeyError):
        # The instance is never used, so it is not bound to a name.
        Disamby(duplicated, pipeline)