forked from ukosuagwu/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path_encode.py
More file actions
366 lines (289 loc) · 11.1 KB
/
Copy path_encode.py
File metadata and controls
366 lines (289 loc) · 11.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
from contextlib import suppress
from collections import Counter
from typing import NamedTuple
import numpy as np
from . import is_scalar_nan
def _unique(values, *, return_inverse=False, return_counts=False):
"""Helper function to find unique values with support for python objects.
Uses pure python method for object dtype, and numpy method for
all other dtypes.
Parameters
----------
values : ndarray
Values to check for unknowns.
return_inverse : bool, default=False
If True, also return the indices of the unique values.
return_counts : bool, default=False
If True, also return the number of times each unique item appears in
values.
Returns
-------
unique : ndarray
The sorted unique values.
unique_inverse : ndarray
The indices to reconstruct the original array from the unique array.
Only provided if `return_inverse` is True.
unique_counts : ndarray
The number of times each of the unique values comes up in the original
array. Only provided if `return_counts` is True.
"""
if values.dtype == object:
return _unique_python(
values, return_inverse=return_inverse, return_counts=return_counts
)
# numerical
return _unique_np(
values, return_inverse=return_inverse, return_counts=return_counts
)
def _unique_np(values, return_inverse=False, return_counts=False):
"""Helper function to find unique values for numpy arrays that correctly
accounts for nans. See `_unique` documentation for details."""
uniques = np.unique(
values, return_inverse=return_inverse, return_counts=return_counts
)
inverse, counts = None, None
if return_counts:
*uniques, counts = uniques
if return_inverse:
*uniques, inverse = uniques
if return_counts or return_inverse:
uniques = uniques[0]
# np.unique will have duplicate missing values at the end of `uniques`
# here we clip the nans and remove it from uniques
if uniques.size and is_scalar_nan(uniques[-1]):
nan_idx = np.searchsorted(uniques, np.nan)
uniques = uniques[: nan_idx + 1]
if return_inverse:
inverse[inverse > nan_idx] = nan_idx
if return_counts:
counts[nan_idx] = np.sum(counts[nan_idx:])
counts = counts[: nan_idx + 1]
ret = (uniques,)
if return_inverse:
ret += (inverse,)
if return_counts:
ret += (counts,)
return ret[0] if len(ret) == 1 else ret
class MissingValues(NamedTuple):
"""Data class for missing data information"""
nan: bool
none: bool
def to_list(self):
"""Convert tuple to a list where None is always first."""
output = []
if self.none:
output.append(None)
if self.nan:
output.append(np.nan)
return output
def _extract_missing(values):
"""Extract missing values from `values`.
Parameters
----------
values: set
Set of values to extract missing from.
Returns
-------
output: set
Set with missing values extracted.
missing_values: MissingValues
Object with missing value information.
"""
missing_values_set = {
value for value in values if value is None or is_scalar_nan(value)
}
if not missing_values_set:
return values, MissingValues(nan=False, none=False)
if None in missing_values_set:
if len(missing_values_set) == 1:
output_missing_values = MissingValues(nan=False, none=True)
else:
# If there is more than one missing value, then it has to be
# float('nan') or np.nan
output_missing_values = MissingValues(nan=True, none=True)
else:
output_missing_values = MissingValues(nan=True, none=False)
# create set without the missing values
output = values - missing_values_set
return output, output_missing_values
class _nandict(dict):
"""Dictionary with support for nans."""
def __init__(self, mapping):
super().__init__(mapping)
for key, value in mapping.items():
if is_scalar_nan(key):
self.nan_value = value
break
def __missing__(self, key):
if hasattr(self, "nan_value") and is_scalar_nan(key):
return self.nan_value
raise KeyError(key)
def _map_to_integer(values, uniques):
"""Map values based on its position in uniques."""
table = _nandict({val: i for i, val in enumerate(uniques)})
return np.array([table[v] for v in values])
def _unique_python(values, *, return_inverse, return_counts):
# Only used in `_uniques`, see docstring there for details
try:
uniques_set = set(values)
uniques_set, missing_values = _extract_missing(uniques_set)
uniques = sorted(uniques_set)
uniques.extend(missing_values.to_list())
uniques = np.array(uniques, dtype=values.dtype)
except TypeError:
types = sorted(t.__qualname__ for t in set(type(v) for v in values))
raise TypeError(
"Encoders require their input to be uniformly "
f"strings or numbers. Got {types}"
)
ret = (uniques,)
if return_inverse:
ret += (_map_to_integer(values, uniques),)
if return_counts:
ret += (_get_counts(values, uniques),)
return ret[0] if len(ret) == 1 else ret
def _encode(values, *, uniques, check_unknown=True):
"""Helper function to encode values into [0, n_uniques - 1].
Uses pure python method for object dtype, and numpy method for
all other dtypes.
The numpy method has the limitation that the `uniques` need to
be sorted. Importantly, this is not checked but assumed to already be
the case. The calling method needs to ensure this for all non-object
values.
Parameters
----------
values : ndarray
Values to encode.
uniques : ndarray
The unique values in `values`. If the dtype is not object, then
`uniques` needs to be sorted.
check_unknown : bool, default=True
If True, check for values in `values` that are not in `unique`
and raise an error. This is ignored for object dtype, and treated as
True in this case. This parameter is useful for
_BaseEncoder._transform() to avoid calling _check_unknown()
twice.
Returns
-------
encoded : ndarray
Encoded values
"""
if values.dtype.kind in "OUS":
try:
return _map_to_integer(values, uniques)
except KeyError as e:
raise ValueError(f"y contains previously unseen labels: {str(e)}")
else:
if check_unknown:
diff = _check_unknown(values, uniques)
if diff:
raise ValueError(f"y contains previously unseen labels: {str(diff)}")
return np.searchsorted(uniques, values)
def _check_unknown(values, known_values, return_mask=False):
"""
Helper function to check for unknowns in values to be encoded.
Uses pure python method for object dtype, and numpy method for
all other dtypes.
Parameters
----------
values : array
Values to check for unknowns.
known_values : array
Known values. Must be unique.
return_mask : bool, default=False
If True, return a mask of the same shape as `values` indicating
the valid values.
Returns
-------
diff : list
The unique values present in `values` and not in `know_values`.
valid_mask : boolean array
Additionally returned if ``return_mask=True``.
"""
valid_mask = None
if values.dtype.kind in "OUS":
values_set = set(values)
values_set, missing_in_values = _extract_missing(values_set)
uniques_set = set(known_values)
uniques_set, missing_in_uniques = _extract_missing(uniques_set)
diff = values_set - uniques_set
nan_in_diff = missing_in_values.nan and not missing_in_uniques.nan
none_in_diff = missing_in_values.none and not missing_in_uniques.none
def is_valid(value):
return (
value in uniques_set
or missing_in_uniques.none
and value is None
or missing_in_uniques.nan
and is_scalar_nan(value)
)
if return_mask:
if diff or nan_in_diff or none_in_diff:
valid_mask = np.array([is_valid(value) for value in values])
else:
valid_mask = np.ones(len(values), dtype=bool)
diff = list(diff)
if none_in_diff:
diff.append(None)
if nan_in_diff:
diff.append(np.nan)
else:
unique_values = np.unique(values)
diff = np.setdiff1d(unique_values, known_values, assume_unique=True)
if return_mask:
if diff.size:
valid_mask = np.in1d(values, known_values)
else:
valid_mask = np.ones(len(values), dtype=bool)
# check for nans in the known_values
if np.isnan(known_values).any():
diff_is_nan = np.isnan(diff)
if diff_is_nan.any():
# removes nan from valid_mask
if diff.size and return_mask:
is_nan = np.isnan(values)
valid_mask[is_nan] = 1
# remove nan from diff
diff = diff[~diff_is_nan]
diff = list(diff)
if return_mask:
return diff, valid_mask
return diff
class _NaNCounter(Counter):
"""Counter with support for nan values."""
def __init__(self, items):
super().__init__(self._generate_items(items))
def _generate_items(self, items):
"""Generate items without nans. Stores the nan counts separately."""
for item in items:
if not is_scalar_nan(item):
yield item
continue
if not hasattr(self, "nan_count"):
self.nan_count = 0
self.nan_count += 1
def __missing__(self, key):
if hasattr(self, "nan_count") and is_scalar_nan(key):
return self.nan_count
raise KeyError(key)
def _get_counts(values, uniques):
"""Get the count of each of the `uniques` in `values`.
The counts will use the order passed in by `uniques`. For non-object dtypes,
`uniques` is assumed to be sorted and `np.nan` is at the end.
"""
if values.dtype.kind in "OU":
counter = _NaNCounter(values)
output = np.zeros(len(uniques), dtype=np.int64)
for i, item in enumerate(uniques):
with suppress(KeyError):
output[i] = counter[item]
return output
unique_values, counts = _unique_np(values, return_counts=True)
# Recorder unique_values based on input: `uniques`
uniques_in_values = np.isin(uniques, unique_values, assume_unique=True)
if np.isnan(unique_values[-1]) and np.isnan(uniques[-1]):
uniques_in_values[-1] = True
unique_valid_indices = np.searchsorted(unique_values, uniques[uniques_in_values])
output = np.zeros_like(uniques, dtype=np.int64)
output[uniques_in_values] = counts[unique_valid_indices]
return output