|
16 | 16 | # under the License. |
17 | 17 |
|
18 | 18 | import codecs |
| 19 | +import decimal |
| 20 | +from functools import partial |
| 21 | +import itertools |
19 | 22 | import os |
20 | 23 | import sys |
21 | 24 | import unicodedata |
22 | 25 |
|
23 | 26 | import numpy as np |
24 | 27 |
|
| 28 | +import pyarrow as pa |
| 29 | + |
25 | 30 |
|
26 | 31 | KILOBYTE = 1 << 10 |
27 | 32 | MEGABYTE = KILOBYTE * KILOBYTE |
28 | 33 |
|
| 34 | +DEFAULT_NONE_PROB = 0.3 |
| 35 | + |
29 | 36 |
|
30 | 37 | def _multiplicate_sequence(base, target_size): |
31 | 38 | q, r = divmod(target_size, len(base)) |
@@ -97,3 +104,248 @@ def get_random_unicode(n, *, seed=42): |
97 | 104 | result = ''.join(unicode_arr.tolist()) |
98 | 105 | assert len(result) == n, (len(result), len(unicode_arr)) |
99 | 106 | return result |
| 107 | + |
| 108 | + |
class BuiltinsGenerator(object):
    """
    Generator of lists of Python builtin objects for benchmarks.

    All random draws come from a private `numpy.random.RandomState`
    seeded with *seed*, so the same sequence of calls on two generators
    created with the same seed consumes the same random numbers.
    """

    def __init__(self, seed=42):
        self.rnd = np.random.RandomState(seed)

    def sprinkle(self, lst, prob, value):
        """
        Sprinkle *value* entries in list *lst* with likelihood *prob*.

        The list is modified in place; each position is independently
        overwritten when its random draw falls below *prob*.
        """
        for i, p in enumerate(self.rnd.random_sample(size=len(lst))):
            if p < prob:
                lst[i] = value

    def sprinkle_nones(self, lst, prob):
        """
        Sprinkle None entries in list *lst* with likelihood *prob*.

        The list is modified in place.
        """
        self.sprinkle(lst, prob, None)

    def generate_int_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of Python ints with *none_prob* probability of
        an entry being None.

        The non-None values are the entry's own index (`0 .. n-1`).
        """
        data = list(range(n))
        self.sprinkle_nones(data, none_prob)
        return data

    def generate_float_list(self, n, none_prob=DEFAULT_NONE_PROB,
                            use_nan=False):
        """
        Generate a list of Python floats with *none_prob* probability of
        an entry being None (or NaN if *use_nan* is true).

        The non-null values are drawn uniformly from [0.0, 1.0).
        """
        # Make sure we get Python floats, not np.float64
        data = list(map(float, self.rnd.uniform(0.0, 1.0, n)))
        assert len(data) == n
        null_value = float('nan') if use_nan else None
        self.sprinkle(data, none_prob, value=null_value)
        return data

    def generate_bool_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of Python bools with *none_prob* probability of
        an entry being None.
        """
        # Make sure we get Python bools, not np.bool_
        data = [bool(x >= 0.5) for x in self.rnd.uniform(0.0, 1.0, n)]
        assert len(data) == n
        self.sprinkle_nones(data, none_prob)
        return data

    def generate_decimal_list(self, n, none_prob=DEFAULT_NONE_PROB,
                              use_nan=False):
        """
        Generate a list of Python Decimals with *none_prob* probability of
        an entry being None (or NaN if *use_nan* is true).

        The non-null values are uniform draws from [0.0, 1.0) formatted
        with 9 fractional digits.
        """
        data = [decimal.Decimal('%.9f' % f)
                for f in self.rnd.uniform(0.0, 1.0, n)]
        assert len(data) == n
        null_value = decimal.Decimal('nan') if use_nan else None
        self.sprinkle(data, none_prob, value=null_value)
        return data

    def generate_object_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of generic Python objects with *none_prob*
        probability of an entry being None.
        """
        data = [object() for _ in range(n)]
        self.sprinkle_nones(data, none_prob)
        return data

    def _generate_varying_sequences(self, random_factory, n, min_size,
                                    max_size, none_prob):
        """
        Generate a list of *n* sequences of varying size between *min_size*
        and *max_size*, with *none_prob* probability of an entry being None.
        The base material for each sequence is obtained by calling
        `random_factory(<some size>)`

        The factory is called once; every sequence is a slice of its
        result taken at a random offset.
        """
        base_size = 10000
        # The extra *max_size* elements guarantee that a slice starting at
        # any offset below *base_size* never runs past the end of *base*.
        base = random_factory(base_size + max_size)
        data = []
        for _ in range(n):
            off = self.rnd.randint(base_size)
            if min_size == max_size:
                # Fixed size: no random draw is consumed for the size.
                size = min_size
            else:
                size = self.rnd.randint(min_size, max_size + 1)
            data.append(base[off:off + size])
        self.sprinkle_nones(data, none_prob)
        assert len(data) == n
        return data

    def generate_fixed_binary_list(self, n, size,
                                   none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of bytestrings with a fixed *size*, with
        *none_prob* probability of an entry being None.
        """
        return self._generate_varying_sequences(get_random_bytes, n,
                                                size, size, none_prob)

    def generate_varying_binary_list(self, n, min_size, max_size,
                                     none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of bytestrings with a random size between
        *min_size* and *max_size*, with *none_prob* probability of an
        entry being None.
        """
        return self._generate_varying_sequences(get_random_bytes, n,
                                                min_size, max_size,
                                                none_prob)

    def generate_ascii_string_list(self, n, min_size, max_size,
                                   none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of ASCII strings with a random size between
        *min_size* and *max_size*, with *none_prob* probability of an
        entry being None.
        """
        return self._generate_varying_sequences(get_random_ascii, n,
                                                min_size, max_size,
                                                none_prob)

    def generate_unicode_string_list(self, n, min_size, max_size,
                                     none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of unicode strings with a random size between
        *min_size* and *max_size*, with *none_prob* probability of an
        entry being None.
        """
        return self._generate_varying_sequences(get_random_unicode, n,
                                                min_size, max_size,
                                                none_prob)

    def generate_int_list_list(self, n, min_size, max_size,
                               none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of lists of Python ints with a random size between
        *min_size* and *max_size*.

        *none_prob* applies both to the ints inside each inner list and
        to the inner lists themselves.
        """
        return self._generate_varying_sequences(
            partial(self.generate_int_list, none_prob=none_prob),
            n, min_size, max_size, none_prob)

    def generate_tuple_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of tuples with random values.
        Each tuple has the form `(int value, float value, bool value)`

        The tuples are derived from `generate_dict_list`; a key missing
        from a dict becomes None in the tuple, and a None dict stays None.
        """
        dicts = self.generate_dict_list(n, none_prob=none_prob)
        tuples = [(d.get('u'), d.get('v'), d.get('w'))
                  if d is not None else None
                  for d in dicts]
        assert len(tuples) == n
        return tuples

    def generate_dict_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of dicts with random values.
        Each dict has the form
        `{'u': int value, 'v': float value, 'w': bool value}`

        A None field value is alternately stored explicitly or left out
        of the dict, and whole dicts are replaced by None with
        *none_prob* probability.
        """
        ints = self.generate_int_list(n, none_prob=none_prob)
        floats = self.generate_float_list(n, none_prob=none_prob)
        bools = self.generate_bool_list(n, none_prob=none_prob)
        dicts = []
        # Keep half the Nones, omit the other half
        keep_nones = itertools.cycle([True, False])
        for u, v, w in zip(ints, floats, bools):
            d = {}
            # `next(keep_nones)` is only evaluated for None values, so the
            # keep/omit alternation advances once per None encountered.
            if u is not None or next(keep_nones):
                d['u'] = u
            if v is not None or next(keep_nones):
                d['v'] = v
            if w is not None or next(keep_nones):
                d['w'] = w
            dicts.append(d)
        self.sprinkle_nones(dicts, none_prob)
        assert len(dicts) == n
        return dicts

    def get_type_and_builtins(self, n, type_name):
        """
        Return a `(arrow type, list)` tuple where the arrow type
        corresponds to the given logical *type_name*, and the list
        is a list of *n* random-generated Python objects compatible
        with the arrow type.

        Recognized names are 'bool', 'decimal', 'ascii', 'unicode',
        'int64 list', 'struct from tuples', 'binary' (variable size),
        'binary<N>' (fixed size N > 0), and any name starting with
        'int', 'uint', 'float' or 'struct'.  Int and float names are
        looked up as factory functions on the `pa` module.

        Raises ValueError if *type_name* is not recognized or if a fixed
        binary size is not a positive integer.
        """
        size = None

        # 'struct from tuples' must be matched before the generic
        # 'struct' prefix test below, otherwise it would be treated as a
        # plain struct and its tuple factory would never be selected.
        if type_name in ('bool', 'decimal', 'ascii', 'unicode',
                         'int64 list', 'struct from tuples'):
            kind = type_name
        elif type_name.startswith(('int', 'uint')):
            kind = 'int'
        elif type_name.startswith('float'):
            kind = 'float'
        elif type_name.startswith('struct'):
            kind = 'struct'
        elif type_name == 'binary':
            kind = 'varying binary'
        elif type_name.startswith('binary'):
            kind = 'fixed binary'
            # The byte width follows the 'binary' prefix, e.g. 'binary10'.
            size = int(type_name[6:])
            if size <= 0:
                raise ValueError("invalid fixed binary size in type %r"
                                 % (type_name,))
        else:
            raise ValueError("unrecognized type %r" % (type_name,))

        if kind in ('int', 'float'):
            ty = getattr(pa, type_name)()
        elif kind == 'bool':
            ty = pa.bool_()
        elif kind == 'decimal':
            ty = pa.decimal128(9, 9)
        elif kind == 'fixed binary':
            ty = pa.binary(size)
        elif kind == 'varying binary':
            ty = pa.binary()
        elif kind in ('ascii', 'unicode'):
            ty = pa.string()
        elif kind == 'int64 list':
            ty = pa.list_(pa.int64())
        elif kind in ('struct', 'struct from tuples'):
            # Dicts and tuples share the same struct layout.
            ty = pa.struct([pa.field('u', pa.int64()),
                            pa.field('v', pa.float64()),
                            pa.field('w', pa.bool_())])

        factories = {
            'int': self.generate_int_list,
            'float': self.generate_float_list,
            'bool': self.generate_bool_list,
            'decimal': self.generate_decimal_list,
            'fixed binary': partial(self.generate_fixed_binary_list,
                                    size=size),
            'varying binary': partial(self.generate_varying_binary_list,
                                      min_size=3, max_size=40),
            'ascii': partial(self.generate_ascii_string_list,
                             min_size=3, max_size=40),
            'unicode': partial(self.generate_unicode_string_list,
                               min_size=3, max_size=40),
            'int64 list': partial(self.generate_int_list_list,
                                  min_size=0, max_size=20),
            'struct': self.generate_dict_list,
            'struct from tuples': self.generate_tuple_list,
        }
        data = factories[kind](n)
        return ty, data
0 commit comments