Skip to content

Commit 109419b

Browse files
committed
checkpoint
1 parent ed653c1 commit 109419b

File tree

2 files changed

+225
-186
lines changed

2 files changed

+225
-186
lines changed

patsy/build.py

Lines changed: 25 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# This file is part of Patsy
2-
# Copyright (C) 2011-2012 Nathaniel Smith <njs@pobox.com>
2+
# Copyright (C) 2011-2013 Nathaniel Smith <njs@pobox.com>
33
# See file COPYING for license information.
44

55
# This file defines the core design matrix building functions.
@@ -10,7 +10,9 @@
1010

1111
import numpy as np
1212
from patsy import PatsyError
13-
from patsy.categorical import CategoricalTransform, Categorical
13+
from patsy.categorical import (guess_categorical,
14+
CatLevelSniffer,
15+
categorical_to_int)
1416
from patsy.util import (atleast_2d_column_default,
1517
have_pandas, have_pandas_categorical,
1618
asarray_or_pandas)
@@ -53,44 +55,6 @@ def test__max_allowed_dim():
5355
_max_allowed_dim(2, np.array([[1]]), f)
5456
assert_raises(PatsyError, _max_allowed_dim, 2, np.array([[[1]]]), f)
5557

56-
class _BoolToCat(object):
57-
def __init__(self, factor):
58-
self.factor = factor
59-
60-
def memorize_finish(self):
61-
pass
62-
63-
def levels(self):
64-
return (False, True)
65-
66-
def transform(self, data):
67-
data = asarray_or_pandas(data)
68-
_max_allowed_dim(1, data, self.factor)
69-
# issubdtype(int, bool) is true! So we can't use it:
70-
if not data.dtype.kind == "b":
71-
raise PatsyError("factor %s, which I thought was boolean, "
72-
"gave non-boolean data of dtype %s"
73-
% (self.factor.name(), data.dtype),
74-
self.factor)
75-
return Categorical(data, levels=[False, True])
76-
77-
def test__BoolToCat():
78-
from nose.tools import assert_raises
79-
f = _MockFactor()
80-
btc = _BoolToCat(f)
81-
cat = btc.transform([True, False, True, True])
82-
assert cat.levels == (False, True)
83-
assert np.issubdtype(cat.int_array.dtype, int)
84-
assert np.all(cat.int_array == [1, 0, 1, 1])
85-
assert_raises(PatsyError, btc.transform, [1, 0, 1])
86-
assert_raises(PatsyError, btc.transform, ["a", "b"])
87-
assert_raises(PatsyError, btc.transform, [[True]])
88-
if have_pandas:
89-
pandas_cat = btc.transform(pandas.Series([True, False, True],
90-
index=[10, 20, 30]))
91-
assert np.array_equal(pandas_cat.int_array, [1, 0, 1])
92-
assert np.array_equal(pandas_cat.int_array.index, [10, 20, 30])
93-
9458
class _NumFactorEvaluator(object):
9559
def __init__(self, factor, state, expected_columns):
9660
# This one instance variable is part of our public API:
@@ -165,24 +129,17 @@ def test__NumFactorEvaluator():
165129

166130

167131
class _CatFactorEvaluator(object):
168-
def __init__(self, factor, state, postprocessor, expected_levels):
132+
def __init__(self, factor, state, levels):
169133
# This one instance variable is part of our public API:
170134
self.factor = factor
171135
self._state = state
172-
self._postprocessor = postprocessor
173-
self._expected_levels = tuple(expected_levels)
136+
self._levels = tuple(levels)
174137

175138
def eval(self, data):
176139
# returns either a 2d ndarray or a DataFrame
177140
result = self.factor.eval(self._state, data)
178-
result = self._postprocessor.transform(result)
179-
if not isinstance(result, Categorical):
180-
msg = ("when evaluating categoric factor %r, I got a "
181-
"result that is not of type Categorical (but rather %s)"
182-
# result.__class__.__name__ would be better, but not
183-
# defined for old-style classes:
184-
% (self.factor.name(), result.__class__))
185-
raise PatsyError(msg, self.factor)
141+
# XX FIXME: use the real NA action
142+
result = categorical_to_int(result, self._levels, NAAction())
186143
if result.levels != self._expected_levels:
187144
msg = ("when evaluating categoric factor %r, I got Categorical "
188145
"data with unexpected levels (wanted %s, got %s)"
@@ -431,11 +388,11 @@ def __call__(self):
431388
}
432389
assert factor_states == expected
433390

434-
def _examine_factor_types(factors, factor_states, data_iter_maker):
391+
def _examine_factor_types(factors, factor_states, data_iter_maker,
392+
NA_action):
435393
num_column_counts = {}
436394
cat_levels_contrasts = {}
437-
cat_postprocessors = {}
438-
prefinished_postprocessors = {}
395+
cat_level_sniffers = {}
439396
examine_needed = set(factors)
440397
for data in data_iter_maker():
441398
# We might have gathered all the information we need after the first
@@ -445,54 +402,23 @@ def _examine_factor_types(factors, factor_states, data_iter_maker):
445402
break
446403
for factor in list(examine_needed):
447404
value = factor.eval(factor_states[factor], data)
448-
if (have_pandas_categorical
449-
and isinstance(value, pandas.Categorical)):
450-
value = Categorical.from_pandas_categorical(value)
451-
# fall through into the next 'if':
452-
if isinstance(value, Categorical):
453-
postprocessor = CategoricalTransform(levels=value.levels)
454-
prefinished_postprocessors[factor] = postprocessor
455-
cat_levels_contrasts[factor] = (value.levels,
456-
value.contrast)
457-
examine_needed.remove(factor)
458-
continue
459-
value = atleast_2d_column_default(value)
460-
_max_allowed_dim(2, value, factor)
461-
if np.issubdtype(value.dtype, np.number):
405+
if factor in cat_level_sniffers or guess_categorical(value):
406+
if factor not in cat_level_sniffers:
407+
cat_level_sniffers[factor] = CatLevelSniffer(NA_action)
408+
done = cat_level_sniffers[factor].sniff_levels(value)
409+
if done:
410+
levels = cat_level_sniffers.pop(factor).levels()
411+
contrast = getattr(value, "contrast", None)
412+
cat_levels_contrasts[factor] = (levels, contrast)
413+
examine_needed.remove(factor)
414+
else:
415+
# Numeric
416+
value = atleast_2d_column_default(value)
417+
_max_allowed_dim(2, value, factor)
462418
column_count = value.shape[1]
463419
num_column_counts[factor] = column_count
464420
examine_needed.remove(factor)
465-
# issubdtype(X, bool) isn't reliable -- it returns true for
466-
# X == int! So check the kind code instead:
467-
elif value.dtype.kind == "b":
468-
# Special case: give it a transformer, but don't bother
469-
# processing the rest of the data
470-
if value.shape[1] > 1:
471-
msg = ("factor '%s' evaluates to a boolean array with "
472-
"%s columns; I can only handle single-column "
473-
"boolean arrays" % (factor.name(), value.shape[1]))
474-
raise PatsyError(msg, factor)
475-
cat_postprocessors[factor] = _BoolToCat(factor)
476-
examine_needed.remove(factor)
477-
else:
478-
if value.shape[1] > 1:
479-
msg = ("factor '%s' appears to be categorical but has "
480-
"%s columns; I can only handle single-column "
481-
"categorical factors"
482-
% (factor.name(), value.shape[1]))
483-
raise PatsyError(msg, factor)
484-
if factor not in cat_postprocessors:
485-
cat_postprocessors[factor] = CategoricalTransform()
486-
processor = cat_postprocessors[factor]
487-
processor.memorize_chunk(value)
488-
for factor, processor in cat_postprocessors.iteritems():
489-
processor.memorize_finish()
490-
cat_levels_contrasts[factor] = (processor.levels(), None)
491-
cat_postprocessors.update(prefinished_postprocessors)
492-
assert set(cat_postprocessors) == set(cat_levels_contrasts)
493-
return (num_column_counts,
494-
cat_levels_contrasts,
495-
cat_postprocessors)
421+
return (num_column_counts, cat_levels_contrasts)
496422

497423
def test__examine_factor_types():
498424
class MockFactor(object):

0 commit comments

Comments
 (0)