11# This file is part of Patsy
2- # Copyright (C) 2011-2012 Nathaniel Smith <njs@pobox.com>
2+ # Copyright (C) 2011-2013 Nathaniel Smith <njs@pobox.com>
33# See file COPYING for license information.
44
55# This file defines the core design matrix building functions.
1010
1111import numpy as np
1212from patsy import PatsyError
13- from patsy .categorical import CategoricalTransform , Categorical
13+ from patsy .categorical import (guess_categorical ,
14+ CatLevelSniffer ,
15+ categorical_to_int )
1416from patsy .util import (atleast_2d_column_default ,
1517 have_pandas , have_pandas_categorical ,
1618 asarray_or_pandas )
@@ -53,44 +55,6 @@ def test__max_allowed_dim():
5355 _max_allowed_dim (2 , np .array ([[1 ]]), f )
5456 assert_raises (PatsyError , _max_allowed_dim , 2 , np .array ([[[1 ]]]), f )
5557
56- class _BoolToCat (object ):
57- def __init__ (self , factor ):
58- self .factor = factor
59-
60- def memorize_finish (self ):
61- pass
62-
63- def levels (self ):
64- return (False , True )
65-
66- def transform (self , data ):
67- data = asarray_or_pandas (data )
68- _max_allowed_dim (1 , data , self .factor )
69- # issubdtype(int, bool) is true! So we can't use it:
70- if not data .dtype .kind == "b" :
71- raise PatsyError ("factor %s, which I thought was boolean, "
72- "gave non-boolean data of dtype %s"
73- % (self .factor .name (), data .dtype ),
74- self .factor )
75- return Categorical (data , levels = [False , True ])
76-
77- def test__BoolToCat ():
78- from nose .tools import assert_raises
79- f = _MockFactor ()
80- btc = _BoolToCat (f )
81- cat = btc .transform ([True , False , True , True ])
82- assert cat .levels == (False , True )
83- assert np .issubdtype (cat .int_array .dtype , int )
84- assert np .all (cat .int_array == [1 , 0 , 1 , 1 ])
85- assert_raises (PatsyError , btc .transform , [1 , 0 , 1 ])
86- assert_raises (PatsyError , btc .transform , ["a" , "b" ])
87- assert_raises (PatsyError , btc .transform , [[True ]])
88- if have_pandas :
89- pandas_cat = btc .transform (pandas .Series ([True , False , True ],
90- index = [10 , 20 , 30 ]))
91- assert np .array_equal (pandas_cat .int_array , [1 , 0 , 1 ])
92- assert np .array_equal (pandas_cat .int_array .index , [10 , 20 , 30 ])
93-
9458class _NumFactorEvaluator (object ):
9559 def __init__ (self , factor , state , expected_columns ):
9660 # This one instance variable is part of our public API:
@@ -165,24 +129,17 @@ def test__NumFactorEvaluator():
165129
166130
167131class _CatFactorEvaluator (object ):
168- def __init__ (self , factor , state , postprocessor , expected_levels ):
132+ def __init__ (self , factor , state , levels ):
169133 # This one instance variable is part of our public API:
170134 self .factor = factor
171135 self ._state = state
172- self ._postprocessor = postprocessor
173- self ._expected_levels = tuple (expected_levels )
136+ self ._levels = tuple (levels )
174137
175138 def eval (self , data ):
176139 # returns either a 2d ndarray or a DataFrame
177140 result = self .factor .eval (self ._state , data )
178- result = self ._postprocessor .transform (result )
179- if not isinstance (result , Categorical ):
180- msg = ("when evaluating categoric factor %r, I got a "
181- "result that is not of type Categorical (but rather %s)"
182- # result.__class__.__name__ would be better, but not
183- # defined for old-style classes:
184- % (self .factor .name (), result .__class__ ))
185- raise PatsyError (msg , self .factor )
141+ # XX FIXME: use the real NA action
142+ result = categorical_to_int (result , self ._levels , NAAction ())
186143 if result .levels != self ._expected_levels :
187144 msg = ("when evaluating categoric factor %r, I got Categorical "
188145 "data with unexpected levels (wanted %s, got %s)"
@@ -431,11 +388,11 @@ def __call__(self):
431388 }
432389 assert factor_states == expected
433390
434- def _examine_factor_types (factors , factor_states , data_iter_maker ):
391+ def _examine_factor_types (factors , factor_states , data_iter_maker ,
392+ NA_action ):
435393 num_column_counts = {}
436394 cat_levels_contrasts = {}
437- cat_postprocessors = {}
438- prefinished_postprocessors = {}
395+ cat_level_sniffers = {}
439396 examine_needed = set (factors )
440397 for data in data_iter_maker ():
441398 # We might have gathered all the information we need after the first
@@ -445,54 +402,23 @@ def _examine_factor_types(factors, factor_states, data_iter_maker):
445402 break
446403 for factor in list (examine_needed ):
447404 value = factor .eval (factor_states [factor ], data )
448- if (have_pandas_categorical
449- and isinstance (value , pandas .Categorical )):
450- value = Categorical .from_pandas_categorical (value )
451- # fall through into the next 'if':
452- if isinstance (value , Categorical ):
453- postprocessor = CategoricalTransform (levels = value .levels )
454- prefinished_postprocessors [factor ] = postprocessor
455- cat_levels_contrasts [factor ] = (value .levels ,
456- value .contrast )
457- examine_needed .remove (factor )
458- continue
459- value = atleast_2d_column_default (value )
460- _max_allowed_dim (2 , value , factor )
461- if np .issubdtype (value .dtype , np .number ):
405+ if factor in cat_level_sniffers or guess_categorical (value ):
406+ if factor not in cat_level_sniffers :
407+ cat_level_sniffers [factor ] = CatLevelSniffer (NA_action )
408+ done = cat_level_sniffers [factor ].sniff_levels (value )
409+ if done :
410+ levels = cat_level_sniffers .pop (factor ).levels ()
411+ contrast = getattr (value , "contrast" , None )
412+ cat_levels_contrasts [factor ] = (levels , contrast )
413+ examine_needed .remove (factor )
414+ else :
415+ # Numeric
416+ value = atleast_2d_column_default (value )
417+ _max_allowed_dim (2 , value , factor )
462418 column_count = value .shape [1 ]
463419 num_column_counts [factor ] = column_count
464420 examine_needed .remove (factor )
465- # issubdtype(X, bool) isn't reliable -- it returns true for
466- # X == int! So check the kind code instead:
467- elif value .dtype .kind == "b" :
468- # Special case: give it a transformer, but don't bother
469- # processing the rest of the data
470- if value .shape [1 ] > 1 :
471- msg = ("factor '%s' evaluates to a boolean array with "
472- "%s columns; I can only handle single-column "
473- "boolean arrays" % (factor .name (), value .shape [1 ]))
474- raise PatsyError (msg , factor )
475- cat_postprocessors [factor ] = _BoolToCat (factor )
476- examine_needed .remove (factor )
477- else :
478- if value .shape [1 ] > 1 :
479- msg = ("factor '%s' appears to be categorical but has "
480- "%s columns; I can only handle single-column "
481- "categorical factors"
482- % (factor .name (), value .shape [1 ]))
483- raise PatsyError (msg , factor )
484- if factor not in cat_postprocessors :
485- cat_postprocessors [factor ] = CategoricalTransform ()
486- processor = cat_postprocessors [factor ]
487- processor .memorize_chunk (value )
488- for factor , processor in cat_postprocessors .iteritems ():
489- processor .memorize_finish ()
490- cat_levels_contrasts [factor ] = (processor .levels (), None )
491- cat_postprocessors .update (prefinished_postprocessors )
492- assert set (cat_postprocessors ) == set (cat_levels_contrasts )
493- return (num_column_counts ,
494- cat_levels_contrasts ,
495- cat_postprocessors )
421+ return (num_column_counts , cat_levels_contrasts )
496422
497423def test__examine_factor_types ():
498424 class MockFactor (object ):
0 commit comments