Add ability to build a subset of terms.

njsmith · njsmith · commit 2b1740d8f2bb · 2013-08-07T23:24:07.000+01:00
Esp. useful for things like graphing fitted splines.
diff --git a/patsy/build.py b/patsy/build.py
@@ -19,6 +19,7 @@
 from patsy.design_info import DesignMatrix, DesignInfo
 from patsy.redundancy import pick_contrasts_for_term
 from patsy.desc import ModelDesc
+from patsy.eval import EvalEnvironment
 from patsy.contrasts import code_contrast_matrix, Treatment
 from patsy.compat import itertools_product, OrderedDict
 from patsy.missing import NAAction
@@ -718,6 +719,79 @@ def design_info(self):
         return DesignInfo(self._column_names, self._term_slices,
                           builder=self)
 
+    def term_subset_builder(self, which_terms):
+        """Create a new :class:`DesignMatrixBuilder` that includes only a
+        subset of the terms that this object does.
+
+        For example, if `builder` has terms `x`, `y`, and `z`, then::
+
+          builder2 = builder.term_subset_builder(["x", "z"])
+
+        will return a new builder that will return design matrices with only
+        the columns corresponding to the terms `x` and `z`. For example, in
+        general these two expressions will return the same thing::
+
+          build_design_matrices([builder], data)[0][:, [0, 2, 3, 4]]
+          build_design_matrices([builder2], data)[0]
+
+        However, a critical difference is that in the second case, `data` need
+        not contain any values for `y`. This is very useful when doing
+        prediction using a subset of a model, in which situation R usually
+        forces you to specify dummy values for `y`.
+
+        If using a formula to specify the terms to include, remember that like
+        any formula, the intercept term will be included by default, so use
+        `0` or `-1` in your formula if you want to avoid this.
+
+        :arg which_terms: The terms which should be kept in the new
+          :class:`DesignMatrixBuilder`. If this is a string, then it is parsed
+          as a formula, and then the names of the resulting terms are taken as
+          the terms to keep. If it is a list, then it can contain a mixture of
+          term names (as strings) and :class:`Term` objects.
+        """
+        factor_to_evaluators = {}
+        for evaluator in self._evaluators:
+            factor_to_evaluators[evaluator.factor] = evaluator
+        design_info = self.design_info
+        term_name_to_term = dict(zip(design_info.term_names,
+                                     design_info.terms))
+        if isinstance(which_terms, basestring):
+            # We don't use this EvalEnvironment -- all we want to do is to
+            # find matching terms, and we can't do that using == on Term
+            # objects, because that calls == on factor objects, which in turn
+            # compares EvalEnvironments. So all we do with the parsed formula
+            # is pull out the term *names*, which the EvalEnvironment doesn't
+            # affect. This is just a placeholder then to allow the ModelDesc
+            # to be created:
+            env = EvalEnvironment({})
+            desc = ModelDesc.from_formula(which_terms, env)
+            if desc.lhs_termlist:
+                raise PatsyError("right-hand-side-only formula required")
+            which_terms = [term.name() for term in desc.rhs_termlist]
+        terms = []
+        evaluators = set()
+        term_to_column_builders = {}
+        for term_or_name in which_terms:
+            if isinstance(term_or_name, basestring):
+                if term_or_name not in term_name_to_term:
+                    raise PatsyError("requested term %r not found in "
+                                     "this DesignMatrixBuilder"
+                                     % (term_or_name,))
+                term = term_name_to_term[term_or_name]
+            else:
+                term = term_or_name
+            if term not in self._termlist:
+                raise PatsyError("requested term '%s' not found in this "
+                                 "DesignMatrixBuilder" % (term,))
+            for factor in term.factors:
+                evaluators.add(factor_to_evaluators[factor])
+            terms.append(term)
+            column_builder = self._term_to_column_builders[term]
+            term_to_column_builders[term] = column_builder
+        return DesignMatrixBuilder(terms,
+                                   evaluators,
+                                   term_to_column_builders)
+
     def _build(self, evaluator_to_values, dtype):
         factor_to_values = {}
         need_reshape = False
diff --git a/patsy/test_build.py b/patsy/test_build.py
@@ -603,4 +603,61 @@ def test_contrast():
                            [8, -1],
                            [7, 12],
                            [2, 13]])
-    
+
+def test_term_subset_builder():
+    # For each combination of:
+    #   formula, term names, term objects, mixed term name and term objects
+    # check that results match subset of full build
+    # and that removed variables don't hurt
+    all_data = {"x": [1, 2],
+                "y": [[3.1, 3.2],
+                      [4.1, 4.2]],
+                "z": [5, 6]}
+    all_terms = make_termlist("x", "y", "z")
+    def iter_maker():
+        yield all_data
+    all_builder = design_matrix_builders([all_terms], iter_maker)[0]
+    full_matrix = build_design_matrices([all_builder], all_data)[0]
+
+    def t(which_terms, variables, columns):
+        sub_builder = all_builder.term_subset_builder(which_terms)
+        sub_data = {}
+        for variable in variables:
+            sub_data[variable] = all_data[variable]
+        sub_matrix = build_design_matrices([sub_builder], sub_data)[0]
+        sub_full_matrix = full_matrix[:, columns]
+        if not isinstance(which_terms, basestring):
+            assert len(which_terms) == len(sub_builder.design_info.terms)
+        assert np.array_equal(sub_matrix, sub_full_matrix)
+
+    t("~ 0 + x + y + z", ["x", "y", "z"], slice(None))
+    t(["x", "y", "z"], ["x", "y", "z"], slice(None))
+    t([unicode("x"), unicode("y"), unicode("z")],
+      ["x", "y", "z"], slice(None))
+    t(all_terms, ["x", "y", "z"], slice(None))
+    t([all_terms[0], "y", all_terms[2]], ["x", "y", "z"], slice(None))
+
+    t("~ 0 + x + z", ["x", "z"], [0, 3])
+    t(["x", "z"], ["x", "z"], [0, 3])
+    t([unicode("x"), unicode("z")], ["x", "z"], [0, 3])
+    t([all_terms[0], all_terms[2]], ["x", "z"], [0, 3])
+    t([all_terms[0], "z"], ["x", "z"], [0, 3])
+
+    t("~ 0 + z + x", ["x", "z"], [3, 0])
+    t(["z", "x"], ["x", "z"], [3, 0])
+    t([unicode("z"), unicode("x")], ["x", "z"], [3, 0])
+    t([all_terms[2], all_terms[0]], ["x", "z"], [3, 0])
+    t([all_terms[2], "x"], ["x", "z"], [3, 0])
+
+    t("~ 0 + y", ["y"], [1, 2])
+    t(["y"], ["y"], [1, 2])
+    t([unicode("y")], ["y"], [1, 2])
+    t([all_terms[1]], ["y"], [1, 2])
+
+    # Formula can't have a LHS
+    assert_raises(PatsyError, all_builder.term_subset_builder, "a ~ a")
+    # Term must exist
+    assert_raises(PatsyError, all_builder.term_subset_builder, "~ asdf")
+    assert_raises(PatsyError, all_builder.term_subset_builder, ["asdf"])
+    assert_raises(PatsyError,
+                  all_builder.term_subset_builder, [Term(["asdf"])])