diff --git a/patsy/build.py b/patsy/build.py index 470a83d..e739cff 100644 --- a/patsy/build.py +++ b/patsy/build.py @@ -18,6 +18,7 @@ from patsy.util import (atleast_2d_column_default, have_pandas, asarray_or_pandas, safe_issubdtype) +from patsy.desc import INTERCEPT from patsy.design_info import (DesignMatrix, DesignInfo, FactorInfo, SubtermInfo) from patsy.redundancy import pick_contrasts_for_term @@ -570,7 +571,8 @@ def next(self): def _make_subterm_infos(terms, num_column_counts, - cat_levels_contrasts): + cat_levels_contrasts, + implicit_intercept): # Sort each term into a bucket based on the set of numeric factors it # contains: term_buckets = OrderedDict() @@ -601,9 +603,14 @@ def _make_subterm_infos(terms, used_subterms = set() for term in bucket_terms: subterm_infos = [] + # If this bucket is empty, the unique noncategorical subterm is the + # intercept, so it should be skipped in pick_contrasts_for_term + # if implicit_intercept is True. + skip_noncategorical_subterm = implicit_intercept and not bucket factor_codings = pick_contrasts_for_term(term, num_column_counts, - used_subterms) + used_subterms, + skip_noncategorical_subterm) # Construct one SubtermInfo for each subterm for factor_coding in factor_codings: subterm_factors = [] @@ -636,7 +643,7 @@ def _make_subterm_infos(terms, return term_to_subterm_infos def design_matrix_builders(termlists, data_iter_maker, eval_env, - NA_action="drop"): + NA_action="drop", implicit_intercept=False): """Construct several :class:`DesignInfo` objects from termlists. This is one of Patsy's fundamental functions. This function and @@ -661,6 +668,10 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env, :arg NA_action: An :class:`NAAction` object or string, used to determine what values count as 'missing' for purposes of determining the levels of categorical factors. + :arg implicit_intercept: A boolean value, default `False`. When set to + `True`, the design matrix excludes the intercept, but its subterms are + chosen as if the intercept were included, even if it reduces the rank + of the resulting matrix. :returns: A list of :class:`DesignInfo` objects, one for each termlist passed in. @@ -683,6 +694,10 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env, if isinstance(NA_action, str): NA_action = NAAction(NA_action) all_factors = set() + for termlist in termlists: + if implicit_intercept and INTERCEPT in termlist: + raise PatsyError("An intercept should not be included explicitly" + "if implicit_intercept is set to True") for termlist in termlists: for term in termlist: all_factors.update(term.factors) @@ -718,7 +733,8 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env, for termlist in termlists: term_to_subterm_infos = _make_subterm_infos(termlist, num_column_counts, - cat_levels_contrasts) + cat_levels_contrasts, + implicit_intercept) assert isinstance(term_to_subterm_infos, OrderedDict) assert frozenset(term_to_subterm_infos) == frozenset(termlist) this_design_factor_infos = {} diff --git a/patsy/highlevel.py b/patsy/highlevel.py index 78d2942..fed7e90 100644 --- a/patsy/highlevel.py +++ b/patsy/highlevel.py @@ -31,7 +31,7 @@ # data source. If formula_like is not capable of doing this, then returns # None. def _try_incr_builders(formula_like, data_iter_maker, eval_env, - NA_action): + NA_action, implicit_intercept): if isinstance(formula_like, DesignInfo): return (design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0], formula_like) @@ -67,11 +67,13 @@ def _try_incr_builders(formula_like, data_iter_maker, eval_env, formula_like.rhs_termlist], data_iter_maker, eval_env, - NA_action) + NA_action, + implicit_intercept) else: return None -def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"): +def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop", + implicit_intercept=False): """Construct a design matrix builder incrementally from a large data set. :arg formula_like: Similar to :func:`dmatrix`, except that explicit @@ -94,6 +96,11 @@ def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"): :arg NA_action: An :class:`NAAction` object or string, used to determine what values count as 'missing' for purposes of determining the levels of categorical factors. + :arg implicit_intercept: Boolean value, default `False`. When set to + `True`, no intercept is included in the :class:`DesignInfo` that is + returned, but other columns are chosen as if an intercept were included, + even if it reduces the rank of design matrices built from the + :class:`DesignInfo`. :returns: A :class:`DesignInfo` Tip: for `data_iter_maker`, write a generator like:: @@ -109,7 +116,7 @@ def iter_maker(): """ eval_env = EvalEnvironment.capture(eval_env, reference=1) design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env, - NA_action) + NA_action, implicit_intercept) if design_infos is None: raise PatsyError("bad formula-like object") if len(design_infos[0].column_names) > 0: @@ -118,7 +125,7 @@ def iter_maker(): return design_infos[1] def incr_dbuilders(formula_like, data_iter_maker, eval_env=0, - NA_action="drop"): + NA_action="drop", implicit_intercept=False): """Construct two design matrix builders incrementally from a large data set. @@ -127,7 +134,7 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0, """ eval_env = EvalEnvironment.capture(eval_env, reference=1) design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env, - NA_action) + NA_action, implicit_intercept) if design_infos is None: raise PatsyError("bad formula-like object") if len(design_infos[0].column_names) == 0: @@ -152,7 +159,7 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0, # (DesignInfo, DesignInfo) # any object with a special method __patsy_get_model_desc__ def _do_highlevel_design(formula_like, data, eval_env, - NA_action, return_type): + NA_action, return_type, implicit_intercept): if return_type == "dataframe" and not have_pandas: raise PatsyError("pandas.DataFrame was requested, but pandas " "is not installed") @@ -162,7 +169,7 @@ def _do_highlevel_design(formula_like, data, eval_env, def data_iter_maker(): return iter([data]) design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env, - NA_action) + NA_action, implicit_intercept) if design_infos is not None: return build_design_matrices(design_infos, data, NA_action=NA_action, @@ -223,7 +230,7 @@ def _regularize_matrix(m, default_column_prefix): return (lhs, rhs) def dmatrix(formula_like, data={}, eval_env=0, - NA_action="drop", return_type="matrix"): + NA_action="drop", return_type="matrix", implicit_intercept=False): """Construct a single design matrix given a formula_like and data. :arg formula_like: An object that can be used to construct a design @@ -243,6 +250,10 @@ def dmatrix(formula_like, data={}, eval_env=0, :class:`NAAction` object. See :class:`NAAction` for details on what values count as 'missing' (and how to alter this). :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below. + :arg implicit_intercept: Boolean value, default `False`. When set to + `True`, no intercept is included in the design matrix, but other + columns are chosen as if an intercept were included, even if it + reduces the rank of the matrix. The `formula_like` can take a variety of forms. You can use any of the following: @@ -288,14 +299,15 @@ def dmatrix(formula_like, data={}, eval_env=0, """ eval_env = EvalEnvironment.capture(eval_env, reference=1) (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env, - NA_action, return_type) + NA_action, return_type, + implicit_intercept) if lhs.shape[1] != 0: raise PatsyError("encountered outcome variables for a model " "that does not expect them") return rhs def dmatrices(formula_like, data={}, eval_env=0, - NA_action="drop", return_type="matrix"): + NA_action="drop", return_type="matrix", implicit_intercept=False): """Construct two design matrices given a formula_like and data. This function is identical to :func:`dmatrix`, except that it requires @@ -307,7 +319,7 @@ def dmatrices(formula_like, data={}, eval_env=0, """ eval_env = EvalEnvironment.capture(eval_env, reference=1) (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env, - NA_action, return_type) + NA_action, return_type, implicit_intercept) if lhs.shape[1] == 0: raise PatsyError("model is missing required outcome variables") return (lhs, rhs) diff --git a/patsy/redundancy.py b/patsy/redundancy.py index 415fa96..8156d2b 100644 --- a/patsy/redundancy.py +++ b/patsy/redundancy.py @@ -211,22 +211,29 @@ def t(given, expected): # of each factor to this set in place. So the first time this routine is # called, pass in an empty set, and then just keep passing the same set to # any future calls. +# 'skip_noncategorical_subterm' is a boolean value indicating whether subterms +# should be chosen so that the unique subterm with no categorical factors is +# not in their span, even if the resulting matrix is not full rank. Setting +# skip_noncategorical_subterm to True is equivalent to including this subterm +# in used_subterms. # Returns: a list of dicts. Each dict maps from factors to booleans. The # coding for the given term should use a full-rank contrast for those factors # which map to True, a (n-1)-rank contrast for those factors which map to # False, and any factors which are not mentioned are numeric and should be # added back in. These dicts should add columns to the design matrix from left # to right. -def pick_contrasts_for_term(term, numeric_factors, used_subterms): +def pick_contrasts_for_term(term, numeric_factors, used_subterms, + skip_noncategorical_subterm): categorical_factors = [f for f in term.factors if f not in numeric_factors] # Converts a term into an expanded list of subterms like: # a:b -> 1 + a- + b- + a-:b- # and discards the ones that have already been used. subterms = [] for subset in _subsets_sorted(categorical_factors): - subterm = _Subterm([_ExpandedFactor(False, f) for f in subset]) - if subterm not in used_subterms: - subterms.append(subterm) + if subset or not skip_noncategorical_subterm: + subterm = _Subterm([_ExpandedFactor(False, f) for f in subset]) + if subterm not in used_subterms: + subterms.append(subterm) used_subterms.update(subterms) _simplify_subterms(subterms) factor_codings = [] @@ -239,17 +246,28 @@ def pick_contrasts_for_term(term, numeric_factors, used_subterms): def test_pick_contrasts_for_term(): from patsy.desc import Term + # Test with skip_noncategorical_subterm=False used = set() - codings = pick_contrasts_for_term(Term([]), set(), used) + codings = pick_contrasts_for_term(Term([]), set(), used, False) assert codings == [{}] - codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used) + codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used, False) assert codings == [{"a": False}] - codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used) + codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used, False) assert codings == [{"a": True, "b": False}] used_snapshot = set(used) - codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used) + codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used, False) assert codings == [{"d": False}, {"c": False, "d": True}] # Do it again backwards, to make sure we're deterministic with respect to # order: - codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot) + codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot, + False) assert codings == [{"c": False}, {"c": True, "d": False}] + + # Test with skip_noncategorical_subterm=True + used = set() + codings = pick_contrasts_for_term(Term(["a"]), set(), used, True) + assert codings == [{"a": False}] + codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used, True) + assert codings == [] + codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used, True) + assert codings == [{"b": False}] diff --git a/patsy/test_build.py b/patsy/test_build.py index c843f9f..0942157 100644 --- a/patsy/test_build.py +++ b/patsy/test_build.py @@ -263,6 +263,49 @@ def iter_maker(): build_design_matrices, [builder], data, return_type="asdfsadf") +def test_implicit_intercept(): + data = balanced(c=2, d=2) + data["x"] = [1, 2, 3, 4] + def iter_maker(): + yield data + + # Test that when there is only a numeric term, it does not get excluded + builder = design_matrix_builders([make_termlist("x")], iter_maker, 0, + implicit_intercept=True)[0] + mat = build_design_matrices([builder], + {"x": [10.0, 15.0, 20.0]})[0] + assert mat.shape == (3, 1) + assert np.array_equal(mat, [[10], [15], [20]]) + + # Test that when there are a numeric term and a categorical term, one of + # the categorical subterms is omitted + builder = design_matrix_builders([make_termlist("x", "c")], iter_maker, 0, + implicit_intercept=True)[0] + mat = build_design_matrices([builder], + {"x": [10.0, 15.0, 20.0], + "c": np.asarray(["c1", "c2", "c1"], + dtype=object)})[0] + assert mat.shape == (3, 2) + assert np.array_equal(mat, [[0, 10], [1, 15], [0, 20]]) + + # Test that when there is a product of numeric and categorical factors, + # all subterms are included + builder = design_matrix_builders([make_termlist("xc")], iter_maker, 0, + implicit_intercept=True)[0] + mat = build_design_matrices([builder], + {"x": [10.0, 15.0, 20.0], + "c": np.asarray(["c1", "c2", "c1"], + dtype=object)})[0] + assert mat.shape == (3, 2) + assert np.array_equal(mat, [[10, 0], [0, 15], [20, 0]]) + + # Test that when there are a numeric term and a product of categorical + # factors, one of the categorical subterms is omitted + builder = design_matrix_builders([make_termlist("x", "cd")], iter_maker, 0, + implicit_intercept=True)[0] + mat = build_design_matrices([builder], data)[0] + assert mat.shape[1] == 4 # 3 categorical subterms, one numeric + def test_NA_action(): initial_data = {"x": [1, 2, 3], "c": ["c1", "c2", "c1"]} def iter_maker(): diff --git a/patsy/test_highlevel.py b/patsy/test_highlevel.py index 0ef0438..b57f20b 100644 --- a/patsy/test_highlevel.py +++ b/patsy/test_highlevel.py @@ -660,6 +660,64 @@ def raise_patsy_error(x): else: assert False +def test_dmatrix_implicit_intercept(): + data = balanced(a=2, b=2) + data["x"] = [1, 2, 3, 4] + data["y"] = [2, 3, 4, 5] + + # Test that in a formula with one simple categorical term, one of + # the term's levels is excluded + mat = dmatrix("0 + a", data=data, implicit_intercept=True) + assert np.allclose(mat, [[0], + [0], + [1], + [1]]) + + # In a more complicated formula with a categorical term, + # the encoding still excludes one categorical level + mat = dmatrix("0 + a:b + x", data=data, return_type="dataframe", + implicit_intercept=True) + assert mat.shape[1] == 4 + assert np.allclose(mat[['b[T.b2]', 'a[T.a2]:b[b1]', 'a[T.a2]:b[b2]', 'x']], + [[0, 0, 0, 1], + [1, 0, 0, 2], + [0, 1, 0, 3], + [1, 0, 1, 4]]) + + # A term with numerical and categorical factors should get a full-rank + # coding + mat = dmatrix("0 + a:x", data=data, return_type="dataframe", + implicit_intercept=True) + assert mat.shape[1] == 2 + assert np.allclose(mat[['a[a1]:x', 'a[a2]:x']], + [[1, 0], + [2, 0], + [0, 3], + [0, 4]]) + + # When using implicit_intercept, users should pass a formula that + # explicitly specifies no intercept (e.g. "0 + a" instead of "a"). + # This tests that dmatrix raises an error if a user fails to do this. + assert_raises(PatsyError, dmatrix, "a", data=data, implicit_intercept=True) + + # Test that in a formula with one simple categorical term, one of + # the term's levels is excluded + lmat, rmat = dmatrices("y ~ 0 + a", data=data, implicit_intercept=True) + assert np.allclose(lmat, [[2], + [3], + [4], + [5]]) + assert np.allclose(rmat, [[0], + [0], + [1], + [1]]) + + # When using implicit_intercept, users should pass a formula that + # explicitly specifies no intercept (e.g. "0 + a" instead of "a"). + # This tests that dmatrices raises an error if a user fails to do this. + assert_raises(PatsyError, dmatrices, "y ~ 1 + a", data=data, + implicit_intercept=True) + def test_dmatrix_NA_action(): data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]}