From c82557429faaae8216a2884855dcd749ea858c90 Mon Sep 17 00:00:00 2001 From: Sergio Oller Date: Sun, 13 Nov 2016 16:55:37 +0100 Subject: [PATCH] Fix issue #2203. combine deals with NA, has stricter coercion rules This commit fixes issue #2203, allowing combine to deal with missing values. Additionally it restricts coercion rules, in particular coercing logical to integer or double is not allowed anymore. Other coercion cases will give warnings, if information may be lost in the conversion, for instance when coercing integers with classes, such as difftime. --- inst/include/dplyr/Collecter.h | 153 ++++++++++++++++++++++++++------- tests/testthat/test-binds.R | 6 +- 2 files changed, 126 insertions(+), 33 deletions(-) diff --git a/inst/include/dplyr/Collecter.h b/inst/include/dplyr/Collecter.h index 005e29e50f..1f6243a8c7 100644 --- a/inst/include/dplyr/Collecter.h +++ b/inst/include/dplyr/Collecter.h @@ -9,6 +9,24 @@ namespace dplyr { + static inline bool has_classes(SEXP x) { + SEXP classes; + int i, num_classes; + if (OBJECT(x)) { + classes = Rf_getAttrib(x, R_ClassSymbol); + num_classes = Rf_length(classes); + if (num_classes > 0) { + return true; + } else { + return false; + } + } + return false; + } + static inline bool all_logical_na(SEXP x, SEXPTYPE xtype) { + return LGLSXP == xtype && all_na(x); + }; + class Collecter { public: virtual ~Collecter() {}; @@ -33,10 +51,10 @@ namespace dplyr { Collecter_Impl(int n_): data(n_, Rcpp::traits::get_na()) {} void collect(const SlicingIndex& index, SEXP v) { - Vector source(v); - STORAGE* source_ptr = Rcpp::internal::r_vector_start(source); - for (int i=0; i data; + + private: + void collect_logicalNA(const SlicingIndex& index) { + for (int i=0; i(); + } + } + + void collect_sexp(const SlicingIndex& index, SEXP v) { + Vector source(v); + STORAGE* source_ptr = Rcpp::internal::r_vector_start(source); + for (int i=0; i @@ -83,7 +117,9 @@ namespace dplyr { inline bool compatible(SEXP x) { int RTYPE = TYPEOF(x); - return RTYPE == REALSXP || (RTYPE == INTSXP && !Rf_inherits(x, "factor")) || RTYPE == LGLSXP; + return (RTYPE == REALSXP && !has_classes(x)) || + (RTYPE == INTSXP && !has_classes(x)) || + all_logical_na(x, RTYPE); } bool can_promote(SEXP x) const { @@ -109,6 +145,8 @@ namespace dplyr { collect_strings(index, v); } else if (Rf_inherits(v, "factor")) { collect_factor(index, v); + } else if (all_logical_na(v, TYPEOF(v))) { + collect_logicalNA(index, v); } else { CharacterVector vec(v); collect_strings(index, vec); @@ -120,7 +158,7 @@ namespace dplyr { } inline bool compatible(SEXP x) { - return (STRSXP == TYPEOF(x)) || Rf_inherits(x, "factor"); + return (STRSXP == TYPEOF(x)) || Rf_inherits(x, "factor") || all_logical_na(x, TYPEOF(x)); } bool can_promote(SEXP x) const { @@ -136,6 +174,14 @@ namespace dplyr { private: + void collect_logicalNA(const SlicingIndex& index, LogicalVector source) { + SEXP* p_data = Rcpp::internal::r_vector_start(data); + int n = index.size(); + for (int i=0; i(source); SEXP* p_data = Rcpp::internal::r_vector_start(data); @@ -177,11 +223,11 @@ namespace dplyr { inline bool compatible(SEXP x) { int RTYPE = TYPEOF(x); - return (INTSXP == RTYPE || RTYPE == LGLSXP) && !Rf_inherits(x, "factor"); + return (INTSXP == RTYPE && !has_classes(x)) || all_logical_na(x, RTYPE); } bool can_promote(SEXP x) const { - return TYPEOF(x) == REALSXP; + return TYPEOF(x) == REALSXP && !has_classes(x); } std::string describe() const { @@ -206,7 +252,7 @@ namespace dplyr { inline bool compatible(SEXP x) { String type = STRING_ELT(types,0); - return Rf_inherits(x, type.get_cstring()); + return Rf_inherits(x, type.get_cstring()) || all_logical_na(x, TYPEOF(x)); } inline bool can_promote(SEXP x) const { @@ -229,8 +275,12 @@ namespace dplyr { Parent(n), tz(tz_) {} void collect(const SlicingIndex& index, SEXP v) { - Parent::collect(index, v); - update_tz(v); + if (Rf_inherits(v, "POSIXct")) { + Parent::collect(index, v); + update_tz(v); + } else if (all_logical_na(v, TYPEOF(v))) { + Parent::collect(index, v); + } } inline SEXP get() { @@ -242,7 +292,7 @@ namespace dplyr { } inline bool compatible(SEXP x) { - return Rf_inherits(x, "POSIXct"); + return Rf_inherits(x, "POSIXct") || all_logical_na(x, TYPEOF(x)); } inline bool can_promote(SEXP x) const { @@ -295,20 +345,10 @@ namespace dplyr { } void collect(const SlicingIndex& index, SEXP v) { - // here we can assume that v is a factor with the right levels - // we however do not assume that they are in the same order - IntegerVector source(v); - CharacterVector levels = source.attr("levels"); - - SEXP* levels_ptr = Rcpp::internal::r_vector_start(levels); - int* source_ptr = Rcpp::internal::r_vector_start(source); - for (int i=0; isecond; - } + if (Rf_inherits(v, "factor") && has_same_levels_as(v)) { + collect_factor(index, v); + } else if (all_logical_na(v, TYPEOF(v))) { + collect_logicalNA(index); } } @@ -319,7 +359,8 @@ namespace dplyr { } inline bool compatible(SEXP x) { - return Rf_inherits(x, "factor") && has_same_levels_as(x); + return ((Rf_inherits(x, "factor") && has_same_levels_as(x)) || + all_logical_na(x, TYPEOF(x))); } inline bool can_promote(SEXP x) const { @@ -347,11 +388,34 @@ namespace dplyr { RObject model; CharacterVector levels; LevelsMap levels_map; + + void collect_factor(const SlicingIndex& index, SEXP v) { + // here we can assume that v is a factor with the right levels + // we however do not assume that they are in the same order + IntegerVector source(v); + CharacterVector levels = source.attr("levels"); + + SEXP* levels_ptr = Rcpp::internal::r_vector_start(levels); + int* source_ptr = Rcpp::internal::r_vector_start(source); + for (int i=0; isecond; + } + } + } + void collect_logicalNA(const SlicingIndex& index) { + for (int i=0; i inline bool Collecter_Impl::can_promote(SEXP x) const { - return (TYPEOF(x) == INTSXP && ! Rf_inherits(x, "factor")) || TYPEOF(x) == REALSXP; + return is_logical_all_na(); } inline Collecter* collecter(SEXP model, int n) { @@ -363,12 +427,24 @@ namespace dplyr { return new FactorCollecter(n, model); if (Rf_inherits(model, "Date")) return new TypedCollecter(n, get_date_classes()); + if (has_classes(model)) { + SEXP classes = Rf_getAttrib(model, R_ClassSymbol); + Rf_warning("Coercing class %s into an integer vector, with possible loss of information", + CHAR(STRING_ELT(classes, 0))); + return new TypedCollecter(n, classes); + } return new Collecter_Impl(n); case REALSXP: if (Rf_inherits(model, "POSIXct")) return new POSIXctCollecter(n, Rf_getAttrib(model, Rf_install("tzone"))); if (Rf_inherits(model, "Date")) return new TypedCollecter(n, get_date_classes()); + if (has_classes(model)) { + SEXP classes = Rf_getAttrib(model, R_ClassSymbol); + Rf_warning("Coercing class %s into a numeric vector, with possible loss of information", + CHAR(STRING_ELT(classes, 0))); + return new TypedCollecter(n, classes); + } return new Collecter_Impl(n); case CPLXSXP: return new Collecter_Impl(n); @@ -399,18 +475,35 @@ namespace dplyr { return new Collecter_Impl(n); } + // logical NA can be promoted to whatever type comes next + if (previous->is_logical_all_na()) { + return collecter(model, n); + } + switch (TYPEOF(model)) { case INTSXP: if (Rf_inherits(model, "Date")) return new TypedCollecter(n, get_date_classes()); if (Rf_inherits(model, "factor")) return new Collecter_Impl(n); + if (has_classes(model)) { + SEXP classes = Rf_getAttrib(model, R_ClassSymbol); + Rf_warning("Coercing class %s into an integer vector, with possible loss of information", + CHAR(STRING_ELT(classes, 0))); + return new TypedCollecter(n, classes); + } return new Collecter_Impl(n); case REALSXP: if (Rf_inherits(model, "POSIXct")) return new POSIXctCollecter(n, Rf_getAttrib(model, Rf_install("tzone"))); if (Rf_inherits(model, "Date")) return new TypedCollecter(n, get_date_classes()); + if (has_classes(model)) { + SEXP classes = Rf_getAttrib(model, R_ClassSymbol); + Rf_warning("Coercing class %s into a numeric vector, with possible loss of information", + CHAR(STRING_ELT(classes, 0))); + return new TypedCollecter(n, classes); + } return new Collecter_Impl(n); case LGLSXP: return new Collecter_Impl(n); diff --git a/tests/testthat/test-binds.R b/tests/testthat/test-binds.R index 642c471ae5..c05b6c22da 100644 --- a/tests/testthat/test-binds.R +++ b/tests/testthat/test-binds.R @@ -116,12 +116,12 @@ test_that("bind_rows promotes integer to numeric", { expect_equal(typeof(res$b), "integer") }) -test_that("bind_rows promotes logical to integer", { +test_that("bind_rows does not coerce logical to integer", { df1 <- data_frame(a = FALSE) df2 <- data_frame(a = 1L) - res <- bind_rows(df1, df2) - expect_equal(res$a, c(0L, 1L)) + expect_error(bind_rows(df1, df2), + "Can not automatically convert from logical to integer in column \"a\"") }) test_that("bind_rows promotes factor to character with warning", {