From d68f64d8472f96476566fa011ceb8b0930a97ee2 Mon Sep 17 00:00:00 2001 From: Chris Haid Date: Thu, 7 Jul 2016 14:21:16 -0500 Subject: [PATCH 1/8] Added janitor.Rproj to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 09a72cbe..1411f6ce 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData inst/doc +janitor.Rproj From 2e08f44a055c09a8616a9dd9080a48e4f14789fb Mon Sep 17 00:00:00 2001 From: Chris Haid Date: Thu, 7 Jul 2016 14:21:58 -0500 Subject: [PATCH 2/8] Created S3 dispatch for crosstab. It can be pipelined now. --- R/crosstab.R | 85 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 21 deletions(-) diff --git a/R/crosstab.R b/R/crosstab.R index f9a8f3d7..ee08d8a7 100644 --- a/R/crosstab.R +++ b/R/crosstab.R @@ -8,58 +8,101 @@ #' @param percent which grouping to use for percentages, if desired (defaults to counts). Must be one of "none", "row", "col", or "all". #' @param show_na should cases where either variable is NA be included? #' @return Returns a data.frame (actually a \code{tbl_df}) with the frequencies of the crosstabulated variables. -#' @export + #' @examples #' crosstab(mtcars$cyl, mtcars$gear) #' crosstab(mtcars$cyl, mtcars$gear, "row") #' # Crosstab table of two variables -# Take two vectors and one of "none", "row", "col", and "full" to calculate %s +# Take two vectors and one of "none", "row", "col", and "all" to calculate %s # Could also take a data.frame and two vector names, for pipeline, but this seems simpler -crosstab <- function(vec1, vec2, percent = "none", show_na = TRUE){ - if(!mode(vec1) %in% c("logical", "numeric", "character")){ - stop("vec1 must be a vector of type logical, numeric, character, or factor")} - if(!mode(vec2) %in% c("logical", "numeric", "character")){ - stop("vec2 must be a vector of type logical, numeric, character, or factor")} +crosstab <- function(x, ... +) UseMethod("crosstab") + +#' @export +crosstab.default <- function(vec1, vec2, percent = "none", show_na = TRUE){ + + + if(!mode(vec1) %in% c("logical", "numeric", "character", "list")){ + stop("vec1 must be a vector of type logical, numeric, character, list, or factor")} + if(!mode(vec2) %in% c("logical", "numeric", "character","list")){ + stop("vec2 must be a vector of type logical, numeric, character, list, or factor")} if(length(vec1) != length(vec2)){ stop("the two vectors are not the same length")} - dat <- data.frame(vec1, vec2, stringsAsFactors = FALSE) - var_name <- deparse(substitute(vec1)) + dat <- data.frame(vec1=vec1, vec2=vec2, stringsAsFactors = FALSE) + + dat_col_names <- names(dat) + + if (is.null(names(vec1))) { + var_name <- deparse(substitute(vec1)) + } else { + var_name <- dat_col_names[[1]] + } + + if(!show_na){ - dat <- dat[!is.na(dat$vec1) & !is.na(dat$vec2), ] + dat <- dat[!is.na(dat[[1]]) & !is.na(dat[[2]]), ] } - + # create long data.frame with initial counts tabl <- dat %>% - dplyr::count(vec1, vec2) %>% + dplyr::count_(dat_col_names) %>% dplyr::ungroup() - + # calculate percentages, if specified if(percent == "row"){ tabl <- tabl %>% - dplyr::group_by(vec1) %>% + dplyr::group_by_(dat_col_names[[1]]) %>% dplyr::mutate(n = n / sum(n, na.rm = TRUE)) } else if (percent == "col"){ tabl <- tabl %>% - dplyr::group_by(vec2) %>% + dplyr::group_by_(dat_col_names[[2]]) %>% dplyr::mutate(n = n / sum(n, na.rm = TRUE)) } else if (percent == "all"){ tabl <- tabl %>% dplyr::mutate(n = n / sum(n, na.rm = TRUE)) } - + # replace NA with string NA in vec2 to avoid invalid col name after spreading - # if this col is a factor, need to add that level to the factor - if(is.factor(tabl$vec2)){ - levels(tabl$vec2) <- c(levels(tabl$vec2), "NA") + # if this col is a factor, need to add that level to the factor + if(is.factor(tabl[[2]])){ + levels(tabl[[2]]) <- c(levels(tabl$vec2), "NA") } - tabl$vec2[is.na(tabl$vec2)] <- "NA" + tabl[2][is.na(tabl[2])] <- "NA" # spread to wide, ungroup() for cleanliness of result, and rename 1st col tabl %>% - tidyr::spread(vec2, n) %>% + tidyr::spread_(dat_col_names[[2]], "n") %>% dplyr::ungroup() %>% stats::setNames(., c(var_name, names(.)[-1])) } + +#' @export +crosstab.data.frame <- function(.data, ...){ + # collect dots + dots <- as.list(substitute(list(...)))[-1L] # + n <- length(dots) + + # select columns from .data + columns <- dots[1:2] + + arguments <- list() + + if(n > 2) arguments <- dots[3:n] + + x <- list() + x[[deparse(columns[[1]])]] <- as.numeric(.data[,deparse(columns[[1]])]) + x[[deparse(columns[[2]])]] <- .data[,deparse(columns[[2]])] + x <- as.data.frame(x) + + arguments$vec1 <- x[1] + arguments$vec2 <- x[2] + + + # create parameters vector to sue with do.call + do.call(crosstab.default, + args = arguments) + +} From f67bfc85f4dc833518d9fa7ee2959870bb152db3 Mon Sep 17 00:00:00 2001 From: Chris Haid Date: Thu, 7 Jul 2016 14:26:53 -0500 Subject: [PATCH 3/8] Moved some code around to clear up when args are being constructed --- R/crosstab.R | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/R/crosstab.R b/R/crosstab.R index ee08d8a7..ae863060 100644 --- a/R/crosstab.R +++ b/R/crosstab.R @@ -88,20 +88,19 @@ crosstab.data.frame <- function(.data, ...){ # select columns from .data columns <- dots[1:2] - arguments <- list() - - if(n > 2) arguments <- dots[3:n] - x <- list() - x[[deparse(columns[[1]])]] <- as.numeric(.data[,deparse(columns[[1]])]) + x[[deparse(columns[[1]])]] <- .data[,deparse(columns[[1]])] x[[deparse(columns[[2]])]] <- .data[,deparse(columns[[2]])] x <- as.data.frame(x) + # create args list to use with do.call + arguments <- list() + + if(n > 2) arguments <- dots[3:n] + arguments$vec1 <- x[1] arguments$vec2 <- x[2] - - # create parameters vector to sue with do.call do.call(crosstab.default, args = arguments) From fc738d49245501cacbc0b5438d7882f52aed0cbe Mon Sep 17 00:00:00 2001 From: Chris Haid Date: Thu, 7 Jul 2016 14:30:10 -0500 Subject: [PATCH 4/8] Updated tests to accept crosstab.data.frame Needed crosstab.default to accept lists, expect_equal was also failing on the percent='all' test but shouldnt' have been. --- tests/testthat/test-crosstab.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-crosstab.R b/tests/testthat/test-crosstab.R index 47317de0..62babfc4 100644 --- a/tests/testthat/test-crosstab.R +++ b/tests/testthat/test-crosstab.R @@ -13,10 +13,10 @@ dat <- data.frame( ) test_that("bad inputs are handled properly", { - expect_error(crosstab(list(1, 2), dat$v1), "vec1 must be a vector of type logical, numeric, character, or factor") - expect_error(crosstab(dat$v1, list(1, 2)), "vec2 must be a vector of type logical, numeric, character, or factor") + #expect_error(crosstab(list(1, 2), dat$v1), "vec1 must be a vector of type logical, numeric, character, or factor") + #expect_error(crosstab(dat$v1, list(1, 2)), "vec2 must be a vector of type logical, numeric, character, or factor") expect_error(crosstab(c(1, 1), c(1)), "the two vectors are not the same length") - }) +}) # simple crosstab w/o NAs res <- crosstab(dat$v2, dat$v4) @@ -38,14 +38,14 @@ test_that("percentages are correct", { expect_equal(res_row[[2]], c(0.5, 1/3, NA)) expect_equal(res_row[[3]], c(NA, 0.5, NA)) expect_equal(res_row[[4]], c(0.5, 1/6, 1)) - + res_col <- crosstab(dat$v2, dat$v4, "col") expect_equal(res_col[[2]], c(1/3, 2/3, NA)) expect_equal(res_col[[3]], c(NA, 1, NA)) expect_equal(res_col[[4]], c(1/3, 1/3, 1/3)) - + res_all <- crosstab(dat$v2, dat$v4, "all") - expect_equal(res_all[, 2:4],tbl_df(res[, 2:4]/9)) + expect_equal(as.data.frame(res_all[, 2:4]),as.data.frame(res[, 2:4]/9)) }) z <- crosstab(dat$v3, dat$v1) From 8facd7073a13adccc1aba95d9836020783dc45e3 Mon Sep 17 00:00:00 2001 From: Chris Haid Date: Thu, 7 Jul 2016 14:47:26 -0500 Subject: [PATCH 5/8] Added tests for crosstab.data.frame. Updated documentation. --- NAMESPACE | 3 ++- R/crosstab.R | 2 +- man/crosstab.Rd | 2 +- tests/testthat/test-crosstab.R | 39 ++++++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 71799de9..43932a6d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,9 @@ # Generated by roxygen2: do not edit by hand +S3method(crosstab,data.frame) +S3method(crosstab,default) export(clean_names) export(convert_to_NA) -export(crosstab) export(excel_numeric_to_date) export(get_dupes) export(remove_empty_cols) diff --git a/R/crosstab.R b/R/crosstab.R index ae863060..f961d931 100644 --- a/R/crosstab.R +++ b/R/crosstab.R @@ -68,7 +68,7 @@ crosstab.default <- function(vec1, vec2, percent = "none", show_na = TRUE){ # replace NA with string NA in vec2 to avoid invalid col name after spreading # if this col is a factor, need to add that level to the factor if(is.factor(tabl[[2]])){ - levels(tabl[[2]]) <- c(levels(tabl$vec2), "NA") + levels(tabl[[2]]) <- c(levels(tabl[[2]]), "NA") } tabl[2][is.na(tabl[2])] <- "NA" diff --git a/man/crosstab.Rd b/man/crosstab.Rd index 39d20816..1ae835e9 100644 --- a/man/crosstab.Rd +++ b/man/crosstab.Rd @@ -4,7 +4,7 @@ \alias{crosstab} \title{Generate a crosstabulation of two vectors.} \usage{ -crosstab(vec1, vec2, percent = "none", show_na = TRUE) +crosstab(x, ...) } \arguments{ \item{vec1}{the vector to place on the crosstab column.} diff --git a/tests/testthat/test-crosstab.R b/tests/testthat/test-crosstab.R index 62babfc4..951784c2 100644 --- a/tests/testthat/test-crosstab.R +++ b/tests/testthat/test-crosstab.R @@ -69,3 +69,42 @@ test_that("factor levels order correctly", { expect_equal(as.character(vv[[1]]), c("hi", "med", "lo", NA)) expect_true(is.factor(vv[[1]])) }) + +test_that("crosstab.data.frame dispatches", { + + z <- crosstab(dat, v3, v1) + + expect_equal(z[[1]], as.factor(c("a", "b", NA))) + expect_equal(z[[2]], c(1, 1, NA)) + expect_equal(z[[3]], c(1, 2, 1)) + expect_equal(z[[4]], c(2, NA, NA)) + expect_equal(z[[5]], c(NA, NA, 1)) + expect_equal(names(z), c("v3", "hi", "med", "lo", "NA")) +}) + +test_that("crosstab.data.frame is pipeable", { + z <- dat %>% + crosstab(v3, v1) + + expect_equal(z[[1]], as.factor(c("a", "b", NA))) + expect_equal(z[[2]], c(1, 1, NA)) + expect_equal(z[[3]], c(1, 2, 1)) + expect_equal(z[[4]], c(2, NA, NA)) + expect_equal(z[[5]], c(NA, NA, 1)) + expect_equal(names(z), c("v3", "hi", "med", "lo", "NA")) +}) + +test_that("crosstab.data.frame renders percentages are correct", { + res_row <- crosstab(dat, v2, v4, "row") + expect_equal(res_row[[2]], c(0.5, 1/3, NA)) + expect_equal(res_row[[3]], c(NA, 0.5, NA)) + expect_equal(res_row[[4]], c(0.5, 1/6, 1)) + + res_col <- crosstab(dat, v2, v4, "col") + expect_equal(res_col[[2]], c(1/3, 2/3, NA)) + expect_equal(res_col[[3]], c(NA, 1, NA)) + expect_equal(res_col[[4]], c(1/3, 1/3, 1/3)) + + res_all <- crosstab(dat, v2, v4, "all") + expect_equal(as.data.frame(res_all[, 2:4]),as.data.frame(res[, 2:4]/9)) +}) From b8ff8c68a4ac85fe9dac24f92a217c2e1fa7ef72 Mon Sep 17 00:00:00 2001 From: Chris Haid Date: Thu, 7 Jul 2016 14:52:28 -0500 Subject: [PATCH 6/8] Exported crosstab method. Updated NAMESPACE. R CMD Check passes. --- NAMESPACE | 1 + R/crosstab.R | 1 + 2 files changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 43932a6d..51f50580 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ S3method(crosstab,data.frame) S3method(crosstab,default) export(clean_names) export(convert_to_NA) +export(crosstab) export(excel_numeric_to_date) export(get_dupes) export(remove_empty_cols) diff --git a/R/crosstab.R b/R/crosstab.R index f961d931..592460e9 100644 --- a/R/crosstab.R +++ b/R/crosstab.R @@ -17,6 +17,7 @@ # Crosstab table of two variables # Take two vectors and one of "none", "row", "col", and "all" to calculate %s # Could also take a data.frame and two vector names, for pipeline, but this seems simpler +#' @export crosstab <- function(x, ... ) UseMethod("crosstab") From 8980f3ffcf09a7ca6bde343b4f79bd9468583d64 Mon Sep 17 00:00:00 2001 From: Chris Haid Date: Thu, 7 Jul 2016 15:11:33 -0500 Subject: [PATCH 7/8] Updated crosstab method docs --- R/crosstab.R | 12 +++++++++++- man/crosstab.Rd | 22 +++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/R/crosstab.R b/R/crosstab.R index 592460e9..2af5a586 100644 --- a/R/crosstab.R +++ b/R/crosstab.R @@ -18,9 +18,13 @@ # Take two vectors and one of "none", "row", "col", and "all" to calculate %s # Could also take a data.frame and two vector names, for pipeline, but this seems simpler #' @export -crosstab <- function(x, ... +crosstab <- function(vec1, ... ) UseMethod("crosstab") +#' @inheritParams crosstab +#' @describeIn crosstab Create a crosstab from two vectors, +#' displaying either frequencies or percentages calculated by row, column, or overall. +#' Vectors don't have to be from the same data.frame, but typically are. #' @export crosstab.default <- function(vec1, vec2, percent = "none", show_na = TRUE){ @@ -80,6 +84,12 @@ crosstab.default <- function(vec1, vec2, percent = "none", show_na = TRUE){ stats::setNames(., c(var_name, names(.)[-1])) } +#' @inheritParams crosstab.default +#' @param .data a data.frame. +#' @param ... arguments passed to crosstab.default. +#' @describeIn crosstab Create a crosstab from a data.frame, +#' displaying either frequencies or percentages calculated by row, column, or overall. +#' Vectors don't have to be from the same data.frame, but typically are. #' @export crosstab.data.frame <- function(.data, ...){ # collect dots diff --git a/man/crosstab.Rd b/man/crosstab.Rd index 1ae835e9..0efa0de6 100644 --- a/man/crosstab.Rd +++ b/man/crosstab.Rd @@ -2,18 +2,28 @@ % Please edit documentation in R/crosstab.R \name{crosstab} \alias{crosstab} +\alias{crosstab.data.frame} +\alias{crosstab.default} \title{Generate a crosstabulation of two vectors.} \usage{ -crosstab(x, ...) +crosstab(vec1, ...) + +\method{crosstab}{default}(vec1, vec2, percent = "none", show_na = TRUE) + +\method{crosstab}{data.frame}(.data, ...) } \arguments{ \item{vec1}{the vector to place on the crosstab column.} +\item{...}{arguments passed to crosstab.default.} + \item{vec2}{the vector to place on the crosstab row.} \item{percent}{which grouping to use for percentages, if desired (defaults to counts). Must be one of "none", "row", "col", or "all".} \item{show_na}{should cases where either variable is NA be included?} + +\item{.data}{a data.frame.} } \value{ Returns a data.frame (actually a \code{tbl_df}) with the frequencies of the crosstabulated variables. @@ -21,6 +31,16 @@ Returns a data.frame (actually a \code{tbl_df}) with the frequencies of the cros \description{ Create a crosstab, displaying either frequencies or percentages calculated by row, column, or overall. Vectors don't have to be from the same data.frame, but typically are. } +\section{Methods (by class)}{ +\itemize{ +\item \code{default}: Create a crosstab from two vectors, +displaying either frequencies or percentages calculated by row, column, or overall. +Vectors don't have to be from the same data.frame, but typically are. + +\item \code{data.frame}: Create a crosstab from a data.frame, +displaying either frequencies or percentages calculated by row, column, or overall. +Vectors don't have to be from the same data.frame, but typically are. +}} \examples{ crosstab(mtcars$cyl, mtcars$gear) crosstab(mtcars$cyl, mtcars$gear, "row") From 3844188eeb292bac9fb93aa5e7e5915aadc6e690 Mon Sep 17 00:00:00 2001 From: Chris Haid Date: Thu, 7 Jul 2016 15:16:31 -0500 Subject: [PATCH 8/8] Final fix for crosstab method consistancy --- R/crosstab.R | 5 ++--- man/crosstab.Rd | 9 +++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/R/crosstab.R b/R/crosstab.R index 2af5a586..99cd5e34 100644 --- a/R/crosstab.R +++ b/R/crosstab.R @@ -18,15 +18,14 @@ # Take two vectors and one of "none", "row", "col", and "all" to calculate %s # Could also take a data.frame and two vector names, for pipeline, but this seems simpler #' @export -crosstab <- function(vec1, ... -) UseMethod("crosstab") +crosstab <- function(...) UseMethod("crosstab") #' @inheritParams crosstab #' @describeIn crosstab Create a crosstab from two vectors, #' displaying either frequencies or percentages calculated by row, column, or overall. #' Vectors don't have to be from the same data.frame, but typically are. #' @export -crosstab.default <- function(vec1, vec2, percent = "none", show_na = TRUE){ +crosstab.default <- function(vec1, vec2, percent = "none", show_na = TRUE, ...){ if(!mode(vec1) %in% c("logical", "numeric", "character", "list")){ diff --git a/man/crosstab.Rd b/man/crosstab.Rd index 0efa0de6..b7f5c156 100644 --- a/man/crosstab.Rd +++ b/man/crosstab.Rd @@ -6,17 +6,18 @@ \alias{crosstab.default} \title{Generate a crosstabulation of two vectors.} \usage{ -crosstab(vec1, ...) +crosstab(...) -\method{crosstab}{default}(vec1, vec2, percent = "none", show_na = TRUE) +\method{crosstab}{default}(vec1, vec2, percent = "none", show_na = TRUE, + ...) \method{crosstab}{data.frame}(.data, ...) } \arguments{ -\item{vec1}{the vector to place on the crosstab column.} - \item{...}{arguments passed to crosstab.default.} +\item{vec1}{the vector to place on the crosstab column.} + \item{vec2}{the vector to place on the crosstab row.} \item{percent}{which grouping to use for percentages, if desired (defaults to counts). Must be one of "none", "row", "col", or "all".}