From 4e3c5309122fe170470a1a457fc6000ef9d6b0f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Fri, 9 Dec 2016 22:07:20 +0100 Subject: [PATCH 1/6] scripts to replicate issues --- .Rbuildignore | 1 + .gitignore | 1 + issues/2018.R | 53 ++++++++++++++++++++++++++++++++++ issues/2080.R | 24 ++++++++++++++++ issues/2109.R | 24 ++++++++++++++++ issues/2198-2.R | 14 +++++++++ issues/2198-3.R | 3 ++ issues/2198.R | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ issues/2231.R | 8 ++++++ issues/2267.R | 5 ++++ issues/2272.R | 15 ++++++++++ issues/2280.R | 17 +++++++++++ issues/2288.R | 9 ++++++ issues/2290.R | 6 ++++ issues/2292.R | 32 +++++++++++++++++++++ issues/2293.R | 29 +++++++++++++++++++ issues/2297.R | 2 ++ issues/2300.R | 5 ++++ issues/2301.R | 17 +++++++++++ issues/2302.R | 30 ++++++++++++++++++++ 20 files changed, 370 insertions(+) create mode 100644 issues/2018.R create mode 100644 issues/2080.R create mode 100644 issues/2109.R create mode 100644 issues/2198-2.R create mode 100644 issues/2198-3.R create mode 100644 issues/2198.R create mode 100644 issues/2231.R create mode 100644 issues/2267.R create mode 100644 issues/2272.R create mode 100644 issues/2280.R create mode 100644 issues/2288.R create mode 100644 issues/2290.R create mode 100644 issues/2292.R create mode 100644 issues/2293.R create mode 100644 issues/2297.R create mode 100644 issues/2300.R create mode 100644 issues/2301.R create mode 100644 issues/2302.R diff --git a/.Rbuildignore b/.Rbuildignore index a0bb0a54a3..f2ff098ca2 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -18,3 +18,4 @@ demo/pandas ^src/Makevars\.local$ ^Doxyfile$ ^clion-test\.R$ +^issues$ diff --git a/.gitignore b/.gitignore index f5e0093048..921ca1070d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ vignettes/*.R .DS_Store /.idea /clion-test.R +issues diff --git a/issues/2018.R b/issues/2018.R new file mode 100644 index 0000000000..008c26898a --- /dev/null +++ b/issues/2018.R @@ -0,0 +1,53 @@ +#dplyr 0.5.0 +library(dplyr) + +test_data = data.frame( + grp = rep(c("A", "B"), each = 4), + y = rnorm(8), stringsAsFactors = F +) + +#this works +test_data %>% group_by(grp) %>% + mutate( + cdf = ecdf(y)(y) + ) + +#this throws an error: Error: object 'y' not found +test_data %>% group_by(grp) %>% + mutate( + surv = 1 - ecdf(y)(y) + ) + +#but this works +custom_fun = function(input) 1 - ecdf(input)(input) + +test_data %>% group_by(grp) %>% + mutate( + surv = custom_fun(y) + ) + +# example with wilcox.test + +test_data2 = data.frame( + grp = rep(c("A", "B"), each = 4), + grp2 = rep(c("C", "D"), 4), + y = rnorm(8), stringsAsFactors = F +) + +test_data2 %>% group_by(grp) %>% + mutate( + p_value = 1 - wilcox.test(y)$p.value + ) + +# Error: object 'y' not found +test_data2 %>% group_by(grp) %>% + mutate( + p_value = wilcox.test(y ~ grp2)$p.value + ) + +wilcox_fun = function(outcome, group) wilcox.test(outcome ~ factor(group))$p.value + +test_data2 %>% group_by(grp) %>% + mutate( + p_value = wilcox_fun(y, grp2) + ) diff --git a/issues/2080.R b/issues/2080.R new file mode 100644 index 0000000000..aca2e6191b --- /dev/null +++ b/issues/2080.R @@ -0,0 +1,24 @@ +library(dplyr) +library(purrr) + +df <- tibble(x = list( + tibble(y = 1:2), + tibble(y = 1:3), + tibble(y = 1:4) +)) + +nrows <- function(df) { + df %>% summarise(n = n()) %>% .[["n"]] +} + +df %>% + mutate( + n1 = x %>% map_int(nrows), + n2 = x %>% map_int(. %>% summarise(n = n()) %>% .[["n"]]) + ) +#> # A tibble: 3 × 3 +#> x n1 n2 +#> +#> 1 2 3 +#> 2 3 3 +#> 3 4 3 diff --git a/issues/2109.R b/issues/2109.R new file mode 100644 index 0000000000..1ffcdc7dca --- /dev/null +++ b/issues/2109.R @@ -0,0 +1,24 @@ +df <- data_frame(id = rep(1:2, each = 4), id2 = rep(1:2, 4)) + +df %>% group_by(id, id2) %>% distinct(id) +# Source: local data frame [4 x 3] +# Groups: id, id2 [4] +# +# id id id2 +# +# 1 1 1 1 +# 2 1 1 2 +# 3 2 2 1 +# 4 2 2 2 + +df %>% group_by(id, id2) %>% select(-id2) %>% distinct(id) +# Adding missing grouping variables: `id2` +# Source: local data frame [4 x 3] +# Groups: id, id2 [4] +# +# id id id2 +# +# 1 1 1 1 +# 2 1 1 2 +# 3 2 2 1 +# 4 2 2 2 diff --git a/issues/2198-2.R b/issues/2198-2.R new file mode 100644 index 0000000000..18f1ddcaaa --- /dev/null +++ b/issues/2198-2.R @@ -0,0 +1,14 @@ +benchmark <- function(df, col, summarize) { + force(df) + gc() + if (summarize) { + system.time(group_by_(df, col) %>% count()) + } else { + system.time(group_by_(df, col)) + } +} + +devtools::load_all() + +benchmark(Lahman::Batting %>% mutate(id = paste(teamID, yearID, playerID)) %>% sample_frac() %>% transmute(id, n = 0), ~id, summarize = FALSE) +# benchmark(Lahman::Batting, ~playerId, ~teamId, summarize = FALSE) diff --git a/issues/2198-3.R b/issues/2198-3.R new file mode 100644 index 0000000000..12951f6fb8 --- /dev/null +++ b/issues/2198-3.R @@ -0,0 +1,3 @@ +devtools::load_all() +batting_df <- Lahman::Batting +system.time(batting_df %>% group_by(playerID) %>% summarise(ab = mean(AB))) diff --git a/issues/2198.R b/issues/2198.R new file mode 100644 index 0000000000..6a1b569a2b --- /dev/null +++ b/issues/2198.R @@ -0,0 +1,75 @@ +set.seed(123) + +ALPHABET <- letters[1:4] +ALPHABET <- letters[1:10] +ALPHABET <- letters + +create_ids <- function(N) { + s <- paste(sample(c(ALPHABET, "|"), N, replace = TRUE), collapse = "") + ss <- strsplit(s, "|", fixed = TRUE)[[1]] + ss <- unique(ss) + ss <- ss[nchar(ss) > 3] + ss +} + +N <- 1e4 +ids <- create_ids(N) + +benchmark <- function(ids, summarize) { + force(ids) + df <- data_frame(ids, n = 0) + gc() + if (summarize) { + system.time(group_by(df, ids) %>% summarize(n = mean(n))) + } else { + system.time(group_by(df, ids)) + } +} + +devtools::load_all() + +# master: +# +# > # benchmark(ids, TRUE) +# > # benchmark(sample(ids, NN, replace = FALSE), TRUE) +# > # benchmark(sample(ids, NN, replace = TRUE), TRUE) +# > benchmark(ids, F .... [TRUNCATED] +# user system elapsed +# 4.440 0.032 4.469 +# +# > benchmark(sample(ids, NN, replace = FALSE), FALSE) +# user system elapsed +# 2.164 0.000 2.166 +# +# > benchmark(sample(ids, NN, replace = TRUE), FALSE) +# user system elapsed +# 3.176 0.000 3.175 + +# f: +# > benchmark(ids, TRUE) +# user system elapsed +# 2.500 0.024 2.522 +# +# > benchmark(sample(ids, NN, replace = FALSE), TRUE) +# user system elapsed +# 2.320 0.000 2.319 +# +# > benchmark(sample(ids, NN, replace = TRUE), TRUE) +# user system elapsed +# 2.584 0.000 2.584 + + +NN <- 3e2 + +#gprofiler::start_profiler() + +benchmark(ids, TRUE) + +#gprofiler::stop_profiler() +#gprofiler::show_profiler_pdf() + +#benchmark(sample(ids, NN, replace = FALSE), TRUE) +# benchmark(sample(ids, NN, replace = TRUE), TRUE) +# benchmark(ids, FALSE) +# benchmark(sample(ids, NN, replace = FALSE), FALSE) +# benchmark(sample(ids, NN, replace = TRUE), FALSE) diff --git a/issues/2231.R b/issues/2231.R new file mode 100644 index 0000000000..70895ce3d9 --- /dev/null +++ b/issues/2231.R @@ -0,0 +1,8 @@ +library(dplyr) +d <- data_frame( x = rep(c(1,2), c(2,4)), y = 1:6, names = letters[1:6] ) +d +res <- d %>% group_by(x) %>% summarise( y = list( setNames(y, names) ) ) %>% ungroup +res$y[[1]] +res$y[[2]] +names( res$y[[1]]) +names( res$y[[2]]) diff --git a/issues/2267.R b/issues/2267.R new file mode 100644 index 0000000000..f33a368cd3 --- /dev/null +++ b/issues/2267.R @@ -0,0 +1,5 @@ +data1 <- data.frame(var1 = sample(c(1,2,3),50,replace=T), var2 = sample(c("cond1", "cond2"), 50,replace=T),RT = sample(as.numeric(300:1000),50,replace=T)) + +data1 <- data1 %>% + group_by(var1) %>% + mutate(median_var = median(RT[var2=="cond1"])) diff --git a/issues/2272.R b/issues/2272.R new file mode 100644 index 0000000000..af57ff2f0f --- /dev/null +++ b/issues/2272.R @@ -0,0 +1,15 @@ +devtools::load_all() + +df1 <- data.frame(a = c(1,2,NA), b = c(5,NA, NA)) +df2 <- data.frame(a = c(1,NA,NA), c = c(9,8, NA)) +left_join(df1, df2) + +src <- src_sqlite("", create = TRUE) +sqlite1 <- copy_to(src, df1) +sqlite2 <- copy_to(src, df2) +left_join(sqlite1, sqlite2) + +src <- src_postgres() +postgres1 <- copy_to(src, df1, temporary = TRUE, name = random_table_name()) +postgres2 <- copy_to(src, df2, temporary = TRUE, name = random_table_name()) +left_join(postgres1, postgres2) diff --git a/issues/2280.R b/issues/2280.R new file mode 100644 index 0000000000..d37256d6e0 --- /dev/null +++ b/issues/2280.R @@ -0,0 +1,17 @@ +library("dplyr") +pings = read.csv(text = " +timestamp,round_start_timestamp,node,seq,nb_bytes,ttl,latency +1480525318.042879,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,2,64,42,527 +1480525318.654011,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,3,64,42,138 +1480525319.555820,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,4,64,42,38.8 +1480525330.320386,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,2,64,42,642 +1480525330.876448,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,3,64,42,198 +1480525331.898099,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,4,64,42,217 +1480525330.268665,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,2,64,42,701 +1480525331.107886,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,3,64,42,540 +1480525332.268447,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,4,64,42,700 +") + +data = pings[pings$seq == 2,] +df_grouped = group_by(data, node) +df_lag = mutate(df_grouped, latency_change=df_grouped$latency - lag(df_grouped$latency)) diff --git a/issues/2288.R b/issues/2288.R new file mode 100644 index 0000000000..f9d8413eb6 --- /dev/null +++ b/issues/2288.R @@ -0,0 +1,9 @@ +devtools::load_all(".") + src <- src_mysql("test", user = "muelleki") + src <- src_sqlite(":memory:", create = TRUE) + name <- dplyr:::random_table_name() + DBI::dbWriteTable(src$con, name, data_frame(a = 2:5)) + data <- src %>% tbl(name) + data %>% + mutate(b = log(a), c = log(exp(1), a)) %>% + mutate(d = b * c) diff --git a/issues/2290.R b/issues/2290.R new file mode 100644 index 0000000000..2a7e4f1ae5 --- /dev/null +++ b/issues/2290.R @@ -0,0 +1,6 @@ +devtools::load_all(".") +mtcars2 <- copy_to(src_postgres(), mtcars, dplyr:::random_table_name()) +mtcars2 %>% + group_by(cyl) %>% + arrange(disp) %>% + summarize(mpg2 = first(mpg)) diff --git a/issues/2292.R b/issues/2292.R new file mode 100644 index 0000000000..ba59b72fc9 --- /dev/null +++ b/issues/2292.R @@ -0,0 +1,32 @@ +library(DBI) +library(dplyr) +"%||%" <- function(x, y) if(is.null(x)) y else x + +db_disconnector <- function(con, name, quiet = FALSE) { + reg.finalizer(environment(), function(...) { + if (!quiet) { + message("Auto-disconnecting ", name, " connection ", + "(", paste(con@Id, collapse = ", "), ")") + } + dbDisconnect(con) + }) + environment() +} + +src_postgres2 <- function(dbname = NULL, host = NULL, port = NULL, user = NULL, + password = NULL, ...) { + if (!requireNamespace("RPostgres", quietly = TRUE)) { + stop("RPostgres package required to connect to postgres db", call. = FALSE) + } + + user <- user %||% "" + + con <- dbConnect(RPostgres::Postgres(), host = host %||% "", dbname = dbname %||% "", + user = user, password = password %||% "", port = port %||% "", ...) + info <- dbGetInfo(con) + + src_sql("postgres", con, + info = info, disco = db_disconnector(con, "postgres")) +} + +src_postgres2() diff --git a/issues/2293.R b/issues/2293.R new file mode 100644 index 0000000000..9e66dcc974 --- /dev/null +++ b/issues/2293.R @@ -0,0 +1,29 @@ +df <- data.frame(year = 2000:2005, value = (0:5) ^ 2) +scrambled <- df[sample(nrow(df)), ] +wrong <- mutate(scrambled, running = cummax(cumsum(value))) +arrange(wrong, year) +# year value running +#1 2000 0 30 +#2 2001 1 1 +#3 2002 4 5 +#4 2003 9 39 +#5 2004 16 55 +#6 2005 25 30 +right <- mutate(scrambled, running = order_by(year, cummax(cumsum(value)))) +arrange(right, year) +# year value running +#1 2000 0 30 +#2 2001 1 30 +#3 2002 4 30 +#4 2003 9 39 +#5 2004 16 55 +#6 2005 25 55 +right2 <- arrange(scrambled,year) %>%mutate(running = cummax(cumsum(value))) +arrange(right2, year) +# year value running +#1 2000 0 0 +#2 2001 1 1 +#3 2002 4 5 +#4 2003 9 14 +#5 2004 16 30 +#6 2005 25 55 diff --git a/issues/2297.R b/issues/2297.R new file mode 100644 index 0000000000..65cb894d4d --- /dev/null +++ b/issues/2297.R @@ -0,0 +1,2 @@ +dplyr::data_frame(a = 1) +dplyr::order_by(10:1, cumsum(1:10)) diff --git a/issues/2300.R b/issues/2300.R new file mode 100644 index 0000000000..590a14cb26 --- /dev/null +++ b/issues/2300.R @@ -0,0 +1,5 @@ +library(dplyr) + +df_1 <- data_frame(a = as.integer(1:3), b = runif(3)) +df_2 <- data_frame(a = as.factor(1:3), c = runif(3)) +left_join(df_1, df_2) diff --git a/issues/2301.R b/issues/2301.R new file mode 100644 index 0000000000..971f4d1d76 --- /dev/null +++ b/issues/2301.R @@ -0,0 +1,17 @@ +library(dplyr) +library(tidyr) + +# works +df <- data.frame(key = c("a","b"), value = c(1,2)) +df_spread <- df %>% spread(key, value) +mutate_if(df_spread, is.numeric, function(x) {x+1}) + +# fails with : Error in eval(expr, envir, enclos) : object 'b' not found +df <- data.frame(key = c("a","b-a"), value = c(1,2)) +df_spread <- df %>% spread(key, value) +mutate_if(df_spread, is.numeric, function(x) {x+1}) + +# fails with: Error in parse(text = x) : :1:3: unexpected symbol +df <- data.frame(key = c("a","c d"), value = c(1,2)) +df_spread <- df %>% spread(key, value) +mutate_if(df_spread, is.numeric, function(x) {x+1}) diff --git a/issues/2302.R b/issues/2302.R new file mode 100644 index 0000000000..b140db95cc --- /dev/null +++ b/issues/2302.R @@ -0,0 +1,30 @@ +library('dplyr') +d <- data.frame(x=c(1,2,2),y=c(3,5,NA),z=c(NA,'a','b'), + rowNum=1:3, + stringsAsFactors = FALSE) +print(d) + +fnam <- tempfile(pattern = "dplyr_doc_narm", tmpdir = tempdir(), fileext = "sqlite3") +my_db <- dplyr::src_sqlite(fnam, create = TRUE) +class(my_db) +dRemote <- copy_to(my_db,d,'d',rowNumberColumn='rowNum',overwrite=TRUE) + + +# correct calculation +dRemote %>% mutate(nna=0) %>% + mutate(nna=nna+ifelse(is.na(x),1,0)) %>% + mutate(nna=nna+ifelse(is.na(y),1,0)) %>% + mutate(nna=nna+ifelse(is.na(z),1,0)) + +# incorrect calculation (last step seems to always clobber the previous result) +dRemote %>% mutate(nna=0) %>% + mutate(nna=nna+is.na(x)) %>% + mutate(nna=nna+is.na(y)) %>% + mutate(nna=nna+is.na(z)) + +# clean up +rm(list=setdiff(ls(),'fnam')) +if(!is.null(fnam)) { + file.remove(fnam) +} +gc() From cfc613624b8ea8c237ad505a6f18fa5f1335d2fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 15 Dec 2016 15:38:32 +0100 Subject: [PATCH 2/6] update --- issues/2293.R | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/issues/2293.R b/issues/2293.R index 9e66dcc974..5cbfe2a53c 100644 --- a/issues/2293.R +++ b/issues/2293.R @@ -1,29 +1,31 @@ +set.seed(10) df <- data.frame(year = 2000:2005, value = (0:5) ^ 2) scrambled <- df[sample(nrow(df)), ] wrong <- mutate(scrambled, running = cummax(cumsum(value))) arrange(wrong, year) -# year value running -#1 2000 0 30 -#2 2001 1 1 -#3 2002 4 5 -#4 2003 9 39 -#5 2004 16 55 -#6 2005 25 30 +## year value running +## 1 2000 0 30 +## 2 2001 1 10 +## 3 2002 4 30 +## 4 2003 9 9 +## 5 2004 16 26 +## 6 2005 25 55 right <- mutate(scrambled, running = order_by(year, cummax(cumsum(value)))) arrange(right, year) -# year value running -#1 2000 0 30 -#2 2001 1 30 -#3 2002 4 30 -#4 2003 9 39 -#5 2004 16 55 -#6 2005 25 55 +## year value running +## 1 2000 0 30 +## 2 2001 1 30 +## 3 2002 4 30 +## 4 2003 9 30 +## 5 2004 16 30 +## 6 2005 25 55 right2 <- arrange(scrambled,year) %>%mutate(running = cummax(cumsum(value))) arrange(right2, year) -# year value running -#1 2000 0 0 -#2 2001 1 1 -#3 2002 4 5 -#4 2003 9 14 -#5 2004 16 30 -#6 2005 25 55 +## year value running +## 1 2000 0 0 +## 2 2001 1 1 +## 3 2002 4 5 +## 4 2003 9 14 +## 5 2004 16 30 +## 6 2005 25 55 +mutate(scrambled, running1 = order_by(year, cumsum(value)), running2 = order_by(year, cummax(running1))) %>% arrange(year) From fc5e329982b746315fb227e7ea88a138183c009d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 26 Jan 2017 10:28:44 +0100 Subject: [PATCH 3/6] add more --- issues/2322.R | 33 ++++++++++++++++++++ issues/2330.R | 8 +++++ issues/2338.R | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++ issues/2348.R | 9 ++++++ issues/2352.R | 24 +++++++++++++++ issues/2359.R | 21 +++++++++++++ issues/2362.R | 11 +++++++ issues/2374.R | 6 ++++ issues/2379.R | 17 +++++++++++ 9 files changed, 214 insertions(+) create mode 100644 issues/2322.R create mode 100644 issues/2330.R create mode 100644 issues/2338.R create mode 100644 issues/2348.R create mode 100644 issues/2352.R create mode 100644 issues/2359.R create mode 100644 issues/2362.R create mode 100644 issues/2374.R create mode 100644 issues/2379.R diff --git a/issues/2322.R b/issues/2322.R new file mode 100644 index 0000000000..a4225818dd --- /dev/null +++ b/issues/2322.R @@ -0,0 +1,33 @@ +library(dplyr) +library(magrittr) +library(data.table) + +bind_rows( + data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00", tz= Sys.timezone())) + ,data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00")) +) %>% use_series(b) + +bind_rows( + data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00")), + data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00", tz= Sys.timezone())) +) %>% use_series(b) + +bind_rows( + data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00", tz= Sys.timezone())) + ,data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00", tz= Sys.timezone())) +) %>% use_series(b) + +bind_rows( + data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00")) + ,data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00")) +) %>% use_series(b) + +rbindlist(list( + data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00", tz= Sys.timezone())) + ,data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00")) +)) %>% use_series(b) + +rbind( + data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00", tz= Sys.timezone())) + ,data.frame(a= 4, b= as.POSIXct("2016-01-01 01:00")) +) %>% use_series(b) diff --git a/issues/2330.R b/issues/2330.R new file mode 100644 index 0000000000..97e177c12d --- /dev/null +++ b/issues/2330.R @@ -0,0 +1,8 @@ +devtools::load_all() +df1 <- data.frame(x = 1:10, y = 1:10) +df2 <- expand.grid(x = 1:10, y = 1:10) + +df1g <- df1 %>% group_by(x, y) + +df3 <- inner_join(df1g, df2, by = "x") +df3 diff --git a/issues/2338.R b/issues/2338.R new file mode 100644 index 0000000000..64d1eac41b --- /dev/null +++ b/issues/2338.R @@ -0,0 +1,85 @@ +library(tidyverse, warn.conflicts = FALSE) + +# Insert some NAs, convert doubles to integers +to_fix <- as.matrix(mtcars[1:3]) +diag(to_fix) <- NA +to_fix <- to_fix %>% as.data.frame() %>% mutate_all(as.integer) + +replacements <- mtcars[1:3] + +str(to_fix) # Integers +#> 'data.frame': 32 obs. of 3 variables: +#> $ mpg : int NA 21 22 21 18 18 14 24 22 19 ... +#> $ cyl : int 6 NA 4 6 8 6 8 4 4 6 ... +#> $ disp: int 160 160 NA 258 360 225 360 146 140 167 ... +str(replacements) # Doubles +#> 'data.frame': 32 obs. of 3 variables: +#> $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ... +#> $ cyl : num 6 6 4 6 8 6 8 4 4 6 ... +#> $ disp: num 160 160 108 258 360 ... + +coalesce(to_fix$mpg, replacements$mpg) # Type error +#> Error: Vector 1 has type 'double' not 'integer' + +coalesce(to_fix, replacements) %>% str() # Works??? Coerces all back to double +#> 'data.frame': 32 obs. of 3 variables: +#> $ mpg : num 21 21 22 21 18 18 14 24 22 19 ... +#> $ cyl : num 6 6 4 6 8 6 8 4 4 6 ... +#> $ disp: num 160 160 108 258 360 225 360 146 140 167 ... + +coalesce(as_data_frame(to_fix), replacements) # Fails if one is tibble +#> Error: Vector 1 has class data.frame not tbl_df/tbl/data.frame + +coalesce(as_data_frame(to_fix), as_data_frame(replacements)) # Fails differently if both tibble +#> Error: Unsupported use of matrix or array for column indexing + +coalesce(as.matrix(to_fix), as.matrix(replacements)) # Matrices with different types fail +#> Error: Vector 1 has type 'double' not 'integer' + +coalesce(to_fix %>% mutate_all(as.numeric) %>% as.matrix(), + as.matrix(replacements)) %>% str() # Matrices with same types work! +#> num [1:32, 1:3] 21 21 22 21 18 18 14 24 22 19 ... +#> - attr(*, "dimnames")=List of 2 +#> ..$ : NULL +#> ..$ : chr [1:3] "mpg" "cyl" "disp" + +# With list column, coalesces non-list columns, makes everything list column +coalesce(as.data.frame(nest(to_fix, -cyl)), + as.data.frame(nest(replacements, -cyl))) %>% str() +#> 'data.frame': 4 obs. of 2 variables: +#> $ cyl :List of 4 +#> ..$ : int 6 +#> ..$ : num 4 +#> ..$ : int 4 +#> ..$ : int 8 +#> $ data:List of 4 +#> ..$ :Classes 'tbl_df', 'tbl' and 'data.frame': 6 obs. of 2 variables: +#> .. ..$ mpg : int NA 21 18 19 17 19 +#> .. ..$ disp: int 160 258 225 167 167 145 +#> ..$ :Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 2 variables: +#> .. ..$ mpg : int 21 +#> .. ..$ disp: int 160 +#> ..$ :Classes 'tbl_df', 'tbl' and 'data.frame': 11 obs. of 2 variables: +#> .. ..$ mpg : int 22 24 22 32 30 33 21 27 26 30 ... +#> .. ..$ disp: int NA 146 140 78 75 71 120 79 120 95 ... +#> ..$ :Classes 'tbl_df', 'tbl' and 'data.frame': 14 obs. of 2 variables: +#> .. ..$ mpg : int 18 14 16 17 15 10 10 14 15 15 ... +#> .. ..$ disp: int 360 360 275 275 275 472 460 440 318 304 ... + +# Works on first level of lists with one number per element +coalesce(list(1, NA, 3.2, list(NA)), list(1L, 2L, 3L, list(4L))) %>% str() +#> List of 4 +#> $ : num 1 +#> $ : int 2 +#> $ : num 3.2 +#> $ :List of 1 +#> ..$ : logi NA + +# With more complicated lists, returns x, but doesn't coalesce anything. +# Not sure why coalesce(to_fix, replacements) works but this doesn't. +coalesce(unclass(to_fix), unclass(replacements)) %>% str() +#> List of 3 +#> $ mpg : int [1:32] NA 21 22 21 18 18 14 24 22 19 ... +#> $ cyl : int [1:32] 6 NA 4 6 8 6 8 4 4 6 ... +#> $ disp: int [1:32] 160 160 NA 258 360 225 360 146 140 167 ... +#> - attr(*, "row.names")= int [1:32] 1 2 3 4 5 6 7 8 9 10 ... diff --git a/issues/2348.R b/issues/2348.R new file mode 100644 index 0000000000..6a32819104 --- /dev/null +++ b/issues/2348.R @@ -0,0 +1,9 @@ +func <- function(x) { + fund <- function(y) { + mean(y) + } + + summarize_(x, result = ~fund(a)) +} + +func(data.frame(a=1:5)) diff --git a/issues/2352.R b/issues/2352.R new file mode 100644 index 0000000000..cacef6fcb9 --- /dev/null +++ b/issues/2352.R @@ -0,0 +1,24 @@ +x1 <- data_frame( + a = 1:5, + b = 6:10, + c = 11:15, + d = 16:20 +) +names(x1) <- c("a", "b", "b", "b") + +# Same df but as a data.frame +x2 <- data.frame( + a = 1:5, + b = 6:10, + b = 11:15, + b = 16:20, + check.names = FALSE +) + +y <- data.frame( + a = 1:4, + d = letters[1:4] +) + +# the join is completed on the tibble but the first b column values displace the others: +left_join(x1, y, by = "a") diff --git a/issues/2359.R b/issues/2359.R new file mode 100644 index 0000000000..ffe82dc048 --- /dev/null +++ b/issues/2359.R @@ -0,0 +1,21 @@ +devtools::load_all() + +mtcars_sqlite <- copy_to(src_sqlite(path = tempfile(), create = TRUE), mtcars) + +# This works: +mtcars_sqlite %>% + distinct(cyl) %>% + collect() + +# This doesn't: +mtcars_sqlite %>% + distinct(cyl) %>% + compute + + + +# But it works if we explicitly select: +mtcars_sqlite %>% + select(cyl) %>% + distinct(cyl) %>% + compute() diff --git a/issues/2362.R b/issues/2362.R new file mode 100644 index 0000000000..f3401cbc8f --- /dev/null +++ b/issues/2362.R @@ -0,0 +1,11 @@ +devtools::load_all() + +dffun <- function(x) { + data.frame(a=1, time=3) +} + +tmp <- data.frame(d=1:5, e=6:10) + +summarize(tmp, d=dffun(b)$time) +summarize(tmp, d=dffun(b)[["time"]]) +summarize(tmp, d=dffun(b)$a) diff --git a/issues/2374.R b/issues/2374.R new file mode 100644 index 0000000000..6eab513993 --- /dev/null +++ b/issues/2374.R @@ -0,0 +1,6 @@ +devtools::load_all() + +memdb_frame(a = 1:3) %>% filter(a %in% 1:2) + +ok <- 1:2 +memdb_frame(a = 1:3) %>% filter(a %in% ok) diff --git a/issues/2379.R b/issues/2379.R new file mode 100644 index 0000000000..059312e0da --- /dev/null +++ b/issues/2379.R @@ -0,0 +1,17 @@ +devtools::load_all() +library(tidyr) +set.seed(1) +(d1 = data_frame(name = letters[1:3], id = list(1:3, 4:6, 7:9), other_data1 = rnorm(3))) +(d2 = data_frame(name = LETTERS[1:3], id = list(1:3, 4:6, 7:9), other_data2 = rnorm(3))) + +#long form +(d1_long = unnest(d1)) +(d2_long = unnest(d2)) + +#full join on long form +full_join(d1_long, d2_long, by = "id") + +#full join on nested form +full_join(d1, d2, by = "id") +#not supplying the id produces the same result +# From 1e7490dadb82b68a597ad6eae8277b8cd7c9c1fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 26 Jan 2017 10:29:01 +0100 Subject: [PATCH 4/6] why was I ignoring it? --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 921ca1070d..f5e0093048 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,3 @@ vignettes/*.R .DS_Store /.idea /clion-test.R -issues From fa3e81b16d33972a7b50892761ec8a422b65fb54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 26 Jan 2017 11:47:41 +0100 Subject: [PATCH 5/6] add --- issues/2358.Rmd | 12 ++++++++++++ issues/2358.sh | 6 ++++++ 2 files changed, 18 insertions(+) create mode 100644 issues/2358.Rmd create mode 100755 issues/2358.sh diff --git a/issues/2358.Rmd b/issues/2358.Rmd new file mode 100644 index 0000000000..daea5b7f62 --- /dev/null +++ b/issues/2358.Rmd @@ -0,0 +1,12 @@ +```{r setup, include=FALSE} +unlink("bindr-2_cache", recursive = TRUE) +knitr::opts_chunk$set(cache = TRUE) +``` + +```{r} +devtools::load_all() +obj <- data_frame(x = 1) %>% + mutate(f = list(some ~ formula + here)) + +obj +``` diff --git a/issues/2358.sh b/issues/2358.sh new file mode 100755 index 0000000000..57ed4517ee --- /dev/null +++ b/issues/2358.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +git co . +rm src/*.d +R -e 'rmarkdown::render("bindr-2.Rmd")' +git co . From ac7c168c6d11f785510ec67f0e30e69eddacd34d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Thu, 9 Feb 2017 14:43:45 +0100 Subject: [PATCH 6/6] added --- issues/2392.R | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++ issues/2410.R | 45 ++++++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 issues/2392.R create mode 100644 issues/2410.R diff --git a/issues/2392.R b/issues/2392.R new file mode 100644 index 0000000000..f51243263e --- /dev/null +++ b/issues/2392.R @@ -0,0 +1,102 @@ +library(tidyverse) +#> Loading tidyverse: ggplot2 +#> Loading tidyverse: tibble +#> Loading tidyverse: tidyr +#> Loading tidyverse: readr +#> Loading tidyverse: purrr +#> Loading tidyverse: dplyr +#> Conflicts with tidy packages ---------------------------------------------- +#> filter(): dplyr, stats +#> lag(): dplyr, stats + +con <- src_memdb() +copy_to(con, iris, "iris") +#> Source: query [?? x 5] +#> Database: sqlite 3.11.1 [:memory:] +#> +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> +#> 1 5.1 3.5 1.4 0.2 setosa +#> 2 4.9 3.0 1.4 0.2 setosa +#> 3 4.7 3.2 1.3 0.2 setosa +#> 4 4.6 3.1 1.5 0.2 setosa +#> 5 5.0 3.6 1.4 0.2 setosa +#> 6 5.4 3.9 1.7 0.4 setosa +#> 7 4.6 3.4 1.4 0.3 setosa +#> 8 5.0 3.4 1.5 0.2 setosa +#> 9 4.4 2.9 1.4 0.2 setosa +#> 10 4.9 3.1 1.5 0.1 setosa +#> # ... with more rows +iris_sql <- tbl(con, "iris") + +iris_sql %>% do({ + head(.) +}) +#> Error: length(select) not greater than 0L + +iris_sql %>% group_by(Species) %>% do({ + head(.) +}) +#> Source: local data frame [5 x 5] +#> Groups: Species [2] +#> +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> +#> 1 4.3 3.0 1.1 0.1 setosa +#> 2 4.4 2.9 1.4 0.2 setosa +#> 3 4.4 3.0 1.3 0.2 setosa +#> 4 4.4 3.2 1.3 0.2 setosa +#> 5 7.9 3.8 6.4 2.0 virginica + +iris_sql %>% group_by(Species) %>% collect() %>% do({ + head(.) +}) +#> Source: local data frame [18 x 5] +#> Groups: Species [3] +#> +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> +#> 1 5.1 3.5 1.4 0.2 setosa +#> 2 4.9 3.0 1.4 0.2 setosa +#> 3 4.7 3.2 1.3 0.2 setosa +#> 4 4.6 3.1 1.5 0.2 setosa +#> 5 5.0 3.6 1.4 0.2 setosa +#> 6 5.4 3.9 1.7 0.4 setosa +#> 7 7.0 3.2 4.7 1.4 versicolor +#> 8 6.4 3.2 4.5 1.5 versicolor +#> 9 6.9 3.1 4.9 1.5 versicolor +#> 10 5.5 2.3 4.0 1.3 versicolor +#> 11 6.5 2.8 4.6 1.5 versicolor +#> 12 5.7 2.8 4.5 1.3 versicolor +#> 13 6.3 3.3 6.0 2.5 virginica +#> 14 5.8 2.7 5.1 1.9 virginica +#> 15 7.1 3.0 5.9 2.1 virginica +#> 16 6.3 2.9 5.6 1.8 virginica +#> 17 6.5 3.0 5.8 2.2 virginica +#> 18 7.6 3.0 6.6 2.1 virginica + +SpeciesList <- iris_sql %>% summarize(distinct(Species)) %>% collect() %>% .[["Species"]] +map_df(SpeciesList, function(name) { + iris_sql %>% filter(Species == name) %>% collect() %>% head() +}) +#> # A tibble: 18 × 5 +#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species +#> +#> 1 5.1 3.5 1.4 0.2 setosa +#> 2 4.9 3.0 1.4 0.2 setosa +#> 3 4.7 3.2 1.3 0.2 setosa +#> 4 4.6 3.1 1.5 0.2 setosa +#> 5 5.0 3.6 1.4 0.2 setosa +#> 6 5.4 3.9 1.7 0.4 setosa +#> 7 7.0 3.2 4.7 1.4 versicolor +#> 8 6.4 3.2 4.5 1.5 versicolor +#> 9 6.9 3.1 4.9 1.5 versicolor +#> 10 5.5 2.3 4.0 1.3 versicolor +#> 11 6.5 2.8 4.6 1.5 versicolor +#> 12 5.7 2.8 4.5 1.3 versicolor +#> 13 6.3 3.3 6.0 2.5 virginica +#> 14 5.8 2.7 5.1 1.9 virginica +#> 15 7.1 3.0 5.9 2.1 virginica +#> 16 6.3 2.9 5.6 1.8 virginica +#> 17 6.5 3.0 5.8 2.2 virginica +#> 18 7.6 3.0 6.6 2.1 virginica diff --git a/issues/2410.R b/issues/2410.R new file mode 100644 index 0000000000..3612130c45 --- /dev/null +++ b/issues/2410.R @@ -0,0 +1,45 @@ +devtools::load_all() +#> +#> Attaching package: 'dplyr' +#> The following objects are masked from 'package:stats': +#> +#> filter, lag +#> The following objects are masked from 'package:base': +#> +#> intersect, setdiff, setequal, union +my_db <- src_sqlite("my_db.sqlite3", create = T) +foo <- data.frame(a=1:3, c=4:6) +bar <- data.frame(b=1:3, d=7:9) +foo <- copy_to(my_db, foo) +bar <- copy_to(my_db, bar) +foo +#> Source: query [?? x 2] +#> Database: sqlite 3.16.2 [my_db.sqlite3] +#> +#> a c +#> +#> 1 1 4 +#> 2 2 5 +#> 3 3 6 +bar +#> Source: query [?? x 2] +#> Database: sqlite 3.16.2 [my_db.sqlite3] +#> +#> b d +#> +#> 1 1 7 +#> 2 2 8 +#> 3 3 9 +foobar <- inner_join(foo, bar, by=c("a"="b")) +colnames(foobar) +#> [1] "a" "c" "d" +print(foobar) +#> Source: query [?? x 3] +#> Database: sqlite 3.16.2 [my_db.sqlite3] +#> +#> a c b d +#> +#> 1 1 4 1 7 +#> 2 2 5 2 8 +#> 3 3 6 3 9 +stopifnot(identical(tbl_vars(foobar), names(collect(foobar))))