tidyverse · krlmlr · Dec 9, 2016 · Dec 15, 2016 · Jan 26, 2017 · Jan 26, 2017
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -18,3 +18,4 @@ demo/pandas
 ^src/Makevars\.local$
 ^Doxyfile$
 ^clion-test\.R$
+^issues$
diff --git a/issues/2018.R b/issues/2018.R
@@ -0,0 +1,53 @@
+#dplyr 0.5.0
+library(dplyr)
+
+test_data = data.frame(
+  grp = rep(c("A", "B"), each = 4),
+  y = rnorm(8), stringsAsFactors = F
+)
+
+#this works
+test_data %>% group_by(grp) %>%
+  mutate(
+    cdf = ecdf(y)(y)
+  )
+
+#this throws an error:  Error: object 'y' not found
+test_data %>% group_by(grp) %>%
+  mutate(
+    surv = 1 - ecdf(y)(y)
+  )
+
+#but this works
+custom_fun = function(input) 1 - ecdf(input)(input)
+
+test_data %>% group_by(grp) %>%
+  mutate(
+    surv = custom_fun(y)
+  )
+
+# example with wilcox.test
+
+test_data2 = data.frame(
+  grp = rep(c("A", "B"), each = 4),
+  grp2 = rep(c("C", "D"), 4),
+  y = rnorm(8), stringsAsFactors = F
+)
+
+test_data2 %>% group_by(grp) %>%
+  mutate(
+    p_value = 1 - wilcox.test(y)$p.value
+  )
+
+# Error: object 'y' not found
+test_data2 %>% group_by(grp) %>%
+  mutate(
+    p_value = wilcox.test(y ~ grp2)$p.value
+  )
+
+wilcox_fun = function(outcome, group) wilcox.test(outcome ~ factor(group))$p.value
+
+test_data2 %>% group_by(grp) %>%
+  mutate(
+    p_value = wilcox_fun(y, grp2)
+  )
diff --git a/issues/2080.R b/issues/2080.R
@@ -0,0 +1,24 @@
+library(dplyr)
+library(purrr)
+
+df <- tibble(x = list(
+  tibble(y = 1:2),
+  tibble(y = 1:3),
+  tibble(y = 1:4)
+))
+
+nrows <- function(df) {
+  df %>% summarise(n = n()) %>% .[["n"]]
+}
+
+df %>%
+  mutate(
+    n1 = x %>% map_int(nrows),
+    n2 = x %>% map_int(. %>% summarise(n = n()) %>% .[["n"]])
+  )
+#> # A tibble: 3 × 3
+#>                  x    n1    n2
+#>             <list> <int> <int>
+#> 1 <tibble [2 × 1]>     2     3
+#> 2 <tibble [3 × 1]>     3     3
+#> 3 <tibble [4 × 1]>     4     3
diff --git a/issues/2109.R b/issues/2109.R
@@ -0,0 +1,24 @@
+df <- data_frame(id = rep(1:2, each = 4), id2 = rep(1:2, 4))
+
+df %>% group_by(id, id2) %>% distinct(id)
+# Source: local data frame [4 x 3]
+# Groups: id, id2 [4]
+#
+# id    id   id2
+# <int> <int> <int>
+#   1     1     1     1
+# 2     1     1     2
+# 3     2     2     1
+# 4     2     2     2
+
+df %>% group_by(id, id2) %>% select(-id2) %>% distinct(id)
+# Adding missing grouping variables: `id2`
+# Source: local data frame [4 x 3]
+# Groups: id, id2 [4]
+#
+# id    id   id2
+# <int> <int> <int>
+#   1     1     1     1
+# 2     1     1     2
+# 3     2     2     1
+# 4     2     2     2
diff --git a/issues/2198-2.R b/issues/2198-2.R
@@ -0,0 +1,14 @@
+benchmark <- function(df, col, summarize) {
+  force(df)
+  gc()
+  if (summarize) {
+    system.time(group_by_(df, col) %>% count())
+  } else {
+    system.time(group_by_(df, col))
+  }
+}
+
+devtools::load_all()
+
+benchmark(Lahman::Batting %>% mutate(id = paste(teamID, yearID, playerID)) %>% sample_frac() %>% transmute(id, n = 0), ~id, summarize = FALSE)
+# benchmark(Lahman::Batting, ~playerId, ~teamId, summarize = FALSE)
diff --git a/issues/2198-3.R b/issues/2198-3.R
@@ -0,0 +1,3 @@
+devtools::load_all()
+batting_df <- Lahman::Batting
+system.time(batting_df %>% group_by(playerID) %>% summarise(ab = mean(AB)))
diff --git a/issues/2198.R b/issues/2198.R
@@ -0,0 +1,75 @@
+set.seed(123)
+
+ALPHABET <- letters[1:4]
+ALPHABET <- letters[1:10]
+ALPHABET <- letters
+
+create_ids <- function(N) {
+  s <- paste(sample(c(ALPHABET, "|"), N, replace = TRUE), collapse = "")
+  ss <- strsplit(s, "|", fixed = TRUE)[[1]]
+  ss <- unique(ss)
+  ss <- ss[nchar(ss) > 3]
+  ss
+}
+
+N <- 1e4
+ids <- create_ids(N)
+
+benchmark <- function(ids, summarize) {
+  force(ids)
+  df <- data_frame(ids, n = 0)
+  gc()
+  if (summarize) {
+    system.time(group_by(df, ids) %>% summarize(n = mean(n)))
+  } else {
+    system.time(group_by(df, ids))
+  }
+}
+
+devtools::load_all()
+
+# master:
+#
+# > # benchmark(ids, TRUE)
+# > # benchmark(sample(ids, NN, replace = FALSE), TRUE)
+# > # benchmark(sample(ids, NN, replace = TRUE), TRUE)
+# > benchmark(ids, F .... [TRUNCATED]
+# user  system elapsed
+# 4.440   0.032   4.469
+#
+# > benchmark(sample(ids, NN, replace = FALSE), FALSE)
+# user  system elapsed
+# 2.164   0.000   2.166
+#
+# > benchmark(sample(ids, NN, replace = TRUE), FALSE)
+# user  system elapsed
+# 3.176   0.000   3.175
+
+# f:
+# > benchmark(ids, TRUE)
+# user  system elapsed
+# 2.500   0.024   2.522
+#
+# > benchmark(sample(ids, NN, replace = FALSE), TRUE)
+# user  system elapsed
+# 2.320   0.000   2.319
+#
+# > benchmark(sample(ids, NN, replace = TRUE), TRUE)
+# user  system elapsed
+# 2.584   0.000   2.584
+
+
+NN <- 3e2
+
+#gprofiler::start_profiler()
+
+benchmark(ids, TRUE)
+
+#gprofiler::stop_profiler()
+#gprofiler::show_profiler_pdf()
+
+#benchmark(sample(ids, NN, replace = FALSE), TRUE)
+# benchmark(sample(ids, NN, replace = TRUE), TRUE)
+# benchmark(ids, FALSE)
+# benchmark(sample(ids, NN, replace = FALSE), FALSE)
+# benchmark(sample(ids, NN, replace = TRUE), FALSE)
diff --git a/issues/2231.R b/issues/2231.R
@@ -0,0 +1,8 @@
+library(dplyr)
+d <- data_frame( x = rep(c(1,2), c(2,4)), y = 1:6, names = letters[1:6] )
+d
+res <- d %>% group_by(x) %>% summarise( y = list( setNames(y, names) ) ) %>% ungroup
+res$y[[1]]
+res$y[[2]]
+names( res$y[[1]])
+names( res$y[[2]])
diff --git a/issues/2267.R b/issues/2267.R
@@ -0,0 +1,5 @@
+data1 <- data.frame(var1 = sample(c(1,2,3),50,replace=T), var2 = sample(c("cond1", "cond2"), 50,replace=T),RT = sample(as.numeric(300:1000),50,replace=T))
+
+data1 <- data1 %>%
+  group_by(var1) %>%
+  mutate(median_var = median(RT[var2=="cond1"]))
diff --git a/issues/2272.R b/issues/2272.R
@@ -0,0 +1,15 @@
+devtools::load_all()
+
+df1 <- data.frame(a = c(1,2,NA), b = c(5,NA, NA))
+df2 <- data.frame(a = c(1,NA,NA), c = c(9,8, NA))
+left_join(df1, df2)
+
+src <- src_sqlite("", create = TRUE)
+sqlite1 <- copy_to(src, df1)
+sqlite2 <- copy_to(src, df2)
+left_join(sqlite1, sqlite2)
+
+src <- src_postgres()
+postgres1 <- copy_to(src, df1, temporary = TRUE, name = random_table_name())
+postgres2 <- copy_to(src, df2, temporary = TRUE, name = random_table_name())
+left_join(postgres1, postgres2)
diff --git a/issues/2280.R b/issues/2280.R
@@ -0,0 +1,17 @@
+library("dplyr")
+pings = read.csv(text = "
+timestamp,round_start_timestamp,node,seq,nb_bytes,ttl,latency
+1480525318.042879,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,2,64,42,527
+1480525318.654011,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,3,64,42,138
+1480525319.555820,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,4,64,42,38.8
+1480525330.320386,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,2,64,42,642
+1480525330.876448,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,3,64,42,198
+1480525331.898099,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,4,64,42,217
+1480525330.268665,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,2,64,42,701
+1480525331.107886,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,3,64,42,540
+1480525332.268447,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,4,64,42,700
+")
+
+data = pings[pings$seq == 2,]
+df_grouped = group_by(data, node)
+df_lag = mutate(df_grouped, latency_change=df_grouped$latency - lag(df_grouped$latency))
diff --git a/issues/2288.R b/issues/2288.R
@@ -0,0 +1,9 @@
+devtools::load_all(".")
+  src <- src_mysql("test", user = "muelleki")
+  src <- src_sqlite(":memory:", create = TRUE)
+  name <- dplyr:::random_table_name()
+  DBI::dbWriteTable(src$con, name, data_frame(a = 2:5))
+  data <- src %>% tbl(name)
+  data %>%
+    mutate(b = log(a), c = log(exp(1), a)) %>%
+    mutate(d = b * c)
diff --git a/issues/2290.R b/issues/2290.R
@@ -0,0 +1,6 @@
+devtools::load_all(".")
+mtcars2 <- copy_to(src_postgres(), mtcars, dplyr:::random_table_name())
+mtcars2 %>%
+  group_by(cyl) %>%
+  arrange(disp) %>%
+  summarize(mpg2 = first(mpg))
diff --git a/issues/2292.R b/issues/2292.R
@@ -0,0 +1,32 @@
+library(DBI)
+library(dplyr)
+"%||%" <- function(x, y) if(is.null(x)) y else x
+
+db_disconnector <- function(con, name, quiet = FALSE) {
+  reg.finalizer(environment(), function(...) {
+    if (!quiet) {
+      message("Auto-disconnecting ", name, " connection ",
+              "(", paste(con@Id, collapse = ", "), ")")
+    }
+    dbDisconnect(con)
+  })
+  environment()
+}
+
+src_postgres2 <- function(dbname = NULL, host = NULL, port = NULL, user = NULL,
+                         password = NULL, ...) {
+  if (!requireNamespace("RPostgres", quietly = TRUE)) {
+    stop("RPostgres package required to connect to postgres db", call. = FALSE)
+  }
+
+  user <- user %||% ""
+
+  con <- dbConnect(RPostgres::Postgres(), host = host %||% "", dbname = dbname %||% "",
+                   user = user, password = password %||% "", port = port %||% "", ...)
+  info <- dbGetInfo(con)
+
+  src_sql("postgres", con,
+          info = info, disco = db_disconnector(con, "postgres"))
+}
+
+src_postgres2()
diff --git a/issues/2293.R b/issues/2293.R
@@ -0,0 +1,31 @@
+set.seed(10)
+df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
+scrambled <- df[sample(nrow(df)), ]
+wrong <- mutate(scrambled, running = cummax(cumsum(value)))
+arrange(wrong, year)
+## year value running
+## 1 2000     0      30
+## 2 2001     1      10
+## 3 2002     4      30
+## 4 2003     9       9
+## 5 2004    16      26
+## 6 2005    25      55
+right <- mutate(scrambled, running = order_by(year, cummax(cumsum(value))))
+arrange(right, year)
+## year value running
+## 1 2000     0      30
+## 2 2001     1      30
+## 3 2002     4      30
+## 4 2003     9      30
+## 5 2004    16      30
+## 6 2005    25      55
+right2 <- arrange(scrambled,year) %>%mutate(running = cummax(cumsum(value)))
+arrange(right2, year)
+## year value running
+## 1 2000     0       0
+## 2 2001     1       1
+## 3 2002     4       5
+## 4 2003     9      14
+## 5 2004    16      30
+## 6 2005    25      55
+mutate(scrambled, running1 = order_by(year, cumsum(value)), running2 = order_by(year, cummax(running1))) %>% arrange(year)
diff --git a/issues/2297.R b/issues/2297.R
@@ -0,0 +1,2 @@
+dplyr::data_frame(a = 1)
+dplyr::order_by(10:1, cumsum(1:10))
diff --git a/issues/2300.R b/issues/2300.R
@@ -0,0 +1,5 @@
+library(dplyr)
+
+df_1 <- data_frame(a = as.integer(1:3), b = runif(3))
+df_2 <- data_frame(a = as.factor(1:3), c = runif(3))
+left_join(df_1, df_2)
diff --git a/issues/2301.R b/issues/2301.R
@@ -0,0 +1,17 @@
+library(dplyr)
+library(tidyr)
+
+# works
+df <- data.frame(key = c("a","b"), value = c(1,2))
+df_spread <- df %>% spread(key, value)
+mutate_if(df_spread, is.numeric, function(x) {x+1})
+
+# fails with : Error in eval(expr, envir, enclos) : object 'b' not found
+df <- data.frame(key = c("a","b-a"), value = c(1,2))
+df_spread <- df %>% spread(key, value)
+mutate_if(df_spread, is.numeric, function(x) {x+1})
+
+# fails with: Error in parse(text = x) : <text>:1:3: unexpected symbol
+df <- data.frame(key = c("a","c d"), value = c(1,2))
+df_spread <- df %>% spread(key, value)
+mutate_if(df_spread, is.numeric, function(x) {x+1})
diff --git a/issues/2302.R b/issues/2302.R
@@ -0,0 +1,30 @@
+library('dplyr')
+d <- data.frame(x=c(1,2,2),y=c(3,5,NA),z=c(NA,'a','b'),
+                rowNum=1:3,
+                stringsAsFactors = FALSE)
+print(d)
+
+fnam <- tempfile(pattern = "dplyr_doc_narm", tmpdir = tempdir(), fileext = "sqlite3")
+my_db <- dplyr::src_sqlite(fnam, create = TRUE)
+class(my_db)
+dRemote <- copy_to(my_db,d,'d',rowNumberColumn='rowNum',overwrite=TRUE)
+
+
+# correct calculation
+dRemote %>% mutate(nna=0) %>%
+  mutate(nna=nna+ifelse(is.na(x),1,0)) %>%
+  mutate(nna=nna+ifelse(is.na(y),1,0)) %>%
+  mutate(nna=nna+ifelse(is.na(z),1,0))
+
+# incorrect calculation (last step seems to always clobber the previous result)
+dRemote %>% mutate(nna=0) %>%
+  mutate(nna=nna+is.na(x)) %>%
+  mutate(nna=nna+is.na(y)) %>%
+  mutate(nna=nna+is.na(z))
+
+# clean up
+rm(list=setdiff(ls(),'fnam'))
+if(!is.null(fnam)) {
+  file.remove(fnam)
+}
+gc()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		dplyr::data_frame(a = 1)
		dplyr::order_by(10:1, cumsum(1:10))