Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ad-hoc scripts to replicate issues #2304

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ demo/pandas
^src/Makevars\.local$
^Doxyfile$
^clion-test\.R$
^issues$
53 changes: 53 additions & 0 deletions issues/2018.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#dplyr 0.5.0
library(dplyr)

test_data = data.frame(
grp = rep(c("A", "B"), each = 4),
y = rnorm(8), stringsAsFactors = F
)

#this works
test_data %>% group_by(grp) %>%
mutate(
cdf = ecdf(y)(y)
)

#this throws an error: Error: object 'y' not found
test_data %>% group_by(grp) %>%
mutate(
surv = 1 - ecdf(y)(y)
)

#but this works
custom_fun = function(input) 1 - ecdf(input)(input)

test_data %>% group_by(grp) %>%
mutate(
surv = custom_fun(y)
)

# example with wilcox.test

test_data2 = data.frame(
grp = rep(c("A", "B"), each = 4),
grp2 = rep(c("C", "D"), 4),
y = rnorm(8), stringsAsFactors = F
)

test_data2 %>% group_by(grp) %>%
mutate(
p_value = 1 - wilcox.test(y)$p.value
)

# Error: object 'y' not found
test_data2 %>% group_by(grp) %>%
mutate(
p_value = wilcox.test(y ~ grp2)$p.value
)

wilcox_fun = function(outcome, group) wilcox.test(outcome ~ factor(group))$p.value

test_data2 %>% group_by(grp) %>%
mutate(
p_value = wilcox_fun(y, grp2)
)
24 changes: 24 additions & 0 deletions issues/2080.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
library(dplyr)
library(purrr)

df <- tibble(x = list(
tibble(y = 1:2),
tibble(y = 1:3),
tibble(y = 1:4)
))

nrows <- function(df) {
df %>% summarise(n = n()) %>% .[["n"]]
}

df %>%
mutate(
n1 = x %>% map_int(nrows),
n2 = x %>% map_int(. %>% summarise(n = n()) %>% .[["n"]])
)
#> # A tibble: 3 × 3
#> x n1 n2
#> <list> <int> <int>
#> 1 <tibble [2 × 1]> 2 3
#> 2 <tibble [3 × 1]> 3 3
#> 3 <tibble [4 × 1]> 4 3
24 changes: 24 additions & 0 deletions issues/2109.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
df <- data_frame(id = rep(1:2, each = 4), id2 = rep(1:2, 4))

df %>% group_by(id, id2) %>% distinct(id)
# Source: local data frame [4 x 3]
# Groups: id, id2 [4]
#
# id id id2
# <int> <int> <int>
# 1 1 1 1
# 2 1 1 2
# 3 2 2 1
# 4 2 2 2

df %>% group_by(id, id2) %>% select(-id2) %>% distinct(id)
# Adding missing grouping variables: `id2`
# Source: local data frame [4 x 3]
# Groups: id, id2 [4]
#
# id id id2
# <int> <int> <int>
# 1 1 1 1
# 2 1 1 2
# 3 2 2 1
# 4 2 2 2
14 changes: 14 additions & 0 deletions issues/2198-2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
benchmark <- function(df, col, summarize) {
force(df)
gc()
if (summarize) {
system.time(group_by_(df, col) %>% count())
} else {
system.time(group_by_(df, col))
}
}

devtools::load_all()

benchmark(Lahman::Batting %>% mutate(id = paste(teamID, yearID, playerID)) %>% sample_frac() %>% transmute(id, n = 0), ~id, summarize = FALSE)
# benchmark(Lahman::Batting, ~playerId, ~teamId, summarize = FALSE)
3 changes: 3 additions & 0 deletions issues/2198-3.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
devtools::load_all()
batting_df <- Lahman::Batting
system.time(batting_df %>% group_by(playerID) %>% summarise(ab = mean(AB)))
75 changes: 75 additions & 0 deletions issues/2198.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
set.seed(123)

ALPHABET <- letters[1:4]
ALPHABET <- letters[1:10]
ALPHABET <- letters

create_ids <- function(N) {
s <- paste(sample(c(ALPHABET, "|"), N, replace = TRUE), collapse = "")
ss <- strsplit(s, "|", fixed = TRUE)[[1]]
ss <- unique(ss)
ss <- ss[nchar(ss) > 3]
ss
}

N <- 1e4
ids <- create_ids(N)

benchmark <- function(ids, summarize) {
force(ids)
df <- data_frame(ids, n = 0)
gc()
if (summarize) {
system.time(group_by(df, ids) %>% summarize(n = mean(n)))
} else {
system.time(group_by(df, ids))
}
}

devtools::load_all()

# master:
#
# > # benchmark(ids, TRUE)
# > # benchmark(sample(ids, NN, replace = FALSE), TRUE)
# > # benchmark(sample(ids, NN, replace = TRUE), TRUE)
# > benchmark(ids, F .... [TRUNCATED]
# user system elapsed
# 4.440 0.032 4.469
#
# > benchmark(sample(ids, NN, replace = FALSE), FALSE)
# user system elapsed
# 2.164 0.000 2.166
#
# > benchmark(sample(ids, NN, replace = TRUE), FALSE)
# user system elapsed
# 3.176 0.000 3.175

# f:
# > benchmark(ids, TRUE)
# user system elapsed
# 2.500 0.024 2.522
#
# > benchmark(sample(ids, NN, replace = FALSE), TRUE)
# user system elapsed
# 2.320 0.000 2.319
#
# > benchmark(sample(ids, NN, replace = TRUE), TRUE)
# user system elapsed
# 2.584 0.000 2.584


NN <- 3e2

#gprofiler::start_profiler()

benchmark(ids, TRUE)

#gprofiler::stop_profiler()
#gprofiler::show_profiler_pdf()

#benchmark(sample(ids, NN, replace = FALSE), TRUE)
# benchmark(sample(ids, NN, replace = TRUE), TRUE)
# benchmark(ids, FALSE)
# benchmark(sample(ids, NN, replace = FALSE), FALSE)
# benchmark(sample(ids, NN, replace = TRUE), FALSE)
8 changes: 8 additions & 0 deletions issues/2231.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
library(dplyr)
d <- data_frame( x = rep(c(1,2), c(2,4)), y = 1:6, names = letters[1:6] )
d
res <- d %>% group_by(x) %>% summarise( y = list( setNames(y, names) ) ) %>% ungroup
res$y[[1]]
res$y[[2]]
names( res$y[[1]])
names( res$y[[2]])
5 changes: 5 additions & 0 deletions issues/2267.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
data1 <- data.frame(var1 = sample(c(1,2,3),50,replace=T), var2 = sample(c("cond1", "cond2"), 50,replace=T),RT = sample(as.numeric(300:1000),50,replace=T))

data1 <- data1 %>%
group_by(var1) %>%
mutate(median_var = median(RT[var2=="cond1"]))
15 changes: 15 additions & 0 deletions issues/2272.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
devtools::load_all()

df1 <- data.frame(a = c(1,2,NA), b = c(5,NA, NA))
df2 <- data.frame(a = c(1,NA,NA), c = c(9,8, NA))
left_join(df1, df2)

src <- src_sqlite("", create = TRUE)
sqlite1 <- copy_to(src, df1)
sqlite2 <- copy_to(src, df2)
left_join(sqlite1, sqlite2)

src <- src_postgres()
postgres1 <- copy_to(src, df1, temporary = TRUE, name = random_table_name())
postgres2 <- copy_to(src, df2, temporary = TRUE, name = random_table_name())
left_join(postgres1, postgres2)
17 changes: 17 additions & 0 deletions issues/2280.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
library("dplyr")
pings = read.csv(text = "
timestamp,round_start_timestamp,node,seq,nb_bytes,ttl,latency
1480525318.042879,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,2,64,42,527
1480525318.654011,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,3,64,42,138
1480525319.555820,1480525317.121227,fc92:bb4b:bff6:9102:693d:15b1:6443:3776,4,64,42,38.8
1480525330.320386,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,2,64,42,642
1480525330.876448,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,3,64,42,198
1480525331.898099,1480525329.48615,fc84:3c77:7149:24dc:7450:cade:4954:3b04,4,64,42,217
1480525330.268665,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,2,64,42,701
1480525331.107886,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,3,64,42,540
1480525332.268447,1480525329.4887602,fc42:9714:8805:0ed1:a8ff:ec45:a27f:739f,4,64,42,700
")

data = pings[pings$seq == 2,]
df_grouped = group_by(data, node)
df_lag = mutate(df_grouped, latency_change=df_grouped$latency - lag(df_grouped$latency))
9 changes: 9 additions & 0 deletions issues/2288.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
devtools::load_all(".")
src <- src_mysql("test", user = "muelleki")
src <- src_sqlite(":memory:", create = TRUE)
name <- dplyr:::random_table_name()
DBI::dbWriteTable(src$con, name, data_frame(a = 2:5))
data <- src %>% tbl(name)
data %>%
mutate(b = log(a), c = log(exp(1), a)) %>%
mutate(d = b * c)
6 changes: 6 additions & 0 deletions issues/2290.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
devtools::load_all(".")
mtcars2 <- copy_to(src_postgres(), mtcars, dplyr:::random_table_name())
mtcars2 %>%
group_by(cyl) %>%
arrange(disp) %>%
summarize(mpg2 = first(mpg))
32 changes: 32 additions & 0 deletions issues/2292.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
library(DBI)
library(dplyr)
"%||%" <- function(x, y) if(is.null(x)) y else x

db_disconnector <- function(con, name, quiet = FALSE) {
reg.finalizer(environment(), function(...) {
if (!quiet) {
message("Auto-disconnecting ", name, " connection ",
"(", paste(con@Id, collapse = ", "), ")")
}
dbDisconnect(con)
})
environment()
}

src_postgres2 <- function(dbname = NULL, host = NULL, port = NULL, user = NULL,
password = NULL, ...) {
if (!requireNamespace("RPostgres", quietly = TRUE)) {
stop("RPostgres package required to connect to postgres db", call. = FALSE)
}

user <- user %||% ""

con <- dbConnect(RPostgres::Postgres(), host = host %||% "", dbname = dbname %||% "",
user = user, password = password %||% "", port = port %||% "", ...)
info <- dbGetInfo(con)

src_sql("postgres", con,
info = info, disco = db_disconnector(con, "postgres"))
}

src_postgres2()
31 changes: 31 additions & 0 deletions issues/2293.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
set.seed(10)
df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
scrambled <- df[sample(nrow(df)), ]
wrong <- mutate(scrambled, running = cummax(cumsum(value)))
arrange(wrong, year)
## year value running
## 1 2000 0 30
## 2 2001 1 10
## 3 2002 4 30
## 4 2003 9 9
## 5 2004 16 26
## 6 2005 25 55
right <- mutate(scrambled, running = order_by(year, cummax(cumsum(value))))
arrange(right, year)
## year value running
## 1 2000 0 30
## 2 2001 1 30
## 3 2002 4 30
## 4 2003 9 30
## 5 2004 16 30
## 6 2005 25 55
right2 <- arrange(scrambled,year) %>%mutate(running = cummax(cumsum(value)))
arrange(right2, year)
## year value running
## 1 2000 0 0
## 2 2001 1 1
## 3 2002 4 5
## 4 2003 9 14
## 5 2004 16 30
## 6 2005 25 55
mutate(scrambled, running1 = order_by(year, cumsum(value)), running2 = order_by(year, cummax(running1))) %>% arrange(year)
2 changes: 2 additions & 0 deletions issues/2297.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
dplyr::data_frame(a = 1)
dplyr::order_by(10:1, cumsum(1:10))
5 changes: 5 additions & 0 deletions issues/2300.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
library(dplyr)

df_1 <- data_frame(a = as.integer(1:3), b = runif(3))
df_2 <- data_frame(a = as.factor(1:3), c = runif(3))
left_join(df_1, df_2)
17 changes: 17 additions & 0 deletions issues/2301.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
library(dplyr)
library(tidyr)

# works
df <- data.frame(key = c("a","b"), value = c(1,2))
df_spread <- df %>% spread(key, value)
mutate_if(df_spread, is.numeric, function(x) {x+1})

# fails with : Error in eval(expr, envir, enclos) : object 'b' not found
df <- data.frame(key = c("a","b-a"), value = c(1,2))
df_spread <- df %>% spread(key, value)
mutate_if(df_spread, is.numeric, function(x) {x+1})

# fails with: Error in parse(text = x) : <text>:1:3: unexpected symbol
df <- data.frame(key = c("a","c d"), value = c(1,2))
df_spread <- df %>% spread(key, value)
mutate_if(df_spread, is.numeric, function(x) {x+1})
30 changes: 30 additions & 0 deletions issues/2302.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
library('dplyr')
d <- data.frame(x=c(1,2,2),y=c(3,5,NA),z=c(NA,'a','b'),
rowNum=1:3,
stringsAsFactors = FALSE)
print(d)

fnam <- tempfile(pattern = "dplyr_doc_narm", tmpdir = tempdir(), fileext = "sqlite3")
my_db <- dplyr::src_sqlite(fnam, create = TRUE)
class(my_db)
dRemote <- copy_to(my_db,d,'d',rowNumberColumn='rowNum',overwrite=TRUE)


# correct calculation
dRemote %>% mutate(nna=0) %>%
mutate(nna=nna+ifelse(is.na(x),1,0)) %>%
mutate(nna=nna+ifelse(is.na(y),1,0)) %>%
mutate(nna=nna+ifelse(is.na(z),1,0))

# incorrect calculation (last step seems to always clobber the previous result)
dRemote %>% mutate(nna=0) %>%
mutate(nna=nna+is.na(x)) %>%
mutate(nna=nna+is.na(y)) %>%
mutate(nna=nna+is.na(z))

# clean up
rm(list=setdiff(ls(),'fnam'))
if(!is.null(fnam)) {
file.remove(fnam)
}
gc()
Loading