下面是@大卫Arenburg的解决方案的标杆存在,以及一些解决方案的回顾张贴在这里(@mnel,@Sven海恩斯坦,@Henrik):
library(dplyr)
library(data.table)
library(microbenchmark)
library(tidyr)
library(ggplot2)
df <- mtcars
DT <- as.data.table(df)
DT_32k <- rbindlist(replicate(1e3, mtcars, simplify = FALSE))
df_32k <- as.data.frame(DT_32k)
DT_32M <- rbindlist(replicate(1e6, mtcars, simplify = FALSE))
df_32M <- as.data.frame(DT_32M)
bench <- microbenchmark(
base_32 = aggregate(hp ~ cyl, df, function(x) length(unique(x))),
base_32k = aggregate(hp ~ cyl, df_32k, function(x) length(unique(x))),
base_32M = aggregate(hp ~ cyl, df_32M, function(x) length(unique(x))),
dplyr_32 = summarise(group_by(df, cyl), count = n_distinct(hp)),
dplyr_32k = summarise(group_by(df_32k, cyl), count = n_distinct(hp)),
dplyr_32M = summarise(group_by(df_32M, cyl), count = n_distinct(hp)),
data.table_32 = DT[, .(count = uniqueN(hp)), by = cyl],
data.table_32k = DT_32k[, .(count = uniqueN(hp)), by = cyl],
data.table_32M = DT_32M[, .(count = uniqueN(hp)), by = cyl],
times = 10
)
结果:
print(bench)
情节:
as_tibble(bench) %>%
group_by(expr) %>%
summarise(time = median(time)) %>%
separate(expr, c("framework", "nrow"), "_", remove = FALSE) %>%
mutate(nrow = recode(nrow, "32" = 32, "32k" = 32e3, "32M" = 32e6),
time = time / 1e3) %>%
ggplot(aes(nrow, time, col = framework)) +
geom_line() +
scale_x_log10() +
scale_y_log10() + ylab("microseconds")
会话信息:
sessionInfo()