Data Tables and the “Tidyverse”

This chapter discusses the improvements and new features introduced by the package data.table and some notable packages in the “tidyverse” collection of packages. In particular, the packages data.table and dplyr are compared in terms of the features they provide and how computationally efficient they are. Also the features for “tidying” data using the tidyr package are discussed.

Below is the supporting material for the various sections of the chapter.

Data Tables

Creating data tables

Subsetting data tables

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: subsetting-data-tables.R

    library(data.table)
    load("bes2010feelings.RData")
    setDT(bes2010feelings)
    
    sctl2010feelings <- bes2010feelings[region=="Scotland"]
    bes2010feelings.srtd <- bes2010feelings[order(wave,region)]
    
    
    bes2010feelings.sub <- bes2010feelings[,.(flng.brown,wave,region)]
    names(bes2010feelings.sub)
    
    str(bes2010feelings.sub)
    
    head(bes2010feelings.sub)
    
    sctl2010feelings <- bes2010feelings[region=="Scotland",
                                        .(wave,
                                          flng.brown,
                                          flng.cameron,
                                          flng.clegg,
                                          flng.salmond)]
    str(sctl2010feelings)
    head(sctl2010feelings)
    

    Required data file: bes2010feelings.RData (This data set is prepared from the original available at https://www.britishelectionstudy.com/data-object/2010-bes-cross-section/ by removing identifying information and scrambling the data)

    The script makes use of the data.table package, which is available from https://cran.r-project.org/package=data.table

Summarizing data tables

Modifying data tables

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: modifying-data-tables.R

    library(data.table)
    UK <- data.table(
               Population = c(55619400,1885400,5424800,3125000),
               Area = c(50301,5460,30090,8023),
               GVA = c(28096,20000,24800,19900),
               country = c("England",
                             "Northern Ireland",
                             "Scotland",
                             "Wales"))
    
    UK1 <- UK
    UK[,Density := Population/Area]
    UK
    UK1
    

    The script makes use of the data.table package, which is available from https://cran.r-project.org/package=data.table

The Tidyverse

Importing data using haven

Subsetting data with dplyr

Summarizing data with dplyr

Modifying data with dplyr

A Comparison between the Approaches

Creating group summaries

  • No interactive notebook is provided, due to the computational demands of the benchmarking script.

  • Script file: comparison-group-summaries.R

    library(data.table)
    library(dplyr)
    library(memisc)
    library(rbenchmark)
    
    grouped_summary_benchmark_1 <- benchmark(
        aggregate =
            aggregate(X1~a+b,data=BDataF, FUN=mean),
        `with + tapply` =
            with(BDataF,tapply(X1,list(a,b),mean)),
        data.table =
            BDataT[,mean(X1),by=.(a,b)],
        `group_by + summarize` =
            BDTbl %>% group_by(a,b) %>% summarize(mean(X1)),
        `select + group_by + summarize` =
            BDTbl %>% select(X1,a,b) %>% group_by(a,b) %>%
                      summarize(mean(X1)),
        withGroups =
            with(Groups(BDataF,~a+b),mean(X1)),
      columns = c("test","user.self","relative"),
      replications = 100,
      order = NULL,
      relative = "user.self"
    )
    grouped_summary_benchmark_1
    
    grouped_summary_benchmark_2 <- benchmark(
        aggregate =
            aggregate(X1~a+b,data=SDataF, FUN=mean),
        `with + tapply` =
            with(SDataF,tapply(X1,list(a,b),mean)),
        data.table =
            SDataT[,mean(X1),by=.(a,b)],
        `group_by + summarize` =
            SDTbl %>% group_by(a,b) %>% summarize(mean(X1)),
        `select + group_by + summarize` =
            SDTbl %>% select(X1,a,b) %>% group_by(a,b) %>%
                      summarize(mean(X1)),
        withGroups =
            with(Groups(SDataF,~a+b),mean(X1)),
      columns = c("test","user.self","relative"),
      replications = 100,
      order = NULL,
      relative = "user.self"
    )
    grouped_summary_benchmark_2
    
    save(grouped_summary_benchmark_1,
         grouped_summary_benchmark_2,
         file="grouped-summary-benchmark.RData")
    

    The script makes use of the following add-on packages:

Modifying data within groups

  • No interactive notebook is provided, due to the computational demands of the benchmarking script.

  • Script file: comparison-modifying-data-within-groups.R

    library(data.table)
    library(dplyr)
    library(memisc)
    library(rbenchmark)
    
    grouped_modification_benchmark_1 <- benchmark(
        within =
            within(BDataF,{
                X1c <- X1 - ave(X1, a,b,FUN = mean)
                X2c2 <- (X2 - ave(X2, a,b,FUN = mean))^2
                X3ca <- abs(X3 - ave(X3,a,b,FUN = median))
                X4cm <- X4 - ave(X4,a,b,FUN = max)
            }),
        data.table =
            BDataT[,`:=`(X1c = X1 - mean(X1),
                         X2c2 = (X2 - mean(X2))^2,
                         X3ca = abs(X3 - median(X3)),
                         X4cm = X4 - max(X4)),
                   by = .(a,b)],
        `group_by + mutate` =
            BDTbl %>% group_by(a,b) %>%
            mutate(X1c = X1 - mean(X1),
                   X2c2 = (X2 - mean(X2))^2,
                   X3ca = abs(X3 - median(X3)),
                   X4cm = X4 - max(X4)),
        withinGroups =
            withinGroups(BDataF, ~a+b, {
                X1c <- X1 - mean(X1)
                X2c2 <- (X2 - mean(X2))^2
                X3ca <- abs(X3 - median(X3))
                X4cm <- X4 - max(X4)
            }),
        columns = c("test","user.self","relative"),
        replications = 100,
        order = NULL,
        relative = "user.self"
    )
    grouped_modification_benchmark_1
    
    grouped_modification_benchmark_2 <- benchmark(
        within =
            within(SDataF,{
                X1c <- X1 - ave(X1, a,b,FUN = mean)
                X2c2 <- (X2 - ave(X2, a,b,FUN = mean))^2
                X3ca <- abs(X3 - ave(X3,a,b,FUN = median))
                X4cm <- X4 - ave(X4,a,b,FUN = max)
            }),
        data.table =
            SDataT[,`:=`(X1c = X1 - mean(X1),
                         X2c2 = (X2 - mean(X2))^2,
                         X3ca = abs(X3 - median(X3)),
                         X4cm = X4 - max(X4)),
                   by = .(a,b)],
        `group_by + mutate` =
            SDTbl %>% group_by(a,b) %>%
            mutate(X1c = X1 - mean(X1),
                   X2c2 = (X2 - mean(X2))^2,
                   X3ca = abs(X3 - median(X3)),
                   X4cm = X4 - max(X4)),
        withinGroups =
            withinGroups(SDataF, ~a+b, {
                X1c <- X1 - mean(X1)
                X2c2 <- (X2 - mean(X2))^2
                X3ca <- abs(X3 - median(X3))
                X4cm <- X4 - max(X4)
            }),
        columns = c("test","user.self","relative"),
        replications = 100,
        order = NULL,
        relative = "user.self"
    )
    grouped_modification_benchmark_2
    
    save(grouped_modification_benchmark_1,
         grouped_modification_benchmark_2,
         file="grouped-modification-benchmark.RData")
    

    The script makes use of the following add-on packages:

Comparison summary

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: comparison-summary-tables.R

    library(data.table)
    library(dplyr)
    library(memisc)
    library(rbenchmark)
    
    bench_matrix <- function(x){
        rn <- x$test
        x <- as.matrix(x[,-1])
        rownames(x) <- rn
        x
    }
    
    load("grouped-summary-benchmark.RData")
    
    grouped_summary_benchmark_1 <- bench_matrix(grouped_summary_benchmark_1)
    grouped_summary_benchmark_2 <- bench_matrix(grouped_summary_benchmark_2)
    
    grouped_summary_benchmark <- memisc::collect(
        "`Big data'"    = grouped_summary_benchmark_1,
        "`Survey data'" = grouped_summary_benchmark_2)
    grouped_summary_benchmark <- grouped_summary_benchmark[-5,,]
    colnames(grouped_summary_benchmark) <- c("abs.","rel.")
    names(dimnames(grouped_summary_benchmark)) <- c("Method","Timing","Data")
    
    ftable(grouped_summary_benchmark,col.vars=3:2)
    
    load("grouped-modification-benchmark.RData")
    
    grouped_modification_benchmark_1 <- bench_matrix(grouped_modification_benchmark_1)
    grouped_modification_benchmark_2 <- bench_matrix(grouped_modification_benchmark_2)
    
    grouped_modification_benchmark <- collect(
        "`Big data'"    = grouped_modification_benchmark_1,
        "`Survey data'" = grouped_modification_benchmark_2)
    colnames(grouped_modification_benchmark) <- c("abs.","rel.")
    names(dimnames(grouped_modification_benchmark)) <- c("Method","Timing","Data")
    
    ftable(grouped_modification_benchmark,col.vars=3:2)
    

    The following data sets are required:

    The script makes use of the following add-on packages:

Tidying Data Using the tidyr Package

Reshaping data with gather() and pivot_longer() into a long arrangement

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: reshaping-to-long-with-tidyr.R

    # Inspecting the file
    substr(readLines("gini-oecd.tsv",n=5),start=1,stop=50)
    
    library(readr)
    
    gini.oecd <- read_tsv("gini-oecd.tsv",
                          skip=1)
    gini.oecd
    
    library(tidyr)
    
    gini.oecd %>% gather(`2007`,`2008`,`2009`,`2010`,`2011`,`2012`,
                          `2013`,`2014`,`2015`,`2016`,`2017`,
                          key="year",value="gini") -> gini.oecd.long
    
    gini.oecd %>% gather(-Location,
                          key="year",value="gini") -> gini.oecd.long
    
    gini.oecd %>% pivot_longer(-Location,
                               names_to="year",
                               values_to="gini") -> gini.oecd.long
    gini.oecd.long
    

    Required data file: gini-oecd.tsv (scraped from https://data.oecd.org at 9 Dec 2019)

    The script makes use of the following add-on packages:

Reshaping data with spread() and pivot_wider() into a wides arrangement

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: reshaping-to-wide-with-tidyr.R

    substr(readLines("inequality-oecd-downloaded.csv",n=5),
           start=1,stop=40)
    
    inequality.oecd.dld <- read_csv("inequality-oecd-downloaded.csv")
    inequality.oecd.dld
    
    library(tidyr)
    inequality.oecd.dld %>% spread(key="SUBJECT",value="Value") ->
                                                   inequality.oecd
    inequality.oecd[-c(2,4,6)]
    
    library(dplyr)
    inequality.oecd.sub <- select(inequality.oecd.dld,
                                  LOCATION,SUBJECT,TIME,Value)
    inequality.oecd.sub
    
    inequality.oecd.sub %>% spread(key=SUBJECT,
                                   value=Value) -> inequality.oecd
    inequality.oecd
    
    
    inequality.oecd.dld %>% pivot_wider(names_from=SUBJECT,
                                        values_from=Value,
                                        id_cols=c(LOCATION,TIME)) ->
                                                    inequality.oecd
    inequality.oecd
    

    Required data file: inequality-oecd-downloaded.csv (manually downloaded from https://data.oecd.org at 9 Dec 2019)

    The script makes use of the following add-on packages:

Filling missing values with fill() and completing data by missing values with complete()

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: filling-and-completing-with-tidyr.R

    library(tidyr)
    library(readr)
    
    ## Filling missing values with 'fill()' #############################################################
    
    messy_data_str <- "
    country,  year,var1, var2
    Rodinia,  1297,  67, -3.0
    ,         1298,  69, -2.9
    ,         1299,  70, -2.8
    Pannotia, 1296,  73, -4.1
    ,         1297,  74, -3.9
    ,         1298,  75, -3.9
    Pangaea,  1296,  54, -1.2
    ,         1297,  53, -1.1
    ,         1298,  52, -1.0
    ,         1299,  51, -0.9
    "
    
    messy_data_str %>% read_csv() -> messy_data
    messy_data
    
    messy_data %>% fill(country) -> filled_data
    filled_data
    
    ## Completing data by missing values with 'complete()' ##############################################
    
    filled_data %>% complete(crossing(country,year))
    

    The script makes use of the following add-on packages: