Social Science Surveys

This chapter introduces the extension package memisc, which is specifically designed to address several of the challenges discussed in the previous chapter. It shows how system memory can be saved by importing only subsets of variables and observations; how variables can be renamed so that results are easier to interpret; and how certain kinds of metadata that a basic R installation does not provide for, such as value labels and user-defined missing values, can be used. The chapter also provides examples of more complex recodings of variables (e.g. the construction of Goldthorpe class categories for households from ISCO-coded occupations of survey respondents) and of the creation of codebooks.

Below is the supporting material for the various sections of the chapter.

Importing survey data

Importing data from the British Election Study in SPSS format

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: importing-BES-data.R

    ## Preparing Data from the 1983 British Election Study ##############################################
    ## (Note that the data must be downloaded from a different website.)
    
    library(memisc) # The functions used here are in this package.
    # The first step: Obtaining the location and description of the survey data file.
    # (The file is named *.sav, but it is opened here as an SPSS portable file.)
    BES.1983.por <- spss.portable.file("83BES.sav")
    description(BES.1983.por)
    
    # The second step: Loading a subset of the data - only the variables that are
    # really needed for the analysis. The variables are renamed on the fly:
    # each entry in 'select' has the form  new.name = original.name.
    BES.1983.classvot <- subset(
        BES.1983.por,
        select=c(
            voted          = Q7A,
            vote           = Q9A,
            lrself         = Q46G,
            fglclass       = FGLCLASS,
            rglclass       = RGLCLASS,
            sglclass       = SGLCLASS,
            ethnicity      = Q64A,
            gender         = Q64B,
            age            = Q56,
            educ           = Q59A,
            religion       = Q63A,
            religatt       = Q63B
        ))
    description(BES.1983.classvot)
    
    # The next three snippets are alternative ways of declaring measurement
    # levels; they are shown side by side for illustration.
    
    # Declaring the measurement level of a single variable
    measurement(BES.1983.classvot$lrself) <- "interval"
    
    # Declaring the measurement level of several variables
    BES.1983.classvot <- within(BES.1983.classvot,{
       measurement(lrself) <- "interval"
       measurement(age) <- "interval"
       measurement(educ) <- "interval"
    })
    
    # Declaring the measurement level of several variables using a loop
    BES.1983.classvot <- within(BES.1983.classvot,{
       foreach(var=c(lrself,age,educ),{
               measurement(var) <- "interval"
       })
    })
    
    # Obtaining the codebook of a single variable
    codebook(BES.1983.classvot["age"])
    
    # Declaring the missing values for this variable
    missing.values(BES.1983.classvot$age) <- 99
    
    # Declaring missing values for several variables at once.
    # Assigning to missing.values() replaces any earlier declaration for the
    # same variable, so each variable is declared exactly once here.
    
    BES.1983.classvot <- within(BES.1983.classvot,{
        missing.values(voted)     <- 9
        missing.values(vote)      <- 95:99
        missing.values(lrself)    <- 95:99
        missing.values(fglclass)  <- 0
        missing.values(rglclass)  <- 0
        missing.values(sglclass)  <- 0
        missing.values(ethnicity) <- 9
        missing.values(gender)    <- 9
        # NOTE(review): an earlier 'missing.values(age) <- 9' directly before
        # this line was dropped because it was overwritten here. If code 9 was
        # meant for another variable (possibly 'educ'), declare it there.
        missing.values(age)       <- 98:99
        missing.values(religion)  <- 99
        missing.values(religatt)  <- 7:9
    })
    
    # The codebook of the resulting data set object
    codebook(BES.1983.classvot)
    
    # Saving the prepared data set for use in later scripts
    save(BES.1983.classvot,
         file="BES-1983-classvot.RData")
    

    Required data file: 83BES.sav, which is available from https://www.britishelectionstudy.com/data-object/1983-bes-cross-section/ (Note that the file available from this link has the filename extension .sav, but [last time I checked] it is actually in SPSS “portable” format, so the filename extension .por would have been more appropriate.)

    The script makes use of the memisc package, which is available from https://cran.r-project.org/package=memisc

Importing data from the American National Election Study in ASCII format with accompanying SPSS code

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: importing-BES-data.R

    ## Preparing Data from the 1983 British Election Study ##############################################
    ## (Note that the data must be downloaded from a different website.)
    ## NOTE(review): this listing repeats the British Election Study import
    ## script and reads "83BES.sav"; it does not use the ANES files listed as
    ## required for this section. The ANES import script appears to be missing.
    
    library(memisc) # The functions used here are in this package.
    # The first step: Obtaining the location and description of the survey data file.
    # (The file is named *.sav, but it is opened here as an SPSS portable file.)
    BES.1983.por <- spss.portable.file("83BES.sav")
    description(BES.1983.por)
    
    # The second step: Loading a subset of the data - only the variables that are
    # really needed for the analysis. The variables are renamed on the fly:
    # each entry in 'select' has the form  new.name = original.name.
    BES.1983.classvot <- subset(
        BES.1983.por,
        select=c(
            voted          = Q7A,
            vote           = Q9A,
            lrself         = Q46G,
            fglclass       = FGLCLASS,
            rglclass       = RGLCLASS,
            sglclass       = SGLCLASS,
            ethnicity      = Q64A,
            gender         = Q64B,
            age            = Q56,
            educ           = Q59A,
            religion       = Q63A,
            religatt       = Q63B
        ))
    description(BES.1983.classvot)
    
    # The next three snippets are alternative ways of declaring measurement
    # levels; they are shown side by side for illustration.
    
    # Declaring the measurement level of a single variable
    measurement(BES.1983.classvot$lrself) <- "interval"
    
    # Declaring the measurement level of several variables
    BES.1983.classvot <- within(BES.1983.classvot,{
       measurement(lrself) <- "interval"
       measurement(age) <- "interval"
       measurement(educ) <- "interval"
    })
    
    # Declaring the measurement level of several variables using a loop
    BES.1983.classvot <- within(BES.1983.classvot,{
       foreach(var=c(lrself,age,educ),{
               measurement(var) <- "interval"
       })
    })
    
    # Obtaining the codebook of a single variable
    codebook(BES.1983.classvot["age"])
    
    # Declaring the missing values for this variable
    missing.values(BES.1983.classvot$age) <- 99
    
    # Declaring missing values for several variables at once.
    # Assigning to missing.values() replaces any earlier declaration for the
    # same variable, so each variable is declared exactly once here.
    
    BES.1983.classvot <- within(BES.1983.classvot,{
        missing.values(voted)     <- 9
        missing.values(vote)      <- 95:99
        missing.values(lrself)    <- 95:99
        missing.values(fglclass)  <- 0
        missing.values(rglclass)  <- 0
        missing.values(sglclass)  <- 0
        missing.values(ethnicity) <- 9
        missing.values(gender)    <- 9
        # NOTE(review): an earlier 'missing.values(age) <- 9' directly before
        # this line was dropped because it was overwritten here. If code 9 was
        # meant for another variable (possibly 'educ'), declare it there.
        missing.values(age)       <- 98:99
        missing.values(religion)  <- 99
        missing.values(religatt)  <- 7:9
    })
    
    # The codebook of the resulting data set object
    codebook(BES.1983.classvot)
    
    # Saving the prepared data set for use in later scripts
    save(BES.1983.classvot,
         file="BES-1983-classvot.RData")
    

    Required data files: anes2008TS_dat.txt, anes2008TS_col.sps, anes2008TS_lab.sps, anes2008TS_cod.sps, anes2008TS_md.sps which are available from https://electionstudies.org/data-center/2008-time-series-study/ (as ASCII variant of the data).

    The script makes use of the memisc package, which is available from https://cran.r-project.org/package=memisc

Recoding and other transformations

Recoding data from the British Election Study

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: recoding-BES.R

    library(memisc)
    ## This script continues with the British Election Study subset that was
    ## prepared and saved by the previous script.
    load("BES-1983-classvot.RData")
    
    # Variant 1: collapse the categories of 'vote' into just four by copying
    # the variable and overwriting the codes that are to be merged.
    BES.1983.classvot <- within(BES.1983.classvot, {
      vote.new <- vote
      vote.new[vote %in% 3:5] <- 3
      vote.new[vote %in% c(6:10, 97)] <- 4
    })
    # Inspect the result.
    codebook(BES.1983.classvot$vote.new)
    
    # Variant 2: the same collapsing with 'recode()' from the 'memisc' package,
    # which is somewhat more convenient. Each rule reads  new code <- old codes;
    # otherwise = "copy" is passed for the codes not mentioned in any rule.
    BES.1983.classvot <- within(BES.1983.classvot, {
      vote.new <- recode(
        vote,
        3 <- 3:5,
        4 <- c(6:10, 97),
        otherwise = "copy"
      )
    })
    # Inspect the result.
    codebook(BES.1983.classvot$vote.new)
    
    # Variant 3: 'BES.1983.classvot' is a "data.set" object rather than a data
    # frame, so value labels can be supplied while recoding. The argument names
    # are the labels attached to the new codes.
    BES.1983.classvot <- within(BES.1983.classvot, {
      vote.new <- recode(
        vote,
        Conservative  = 1 <- 1,
        Labour        = 2 <- 2,
        Alliance      = 3 <- 3:5,
        Other         = 4 <- c(6:10, 97),
        "Didn't vote" = 5 <- 0,
        DK            = 8 <- 98,
        Refused       = 9 <- 95
      )
    })
    # Inspect the result.
    codebook(BES.1983.classvot$vote.new)
    

    Required data file: BES-1983-classvot.RData, which is created in the first example.

    The script makes use of the memisc package, which is available from https://cran.r-project.org/package=memisc

Combining variables using case distinctions

  • Interactive notebook: https://mybinder.org/badge_logo.svg

  • Script file: recoding-GLES.R

    library(memisc)
    ## This script works with example data from the 2017 German Longitudinal
    ## Election Study. It combines pre- and post-election variables into a single
    ## party-preference variable for the first (candidate) vote and another one
    ## for the second (list) vote.
    
    gles2017.sav <- spss.system.file("ZA6802_en_v3-0-1.sav")
    description(gles2017.sav)
    
    # Import only the variables needed here; each entry in 'select' has the
    # form  new.name = original.name.
    gles2017.vote <- subset(
      gles2017.sav,
      select = c(
        survey              = survey1,
        pre.turnout.int     = v10,
        post.turnout        = n10,
        pre.voteint.first   = v11ab,
        pre.voteint.second  = v11bb,
        post.vote.first     = n11ab,
        post.vote.second    = n11bb,
        pre.postvote.first  = v12ab,
        pre.postvote.second = v12bb
      )
    )
    codebook(gles2017.vote)
    
    gles2017.vote <- within(gles2017.vote, {
      # Case distinctions: every argument has the form  condition -> value.
      vote.first <- cases(
        survey == 0 & pre.turnout.int == 6     -> pre.postvote.first,
        survey == 0 & pre.turnout.int %in% 4:5 -> -85,
        survey == 0 & pre.turnout.int %in% 1:3 -> pre.voteint.first,
        survey == 1 & post.turnout == 1        -> post.vote.first,
        survey == 1 & post.turnout == 2        -> -85,
        TRUE                                   -> -97
      )
      vote.second <- cases(
        survey == 0 & pre.turnout.int == 6     -> pre.postvote.second,
        survey == 0 & pre.turnout.int %in% 4:5 -> -85,
        survey == 0 & pre.turnout.int %in% 1:3 -> pre.voteint.second,
        survey == 1 & post.turnout == 1        -> post.vote.second,
        survey == 1 & post.turnout == 2        -> -85,
        TRUE                                   -> -97
      )
      # Turn the results into survey items, reusing the value labels of the
      # corresponding pre-election variables.
      vote.first <- as.item(vote.first, labels = labels(pre.postvote.first))
      vote.second <- as.item(vote.second, labels = labels(pre.postvote.second))
      # Declare 1 to 900 as the valid range of both combined variables.
      valid.range(vote.second) <- c(1, 900)
      valid.range(vote.first) <- c(1, 900)
    })
    
    codebook(gles2017.vote[c("vote.first", "vote.second")])
    

    Required data file: ZA6802_en_v3-0-1.sav, which is available from https://doi.org/10.4232/1.13236

    The script makes use of the memisc package, which is available from https://cran.r-project.org/package=memisc