Data from Complex Samples¶
Data from social science surveys often come from complex samples. In order to achieve efficient or at least accurate inference one may need to take into account the sampling design in the computation of sample summaries, e.g. by the application of sampling weights. The chapter shows how the survey package can help to take into account the sampling design in data management and data analysis.
Below is the supporting material for the various sections of the chapter.
Survey Design Objects¶
## Creating a survey object from NHANES data ############################################
# First we activate the 'survey' package
library(survey)
# Next we make the NHANES data set available (which comes with the 'survey'
# package)
data(nhanes)
# Now we can take a look at the data set, it has 7 variables and 8581 observations
head(nhanes)
nrow(nhanes)
# According to the documentation, the primary sampling units are identified by
# the variable SDMVPSU, the strata from which the PSUs are drawn are
# identified by the variable SDMVSTRA, and the sample weights are WTMEC2YR.
# The sum of the sample weights corresponds to the population size:
with(nhanes,sum(WTMEC2YR))
# Here were create 'ordinary' sampling weights as they are common in social
# science data sets:
nhanes <- within(nhanes,{
smplw <- WTMEC2YR/sum(WTMEC2YR)*nrow(nhanes)
})
# We now create two survey design objects, one with the "population size"
# weights and one with the "sample size" weights:
try(design_pop <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA,
weights = ~WTMEC2YR, data = nhanes))
# First attempt fails: the cluster numbers are not nested in the strata
# We try again with 'nest = TRUE'
design_pop <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA,
weights = ~WTMEC2YR, data = nhanes,
nest = TRUE)
design_smpl <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA,
weights = ~smplw, data = nhanes,
nest = TRUE)
# We first estimate the population proportions with high cholesterol (HI_CHOL==1)
svymean(~HI_CHOL, design=design_pop, na.rm=TRUE)
svymean(~HI_CHOL, design=design_smpl, na.rm=TRUE)
# We secondly estimate the number of those with high cholesterol (HI_CHOL==1) in
# the population
svytotal(~HI_CHOL, design=design_pop, na.rm=TRUE)
svytotal(~HI_CHOL, design=design_smpl, na.rm=TRUE)
# Obviously, we need the "population size" weights for an unbiased estimate of
# the population total, otherwise we just get a sample total.
#
# Using the "population size" weights, we can reconstruct the sizes of the
# population strata:
nhanes <- within(nhanes,{
strata_size <- ave(WTMEC2YR,SDMVSTRA,FUN=sum)
})
design_pop <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA,
weights = ~WTMEC2YR, data = nhanes,
nest = TRUE, fpc = ~ stata_size)
# Of course, usually its the other way round: starting with the strata and the
# cluster IDs, the sampling probabilities and sampling weights are computed.
## Creating a survey data from ANES 2016 ###############################################
library(memisc)
anes_2016_sav <- spss.file("anes_timeseries_2016.sav")
# Loading a subset: Only pre-election waves and only
# face-to-face interviews
anes_2016_pre_work_ds <- subset(anes_2016_sav,
V160501 == 1,
select=c(
# According to docs, these are the
# sample weights for the
# face-to-face component
pre_w_f2f = V160101f,
# Face-to-face strata
strat_f2f = V160201f,
psu_f2f = V160202f,
pre_voted12 = V161005,
pre_recall12 = V161006,
pre_voted = V161026,
pre_vote = V161027,
pre_intov = V161030,
pre_voteint = V161031#,
))
library(magrittr) # For the '%<>%' operator
anes_2016_pre_work_ds %<>% within({
# Setting up recalled votes of 2012
# Since a "default" value for the remaining conditions
# is used, we use 'check.xor = FALSE' to avoid warnings.
recall12 <- cases(
'Did not vote' = 9 <- pre_voted12 == 2,
'Obama' = 1 <- pre_recall12 == 1,
'Romney' = 2 <- pre_recall12 == 2,
'Other' = 3 <- pre_recall12 == 5,
'Inap' = 99 <- TRUE, check.xor = FALSE
)
# Early voters
vote16_1 <- cases(
'Clinton' = 1 <- pre_voted == 1 & pre_vote == 1,
'Trump' = 2 <- pre_voted == 1 & pre_vote == 2,
'Other' = 3 <- pre_voted == 1 & pre_vote %in% 3:5,
'Inap' = 99 <- TRUE, check.xor = FALSE)
# Vote intentions
vote16 <- cases(
'Clinton' = 1 <- pre_intov == 1 & pre_voteint == 1,
'Trump' = 2 <- pre_intov == 1 & pre_voteint == 2,
'Other' = 3 <- pre_intov == 1 & pre_voteint %in% 3:6,
'Will not vote/Not registered' = 8 <- pre_intov %in% c(-1,2),
'Inap' = 99 <- TRUE, check.xor = FALSE)
vote16[] <- ifelse(vote16 == 99 & vote16_1 != 99,
vote16_1,
vote16)
})
anes_2016_prevote <- as.data.frame(anes_2016_pre_work_ds)
#Unweighted crosstable
xtabs(~ vote16 + recall12,
data=anes_2016_prevote)
library(survey)
anes_2016_prevote_desgn <- svydesign(id = ~psu_f2f,
strata = ~strat_f2f,
weights = ~pre_w_f2f,
data = anes_2016_prevote,
nest = TRUE)
anes_2016_prevote_desgn
# We reduce the digits after dot
ops <- options(digits=2)
(tab <- svytable(~ vote16 + recall12,
design = anes_2016_prevote_desgn))
# and drop counts of non-valid responses before
# we compute percentages
percentages(vote16 ~ recall12, data=tab[-6,-5])
options(ops)
# Here we compute a F-test of independence with the table, which uses the
# Rao-Scott second-order correction with a Satterthwaite approximation of the
# denominator degrees of freedom is used.
summary(tab)
# The more conventional Pearson-Chi-squared test adjusted with a design-effect
# estimate is obtained by a slight modification
summary(tab, statistic="Chisq")
Script file: survey-design-objects.R
Data file: anes_timeseries_2016.sav is available from
https://electionstudies.org/data-center/2016-time-series-study/
Add-on packages:
survey available from https://cran.r-project.org/package=survey
memisc available from https://cran.r-project.org/package=memisc
Survey Replication Weights¶
## Survey replicate weights in the CHIS data ###################################################
library(survey)
library(foreign)
# The file 'adult.dta' is downloaded from
# http://healthpolicy.ucla.edu/chis/data/Pages/GetCHISData.aspx
# and contains the 2005 wave of the California Health Interview Survey.
# Redistribution of the data is prohibited, so readers who want to preproduce
# the following will need to download their own copy of the data set
adult_chis <- read.dta("adult.dta",
warn.missing.labels=FALSE)
# The data set contains 80 set of (raked) replicate weights. They are in the
# variables named "rakedw1" through "rakedw80". Raked sampling weights are
# in raked0
# We obtain the column numbers of these variables, making use of our knowledge
# of the name pattern
repw <- which(names(adult_chis) %in% paste0("rakedw",1:80))
# To apecify replicate weights, we call the function 'svrepdesgin'.
# The first argument specifies the variables that will be used for data
# analysis. The 'weights' argument specifies sampling weights, while the
# function 'repweights' specifies the replicate weights. The 'data=' argument
# specifies the data frame where the data all come from.
# The 'combined.weights=' argument is needed here, because the replicate weights
# were constructed from sampling weights and "pure" replicate weights. Since we
# do not know the way the replicate weights were constructed we have to specify
# 'type="other"'.
adult_chis_rd <- svrepdesign(adult_chis[-repw],
weights=~rakedw0,
repweights=adult_chis[repw],
data=adult_chis,
combined.weights=TRUE,
type="other",
scale=1,rscales=1)
# With 'svymean()' we get the estimated proportions of the various categories of
# health insurance status in California 2005, along with standard errors,
# multiplying by 100, we get percentages.
100*svymean(~instyp_p, design=adult_chis_rd)
# With 'svytotal' we obtain estimates of how many people have which health
# insurance status.
svytotal(~instyp_p, design=adult_chis_rd)
## Replicate weights for the ANES 2016 data ####################################################
# The 'automatic' type gives jackknife replicates
anes_2016_prevote_jk <- as.svrepdesign(anes_2016_prevote_desgn,
type="auto")
# The number of replicates is determined by the number of clusters
anes_2016_prevote_jk
# Here we select the multistage rescaled bootstrap
anes_2016_prevote_boo <- as.svrepdesign(anes_2016_prevote_desgn,
type="mrbbootstrap")
anes_2016_prevote_boo
# By default the number of bootstrap replicates is 50, we can
# change it to 200
anes_2016_prevote_boo <- as.svrepdesign(anes_2016_prevote_desgn,
type="mrbbootstrap",
replicates=200)
anes_2016_prevote_boo
# A function to compute the percentage of 2012 Democratic and Republican voters
# who vote for a candidate of the same party in 2016
StayerPerc <- function(weights,data){
tab <- xtabs(weights~vote16+recall12,data=data)
# Remove 'inap' responses
tab <- tab[-6,-5]
# Column percentages
ptab <- 100*prop.table(tab,2)
# The diagonal are the percentages of 'stayers'
# among the voters of 2012.
# The first two elements of the diagonal are
# the Democratic and Republican stayers.
structure(diag(ptab)[1:2],
names=c("Democratic",
"Republican"))
}
# Estimates and replication based standard errors based on jackknife
withReplicates(anes_2016_prevote_jk,
StayerPerc)
withReplicates(anes_2016_prevote_boo,
StayerPerc)
Script file: survey-replication-weights.R
Data files:
adult.dtaavailable from http://healthpolicy.ucla.edu/chis/data/Pages/GetCHISData.aspxanes_timeseries_2016.savas processed in the previous script
Add-on package: survey available from https://cran.r-project.org/package=survey
Poststratification, Raking, and Calibration¶
## Poststratification of the ANES 2016 data set ########################################
# There must not be any missing values in the stratifying variables.
anes_2016_vprevote <- subset(anes_2016_prevote,
vote16 != "Inap" &
recall12 != "Inap"
)
# In order to make poststratification possible, we need to make sure that the
# levels of the stratification variables match the population
# information. Therefore we relabel the variables 'recall12' and 'vote16'.
anes_2016_vprevote <- within(anes_2016_vprevote,{
recall12 <- recall12[,drop=TRUE]
vote16 <- vote16[,drop=TRUE]
recall12 <- relabel(recall12,"Did not vote"="No vote")
vote16 <- relabel(vote16,
"Will not vote/Not registered"="No vote")
})
# Finally, we set up a survey design object
anes_2016_vprevote_desgn <- svydesign(id = ~psu_f2f,
strata = ~strat_f2f,
weights = ~pre_w_f2f,
data = anes_2016_vprevote,
nest = TRUE)
# We collect the electoral results of 2012
result.2012 = c(Obama = 65915795,
Romney = 60933504,
# Other candidates are combined
Other = sum(c(
Johson = 1275971,
Stein = 469627,
Others = 490510
)))
# The number of non-voters is computed from the sum of the results and census
# data on the population in voting age
result.2012 <- c(result.2012,
"No vote" = 235248000 - sum(result.2012))
# Here we collect the results for 2016
result.2016 <- c(Clinton = 65853514,
Trump = 62984828,
Other = sum(c(
Johnson = 4489341,
Stein = 1457218,
McMullin = 731991,
Others = 1154084
)))
result.2016 <- c(result.2016,
"No vote" = 250056000 - sum(result.2016))
# The poststratification function expect population data to be in the form of
# data frames.
pop.vote16 <- data.frame(
vote16=names(result.2016),
Freq=result.2016)
pop.recall12 <- data.frame(
recall12=names(result.2012),
Freq=result.2012/sum(result.2012)*sum(result.2016)
)
# We poststratify the sample design object by recalled vote in 2012
anes_2016_prevote_desgn_post <- postStratify(
anes_2016_vprevote_desgn,~recall12,population=pop.recall12)
# We compare the estimated percentages of 2012 votes
100*svymean(~recall12,design=anes_2016_vprevote_desgn)
100*svymean(~recall12,design=anes_2016_prevote_desgn_post)
# As should be expected, post-stratification eliminates the uncertainty about
# 2012 votes. It also corrects for turnout overreporting.
# We now compare the estimated percentages of 2016 votes
100*svymean(~vote16,design=anes_2016_vprevote_desgn)
100*svymean(~vote16,design=anes_2016_prevote_desgn_post)
# The percentages of Clinton voters and Trump voters are closer after
# poststratification.
# We rake the sample design object by recalled vote in 2012 and vote in 2016
anes_2016_prevote_desgn_rake <- rake(
anes_2016_vprevote_desgn,list(~recall12,~vote16),
population=list(pop.recall12,pop.vote16),
control=list(maxit=20))
# Of course, the marginal distribution of vote in 2012 and vote in 2016 is now
# no longer estimated, so that standard errors disappear.
100*svymean(~recall12,design=anes_2016_prevote_desgn_rake)
100*svymean(~vote16,design=anes_2016_prevote_desgn_rake)
# The actual percentage of non-voters in 2016 is obviously much higher than the
# one estimated from the sample, be it post-stratified or not.
# 'calibrate' expects the names of the calibration vectors to be like those of
# regression coefficents
cal_names(~recall12+vote16,anes_2016_vprevote_desgn)
# This function creates a vector suitable for calibration from the
# data frames that `postStratify()` or `rake()` use
calib_counts <- function(formula,frames){
dframe2coef <- function(data){
fname <- names(data)[1]
flevels <- as.character(data[[1]])
Freq <- data$Freq
coefs <- c(sum(Freq),Freq[-1])
names(coefs) <- c("(Intercept)",
paste0(fname,flevels[-1]))
coefs
}
vars <- all.vars(formula)
for(i in seq_along(vars)){
var_i <- vars[i]
frame_i <- frames[[var_i]]
coef_i <- dframe2coef(frame_i)
if(i==1)
res <- coef_i
else
res <- c(res,coef_i[-1])
}
res
}
calib_anes16 <- calib_counts(~recall12+vote16,
list(recall12=pop.recall12,
vote16=pop.vote16))
anes_2016_prevote_desgn_calib <- calibrate(
anes_2016_vprevote_desgn,~recall12+vote16,
population=calib_anes16)
100*svymean(~recall12,design=anes_2016_prevote_desgn_calib)
100*svymean(~vote16,design=anes_2016_prevote_desgn_calib)
# Let's compare the effect of poststratification and raking on the relation
# between variables.
#
# First, we create a table from the data with valid responses about voting
# behaviour in 2012 and 2016.
tab <- svytable(~ vote16 + recall12,
design = anes_2016_vprevote_desgn)
percentages(vote16 ~ recall12, data=tab)
# Second, we create a table from the poststatified data.
tab_post <- svytable(~ vote16 + recall12,
design = anes_2016_prevote_desgn_post)
percentages(vote16 ~ recall12, data=tab_post)
# Third, we create a table from the raked data.
tab_rak <- svytable(~ vote16 + recall12,
design = anes_2016_prevote_desgn_rake)
percentages(vote16 ~ recall12, data=tab_rak)
# Fourth, we create a table from the calibrated data
tab_calib <- svytable(~ vote16 + recall12,
design = anes_2016_prevote_desgn_calib)
percentages(vote16 ~ recall12, data=tab_calib)
# Poststratification does not alter percentages that are conditional on the
# variable used for poststratification. Yet raking does change the conditional percentages.
# To examine whether raking affects relations between recalled vote in 2012 and
# vote in 2016 we compute log-odds ratios:
log.odds <- function(x) log((x[1,1]/x[1,2])/(x[2,1]/x[2,2]))
# Log-odds ratios are a way to describe the relation between two dichotomous
# variables. Like correlations between continuous variables they are not
# affected by the marginal distribution.
log.odds(tab)
log.odds(tab_post)
log.odds(tab_rak)
log.odds(tab_calib)
# Clearly, both poststratfication and raking leaves log-odds ratios
# unaffected. Calibration has an effect, but this appears to be minor (at least
# in the present case).
Script file: poststratification-raking-calibration.R
Data file: anes_timeseries_2016.sav as processed in the previous script
Add-on package: survey available from https://cran.r-project.org/package=survey