library("stringr")
library("dplyr")
library("ggplot2")


# Set working directories
## Assuming that 'YOURPATH' is the directory where
## 'ReplicationFiles.zip' was unpacked.
loc <- "YOURPATH/ReplicationFiles/R_Coverage_Replication/"
## Assuming that R is run from the working directory is where the script is
## located
loc <- "../"

## Read the simulation results: direct context effect
# First the linear ones
LinearNoCLI.files <-  list.files(str_c(loc, "RSimulations/Linear/CDiff-0/"), pattern = "NoCLI")
for (i in 1:length(LinearNoCLI.files)) {
	if (i == 1) {
		sims.lin <- readRDS(str_c(loc, "RSimulations/Linear/CDiff-0/", LinearNoCLI.files[i]))
	} else {
		sims.lin <- dplyr::bind_rows(sims.lin, readRDS(str_c(loc, "RSimulations/Linear/CDiff-0/", LinearNoCLI.files[i])))
	}
}

# Next the linear ones with compositional differences
LinearNoCLI05.files <-  list.files(str_c(loc, "RSimulations/Linear/CDiff-05/"), pattern = "NoCLI")
for (i in 1:length(LinearNoCLI05.files)) {
	if (i == 1) {
		sims.lin05 <- readRDS(str_c(loc, "RSimulations/Linear/CDiff-05/", LinearNoCLI05.files[i]))
	} else {
		sims.lin05 <- dplyr::bind_rows(sims.lin05, readRDS(str_c(loc, "RSimulations/Linear/CDiff-05/", LinearNoCLI05.files[i])))
	}
}

# Next the probit ones
ProbitNoCLI.files <-  list.files(str_c(loc, "RSimulations/Probit/"), pattern = "NoCLI")
for (i in 1:length(ProbitNoCLI.files)) {
	if (i == 1) {
		sims.probit <- readRDS(str_c(loc, "RSimulations/Probit/", ProbitNoCLI.files[i]))
	} else {
		sims.probit <- dplyr::bind_rows(sims.probit, readRDS(str_c(loc, "RSimulations/Probit/", ProbitNoCLI.files[i])))
	}
}

# Select vars and bind together
sims.lin            <- sims.lin[, c("icc", "cli", "clusts", "estimator", "beta.z", "se.z", "sd.cons", "convinfo1", "convinfo2", "dfsat.z")]
sims.lin$link       <- "Linear"
sims.lin05            <- sims.lin05[, c("icc", "cli", "clusts", "estimator", "beta.z", "se.z", "sd.cons", "convinfo1", "convinfo2", "dfsat.z")]
sims.lin05$link       <- "Linear05"
sims.probit         <- sims.probit[, c("icc", "cli", "clusts", "estimator", "beta.z", "se.z", "sd.cons", "convinfo1", "convinfo2")]
sims.probit$dfsat.z <- NA
sims.probit$link    <- "Probit"

sims <- data.frame(rbind(sims.lin, sims.lin05, sims.probit))
# Clean up
rm(LinearNoCLI.files, LinearNoCLI05.files, ProbitNoCLI.files, sims.lin, sims.lin05, sims.probit)
gc()

## 1.3 Recoding, redefinition, and renaming
# 1.3.1 Define the outcomes as numeric variables
sims[, -grep("conv|cli|icc|clusts|estimator|link", colnames(sims))] <- sapply(sims[, -grep("conv|cli|icc|clusts|estimator|link", colnames(sims))], as.character)
sims[, -grep("conv|cli|icc|clusts|estimator|link", colnames(sims))] <- sapply(sims[, -grep("conv|cli|icc|clusts|estimator|link", colnames(sims))], as.numeric)

# Calculate random intercept variance
sims$var.cons <- sims$sd.cons^2

options(dplyr.width = Inf)
options(dplyr.print_max = Inf)

# Inspect data, check plausibility
sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(count =n(),
			  min = min(sd.cons),
			  p1 = quantile(sd.cons, probs = .01),
			  p5 = quantile(sd.cons, probs = .05),
			  p50 = quantile(sd.cons, probs = .5),
			  p95 = quantile(sd.cons, probs = .95),			 
			  p99 = quantile(sd.cons, probs = .99),
			  max = max(sd.cons)) %>%
	ungroup()

sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(count = n(),
			  min = min(beta.z),
			  p1 = quantile(beta.z, probs = .01),
			  p5 = quantile(beta.z, probs = .05),
			  p50 = quantile(beta.z, probs = .5),
			  p95 = quantile(beta.z, probs = .95),			 
			  p99 = quantile(beta.z, probs = .99),
			  max = max(beta.z)) %>%
	ungroup()

# Flag cases that are 10 x IQR below 1st/above 3rd quartile
sims <- sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	mutate(flag_sd.cons = sd.cons < quantile(sd.cons, probs = .25) - 10 * IQR(sd.cons) |
						  sd.cons > quantile(sd.cons, probs = .75) + 10 * IQR(sd.cons),
		   flag_beta.z = beta.z < quantile(beta.z, probs = .25) - 10 * IQR(beta.z) |
						  beta.z > quantile(beta.z, probs = .75) + 10 * IQR(beta.z)) %>%
	ungroup()

table(sims$flag_sd.cons, sims$flag_beta.z)
table(sims$estimator, sims$flag_sd.cons)
table(sims$estimator, sims$flag_beta.z)
table(sims$convinfo1, sims$flag_sd.cons)
table(sims$convinfo1, sims$flag_beta.z)
# Outliers concentrated among REML/EQL Probits with convergence warnings

# Drop flagged replications
temp <- filter(sims, !flag_sd.cons & !flag_beta.z)
nrow(sims)
nrow(sims)-nrow(temp) # Dropping 105 out of 420,000
sims <- temp
rm(temp)
gc()

# Add reference values for intercept variance (from DGP)
sims$var.true[sims$link == "Probit" & sims$icc == 5] <- .05263158
sims$var.true[sims$link == "Probit" & sims$icc == 10] <- .1111111
sims$var.true[sims$link == "Probit" & sims$icc == 15] <- .1764706

sims$var.true[grepl("Linear", sims$link) & sims$icc == 5] <- .1052632
sims$var.true[grepl("Linear", sims$link) & sims$icc == 10] <- .2222222
sims$var.true[grepl("Linear", sims$link) & sims$icc == 15] <- .3529412

## 3. Aggregation by experimental condition (type of model, scenario, and so on)
# (True coefficient of z is .2 for all experimental conditions)

# How many missings
sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	select(beta.z, dfsat.z, se.z, var.cons) %>%
	summarize_all(funs(sum(is.na(.)))) %>%
	ungroup()
# Essentially no missings, except by design:
# Satterthwaite DF only available for linear models and we only obtained them 
# for the case of ICC == 10 & no compositional differences

# Seven replications with missing SE for z, all of these for REML/EQL
table(filter(sims, is.na(se.z))$estimator)
# Drop them
sims <- filter(sims, !is.na(se.z))

simsag <- sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(
		sim.count       = n(),
		prop.missing = (5000 -n())/5000 * 100,

		cov95sat.z      = (mean(beta.z - qt(.975, df = dfsat.z)*se.z <= .2 & beta.z + qt(.975, df = dfsat.z)*se.z >= .2) - 0.95) * 100,
		cov95MminL.z    = (mean(beta.z - qt(.975, df = clusts-2)*se.z <= .2 & beta.z + qt(.975, df = clusts-2)*se.z >= .2) - 0.95) * 100,
		cov95norm.z     = (mean(beta.z - qnorm(.975)*se.z <= .2 & beta.z + qnorm(.975)*se.z >= .2) - 0.95) * 100,
		biasVCcons      = ((mean(var.cons) - mean(var.true)) / mean(var.true)) * 100,
		min95biasVCcons = ((mean(var.cons) - qnorm(.975) * (sd(var.cons) / sqrt(sim.count)) - mean(var.true)) / mean(var.true)) * 100,
		max95biasVCcons = ((mean(var.cons) + qnorm(.975) * (sd(var.cons) / sqrt(sim.count)) - mean(var.true)) / mean(var.true)) * 100
	)

# Inspect and save
data.frame(simsag)
# Looks good ()
saveRDS(simsag, str_c(loc,'RSimulations/NoCrossLevelAggregate_GO.rds'))
rm(sims, simsag)
gc()






## Read the simulation results: cross level interaction
# First the linear ones
LinearCLI.files <-  list.files(str_c(loc, "RSimulations/Linear/CDiff-0/"), pattern = "_CLI_")
for (i in 1:length(LinearCLI.files)) {
	if (i == 1) {
		sims.lin <- readRDS(str_c(loc, "RSimulations/Linear/CDiff-0/", LinearCLI.files[i]))
	} else {
		sims.lin <- dplyr::bind_rows(sims.lin, readRDS(str_c(loc, "RSimulations/Linear/CDiff-0/", LinearCLI.files[i])))
	}
}

# Next the linear ones with compositional differences
LinearCLI05.files <-  list.files(str_c(loc, "RSimulations/Linear/CDiff-05/"), pattern = "_CLI_")
for (i in 1:length(LinearCLI05.files)) {
	if (i == 1) {
		sims.lin05 <- readRDS(str_c(loc, "RSimulations/Linear/CDiff-05/", LinearCLI05.files[i]))
	} else {
		sims.lin05 <- dplyr::bind_rows(sims.lin05, readRDS(str_c(loc, "RSimulations/Linear/CDiff-05/", LinearCLI05.files[i])))
	}
}

# Next the probit ones
ProbitCLI.files <-  list.files(str_c(loc, "RSimulations/Probit/"), pattern = "_CLI_")
for (i in 1:length(ProbitCLI.files)) {
	if (i == 1) {
		sims.probit <- readRDS(str_c(loc, "RSimulations/Probit/", ProbitCLI.files[i]))
	} else {
		sims.probit <- dplyr::bind_rows(sims.probit, readRDS(str_c(loc, "RSimulations/Probit/", ProbitCLI.files[i])))
	}
}

# Select vars and bind together
sims.lin      <- sims.lin[, c("icc", "cli", "clusts", "estimator", "beta.z", "se.z", "beta.x", "se.x", "beta.x.z", "se.x.z", "sd.cons", "sd.x", "convinfo1", "convinfo2", "dfsat.z", "dfsat.x", "dfsat.x.z")]
sims.lin$link <- "Linear"
sims.lin05      <- sims.lin05[, c("icc", "cli", "clusts", "estimator", "beta.z", "se.z", "beta.x", "se.x", "beta.x.z", "se.x.z", "sd.cons", "sd.x", "convinfo1", "convinfo2", "dfsat.z", "dfsat.x", "dfsat.x.z")]
sims.lin05$link <- "Linear05"
sims.probit   <- sims.probit[, c("icc", "cli", "clusts", "estimator", "beta.z", "se.z", "beta.x", "se.x", "beta.x.z", "se.x.z", "sd.cons", "sd.x", "convinfo1", "convinfo2")]
sims.probit$link      <- "Probit"
sims.probit$dfsat.z   <- NA
sims.probit$dfsat.x   <- NA
sims.probit$dfsat.x.z <- NA

sims <- as.data.frame(rbind(sims.lin, sims.lin05, sims.probit))
# Clean up
rm(LinearCLI.files, LinearCLI05.files, ProbitCLI.files, sims.lin, sims.lin05, sims.probit)
gc()

## 1.3 Recoding, redefinition, and renaming
# 1.3.1 Define the outcomes as numeric variables
sims[, -grep("conv|cli|icc|clusts|estimator|link", colnames(sims))] <- sapply(sims[, -grep("conv|cli|icc|clusts|estimator|link", colnames(sims))], as.character)
sims[, -grep("conv|cli|icc|clusts|estimator|link", colnames(sims))] <- sapply(sims[, -grep("conv|cli|icc|clusts|estimator|link", colnames(sims))], as.numeric)


# Calculate Intercept variance
sims$var.cons <- sims$sd.cons^2
sims$var.x    <- sims$sd.x^2

options(dplyr.width = Inf)
options(dplyr.print_max = Inf)

# Inspect data, check plausibility

sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(count = n(),
			  min = min(sd.cons, na.rm = TRUE),
			  p1 = quantile(sd.cons, probs = .01, na.rm = TRUE),
			  p5 = quantile(sd.cons, probs = .05, na.rm = TRUE),
			  p50 = quantile(sd.cons, probs = .5, na.rm = TRUE),
			  p95 = quantile(sd.cons, probs = .95, na.rm = TRUE),			 
			  p99 = quantile(sd.cons, probs = .99, na.rm = TRUE),
			  max = max(sd.cons, na.rm = TRUE)) %>%
	ungroup()

sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(count = n(),
			  min = min(beta.z, na.rm = TRUE),
			  p1 = quantile(beta.z, probs = .01, na.rm = TRUE),
			  p5 = quantile(beta.z, probs = .05, na.rm = TRUE),
			  p50 = quantile(beta.z, probs = .5, na.rm = TRUE),
			  p95 = quantile(beta.z, probs = .95, na.rm = TRUE),			 
			  p99 = quantile(beta.z, probs = .99, na.rm = TRUE),
			  max = max(beta.z, na.rm = TRUE)) %>%
	ungroup()

# Inspect data, check plausibility
sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(count =n(),
			  min = min(sd.x, na.rm = TRUE),
			  p1 = quantile(sd.x, probs = .01, na.rm = TRUE),
			  p5 = quantile(sd.x, probs = .05, na.rm = TRUE),
			  p50 = quantile(sd.x, probs = .5, na.rm = TRUE),
			  p95 = quantile(sd.x, probs = .95, na.rm = TRUE),			 
			  p99 = quantile(sd.x, probs = .99, na.rm = TRUE),
			  max = max(sd.x, na.rm = TRUE)) %>%
	ungroup()

sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(count = n(),
			  min = min(beta.x.z, na.rm = TRUE),
			  p1 = quantile(beta.x.z, probs = .01, na.rm = TRUE),
			  p5 = quantile(beta.x.z, probs = .05, na.rm = TRUE),
			  p50 = quantile(beta.x.z, probs = .5, na.rm = TRUE),
			  p95 = quantile(beta.x.z, probs = .95, na.rm = TRUE),			 
			  p99 = quantile(beta.x.z, probs = .99, na.rm = TRUE),
			  max = max(beta.x.z, na.rm = TRUE)) %>%
	ungroup()

sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(count = n(),
			  min = min(beta.x, na.rm = TRUE),
			  p1 = quantile(beta.x, probs = .01, na.rm = TRUE),
			  p5 = quantile(beta.x, probs = .05, na.rm = TRUE),
			  p50 = quantile(beta.x, probs = .5, na.rm = TRUE),
			  p95 = quantile(beta.x, probs = .95, na.rm = TRUE),			 
			  p99 = quantile(beta.x, probs = .99, na.rm = TRUE),
			  max = max(beta.x, na.rm = TRUE)) %>%
	ungroup()

# Flag cases that are 10 x IQR below 1st/above 3rd quartile
sims <- sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	mutate_at(vars(sd.cons, beta.z, sd.x, beta.x.z, beta.x),
			  funs(flag = . < quantile(., probs = .25, na.rm = TRUE) - 10 * IQR(., na.rm = TRUE) |
						  . > quantile(., probs = .75, na.rm = TRUE) + 10 * IQR(., na.rm = TRUE))) %>%
	ungroup()

lapply(select(sims, matches("flag")), function(x) table(sims$estimator, x))
# Again few extreme outliers and these are concentrated among REML/EQL probit

# Drop flagged replications
temp <- filter(sims, !sd.cons_flag & !beta.z_flag & !sd.x_flag & !beta.x.z_flag & !beta.x_flag)
nrow(sims)
nrow(sims) - nrow(temp) # 217 out of 420,000 reps dropped
sims <- temp
rm(temp)

# How many missings
sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	select(beta.z, dfsat.z, se.z, beta.x.z, se.x.z, beta.x, se.x, var.x, var.cons) %>%
	summarize_all(funs(sum(is.na(.)))) %>%
	ungroup()
# Very few missings, except by design:
# Satterthwaite DF only available for linear models and we only obtained them 
# for the case of ICC == 10 & no compositional differences

# 53 replications with missing SEs for fixed effects, all of these for REML/EQL
table(filter(sims, is.na(se.z))$estimator)
# Drop them
sims <- filter(sims, !is.na(se.z))

# Double check
sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	select(beta.z, dfsat.z, se.z, beta.x.z, se.x.z, beta.x, se.x, var.x, var.cons) %>%
	summarize_all(funs(sum(is.na(.)))) %>%
	ungroup()
# All zero now, except for Satterthwaite DF

# Reference (i.e. true) values for random intercept variance
# (var of random slope is always .3)
sims$var.true[sims$link == "Probit" & sims$icc == 5] <- .05263158
sims$var.true[sims$link == "Probit" & sims$icc == 10] <- .1111111
sims$var.true[sims$link == "Probit" & sims$icc == 15] <- .1764706

sims$var.true[grepl("Linear", sims$link) & sims$icc == 5] <- .1052632
sims$var.true[grepl("Linear", sims$link) & sims$icc == 10] <- .2222222
sims$var.true[grepl("Linear", sims$link) & sims$icc == 15] <- .3529412

## 3. Aggregation by type of model, scenario, and so on
simsag <- sims %>% group_by(icc, cli, clusts, estimator, link) %>%
	summarize(
		sim.count    = n(),
		prop.missing = (5000 - n())/5000 * 100,

		# True coefficient for x*z is .1
		cov95sat.x.z   = (mean(beta.x.z - qt(.975, df = dfsat.x.z)*se.x.z <= .1 & beta.x.z + qt(.975, df = dfsat.x.z)*se.x.z >= .1) - 0.95) *100,
		cov95MminL.x.z = (mean(beta.x.z - qt(.975, df = clusts-2)*se.x.z <= .1 & beta.x.z + qt(.975, df = clusts-2)*se.x.z >= .1) - 0.95) *100,
		cov95norm.x.z  = (mean(beta.x.z - qnorm(.975)*se.x.z <= .1 & beta.x.z + qnorm(.975)*se.x.z >= .1) - 0.95) *100,

		# True coefficient for z is .2
		cov95sat.z     = (mean(beta.z - qt(.975, df = dfsat.z)*se.z <= .2 & beta.z + qt(.975, df = dfsat.z)*se.z >= .2) - 0.95) *100,
		cov95MminL.z   = (mean(beta.z - qt(.975, df = clusts-2)*se.z <= .2 & beta.z + qt(.975, df = clusts-2)*se.z >= .2) - 0.95) *100,
		cov95norm.z    = (mean(beta.z - qnorm(.975)*se.z <= .2 & beta.z + qnorm(.975)*se.z >= .2) - 0.95) *100,

		# True coefficient for x is .25
		cov95sat.x     = (mean(beta.x - qt(.975, df = dfsat.x)*se.x <= .25 & beta.x + qt(.975, df = dfsat.x)*se.x >= .25) - 0.95) *100,
		cov95MminL.x   = (mean(beta.x - qt(.975, df = clusts-2)*se.x <= .25 & beta.x + qt(.975, df = clusts-2)*se.x >= .25) - 0.95) *100,
		cov95norm.x     = (mean(beta.x - qnorm(.975)*se.x <= .25 & beta.x + qnorm(.975)*se.x >= .25) - 0.95) *100,

		# True value of random intercept variance depends on (conditional) ICC
		# For random slope on x it is always .3
		biasVCcons      = ((mean(var.cons) - mean(var.true)) / mean(var.true)) * 100,
		biasVCx         = ((mean(var.x) - 0.3) / 0.3) * 100,

		# Confidence intervals for bias estimates
	    min95biasVCcons = ((mean(var.cons) - qnorm(.975) * (sd(var.cons) / sqrt(sim.count)) - mean(var.true)) / mean(var.true)) * 100,
	    max95biasVCcons = ((mean(var.cons) + qnorm(.975) * (sd(var.cons) / sqrt(sim.count)) - mean(var.true)) / mean(var.true)) * 100,
	    min95biasVCx = ((mean(var.x) - qnorm(.975) * (sd(var.x) / sqrt(sim.count)) - 0.3) / 0.3) * 100,
	    max95biasVCx = ((mean(var.x) + qnorm(.975) * (sd(var.x) / sqrt(sim.count)) - 0.3) / 0.3) * 100
	)
data.frame(simsag)
saveRDS(simsag, str_c(loc,'RSimulations/CrossLevelAggregate_GO.rds'))

