First Add Some R Packages to the Workspace.
Caution: warning messages are suppressed to reduce clutter in the output.
tidyverse: Importing data, cleaning data, data manipulation, & data visualization
kableExtra: Build HTML tables
DataExplorer: Exploratory Data Analysis & Feature Engineering
tableone: Standardized mean differences for before and after matching
survey: Matched data with match weights
Matching: Propensity score matching
cobalt: Covariate balance
reshape2: Covariate balance plot
rbounds: Rosenbaum Sensitivity test
library(tidyverse)
library(kableExtra)
library(DataExplorer)
library(tableone)
library(survey)
library(Matching)
library(cobalt)
library(reshape2)
library(rbounds)
select <- dplyr::select # Resolves package conflicts with select
options(width = 120) # Format print width
General functions used throughout the analysis.
# Update palN for Chem 24 Spring 2019 ------------------------------------------
update.chem24s19 <- function(chem.dat) {
PAL.course.data <- read_rds("palCourseData.rds")
chem24.S19 <- PAL.course.data %>%
filter(term == "Spring 2019", course == "CHEM 24")
# Add a palN indicator for Chem 24 Spring 2019
chem24.S19 <- chem24.S19 %>%
mutate(palN.chem24.S19 = case_when(
pal.grade == "CR" ~ 2,
is.na(pal.grade) ~ 0,
TRUE ~ 1
)) %>%
select(emplid, palN.chem24.S19)
# Check how many student are non-PAL, incomplete PAL, and PAL
table(chem24.S19$palN.chem24.S19)
# 0 1 2
# 51 10 52
chem.dat <- left_join(chem.dat, chem24.S19, by= "emplid" )
chem.dat <- chem.dat %>%
mutate(palN = case_when(
course == "CHEM 24" & term == "Spring 2019" ~ palN.chem24.S19,
TRUE ~ palN
)) %>%
select(-palN.chem24.S19)
return(chem.dat)
}
# Get raw table of mean gpa for PAL and non-PAL -------------------------------
get.raw.tab <- function(classes, df)
{
raw.table = data.frame(class=character(),
nonPALavg=numeric(),
PALavg=numeric(),
Diff=numeric(),
NonPAL_Num= integer(),
PAL_Num=integer(),
CompletePAL=numeric(),
TermPALStart=integer(),
row.names=NULL,
stringsAsFactors = FALSE)
for (i in 1:length(classes))
{
curr.class = classes[i]
temp = subset(df, course==curr.class & course.seq==0)
pal.start=min(unique(temp$term.code[temp$palN==2]))
# only include terms after PAL start term
temp = subset(temp, term.code>= pal.start)
x=tapply(temp$grd.pt.unt,temp$palN,
mean, na.rm=T) %>%
as.numeric %>%
round(2)
y=table(temp$palN) %>% as.numeric
raw.table[i, 'class' ] = curr.class
raw.table[i, c(2:4,7)]=c(x[1], x[3],x[3]-x[1],
round(y[3]/sum(y),2))
raw.table[i, c(5,6,8)]= c(y[1], y[3], pal.start)
}
# formatted table
raw.table <- kable(raw.table, caption = "Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)") %>%
kable_styling(full_width= T, position = "left")
return(raw.table)
}
# Data cleaning ----------------------------------------------------------------
clean.data <- function(df)
{
# Replaced coh.term with coh.term.course
yr.course.taken = as.numeric(gsub(".*([0-9]{4})","\\1",df$coh.term.course))
df$delay.from.hs = ifelse(!is.na(yr.course.taken) & !is.na(df$hs.grad.yr),
yr.course.taken-df$hs.grad.yr, NA)
sum(is.na(df$delay.from.hs))
# remove students who did not complete PAL
df=subset(df, palN!=1)
#recode palN to factor with 0/1 levels
df$palN = ifelse(df$palN==2, 1, 0)
#clean up category names in m.rmd and e.rmd
df$m.rmd[df$m.rmd=="Not Remedial\nin Math"]="Not Remedial in Math"
df$m.rmd[df$m.rmd=="Remedial\nin Math"]="Remedial in Math"
df$e.rmd[df$e.rmd=="Not Remedial\nin English"]="Not Remedial in English"
df$e.rmd[df$e.rmd=="Remedial\nin English"]="Remedial in English"
df <- df %>% mutate(m.rmd = factor(m.rmd), e.rmd = factor(e.rmd))
# table(df$e.rmd)
# Create feature, proportion of cumulative units taken that were passes
# To distinguish the students who have taken 0 units from the students who
# have passed 0 units they have taken, students who have taken 0 units are
# labeled as -1. Then the -1 is replaced by the mean of cum.percent.units.passed
df <- df %>%
mutate(cum.percent.units.passed = ifelse(tot.taken.prgrss.start == 0, -1,
tot.passd.prgrss.start / tot.taken.prgrss.start)) %>%
mutate(cum.percent.units.passed = ifelse(cum.percent.units.passed == -1, mean(cum.percent.units.passed, na.rm =TRUE),
cum.percent.units.passed ))
# code instructor as alphabetic letter for anonymity
df$Instructor_01=droplevels(factor(df$Instructor_01))
instructor.vec = sort(unique(df$Instructor_01))
num.instr = length(instructor.vec)
df$Instructor_01 = factor(
df$Instructor_01, levels = instructor.vec, labels=as.character(1:num.instr)
)
key.instr.code = cbind(as.character(instructor.vec), 1:num.instr)
# Add "cMaj", census majors without concentrations/specializations/tracks/etc.
major_lookup <- read.csv("Census Major Lookup.csv", header = TRUE,
stringsAsFactors = FALSE)
df <- merge(df, major_lookup %>% select(censusMajor, cMaj),
by = "censusMajor", all.x = TRUE)
# Recode mother's education and father's education variables.
non.hs.grad= c("No High School","Some High School")
hs.grad= c("High School Graduate","Some College","2-Year College Graduate")
coll.grad= c("4-Year College Graduate","Postgraduate")
parent.ed.levels= c(
"Non-HS Graduate","HS Graduate", "College Graduate", "Unknown"
)
df <- df %>%
mutate(
mother.ed = ifelse(mother.ed %in% non.hs.grad, "Non-HS Graduate",
ifelse(mother.ed %in% hs.grad, "HS Graduate",
ifelse(mother.ed %in% coll.grad, "College Graduate", "Unknown"))),
mother.ed= factor(mother.ed, levels= parent.ed.levels),
father.ed = ifelse(father.ed %in% non.hs.grad,"Non-HS Graduate",
ifelse(father.ed %in% hs.grad, "HS Graduate",
ifelse(father.ed %in% coll.grad, "College Graduate", "Unknown"))),
father.ed= factor(father.ed, levels= parent.ed.levels))
# Recoded adm.area with these counties as local: 'El Dorado', 'Nevada',
# 'Placer', 'Sacramento', 'San Joaquin', 'Solano', 'Yolo'.
counties.rad <- read_csv(
"countiesRadius120mi.csv",
col_types = cols(
state = col_skip(), city = col_skip(), distance.km = col_skip()
)
)
df <- left_join(df, counties.rad, by = "zip")
local.adm.counties <- c(
'El Dorado', 'Nevada', 'Placer', 'Sacramento', 'San Joaquin', 'Solano',
'Yolo'
)
# County will be NA if the zip code is not within 120 mile radius of
# CSUS zip code(95819)
df <- df %>%
mutate(
adm.area =
if_else(!(county %in% local.adm.counties) | is.na(county),
"nonlocal", "local")
) %>%
mutate(sac.county.flg =
if_else(!(county == "Sacramento") | is.na(county), 0, 1)
) %>%
mutate(sac.county.flg = factor(sac.county.flg))
return(df)
}
# Extract prerequisite course grade ---------------------------------------------
get.prereq.grades <- function(course.df, df, prereq) {
# Get student's recent Chem 1B grade
course.stu <- course.df$emplid
prereq.df <- df %>%
select(emplid, course, course.seq, grd.pt.unt, grade) %>%
filter(emplid %in% course.stu, course== prereq) %>%
group_by(emplid) %>%
filter(course.seq == max(course.seq)) %>%
rename(
prereq.course.seq = course.seq, prereq.grd.pt.unt = grd.pt.unt,
prereq.grade = grade
) %>%
select(-course)
dim(prereq.df) # [1] 275 6
prereq.stu <- prereq.df$emplid
course.df <- course.df %>%
filter(emplid %in% prereq.stu)
course.df <- left_join(course.df, prereq.df, by = "emplid")
return(course.df)
}
# Get only the variables that have missing values ---------------------------------------------
get.missing.only <- function(course.df) {
get.missing.only <- course.df %>%
summarise(across(everything(), ~ sum(is.na(.x)))) %>%
gather() %>%
filter(value != 0)
get.missing.only <- course.df %>%
dplyr::select(all_of(get.missing.only$key))
return(get.missing.only)
}
# Get imbalanced variables with SMD > 0.1------------------------------------
get.imbal.vars <- function(tab)
{
get.imbal.vars <- as.data.frame(ExtractSmd(tab))
get.imbal.vars <- get.imbal.vars %>%
rownames_to_column(var = "Variable") %>%
rename(`Before Matching SMD` = `1 vs 2`) %>%
filter(`Before Matching SMD` > 0.1) %>%
arrange(desc(`Before Matching SMD`))
get.imbal.vars <- kable(
get.imbal.vars, caption = "Variables with SMD > 0.1"
) %>%
kable_styling(full_width= F)
return(get.imbal.vars)
}
# Unadjusted means -------------------------------------------------------------
get.unadj.means <- function(df.final)
{
get.unadj.means <- df.final %>%
group_by(palN) %>% summarise(unadj.means = mean(grd.pt.unt)) %>%
pivot_wider(names_from = "palN", values_from = "unadj.means") %>%
rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
mutate(Diff. = `PAL`-`Non-PAL`)
get.unadj.means<- kable(
get.unadj.means, caption = "Unadjusted Mean Grades"
) %>%
kable_styling(full_width= F)
return(get.unadj.means)
}
# Adjusted means --------------------------------------------------------------
adj.means <- function(match.list, matched.dat) {
get.adj.means <- matched.dat %>%
group_by(palN) %>%
summarise(adj.means = weighted.mean(grd.pt.unt, match.list$weights)) %>%
pivot_wider(names_from = "palN", values_from = "adj.means") %>%
rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
mutate(Diff. = `PAL`-`Non-PAL`)
# formatted table
get.adj.means<- kable(get.adj.means, caption = "Adjusted Mean Grades") %>%
kable_styling(full_width= F)
return(get.adj.means)
}
# Match Table ------------------------------------------------------------------
create.match.tab <- function(matched.dat) {
matched.dat <- matched.dat %>%
mutate(pal = if_else(palN == 0, "Non-PAL", "PAL"))
pal.flg <- c('Non-PAL', 'PAL')
for (i in seq_along(pal.flg)) {
multiple.matches <- matched.dat %>%
filter(pal ==pal.flg[i]) %>%
count(id) %>%
filter(n> 1) %>%
summarise(n())
single.matches <- matched.dat %>%
filter(pal == pal.flg[i]) %>%
count(id) %>%
filter(n==1) %>%
summarise(n())
if(pal.flg[i] == 'Non-PAL') {
match.tab <- bind_rows(single.matches, multiple.matches)
match.tab <- match.tab %>%
rename('Non-PAL'= 'n()')
}
pal.matches <- bind_rows(single.matches, multiple.matches)
match.tab$PAL <- pal.matches$`n()`
row.names(match.tab) <- c("Single Matches", "Multiple Matches")
}
match.tab <-rbind(
match.tab, "Total Students" = c(sum(match.tab$`Non-PAL`), sum(match.tab$`PAL`))
)
match.tab <- kable(match.tab, caption = "PAL and Non-PAL Matches") %>%
kable_styling(full_width= F)
return(match.tab)
}
# ATT plot ---------------------------------------------------------------------
# https://livefreeordichotomize.com/2019/01/17/understanding-propensity-score-weighting/
# https://www.csus.edu/brand/colors.html
get.att.plot <- function(df.final, match.list)
{
df.final$p.score <- p.score
df.final <- df.final %>%
select(-id) %>%
rownames_to_column(var = "id")
ps.dat <- df.final %>%
select(id, palN, p.score) %>%
pivot_wider(
names_from = "palN", values_from = "p.score", names_prefix = "b.pal."
)
before.match <- ps.dat %>%
select(b.pal.0, b.pal.1)
matched.dat <- df.final[unlist(match.list[c("index.treated", "index.control")]), ]
matched.dat$match.weights<- c(match.list$weights, match.list$weights)
after.match <-matched.dat %>%
select(-id) %>%
rownames_to_column(var = "id")
after.match <- after.match %>%
pivot_wider(names_from = "palN", values_from = "p.score", names_prefix = "pal.")
after.match <- after.match %>%
select(pal.0, pal.1, match.weights)
get.att.plot <- ggplot() +
geom_histogram(data = before.match, bins = 50, aes(b.pal.1), alpha = 0.5) +
geom_histogram(data = after.match,bins = 50, aes(pal.1, weight = match.weights),
fill = "#043927", alpha = 0.5) +
geom_histogram(data = before.match, bins = 50, alpha = 0.5,
aes(x = b.pal.0, y = -..count..)) +
geom_histogram(data = after.match, bins = 50,
aes(x = pal.0, weight = match.weights, y = -..count..),
fill = "#c4b581", alpha = 0.5) +
ylab("Count") + xlab("Propensity Scores") +
geom_hline(yintercept = 0, lwd = 0.5) +
scale_y_continuous(label = abs)
return(get.att.plot)
}
# Variable Percent Improvement -------------------------------------------------
get.var.perc.tab <- function(list.bal) {
get.var.perc.tab <- list.bal %>%
pluck("Balance") %>%
rownames_to_column("Variable") %>%
dplyr::select("Variable", "Type", "Diff.Un","Diff.Adj") %>%
mutate(`% Improvement` = if_else(Diff.Un == 0, 0, round(((abs(Diff.Un) - abs(Diff.Adj))/ abs(Diff.Un)) * 100 , 0))) %>%
arrange(desc(`% Improvement`))
get.var.perc.tab <- get.var.perc.tab %>% dplyr::select("Variable", "Diff.Un", "Diff.Adj", `% Improvement`)
return(get.var.perc.tab)
}
# Covariate Balance Plots -------------------------------------------------------
# https://cran.r-project.org/web/packages/tableone/vignettes/smd.html
# https://www.csus.edu/brand/colors.html
get.bal.plot <- function(unmatched.tab, matched.tab) {
## Construct a data frame containing variable name and SMD from all methods
dataPlot <- data.frame(variable = rownames(ExtractSmd(unmatched.tab)),
Unmatched = as.numeric(ExtractSmd(unmatched.tab)),
Matched = as.numeric(ExtractSmd(matched.tab)) )
## Create long-format data for ggplot2
dataPlotMelt <- melt(data = dataPlot,
id.vars = c("variable"),
variable.name = "Method",
value.name = "SMD")
## Order variable names by magnitude of SMD
varNames <- as.character(dataPlot$variable)[order(dataPlot$Unmatched)]
## Order factor levels in the same order
dataPlotMelt$variable <- factor(dataPlotMelt$variable,
levels = varNames)
## Plot using ggplot2
# Sac State colors and dashed line
get.bal.plot <-ggplot(
data = dataPlotMelt, mapping =
aes(x = variable, y = SMD, group = Method, color= Method)) +
scale_color_manual(values = c("#043927", "#c4b581")) +
geom_line(aes(linetype = Method)) +
geom_point() +
scale_linetype_manual(values= c("dashed", "solid")) +
geom_hline(yintercept = 0.1, color = "black", size = 0.1) +
coord_flip() +
theme_bw() + theme(legend.key = element_blank())
return(get.bal.plot)
}
# PAL Effect -------------------------------------------------------------------
get.pal.effect <- function(match.list, matched.dat, course) {
get.gamma <- psens(match.list, Gamma=2.0, GammaInc = 0.1)[["bounds"]] %>%
filter(`Lower bound` < 0.05 & 0.05 < `Upper bound`) %>%
slice_min(Gamma) %>%
select(Gamma)
get.pal.effect <- matched.dat %>%
group_by(palN) %>%
summarise(adj.means = weighted.mean(grd.pt.unt, match.list$weights)) %>%
pivot_wider(names_from = "palN", values_from = "adj.means") %>%
rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
mutate(Course= course, .before= "Non-PAL") %>%
mutate(Diff. = `PAL`-`Non-PAL`) %>%
mutate(`Std. error`= match.list$se, .after= "Diff.") %>%
mutate(
`p-val`= formatC( 1-pnorm(Diff./`Std. error`), format = "e", digits = 2),
Sensitivity= get.gamma$Gamma,
`N(non-PAL)`= length(unique(match.list$index.control)),
`N(PAL)`= match.list$wnobs
)
return(get.pal.effect)
}
Specialized functions for each course.
## BIO 22 ====================================================================
## Filter to relevant variables
bio22.step.vars <- function(course.df) {
vars.to.keep <- c(
'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 'course.count',
'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa',
'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason', 'sac.county.flg',
'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'sat.math.score',
'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg', 'AP_CALBC.flg',
'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
)
new.vars <- intersect(vars.to.keep, names(bio22.dat))
bio22.final <- bio22.dat[ ,new.vars]
return(bio22.final)
}
## Build a Logistic Regression Model for Propensity Score
## Fit Propensity Score model (linear terms only)
bio22.step <- function(final.df) {
# AP_CALAB
min.model <- glm(
palN ~ cum.percent.units.passed + eth.erss + gender + sat.math.score +
sat.verbal.score + sat.math.flg + AP_CALAB + AP_CALAB.flg,
data= bio22.final, family=binomial
)
summary(min.model)
biggest <- formula(glm(palN ~. - grd.pt.unt, data=bio22.final, family=binomial))
bio22.step.first.order <- step(
min.model, direction="forward", scope = biggest, trace=FALSE, k=2)
summary(bio22.step.first.order)
bio22.step.first.order$anova
model.first.order <- formula(bio22.step.first.order)
bio22.first.order.prop.model <- glm(
model.first.order, data=bio22.final, family=binomial
)
return(bio22.first.order.prop.model)
}
## CHEM 1A ====================================================================
## Filter to relevant variables
chem1a.step.vars <- function(course.df) {
vars.to.keep <- c(
'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age',
'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
'eth.erss', 'father.ed','fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa',
'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason', 'sac.county.flg',
'term.units.attemptedCensus','palN', 'grd.pt.unt', 'sat.math.score',
'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg', 'AP_CALBC.flg',
'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
)
new.vars <- intersect(vars.to.keep, names(chem1a.dat))
chem1a.final <- chem1a.dat[ ,new.vars]
return(chem1a.final)
}
## Build a Logistic Regression Model for Propensity Score
## Fit Propensity Score model (linear terms only)
chem1a.step <- function(final.df) {
# Stepwise selection selected AP_CALAB.flg, AP_BIOL.flg, AP_CHEM, and
# AP_CHEM.flg
min.model <- glm(
palN ~ cum.percent.units.passed + eth.erss + gender + sat.math.score +
sat.verbal.score + sat.math.flg + AP_CALAB + AP_CALAB.flg + AP_BIOL +
AP_BIOL.flg + AP_CHEM + AP_CHEM.flg, data= chem1a.final, family=binomial
)
summary(min.model)
biggest <- formula(
glm(palN ~. - grd.pt.unt, data=chem1a.final, family=binomial)
)
chem1a.step.first.order <- step(
min.model, direction="forward", scope = biggest, trace=FALSE, k=2
)
summary(chem1a.step.first.order)
chem1a.step.first.order$anova
model.first.order <- formula(chem1a.step.first.order)
chem1a.first.order.prop.model <- glm(
model.first.order, data=chem1a.final, family=binomial
)
return(chem1a.first.order.prop.model)
}
## CHEM 1B ====================================================================
## Filter to relevant variables
chem1b.step.vars <- function(course.df) {
vars.to.keep <- c(
'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age',
'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa',
'Instructor_01','median.income','m.rmd', 'mother.ed', 'pct.female.head',
'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason', 'sac.county.flg',
'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'chem1a.grd.pt.unt',
'AP_BIOL', 'AP_CALAB', 'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg',
'AP_CALBC.flg', 'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
)
new.vars <- intersect(vars.to.keep, names(chem1b.dat))
chem1b.final <- chem1b.dat[ ,new.vars]
return(chem1b.final)
}
## Build a Logistic Regression Model for Propensity Score
## Fit Propensity Score model (linear terms only)
chem1b.step <- function(final.df) {
# Stepwise selection selected AP_BIOL.flg and AP_CHEM.flg
# Removed AP_BIOL.flg. Then stepwise selection selected AP_CALAB.flg.
# Removed AP_CALAB.flg and pct.female.head.flg
min.model <- glm(
palN ~ chem1a.grd.pt.unt + cum.percent.units.passed + eth.erss + gender +
AP_CHEM + AP_CHEM.flg, data= chem1b.final, family=binomial
)
summary(min.model)
biggest <- formula(
glm(palN ~. - grd.pt.unt - AP_BIOL.flg - AP_CALAB.flg - pct.female.head.flg,
data=chem1b.final, family=binomial)
)
chem1b.step.first.order <- step(min.model,
direction="forward",scope = biggest,
trace=FALSE, k=2)
summary(chem1b.step.first.order)
chem1b.step.first.order$anova
model.first.order <- formula(chem1b.step.first.order)
chem1b.first.order.prop.model <- glm(model.first.order, data=chem1b.final, family=binomial)
return(chem1b.first.order.prop.model)
}
## CHEM 4 ====================================================================
## Filter to relevant variables
chem4.step.vars <- function(course.df)
{
vars.to.keep <- c(
'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age',
'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
'eth.erss', 'father.ed','fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa',
'Instructor_01','median.income','m.rmd', 'mother.ed', 'pct.female.head',
'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason', 'sac.county.flg',
'term.units.attemptedCensus', 'palN', 'grd.pt.unt','sat.math.score',
'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg', 'AP_CALBC.flg',
'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
)
new.vars <- intersect(vars.to.keep, names(chem4.dat))
chem4.final <- chem4.dat[ ,new.vars]
return(chem4.final)
}
## Build a Logistic Regression Model for Propensity Score
## Fit Propensity Score model (linear terms only)
chem4.step <- function(final.df)
{
# "AP_BIOL"
min.model <- glm(
palN ~ cum.percent.units.passed + eth.erss + gender+ sat.math.score +
sat.verbal.score+sat.math.flg + AP_CALAB+AP_CALAB.flg, data= chem4.final,
family=binomial
)
summary(min.model)
biggest <- formula(
glm(palN ~. - grd.pt.unt - AP_BIOL, data=chem4.final, family=binomial)
)
chem4.step.first.order <- step(
min.model, direction="forward", scope = biggest, trace=FALSE, k=2)
summary(chem4.step.first.order)
chem4.step.first.order$anova
model.first.order <- formula(chem4.step.first.order)
chem4.first.order.prop.model <- glm(
model.first.order, data=chem4.final, family=binomial
)
return(chem4.first.order.prop.model)
}
## CHEM 24 ====================================================================
## Filter to relevant variables
chem24.step.vars <- function(course.df)
{
vars.to.keep <- c(
'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age',
'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa',
'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason', 'sac.county.flg',
'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'chem1b.grd.pt.unt',
'chem1b.term.gpa', 'chem1b.units.attempted', 'AP_BIOL', 'AP_CALAB',
'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg', 'AP_CALBC.flg',
'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
)
new.vars <- intersect(vars.to.keep, names(chem24.dat))
chem24.final <- chem24.dat[ ,new.vars]
return(chem24.final)
}
## Build a Logistic Regression Model for Propensity Score
## Fit Propensity Score model (linear terms only)
chem24.step <- function(final.df) {
min.model <- glm(
palN ~ chem1b.grd.pt.unt + cum.percent.units.passed + eth.erss + gender,
data= chem24.final, family=binomial
)
summary(min.model)
biggest <- formula(
glm(palN ~.- grd.pt.unt - acad.stand - reason - pct.female.head.flg,
data=chem24.final, family=binomial)
)
chem24.step.first.order <- step(
min.model, direction="forward", scope = biggest, trace=FALSE, k=2
)
summary(chem24.step.first.order)
chem24.step.first.order$anova
model.first.order <- formula(chem24.step.first.order)
chem24.first.order.prop.model <- glm(
model.first.order, data=chem24.final, family=binomial
)
return(chem24.first.order.prop.model)
}
Make sure the PAL datafile in the same directory as this RMarkdown file.
PALdatafull <- read_rds("paldatafull_csv.rds")
dim(PALdatafull)
## [1] 1099371 174
sum(PALdatafull$grd.pt.unt)
## [1] 2237555
The files which includes data through the Spring 2019 semester has 1099371 rows and 174 columns. The total of the grd.pt.unt column is 2237555.
bio.classes <- paste("BIO", c(22, 121, 131, 139, 184))
bio.dat <- PALdatafull %>%
filter(base.time.course == 1, course %in% bio.classes) %>%
mutate(course = factor(course, levels = bio.classes))
dim(bio.dat)
## [1] 19652 174
num.stu <- dim(bio.dat)[1]
num.vars <- dim(bio.dat)[2]
There are 19652 rows and 174 variables. Each row is a biology student. So, there is a total of 19652 biology students.
The course.seq variables indicate how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.
Note: Excludes incomplete PAL students
get.raw.tab(bio.classes, bio.dat)
class | nonPALavg | PALavg | Diff | NonPAL_Num | PAL_Num | CompletePAL | TermPALStart |
---|---|---|---|---|---|---|---|
BIO 22 | 1.81 | 2.33 | 0.52 | 949 | 413 | 0.29 | 2158 |
BIO 121 | 2.18 | 2.39 | 0.21 | 888 | 390 | 0.30 | 2148 |
BIO 131 | 2.33 | 2.76 | 0.43 | 1516 | 544 | 0.26 | 2148 |
BIO 139 | 2.41 | 2.61 | 0.20 | 308 | 98 | 0.24 | 2178 |
BIO 184 | 2.51 | 2.73 | 0.22 | 509 | 149 | 0.22 | 2178 |
Create new variables.
delay.from.hs: delay since high school
cum.percent.units.passed: cumulative percent of units passed
cMaj: census majors without concentrations/specializations/tracks/etc.
county: which county did the student live in at the time of application to Sac state
sac.county.flg: did the student live in Sacramento county at the time of application to Sac State
Collapse sparse categories and other miscellaneous clean up of data. Sparse categories can cause complete separation in logistic regression and are only predictive for a few students.
# Check how many students did not complete PAL
sum(bio.dat$palN==1)
## [1] 106
incl.pal.stu <- sum(bio.dat$palN==1)
bio.dat <- clean.data(bio.dat)
dim(bio.dat)
## [1] 19546 179
There were 106 biology students who did not complete PAL and were removed from the analysis. There are now 19546 biology students instead of 19652.
There were originally 174 variables in the data set, 5 variables were added, so there are now 179 total variables in the data set.
Based on data for 413 PAL students and 949 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.52 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.
After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.60\pm 0.13\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Bio 22, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.
A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Bio 22 course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.
For Bio 22, 16 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: female gender, lower AP Calculus exam scores, higher term units attempted, CSUS GPA at start of term, enrollment in PAL in the past, academic major, living in off-campus housing during their first year, fewer years between first term at CSUS and high school graduation, and not living in Sacramento county at the time of application.
Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0951. Consequently, adequate balance appears to have been achieved.
The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.60\pm 0.13\) or between 0.47 and 0.73 on a 4.0 grade scale. This result is statistically significant with a P-value of \(1.28x10^{-6}\) and is based on 326 PAL students and 332 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.52.
The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Bio 22. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.6 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Bio 22 indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of living in on-campus housing during their first year, ethnicity, or major observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.
Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 413 PAL students in Bio 22 being reduced to 326. Also, 332 non-PAL students were selected out of 949 original non-PAL students.
When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 523 non-PAL matches. Of the 326 PAL students, 196 were matched one-to-one with non-PAL students and 130 were matched one-to-many with non-PAL students.
The non-PAL and PAL groups will include students with only first attempts at BIO 22. They will also include students with previous PAL participation and/or are currently in a PAL for another course.
# Excludes course repeats
bio22.dat <- bio.dat %>%
filter(course=="BIO 22", pass.term.flg == "PASS Term", course.seq== 0)
dim(bio22.dat) # 1362 179
## [1] 1362 179
There are 1,362 BIO 22 first attempt only students.
Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.
# Collapsed cMaj categories to Biology and Other majors at 0.07
with(bio22.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Anthropology 2 0
## Biology 333 125
## Business 8 1
## Chemistry 27 7
## Child Devel/Early Childhood Ed 14 4
## Civil Engineering 1 0
## Communications 3 1
## Computer Engineering 1 0
## Computer Science 6 0
## Construction Management 0 1
## Criminal Justice 6 2
## Dance 1 1
## Economics 1 0
## English 1 1
## Family & Consumer Sciences 2 0
## Film 0 1
## French 1 0
## Gender/Ethnic/Women's Studies 1 0
## Gerontology 0 3
## Graphic Design 1 2
## Health Science 8 5
## History 2 0
## Interdisciplinary Studies/Special Major 2 0
## Interior Design 0 2
## International Business 1 0
## Journalism 1 0
## Kinesiology/Physical Education 422 222
## Liberal Studies 1 0
## Mechanical Engineering 1 0
## Nursing 23 17
## Nutrition 8 2
## Philosophy 1 1
## Physics 1 0
## Psychology 7 1
## Recreation Administration 1 0
## Social Science 2 0
## Sociology 2 0
## Spanish 1 0
## Speech Pathology 1 2
## Undeclared 35 11
bio22.dat <- group_category(data = bio22.dat, feature = "cMaj", threshold = 0.07, update = TRUE)
with(bio22.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Biology 333 125
## Chemistry 27 7
## Child Devel/Early Childhood Ed 14 4
## Kinesiology/Physical Education 422 222
## Nursing 23 17
## OTHER 75 26
## Undeclared 35 11
Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.
# Include only variables that are missing values
vars.missing <- get.missing.only(bio22.dat)
missingness.rpt <- profile_missing(vars.missing)
# missingness.rpt[order(missingness.rpt$pct_missing, decreasing = TRUE), ]
# check which variables are missing >5% values
vars.missing.data <- missingness.rpt %>%
filter(pct_missing > 0.05)
sum(missingness.rpt$pct_missing>0.05)
## [1] 37
vars.missing.data [order(vars.missing.data $pct_missing, decreasing = TRUE), ]
## feature num_missing pct_missing
## 19 Instructor_02 1362 1.00000000
## 25 deg.plan4 1362 1.00000000
## 26 deg.plan5 1362 1.00000000
## 27 deg.plan6 1362 1.00000000
## 24 deg.plan3 1361 0.99926579
## 21 withdraw_reason 1357 0.99632893
## 23 deg.plan2 1356 0.99559471
## 13 trf.gpaADM 1083 0.79515419
## 4 pledge.term 1050 0.77092511
## 20 treat.section 938 0.68869310
## 29 grad.termERS 876 0.64317181
## 22 deg.plan1 874 0.64170338
## 28 grad.term 874 0.64170338
## 30 ttg 874 0.64170338
## 1 fys.term.code 839 0.61600587
## 2 fys.grd 839 0.61600587
## 3 fys.rpt.flg 839 0.61600587
## 15 ge.critical.thinking.status 442 0.32452276
## 16 ge.english.comp.status 442 0.32452276
## 17 ge.math.status 442 0.32452276
## 18 ge.oral.comm.status 442 0.32452276
## 14 admit.term 433 0.31791483
## 7 sat.math.score 361 0.26505140
## 8 sat.math.flg 361 0.26505140
## 9 sat.verbal.score 361 0.26505140
## 10 sat.verbal.flg 361 0.26505140
## 11 sat.test.date 361 0.26505140
## 12 hs.gpa 260 0.19089574
## 31 plan.college 244 0.17914831
## 32 plan.college.desc 244 0.17914831
## 33 plan.dept 244 0.17914831
## 34 plan.deptAbbr 244 0.17914831
## 35 plan.degree 244 0.17914831
## 36 plan.type 244 0.17914831
## 37 county 189 0.13876652
## 5 m.rmd.admin 82 0.06020558
## 6 m.rmd.admin.detail 82 0.06020558
plot_missing(bio22.dat %>% select(all_of(vars.missing.data$feature)))
# Remove variables missing >5%, except for "sat.math.score", "sat.verbal.score", "hs.gpa", "sat.math.flg"
bio22.dat <- bio22.dat %>%
select(!all_of(vars.missing.data$feature),
c("sat.math.score", "sat.verbal.score", "hs.gpa","sat.math.flg"))
dim(bio22.dat) # 1362 146
## [1] 1362 146
37 variables missing >5%
The variables below are important and force included, even though they are missing >5%
“sat.math.score”, “sat.verbal.score”, “hs.gpa”,“sat.math.flg”
Only 33 variables were removed due to missingness and there are now 146 variables instead of 179 variables.
bio22.dat <- bio22.dat[complete.cases(bio22.dat), ]
dim(bio22.dat) # 937 146
## [1] 937 146
937 out of 1362 students are kept
425 students were removed due to missingness of variables
single.vars <- bio22.dat %>%
summarise(across(everything(), ~ n_distinct(.x))) %>%
select_if(. == 1)
# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = bio22.dat)
##
## Overall
## n 937
## country = USA (%) 937 (100.0)
## career.course = UGRD (%) 937 (100.0)
## acad.prog.course = UGD (%) 937 (100.0)
## course (%)
## BIO 22 937 (100.0)
## BIO 121 0 ( 0.0)
## BIO 131 0 ( 0.0)
## BIO 139 0 ( 0.0)
## BIO 184 0 ( 0.0)
## component = LEC (%) 937 (100.0)
## units (mean (SD)) 4.00 (0.00)
## course.numeric (mean (SD)) 22.00 (0.00)
## div = Lower Division (%) 937 (100.0)
## course.seq (mean (SD)) 0.00 (0.00)
## rpt.flg = First Attempt (%) 937 (100.0)
## c2s = Non-C2S (%) 937 (100.0)
## base.time.course (mean (SD)) 1.00 (0.00)
## years (mean (SD)) 0.50 (0.00)
## withdraw_code = NWD (%) 937 (100.0)
## enrl.flg = Enrolled (%) 937 (100.0)
## enrl.flgERS = Enrolled (%) 937 (100.0)
## rtn.flg = Retained (%) 937 (100.0)
## rtn.flgERS = Retained (%) 937 (100.0)
## pass.term.flg = PASS Term (%) 937 (100.0)
## csus.gpa.start.flg = Not Missing (%) 937 (100.0)
## higher.ed.gpa.start.flg = Not Missing (%) 937 (100.0)
sum(single.vars) # 21
## [1] 21
# remove single-valued variables
bio22.dat<- bio22.dat %>%
dplyr::select(-names(single.vars))
dim(bio22.dat) # 937 125
## [1] 937 125
125 out of 146 variables are kept
21 variables removed due to single values
Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.
bio22.final <- bio22.step.vars(bio22.dat)
names(bio22.final)
## [1] "acad.stand" "adm.area" "bot.level" "cMaj"
## [5] "coh" "course.age" "csus.gpa.start" "cum.percent.units.passed"
## [9] "delay.from.hs" "e.rmd" "eth.erss" "father.ed"
## [13] "fys.flg" "gender" "hous.coh.term.flg" "hs.gpa"
## [17] "Instructor_01" "median.income" "m.rmd" "mother.ed"
## [21] "pct.female.head" "pell.coh.term.flg" "prevPAL" "prevPASS"
## [25] "reason" "sac.county.flg" "term.units.attemptedCensus" "palN"
## [29] "grd.pt.unt" "sat.math.score" "sat.math.flg" "sat.verbal.score"
## [33] "AP_BIOL" "AP_CALAB" "AP_CALBC" "AP_CHEM"
## [37] "AP_BIOL.flg" "AP_CALAB.flg" "AP_CALBC.flg" "AP_CHEM.flg"
## [41] "pct.female.head.flg" "med.inc.flg"
Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, sat.math.score, sat.verbal.score, and sat.math.flg (same as sat.verbal.flg). Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.
bio22.first.order.prop.model <- bio22.step(bio22.final)
summary(bio22.first.order.prop.model)
##
## Call:
## glm(formula = model.first.order, family = binomial, data = bio22.final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7671 -0.9137 -0.6395 1.1352 2.3038
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3775015 1.2040989 -0.314 0.753890
## cum.percent.units.passed -1.0822415 1.0554624 -1.025 0.305188
## eth.erssAsian -0.4634201 0.3721018 -1.245 0.212980
## eth.erssForeign -0.6166642 0.8106835 -0.761 0.446853
## eth.erssHispanic -0.0583669 0.3641114 -0.160 0.872645
## eth.erssPacific Islander 0.9985099 0.7949037 1.256 0.209065
## eth.erssTwo or More Races -0.5168364 0.4579780 -1.129 0.259101
## eth.erssUnknown -0.1312048 0.5574675 -0.235 0.813930
## eth.erssWhite -0.4577725 0.3921462 -1.167 0.243068
## genderMale -0.2872814 0.1562340 -1.839 0.065946 .
## sat.math.score -0.0006397 0.0014134 -0.453 0.650813
## sat.verbal.score -0.0021488 0.0013662 -1.573 0.115751
## sat.math.flgold -0.3615956 0.2329028 -1.553 0.120528
## AP_CALAB -0.3784461 0.1622482 -2.333 0.019674 *
## AP_CALAB.flgNot Missing -0.0812130 0.2570992 -0.316 0.752092
## term.units.attemptedCensus 0.1419638 0.0383030 3.706 0.000210 ***
## csus.gpa.start 0.5422919 0.1893888 2.863 0.004191 **
## prevPAL 0.3630843 0.0838341 4.331 1.48e-05 ***
## cMajChemistry 0.2181870 0.5406120 0.404 0.686512
## cMajChild Devel/Early Childhood Ed -0.4823010 0.8039725 -0.600 0.548575
## cMajKinesiology/Physical Education 0.8378476 0.2289283 3.660 0.000252 ***
## cMajNursing 1.4989657 0.4443443 3.373 0.000742 ***
## cMajOTHER 0.1981652 0.3601025 0.550 0.582112
## cMajUndeclared 0.4871465 0.4576017 1.065 0.287073
## hous.coh.term.flgOn-Campus Housing -0.5708439 0.2003577 -2.849 0.004384 **
## delay.from.hs -0.1437080 0.0719453 -1.997 0.045775 *
## sac.county.flg1 -0.3061509 0.1718863 -1.781 0.074892 .
## prevPASS 0.2165654 0.1462875 1.480 0.138764
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1213.4 on 936 degrees of freedom
## Residual deviance: 1092.9 on 909 degrees of freedom
## AIC: 1148.9
##
## Number of Fisher Scoring iterations: 4
p.score <- bio22.first.order.prop.model$fitted.values
bio22.covs <- names(bio22.first.order.prop.model %>% pluck("model") %>% dplyr::select(-palN))
# Unadjusted mean grades
get.unadj.means(bio22.final)
Non-PAL | PAL | Diff. |
---|---|---|
1.760099 | 2.306402 | 0.5463039 |
# Variable Summary Table for unmatched data
unmatched.tab <- CreateTableOne(vars = bio22.covs, strata = "palN",
data = bio22.final, smd = TRUE, test= FALSE)
print(unmatched.tab, smd = TRUE)
## Stratified by palN
## 0 1 SMD
## n 609 328
## cum.percent.units.passed (mean (SD)) 0.91 (0.10) 0.91 (0.09) 0.007
## eth.erss (%) 0.259
## African American 26 ( 4.3) 16 ( 4.9)
## Asian 218 (35.8) 96 (29.3)
## Foreign 8 ( 1.3) 3 ( 0.9)
## Hispanic 178 (29.2) 130 (39.6)
## Pacific Islander 4 ( 0.7) 5 ( 1.5)
## Two or More Races 44 ( 7.2) 17 ( 5.2)
## Unknown 16 ( 2.6) 9 ( 2.7)
## White 115 (18.9) 52 (15.9)
## gender = Male (%) 273 (44.8) 115 (35.1) 0.200
## sat.math.score (mean (SD)) 504.93 (79.84) 495.43 (79.20) 0.119
## sat.verbal.score (mean (SD)) 487.27 (78.52) 478.84 (72.64) 0.111
## sat.math.flg = old (%) 538 (88.3) 270 (82.3) 0.171
## AP_CALAB (mean (SD)) 2.58 (0.52) 2.48 (0.53) 0.200
## AP_CALAB.flg = Not Missing (%) 79 (13.0) 49 (14.9) 0.057
## term.units.attemptedCensus (mean (SD)) 13.57 (2.24) 14.27 (1.93) 0.330
## csus.gpa.start (mean (SD)) 3.01 (0.53) 3.12 (0.49) 0.212
## prevPAL (mean (SD)) 0.52 (1.06) 0.71 (1.29) 0.158
## cMaj (%) 0.291
## Biology 215 (35.3) 90 (27.4)
## Chemistry 14 ( 2.3) 6 ( 1.8)
## Child Devel/Early Childhood Ed 13 ( 2.1) 2 ( 0.6)
## Kinesiology/Physical Education 283 (46.5) 188 (57.3)
## Nursing 15 ( 2.5) 15 ( 4.6)
## OTHER 45 ( 7.4) 17 ( 5.2)
## Undeclared 24 ( 3.9) 10 ( 3.0)
## hous.coh.term.flg = On-Campus Housing (%) 160 (26.3) 75 (22.9) 0.079
## delay.from.hs (mean (SD)) 2.83 (1.56) 2.47 (1.26) 0.255
## sac.county.flg = 1 (%) 294 (48.3) 140 (42.7) 0.112
## prevPASS (mean (SD)) 0.23 (0.58) 0.26 (0.58) 0.040
Check how many variables have SMD > 0.1
addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
##
## FALSE TRUE Sum
## 4 12 16
get.imbal.vars(unmatched.tab)
Variable | Before Matching SMD |
---|---|
term.units.attemptedCensus | 0.3299348 |
cMaj | 0.2912810 |
eth.erss | 0.2585443 |
delay.from.hs | 0.2553547 |
csus.gpa.start | 0.2118672 |
gender | 0.2004051 |
AP_CALAB | 0.1998329 |
sat.math.flg | 0.1708929 |
prevPAL | 0.1582117 |
sat.math.score | 0.1194512 |
sac.county.flg | 0.1124963 |
sat.verbal.score | 0.1114884 |
12 variables have SMD >0.1
Implement a propensity score matching method.
match.bio22 <- with(bio22.final, Match(
Y=bio22.final$grd.pt.unt, Tr = bio22.final$palN, X = p.score,
BiasAdjust = F, estimand = "ATT", M=1, caliper=0.25, replace = TRUE, ties = TRUE))
Standardized mean differences for continuous variables and categorical variables.
# Needed for match table
bio22.final <- bio22.final %>%
rownames_to_column(var = "id")
# Matched data
bio22.matched.dat <- bio22.final[unlist(match.bio22[c("index.treated", "index.control")]), ]
bio22.matched.dat$match.weights<- c(match.bio22$weights, match.bio22$weights)
# Add match weights to matched data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = bio22.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = bio22.covs, strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
## Stratified by palN
## 0 1 SMD
## n 326.00 326.00
## cum.percent.units.passed (mean (SD)) 0.92 (0.09) 0.91 (0.09) 0.094
## eth.erss (%) 0.131
## African American 17.6 ( 5.4) 16.0 ( 4.9)
## Asian 111.8 (34.3) 96.0 (29.4)
## Foreign 3.2 ( 1.0) 3.0 ( 0.9)
## Hispanic 112.2 (34.4) 128.0 (39.3)
## Pacific Islander 3.8 ( 1.2) 5.0 ( 1.5)
## Two or More Races 19.4 ( 6.0) 17.0 ( 5.2)
## Unknown 9.7 ( 3.0) 9.0 ( 2.8)
## White 48.3 (14.8) 52.0 (16.0)
## gender = Male (%) 119.2 (36.6) 115.0 (35.3) 0.027
## sat.math.score (mean (SD)) 502.55 (79.44) 495.64 (79.29) 0.087
## sat.verbal.score (mean (SD)) 484.86 (78.53) 479.51 (72.31) 0.071
## sat.math.flg = old (%) 261.2 (80.1) 268.0 (82.2) 0.053
## AP_CALAB (mean (SD)) 2.47 (0.60) 2.48 (0.53) 0.009
## AP_CALAB.flg = Not Missing (%) 59.0 (18.1) 49.0 (15.0) 0.083
## term.units.attemptedCensus (mean (SD)) 14.44 (2.06) 14.26 (1.94) 0.088
## csus.gpa.start (mean (SD)) 3.11 (0.48) 3.12 (0.49) 0.014
## prevPAL (mean (SD)) 0.70 (1.31) 0.68 (1.24) 0.013
## cMaj (%) 0.124
## Biology 88.5 (27.2) 89.0 (27.3)
## Chemistry 4.6 ( 1.4) 6.0 ( 1.8)
## Child Devel/Early Childhood Ed 1.5 ( 0.5) 2.0 ( 0.6)
## Kinesiology/Physical Education 181.4 (55.7) 187.0 (57.4)
## Nursing 24.0 ( 7.4) 15.0 ( 4.6)
## OTHER 17.1 ( 5.3) 17.0 ( 5.2)
## Undeclared 8.8 ( 2.7) 10.0 ( 3.1)
## hous.coh.term.flg = On-Campus Housing (%) 80.2 (24.6) 75.0 (23.0) 0.038
## delay.from.hs (mean (SD)) 2.45 (1.39) 2.46 (1.26) 0.010
## sac.county.flg = 1 (%) 136.9 (42.0) 140.0 (42.9) 0.019
## prevPASS (mean (SD)) 0.23 (0.59) 0.25 (0.57) 0.030
Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group Binary variables: Raw differences in proportion
All variables are balanced and under the <0.1 mean threshold.
bio22.bal <- bal.tab(match.bio22, formula = f.build("palN", bio22.covs), data = bio22.final,
distance = ~ p.score, thresholds = c(m = .1), un = TRUE, imbalanced.only = TRUE)
bio22.bal
## Balance Measures
## All covariates are balanced.
##
## Balance tally for mean differences
## count
## Balanced, <0.1 30
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## cum.percent.units.passed -0.0951 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 609. 328
## Matched (ESS) 169.3 326
## Matched (Unweighted) 332. 326
## Unmatched 277. 0
## Discarded 0. 2
get.var.perc.tab(bio22.bal)
## Variable Diff.Un Diff.Adj % Improvement
## 1 p.score 0.757140033 -5.483664e-05 100
## 2 cMaj_Biology -0.078647523 1.431493e-03 98
## 3 cMaj_OTHER -0.022062357 -4.601227e-04 98
## 4 delay.from.hs -0.287221743 1.036247e-02 96
## 5 AP_CALAB -0.198548550 1.004536e-02 95
## 6 csus.gpa.start 0.220446905 1.400119e-02 94
## 7 prevPAL 0.144568525 -1.297841e-02 91
## 8 cMaj_Child Devel/Early Childhood Ed -0.015248909 1.533742e-03 90
## 9 eth.erss_Foreign -0.003989948 -5.112474e-04 87
## 10 gender_Male -0.097666106 -1.303681e-02 87
## 11 cMaj_Kinesiology/Physical Education 0.108474508 1.702454e-02 84
## 12 sac.county.flg -0.055929352 9.406953e-03 83
## 13 term.units.attemptedCensus 0.356927519 -9.032332e-02 75
## 14 sat.math.flg_old -0.060244703 2.091002e-02 65
## 15 eth.erss_Two or More Races -0.020420321 -7.413088e-03 64
## 16 eth.erss_White -0.030297569 1.140082e-02 62
## 17 eth.erss_Pacific Islander 0.008675758 3.578732e-03 59
## 18 cMaj_Undeclared -0.008921062 3.834356e-03 57
## 19 eth.erss_Hispanic 0.104059033 4.836401e-02 54
## 20 hous.coh.term.flg_On-Campus Housing -0.034067243 -1.610429e-02 53
## 21 sat.verbal.score -0.116085624 -7.372148e-02 36
## 22 sat.math.score -0.119934927 -8.720503e-02 27
## 23 eth.erss_Asian -0.065280948 -4.831288e-02 26
## 24 prevPASS 0.039806845 3.026697e-02 24
## 25 eth.erss_African American 0.006087549 -5.061350e-03 17
## 26 cMaj_Chemistry -0.004695823 4.243354e-03 10
## 27 cMaj_Nursing 0.021101165 -2.760736e-02 -31
## 28 AP_CALAB.flg_Not Missing 0.019669390 -3.072597e-02 -56
## 29 eth.erss_Unknown 0.001166446 -2.044990e-03 -75
## 30 cum.percent.units.passed 0.007863796 -9.507585e-02 -1109
get.bal.plot(unmatched.tab, matched.tab)
love.plot(bio22.bal,binary = "raw", stars = "std", var.order = "unadjusted",
thresholds = c(m = .1), abs = F)
create.match.tab(bio22.matched.dat)
Non-PAL | PAL | |
---|---|---|
Single Matches | 213 | 196 |
Multiple Matches | 119 | 130 |
Total Students | 332 | 326 |
Out of 328 PAL students, 326 were matched and 2 were unable to be matched. Out of 326 PAL student matches, 196 PAL students were matched to one non-PAL student and 130 PAL students were matched to multiple non-PAL students.
Out of 523 non-PAL student matches, there were 332 non-PAL students, 213 of the non-PAL students were matched to one PAL student and 119 of the non-PAL students were matched to multiple PAL students.
get.att.plot(bio22.final, match.bio22)
The standardized mean differences of the prognostic scores is -0.0171, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.
# Outcome model of non-PAL group
ctrl.data <- bio22.final %>% filter(palN == 0)
ctrl.fit <- glm(f.build("grd.pt.unt", bio22.covs), data = ctrl.data)
summary(ctrl.fit)
##
## Call:
## glm(formula = f.build("grd.pt.unt", bio22.covs), data = ctrl.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7560 -0.8716 0.0303 0.8395 4.1864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.1402550 0.7201409 -4.361 1.53e-05 ***
## cum.percent.units.passed -0.1622929 0.6399986 -0.254 0.799907
## eth.erssAsian -0.0779431 0.2438373 -0.320 0.749347
## eth.erssForeign 0.3085674 0.4645496 0.664 0.506807
## eth.erssHispanic -0.1654707 0.2422559 -0.683 0.494853
## eth.erssPacific Islander -0.2477167 0.6156367 -0.402 0.687556
## eth.erssTwo or More Races 0.0802446 0.2883046 0.278 0.780856
## eth.erssUnknown 0.3015452 0.3699353 0.815 0.415332
## eth.erssWhite 0.2307871 0.2558751 0.902 0.367456
## genderMale 0.0716462 0.0978066 0.733 0.464141
## sat.math.score 0.0019424 0.0008767 2.216 0.027112 *
## sat.verbal.score 0.0002403 0.0008407 0.286 0.775085
## sat.math.flgold -0.0241787 0.1607446 -0.150 0.880488
## AP_CALAB -0.0216909 0.0911965 -0.238 0.812083
## AP_CALAB.flgNot Missing 0.0779702 0.1512937 0.515 0.606500
## term.units.attemptedCensus 0.0214280 0.0217106 0.987 0.324062
## csus.gpa.start 1.1731403 0.1167655 10.047 < 2e-16 ***
## prevPAL 0.1710735 0.0535187 3.197 0.001466 **
## cMajChemistry 0.2927482 0.3206663 0.913 0.361654
## cMajChild Devel/Early Childhood Ed 0.3579739 0.3331423 1.075 0.283028
## cMajKinesiology/Physical Education -0.5104136 0.1382659 -3.692 0.000244 ***
## cMajNursing -0.3789272 0.3221003 -1.176 0.239906
## cMajOTHER -0.3813397 0.2029681 -1.879 0.060770 .
## cMajUndeclared -0.0800524 0.2693843 -0.297 0.766445
## hous.coh.term.flgOn-Campus Housing -0.0613837 0.1248750 -0.492 0.623215
## delay.from.hs 0.1497663 0.0376607 3.977 7.87e-05 ***
## sac.county.flg1 -0.1025577 0.1086385 -0.944 0.345548
## prevPASS -0.0261742 0.0910270 -0.288 0.773799
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.264761)
##
## Null deviance: 1275.90 on 608 degrees of freedom
## Residual deviance: 734.83 on 581 degrees of freedom
## AIC: 1900.6
##
## Number of Fisher Scoring iterations: 2
# Predict values for both non-PAL and PAL groups
bio22.final$prog.score <- predict(ctrl.fit, bio22.final)
# Compare the standardized mean differences of the prognostic scores in the non-PAL and PAL groups
# p.score is at the bottom of the table
prog.bal <-bal.tab(match.bio22, formula = f.build("palN", bio22.covs), data = bio22.final, distance = bio22.final["prog.score"], sd.denom = "treated", addl = "p.score", thresholds = c(m = .1))
prog.bal
## Balance Measures
## Type Diff.Adj M.Threshold
## prog.score Distance -0.0171 Balanced, <0.1
## cum.percent.units.passed Contin. -0.0951 Balanced, <0.1
## eth.erss_African American Binary -0.0051 Balanced, <0.1
## eth.erss_Asian Binary -0.0483 Balanced, <0.1
## eth.erss_Foreign Binary -0.0005 Balanced, <0.1
## eth.erss_Hispanic Binary 0.0484 Balanced, <0.1
## eth.erss_Pacific Islander Binary 0.0036 Balanced, <0.1
## eth.erss_Two or More Races Binary -0.0074 Balanced, <0.1
## eth.erss_Unknown Binary -0.0020 Balanced, <0.1
## eth.erss_White Binary 0.0114 Balanced, <0.1
## gender_Male Binary -0.0130 Balanced, <0.1
## sat.math.score Contin. -0.0872 Balanced, <0.1
## sat.verbal.score Contin. -0.0737 Balanced, <0.1
## sat.math.flg_old Binary 0.0209 Balanced, <0.1
## AP_CALAB Contin. 0.0100 Balanced, <0.1
## AP_CALAB.flg_Not Missing Binary -0.0307 Balanced, <0.1
## term.units.attemptedCensus Contin. -0.0903 Balanced, <0.1
## csus.gpa.start Contin. 0.0140 Balanced, <0.1
## prevPAL Contin. -0.0130 Balanced, <0.1
## cMaj_Biology Binary 0.0014 Balanced, <0.1
## cMaj_Chemistry Binary 0.0042 Balanced, <0.1
## cMaj_Child Devel/Early Childhood Ed Binary 0.0015 Balanced, <0.1
## cMaj_Kinesiology/Physical Education Binary 0.0170 Balanced, <0.1
## cMaj_Nursing Binary -0.0276 Balanced, <0.1
## cMaj_OTHER Binary -0.0005 Balanced, <0.1
## cMaj_Undeclared Binary 0.0038 Balanced, <0.1
## hous.coh.term.flg_On-Campus Housing Binary -0.0161 Balanced, <0.1
## delay.from.hs Contin. 0.0104 Balanced, <0.1
## sac.county.flg Binary 0.0094 Balanced, <0.1
## prevPASS Contin. 0.0303 Balanced, <0.1
## p.score Contin. -0.0001 Balanced, <0.1
##
## Balance tally for mean differences
## count
## Balanced, <0.1 31
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## cum.percent.units.passed -0.0951 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 609. 328
## Matched (ESS) 169.3 326
## Matched (Unweighted) 332. 326
## Unmatched 277. 0
## Discarded 0. 2
The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.5987 . This result is statistically significant with a P-value of \(2.556x10^{-6}\) and is based on 326 PAL students and 523 non-PAL student matches(332 total non-PAL students). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.
summary(match.bio22)
##
## Estimate... 0.5987
## AI SE...... 0.12728
## T-stat..... 4.7036
## p.val...... 2.556e-06
##
## Original number of observations.............. 937
## Original number of treated obs............... 328
## Matched number of observations............... 326
## Matched number of observations (unweighted). 523
##
## Caliper (SDs)........................................ 0.25
## Number of obs dropped by 'exact' or 'caliper' 2
psens(match.bio22, Gamma=2.0, GammaInc = 0.1)
##
## Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value
##
## Unconfounded estimate .... 0
##
## Gamma Lower bound Upper bound
## 1.0 0 0.0000
## 1.1 0 0.0000
## 1.2 0 0.0001
## 1.3 0 0.0010
## 1.4 0 0.0078
## 1.5 0 0.0363
## 1.6 0 0.1123
## 1.7 0 0.2517
## 1.8 0 0.4380
## 1.9 0 0.6290
## 2.0 0 0.7852
##
## Note: Gamma is Odds of Differential Assignment To
## Treatment Due to Unobserved Factors
##
Note that in the above table \(\Gamma=1.6\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.6 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “cMaj” or “csus.gpa.start” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.
kable(sort(exp(abs(bio22.first.order.prop.model$coefficients))))
x | |
---|---|
sat.math.score | 1.000640 |
sat.verbal.score | 1.002151 |
eth.erssHispanic | 1.060104 |
AP_CALAB.flgNot Missing | 1.084602 |
eth.erssUnknown | 1.140201 |
term.units.attemptedCensus | 1.152535 |
delay.from.hs | 1.154547 |
cMajOTHER | 1.219164 |
prevPASS | 1.241804 |
cMajChemistry | 1.243820 |
genderMale | 1.332799 |
sac.county.flg1 | 1.358187 |
sat.math.flgold | 1.435618 |
prevPAL | 1.437757 |
(Intercept) | 1.458636 |
AP_CALAB | 1.460014 |
eth.erssWhite | 1.580549 |
eth.erssAsian | 1.589501 |
cMajChild Devel/Early Childhood Ed | 1.619797 |
cMajUndeclared | 1.627665 |
eth.erssTwo or More Races | 1.676715 |
csus.gpa.start | 1.719944 |
hous.coh.term.flgOn-Campus Housing | 1.769760 |
eth.erssForeign | 1.852737 |
cMajKinesiology/Physical Education | 2.311386 |
eth.erssPacific Islander | 2.714234 |
cum.percent.units.passed | 2.951287 |
cMajNursing | 4.477056 |
chem.classes <- paste("CHEM", c(4, '1A', '1B', 24))
chem.dat <- PALdatafull %>%
filter(base.time.course == 1, course %in% chem.classes) %>%
mutate(course = factor(course, levels = chem.classes))
dim(chem.dat) # 18948 174
## [1] 18948 174
num.stu <- dim(chem.dat)[1]
num.vars <- dim(chem.dat)[2]
There are 18948 rows and 174 variables. Each row is a chemistry student. So, there is a total of 18948 chemistry students.
There are 83 first attempt only Chem 24 Spring 2019 students. Some of them are incorrectly labeled as non-PAL and need to be relabeled.
with(chem.dat %>%
filter(base.time.course == 1, pass.term.flg == "PASS Term", course == "CHEM 24", term == "Spring 2019", course.seq == 0),
table(palN))
## palN
## 0
## 83
chem.dat <- update.chem24s19(chem.dat)
with(chem.dat %>%
filter(base.time.course == 1, pass.term.flg == "PASS Term", course == "CHEM 24", term == "Spring 2019", course.seq == 0),
table(palN))
## palN
## 0 1 2
## 28 9 46
# 0 1 2
# 28 9 46
After relabeling, there are 28 non-PAL students, 9 incomplete PAL students, and 46 PAL students for Chem 24 Spring 2019.
The course.seq variable indicate how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.
Note: Excludes incomplete PAL students
get.raw.tab(chem.classes, chem.dat)
class | nonPALavg | PALavg | Diff | NonPAL_Num | PAL_Num | CompletePAL | TermPALStart |
---|---|---|---|---|---|---|---|
CHEM 4 | 2.03 | 2.39 | 0.36 | 1929 | 759 | 0.28 | 2123 |
CHEM 1A | 1.63 | 2.04 | 0.41 | 1717 | 1055 | 0.37 | 2128 |
CHEM 1B | 1.70 | 2.11 | 0.41 | 1090 | 769 | 0.40 | 2138 |
CHEM 24 | 1.63 | 2.06 | 0.43 | 224 | 177 | 0.43 | 2178 |
Create new variables.
delay.from.hs: delay since high school
cum.percent.units.passed: cumulative percent of units passed
cMaj: census majors without concentrations/specializations/tracks/etc.
county: which county did the student live in at the time of application to Sac state
sac.county.flg: did the student live in Sacramento county at the time of application to Sac State
Collapse sparse categories and other miscellaneous clean up of data. Sparse categories can cause complete separation in logistic regression and are only predictive for a few students.
# Check how many students did not complete PAL
sum(chem.dat$palN==1) # 226
## [1] 226
incl.pal.stu <- sum(chem.dat$palN==1)
chem.dat <- clean.data(chem.dat)
dim(chem.dat) # 18722 179
## [1] 18722 179
There were 226 chemistry students who did not complete PAL and were removed from the analysis. There are now 18722 chemistry students instead of 18948.
There were originally 174 variables in the data set, 5 variables were added, so there are now 179 total variables in the data set.
Based on data for 1055 PAL students and 1717 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.41 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.
After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.50\pm 0.07\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 1A, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.
A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 1A course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.
For Chem 1A, 19 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: Hispanic ethnicity, female gender, lower SAT scores, has an AP Calculus exam score, has an AP Biology exam score, has an AP Chemistry exam score, CSUS GPA at start of term, enrollment in PAL in the past, academic major, higher term units attempted, lower high school GPA, and fewer years between first term at CSUS and high school graduation.
Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0627. Consequently, adequate balance appears to have been achieved.
The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.50\pm 0.07\) or between 0.43 and 0.57 on a 4.0 grade scale. This result is statistically significant with a P-value of \(2.43x10^{-13}\) and is based on 757 PAL students and 711 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.41.
The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 1A. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 1A indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of CSUS GPA at start of term, major, or has an AP Chemistry exam score observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.
Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 1055 PAL students in Chem 1A being reduced to 757. Also, 711 non-PAL students were selected out of 1717 original non-PAL students.
When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 1561 non-PAL matches. Of the 757 PAL students, 337 were matched one-to-one with non-PAL students and 420 were matched one-to-many with non-PAL students.
The non-PAL and PAL groups will include students with only first attempts at CHEM 1A.They will also include students with previous PAL participation and/or are currently in a PAL for another course.
# Excludes course repeats
chem1a.dat <- chem.dat %>%
filter(course=="CHEM 1A", pass.term.flg == "PASS Term", course.seq== 0)
dim(chem1a.dat) # 2772 179
## [1] 2772 179
There are 2,772 CHEM 1A first attempt only students.
Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.
# Collapsed cMaj categories to Biology and Other majors at 0.09
with(chem1a.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Anthropology 3 5
## Biology 659 547
## Business 8 4
## Chemistry 265 104
## Child Devel/Early Childhood Ed 9 2
## Civil Engineering 50 16
## Communications 4 2
## Computer Engineering 13 2
## Computer Science 15 4
## Criminal Justice 5 6
## Dance 0 1
## Electrical Engineering 21 4
## English 3 1
## Environmental Studies 28 15
## Film 0 1
## French 1 0
## Geography 0 1
## Geology 39 16
## Gerontology 2 0
## Health Science 22 8
## History 3 2
## Interdisciplinary Studies/Special Major 1 0
## Kinesiology/Physical Education 226 164
## Liberal Studies 1 2
## Mathematics 7 1
## Mechanical Engineering 73 17
## Music 4 1
## Nursing 40 22
## Nutrition 77 55
## Philosophy 3 0
## Physical Science 0 1
## Physics 42 7
## Political Science 0 1
## Psychology 24 9
## Social Science 2 1
## Sociology 2 1
## Spanish 3 1
## Speech Pathology 2 2
## Undeclared 45 27
chem1a.dat <- group_category(data = chem1a.dat, feature = "cMaj", threshold = 0.09, update = TRUE)
with(chem1a.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Biology 659 547
## Chemistry 265 104
## Civil Engineering 50 16
## Geology 39 16
## Kinesiology/Physical Education 226 164
## Mechanical Engineering 73 17
## Nursing 40 22
## Nutrition 77 55
## OTHER 201 80
## Physics 42 7
## Undeclared 45 27
Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.
## [1] 38
## feature num_missing pct_missing
## 17 Instructor_02 2772 1.0000000
## 22 deg.plan3 2772 1.0000000
## 23 deg.plan4 2772 1.0000000
## 24 deg.plan5 2772 1.0000000
## 25 deg.plan6 2772 1.0000000
## 19 withdraw_reason 2762 0.9963925
## 21 deg.plan2 2739 0.9880952
## 4 pledge.term 2366 0.8535354
## 11 trf.gpaADM 2308 0.8326118
## 1 fys.term.code 1630 0.5880231
## 2 fys.grd 1630 0.5880231
## 3 fys.rpt.flg 1630 0.5880231
## 27 grad.termERS 1453 0.5241703
## 20 deg.plan1 1415 0.5104618
## 26 grad.term 1415 0.5104618
## 28 ttg 1415 0.5104618
## 18 treat.section 1237 0.4462482
## 31 plan.college 925 0.3336941
## 32 plan.college.desc 925 0.3336941
## 33 plan.dept 925 0.3336941
## 34 plan.deptAbbr 925 0.3336941
## 35 plan.degree 925 0.3336941
## 36 plan.type 925 0.3336941
## 5 sat.math.score 616 0.2222222
## 6 sat.math.flg 616 0.2222222
## 7 sat.verbal.score 616 0.2222222
## 8 sat.verbal.flg 616 0.2222222
## 9 sat.test.date 616 0.2222222
## 13 ge.critical.thinking.status 524 0.1890332
## 14 ge.english.comp.status 524 0.1890332
## 15 ge.math.status 524 0.1890332
## 16 ge.oral.comm.status 524 0.1890332
## 12 admit.term 511 0.1843434
## 10 hs.gpa 420 0.1515152
## 38 county 355 0.1280664
## 29 tot.passd.prgrss.start 325 0.1172439
## 30 tot.taken.prgrss.start 325 0.1172439
## 37 cum.percent.units.passed 325 0.1172439
## [1] 2772 146
38 variables missing >10%
5 out of 38 variables were important and force included, even though they were missing >10%
So, 33 variables were removed due to missingness and there are now 146 variables instead of 179 variables.
chem1a.dat <- chem1a.dat[complete.cases(chem1a.dat), ]
dim(chem1a.dat) # 1769 146
## [1] 1769 146
1769 out of 2772 students are kept
1043 students were removed due to missingness of variables
single.vars <- chem1a.dat %>%
summarise(across(everything(), ~ n_distinct(.x))) %>%
select_if(. == 1)
# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem1a.dat)
##
## Overall
## n 1769
## country = USA (%) 1769 (100.0)
## career.course = UGRD (%) 1769 (100.0)
## acad.prog.course = UGD (%) 1769 (100.0)
## course (%)
## CHEM 4 0 ( 0.0)
## CHEM 1A 1769 (100.0)
## CHEM 1B 0 ( 0.0)
## CHEM 24 0 ( 0.0)
## component = LEC (%) 1769 (100.0)
## units (mean (SD)) 5.00 (0.00)
## course.numeric (mean (SD)) 1.00 (0.00)
## div = Lower Division (%) 1769 (100.0)
## Instructor_01 (%)
## 1 0 ( 0.0)
## 2 0 ( 0.0)
## 3 0 ( 0.0)
## 4 0 ( 0.0)
## 5 0 ( 0.0)
## 6 1769 (100.0)
## 7 0 ( 0.0)
## 8 0 ( 0.0)
## 9 0 ( 0.0)
## 10 0 ( 0.0)
## 11 0 ( 0.0)
## 12 0 ( 0.0)
## 13 0 ( 0.0)
## 14 0 ( 0.0)
## 15 0 ( 0.0)
## 16 0 ( 0.0)
## 17 0 ( 0.0)
## 18 0 ( 0.0)
## 19 0 ( 0.0)
## 20 0 ( 0.0)
## 21 0 ( 0.0)
## 22 0 ( 0.0)
## course.seq (mean (SD)) 0.00 (0.00)
## rpt.flg = First Attempt (%) 1769 (100.0)
## c2s = Non-C2S (%) 1769 (100.0)
## base.time.course (mean (SD)) 1.00 (0.00)
## years (mean (SD)) 0.50 (0.00)
## withdraw_code = NWD (%) 1769 (100.0)
## enrl.flg = Enrolled (%) 1769 (100.0)
## enrl.flgERS = Enrolled (%) 1769 (100.0)
## rtn.flg = Retained (%) 1769 (100.0)
## rtn.flgERS = Retained (%) 1769 (100.0)
## pass.term.flg = PASS Term (%) 1769 (100.0)
## csus.gpa.start.flg = Not Missing (%) 1769 (100.0)
## higher.ed.gpa.start.flg = Not Missing (%) 1769 (100.0)
sum(single.vars) # 22
## [1] 22
# remove single-valued variables
chem1a.dat<- chem1a.dat %>%
dplyr::select(-names(single.vars))
dim(chem1a.dat) # 1769 124
## [1] 1769 124
124 out of 146 variables are kept
22 variables removed due to single values
# Remove non chem1a instructors
chem1a.dat <- chem1a.dat %>%
droplevels(chem1a.dat$Instructor_01)
# Combine sparse ethnicity categories to Other
chem1a.dat <- chem1a.dat %>%
mutate(eth.erss = fct_collapse(eth.erss, `Other` = c("Foreign", "Native American", "Pacific Islander")))
with(chem1a.dat, table(eth.erss, palN))
## palN
## eth.erss 0 1
## African American 44 30
## Asian 336 237
## Other 20 21
## Hispanic 250 242
## Two or More Races 81 40
## Unknown 38 15
## White 238 177
# Collapse sparse categories for acad.stand
# Other: Academic Dismissal, Academic Disqualification
chem1a.dat <- chem1a.dat %>%
mutate(acad.stand = fct_other(acad.stand, keep = c("Good Standing")))
with(chem1a.dat, table(acad.stand, palN))
## palN
## acad.stand 0 1
## Good Standing 955 748
## Other 52 14
Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.
chem1a.final <- chem1a.step.vars(chem1a.dat)
kable(names(chem1a.final))
x |
---|
acad.stand |
adm.area |
bot.level |
cMaj |
coh |
course.age |
csus.gpa.start |
cum.percent.units.passed |
delay.from.hs |
e.rmd |
eth.erss |
father.ed |
fys.flg |
gender |
hous.coh.term.flg |
hs.gpa |
median.income |
m.rmd |
mother.ed |
pct.female.head |
pell.coh.term.flg |
prevPAL |
prevPASS |
reason |
sac.county.flg |
term.units.attemptedCensus |
palN |
grd.pt.unt |
sat.math.score |
sat.math.flg |
sat.verbal.score |
AP_BIOL |
AP_CALAB |
AP_CALBC |
AP_CHEM |
AP_BIOL.flg |
AP_CALAB.flg |
AP_CALBC.flg |
AP_CHEM.flg |
pct.female.head.flg |
med.inc.flg |
Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, sat.math.score, sat.verbal.score, and sat.math.flg (same as sat.verbal.flg). Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.
chem1a.first.order.prop.model <- chem1a.step(chem1a.final)
summary(chem1a.first.order.prop.model)
##
## Call:
## glm(formula = model.first.order, family = binomial, data = chem1a.final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8322 -1.0126 -0.6089 1.0904 2.4921
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.2066673 1.3077056 1.687 0.09152 .
## cum.percent.units.passed 0.0185532 0.7023697 0.026 0.97893
## eth.erssAsian 0.2006993 0.2729443 0.735 0.46215
## eth.erssOther 0.4196394 0.4155541 1.010 0.31258
## eth.erssHispanic 0.4779929 0.2730791 1.750 0.08005 .
## eth.erssTwo or More Races -0.1297781 0.3288717 -0.395 0.69313
## eth.erssUnknown -0.2929460 0.4188639 -0.699 0.48431
## eth.erssWhite 0.3413619 0.2798446 1.220 0.22253
## genderMale -0.3338347 0.1175127 -2.841 0.00450 **
## sat.math.score -0.0031376 0.0009867 -3.180 0.00147 **
## sat.verbal.score -0.0024583 0.0009013 -2.728 0.00638 **
## sat.math.flgold -0.1286384 0.2106295 -0.611 0.54138
## AP_CALAB -0.0296383 0.1061827 -0.279 0.78015
## AP_CALAB.flgNot Missing -0.3448884 0.1673607 -2.061 0.03933 *
## AP_BIOL 0.0113165 0.1876565 0.060 0.95191
## AP_BIOL.flgNot Missing -0.3273270 0.1949865 -1.679 0.09321 .
## AP_CHEM -0.7434323 0.4691420 -1.585 0.11304
## AP_CHEM.flgNot Missing -1.1385896 0.4174880 -2.727 0.00639 **
## csus.gpa.start 0.7188766 0.1466119 4.903 9.43e-07 ***
## prevPAL 0.5319611 0.1061938 5.009 5.46e-07 ***
## cMajChemistry -0.7074529 0.1725795 -4.099 4.14e-05 ***
## cMajCivil Engineering -0.5976542 0.3925320 -1.523 0.12787
## cMajGeology -0.0987511 0.4921359 -0.201 0.84097
## cMajKinesiology/Physical Education -0.1260763 0.1522197 -0.828 0.40753
## cMajMechanical Engineering -0.8508806 0.3391102 -2.509 0.01210 *
## cMajNursing 0.0066722 0.3863916 0.017 0.98622
## cMajNutrition -0.0288961 0.2734756 -0.106 0.91585
## cMajOTHER -0.6094946 0.2157415 -2.825 0.00473 **
## cMajPhysics -0.7988319 0.4622694 -1.728 0.08398 .
## cMajUndeclared -0.4627973 0.3003148 -1.541 0.12331
## term.units.attemptedCensus 0.0710439 0.0279899 2.538 0.01114 *
## hs.gpa -0.2970032 0.1446650 -2.053 0.04007 *
## delay.from.hs -0.1154636 0.0537763 -2.147 0.03178 *
## acad.standOther -0.4841617 0.3451585 -1.403 0.16070
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2418.3 on 1768 degrees of freedom
## Residual deviance: 2168.3 on 1735 degrees of freedom
## AIC: 2236.3
##
## Number of Fisher Scoring iterations: 5
p.score <- chem1a.first.order.prop.model$fitted.values
chem1a.covs <- names(chem1a.first.order.prop.model %>% pluck("model") %>% dplyr::select(-palN))
# Unadjusted mean grades
get.unadj.means(chem1a.final)
Non-PAL | PAL | Diff. |
---|---|---|
1.601986 | 1.974409 | 0.3724234 |
## Stratified by palN
## 0 1 SMD
## n 1007 762
## cum.percent.units.passed (mean (SD)) 0.91 (0.10) 0.91 (0.09) 0.005
## eth.erss (%) 0.213
## African American 44 ( 4.4) 30 ( 3.9)
## Asian 336 (33.4) 237 (31.1)
## Other 20 ( 2.0) 21 ( 2.8)
## Hispanic 250 (24.8) 242 (31.8)
## Two or More Races 81 ( 8.0) 40 ( 5.2)
## Unknown 38 ( 3.8) 15 ( 2.0)
## White 238 (23.6) 177 (23.2)
## gender = Male (%) 471 (46.8) 241 (31.6) 0.314
## sat.math.score (mean (SD)) 529.74 (78.45) 499.23 (74.18) 0.400
## sat.verbal.score (mean (SD)) 499.95 (82.51) 477.60 (74.57) 0.284
## sat.math.flg = old (%) 942 (93.5) 702 (92.1) 0.055
## AP_CALAB (mean (SD)) 2.57 (0.62) 2.54 (0.47) 0.053
## AP_CALAB.flg = Not Missing (%) 197 (19.6) 94 (12.3) 0.198
## AP_BIOL (mean (SD)) 2.51 (0.31) 2.50 (0.29) 0.040
## AP_BIOL.flg = Not Missing (%) 109 (10.8) 61 ( 8.0) 0.097
## AP_CHEM (mean (SD)) 1.96 (0.23) 1.96 (0.13) 0.030
## AP_CHEM.flg = Not Missing (%) 54 ( 5.4) 16 ( 2.1) 0.173
## csus.gpa.start (mean (SD)) 3.11 (0.53) 3.22 (0.46) 0.237
## prevPAL (mean (SD)) 0.22 (0.46) 0.37 (0.56) 0.308
## cMaj (%) 0.365
## Biology 412 (40.9) 405 (53.1)
## Chemistry 145 (14.4) 76 (10.0)
## Civil Engineering 28 ( 2.8) 12 ( 1.6)
## Geology 15 ( 1.5) 7 ( 0.9)
## Kinesiology/Physical Education 150 (14.9) 129 (16.9)
## Mechanical Engineering 48 ( 4.8) 14 ( 1.8)
## Nursing 16 ( 1.6) 16 ( 2.1)
## Nutrition 35 ( 3.5) 34 ( 4.5)
## OTHER 90 ( 8.9) 43 ( 5.6)
## Physics 26 ( 2.6) 7 ( 0.9)
## Undeclared 42 ( 4.2) 19 ( 2.5)
## term.units.attemptedCensus (mean (SD)) 13.65 (2.10) 13.90 (1.91) 0.125
## hs.gpa (mean (SD)) 3.42 (0.44) 3.40 (0.42) 0.030
## delay.from.hs (mean (SD)) 2.23 (1.28) 2.16 (1.08) 0.052
## acad.stand = Other (%) 52 ( 5.2) 14 ( 1.8) 0.182
Check how many variables have SMD > 0.1
addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
##
## FALSE TRUE Sum
## 8 11 19
get.imbal.vars(unmatched.tab)
Variable | Before Matching SMD |
---|---|
sat.math.score | 0.3997009 |
cMaj | 0.3649218 |
gender | 0.3140303 |
prevPAL | 0.3077470 |
sat.verbal.score | 0.2842378 |
csus.gpa.start | 0.2374740 |
eth.erss | 0.2125984 |
AP_CALAB.flg | 0.1983562 |
acad.stand | 0.1817409 |
AP_CHEM.flg | 0.1727961 |
term.units.attemptedCensus | 0.1253328 |
11 out of 19 variables have SMD >0.1
Implement a propensity score matching method.
match.chem1a <- with(chem1a.final, Match(
Y=chem1a.final$grd.pt.unt, Tr = chem1a.final$palN, X = p.score,
BiasAdjust = F, estimand = "ATT", M=1, caliper=0.25, replace = TRUE, ties = TRUE))
Standardized mean differences for continuous variables and categorical variables.
# Needed for match table
chem1a.final <- chem1a.final %>%
rownames_to_column(var = "id")
# Matched data
chem1a.matched.dat <- chem1a.final[unlist(match.chem1a[c("index.treated", "index.control")]), ]
chem1a.matched.dat$match.weights<- c(match.chem1a$weights, match.chem1a$weights)
# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem1a.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem1a.covs, strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
## Stratified by palN
## 0 1 SMD
## n 757.00 757.00
## cum.percent.units.passed (mean (SD)) 0.91 (0.09) 0.91 (0.09) 0.053
## eth.erss (%) 0.116
## African American 21.2 ( 2.8) 29.0 ( 3.8)
## Asian 230.7 (30.5) 237.0 (31.3)
## Other 18.1 ( 2.4) 21.0 ( 2.8)
## Hispanic 224.1 (29.6) 238.0 (31.4)
## Two or More Races 37.0 ( 4.9) 40.0 ( 5.3)
## Unknown 14.7 ( 1.9) 15.0 ( 2.0)
## White 211.1 (27.9) 177.0 (23.4)
## gender = Male (%) 226.1 (29.9) 240.0 (31.7) 0.040
## sat.math.score (mean (SD)) 498.04 (73.29) 500.17 (73.25) 0.029
## sat.verbal.score (mean (SD)) 478.74 (75.32) 478.10 (74.42) 0.009
## sat.math.flg = old (%) 697.2 (92.1) 697.0 (92.1) 0.001
## AP_CALAB (mean (SD)) 2.50 (0.49) 2.54 (0.47) 0.069
## AP_CALAB.flg = Not Missing (%) 91.8 (12.1) 94.0 (12.4) 0.009
## AP_BIOL (mean (SD)) 2.52 (0.21) 2.50 (0.30) 0.059
## AP_BIOL.flg = Not Missing (%) 57.4 ( 7.6) 61.0 ( 8.1) 0.018
## AP_CHEM (mean (SD)) 1.96 (0.12) 1.96 (0.13) 0.009
## AP_CHEM.flg = Not Missing (%) 13.8 ( 1.8) 16.0 ( 2.1) 0.021
## csus.gpa.start (mean (SD)) 3.20 (0.47) 3.22 (0.46) 0.039
## prevPAL (mean (SD)) 0.38 (0.61) 0.36 (0.54) 0.041
## cMaj (%) 0.123
## Biology 390.5 (51.6) 401.0 (53.0)
## Chemistry 92.9 (12.3) 76.0 (10.0)
## Civil Engineering 13.0 ( 1.7) 12.0 ( 1.6)
## Geology 9.6 ( 1.3) 7.0 ( 0.9)
## Kinesiology/Physical Education 117.2 (15.5) 128.0 (16.9)
## Mechanical Engineering 11.0 ( 1.5) 14.0 ( 1.8)
## Nursing 13.7 ( 1.8) 16.0 ( 2.1)
## Nutrition 32.0 ( 4.2) 34.0 ( 4.5)
## OTHER 56.3 ( 7.4) 43.0 ( 5.7)
## Physics 5.4 ( 0.7) 7.0 ( 0.9)
## Undeclared 15.2 ( 2.0) 19.0 ( 2.5)
## term.units.attemptedCensus (mean (SD)) 13.96 (1.83) 13.90 (1.91) 0.030
## hs.gpa (mean (SD)) 3.40 (0.42) 3.41 (0.42) 0.013
## delay.from.hs (mean (SD)) 2.20 (1.21) 2.16 (1.08) 0.039
## acad.stand = Other (%) 22.8 ( 3.0) 14.0 ( 1.8) 0.075
Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group
Binary variables: Raw differences in proportion
All variables are balanced and under the <0.1 mean threshold.
chem1a.bal <- bal.tab(match.chem1a, formula = f.build("palN", chem1a.covs), data = chem1a.final,
distance = ~ p.score, thresholds = c(m = .1), un = TRUE, imbalanced.only = TRUE)
chem1a.bal
## Balance Measures
## All covariates are balanced.
##
## Balance tally for mean differences
## count
## Balanced, <0.1 36
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## AP_CALAB 0.07 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 1007. 762
## Matched (ESS) 313.69 757
## Matched (Unweighted) 711. 757
## Unmatched 296. 0
## Discarded 0. 5
get.var.perc.tab(chem1a.bal)
## Variable Diff.Un Diff.Adj % Improvement
## 1 p.score 0.813473150 0.0009607886 100
## 2 eth.erss_Unknown -0.018050810 0.0004010191 98
## 3 sat.math.flg_old -0.014191995 -0.0002642008 98
## 4 sat.verbal.score -0.299750402 -0.0086395931 97
## 5 AP_CALAB.flg_Not Missing -0.072271006 0.0028952004 96
## 6 sat.math.score -0.411373506 0.0286721308 93
## 7 AP_CHEM.flg_Not Missing -0.032627252 0.0029502422 91
## 8 cMaj_Biology 0.122360015 0.0138422344 89
## 9 cMaj_Civil Engineering -0.012057331 -0.0013099956 89
## 10 gender_Male -0.151452953 0.0183855444 88
## 11 cMaj_Physics -0.016632913 0.0020695729 88
## 12 cMaj_Mechanical Engineering -0.029293632 0.0039189784 87
## 13 eth.erss_Two or More Races -0.027943503 0.0038969617 86
## 14 prevPAL 0.281028450 -0.0422296608 85
## 15 csus.gpa.start 0.255366543 0.0396354928 84
## 16 AP_BIOL.flg_Not Missing -0.028189810 0.0047225892 83
## 17 AP_CHEM -0.043044496 0.0088420199 79
## 18 term.units.attemptedCensus 0.131689543 -0.0297896263 77
## 19 eth.erss_Hispanic 0.069323137 0.0183478015 74
## 20 cMaj_Nutrition 0.009862719 0.0025869661 74
## 21 cMaj_Undeclared -0.016773660 0.0050166698 70
## 22 acad.stand_Other -0.033265827 -0.0115918098 65
## 23 eth.erss_Asian -0.022640728 0.0082609926 64
## 24 hs.gpa -0.030258269 0.0130619500 57
## 25 eth.erss_Other 0.007698082 0.0037837328 51
## 26 cMaj_Chemistry -0.044254523 -0.0223469837 50
## 27 cMaj_OTHER -0.032943933 -0.0175803611 47
## 28 cMaj_Geology -0.005709378 -0.0033937221 41
## 29 cMaj_Nursing 0.005108597 0.0029942756 41
## 30 cMaj_Kinesiology/Physical Education 0.020334040 0.0142023652 30
## 31 delay.from.hs -0.057063209 -0.0409351417 28
## 32 AP_CALAB -0.062685060 0.0700268159 -12
## 33 AP_BIOL -0.041283872 -0.0516391001 -25
## 34 eth.erss_African American -0.004324062 0.0103478644 -139
## 35 eth.erss_White -0.004062116 -0.0450383720 -1009
## 36 cum.percent.units.passed 0.004657179 0.0537413078 -1054
get.bal.plot(unmatched.tab, matched.tab)
love.plot(chem1a.bal,binary = "raw", stars = "std", var.order = "unadjusted",
thresholds = c(m = .1), abs = F)
create.match.tab(chem1a.matched.dat)
Non-PAL | PAL | |
---|---|---|
Single Matches | 278 | 337 |
Multiple Matches | 433 | 420 |
Total Students | 711 | 757 |
Out of 762 PAL students, 757 were matched and 5 were unable to be matched. Out of 757 PAL student matches, 337 PAL students were matched to one non-PAL student and 420 PAL students were matched to multiple non-PAL students.
Out of 1561 non-PAL student matches, there were 711 non-PAL students, 278 of the non-PAL students were matched to one PAL student and 433 of the non-PAL students were matched to multiple PAL students.
get.att.plot(chem1a.final, match.chem1a)
The standardized mean differences of the prognostic scores is 0.0335, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.
##
## Call:
## glm(formula = f.build("grd.pt.unt", chem1a.covs), data = ctrl.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.39006 -0.58465 0.02405 0.57994 2.76737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.2196838 0.5382662 -9.697 < 2e-16 ***
## cum.percent.units.passed 0.2918154 0.3625120 0.805 0.42103
## eth.erssAsian 0.0601629 0.1430710 0.421 0.67421
## eth.erssOther 0.0209315 0.2374072 0.088 0.92976
## eth.erssHispanic -0.0631047 0.1453120 -0.434 0.66419
## eth.erssTwo or More Races -0.0504216 0.1665614 -0.303 0.76217
## eth.erssUnknown 0.2034702 0.1984971 1.025 0.30559
## eth.erssWhite 0.0313683 0.1474357 0.213 0.83156
## genderMale 0.1350774 0.0621716 2.173 0.03005 *
## sat.math.score 0.0024302 0.0005127 4.740 2.45e-06 ***
## sat.verbal.score -0.0008151 0.0004630 -1.760 0.07866 .
## sat.math.flgold 0.1244919 0.1187407 1.048 0.29470
## AP_CALAB 0.0791501 0.0472506 1.675 0.09423 .
## AP_CALAB.flgNot Missing 0.0481646 0.0796500 0.605 0.54552
## AP_BIOL 0.1680120 0.0935846 1.795 0.07292 .
## AP_BIOL.flgNot Missing 0.0269552 0.0953309 0.283 0.77743
## AP_CHEM 0.3201416 0.1258924 2.543 0.01115 *
## AP_CHEM.flgNot Missing 0.3035225 0.1284433 2.363 0.01832 *
## csus.gpa.start 0.9656280 0.0758034 12.739 < 2e-16 ***
## prevPAL -0.0695985 0.0624447 -1.115 0.26531
## cMajChemistry 0.1194315 0.0869728 1.373 0.17000
## cMajCivil Engineering 0.3998032 0.1789340 2.234 0.02569 *
## cMajGeology -0.3463892 0.2371614 -1.461 0.14446
## cMajKinesiology/Physical Education 0.0310270 0.0855213 0.363 0.71683
## cMajMechanical Engineering 0.1324651 0.1417757 0.934 0.35037
## cMajNursing -0.4427774 0.2251455 -1.967 0.04951 *
## cMajNutrition -0.0323645 0.1578557 -0.205 0.83759
## cMajOTHER 0.1995494 0.1050010 1.900 0.05767 .
## cMajPhysics -0.0867119 0.1849268 -0.469 0.63925
## cMajUndeclared -0.1265802 0.1437063 -0.881 0.37863
## term.units.attemptedCensus 0.0079314 0.0141139 0.562 0.57427
## hs.gpa 0.2435173 0.0756390 3.219 0.00133 **
## delay.from.hs 0.1102145 0.0257807 4.275 2.10e-05 ***
## acad.standOther -0.1641247 0.1412638 -1.162 0.24559
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.7558023)
##
## Null deviance: 1209.9 on 1006 degrees of freedom
## Residual deviance: 735.4 on 973 degrees of freedom
## AIC: 2611.2
##
## Number of Fisher Scoring iterations: 2
## Balance Measures
## Type Diff.Adj M.Threshold
## prog.score Distance 0.0335 Balanced, <0.1
## cum.percent.units.passed Contin. 0.0537 Balanced, <0.1
## eth.erss_African American Binary 0.0103 Balanced, <0.1
## eth.erss_Asian Binary 0.0083 Balanced, <0.1
## eth.erss_Other Binary 0.0038 Balanced, <0.1
## eth.erss_Hispanic Binary 0.0183 Balanced, <0.1
## eth.erss_Two or More Races Binary 0.0039 Balanced, <0.1
## eth.erss_Unknown Binary 0.0004 Balanced, <0.1
## eth.erss_White Binary -0.0450 Balanced, <0.1
## gender_Male Binary 0.0184 Balanced, <0.1
## sat.math.score Contin. 0.0287 Balanced, <0.1
## sat.verbal.score Contin. -0.0086 Balanced, <0.1
## sat.math.flg_old Binary -0.0003 Balanced, <0.1
## AP_CALAB Contin. 0.0700 Balanced, <0.1
## AP_CALAB.flg_Not Missing Binary 0.0029 Balanced, <0.1
## AP_BIOL Contin. -0.0516 Balanced, <0.1
## AP_BIOL.flg_Not Missing Binary 0.0047 Balanced, <0.1
## AP_CHEM Contin. 0.0088 Balanced, <0.1
## AP_CHEM.flg_Not Missing Binary 0.0030 Balanced, <0.1
## csus.gpa.start Contin. 0.0396 Balanced, <0.1
## prevPAL Contin. -0.0422 Balanced, <0.1
## cMaj_Biology Binary 0.0138 Balanced, <0.1
## cMaj_Chemistry Binary -0.0223 Balanced, <0.1
## cMaj_Civil Engineering Binary -0.0013 Balanced, <0.1
## cMaj_Geology Binary -0.0034 Balanced, <0.1
## cMaj_Kinesiology/Physical Education Binary 0.0142 Balanced, <0.1
## cMaj_Mechanical Engineering Binary 0.0039 Balanced, <0.1
## cMaj_Nursing Binary 0.0030 Balanced, <0.1
## cMaj_Nutrition Binary 0.0026 Balanced, <0.1
## cMaj_OTHER Binary -0.0176 Balanced, <0.1
## cMaj_Physics Binary 0.0021 Balanced, <0.1
## cMaj_Undeclared Binary 0.0050 Balanced, <0.1
## term.units.attemptedCensus Contin. -0.0298 Balanced, <0.1
## hs.gpa Contin. 0.0131 Balanced, <0.1
## delay.from.hs Contin. -0.0409 Balanced, <0.1
## acad.stand_Other Binary -0.0116 Balanced, <0.1
## p.score Contin. 0.0010 Balanced, <0.1
##
## Balance tally for mean differences
## count
## Balanced, <0.1 37
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## AP_CALAB 0.07 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 1007. 762
## Matched (ESS) 313.69 757
## Matched (Unweighted) 711. 757
## Unmatched 296. 0
## Discarded 0. 5
The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.4952651 . This result is statistically significant with a P-value of \(4.8561x10^{-13}\) and is based on 757 PAL students and 1561 non-PAL student matches(711 total non-PAL students). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.
summary(match.chem1a)
##
## Estimate... 0.49527
## AI SE...... 0.068509
## T-stat..... 7.2292
## p.val...... 4.8561e-13
##
## Original number of observations.............. 1769
## Original number of treated obs............... 762
## Matched number of observations............... 757
## Matched number of observations (unweighted). 1561
##
## Caliper (SDs)........................................ 0.25
## Number of obs dropped by 'exact' or 'caliper' 5
psens(match.chem1a, Gamma=2.0, GammaInc = 0.1)
##
## Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value
##
## Unconfounded estimate .... 0
##
## Gamma Lower bound Upper bound
## 1.0 0 0.0000
## 1.1 0 0.0000
## 1.2 0 0.0000
## 1.3 0 0.0000
## 1.4 0 0.0000
## 1.5 0 0.0000
## 1.6 0 0.0006
## 1.7 0 0.0106
## 1.8 0 0.0764
## 1.9 0 0.2732
## 2.0 0 0.5716
##
## Note: Gamma is Odds of Differential Assignment To
## Treatment Due to Unobserved Factors
##
Note that in the above table \(\Gamma=1.8\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than1.8 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “cMaj” or “csus.gpa.start” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.
kable(sort(exp(abs(chem1a.first.order.prop.model$coefficients))))
x | |
---|---|
sat.verbal.score | 1.002461 |
sat.math.score | 1.003143 |
cMajNursing | 1.006695 |
AP_BIOL | 1.011381 |
cum.percent.units.passed | 1.018726 |
cMajNutrition | 1.029318 |
AP_CALAB | 1.030082 |
term.units.attemptedCensus | 1.073628 |
cMajGeology | 1.103792 |
delay.from.hs | 1.122394 |
cMajKinesiology/Physical Education | 1.134369 |
sat.math.flgold | 1.137279 |
eth.erssTwo or More Races | 1.138576 |
eth.erssAsian | 1.222257 |
eth.erssUnknown | 1.340370 |
hs.gpa | 1.345820 |
AP_BIOL.flgNot Missing | 1.387255 |
genderMale | 1.396312 |
eth.erssWhite | 1.406862 |
AP_CALAB.flgNot Missing | 1.411832 |
eth.erssOther | 1.521413 |
cMajUndeclared | 1.588511 |
eth.erssHispanic | 1.612834 |
acad.standOther | 1.622814 |
prevPAL | 1.702267 |
cMajCivil Engineering | 1.817850 |
cMajOTHER | 1.839501 |
cMajChemistry | 2.028817 |
csus.gpa.start | 2.052126 |
AP_CHEM | 2.103142 |
cMajPhysics | 2.222943 |
cMajMechanical Engineering | 2.341708 |
AP_CHEM.flgNot Missing | 3.122362 |
(Intercept) | 9.085387 |
Based on data for 769 PAL students and 1090 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.41 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.
After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.43\pm 0.08\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 1B, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.
A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 1B course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.
For Chem 1B, 13 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: ethnicity, has an AP Chemistry exam score, enrollment in PAL in the past, class level, being remedial in math, being eligible for a Pell grant when entering CSUS, academic major, CSUS GPA at start of term, and from CSUS local admission area.
Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0733. Consequently, adequate balance appears to have been achieved.
The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.43\pm 0.08\) or between 0.35 and 0.51 on a 4.0 grade scale. This result is statistically significant with a P-value of \(8.59x10^{-9}\) and is based on 573 PAL students and 457 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.41.
The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 1B. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 1B indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of being remedial in math, class level, or has an AP Chemistry exam score observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.
Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 769 PAL students in Chem 1B being reduced to 573. Also, 457 non-PAL students were selected out of 1090 original non-PAL students.
When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 991 non-PAL matches. Of the 573 PAL students, 331 were matched one-to-one with non-PAL students and 242 were matched one-to-many with non-PAL students.
The non-PAL and PAL groups will include students with only first attempts at CHEM 1B.They will also include students with previous PAL participation and/or are currently in a PAL for another course.
For the prerequisite course CHEM 1A, the grade for the student’s last attempt and the number of times it was taken are added to the CHEM 1B data set. Only 1416 out of 1859 CHEM 1B students have CHEM 1A grades.
However, few students retook CHEM 1A so there was inadequate balance on number of times Chem 1B was taken, and it was removed from the propensity model after balance checks.
# Excludes course repeats
chem1b.dat <- chem.dat %>%
filter(course=="CHEM 1B", pass.term.flg == "PASS Term", course.seq== 0)
dim(chem1b.dat) # 1859 179
## [1] 1859 179
prereq <- "CHEM 1A"
chem1b.dat <- get.prereq.grades(chem1b.dat, chem.dat, prereq)
# Rename CHEM 1A variables
chem1b.dat <- chem1b.dat %>%
rename(chem1a.course.seq= prereq.course.seq, chem1a.grd.pt.unt= prereq.grd.pt.unt, chem1a.grade = prereq.grade)
dim(chem1b.dat) # 1416 182
## [1] 1416 182
There are 1,416 CHEM 1B first attempt only students with prerequisite CHEM 1A grades.
The variables below were added to added to the CHEM 1B data, so there are now 182 variables instead of 179 variables.
chem1a.course.seq: Is the student taking CHEM 1A for the first time or is it the second attempt, third attempt, etc.
chem1a.grd.pt.unt: Numeric grade on 0 to 4 scale (0=F, 4=A) for CHEM 1A
chem1a.grade: Course grade for CHEM 1A
Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.
# # Collapsed cMaj categories to Biology and Other majors at 0.04
# with(chem1b.dat, table(cMaj, palN))
chem1b.dat <- group_category(data = chem1b.dat, feature = "cMaj", threshold = 0.04, update = TRUE)
with(chem1b.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Biology 433 338
## Chemistry 119 83
## Geology 7 5
## Health Science 7 4
## Kinesiology/Physical Education 129 107
## Nutrition 48 19
## OTHER 36 22
## Physics 33 2
## Undeclared 12 12
Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.
## [1] 35
## feature num_missing pct_missing
## 22 deg.plan3 1416 1.0000000
## 23 deg.plan4 1416 1.0000000
## 24 deg.plan5 1416 1.0000000
## 25 deg.plan6 1416 1.0000000
## 19 withdraw_reason 1414 0.9985876
## 21 deg.plan2 1393 0.9837571
## 4 pledge.term 1251 0.8834746
## 11 trf.gpaADM 1203 0.8495763
## 18 treat.section 824 0.5819209
## 1 fys.term.code 778 0.5494350
## 2 fys.grd 778 0.5494350
## 3 fys.rpt.flg 778 0.5494350
## 17 Instructor_02 723 0.5105932
## 27 grad.termERS 670 0.4731638
## 20 deg.plan1 656 0.4632768
## 26 grad.term 656 0.4632768
## 28 ttg 656 0.4632768
## 29 plan.college 607 0.4286723
## 30 plan.college.desc 607 0.4286723
## 31 plan.dept 607 0.4286723
## 32 plan.deptAbbr 607 0.4286723
## 33 plan.degree 607 0.4286723
## 34 plan.type 607 0.4286723
## 5 sat.math.score 277 0.1956215
## 6 sat.math.flg 277 0.1956215
## 7 sat.verbal.score 277 0.1956215
## 8 sat.verbal.flg 277 0.1956215
## 9 sat.test.date 277 0.1956215
## 13 ge.critical.thinking.status 260 0.1836158
## 14 ge.english.comp.status 260 0.1836158
## 15 ge.math.status 260 0.1836158
## 16 ge.oral.comm.status 260 0.1836158
## 12 admit.term 256 0.1807910
## 10 hs.gpa 176 0.1242938
## 35 county 166 0.1172316
## [1] 1416 147
35 variables missing >10%
So, 35 variables were removed due to missingness and there are now 147 variables instead of 182 variables.
chem1b.dat <- chem1b.dat[complete.cases(chem1b.dat), ]
dim(chem1b.dat) # 1337 147
## [1] 1337 147
1337 out of 1416 students are kept
79 students were removed due to missingness of variables
single.vars <- chem1b.dat %>%
summarise(across(everything(), ~ n_distinct(.x))) %>%
select_if(. == 1)
# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem1b.dat)
##
## Overall
## n 1337
## country = USA (%) 1337 (100.0)
## career.course = UGRD (%) 1337 (100.0)
## acad.prog.course = UGD (%) 1337 (100.0)
## course (%)
## CHEM 4 0 ( 0.0)
## CHEM 1A 0 ( 0.0)
## CHEM 1B 1337 (100.0)
## CHEM 24 0 ( 0.0)
## component = LEC (%) 1337 (100.0)
## units (mean (SD)) 5.00 (0.00)
## course.numeric (mean (SD)) 1.00 (0.00)
## div = Lower Division (%) 1337 (100.0)
## course.seq (mean (SD)) 0.00 (0.00)
## rpt.flg = First Attempt (%) 1337 (100.0)
## pass = Non-PASS (%) 1337 (100.0)
## c2s = Non-C2S (%) 1337 (100.0)
## base.time.course (mean (SD)) 1.00 (0.00)
## years (mean (SD)) 0.50 (0.00)
## withdraw_code = NWD (%) 1337 (100.0)
## enrl.flg = Enrolled (%) 1337 (100.0)
## enrl.flgERS = Enrolled (%) 1337 (100.0)
## rtn.flg = Retained (%) 1337 (100.0)
## rtn.flgERS = Retained (%) 1337 (100.0)
## pass.term.flg = PASS Term (%) 1337 (100.0)
## passN (mean (SD)) 0.00 (0.00)
## csus.gpa.start.flg = Not Missing (%) 1337 (100.0)
## higher.ed.gpa.start.flg = Not Missing (%) 1337 (100.0)
sum(single.vars) # 23
## [1] 23
# remove single-valued variables
chem1b.dat<- chem1b.dat %>%
dplyr::select(-names(single.vars))
dim(chem1b.dat) # 1337 124
## [1] 1337 124
124 out of 147 variables are kept
23 variables removed due to single values
# Remove non chem1b instructors
chem1b.dat <- chem1b.dat %>%
droplevels(chem1b.dat$Instructor_01)
# Combine sparse ethnicity categories to Other
chem1b.dat <- chem1b.dat %>%
mutate(eth.erss = fct_other(eth.erss, drop = c("Native American", "Pacific Islander")))
with(chem1b.dat, table(eth.erss, palN))
## palN
## eth.erss 0 1
## African American 31 21
## Asian 233 208
## Foreign 8 6
## Hispanic 167 154
## Two or More Races 52 31
## Unknown 34 13
## White 227 137
## Other 12 3
Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.
chem1b.final <- chem1b.step.vars(chem1b.dat)
kable(names(chem1b.final))
x |
---|
acad.stand |
adm.area |
bot.level |
cMaj |
coh |
course.age |
csus.gpa.start |
cum.percent.units.passed |
delay.from.hs |
e.rmd |
eth.erss |
father.ed |
fys.flg |
gender |
hous.coh.term.flg |
Instructor_01 |
median.income |
m.rmd |
mother.ed |
pct.female.head |
pell.coh.term.flg |
prevPAL |
prevPASS |
reason |
sac.county.flg |
term.units.attemptedCensus |
palN |
grd.pt.unt |
chem1a.grd.pt.unt |
AP_BIOL |
AP_CALAB |
AP_CALBC |
AP_CHEM |
AP_BIOL.flg |
AP_CALAB.flg |
AP_CALBC.flg |
AP_CHEM.flg |
pct.female.head.flg |
med.inc.flg |
Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, and chem1a.grd.pt.unt. Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.
chem1b.final <- chem1b.step.vars(chem1b.dat)
chem1b.first.order.prop.model <- chem1b.step(chem1b.final)
summary(chem1b.first.order.prop.model)
##
## Call:
## glm(formula = model.first.order, family = binomial, data = chem1b.final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0704 -1.0105 -0.6102 1.0971 2.5795
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.4187425 1.1236815 -2.153 0.031357 *
## chem1a.grd.pt.unt -0.0952206 0.1183413 -0.805 0.421035
## cum.percent.units.passed 0.4253555 1.0331478 0.412 0.680553
## eth.erssAsian 0.2258778 0.3168833 0.713 0.475963
## eth.erssForeign 0.1193382 0.6437159 0.185 0.852923
## eth.erssHispanic 0.2176552 0.3213975 0.677 0.498269
## eth.erssTwo or More Races -0.0008854 0.3852899 -0.002 0.998166
## eth.erssUnknown -0.4839073 0.4644237 -1.042 0.297434
## eth.erssWhite -0.0732071 0.3225052 -0.227 0.820427
## eth.erssOther -1.5590171 0.7498221 -2.079 0.037601 *
## genderMale 0.1921808 0.1281980 1.499 0.133849
## AP_CHEM 0.0172767 0.3201902 0.054 0.956969
## AP_CHEM.flgNot Missing -0.8156336 0.2959788 -2.756 0.005856 **
## prevPAL 0.5954449 0.0787251 7.564 3.92e-14 ***
## bot.levelJunior -0.5557096 0.3064733 -1.813 0.069795 .
## bot.levelSenior -0.7147519 0.3160084 -2.262 0.023709 *
## bot.levelSophomore -0.0971246 0.3014493 -0.322 0.747307
## m.rmdRemedial in Math 0.6764007 0.1892499 3.574 0.000351 ***
## pell.coh.term.flgPell 0.4699412 0.1222645 3.844 0.000121 ***
## cMajChemistry -0.0751801 0.1856516 -0.405 0.685512
## cMajGeology 0.2741545 0.6425678 0.427 0.669631
## cMajHealth Science -0.1534683 0.6711026 -0.229 0.819117
## cMajKinesiology/Physical Education 0.2702222 0.1694312 1.595 0.110739
## cMajNutrition -0.3721911 0.3149924 -1.182 0.237369
## cMajOTHER 0.0981896 0.3051662 0.322 0.747636
## cMajPhysics -2.2262195 0.7492173 -2.971 0.002965 **
## cMajUndeclared 0.2144541 0.4430973 0.484 0.628394
## csus.gpa.start 0.4743598 0.2143101 2.213 0.026868 *
## adm.areanonlocal -0.2446215 0.1375927 -1.778 0.075426 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1826.1 on 1336 degrees of freedom
## Residual deviance: 1641.8 on 1308 degrees of freedom
## AIC: 1699.8
##
## Number of Fisher Scoring iterations: 5
p.score <- chem1b.first.order.prop.model$fitted.values
chem1b.covs <- names(chem1b.first.order.prop.model %>% pluck("model") %>% dplyr::select(-palN))
# Unadjusted mean grades
get.unadj.means(chem1b.final)
Non-PAL | PAL | Diff. |
---|---|---|
1.859686 | 2.187609 | 0.3279232 |
Standardized mean differences for continuous variables and categorical variables.
unmatched.tab <- CreateTableOne(vars = chem1b.covs, strata = "palN",
data = chem1b.final, smd = TRUE, test = FALSE)
print(unmatched.tab, smd = TRUE)
## Stratified by palN
## 0 1 SMD
## n 764 573
## chem1a.grd.pt.unt (mean (SD)) 2.59 (0.68) 2.58 (0.64) 0.006
## cum.percent.units.passed (mean (SD)) 0.92 (0.08) 0.92 (0.08) 0.008
## eth.erss (%) 0.249
## African American 31 ( 4.1) 21 ( 3.7)
## Asian 233 (30.5) 208 (36.3)
## Foreign 8 ( 1.0) 6 ( 1.0)
## Hispanic 167 (21.9) 154 (26.9)
## Two or More Races 52 ( 6.8) 31 ( 5.4)
## Unknown 34 ( 4.5) 13 ( 2.3)
## White 227 (29.7) 137 (23.9)
## Other 12 ( 1.6) 3 ( 0.5)
## gender = Male (%) 308 (40.3) 223 (38.9) 0.029
## AP_CHEM (mean (SD)) 1.97 (0.26) 1.96 (0.18) 0.037
## AP_CHEM.flg = Not Missing (%) 55 ( 7.2) 19 ( 3.3) 0.175
## prevPAL (mean (SD)) 0.63 (0.77) 1.02 (0.80) 0.499
## bot.level (%) 0.210
## Freshman 32 ( 4.2) 29 ( 5.1)
## Junior 270 (35.3) 211 (36.8)
## Senior 256 (33.5) 141 (24.6)
## Sophomore 206 (27.0) 192 (33.5)
## m.rmd = Remedial in Math (%) 75 ( 9.8) 106 (18.5) 0.251
## pell.coh.term.flg = Pell (%) 358 (46.9) 343 (59.9) 0.263
## cMaj (%) 0.299
## Biology 399 (52.2) 324 (56.5)
## Chemistry 114 (14.9) 79 (13.8)
## Geology 6 ( 0.8) 5 ( 0.9)
## Health Science 7 ( 0.9) 4 ( 0.7)
## Kinesiology/Physical Education 120 (15.7) 107 (18.7)
## Nutrition 44 ( 5.8) 18 ( 3.1)
## OTHER 32 ( 4.2) 22 ( 3.8)
## Physics 30 ( 3.9) 2 ( 0.3)
## Undeclared 12 ( 1.6) 12 ( 2.1)
## csus.gpa.start (mean (SD)) 3.16 (0.45) 3.21 (0.41) 0.105
## adm.area = nonlocal (%) 231 (30.2) 149 (26.0) 0.094
Check how many variables have SMD > 0.1
addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
##
## FALSE TRUE Sum
## 5 8 13
get.imbal.vars(unmatched.tab)
Variable | Before Matching SMD |
---|---|
prevPAL | 0.4992792 |
cMaj | 0.2985656 |
pell.coh.term.flg | 0.2628653 |
m.rmd | 0.2510048 |
eth.erss | 0.2493008 |
bot.level | 0.2101597 |
AP_CHEM.flg | 0.1746490 |
csus.gpa.start | 0.1050944 |
8 variables have SMD >0.1
Implement a propensity score matching method.
match.chem1b <- with(chem1b.final, Match(
Y=chem1b.final$grd.pt.unt, Tr = chem1b.final$palN, X = p.score,
BiasAdjust = F, estimand = "ATT", M=1, caliper=0.25, replace = TRUE, ties = TRUE))
Standardized mean differences for continuous variables and categorical variables.
# Needed for match table
chem1b.final <- chem1b.final %>%
rownames_to_column(var = "id")
# Matched data
chem1b.matched.dat <- chem1b.final[unlist(match.chem1b[c("index.treated", "index.control")]), ]
chem1b.matched.dat$match.weights<- c(match.chem1b$weights, match.chem1b$weights)
# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem1b.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem1b.covs, strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
## Stratified by palN
## 0 1 SMD
## n 573.00 573.00
## chem1a.grd.pt.unt (mean (SD)) 2.54 (0.67) 2.58 (0.64) 0.071
## cum.percent.units.passed (mean (SD)) 0.92 (0.08) 0.92 (0.08) 0.002
## eth.erss (%) 0.176
## African American 33.9 ( 5.9) 21.0 ( 3.7)
## Asian 189.9 (33.1) 208.0 (36.3)
## Foreign 7.5 ( 1.3) 6.0 ( 1.0)
## Hispanic 153.9 (26.9) 154.0 (26.9)
## Two or More Races 48.8 ( 8.5) 31.0 ( 5.4)
## Unknown 11.0 ( 1.9) 13.0 ( 2.3)
## White 125.7 (21.9) 137.0 (23.9)
## Other 2.2 ( 0.4) 3.0 ( 0.5)
## gender = Male (%) 195.8 (34.2) 223.0 (38.9) 0.099
## AP_CHEM (mean (SD)) 1.96 (0.11) 1.96 (0.18) 0.003
## AP_CHEM.flg = Not Missing (%) 10.9 ( 1.9) 19.0 ( 3.3) 0.089
## prevPAL (mean (SD)) 1.04 (0.92) 1.02 (0.80) 0.025
## bot.level (%) 0.078
## Freshman 37.9 ( 6.6) 29.0 ( 5.1)
## Junior 218.6 (38.2) 211.0 (36.8)
## Senior 135.4 (23.6) 141.0 (24.6)
## Sophomore 181.2 (31.6) 192.0 (33.5)
## m.rmd = Remedial in Math (%) 106.2 (18.5) 106.0 (18.5) 0.001
## pell.coh.term.flg = Pell (%) 325.4 (56.8) 343.0 (59.9) 0.062
## cMaj (%) 0.118
## Biology 298.0 (52.0) 324.0 (56.5)
## Chemistry 85.5 (14.9) 79.0 (13.8)
## Geology 3.6 ( 0.6) 5.0 ( 0.9)
## Health Science 6.8 ( 1.2) 4.0 ( 0.7)
## Kinesiology/Physical Education 119.0 (20.8) 107.0 (18.7)
## Nutrition 19.1 ( 3.3) 18.0 ( 3.1)
## OTHER 28.7 ( 5.0) 22.0 ( 3.8)
## Physics 2.0 ( 0.3) 2.0 ( 0.3)
## Undeclared 10.2 ( 1.8) 12.0 ( 2.1)
## csus.gpa.start (mean (SD)) 3.18 (0.42) 3.21 (0.41) 0.055
## adm.area = nonlocal (%) 147.6 (25.8) 149.0 (26.0) 0.005
Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group
Binary variables: Raw differences in proportion
All variables are balanced and under the <0.1 mean threshold.
## Balance Measures
## All covariates are balanced.
##
## Balance tally for mean differences
## count
## Balanced, <0.1 32
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## chem1a.grd.pt.unt 0.0733 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 764. 573
## Matched (ESS) 214.85 573
## Matched (Unweighted) 457. 573
## Unmatched 307. 0
get.var.perc.tab(chem1b.bal)
## Variable Diff.Un Diff.Adj % Improvement
## 1 p.score 0.7985494927 0.0016913096 100
## 2 eth.erss_Hispanic 0.0501745201 0.0002326934 100
## 3 m.rmd_Remedial in Math 0.0868237347 -0.0002908668 100
## 4 cMaj_Physics -0.0357766143 0.0000000000 100
## 5 prevPAL 0.4896307024 -0.0266280848 95
## 6 AP_CHEM -0.0460365334 0.0026776474 94
## 7 adm.area_nonlocal -0.0423211169 0.0023851076 94
## 8 cMaj_Nutrition -0.0261780105 -0.0020069808 92
## 9 bot.level_Senior -0.0890052356 0.0098312973 89
## 10 eth.erss_Other -0.0104712042 0.0013089005 88
## 11 eth.erss_Unknown -0.0218150087 0.0034322280 84
## 12 cum.percent.units.passed -0.0084163831 0.0017311130 79
## 13 pell.coh.term.flg_Pell 0.1300174520 0.0307737056 76
## 14 bot.level_Sophomore 0.0654450262 0.0189063409 71
## 15 eth.erss_White -0.0580279232 0.0197498546 66
## 16 AP_CHEM.flg_Not Missing -0.0388307155 0.0141942990 63
## 17 csus.gpa.start 0.1110693721 0.0559114629 50
## 18 eth.erss_Asian 0.0580279232 0.0315299593 46
## 19 cMaj_Undeclared 0.0052356021 0.0031413613 40
## 20 cMaj_Kinesiology/Physical Education 0.0296684119 -0.0209714951 29
## 21 bot.level_Junior 0.0148342059 -0.0132635253 11
## 22 eth.erss_Foreign 0.0000000000 -0.0026178010 0
## 23 cMaj_Chemistry -0.0113438045 -0.0113728912 0
## 24 cMaj_Biology 0.0431937173 0.0454043048 -5
## 25 bot.level_Freshman 0.0087260035 -0.0154741129 -77
## 26 cMaj_Health Science -0.0021815009 -0.0047993019 -120
## 27 eth.erss_Two or More Races -0.0139616056 -0.0310936591 -123
## 28 cMaj_Geology 0.0008726003 0.0023560209 -170
## 29 cMaj_OTHER -0.0034904014 -0.0117510180 -237
## 30 gender_Male -0.0139616056 0.0474694590 -240
## 31 eth.erss_African American -0.0039267016 -0.0225421757 -474
## 32 chem1a.grd.pt.unt -0.0065835834 0.0732743689 -1013
get.bal.plot(unmatched.tab, matched.tab)
love.plot(chem1b.bal,binary = "raw", stars = "std", var.order = "unadjusted",
thresholds = c(m = .1), abs = F)
create.match.tab(chem1b.matched.dat)
Non-PAL | PAL | |
---|---|---|
Single Matches | 168 | 331 |
Multiple Matches | 289 | 242 |
Total Students | 457 | 573 |
Out of 573 PAL students, 331 PAL students were matched to one non-PAL student and 242 PAL students were matched to multiple non-PAL students.
Out of 991 non-PAL student matches, there were 457 non-PAL students, 168 of the non-PAL students were matched to one PAL student and 289 of the non-PAL students were matched to multiple PAL students.
get.att.plot(chem1b.final, match.chem1b)
The standardized mean differences of the prognostic scores is 0.0751, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.
##
## Call:
## glm(formula = f.build("grd.pt.unt", chem1b.covs), data = ctrl.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.38485 -0.44720 0.07171 0.50204 2.90517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.560100 0.506831 -5.051 5.54e-07 ***
## chem1a.grd.pt.unt 0.383550 0.056839 6.748 3.04e-11 ***
## cum.percent.units.passed 0.263816 0.493095 0.535 0.5928
## eth.erssAsian 0.173864 0.153593 1.132 0.2580
## eth.erssForeign 0.230472 0.316260 0.729 0.4664
## eth.erssHispanic 0.120339 0.154148 0.781 0.4352
## eth.erssTwo or More Races 0.224773 0.180187 1.247 0.2126
## eth.erssUnknown 0.213163 0.199194 1.070 0.2849
## eth.erssWhite 0.209831 0.154464 1.358 0.1747
## eth.erssOther 0.663180 0.269936 2.457 0.0142 *
## genderMale 0.137315 0.062484 2.198 0.0283 *
## AP_CHEM 0.112366 0.113839 0.987 0.3239
## AP_CHEM.flgNot Missing 0.062124 0.113912 0.545 0.5857
## prevPAL -0.161055 0.039232 -4.105 4.49e-05 ***
## bot.levelJunior 0.054140 0.158553 0.341 0.7329
## bot.levelSenior 0.032432 0.160796 0.202 0.8402
## bot.levelSophomore -0.021684 0.155768 -0.139 0.8893
## m.rmdRemedial in Math -0.077182 0.104382 -0.739 0.4599
## pell.coh.term.flgPell -0.079351 0.059006 -1.345 0.1791
## cMajChemistry 0.148660 0.090582 1.641 0.1012
## cMajGeology 0.131348 0.324729 0.404 0.6860
## cMajHealth Science 0.535358 0.304482 1.758 0.0791 .
## cMajKinesiology/Physical Education 0.091306 0.083899 1.088 0.2768
## cMajNutrition -0.289441 0.129282 -2.239 0.0255 *
## cMajOTHER -0.084585 0.146957 -0.576 0.5651
## cMajPhysics -0.009565 0.155174 -0.062 0.9509
## cMajUndeclared 0.241947 0.231955 1.043 0.2973
## csus.gpa.start 0.895163 0.101919 8.783 < 2e-16 ***
## adm.areanonlocal -0.036492 0.064317 -0.567 0.5706
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.6128765)
##
## Null deviance: 802.88 on 763 degrees of freedom
## Residual deviance: 450.46 on 735 degrees of freedom
## AIC: 1824.5
##
## Number of Fisher Scoring iterations: 2
## Balance Measures
## Type Diff.Adj M.Threshold
## prog.score Distance 0.0751 Balanced, <0.1
## chem1a.grd.pt.unt Contin. 0.0733 Balanced, <0.1
## cum.percent.units.passed Contin. 0.0017 Balanced, <0.1
## eth.erss_African American Binary -0.0225 Balanced, <0.1
## eth.erss_Asian Binary 0.0315 Balanced, <0.1
## eth.erss_Foreign Binary -0.0026 Balanced, <0.1
## eth.erss_Hispanic Binary 0.0002 Balanced, <0.1
## eth.erss_Two or More Races Binary -0.0311 Balanced, <0.1
## eth.erss_Unknown Binary 0.0034 Balanced, <0.1
## eth.erss_White Binary 0.0197 Balanced, <0.1
## eth.erss_Other Binary 0.0013 Balanced, <0.1
## gender_Male Binary 0.0475 Balanced, <0.1
## AP_CHEM Contin. 0.0027 Balanced, <0.1
## AP_CHEM.flg_Not Missing Binary 0.0142 Balanced, <0.1
## prevPAL Contin. -0.0266 Balanced, <0.1
## bot.level_Freshman Binary -0.0155 Balanced, <0.1
## bot.level_Junior Binary -0.0133 Balanced, <0.1
## bot.level_Senior Binary 0.0098 Balanced, <0.1
## bot.level_Sophomore Binary 0.0189 Balanced, <0.1
## m.rmd_Remedial in Math Binary -0.0003 Balanced, <0.1
## pell.coh.term.flg_Pell Binary 0.0308 Balanced, <0.1
## cMaj_Biology Binary 0.0454 Balanced, <0.1
## cMaj_Chemistry Binary -0.0114 Balanced, <0.1
## cMaj_Geology Binary 0.0024 Balanced, <0.1
## cMaj_Health Science Binary -0.0048 Balanced, <0.1
## cMaj_Kinesiology/Physical Education Binary -0.0210 Balanced, <0.1
## cMaj_Nutrition Binary -0.0020 Balanced, <0.1
## cMaj_OTHER Binary -0.0118 Balanced, <0.1
## cMaj_Physics Binary 0.0000 Balanced, <0.1
## cMaj_Undeclared Binary 0.0031 Balanced, <0.1
## csus.gpa.start Contin. 0.0559 Balanced, <0.1
## adm.area_nonlocal Binary 0.0024 Balanced, <0.1
## p.score Contin. 0.0017 Balanced, <0.1
##
## Balance tally for mean differences
## count
## Balanced, <0.1 33
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## chem1a.grd.pt.unt 0.0733 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 764. 573
## Matched (ESS) 214.85 573
## Matched (Unweighted) 457. 573
## Unmatched 307. 0
The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.43171. This result is statistically significant with a P-value of \(1.7177x10^{-8}\) and is based on 573 PAL students and 991 non-PAL student matches(457 total non-PAL students) Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.
summary(match.chem1b)
##
## Estimate... 0.43171
## AI SE...... 0.076567
## T-stat..... 5.6383
## p.val...... 1.7177e-08
##
## Original number of observations.............. 1337
## Original number of treated obs............... 573
## Matched number of observations............... 573
## Matched number of observations (unweighted). 991
##
## Caliper (SDs)........................................ 0.25
## Number of obs dropped by 'exact' or 'caliper' 0
psens(match.chem1b, Gamma=2.0, GammaInc = 0.1)
##
## Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value
##
## Unconfounded estimate .... 0
##
## Gamma Lower bound Upper bound
## 1.0 0 0.0000
## 1.1 0 0.0000
## 1.2 0 0.0000
## 1.3 0 0.0000
## 1.4 0 0.0000
## 1.5 0 0.0005
## 1.6 0 0.0058
## 1.7 0 0.0363
## 1.8 0 0.1335
## 1.9 0 0.3217
## 2.0 0 0.5599
##
## Note: Gamma is Odds of Differential Assignment To
## Treatment Due to Unobserved Factors
##
Note that in the above table \(\Gamma=1.8\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “instructor”bot.level" or “eth.erss” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.
kable(sort(exp(abs(chem1b.first.order.prop.model$coefficients))))
x | |
---|---|
eth.erssTwo or More Races | 1.000886 |
AP_CHEM | 1.017427 |
eth.erssWhite | 1.075953 |
cMajChemistry | 1.078078 |
chem1a.grd.pt.unt | 1.099901 |
bot.levelSophomore | 1.101998 |
cMajOTHER | 1.103172 |
eth.erssForeign | 1.126751 |
cMajHealth Science | 1.165871 |
genderMale | 1.211890 |
cMajUndeclared | 1.239185 |
eth.erssHispanic | 1.243158 |
eth.erssAsian | 1.253423 |
adm.areanonlocal | 1.277138 |
cMajKinesiology/Physical Education | 1.310256 |
cMajGeology | 1.315418 |
cMajNutrition | 1.450910 |
cum.percent.units.passed | 1.530134 |
pell.coh.term.flgPell | 1.599900 |
csus.gpa.start | 1.606985 |
eth.erssUnknown | 1.622401 |
bot.levelJunior | 1.743177 |
prevPAL | 1.813838 |
m.rmdRemedial in Math | 1.966786 |
bot.levelSenior | 2.043680 |
AP_CHEM.flgNot Missing | 2.260607 |
eth.erssOther | 4.754146 |
cMajPhysics | 9.264774 |
(Intercept) | 11.231726 |
Based on data for 759 PAL students and 1929 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.36 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.
After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.44\pm 0.09\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 4, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.
A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 4 course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.
For Chem 4, 13 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: female gender, lower SAT scores, lower AP Calculus exam scores, higher term units attempted, academic major, class level, and CSUS GPA at start of term.
Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0794. Consequently, adequate balance appears to have been achieved.
The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.44\pm 0.09\) or between 0.35 and 0.53 on a 4.0 grade scale. This result is statistically significant with a P-value of \(2.19x10^{-7}\) and is based on 530 PAL students and 680 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.36.
The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 4. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.7 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 4 indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of class level, instructor, or cohort (Transfer or Native Freshmen) observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.
Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 759 PAL students in Chem 4 being reduced to 530. Also, 680 non-PAL students were selected out of 1929 original non-PAL students.
When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 1238 non-PAL matches. Of the 530 PAL students, 206 were matched one-to-one with non-PAL students and 324 were matched one-to-many with non-PAL students.
The non-PAL and PAL groups will include students with only first attempts at CHEM 4. They will also include students with previous PAL participation and/or are currently in a PAL for another course.
# Excludes course repeats
chem4.dat <- chem.dat %>%
filter(course=="CHEM 4", pass.term.flg == "PASS Term", course.seq== 0)
dim(chem4.dat) # 2688 179
## [1] 2688 179
There are 2688 first attempt CHEM 4 students.
Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.
# Collapsed cMaj categories to Biology and Other majors at 0.09
with(chem4.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Anthropology 4 2
## Art 1 0
## Biology 590 324
## Business 14 2
## Chemistry 168 66
## Child Devel/Early Childhood Ed 8 6
## Civil Engineering 220 42
## Communications 5 1
## Computer Engineering 21 8
## Computer Science 17 7
## Construction Management 2 2
## Criminal Justice 16 8
## Deaf Studies 3 0
## Economics 1 0
## Electrical Engineering 86 13
## English 3 3
## Environmental Studies 24 6
## Family & Consumer Sciences 1 0
## Film 1 0
## Finance 3 1
## French 1 0
## Geology 27 8
## Gerontology 2 0
## Graphic Design 3 0
## Health Science 17 6
## History 2 3
## Humanities 0 1
## Interior Design 1 0
## International Business 1 2
## Journalism 1 1
## Kinesiology/Physical Education 202 81
## Liberal Studies 8 0
## Mathematics 7 2
## Mechanical Engineering 212 62
## Music 0 1
## Nursing 29 25
## Nutrition 89 35
## Philosophy 3 1
## Photography 0 1
## Physical Science 1 0
## Physics 13 3
## Political Science 2 0
## Psychology 42 8
## Recreation Administration 0 1
## Social Science 1 1
## Social Work 1 3
## Sociology 1 0
## Spanish 1 2
## Speech Pathology 8 0
## Theatre Arts 1 1
## Undeclared 61 20
chem4.dat <- group_category(data = chem4.dat, feature = "cMaj", threshold = 0.09, update = TRUE)
with(chem4.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Biology 590 324
## Chemistry 168 66
## Civil Engineering 220 42
## Electrical Engineering 86 13
## Environmental Studies 24 6
## Geology 27 8
## Kinesiology/Physical Education 202 81
## Mechanical Engineering 212 62
## Nursing 29 25
## Nutrition 89 35
## OTHER 179 69
## Psychology 42 8
## Undeclared 61 20
Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.
## [1] 40
## feature num_missing pct_missing
## 19 Instructor_02 2688 1.00000000
## 24 deg.plan3 2688 1.00000000
## 25 deg.plan4 2688 1.00000000
## 26 deg.plan5 2688 1.00000000
## 27 deg.plan6 2688 1.00000000
## 21 withdraw_reason 2671 0.99367560
## 23 deg.plan2 2667 0.99218750
## 13 trf.gpaADM 2330 0.86681548
## 4 pledge.term 2157 0.80245536
## 29 grad.termERS 1738 0.64657738
## 22 deg.plan1 1713 0.63727679
## 28 grad.term 1713 0.63727679
## 30 ttg 1713 0.63727679
## 1 fys.term.code 1579 0.58742560
## 2 fys.grd 1579 0.58742560
## 3 fys.rpt.flg 1579 0.58742560
## 20 treat.section 1367 0.50855655
## 15 ge.critical.thinking.status 813 0.30245536
## 17 ge.math.status 813 0.30245536
## 18 ge.oral.comm.status 813 0.30245536
## 16 ge.english.comp.status 812 0.30208333
## 14 admit.term 790 0.29389881
## 33 plan.college 769 0.28608631
## 34 plan.college.desc 769 0.28608631
## 35 plan.dept 769 0.28608631
## 36 plan.deptAbbr 769 0.28608631
## 37 plan.degree 769 0.28608631
## 38 plan.type 769 0.28608631
## 7 sat.math.score 519 0.19308036
## 8 sat.math.flg 519 0.19308036
## 9 sat.verbal.score 519 0.19308036
## 10 sat.verbal.flg 519 0.19308036
## 11 sat.test.date 519 0.19308036
## 31 tot.passd.prgrss.start 419 0.15587798
## 32 tot.taken.prgrss.start 419 0.15587798
## 39 cum.percent.units.passed 419 0.15587798
## 40 county 406 0.15104167
## 12 hs.gpa 321 0.11941964
## 5 m.rmd.admin 146 0.05431548
## 6 m.rmd.admin.detail 146 0.05431548
## [1] 2688 144
40 variables missing >5%
5 out of 40 variables were important and force included, even though they were missing >5%
So, 35 variables were removed due to missingness and there are now 144 variables instead of 179 variables.
chem4.dat <- chem4.dat[complete.cases(chem4.dat), ]
dim(chem4.dat) # 1723 144
## [1] 1723 144
1723 out of 2688 students are kept
965 students were removed due to missingness of variables
single.vars <- chem4.dat %>%
summarise(across(everything(), ~ n_distinct(.x))) %>%
select_if(. == 1)
# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem4.dat)
##
## Overall
## n 1723
## country = USA (%) 1723 (100.0)
## career.course = UGRD (%) 1723 (100.0)
## acad.prog.course = UGD (%) 1723 (100.0)
## course (%)
## CHEM 4 1723 (100.0)
## CHEM 1A 0 ( 0.0)
## CHEM 1B 0 ( 0.0)
## CHEM 24 0 ( 0.0)
## component = LEC (%) 1723 (100.0)
## units (mean (SD)) 3.00 (0.00)
## course.numeric (mean (SD)) 4.00 (0.00)
## div = Lower Division (%) 1723 (100.0)
## course.seq (mean (SD)) 0.00 (0.00)
## rpt.flg = First Attempt (%) 1723 (100.0)
## base.time.course (mean (SD)) 1.00 (0.00)
## years (mean (SD)) 0.50 (0.00)
## enroll.status = Continuing (%) 1723 (100.0)
## withdraw_code = NWD (%) 1723 (100.0)
## enrl.flg = Enrolled (%) 1723 (100.0)
## enrl.flgERS = Enrolled (%) 1723 (100.0)
## rtn.flg = Retained (%) 1723 (100.0)
## rtn.flgERS = Retained (%) 1723 (100.0)
## pass.term.flg = PASS Term (%) 1723 (100.0)
## csus.gpa.start.flg = Not Missing (%) 1723 (100.0)
## higher.ed.gpa.start.flg = Not Missing (%) 1723 (100.0)
sum(single.vars) # 21
## [1] 21
# remove single-valued variables
chem4.dat<- chem4.dat %>%
dplyr::select(-names(single.vars))
dim(chem4.dat) # 1723 123
## [1] 1723 123
123 out of 144 variables are kept
21 variables removed due to single values
# Remove non chem4 instructors
chem4.dat <- chem4.dat %>%
droplevels(chem4.dat$Instructor_01)
# Combine sparse ethnicity categories to Other
chem4.dat <- chem4.dat %>%
mutate(eth.erss = fct_collapse(eth.erss, `N.A/P.I` = c("Pacific Islander", "Native American")))
with(chem4.dat, table(eth.erss, palN))
## palN
## eth.erss 0 1
## African American 70 32
## Asian 361 132
## Foreign 26 9
## Hispanic 404 234
## N.A/P.I 15 11
## Two or More Races 64 31
## Unknown 35 9
## White 217 73
# Collapse sparse categories for acad.stand
# Other: Academic Dismissal, Academic Disqualification
chem4.dat <- chem4.dat %>%
mutate(acad.stand = fct_other(acad.stand, keep = c("Good Standing")))
with(chem4.dat, table(acad.stand, palN))
## palN
## acad.stand 0 1
## Good Standing 1076 498
## Other 116 33
Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.
chem4.final <- chem4.step.vars(chem4.dat)
kable(names(chem4.final))
x |
---|
acad.stand |
adm.area |
bot.level |
cMaj |
coh |
course.age |
csus.gpa.start |
cum.percent.units.passed |
delay.from.hs |
e.rmd |
eth.erss |
father.ed |
fys.flg |
gender |
hous.coh.term.flg |
hs.gpa |
Instructor_01 |
median.income |
m.rmd |
mother.ed |
pct.female.head |
pell.coh.term.flg |
prevPAL |
prevPASS |
reason |
sac.county.flg |
term.units.attemptedCensus |
palN |
grd.pt.unt |
sat.math.score |
sat.math.flg |
sat.verbal.score |
AP_BIOL |
AP_CALAB |
AP_CALBC |
AP_CHEM |
AP_BIOL.flg |
AP_CALAB.flg |
AP_CALBC.flg |
AP_CHEM.flg |
pct.female.head.flg |
med.inc.flg |
Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, sat.math.score, sat.verbal.score, and sat.math.flg (same as sat.verbal.flg). Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.
chem4.first.order.prop.model <- chem4.step(chem4.final)
summary(chem4.first.order.prop.model)
##
## Call:
## glm(formula = model.first.order, family = binomial, data = chem4.final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0928 -0.8582 -0.5962 1.0823 2.5189
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.313546 0.984757 -0.318 0.750182
## cum.percent.units.passed -0.874892 0.563737 -1.552 0.120674
## eth.erssAsian -0.183902 0.257877 -0.713 0.475761
## eth.erssForeign 0.097238 0.470616 0.207 0.836309
## eth.erssHispanic 0.389267 0.249464 1.560 0.118663
## eth.erssN.A/P.I 0.426014 0.492404 0.865 0.386944
## eth.erssTwo or More Races 0.108771 0.329229 0.330 0.741113
## eth.erssUnknown -0.448564 0.460291 -0.975 0.329796
## eth.erssWhite -0.124588 0.276471 -0.451 0.652252
## genderMale -0.363496 0.130878 -2.777 0.005480 **
## sat.math.score -0.002785 0.001077 -2.586 0.009715 **
## sat.verbal.score -0.001747 0.001014 -1.723 0.084824 .
## sat.math.flgold 0.392623 0.243203 1.614 0.106444
## AP_CALAB -0.411235 0.206074 -1.996 0.045981 *
## AP_CALAB.flgNot Missing -0.664814 0.309910 -2.145 0.031938 *
## term.units.attemptedCensus 0.147671 0.027876 5.298 1.17e-07 ***
## Instructor_012 -0.330922 0.600790 -0.551 0.581763
## Instructor_013 -0.720154 0.559506 -1.287 0.198051
## Instructor_016 -0.893954 0.785129 -1.139 0.254867
## Instructor_017 -0.395600 0.747739 -0.529 0.596762
## Instructor_0110 -0.181681 0.617853 -0.294 0.768718
## Instructor_0112 0.293936 0.508475 0.578 0.563214
## Instructor_0120 0.615611 0.549709 1.120 0.262763
## Instructor_0122 -0.816065 0.636632 -1.282 0.199896
## cMajChemistry -0.405909 0.229873 -1.766 0.077429 .
## cMajCivil Engineering -0.870867 0.239665 -3.634 0.000279 ***
## cMajElectrical Engineering -0.787875 0.378779 -2.080 0.037522 *
## cMajEnvironmental Studies 0.232229 0.636605 0.365 0.715266
## cMajGeology -1.415123 0.793576 -1.783 0.074550 .
## cMajKinesiology/Physical Education -0.436497 0.185440 -2.354 0.018580 *
## cMajMechanical Engineering -0.657946 0.229622 -2.865 0.004166 **
## cMajNursing 0.367788 0.367903 1.000 0.317463
## cMajNutrition -0.730409 0.297964 -2.451 0.014233 *
## cMajOTHER -0.444793 0.221100 -2.012 0.044248 *
## cMajPsychology -1.636895 0.571105 -2.866 0.004154 **
## cMajUndeclared -0.518958 0.307354 -1.688 0.091321 .
## bot.levelJunior 0.475672 0.194301 2.448 0.014360 *
## bot.levelSenior 0.636372 0.428091 1.487 0.137138
## bot.levelSophomore 0.515050 0.145909 3.530 0.000416 ***
## csus.gpa.start 0.332728 0.123350 2.697 0.006988 **
## cohTransfers -0.679730 0.456568 -1.489 0.136545
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2128.4 on 1722 degrees of freedom
## Residual deviance: 1892.6 on 1682 degrees of freedom
## AIC: 1974.6
##
## Number of Fisher Scoring iterations: 4
p.score <- chem4.first.order.prop.model$fitted.values
chem4.covs <- names(chem4.first.order.prop.model %>% pluck("model") %>% dplyr::select(-palN))
# Unadjusted mean grades
get.unadj.means(chem4.final)
Non-PAL | PAL | Diff. |
---|---|---|
2.01552 | 2.343503 | 0.3279827 |
## Stratified by palN
## 0 1 SMD
## n 1192 531
## cum.percent.units.passed (mean (SD)) 0.87 (0.13) 0.85 (0.13) 0.125
## eth.erss (%) 0.251
## African American 70 ( 5.9) 32 ( 6.0)
## Asian 361 (30.3) 132 (24.9)
## Foreign 26 ( 2.2) 9 ( 1.7)
## Hispanic 404 (33.9) 234 (44.1)
## N.A/P.I 15 ( 1.3) 11 ( 2.1)
## Two or More Races 64 ( 5.4) 31 ( 5.8)
## Unknown 35 ( 2.9) 9 ( 1.7)
## White 217 (18.2) 73 (13.7)
## gender = Male (%) 589 (49.4) 185 (34.8) 0.298
## sat.math.score (mean (SD)) 492.77 (71.49) 465.20 (69.12) 0.392
## sat.verbal.score (mean (SD)) 470.15 (73.09) 451.90 (72.72) 0.250
## sat.math.flg = old (%) 1042 (87.4) 495 (93.2) 0.197
## AP_CALAB (mean (SD)) 2.51 (0.49) 2.51 (0.41) 0.003
## AP_CALAB.flg = Not Missing (%) 148 (12.4) 41 ( 7.7) 0.156
## term.units.attemptedCensus (mean (SD)) 13.67 (2.19) 14.16 (2.23) 0.226
## Instructor_01 (%) 0.348
## 1 25 ( 2.1) 6 ( 1.1)
## 2 43 ( 3.6) 11 ( 2.1)
## 3 116 ( 9.7) 22 ( 4.1)
## 6 19 ( 1.6) 4 ( 0.8)
## 7 24 ( 2.0) 4 ( 0.8)
## 10 37 ( 3.1) 12 ( 2.3)
## 12 818 (68.6) 415 (78.2)
## 20 69 ( 5.8) 48 ( 9.0)
## 22 41 ( 3.4) 9 ( 1.7)
## cMaj (%) 0.418
## Biology 361 (30.3) 232 (43.7)
## Chemistry 85 ( 7.1) 36 ( 6.8)
## Civil Engineering 139 (11.7) 31 ( 5.8)
## Electrical Engineering 46 ( 3.9) 11 ( 2.1)
## Environmental Studies 7 ( 0.6) 5 ( 0.9)
## Geology 15 ( 1.3) 2 ( 0.4)
## Kinesiology/Physical Education 142 (11.9) 70 (13.2)
## Mechanical Engineering 150 (12.6) 39 ( 7.3)
## Nursing 18 ( 1.5) 18 ( 3.4)
## Nutrition 52 ( 4.4) 20 ( 3.8)
## OTHER 102 ( 8.6) 45 ( 8.5)
## Psychology 24 ( 2.0) 4 ( 0.8)
## Undeclared 51 ( 4.3) 18 ( 3.4)
## bot.level (%) 0.175
## Freshman 390 (32.7) 134 (25.2)
## Junior 210 (17.6) 98 (18.5)
## Senior 30 ( 2.5) 11 ( 2.1)
## Sophomore 562 (47.1) 288 (54.2)
## csus.gpa.start (mean (SD)) 2.94 (0.57) 3.01 (0.52) 0.129
## coh = Transfers (%) 33 ( 2.8) 7 ( 1.3) 0.103
Check how many variables have SMD > 0.1
addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
##
## FALSE TRUE Sum
## 1 13 14
get.imbal.vars(unmatched.tab)
Variable | Before Matching SMD |
---|---|
cMaj | 0.4176251 |
sat.math.score | 0.3920790 |
Instructor_01 | 0.3483413 |
gender | 0.2984060 |
eth.erss | 0.2508546 |
sat.verbal.score | 0.2503217 |
term.units.attemptedCensus | 0.2255303 |
sat.math.flg | 0.1972337 |
bot.level | 0.1752879 |
AP_CALAB.flg | 0.1564959 |
csus.gpa.start | 0.1288580 |
cum.percent.units.passed | 0.1248668 |
coh | 0.1026375 |
13 variables have SMD >0.1
Implement a propensity score matching method.
match.chem4 <- with(chem4.final, Match(
Y=chem4.final$grd.pt.unt, Tr = chem4.final$palN, X = p.score,
BiasAdjust = F, estimand = "ATT", M=1, caliper=0.25, replace = TRUE, ties = TRUE))
Standardized mean differences for continuous variables and categorical variables.
# Needed for match table
chem4.final <- chem4.final %>%
rownames_to_column(var = "id")
# Matched data
chem4.matched.dat <- chem4.final[unlist(match.chem4[c("index.treated", "index.control")]), ]
chem4.matched.dat$match.weights<- c(match.chem4$weights, match.chem4$weights)
# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem4.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem4.covs, strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE, noSpaces = TRUE)
## Stratified by palN
## 0 1 SMD
## n 530.00 530.00
## cum.percent.units.passed (mean (SD)) 0.85 (0.13) 0.85 (0.13) 0.019
## eth.erss (%) 0.147
## African American 26.1 (4.9) 32.0 (6.0)
## Asian 125.3 (23.6) 132.0 (24.9)
## Foreign 11.8 (2.2) 9.0 (1.7)
## Hispanic 243.9 (46.0) 233.0 (44.0)
## N.A/P.I 4.4 (0.8) 11.0 (2.1)
## Two or More Races 25.8 (4.9) 31.0 (5.8)
## Unknown 12.7 (2.4) 9.0 (1.7)
## White 80.1 (15.1) 73.0 (13.8)
## gender = Male (%) 191.2 (36.1) 185.0 (34.9) 0.025
## sat.math.score (mean (SD)) 465.40 (70.72) 465.40 (69.00) <0.001
## sat.verbal.score (mean (SD)) 453.36 (69.57) 452.25 (72.31) 0.016
## sat.math.flg = old (%) 489.5 (92.3) 494.0 (93.2) 0.033
## AP_CALAB (mean (SD)) 2.51 (0.40) 2.51 (0.41) 0.004
## AP_CALAB.flg = Not Missing (%) 45.6 (8.6) 41.0 (7.7) 0.032
## term.units.attemptedCensus (mean (SD)) 14.19 (2.18) 14.16 (2.23) 0.014
## Instructor_01 (%) 0.146
## 1 11.7 (2.2) 6.0 (1.1)
## 2 8.5 (1.6) 11.0 (2.1)
## 3 20.7 (3.9) 22.0 (4.2)
## 6 7.0 (1.3) 4.0 (0.8)
## 7 7.1 (1.3) 4.0 (0.8)
## 10 12.5 (2.4) 12.0 (2.3)
## 12 418.0 (78.9) 414.0 (78.1)
## 20 38.9 (7.3) 48.0 (9.1)
## 22 5.6 (1.1) 9.0 (1.7)
## cMaj (%) 0.134
## Biology 238.6 (45.0) 231.0 (43.6)
## Chemistry 33.2 (6.3) 36.0 (6.8)
## Civil Engineering 25.4 (4.8) 31.0 (5.8)
## Electrical Engineering 16.4 (3.1) 11.0 (2.1)
## Environmental Studies 6.6 (1.2) 5.0 (0.9)
## Geology 4.0 (0.7) 2.0 (0.4)
## Kinesiology/Physical Education 64.3 (12.1) 70.0 (13.2)
## Mechanical Engineering 35.3 (6.7) 39.0 (7.4)
## Nursing 18.2 (3.4) 18.0 (3.4)
## Nutrition 16.9 (3.2) 20.0 (3.8)
## OTHER 48.4 (9.1) 45.0 (8.5)
## Psychology 1.7 (0.3) 4.0 (0.8)
## Undeclared 21.0 (4.0) 18.0 (3.4)
## bot.level (%) 0.074
## Freshman 119.0 (22.5) 134.0 (25.3)
## Junior 95.6 (18.0) 98.0 (18.5)
## Senior 12.6 (2.4) 11.0 (2.1)
## Sophomore 302.7 (57.1) 287.0 (54.2)
## csus.gpa.start (mean (SD)) 2.97 (0.50) 3.01 (0.52) 0.081
## coh = Transfers (%) 5.7 (1.1) 7.0 (1.3) 0.023
Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group Binary variables: Raw differences in proportion
All variables are balanced and under the <0.1 mean threshold.
chem4.bal <- bal.tab(match.chem4, formula = f.build("palN", chem4.covs), data = chem4.final,
distance = ~ p.score, thresholds = c(m = .1), un = TRUE, imbalanced.only = TRUE)
chem4.bal
## Balance Measures
## All covariates are balanced.
##
## Balance tally for mean differences
## count
## Balanced, <0.1 45
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## csus.gpa.start 0.0794 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 1192. 531
## Matched (ESS) 318.44 530
## Matched (Unweighted) 680. 530
## Unmatched 512. 0
## Discarded 0. 1
get.var.perc.tab(chem4.bal)
## Variable Diff.Un Diff.Adj % Improvement
## 1 p.score 0.7912852849 2.660153e-04 100
## 2 sat.math.score -0.3988551284 -9.585924e-05 100
## 3 cMaj_Nursing 0.0187976339 -3.773585e-04 98
## 4 Instructor_01_3 -0.0558841745 2.493261e-03 96
## 5 sat.verbal.score -0.2509643508 -1.536474e-02 94
## 6 term.units.attemptedCensus 0.2235767269 -1.400638e-02 94
## 7 gender_Male -0.1457282701 -1.173405e-02 92
## 8 Instructor_01_12 0.0953026454 -7.520216e-03 92
## 9 cMaj_Biology 0.1340591388 -1.434187e-02 89
## 10 Instructor_01_10 -0.0084413984 -1.013028e-03 88
## 11 cMaj_Mechanical Engineering -0.0523925985 6.889039e-03 87
## 12 cum.percent.units.passed -0.1231213537 1.902176e-02 85
## 13 sat.math.flg_old 0.0580423160 8.580413e-03 85
## 14 coh_Transfers -0.0145018896 2.493261e-03 83
## 15 cMaj_Civil Engineering -0.0582303239 1.052785e-02 82
## 16 AP_CALAB.flg_Not Missing -0.0469482678 -8.728661e-03 81
## 17 eth.erss_Hispanic 0.1017517916 -2.060872e-02 80
## 18 eth.erss_Asian -0.0542647784 1.268643e-02 77
## 19 eth.erss_White -0.0445705204 -1.332210e-02 70
## 20 Instructor_01_2 -0.0153581946 4.683288e-03 70
## 21 cMaj_Psychology -0.0126012715 4.429470e-03 65
## 22 Instructor_01_22 -0.0174468206 6.331986e-03 64
## 23 bot.level_Freshman -0.0748271591 2.823001e-02 62
## 24 cMaj_Geology -0.0088174143 -3.679245e-03 58
## 25 bot.level_Sophomore 0.0708963713 -2.965633e-02 58
## 26 Instructor_01_7 -0.0126012715 -5.864780e-03 53
## 27 Instructor_01_20 0.0325095742 1.725966e-02 47
## 28 bot.level_Junior 0.0083829422 4.451932e-03 47
## 29 eth.erss_Unknown -0.0124132636 -6.951932e-03 44
## 30 cMaj_Electrical Engineering -0.0178749731 -1.019317e-02 43
## 31 csus.gpa.start 0.1357473318 7.943121e-02 41
## 32 cMaj_Undeclared -0.0088869298 -5.754717e-03 35
## 33 Instructor_01_6 -0.0084066406 -5.566038e-03 34
## 34 bot.level_Senior -0.0044521543 -3.025606e-03 32
## 35 cMaj_Environmental Studies 0.0035437126 -2.955975e-03 17
## 36 cMaj_Kinesiology/Physical Education 0.0126992252 1.075921e-02 15
## 37 cMaj_Nutrition -0.0059593776 5.824349e-03 2
## 38 AP_CALAB 0.0034431498 -3.675201e-03 -7
## 39 eth.erss_Foreign -0.0048629280 -5.332435e-03 -10
## 40 Instructor_01_1 -0.0096737193 -1.080413e-02 -12
## 41 cMaj_Chemistry -0.0035121147 5.370620e-03 -53
## 42 eth.erss_N.A/P.I 0.0081317383 1.250000e-02 -54
## 43 eth.erss_Two or More Races 0.0046891391 9.838275e-03 -110
## 44 eth.erss_African American 0.0015388213 1.119048e-02 -627
## 45 cMaj_OTHER -0.0008247071 -6.498203e-03 -688
get.bal.plot(unmatched.tab, matched.tab)
love.plot(chem4.bal,binary = "raw", stars = "std", var.order = "unadjusted",
thresholds = c(m = .1), abs = F)
create.match.tab(chem4.matched.dat)
Non-PAL | PAL | |
---|---|---|
Single Matches | 343 | 206 |
Multiple Matches | 337 | 324 |
Total Students | 680 | 530 |
Out of 531 PAL students, 530 were matched and 1 was unable to be matched. Out of 530 PAL student matches, 206 PAL students were matched to one non-PAL student and 324 PAL students were matched to multiple non-PAL students.
Out of 1238 non-PAL student matches, there were 680 non-PAL students, 343 of the non-PAL students were matched to one PAL student and 337 of the non-PAL students were matched to multiple PAL students.
get.att.plot(chem4.final, match.chem4)
The standardized mean differences of the prognostic scores is 0.0900, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.
##
## Call:
## glm(formula = f.build("grd.pt.unt", chem4.covs), data = ctrl.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.09610 -0.71453 0.09259 0.72726 2.81319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.1551135 0.4687028 -6.732 2.64e-11 ***
## cum.percent.units.passed 0.5280403 0.3058402 1.727 0.08452 .
## eth.erssAsian 0.6156161 0.1398936 4.401 1.18e-05 ***
## eth.erssForeign 0.4445376 0.2444358 1.819 0.06923 .
## eth.erssHispanic 0.4145461 0.1384392 2.994 0.00281 **
## eth.erssN.A/P.I 1.1880648 0.3029530 3.922 9.32e-05 ***
## eth.erssTwo or More Races 0.5392068 0.1848077 2.918 0.00360 **
## eth.erssUnknown 0.6844153 0.2210742 3.096 0.00201 **
## eth.erssWhite 0.4857870 0.1486030 3.269 0.00111 **
## genderMale -0.0452349 0.0698444 -0.648 0.51734
## sat.math.score 0.0034058 0.0005773 5.900 4.78e-09 ***
## sat.verbal.score -0.0008005 0.0005374 -1.490 0.13660
## sat.math.flgold -0.0266766 0.1141820 -0.234 0.81531
## AP_CALAB 0.0838340 0.0740435 1.132 0.25777
## AP_CALAB.flgNot Missing 0.3171721 0.1142736 2.776 0.00560 **
## term.units.attemptedCensus -0.0210416 0.0148051 -1.421 0.15552
## Instructor_012 0.6256332 0.2704276 2.313 0.02087 *
## Instructor_013 -0.0692320 0.2442575 -0.283 0.77689
## Instructor_016 -0.3435549 0.3370825 -1.019 0.30832
## Instructor_017 0.2694129 0.3038283 0.887 0.37541
## Instructor_0110 -0.0389665 0.2877060 -0.135 0.89229
## Instructor_0112 0.1022879 0.2288037 0.447 0.65492
## Instructor_0120 -0.0277218 0.2644355 -0.105 0.91653
## Instructor_0122 0.0430879 0.2833216 0.152 0.87915
## cMajChemistry 0.1952620 0.1279747 1.526 0.12734
## cMajCivil Engineering 0.1616398 0.1126137 1.435 0.15146
## cMajElectrical Engineering 0.0486928 0.1720735 0.283 0.77725
## cMajEnvironmental Studies -0.8540900 0.4077684 -2.095 0.03643 *
## cMajGeology -0.5551327 0.2853390 -1.946 0.05196 .
## cMajKinesiology/Physical Education 0.1843223 0.1093237 1.686 0.09206 .
## cMajMechanical Engineering -0.1065179 0.1122961 -0.949 0.34305
## cMajNursing -0.5533050 0.2563063 -2.159 0.03107 *
## cMajNutrition -0.0422353 0.1648899 -0.256 0.79789
## cMajOTHER 0.0852959 0.1240686 0.687 0.49191
## cMajPsychology 0.0750013 0.2259877 0.332 0.74004
## cMajUndeclared 0.0353021 0.1595944 0.221 0.82498
## bot.levelJunior 0.0487889 0.1048907 0.465 0.64192
## bot.levelSenior 0.0325395 0.2267044 0.144 0.88589
## bot.levelSophomore -0.1937920 0.0758361 -2.555 0.01073 *
## csus.gpa.start 1.0094300 0.0660161 15.291 < 2e-16 ***
## cohTransfers 0.0724838 0.2024442 0.358 0.72038
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.095629)
##
## Null deviance: 1984.6 on 1191 degrees of freedom
## Residual deviance: 1261.1 on 1151 degrees of freedom
## AIC: 3533.9
##
## Number of Fisher Scoring iterations: 2
## Balance Measures
## Type Diff.Adj M.Threshold
## prog.score Distance 0.0900 Balanced, <0.1
## cum.percent.units.passed Contin. 0.0190 Balanced, <0.1
## eth.erss_African American Binary 0.0112 Balanced, <0.1
## eth.erss_Asian Binary 0.0127 Balanced, <0.1
## eth.erss_Foreign Binary -0.0053 Balanced, <0.1
## eth.erss_Hispanic Binary -0.0206 Balanced, <0.1
## eth.erss_N.A/P.I Binary 0.0125 Balanced, <0.1
## eth.erss_Two or More Races Binary 0.0098 Balanced, <0.1
## eth.erss_Unknown Binary -0.0070 Balanced, <0.1
## eth.erss_White Binary -0.0133 Balanced, <0.1
## gender_Male Binary -0.0117 Balanced, <0.1
## sat.math.score Contin. -0.0001 Balanced, <0.1
## sat.verbal.score Contin. -0.0154 Balanced, <0.1
## sat.math.flg_old Binary 0.0086 Balanced, <0.1
## AP_CALAB Contin. -0.0037 Balanced, <0.1
## AP_CALAB.flg_Not Missing Binary -0.0087 Balanced, <0.1
## term.units.attemptedCensus Contin. -0.0140 Balanced, <0.1
## Instructor_01_1 Binary -0.0108 Balanced, <0.1
## Instructor_01_2 Binary 0.0047 Balanced, <0.1
## Instructor_01_3 Binary 0.0025 Balanced, <0.1
## Instructor_01_6 Binary -0.0056 Balanced, <0.1
## Instructor_01_7 Binary -0.0059 Balanced, <0.1
## Instructor_01_10 Binary -0.0010 Balanced, <0.1
## Instructor_01_12 Binary -0.0075 Balanced, <0.1
## Instructor_01_20 Binary 0.0173 Balanced, <0.1
## Instructor_01_22 Binary 0.0063 Balanced, <0.1
## cMaj_Biology Binary -0.0143 Balanced, <0.1
## cMaj_Chemistry Binary 0.0054 Balanced, <0.1
## cMaj_Civil Engineering Binary 0.0105 Balanced, <0.1
## cMaj_Electrical Engineering Binary -0.0102 Balanced, <0.1
## cMaj_Environmental Studies Binary -0.0030 Balanced, <0.1
## cMaj_Geology Binary -0.0037 Balanced, <0.1
## cMaj_Kinesiology/Physical Education Binary 0.0108 Balanced, <0.1
## cMaj_Mechanical Engineering Binary 0.0069 Balanced, <0.1
## cMaj_Nursing Binary -0.0004 Balanced, <0.1
## cMaj_Nutrition Binary 0.0058 Balanced, <0.1
## cMaj_OTHER Binary -0.0065 Balanced, <0.1
## cMaj_Psychology Binary 0.0044 Balanced, <0.1
## cMaj_Undeclared Binary -0.0058 Balanced, <0.1
## bot.level_Freshman Binary 0.0282 Balanced, <0.1
## bot.level_Junior Binary 0.0045 Balanced, <0.1
## bot.level_Senior Binary -0.0030 Balanced, <0.1
## bot.level_Sophomore Binary -0.0297 Balanced, <0.1
## csus.gpa.start Contin. 0.0794 Balanced, <0.1
## coh_Transfers Binary 0.0025 Balanced, <0.1
## p.score Contin. 0.0003 Balanced, <0.1
##
## Balance tally for mean differences
## count
## Balanced, <0.1 46
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## csus.gpa.start 0.0794 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 1192. 531
## Matched (ESS) 318.44 530
## Matched (Unweighted) 680. 530
## Unmatched 512. 0
## Discarded 0. 1
The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.44347. This result is statistically significant with a P-value of \(4.3873x10^{-7}\) and is based on 530 PAL students and 1238 non-PAL student matches(680 total non-PAL students). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.
summary(match.chem4)
##
## Estimate... 0.44347
## AI SE...... 0.087792
## T-stat..... 5.0513
## p.val...... 4.3873e-07
##
## Original number of observations.............. 1723
## Original number of treated obs............... 531
## Matched number of observations............... 530
## Matched number of observations (unweighted). 1238
##
## Caliper (SDs)........................................ 0.25
## Number of obs dropped by 'exact' or 'caliper' 1
psens(match.chem4, Gamma=2.5, GammaInc = 0.1)
##
## Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value
##
## Unconfounded estimate .... 0
##
## Gamma Lower bound Upper bound
## 1.0 0 0.0000
## 1.1 0 0.0000
## 1.2 0 0.0000
## 1.3 0 0.0000
## 1.4 0 0.0001
## 1.5 0 0.0019
## 1.6 0 0.0228
## 1.7 0 0.1219
## 1.8 0 0.3517
## 1.9 0 0.6410
## 2.0 0 0.8566
## 2.1 0 0.9587
## 2.2 0 0.9912
## 2.3 0 0.9986
## 2.4 0 0.9998
## 2.5 0 1.0000
##
## Note: Gamma is Odds of Differential Assignment To
## Treatment Due to Unobserved Factors
##
Note that in the above table \(\Gamma=1.7\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.7 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “cMaj” or “coh” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.
kable(sort(exp(abs(chem4.first.order.prop.model$coefficients))))
x | |
---|---|
sat.verbal.score | 1.001748 |
sat.math.score | 1.002788 |
eth.erssForeign | 1.102122 |
eth.erssTwo or More Races | 1.114907 |
eth.erssWhite | 1.132682 |
term.units.attemptedCensus | 1.159132 |
Instructor_0110 | 1.199232 |
eth.erssAsian | 1.201897 |
cMajEnvironmental Studies | 1.261409 |
Instructor_0112 | 1.341698 |
(Intercept) | 1.368269 |
Instructor_012 | 1.392251 |
csus.gpa.start | 1.394769 |
genderMale | 1.438349 |
cMajNursing | 1.444535 |
eth.erssHispanic | 1.475899 |
sat.math.flgold | 1.480860 |
Instructor_017 | 1.485276 |
cMajChemistry | 1.500667 |
AP_CALAB | 1.508679 |
eth.erssN.A/P.I | 1.531142 |
cMajKinesiology/Physical Education | 1.547277 |
cMajOTHER | 1.560167 |
eth.erssUnknown | 1.566062 |
bot.levelJunior | 1.609096 |
bot.levelSophomore | 1.673723 |
cMajUndeclared | 1.680276 |
Instructor_0120 | 1.850787 |
bot.levelSenior | 1.889612 |
cMajMechanical Engineering | 1.930822 |
AP_CALAB.flgNot Missing | 1.944130 |
cohTransfers | 1.973346 |
Instructor_013 | 2.054749 |
cMajNutrition | 2.075929 |
cMajElectrical Engineering | 2.198720 |
Instructor_0122 | 2.261584 |
cMajCivil Engineering | 2.388982 |
cum.percent.units.passed | 2.398617 |
Instructor_016 | 2.444778 |
cMajGeology | 4.116994 |
cMajPsychology | 5.139186 |
Based on data for 177 PAL students and 224 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.43 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.
After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.71\pm 0.26\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 24, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.
A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 24 course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.
For Chem 24, 13 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: ethnicity, enrollment in PAL in the past, instructor, Biology major, being remedial in English, class level, lower percent of households with a female head in the zip code where the student lived at the time of application to CSUS, and lower median income where the student lived at the time of application to CSUS.
Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.145. Consequently, adequate balance appears to have been achieved.
The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.71\pm 0.26\) or between 0.45 and 0.97 on a 4.0 grade scale. This result is statistically significant with a P-value of \(3.16x10^{-3}\) and is based on 126 PAL students and 50 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.43.
The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 24. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 24 indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of enrollment in PAL in the past, CSUS GPA at start of term, or remedial in English observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.
Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 177 PAL students in Chem 24 being reduced to 126. Also, 50 non-PAL students were selected out of 224 original non-PAL students.
When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 130 non-PAL matches. Of the 126 PAL students, 122 were matched one-to-one with non-PAL students and 4 were matched one-to-many with non-PAL students.
The non-PAL and PAL groups will include students with only first attempts at CHEM 24.They will also include students with previous PAL participation and/or are currently in a PAL for another course.
For the prerequisite course CHEM 1B, the grade for the student’s last attempt and the number of times it was taken are added to the CHEM 1B data set. Only 275 out of 401 CHEM 24 students have CHEM 1B grades.
However, few students retook Chem 1B so there was inadequate balance on number of times Chem 1B was taken, and it was removed from the propensity model after balance checks.
# Excludes course repeats
chem24.dat <- chem.dat %>%
filter(course=="CHEM 24", pass.term.flg == "PASS Term", course.seq== 0)
dim(chem24.dat) # 401 179
## [1] 401 179
prereq <- "CHEM 1B"
chem24.dat <- get.prereq.grades(chem24.dat, chem.dat, prereq)
# Rename Chem 1B variables
chem24.dat <- chem24.dat %>%
rename(chem1b.course.seq= prereq.course.seq, chem1b.grd.pt.unt= prereq.grd.pt.unt, chem1b.grade = prereq.grade)
dim(chem24.dat) # 275 182
## [1] 275 182
There is a total of 275 CHEM 24 first attempt only students with prerequisite CHEM 1B grades.
The variables below were added to the CHEM 24 data, so there are now 182 variables instead of 179 variables.
chem1b.course.seq: Is the student taking CHEM 1B for the first time or is it the second attempt, third attempt, etc.?
chem1b.grd.pt.unt: Numeric grade on 0 to 4 scale (0=F, 4=A) for CHEM 1B
chem1b.grade: Course grade for CHEM 1B
Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.
# Collapsed cMaj categories to Biology and Other majors at 0.10
with(chem24.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Biology 103 122
## Business 1 0
## Chemistry 23 5
## Child Devel/Early Childhood Ed 1 0
## Criminal Justice 2 0
## Dance 1 0
## Geology 1 0
## Gerontology 1 0
## Health Science 1 0
## Kinesiology/Physical Education 5 3
## Liberal Studies 1 0
## Mechanical Engineering 2 0
## Nutrition 0 1
## Psychology 0 1
## Sociology 1 0
chem24.dat <- group_category(data = chem24.dat, feature = "cMaj", threshold = 0.10, update = TRUE)
with(chem24.dat, table(cMaj, palN))
## palN
## cMaj 0 1
## Biology 103 122
## OTHER 40 10
Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.
## [1] 35
## feature num_missing pct_missing
## 17 Instructor_02 275 1.0000000
## 19 withdraw_reason 275 1.0000000
## 22 deg.plan3 275 1.0000000
## 23 deg.plan4 275 1.0000000
## 24 deg.plan5 275 1.0000000
## 25 deg.plan6 275 1.0000000
## 21 deg.plan2 274 0.9963636
## 20 deg.plan1 243 0.8836364
## 26 grad.term 243 0.8836364
## 27 grad.termERS 243 0.8836364
## 28 ttg 243 0.8836364
## 11 trf.gpaADM 242 0.8800000
## 4 pledge.term 199 0.7236364
## 18 treat.section 181 0.6581818
## 1 fys.term.code 134 0.4872727
## 2 fys.grd 134 0.4872727
## 3 fys.rpt.flg 134 0.4872727
## 13 ge.critical.thinking.status 91 0.3309091
## 14 ge.english.comp.status 91 0.3309091
## 15 ge.math.status 91 0.3309091
## 16 ge.oral.comm.status 91 0.3309091
## 12 admit.term 89 0.3236364
## 5 sat.math.score 47 0.1709091
## 6 sat.math.flg 47 0.1709091
## 7 sat.verbal.score 47 0.1709091
## 8 sat.verbal.flg 47 0.1709091
## 9 sat.test.date 47 0.1709091
## 29 plan.college 41 0.1490909
## 30 plan.college.desc 41 0.1490909
## 31 plan.dept 41 0.1490909
## 32 plan.deptAbbr 41 0.1490909
## 33 plan.degree 41 0.1490909
## 34 plan.type 41 0.1490909
## 35 county 39 0.1418182
## 10 hs.gpa 30 0.1090909
## [1] 275 147
35 variables missing >10%
So, 35 variables were removed due to missingness and there are now 147 variables instead of 182 variables.
chem24.dat <- chem24.dat[complete.cases(chem24.dat), ]
dim(chem24.dat) # 265 147
## [1] 265 147
265 out of 275 students are kept
10 students were removed due to missingness of variables
single.vars <- chem24.dat %>%
summarise(across(everything(), ~ n_distinct(.x))) %>%
select_if(. == 1)
# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem24.dat)
##
## Overall
## n 265
## AP_PHYSC (mean (SD)) 2.40 (0.00)
## AP_PHYSM (mean (SD)) 2.47 (0.00)
## AP_PHYSC.flg = Missing (%) 265 (100.0)
## AP_PHYSM.flg = Missing (%) 265 (100.0)
## country = USA (%) 265 (100.0)
## med.inc.flg = Not Missing (%) 265 (100.0)
## career.course = UGRD (%) 265 (100.0)
## acad.prog.course = UGD (%) 265 (100.0)
## course (%)
## CHEM 4 0 ( 0.0)
## CHEM 1A 0 ( 0.0)
## CHEM 1B 0 ( 0.0)
## CHEM 24 265 (100.0)
## component = LEC (%) 265 (100.0)
## units (mean (SD)) 3.00 (0.00)
## course.numeric (mean (SD)) 24.00 (0.00)
## div = Lower Division (%) 265 (100.0)
## course.seq (mean (SD)) 0.00 (0.00)
## rpt.flg = First Attempt (%) 265 (100.0)
## c2s = Non-C2S (%) 265 (100.0)
## base.time.course (mean (SD)) 1.00 (0.00)
## years (mean (SD)) 0.50 (0.00)
## enroll.status = Continuing (%) 265 (100.0)
## withdraw_code = NWD (%) 265 (100.0)
## enrl.flg = Enrolled (%) 265 (100.0)
## enrl.flgERS = Enrolled (%) 265 (100.0)
## pst.flg = Enrolled (%) 265 (100.0)
## pst.flgERS = Enrolled (%) 265 (100.0)
## grad.flg = Not Graduated (%) 265 (100.0)
## grad.flgERS = Not Graduated (%) 265 (100.0)
## rtn.flg = Retained (%) 265 (100.0)
## rtn.flgERS = Retained (%) 265 (100.0)
## pass.term.flg = PASS Term (%) 265 (100.0)
## csus.gpa.start.flg = Not Missing (%) 265 (100.0)
## higher.ed.gpa.start.flg = Not Missing (%) 265 (100.0)
sum(single.vars) # 1
## [1] 31
# remove single-valued variables
chem24.dat<- chem24.dat %>%
dplyr::select(-names(single.vars))
dim(chem24.dat) # 265 116
## [1] 265 116
116 out of 147 variables are kept
31 variables removed due to single values
# Remove non chem24 instructors
chem24.dat <- chem24.dat %>%
droplevels(chem24.dat$Instructor_01)
# Combine sparse ethnicity categories to Other
chem24.dat <- chem24.dat %>%
mutate(eth.erss = fct_other(eth.erss, drop = c("Foreign", "Pacific Islander", "Unknown")))
with(chem24.dat, table(eth.erss, palN))
## palN
## eth.erss 0 1
## African American 5 10
## Asian 50 35
## Hispanic 30 38
## Two or More Races 9 9
## White 38 28
## Other 7 6
# Only one Freshman student, so combined with Sophomore
chem24.dat <- chem24.dat %>%
mutate(bot.level = fct_collapse(bot.level, Sophomore = c("Sophomore", "Freshman")))
with(chem24.dat, table(bot.level, palN))
## palN
## bot.level 0 1
## Sophomore 22 27
## Junior 50 60
## Senior 67 39
Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.
chem24.final <- chem24.step.vars(chem24.dat)
kable(names(chem24.final))
x |
---|
acad.stand |
adm.area |
bot.level |
cMaj |
coh |
course.age |
csus.gpa.start |
cum.percent.units.passed |
delay.from.hs |
e.rmd |
eth.erss |
father.ed |
fys.flg |
gender |
hous.coh.term.flg |
Instructor_01 |
median.income |
m.rmd |
mother.ed |
pct.female.head |
pell.coh.term.flg |
prevPAL |
prevPASS |
reason |
sac.county.flg |
term.units.attemptedCensus |
palN |
grd.pt.unt |
chem1b.grd.pt.unt |
AP_BIOL |
AP_CALAB |
AP_CALBC |
AP_CHEM |
AP_BIOL.flg |
AP_CALAB.flg |
AP_CALBC.flg |
AP_CHEM.flg |
pct.female.head.flg |
Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, and chem1b.grd.pt.unt . Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.
##
## Call:
## glm(formula = model.first.order, family = binomial, data = chem24.final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3386 -0.8796 -0.1799 0.8482 2.2670
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.508e-01 2.893e+00 -0.260 0.79521
## chem1b.grd.pt.unt -3.697e-01 2.564e-01 -1.442 0.14940
## cum.percent.units.passed 3.443e+00 2.976e+00 1.157 0.24738
## eth.erssAsian -1.543e+00 7.270e-01 -2.123 0.03377 *
## eth.erssHispanic -1.155e+00 7.313e-01 -1.579 0.11438
## eth.erssTwo or More Races -1.209e+00 8.740e-01 -1.383 0.16657
## eth.erssWhite -1.817e+00 7.544e-01 -2.409 0.01599 *
## eth.erssOther -1.906e+00 9.454e-01 -2.016 0.04380 *
## genderMale 6.725e-02 3.340e-01 0.201 0.84043
## prevPAL 8.030e-01 1.464e-01 5.485 4.13e-08 ***
## Instructor_015 -4.481e-01 4.879e-01 -0.918 0.35839
## Instructor_0110 -1.730e+00 4.521e-01 -3.827 0.00013 ***
## Instructor_0114 -4.912e-01 4.002e-01 -1.227 0.21967
## cMajOTHER -1.572e+00 4.910e-01 -3.202 0.00137 **
## e.rmdRemedial in English 1.013e+00 4.046e-01 2.504 0.01227 *
## bot.levelJunior -2.517e-01 4.541e-01 -0.554 0.57942
## bot.levelSenior -1.353e+00 5.719e-01 -2.365 0.01801 *
## delay.from.hs 1.506e-01 1.121e-01 1.343 0.17919
## csus.gpa.start 8.065e-01 5.858e-01 1.377 0.16859
## pct.female.head -8.324e-02 3.506e-02 -2.374 0.01759 *
## median.income -2.480e-05 1.264e-05 -1.962 0.04979 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 366.73 on 264 degrees of freedom
## Residual deviance: 270.15 on 244 degrees of freedom
## AIC: 312.15
##
## Number of Fisher Scoring iterations: 5
# Unadjusted mean grades
get.unadj.means(chem24.final)
Non-PAL | PAL | Diff. |
---|---|---|
1.689209 | 2.101587 | 0.4123787 |
Standardized mean differences for continuous variables and categorical variables.
unmatched.tab <- CreateTableOne(vars = chem24.covs, strata = "palN",
data = chem24.final, smd = TRUE, test = FALSE)
print(unmatched.tab, smd = TRUE)
## Stratified by palN
## 0 1 SMD
## n 139 126
## chem1b.grd.pt.unt (mean (SD)) 2.28 (0.86) 2.32 (0.82) 0.048
## cum.percent.units.passed (mean (SD)) 0.92 (0.09) 0.92 (0.07) 0.091
## eth.erss (%) 0.309
## African American 5 ( 3.6) 10 ( 7.9)
## Asian 50 (36.0) 35 (27.8)
## Hispanic 30 (21.6) 38 (30.2)
## Two or More Races 9 ( 6.5) 9 ( 7.1)
## White 38 (27.3) 28 (22.2)
## Other 7 ( 5.0) 6 ( 4.8)
## gender = Male (%) 56 (40.3) 41 (32.5) 0.162
## prevPAL (mean (SD)) 1.03 (1.23) 1.87 (1.12) 0.715
## Instructor_01 (%) 0.527
## 4 36 (25.9) 52 (41.3)
## 5 14 (10.1) 21 (16.7)
## 10 50 (36.0) 20 (15.9)
## 14 39 (28.1) 33 (26.2)
## cMaj = OTHER (%) 37 (26.6) 10 ( 7.9) 0.510
## e.rmd = Remedial in English (%) 20 (14.4) 34 (27.0) 0.315
## bot.level (%) 0.358
## Sophomore 22 (15.8) 27 (21.4)
## Junior 50 (36.0) 60 (47.6)
## Senior 67 (48.2) 39 (31.0)
## delay.from.hs (mean (SD)) 3.72 (2.07) 3.40 (2.67) 0.132
## csus.gpa.start (mean (SD)) 3.11 (0.48) 3.19 (0.42) 0.185
## pct.female.head (mean (SD)) 19.44 (6.32) 19.43 (7.09) 0.001
## median.income (mean (SD)) 66770.71 (19689.75) 64023.91 (18940.87) 0.142
Check how many variables have SMD > 0.1
addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
##
## FALSE TRUE Sum
## 3 10 13
get.imbal.vars(unmatched.tab)
Variable | Before Matching SMD |
---|---|
prevPAL | 0.7154758 |
Instructor_01 | 0.5273159 |
cMaj | 0.5099804 |
bot.level | 0.3584403 |
e.rmd | 0.3147882 |
eth.erss | 0.3094389 |
csus.gpa.start | 0.1849083 |
gender | 0.1615446 |
median.income | 0.1421813 |
delay.from.hs | 0.1317131 |
10 variables have SMD >0.1
Implement a propensity score matching method.
match.chem24 <- with(chem24.final, Match(
Y=chem24.final$grd.pt.unt, Tr = chem24.final$palN, X = p.score,
BiasAdjust = F, estimand = "ATT", M=1, caliper=0.25, replace = TRUE, ties = TRUE))
Standardized mean differences for continuous variables and categorical variables.
# Needed for match table
chem24.final <- chem24.final %>%
rownames_to_column(var = "id")
# Matched data
chem24.matched.dat <- chem24.final[unlist(match.chem24[c("index.treated", "index.control")]), ]
chem24.matched.dat$match.weights<- c(match.chem24$weights, match.chem24$weights)
# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem24.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem24.covs, strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
## Stratified by palN
## 0 1 SMD
## n 126.00 126.00
## chem1b.grd.pt.unt (mean (SD)) 2.35 (0.85) 2.32 (0.82) 0.037
## cum.percent.units.passed (mean (SD)) 0.93 (0.09) 0.92 (0.07) 0.028
## eth.erss (%) 0.330
## African American 14.5 (11.5) 10.0 ( 7.9)
## Asian 45.0 (35.7) 35.0 (27.8)
## Hispanic 36.0 (28.6) 38.0 (30.2)
## Two or More Races 6.0 ( 4.8) 9.0 ( 7.1)
## White 23.5 (18.7) 28.0 (22.2)
## Other 1.0 ( 0.8) 6.0 ( 4.8)
## gender = Male (%) 50.5 (40.1) 41.0 (32.5) 0.157
## prevPAL (mean (SD)) 1.82 (1.34) 1.87 (1.12) 0.042
## Instructor_01 (%) 0.066
## 4 55.0 (43.7) 52.0 (41.3)
## 5 22.0 (17.5) 21.0 (16.7)
## 10 19.0 (15.1) 20.0 (15.9)
## 14 30.0 (23.8) 33.0 (26.2)
## cMaj = OTHER (%) 19.0 (15.1) 10.0 ( 7.9) 0.225
## e.rmd = Remedial in English (%) 24.0 (19.0) 34.0 (27.0) 0.189
## bot.level (%) 0.116
## Sophomore 28.0 (22.2) 27.0 (21.4)
## Junior 65.5 (52.0) 60.0 (47.6)
## Senior 32.5 (25.8) 39.0 (31.0)
## delay.from.hs (mean (SD)) 3.00 (1.21) 3.40 (2.67) 0.193
## csus.gpa.start (mean (SD)) 3.20 (0.41) 3.19 (0.42) 0.017
## pct.female.head (mean (SD)) 18.76 (6.39) 19.43 (7.09) 0.100
## median.income (mean (SD)) 61841.54 (13978.07) 64023.91 (18938.54) 0.131
Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group
Binary variables: Raw differences in proportion
The variables below, “delay.from.hs” and “median.income” are not under the <0.1 mean threshold, but they are under the <0.25 mean threshold.
## Balance Measures
## Type Diff.Un Diff.Adj M.Threshold
## delay.from.hs Contin. -0.1178 0.1500 Not Balanced, >0.1
## median.income Contin. -0.1450 0.1152 Not Balanced, >0.1
##
## Balance tally for mean differences
## count
## Balanced, <0.1 22
## Not Balanced, >0.1 2
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## delay.from.hs 0.15 Not Balanced, >0.1
##
## Sample sizes
## Control Treated
## All 139. 126
## Matched (ESS) 25.48 126
## Matched (Unweighted) 50. 126
## Unmatched 89. 0
get.var.perc.tab(chem24.bal)
## Variable Diff.Un Diff.Adj % Improvement
## 1 p.score 1.4150085757 0.002828720 100
## 2 Instructor_01_10 -0.2009820715 0.007936508 96
## 3 prevPAL 0.7509922479 0.045889456 94
## 4 csus.gpa.start 0.1977011453 -0.016216685 92
## 5 Instructor_01_5 0.0659472422 -0.007936508 88
## 6 bot.level_Sophomore 0.0560123330 -0.007936508 86
## 7 Instructor_01_4 0.1537056069 -0.023809524 85
## 8 eth.erss_Hispanic 0.0857599635 0.015873016 81
## 9 bot.level_Senior -0.1724905790 0.051587302 70
## 10 cum.percent.units.passed 0.1012499852 -0.030907174 69
## 11 bot.level_Junior 0.1164782460 -0.043650794 63
## 12 cMaj_OTHER -0.1868219710 -0.071428571 62
## 13 e.rmd_Remedial in English 0.1259563778 0.079365079 37
## 14 eth.erss_White -0.0511590727 0.035714286 30
## 15 chem1b.grd.pt.unt 0.0493920155 -0.037396912 24
## 16 median.income -0.1450193421 0.115220532 21
## 17 eth.erss_African American 0.0433938563 -0.035714286 18
## 18 eth.erss_Asian -0.0819344524 -0.079365079 3
## 19 gender_Male -0.0774808724 -0.075396825 3
## 20 delay.from.hs -0.1177689275 0.150005260 -27
## 21 Instructor_01_14 -0.0186707777 0.023809524 -28
## 22 eth.erss_Two or More Races 0.0066803700 0.023809524 -256
## 23 eth.erss_Other -0.0027406646 0.039682540 -1348
## 24 pct.female.head -0.0007108937 0.094801544 -13236
get.bal.plot(unmatched.tab, matched.tab)
love.plot(chem24.bal,binary = "raw", stars = "std", var.order = "unadjusted",
thresholds = c(m = .1), abs = F)
create.match.tab(chem24.matched.dat)
Non-PAL | PAL | |
---|---|---|
Single Matches | 23 | 122 |
Multiple Matches | 27 | 4 |
Total Students | 50 | 126 |
Out of 126 PAL students, 122 PAL students were matched to one non-PAL student and 4 PAL students were matched to multiple non-PAL students.
Out of 130 non-PAL student matches, there were 50 non-PAL students, 23 of the non-PAL students were matched to one PAL student and 27 of the non-PAL students were matched to multiple PAL students.
get.att.plot(chem24.final, match.chem24)
The standardized mean differences of the prognostic scores is 0.0194, which indicates balance. The variables, “delay.from.hs” and “median.income”, are not under the 0.10 mean difference threshold, but they are under the 0.25 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the imbalanced variables are not highly prognostic and the estimated prognostic score is balanced.
##
## Call:
## glm(formula = f.build("grd.pt.unt", chem24.covs), data = ctrl.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4293 -0.5496 0.1083 0.6382 2.5388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.151e+00 1.772e+00 -1.214 0.227
## chem1b.grd.pt.unt 7.545e-01 1.440e-01 5.238 7.19e-07 ***
## cum.percent.units.passed 2.546e-01 1.688e+00 0.151 0.880
## eth.erssAsian -4.020e-01 4.818e-01 -0.834 0.406
## eth.erssHispanic -1.580e-01 4.970e-01 -0.318 0.751
## eth.erssTwo or More Races -7.777e-01 5.815e-01 -1.338 0.184
## eth.erssWhite -1.290e-01 4.959e-01 -0.260 0.795
## eth.erssOther 4.655e-01 6.339e-01 0.734 0.464
## genderMale -1.011e-01 1.916e-01 -0.528 0.599
## prevPAL -5.747e-02 7.795e-02 -0.737 0.462
## Instructor_015 1.694e+00 3.509e-01 4.826 4.21e-06 ***
## Instructor_0110 1.558e+00 2.444e-01 6.373 3.73e-09 ***
## Instructor_0114 1.081e+00 2.400e-01 4.503 1.59e-05 ***
## cMajOTHER 2.367e-01 2.236e-01 1.058 0.292
## e.rmdRemedial in English 2.513e-02 2.837e-01 0.089 0.930
## bot.levelJunior 9.747e-02 2.922e-01 0.334 0.739
## bot.levelSenior -3.210e-01 3.450e-01 -0.930 0.354
## delay.from.hs 1.016e-02 7.022e-02 0.145 0.885
## csus.gpa.start 1.764e-01 3.344e-01 0.528 0.599
## pct.female.head 6.096e-03 2.337e-02 0.261 0.795
## median.income 8.076e-06 7.239e-06 1.116 0.267
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.9597258)
##
## Null deviance: 240.75 on 138 degrees of freedom
## Residual deviance: 113.25 on 118 degrees of freedom
## AIC: 409.98
##
## Number of Fisher Scoring iterations: 2
## Balance Measures
## Type Diff.Adj M.Threshold
## prog.score Distance 0.0194 Balanced, <0.1
## chem1b.grd.pt.unt Contin. -0.0374 Balanced, <0.1
## cum.percent.units.passed Contin. -0.0309 Balanced, <0.1
## eth.erss_African American Binary -0.0357 Balanced, <0.1
## eth.erss_Asian Binary -0.0794 Balanced, <0.1
## eth.erss_Hispanic Binary 0.0159 Balanced, <0.1
## eth.erss_Two or More Races Binary 0.0238 Balanced, <0.1
## eth.erss_White Binary 0.0357 Balanced, <0.1
## eth.erss_Other Binary 0.0397 Balanced, <0.1
## gender_Male Binary -0.0754 Balanced, <0.1
## prevPAL Contin. 0.0459 Balanced, <0.1
## Instructor_01_4 Binary -0.0238 Balanced, <0.1
## Instructor_01_5 Binary -0.0079 Balanced, <0.1
## Instructor_01_10 Binary 0.0079 Balanced, <0.1
## Instructor_01_14 Binary 0.0238 Balanced, <0.1
## cMaj_OTHER Binary -0.0714 Balanced, <0.1
## e.rmd_Remedial in English Binary 0.0794 Balanced, <0.1
## bot.level_Sophomore Binary -0.0079 Balanced, <0.1
## bot.level_Junior Binary -0.0437 Balanced, <0.1
## bot.level_Senior Binary 0.0516 Balanced, <0.1
## delay.from.hs Contin. 0.1500 Not Balanced, >0.1
## csus.gpa.start Contin. -0.0162 Balanced, <0.1
## pct.female.head Contin. 0.0948 Balanced, <0.1
## median.income Contin. 0.1152 Not Balanced, >0.1
## p.score Contin. 0.0028 Balanced, <0.1
##
## Balance tally for mean differences
## count
## Balanced, <0.1 23
## Not Balanced, >0.1 2
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## delay.from.hs 0.15 Not Balanced, >0.1
##
## Sample sizes
## Control Treated
## All 139. 126
## Matched (ESS) 25.48 126
## Matched (Unweighted) 50. 126
## Unmatched 89. 0
The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.70754 . This result is statistically significant with a P-value of 0.0063135. and is based on 126 PAL students and 130 non-PAL student matches(50 non-PAL students total). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.
summary(match.chem24)
##
## Estimate... 0.70754
## AI SE...... 0.25907
## T-stat..... 2.731
## p.val...... 0.0063135
##
## Original number of observations.............. 265
## Original number of treated obs............... 126
## Matched number of observations............... 126
## Matched number of observations (unweighted). 130
##
## Caliper (SDs)........................................ 0.25
## Number of obs dropped by 'exact' or 'caliper' 0
psens(match.chem24, Gamma=2.0, GammaInc = 0.1)
##
## Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value
##
## Unconfounded estimate .... 0
##
## Gamma Lower bound Upper bound
## 1.0 0 0.0000
## 1.1 0 0.0001
## 1.2 0 0.0004
## 1.3 0 0.0015
## 1.4 0 0.0041
## 1.5 0 0.0095
## 1.6 0 0.0193
## 1.7 0 0.0352
## 1.8 0 0.0586
## 1.9 0 0.0905
## 2.0 0 0.1310
##
## Note: Gamma is Odds of Differential Assignment To
## Treatment Due to Unobserved Factors
##
Note that in the above table \(\Gamma=1.8\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to csus.gpa.start or ethnicity the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.
kable(sort(exp(abs(chem24.first.order.prop.model$coefficients))))
x | |
---|---|
median.income | 1.000025 |
genderMale | 1.069562 |
pct.female.head | 1.086803 |
delay.from.hs | 1.162585 |
bot.levelJunior | 1.286195 |
chem1b.grd.pt.unt | 1.447259 |
Instructor_015 | 1.565295 |
Instructor_0114 | 1.634353 |
(Intercept) | 2.118759 |
prevPAL | 2.232138 |
csus.gpa.start | 2.240135 |
e.rmdRemedial in English | 2.754383 |
eth.erssHispanic | 3.172879 |
eth.erssTwo or More Races | 3.350293 |
bot.levelSenior | 3.867930 |
eth.erssAsian | 4.679341 |
cMajOTHER | 4.817476 |
Instructor_0110 | 5.641448 |
eth.erssWhite | 6.155689 |
eth.erssOther | 6.725577 |
cum.percent.units.passed | 31.271288 |
Course | Non-PAL | PAL | Diff. | Std. error | p-val | Sensitivity | N(non-PAL) | N(PAL) |
---|---|---|---|---|---|---|---|---|
BIO 22 | 1.71 | 2.31 | 0.60 | 0.13 | 1.28e-06 | 1.6 | 332 | 326 |
CHEM 1A | 1.48 | 1.98 | 0.50 | 0.07 | 2.43e-13 | 1.8 | 711 | 757 |
CHEM 1B | 1.76 | 2.19 | 0.43 | 0.08 | 8.59e-09 | 1.8 | 457 | 573 |
CHEM 4 | 1.90 | 2.35 | 0.44 | 0.09 | 2.19e-07 | 1.7 | 680 | 530 |
CHEM 24 | 1.39 | 2.10 | 0.71 | 0.26 | 3.16e-03 | 1.8 | 50 | 126 |
Greifer, Noah. 2020. Cobalt: Covariate Balance Tables and Plots. https://CRAN.R-project.org/package=cobalt.
Leite, W. L. 2017. Practical Propensity Score Methods Using R. Thousand Oaks, CA: Sage Publishing. https://osf.io/nygb5/.
Sekhon, Jasjeet S. 2011. “Multivariate and Propensity Score Matching Software with Automated Balance Optimization: The Matching Package for R.” Journal of Statistical Software 42 (7): 1–52. http://www.jstatsoft.org/v42/i07/.
Yoshida, Kazuki, and Alexander Bartel. 2020. Tableone: Create ’Table 1’ to Describe Baseline Characteristics with or Without Propensity Score Weights. https://CRAN.R-project.org/package=tableone.
Zhang, Z., H. J. Kim, G. Lonjon, Y. Zhu, and written on behalf of AME Big-Data Clinical Trial Collaborative Group. 2019. “Balance Diagnostics After Propensity Score Matching.” Annals of Translational Medicine 7 (1): 16. https://doi.org/10.21037/atm.2018.12.10.