Load packages

First Add Some R Packages to the Workspace.
Caution: warning messages are suppressed to reduce clutter in the output.

tidyverse: Importing data, cleaning data, data manipulation, & data visualization
kableExtra: Build HTML tables
DataExplorer: Exploratory Data Analysis & Feature Engineering
tableone: Standardized mean differences for before and after matching
survey: Matched data with match weights
Matching: Propensity score matching
cobalt: Covariate balance
reshape2: Covariate balance plot
rbounds: Rosenbaum Sensitivity test

library(tidyverse)
library(kableExtra)
library(DataExplorer) 
library(tableone)
library(survey)
library(Matching)
library(cobalt)
library(reshape2)
library(rbounds)

select <- dplyr::select # Resolves package conflicts with select
options(width = 120) # Format print width

Load functions

General functions used throughout the analysis.

# Update palN for Chem 24 Spring 2019 ------------------------------------------
update.chem24s19 <- function(chem.dat) {
  PAL.course.data <- read_rds("palCourseData.rds")
  chem24.S19 <- PAL.course.data  %>%
    filter(term == "Spring 2019", course == "CHEM 24")
  # Add a palN indicator for Chem 24 Spring 2019
  chem24.S19 <- chem24.S19 %>%
    mutate(palN.chem24.S19 = case_when(
      pal.grade == "CR" ~ 2,
      is.na(pal.grade) ~ 0,
      TRUE ~ 1
    )) %>%
    select(emplid, palN.chem24.S19) 
  
  # Check how many student are non-PAL, incomplete PAL, and PAL
  table(chem24.S19$palN.chem24.S19)
  # 0  1  2 
  # 51 10 52 
  
  chem.dat <- left_join(chem.dat, chem24.S19, by= "emplid" )
  
  chem.dat  <- chem.dat %>%
    mutate(palN = case_when(
      course == "CHEM 24" & term == "Spring 2019" ~ palN.chem24.S19,
      TRUE ~ palN
    )) %>%
    select(-palN.chem24.S19)
  
  return(chem.dat)
}

# Get raw table of mean gpa for PAL and non-PAL  -------------------------------
get.raw.tab <- function(classes, df)
{ 
 raw.table = data.frame(class=character(),
                         nonPALavg=numeric(),
                         PALavg=numeric(), 
                         Diff=numeric(), 
                         NonPAL_Num= integer(),
                         PAL_Num=integer(),
                         CompletePAL=numeric(),
                         TermPALStart=integer(),
                         row.names=NULL,
                         stringsAsFactors = FALSE)
  
  
  for (i in 1:length(classes))
  {
    curr.class = classes[i]
    temp = subset(df, course==curr.class & course.seq==0)
    pal.start=min(unique(temp$term.code[temp$palN==2]))
    # only include terms after PAL start term
    temp = subset(temp, term.code>= pal.start)
    
    x=tapply(temp$grd.pt.unt,temp$palN, 
             mean, na.rm=T) %>% 
      as.numeric %>% 
      round(2)
    
    y=table(temp$palN) %>% as.numeric
    
    raw.table[i, 'class' ] = curr.class
    raw.table[i, c(2:4,7)]=c(x[1], x[3],x[3]-x[1],
                             round(y[3]/sum(y),2))
    raw.table[i, c(5,6,8)]= c(y[1], y[3], pal.start)
    
  }
  
  # formatted table
  raw.table <- kable(raw.table, caption = "Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)") %>%
    kable_styling(full_width= T, position = "left")
 
   return(raw.table)
}

# Data cleaning ----------------------------------------------------------------
clean.data <- function(df)
{
  # Replaced coh.term with coh.term.course
  yr.course.taken = as.numeric(gsub(".*([0-9]{4})","\\1",df$coh.term.course))
  df$delay.from.hs = ifelse(!is.na(yr.course.taken) & !is.na(df$hs.grad.yr),
                                  yr.course.taken-df$hs.grad.yr, NA)
  
  sum(is.na(df$delay.from.hs)) 
  
  # remove students who did not complete PAL 
  df=subset(df, palN!=1) 
  
  #recode palN to factor with 0/1 levels
  df$palN = ifelse(df$palN==2, 1, 0)
  
  #clean up category names in m.rmd and e.rmd
  df$m.rmd[df$m.rmd=="Not Remedial\nin Math"]="Not Remedial in Math"
  df$m.rmd[df$m.rmd=="Remedial\nin Math"]="Remedial in Math"
  df$e.rmd[df$e.rmd=="Not Remedial\nin English"]="Not Remedial in English"
  df$e.rmd[df$e.rmd=="Remedial\nin English"]="Remedial in English"
  
  df <- df %>% mutate(m.rmd = factor(m.rmd), e.rmd = factor(e.rmd))
  # table(df$e.rmd)
  
  # Create feature, proportion of cumulative units taken that were passes
  # To distinguish the students who have taken 0 units from the students who 
  #   have passed 0  units they have taken, students who have taken 0 units are 
  #   labeled as -1. Then the -1 is replaced by the mean of cum.percent.units.passed
  df <- df %>%
    mutate(cum.percent.units.passed = ifelse(tot.taken.prgrss.start == 0, -1,
                                             tot.passd.prgrss.start / tot.taken.prgrss.start)) %>%
    mutate(cum.percent.units.passed = ifelse(cum.percent.units.passed  == -1, mean(cum.percent.units.passed,  na.rm =TRUE),
                                             cum.percent.units.passed  ))
  
  # code instructor as alphabetic letter for anonymity
  df$Instructor_01=droplevels(factor(df$Instructor_01))
  
  instructor.vec = sort(unique(df$Instructor_01))
  num.instr = length(instructor.vec)
  
  df$Instructor_01 = factor(
    df$Instructor_01, levels = instructor.vec, labels=as.character(1:num.instr)
  )
  
  key.instr.code = cbind(as.character(instructor.vec), 1:num.instr)

  # Add "cMaj", census majors without concentrations/specializations/tracks/etc. 
  major_lookup <- read.csv("Census Major Lookup.csv", header = TRUE, 
                           stringsAsFactors = FALSE)
  df <- merge(df, major_lookup %>% select(censusMajor, cMaj),
              by = "censusMajor", all.x = TRUE)

  # Recode mother's education and father's education variables.
  non.hs.grad= c("No High School","Some High School")
  hs.grad= c("High School Graduate","Some College","2-Year College Graduate")
  coll.grad= c("4-Year College Graduate","Postgraduate")
  parent.ed.levels= c(
    "Non-HS Graduate","HS Graduate", "College Graduate", "Unknown"
  )
  
  df <- df %>%
    mutate(
      mother.ed = ifelse(mother.ed %in% non.hs.grad, "Non-HS Graduate",
        ifelse(mother.ed %in% hs.grad, "HS Graduate", 
          ifelse(mother.ed %in% coll.grad, "College Graduate", "Unknown"))),
      mother.ed= factor(mother.ed, levels= parent.ed.levels),
      father.ed = ifelse(father.ed %in% non.hs.grad,"Non-HS Graduate",
        ifelse(father.ed %in% hs.grad, "HS Graduate", 
          ifelse(father.ed %in% coll.grad, "College Graduate", "Unknown"))),
      father.ed= factor(father.ed, levels= parent.ed.levels))
  
  # Recoded adm.area with these counties as local: 'El Dorado', 'Nevada', 
  #   'Placer', 'Sacramento', 'San Joaquin', 'Solano', 'Yolo'.
  counties.rad <- read_csv(
    "countiesRadius120mi.csv", 
    col_types = cols(
      state = col_skip(), city = col_skip(), distance.km = col_skip()
    )
  )                                     
  
  df <- left_join(df, counties.rad, by = "zip")
  
  local.adm.counties <- c(
    'El Dorado', 'Nevada', 'Placer', 'Sacramento', 'San Joaquin', 'Solano', 
    'Yolo'
  )
  
  # County will be NA if the zip code is not within 120 mile radius of 
  #   CSUS zip code(95819) 
  df <- df %>%
    mutate(
      adm.area = 
        if_else(!(county %in% local.adm.counties) | is.na(county), 
                         "nonlocal", "local")
    ) %>%
    mutate(sac.county.flg =
             if_else(!(county == "Sacramento") | is.na(county), 0, 1)
    ) %>%
    mutate(sac.county.flg = factor(sac.county.flg))

return(df)
}

# Extract prerequisite course grade ---------------------------------------------
get.prereq.grades <- function(course.df, df, prereq) {
  # Get student's recent Chem 1B grade
  course.stu <- course.df$emplid
  prereq.df <- df %>%
    select(emplid, course, course.seq, grd.pt.unt, grade) %>%
    filter(emplid %in% course.stu, course== prereq) %>% 
    group_by(emplid) %>%
    filter(course.seq == max(course.seq)) %>%
    rename(
      prereq.course.seq = course.seq, prereq.grd.pt.unt = grd.pt.unt, 
      prereq.grade = grade
    ) %>% 
    select(-course)
  
  dim(prereq.df) # [1] 275   6
  prereq.stu <- prereq.df$emplid
  
  course.df <- course.df %>%
    filter(emplid %in% prereq.stu)
  course.df <- left_join(course.df, prereq.df, by = "emplid")
  
  return(course.df)
}

# Get only the variables that have missing values ---------------------------------------------
get.missing.only <- function(course.df) {
  get.missing.only <- course.df %>% 
    summarise(across(everything(), ~ sum(is.na(.x)))) %>%
    gather() %>%
    filter(value != 0) 
  get.missing.only <- course.df %>%
    dplyr::select(all_of(get.missing.only$key)) 

  return(get.missing.only)
}

# Get imbalanced variables with SMD > 0.1------------------------------------
get.imbal.vars <- function(tab)
{
  get.imbal.vars <- as.data.frame(ExtractSmd(tab))
  get.imbal.vars <- get.imbal.vars %>%
    rownames_to_column(var = "Variable") %>%
    rename(`Before Matching SMD` = `1 vs 2`) %>%
    filter(`Before Matching SMD` > 0.1) %>% 
    arrange(desc(`Before Matching SMD`))
  get.imbal.vars <- kable(
    get.imbal.vars, caption = "Variables with SMD > 0.1"
    ) %>%
    kable_styling(full_width= F)
  
  return(get.imbal.vars)
}
# Unadjusted means -------------------------------------------------------------
get.unadj.means <- function(df.final)
{
  get.unadj.means <- df.final %>%
    group_by(palN) %>% summarise(unadj.means = mean(grd.pt.unt)) %>%
    pivot_wider(names_from = "palN", values_from = "unadj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Diff. = `PAL`-`Non-PAL`)
  
  get.unadj.means<- kable(
    get.unadj.means, caption = "Unadjusted Mean Grades"
    ) %>%
    kable_styling(full_width= F)
  
  return(get.unadj.means)
}
# Adjusted means  --------------------------------------------------------------
adj.means <- function(match.list, matched.dat) {
  get.adj.means <- matched.dat %>%
    group_by(palN) %>% 
    summarise(adj.means = weighted.mean(grd.pt.unt, match.list$weights)) %>%
    pivot_wider(names_from = "palN", values_from = "adj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Diff. = `PAL`-`Non-PAL`)
  
  # formatted table
  get.adj.means<- kable(get.adj.means, caption = "Adjusted Mean Grades") %>%
    kable_styling(full_width= F)
  
  return(get.adj.means)
}

# Match Table ------------------------------------------------------------------
create.match.tab <- function(matched.dat) {
  matched.dat <- matched.dat %>%
    mutate(pal = if_else(palN == 0, "Non-PAL", "PAL"))
  pal.flg <- c('Non-PAL', 'PAL')
  
  for (i in seq_along(pal.flg)) {
    multiple.matches <- matched.dat %>%
      filter(pal ==pal.flg[i]) %>%
      count(id) %>%
      filter(n> 1) %>%
      summarise(n())
    single.matches <- matched.dat %>%
      filter(pal == pal.flg[i]) %>%
      count(id) %>%
      filter(n==1) %>%
      summarise(n())
    if(pal.flg[i] == 'Non-PAL') {
      match.tab <- bind_rows(single.matches,  multiple.matches)
      match.tab <- match.tab %>%
        rename('Non-PAL'= 'n()')
    }
    pal.matches <- bind_rows(single.matches, multiple.matches)
    match.tab$PAL <- pal.matches$`n()`
    row.names(match.tab) <- c("Single Matches", "Multiple Matches")
  } 
  match.tab <-rbind(
    match.tab, "Total Students" = c(sum(match.tab$`Non-PAL`), sum(match.tab$`PAL`))
  )
  match.tab <- kable(match.tab, caption = "PAL and Non-PAL Matches") %>%
    kable_styling(full_width= F)
  
  return(match.tab)
}

# ATT plot ---------------------------------------------------------------------
# https://livefreeordichotomize.com/2019/01/17/understanding-propensity-score-weighting/
# https://www.csus.edu/brand/colors.html
get.att.plot <- function(df.final, match.list)
{
  df.final$p.score <- p.score
  
  df.final <- df.final %>%
    select(-id) %>%
    rownames_to_column(var = "id")

  ps.dat <- df.final %>%
    select(id, palN, p.score) %>%
    pivot_wider(
      names_from = "palN", values_from = "p.score", names_prefix = "b.pal."
    )
  before.match <- ps.dat %>%
    select(b.pal.0, b.pal.1)
  
  matched.dat <- df.final[unlist(match.list[c("index.treated", "index.control")]), ]
  matched.dat$match.weights<-  c(match.list$weights, match.list$weights)

  after.match <-matched.dat %>% 
    select(-id) %>%
    rownames_to_column(var = "id")
  after.match <- after.match %>%
    pivot_wider(names_from = "palN", values_from = "p.score", names_prefix = "pal.")
  after.match <- after.match %>%
    select(pal.0, pal.1, match.weights)
  
  get.att.plot <- ggplot() +
    geom_histogram(data = before.match, bins = 50, aes(b.pal.1), alpha = 0.5) + 
    geom_histogram(data = after.match,bins = 50, aes(pal.1, weight = match.weights), 
                   fill = "#043927", alpha = 0.5) + 
    geom_histogram(data = before.match, bins = 50, alpha = 0.5, 
                   aes(x = b.pal.0, y = -..count..)) + 
    geom_histogram(data = after.match, bins = 50, 
                   aes(x = pal.0, weight = match.weights, y = -..count..), 
                   fill = "#c4b581", alpha = 0.5) + 
    ylab("Count") + xlab("Propensity Scores") +
    geom_hline(yintercept = 0, lwd = 0.5) +
    scale_y_continuous(label = abs) 

return(get.att.plot)
}

# Variable Percent Improvement -------------------------------------------------
get.var.perc.tab <- function(list.bal) {
  get.var.perc.tab <- list.bal %>%
    pluck("Balance") %>%
    rownames_to_column("Variable") %>%
    dplyr::select("Variable", "Type", "Diff.Un","Diff.Adj") %>%
    mutate(`% Improvement` = if_else(Diff.Un == 0, 0, round(((abs(Diff.Un) - abs(Diff.Adj))/ abs(Diff.Un)) * 100 , 0))) %>%
    arrange(desc(`% Improvement`))
  get.var.perc.tab <- get.var.perc.tab %>% dplyr::select("Variable", "Diff.Un", "Diff.Adj", `% Improvement`)
  
  return(get.var.perc.tab)
}

# Covariate Balance Plots -------------------------------------------------------
# https://cran.r-project.org/web/packages/tableone/vignettes/smd.html
# https://www.csus.edu/brand/colors.html
get.bal.plot <- function(unmatched.tab, matched.tab) {
  ## Construct a data frame containing variable name and SMD from all methods
  dataPlot <- data.frame(variable  = rownames(ExtractSmd(unmatched.tab)),
                         Unmatched = as.numeric(ExtractSmd(unmatched.tab)),
                         Matched   = as.numeric(ExtractSmd(matched.tab))  )
  
  ## Create long-format data for ggplot2
  dataPlotMelt <- melt(data          = dataPlot,
                       id.vars       = c("variable"),
                       variable.name = "Method",
                       value.name    = "SMD")
  
  ## Order variable names by magnitude of SMD
  varNames <- as.character(dataPlot$variable)[order(dataPlot$Unmatched)]
  
  ## Order factor levels in the same order
  dataPlotMelt$variable <- factor(dataPlotMelt$variable,
                                  levels = varNames)
  
  ## Plot using ggplot2
  # Sac State colors and dashed line
  get.bal.plot <-ggplot(
    data = dataPlotMelt, mapping = 
      aes(x = variable, y = SMD, group = Method, color= Method)) +
    scale_color_manual(values = c("#043927", "#c4b581")) +
    geom_line(aes(linetype = Method)) +
    geom_point() +
    scale_linetype_manual(values= c("dashed", "solid")) +
    geom_hline(yintercept = 0.1, color = "black", size = 0.1) +
    coord_flip() +
    theme_bw() + theme(legend.key = element_blank())
  
  return(get.bal.plot)
}

# PAL Effect -------------------------------------------------------------------
get.pal.effect <- function(match.list, matched.dat, course) {  
 get.gamma <- psens(match.list, Gamma=2.0, GammaInc = 0.1)[["bounds"]] %>%
    filter(`Lower bound` < 0.05 & 0.05 < `Upper bound`) %>%
    slice_min(Gamma) %>% 
    select(Gamma) 
  
  get.pal.effect <-  matched.dat %>%
    group_by(palN) %>% 
    summarise(adj.means = weighted.mean(grd.pt.unt, match.list$weights)) %>%
    pivot_wider(names_from = "palN", values_from = "adj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Course= course, .before= "Non-PAL") %>%
    mutate(Diff. = `PAL`-`Non-PAL`) %>%
    mutate(`Std. error`= match.list$se, .after= "Diff.") %>%
    mutate(
      `p-val`= formatC( 1-pnorm(Diff./`Std. error`), format = "e", digits = 2), 
      Sensitivity= get.gamma$Gamma, 
      `N(non-PAL)`= length(unique(match.list$index.control)),
      `N(PAL)`= match.list$wnobs
    )

  return(get.pal.effect)
  }

Specialized functions for each course.

## BIO 22 ====================================================================
## Filter to relevant variables 
bio22.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 'course.count',
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa',
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'sat.math.score',
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',    
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg',    'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(bio22.dat))
  bio22.final <- bio22.dat[ ,new.vars]
  
  return(bio22.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
bio22.step <- function(final.df) {
  # AP_CALAB 
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender + sat.math.score +
      sat.verbal.score + sat.math.flg + AP_CALAB + AP_CALAB.flg, 
    data= bio22.final, family=binomial
  )
  summary(min.model)
  
  biggest <- formula(glm(palN ~. - grd.pt.unt,  data=bio22.final, family=binomial))
  bio22.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2)
  summary(bio22.step.first.order)
  bio22.step.first.order$anova
  
  model.first.order <- formula(bio22.step.first.order)
  bio22.first.order.prop.model <- glm(
    model.first.order, data=bio22.final, family=binomial
  )
  
  return(bio22.first.order.prop.model)
}


## CHEM 1A ====================================================================
## Filter to relevant variables 
chem1a.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age',   
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
    'eth.erss', 'father.ed','fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg',
    'term.units.attemptedCensus','palN', 'grd.pt.unt', 'sat.math.score', 
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem1a.dat))
  chem1a.final <- chem1a.dat[ ,new.vars]
  
  return(chem1a.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem1a.step <- function(final.df) {
  # Stepwise selection selected AP_CALAB.flg, AP_BIOL.flg, AP_CHEM, and
  # AP_CHEM.flg
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender + sat.math.score + 
      sat.verbal.score + sat.math.flg + AP_CALAB + AP_CALAB.flg + AP_BIOL +
      AP_BIOL.flg + AP_CHEM + AP_CHEM.flg, data= chem1a.final, family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt, data=chem1a.final, family=binomial)
  )

  chem1a.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2
  )
  summary(chem1a.step.first.order)
  chem1a.step.first.order$anova
  
  model.first.order <- formula(chem1a.step.first.order)
  chem1a.first.order.prop.model <- glm(
    model.first.order, data=chem1a.final, family=binomial
  )
  
  return(chem1a.first.order.prop.model)
}

## CHEM 1B ====================================================================
## Filter to relevant variables
chem1b.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01','median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'chem1a.grd.pt.unt',
    'AP_BIOL',  'AP_CALAB', 'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg',
    'AP_CALBC.flg', 'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem1b.dat))
  chem1b.final <- chem1b.dat[ ,new.vars]

  return(chem1b.final)
}

## Build a Logistic Regression Model for Propensity Score
## Fit Propensity Score model (linear terms only)
chem1b.step <- function(final.df) {
  # Stepwise selection selected AP_BIOL.flg and AP_CHEM.flg
  # Removed AP_BIOL.flg. Then stepwise selection selected AP_CALAB.flg.
  # Removed AP_CALAB.flg and pct.female.head.flg
  min.model <- glm(
    palN ~ chem1a.grd.pt.unt + cum.percent.units.passed + eth.erss + gender +
      AP_CHEM + AP_CHEM.flg, data= chem1b.final, family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt - AP_BIOL.flg - AP_CALAB.flg - pct.female.head.flg,  
        data=chem1b.final, family=binomial)
  )

  chem1b.step.first.order <- step(min.model,
                                  direction="forward",scope = biggest,
                                  trace=FALSE, k=2)
  summary(chem1b.step.first.order)
  chem1b.step.first.order$anova

  model.first.order <- formula(chem1b.step.first.order)
  chem1b.first.order.prop.model <- glm(model.first.order, data=chem1b.final, family=binomial)

  return(chem1b.first.order.prop.model)
}

## CHEM 4 ====================================================================
## Filter to relevant variables 
chem4.step.vars <- function(course.df)
{
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
    'eth.erss', 'father.ed','fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01','median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt','sat.math.score',
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  
  new.vars <- intersect(vars.to.keep, names(chem4.dat))
  chem4.final <- chem4.dat[ ,new.vars]
  
  return(chem4.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem4.step <- function(final.df)
{
 # "AP_BIOL"
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender+ sat.math.score + 
      sat.verbal.score+sat.math.flg + AP_CALAB+AP_CALAB.flg, data= chem4.final, 
    family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt - AP_BIOL, data=chem4.final, family=binomial)
  )

  chem4.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2)
  summary(chem4.step.first.order)
  chem4.step.first.order$anova
  
  model.first.order <- formula(chem4.step.first.order)
  chem4.first.order.prop.model <- glm(
    model.first.order, data=chem4.final, family=binomial
    )
  
  return(chem4.first.order.prop.model)
}

## CHEM 24 ====================================================================
## Filter to relevant variables 
chem24.step.vars <- function(course.df)
{
  vars.to.keep <- c(  
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'chem1b.grd.pt.unt', 
    'chem1b.term.gpa', 'chem1b.units.attempted', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem24.dat))
  chem24.final <- chem24.dat[ ,new.vars]
  
  return(chem24.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem24.step <- function(final.df) {
  min.model <- glm(
    palN ~ chem1b.grd.pt.unt + cum.percent.units.passed + eth.erss + gender,
    data= chem24.final, family=binomial
  )
  summary(min.model)
  
  biggest <- formula(
    glm(palN ~.- grd.pt.unt - acad.stand - reason - pct.female.head.flg, 
        data=chem24.final, family=binomial)
  )
  
  chem24.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2
    )
  summary(chem24.step.first.order)
  chem24.step.first.order$anova
  
  model.first.order <- formula(chem24.step.first.order)
  chem24.first.order.prop.model <- glm(
    model.first.order, data=chem24.final, family=binomial
  )
 
  return(chem24.first.order.prop.model)
}

Import the Data

Make sure the PAL datafile in the same directory as this RMarkdown file.

PALdatafull <- read_rds("paldatafull_csv.rds")
dim(PALdatafull)
## [1] 1099371     174
sum(PALdatafull$grd.pt.unt)
## [1] 2237555

The files which includes data through the Spring 2019 semester has 1099371 rows and 174 columns. The total of the grd.pt.unt column is 2237555.

Biology classes

Subset data for biology classes

bio.classes <- paste("BIO", c(22, 121, 131, 139, 184))
bio.dat <- PALdatafull %>%
  filter(base.time.course == 1, course %in% bio.classes) %>%
  mutate(course = factor(course, levels = bio.classes)) 
dim(bio.dat)
## [1] 19652   174
num.stu <- dim(bio.dat)[1]
num.vars <- dim(bio.dat)[2]

There are 19652 rows and 174 variables. Each row is a biology student. So, there is a total of 19652 biology students.

Compare the mean gpa for PAL and non-PAL students by Course without Propensity Score Adjustment

The course.seq variables indicate how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.

Note: Excludes incomplete PAL students

get.raw.tab(bio.classes, bio.dat)
Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)
class nonPALavg PALavg Diff NonPAL_Num PAL_Num CompletePAL TermPALStart
BIO 22 1.81 2.33 0.52 949 413 0.29 2158
BIO 121 2.18 2.39 0.21 888 390 0.30 2148
BIO 131 2.33 2.76 0.43 1516 544 0.26 2148
BIO 139 2.41 2.61 0.20 308 98 0.24 2178
BIO 184 2.51 2.73 0.22 509 149 0.22 2178

Data Cleaning & Feature Engineering

Create new variables.
delay.from.hs: delay since high school
cum.percent.units.passed: cumulative percent of units passed
cMaj: census majors without concentrations/specializations/tracks/etc.
county: which county did the student live in at the time of application to Sac state
sac.county.flg: did the student live in Sacramento county at the time of application to Sac State

Collapse sparse categories and other miscellaneous clean up of data. Sparse categories can cause complete separation in logistic regression and are only predictive for a few students.

# Check how many students did not complete PAL
sum(bio.dat$palN==1)
## [1] 106
incl.pal.stu <- sum(bio.dat$palN==1)      
bio.dat <- clean.data(bio.dat)
dim(bio.dat)
## [1] 19546   179

There were 106 biology students who did not complete PAL and were removed from the analysis. There are now 19546 biology students instead of 19652.

There were originally 174 variables in the data set, 5 variables were added, so there are now 179 total variables in the data set.

BIO 22

Executive Summary

Based on data for 413 PAL students and 949 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.52 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.

After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.60\pm 0.13\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Bio 22, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.

Detailed Summary

A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Bio 22 course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.

For Bio 22, 16 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: female gender, lower AP Calculus exam scores, higher term units attempted, CSUS GPA at start of term, enrollment in PAL in the past, academic major, living in off-campus housing during their first year, fewer years between first term at CSUS and high school graduation, and not living in Sacramento county at the time of application.

Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0951. Consequently, adequate balance appears to have been achieved.

The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.60\pm 0.13\) or between 0.47 and 0.73 on a 4.0 grade scale. This result is statistically significant with a P-value of \(1.28x10^{-6}\) and is based on 326 PAL students and 332 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.52.

The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Bio 22. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.6 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Bio 22 indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of living in on-campus housing during their first year, ethnicity, or major observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.

Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 413 PAL students in Bio 22 being reduced to 326. Also, 332 non-PAL students were selected out of 949 original non-PAL students.

When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 523 non-PAL matches. Of the 326 PAL students, 196 were matched one-to-one with non-PAL students and 130 were matched one-to-many with non-PAL students.

Extract BIO 22 Data

The non-PAL and PAL groups will include students with only first attempts at BIO 22. They will also include students with previous PAL participation and/or are currently in a PAL for another course.

# Excludes course repeats
bio22.dat <- bio.dat %>%
  filter(course=="BIO 22", pass.term.flg == "PASS Term", course.seq== 0)
dim(bio22.dat) # 1362  179
## [1] 1362  179

There are 1,362 BIO 22 first attempt only students.

Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.

# Collapsed cMaj categories to Biology and Other majors at 0.07
with(bio22.dat, table(cMaj, palN))
##                                          palN
## cMaj                                        0   1
##   Anthropology                              2   0
##   Biology                                 333 125
##   Business                                  8   1
##   Chemistry                                27   7
##   Child Devel/Early Childhood Ed           14   4
##   Civil Engineering                         1   0
##   Communications                            3   1
##   Computer Engineering                      1   0
##   Computer Science                          6   0
##   Construction Management                   0   1
##   Criminal Justice                          6   2
##   Dance                                     1   1
##   Economics                                 1   0
##   English                                   1   1
##   Family & Consumer Sciences                2   0
##   Film                                      0   1
##   French                                    1   0
##   Gender/Ethnic/Women's Studies             1   0
##   Gerontology                               0   3
##   Graphic Design                            1   2
##   Health Science                            8   5
##   History                                   2   0
##   Interdisciplinary Studies/Special Major   2   0
##   Interior Design                           0   2
##   International Business                    1   0
##   Journalism                                1   0
##   Kinesiology/Physical Education          422 222
##   Liberal Studies                           1   0
##   Mechanical Engineering                    1   0
##   Nursing                                  23  17
##   Nutrition                                 8   2
##   Philosophy                                1   1
##   Physics                                   1   0
##   Psychology                                7   1
##   Recreation Administration                 1   0
##   Social Science                            2   0
##   Sociology                                 2   0
##   Spanish                                   1   0
##   Speech Pathology                          1   2
##   Undeclared                               35  11
bio22.dat <- group_category(data = bio22.dat, feature = "cMaj", threshold = 0.07,  update = TRUE)
with(bio22.dat, table(cMaj, palN))
##                                 palN
## cMaj                               0   1
##   Biology                        333 125
##   Chemistry                       27   7
##   Child Devel/Early Childhood Ed  14   4
##   Kinesiology/Physical Education 422 222
##   Nursing                         23  17
##   OTHER                           75  26
##   Undeclared                      35  11

Analyze missingness

Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.

# Include only variables that are missing values
vars.missing <- get.missing.only(bio22.dat)

missingness.rpt <- profile_missing(vars.missing)
# missingness.rpt[order(missingness.rpt$pct_missing, decreasing = TRUE), ]

# check which variables are missing >5% values
vars.missing.data <- missingness.rpt %>%
  filter(pct_missing > 0.05)
sum(missingness.rpt$pct_missing>0.05)
## [1] 37
vars.missing.data [order(vars.missing.data $pct_missing, decreasing = TRUE), ]
##                        feature num_missing pct_missing
## 19               Instructor_02        1362  1.00000000
## 25                   deg.plan4        1362  1.00000000
## 26                   deg.plan5        1362  1.00000000
## 27                   deg.plan6        1362  1.00000000
## 24                   deg.plan3        1361  0.99926579
## 21             withdraw_reason        1357  0.99632893
## 23                   deg.plan2        1356  0.99559471
## 13                  trf.gpaADM        1083  0.79515419
## 4                  pledge.term        1050  0.77092511
## 20               treat.section         938  0.68869310
## 29                grad.termERS         876  0.64317181
## 22                   deg.plan1         874  0.64170338
## 28                   grad.term         874  0.64170338
## 30                         ttg         874  0.64170338
## 1                fys.term.code         839  0.61600587
## 2                      fys.grd         839  0.61600587
## 3                  fys.rpt.flg         839  0.61600587
## 15 ge.critical.thinking.status         442  0.32452276
## 16      ge.english.comp.status         442  0.32452276
## 17              ge.math.status         442  0.32452276
## 18         ge.oral.comm.status         442  0.32452276
## 14                  admit.term         433  0.31791483
## 7               sat.math.score         361  0.26505140
## 8                 sat.math.flg         361  0.26505140
## 9             sat.verbal.score         361  0.26505140
## 10              sat.verbal.flg         361  0.26505140
## 11               sat.test.date         361  0.26505140
## 12                      hs.gpa         260  0.19089574
## 31                plan.college         244  0.17914831
## 32           plan.college.desc         244  0.17914831
## 33                   plan.dept         244  0.17914831
## 34               plan.deptAbbr         244  0.17914831
## 35                 plan.degree         244  0.17914831
## 36                   plan.type         244  0.17914831
## 37                      county         189  0.13876652
## 5                  m.rmd.admin          82  0.06020558
## 6           m.rmd.admin.detail          82  0.06020558
plot_missing(bio22.dat %>% select(all_of(vars.missing.data$feature)))

# Remove variables missing >5%, except for "sat.math.score", "sat.verbal.score", "hs.gpa", "sat.math.flg"

bio22.dat <- bio22.dat %>%
  select(!all_of(vars.missing.data$feature),
                c("sat.math.score", "sat.verbal.score", "hs.gpa","sat.math.flg"))
dim(bio22.dat) # 1362  146
## [1] 1362  146

37 variables missing >5%
The variables below are important and force included, even though they are missing >5%
“sat.math.score”, “sat.verbal.score”, “hs.gpa”,“sat.math.flg”
Only 33 variables were removed due to missingness and there are now 146 variables instead of 179 variables.

Subset on Complete Cases only in BIO 22 Data

bio22.dat <- bio22.dat[complete.cases(bio22.dat), ]
dim(bio22.dat) # 937 146
## [1] 937 146

937 out of 1362 students are kept
425 students were removed due to missingness of variables

Removed single-valued variables

single.vars <- bio22.dat %>%
  summarise(across(everything(), ~ n_distinct(.x))) %>%
  select_if(. == 1)

# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = bio22.dat)
##                                            
##                                             Overall       
##   n                                           937         
##   country = USA (%)                           937 (100.0) 
##   career.course = UGRD (%)                    937 (100.0) 
##   acad.prog.course = UGD (%)                  937 (100.0) 
##   course (%)                                              
##      BIO 22                                   937 (100.0) 
##      BIO 121                                    0 (  0.0) 
##      BIO 131                                    0 (  0.0) 
##      BIO 139                                    0 (  0.0) 
##      BIO 184                                    0 (  0.0) 
##   component = LEC (%)                         937 (100.0) 
##   units (mean (SD))                          4.00 (0.00)  
##   course.numeric (mean (SD))                22.00 (0.00)  
##   div = Lower Division (%)                    937 (100.0) 
##   course.seq (mean (SD))                     0.00 (0.00)  
##   rpt.flg = First Attempt (%)                 937 (100.0) 
##   c2s = Non-C2S (%)                           937 (100.0) 
##   base.time.course (mean (SD))               1.00 (0.00)  
##   years (mean (SD))                          0.50 (0.00)  
##   withdraw_code = NWD (%)                     937 (100.0) 
##   enrl.flg = Enrolled (%)                     937 (100.0) 
##   enrl.flgERS = Enrolled (%)                  937 (100.0) 
##   rtn.flg = Retained (%)                      937 (100.0) 
##   rtn.flgERS = Retained (%)                   937 (100.0) 
##   pass.term.flg = PASS Term (%)               937 (100.0) 
##   csus.gpa.start.flg = Not Missing (%)        937 (100.0) 
##   higher.ed.gpa.start.flg = Not Missing (%)   937 (100.0)
sum(single.vars) # 21
## [1] 21
# remove single-valued variables
bio22.dat<- bio22.dat %>%
  dplyr::select(-names(single.vars))
dim(bio22.dat) # 937  125
## [1] 937 125

125 out of 146 variables are kept
21 variables removed due to single values

Filter to relevant variables

Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.

bio22.final <- bio22.step.vars(bio22.dat)
names(bio22.final)
##  [1] "acad.stand"                 "adm.area"                   "bot.level"                  "cMaj"                      
##  [5] "coh"                        "course.age"                 "csus.gpa.start"             "cum.percent.units.passed"  
##  [9] "delay.from.hs"              "e.rmd"                      "eth.erss"                   "father.ed"                 
## [13] "fys.flg"                    "gender"                     "hous.coh.term.flg"          "hs.gpa"                    
## [17] "Instructor_01"              "median.income"              "m.rmd"                      "mother.ed"                 
## [21] "pct.female.head"            "pell.coh.term.flg"          "prevPAL"                    "prevPASS"                  
## [25] "reason"                     "sac.county.flg"             "term.units.attemptedCensus" "palN"                      
## [29] "grd.pt.unt"                 "sat.math.score"             "sat.math.flg"               "sat.verbal.score"          
## [33] "AP_BIOL"                    "AP_CALAB"                   "AP_CALBC"                   "AP_CHEM"                   
## [37] "AP_BIOL.flg"                "AP_CALAB.flg"               "AP_CALBC.flg"               "AP_CHEM.flg"               
## [41] "pct.female.head.flg"        "med.inc.flg"

Build a Logistic Regression Model for Propensity Score

Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, sat.math.score, sat.verbal.score, and sat.math.flg (same as sat.verbal.flg). Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.

bio22.first.order.prop.model <- bio22.step(bio22.final)
summary(bio22.first.order.prop.model)
## 
## Call:
## glm(formula = model.first.order, family = binomial, data = bio22.final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7671  -0.9137  -0.6395   1.1352   2.3038  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -0.3775015  1.2040989  -0.314 0.753890    
## cum.percent.units.passed           -1.0822415  1.0554624  -1.025 0.305188    
## eth.erssAsian                      -0.4634201  0.3721018  -1.245 0.212980    
## eth.erssForeign                    -0.6166642  0.8106835  -0.761 0.446853    
## eth.erssHispanic                   -0.0583669  0.3641114  -0.160 0.872645    
## eth.erssPacific Islander            0.9985099  0.7949037   1.256 0.209065    
## eth.erssTwo or More Races          -0.5168364  0.4579780  -1.129 0.259101    
## eth.erssUnknown                    -0.1312048  0.5574675  -0.235 0.813930    
## eth.erssWhite                      -0.4577725  0.3921462  -1.167 0.243068    
## genderMale                         -0.2872814  0.1562340  -1.839 0.065946 .  
## sat.math.score                     -0.0006397  0.0014134  -0.453 0.650813    
## sat.verbal.score                   -0.0021488  0.0013662  -1.573 0.115751    
## sat.math.flgold                    -0.3615956  0.2329028  -1.553 0.120528    
## AP_CALAB                           -0.3784461  0.1622482  -2.333 0.019674 *  
## AP_CALAB.flgNot Missing            -0.0812130  0.2570992  -0.316 0.752092    
## term.units.attemptedCensus          0.1419638  0.0383030   3.706 0.000210 ***
## csus.gpa.start                      0.5422919  0.1893888   2.863 0.004191 ** 
## prevPAL                             0.3630843  0.0838341   4.331 1.48e-05 ***
## cMajChemistry                       0.2181870  0.5406120   0.404 0.686512    
## cMajChild Devel/Early Childhood Ed -0.4823010  0.8039725  -0.600 0.548575    
## cMajKinesiology/Physical Education  0.8378476  0.2289283   3.660 0.000252 ***
## cMajNursing                         1.4989657  0.4443443   3.373 0.000742 ***
## cMajOTHER                           0.1981652  0.3601025   0.550 0.582112    
## cMajUndeclared                      0.4871465  0.4576017   1.065 0.287073    
## hous.coh.term.flgOn-Campus Housing -0.5708439  0.2003577  -2.849 0.004384 ** 
## delay.from.hs                      -0.1437080  0.0719453  -1.997 0.045775 *  
## sac.county.flg1                    -0.3061509  0.1718863  -1.781 0.074892 .  
## prevPASS                            0.2165654  0.1462875   1.480 0.138764    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1213.4  on 936  degrees of freedom
## Residual deviance: 1092.9  on 909  degrees of freedom
## AIC: 1148.9
## 
## Number of Fisher Scoring iterations: 4
p.score <- bio22.first.order.prop.model$fitted.values
bio22.covs <-  names(bio22.first.order.prop.model %>%  pluck("model") %>% dplyr::select(-palN))

Propensity Score Matching

Before matching

# Unadjusted mean grades
get.unadj.means(bio22.final)
Unadjusted Mean Grades
Non-PAL PAL Diff.
1.760099 2.306402 0.5463039
# Variable Summary Table for unmatched data
unmatched.tab <- CreateTableOne(vars = bio22.covs, strata = "palN",
                             data = bio22.final, smd = TRUE, test= FALSE)
print(unmatched.tab, smd = TRUE)
##                                            Stratified by palN
##                                             0              1              SMD   
##   n                                            609            328               
##   cum.percent.units.passed (mean (SD))        0.91 (0.10)    0.91 (0.09)   0.007
##   eth.erss (%)                                                             0.259
##      African American                           26 ( 4.3)      16 ( 4.9)        
##      Asian                                     218 (35.8)      96 (29.3)        
##      Foreign                                     8 ( 1.3)       3 ( 0.9)        
##      Hispanic                                  178 (29.2)     130 (39.6)        
##      Pacific Islander                            4 ( 0.7)       5 ( 1.5)        
##      Two or More Races                          44 ( 7.2)      17 ( 5.2)        
##      Unknown                                    16 ( 2.6)       9 ( 2.7)        
##      White                                     115 (18.9)      52 (15.9)        
##   gender = Male (%)                            273 (44.8)     115 (35.1)   0.200
##   sat.math.score (mean (SD))                504.93 (79.84) 495.43 (79.20)  0.119
##   sat.verbal.score (mean (SD))              487.27 (78.52) 478.84 (72.64)  0.111
##   sat.math.flg = old (%)                       538 (88.3)     270 (82.3)   0.171
##   AP_CALAB (mean (SD))                        2.58 (0.52)    2.48 (0.53)   0.200
##   AP_CALAB.flg = Not Missing (%)                79 (13.0)      49 (14.9)   0.057
##   term.units.attemptedCensus (mean (SD))     13.57 (2.24)   14.27 (1.93)   0.330
##   csus.gpa.start (mean (SD))                  3.01 (0.53)    3.12 (0.49)   0.212
##   prevPAL (mean (SD))                         0.52 (1.06)    0.71 (1.29)   0.158
##   cMaj (%)                                                                 0.291
##      Biology                                   215 (35.3)      90 (27.4)        
##      Chemistry                                  14 ( 2.3)       6 ( 1.8)        
##      Child Devel/Early Childhood Ed             13 ( 2.1)       2 ( 0.6)        
##      Kinesiology/Physical Education            283 (46.5)     188 (57.3)        
##      Nursing                                    15 ( 2.5)      15 ( 4.6)        
##      OTHER                                      45 ( 7.4)      17 ( 5.2)        
##      Undeclared                                 24 ( 3.9)      10 ( 3.0)        
##   hous.coh.term.flg = On-Campus Housing (%)    160 (26.3)      75 (22.9)   0.079
##   delay.from.hs (mean (SD))                   2.83 (1.56)    2.47 (1.26)   0.255
##   sac.county.flg = 1 (%)                       294 (48.3)     140 (42.7)   0.112
##   prevPASS (mean (SD))                        0.23 (0.58)    0.26 (0.58)   0.040

Check how many variables have SMD > 0.1

addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
## 
## FALSE  TRUE   Sum 
##     4    12    16
get.imbal.vars(unmatched.tab)
Variables with SMD > 0.1
Variable Before Matching SMD
term.units.attemptedCensus 0.3299348
cMaj 0.2912810
eth.erss 0.2585443
delay.from.hs 0.2553547
csus.gpa.start 0.2118672
gender 0.2004051
AP_CALAB 0.1998329
sat.math.flg 0.1708929
prevPAL 0.1582117
sat.math.score 0.1194512
sac.county.flg 0.1124963
sat.verbal.score 0.1114884

12 variables have SMD >0.1

Implement a propensity score matching method.

match.bio22 <- with(bio22.final, Match(
  Y=bio22.final$grd.pt.unt, Tr = bio22.final$palN, X = p.score,
  BiasAdjust = F, estimand = "ATT",  M=1, caliper=0.25, replace = TRUE, ties = TRUE))

After matching

Standardized mean differences for continuous variables and categorical variables.

# Needed for match table
bio22.final <- bio22.final %>%
  rownames_to_column(var = "id")

# Matched data
bio22.matched.dat <- bio22.final[unlist(match.bio22[c("index.treated", "index.control")]), ]
bio22.matched.dat$match.weights<-  c(match.bio22$weights, match.bio22$weights)

# Add match weights to matched data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = bio22.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = bio22.covs,  strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
##                                            Stratified by palN
##                                             0              1              SMD   
##   n                                         326.00         326.00               
##   cum.percent.units.passed (mean (SD))        0.92 (0.09)    0.91 (0.09)   0.094
##   eth.erss (%)                                                             0.131
##      African American                         17.6 ( 5.4)    16.0 ( 4.9)        
##      Asian                                   111.8 (34.3)    96.0 (29.4)        
##      Foreign                                   3.2 ( 1.0)     3.0 ( 0.9)        
##      Hispanic                                112.2 (34.4)   128.0 (39.3)        
##      Pacific Islander                          3.8 ( 1.2)     5.0 ( 1.5)        
##      Two or More Races                        19.4 ( 6.0)    17.0 ( 5.2)        
##      Unknown                                   9.7 ( 3.0)     9.0 ( 2.8)        
##      White                                    48.3 (14.8)    52.0 (16.0)        
##   gender = Male (%)                          119.2 (36.6)   115.0 (35.3)   0.027
##   sat.math.score (mean (SD))                502.55 (79.44) 495.64 (79.29)  0.087
##   sat.verbal.score (mean (SD))              484.86 (78.53) 479.51 (72.31)  0.071
##   sat.math.flg = old (%)                     261.2 (80.1)   268.0 (82.2)   0.053
##   AP_CALAB (mean (SD))                        2.47 (0.60)    2.48 (0.53)   0.009
##   AP_CALAB.flg = Not Missing (%)              59.0 (18.1)    49.0 (15.0)   0.083
##   term.units.attemptedCensus (mean (SD))     14.44 (2.06)   14.26 (1.94)   0.088
##   csus.gpa.start (mean (SD))                  3.11 (0.48)    3.12 (0.49)   0.014
##   prevPAL (mean (SD))                         0.70 (1.31)    0.68 (1.24)   0.013
##   cMaj (%)                                                                 0.124
##      Biology                                  88.5 (27.2)    89.0 (27.3)        
##      Chemistry                                 4.6 ( 1.4)     6.0 ( 1.8)        
##      Child Devel/Early Childhood Ed            1.5 ( 0.5)     2.0 ( 0.6)        
##      Kinesiology/Physical Education          181.4 (55.7)   187.0 (57.4)        
##      Nursing                                  24.0 ( 7.4)    15.0 ( 4.6)        
##      OTHER                                    17.1 ( 5.3)    17.0 ( 5.2)        
##      Undeclared                                8.8 ( 2.7)    10.0 ( 3.1)        
##   hous.coh.term.flg = On-Campus Housing (%)   80.2 (24.6)    75.0 (23.0)   0.038
##   delay.from.hs (mean (SD))                   2.45 (1.39)    2.46 (1.26)   0.010
##   sac.county.flg = 1 (%)                     136.9 (42.0)   140.0 (42.9)   0.019
##   prevPASS (mean (SD))                        0.23 (0.59)    0.25 (0.57)   0.030

Balance Check

Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group Binary variables: Raw differences in proportion

All variables are balanced and under the <0.1 mean threshold.

bio22.bal <- bal.tab(match.bio22, formula = f.build("palN", bio22.covs), data = bio22.final,
        distance = ~ p.score, thresholds = c(m = .1), un = TRUE, imbalanced.only = TRUE)
bio22.bal
## Balance Measures
## All covariates are balanced.
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        30
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##                  Variable Diff.Adj    M.Threshold
##  cum.percent.units.passed  -0.0951 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                    609.      328
## Matched (ESS)          169.3     326
## Matched (Unweighted)   332.      326
## Unmatched              277.        0
## Discarded                0.        2

Check variable percent improvement

get.var.perc.tab(bio22.bal)
##                               Variable      Diff.Un      Diff.Adj % Improvement
## 1                              p.score  0.757140033 -5.483664e-05           100
## 2                         cMaj_Biology -0.078647523  1.431493e-03            98
## 3                           cMaj_OTHER -0.022062357 -4.601227e-04            98
## 4                        delay.from.hs -0.287221743  1.036247e-02            96
## 5                             AP_CALAB -0.198548550  1.004536e-02            95
## 6                       csus.gpa.start  0.220446905  1.400119e-02            94
## 7                              prevPAL  0.144568525 -1.297841e-02            91
## 8  cMaj_Child Devel/Early Childhood Ed -0.015248909  1.533742e-03            90
## 9                     eth.erss_Foreign -0.003989948 -5.112474e-04            87
## 10                         gender_Male -0.097666106 -1.303681e-02            87
## 11 cMaj_Kinesiology/Physical Education  0.108474508  1.702454e-02            84
## 12                      sac.county.flg -0.055929352  9.406953e-03            83
## 13          term.units.attemptedCensus  0.356927519 -9.032332e-02            75
## 14                    sat.math.flg_old -0.060244703  2.091002e-02            65
## 15          eth.erss_Two or More Races -0.020420321 -7.413088e-03            64
## 16                      eth.erss_White -0.030297569  1.140082e-02            62
## 17           eth.erss_Pacific Islander  0.008675758  3.578732e-03            59
## 18                     cMaj_Undeclared -0.008921062  3.834356e-03            57
## 19                   eth.erss_Hispanic  0.104059033  4.836401e-02            54
## 20 hous.coh.term.flg_On-Campus Housing -0.034067243 -1.610429e-02            53
## 21                    sat.verbal.score -0.116085624 -7.372148e-02            36
## 22                      sat.math.score -0.119934927 -8.720503e-02            27
## 23                      eth.erss_Asian -0.065280948 -4.831288e-02            26
## 24                            prevPASS  0.039806845  3.026697e-02            24
## 25           eth.erss_African American  0.006087549 -5.061350e-03            17
## 26                      cMaj_Chemistry -0.004695823  4.243354e-03            10
## 27                        cMaj_Nursing  0.021101165 -2.760736e-02           -31
## 28            AP_CALAB.flg_Not Missing  0.019669390 -3.072597e-02           -56
## 29                    eth.erss_Unknown  0.001166446 -2.044990e-03           -75
## 30            cum.percent.units.passed  0.007863796 -9.507585e-02         -1109

Check covariate balance visually

get.bal.plot(unmatched.tab, matched.tab)

love.plot(bio22.bal,binary = "raw", stars = "std", var.order = "unadjusted",
            thresholds = c(m = .1), abs = F)

Compare single and multiple matches for PAL and non-PAL

create.match.tab(bio22.matched.dat)
PAL and Non-PAL Matches
Non-PAL PAL
Single Matches 213 196
Multiple Matches 119 130
Total Students 332 326

Out of 328 PAL students, 326 were matched and 2 were unable to be matched. Out of 326 PAL student matches, 196 PAL students were matched to one non-PAL student and 130 PAL students were matched to multiple non-PAL students.

Out of 523 non-PAL student matches, there were 332 non-PAL students, 213 of the non-PAL students were matched to one PAL student and 119 of the non-PAL students were matched to multiple PAL students.

Plot of Propensity Scores for Average Treatment Effect Among the Treated (ATT)

get.att.plot(bio22.final, match.bio22)

Assess balance with prognostic score

The standardized mean differences of the prognostic scores is -0.0171, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.

# Outcome model of non-PAL group
ctrl.data <- bio22.final %>% filter(palN == 0)
ctrl.fit <- glm(f.build("grd.pt.unt", bio22.covs), data = ctrl.data)
summary(ctrl.fit)
## 
## Call:
## glm(formula = f.build("grd.pt.unt", bio22.covs), data = ctrl.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7560  -0.8716   0.0303   0.8395   4.1864  
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -3.1402550  0.7201409  -4.361 1.53e-05 ***
## cum.percent.units.passed           -0.1622929  0.6399986  -0.254 0.799907    
## eth.erssAsian                      -0.0779431  0.2438373  -0.320 0.749347    
## eth.erssForeign                     0.3085674  0.4645496   0.664 0.506807    
## eth.erssHispanic                   -0.1654707  0.2422559  -0.683 0.494853    
## eth.erssPacific Islander           -0.2477167  0.6156367  -0.402 0.687556    
## eth.erssTwo or More Races           0.0802446  0.2883046   0.278 0.780856    
## eth.erssUnknown                     0.3015452  0.3699353   0.815 0.415332    
## eth.erssWhite                       0.2307871  0.2558751   0.902 0.367456    
## genderMale                          0.0716462  0.0978066   0.733 0.464141    
## sat.math.score                      0.0019424  0.0008767   2.216 0.027112 *  
## sat.verbal.score                    0.0002403  0.0008407   0.286 0.775085    
## sat.math.flgold                    -0.0241787  0.1607446  -0.150 0.880488    
## AP_CALAB                           -0.0216909  0.0911965  -0.238 0.812083    
## AP_CALAB.flgNot Missing             0.0779702  0.1512937   0.515 0.606500    
## term.units.attemptedCensus          0.0214280  0.0217106   0.987 0.324062    
## csus.gpa.start                      1.1731403  0.1167655  10.047  < 2e-16 ***
## prevPAL                             0.1710735  0.0535187   3.197 0.001466 ** 
## cMajChemistry                       0.2927482  0.3206663   0.913 0.361654    
## cMajChild Devel/Early Childhood Ed  0.3579739  0.3331423   1.075 0.283028    
## cMajKinesiology/Physical Education -0.5104136  0.1382659  -3.692 0.000244 ***
## cMajNursing                        -0.3789272  0.3221003  -1.176 0.239906    
## cMajOTHER                          -0.3813397  0.2029681  -1.879 0.060770 .  
## cMajUndeclared                     -0.0800524  0.2693843  -0.297 0.766445    
## hous.coh.term.flgOn-Campus Housing -0.0613837  0.1248750  -0.492 0.623215    
## delay.from.hs                       0.1497663  0.0376607   3.977 7.87e-05 ***
## sac.county.flg1                    -0.1025577  0.1086385  -0.944 0.345548    
## prevPASS                           -0.0261742  0.0910270  -0.288 0.773799    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.264761)
## 
##     Null deviance: 1275.90  on 608  degrees of freedom
## Residual deviance:  734.83  on 581  degrees of freedom
## AIC: 1900.6
## 
## Number of Fisher Scoring iterations: 2
# Predict values for both non-PAL and PAL groups
bio22.final$prog.score <- predict(ctrl.fit, bio22.final)

# Compare the standardized mean differences of the prognostic scores in the non-PAL and PAL groups
# p.score is at the bottom of the table
prog.bal <-bal.tab(match.bio22, formula = f.build("palN", bio22.covs), data = bio22.final, distance = bio22.final["prog.score"], sd.denom = "treated", addl = "p.score", thresholds = c(m = .1))
prog.bal
## Balance Measures
##                                         Type Diff.Adj    M.Threshold
## prog.score                          Distance  -0.0171 Balanced, <0.1
## cum.percent.units.passed             Contin.  -0.0951 Balanced, <0.1
## eth.erss_African American             Binary  -0.0051 Balanced, <0.1
## eth.erss_Asian                        Binary  -0.0483 Balanced, <0.1
## eth.erss_Foreign                      Binary  -0.0005 Balanced, <0.1
## eth.erss_Hispanic                     Binary   0.0484 Balanced, <0.1
## eth.erss_Pacific Islander             Binary   0.0036 Balanced, <0.1
## eth.erss_Two or More Races            Binary  -0.0074 Balanced, <0.1
## eth.erss_Unknown                      Binary  -0.0020 Balanced, <0.1
## eth.erss_White                        Binary   0.0114 Balanced, <0.1
## gender_Male                           Binary  -0.0130 Balanced, <0.1
## sat.math.score                       Contin.  -0.0872 Balanced, <0.1
## sat.verbal.score                     Contin.  -0.0737 Balanced, <0.1
## sat.math.flg_old                      Binary   0.0209 Balanced, <0.1
## AP_CALAB                             Contin.   0.0100 Balanced, <0.1
## AP_CALAB.flg_Not Missing              Binary  -0.0307 Balanced, <0.1
## term.units.attemptedCensus           Contin.  -0.0903 Balanced, <0.1
## csus.gpa.start                       Contin.   0.0140 Balanced, <0.1
## prevPAL                              Contin.  -0.0130 Balanced, <0.1
## cMaj_Biology                          Binary   0.0014 Balanced, <0.1
## cMaj_Chemistry                        Binary   0.0042 Balanced, <0.1
## cMaj_Child Devel/Early Childhood Ed   Binary   0.0015 Balanced, <0.1
## cMaj_Kinesiology/Physical Education   Binary   0.0170 Balanced, <0.1
## cMaj_Nursing                          Binary  -0.0276 Balanced, <0.1
## cMaj_OTHER                            Binary  -0.0005 Balanced, <0.1
## cMaj_Undeclared                       Binary   0.0038 Balanced, <0.1
## hous.coh.term.flg_On-Campus Housing   Binary  -0.0161 Balanced, <0.1
## delay.from.hs                        Contin.   0.0104 Balanced, <0.1
## sac.county.flg                        Binary   0.0094 Balanced, <0.1
## prevPASS                             Contin.   0.0303 Balanced, <0.1
## p.score                              Contin.  -0.0001 Balanced, <0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        31
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##                  Variable Diff.Adj    M.Threshold
##  cum.percent.units.passed  -0.0951 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                    609.      328
## Matched (ESS)          169.3     326
## Matched (Unweighted)   332.      326
## Unmatched              277.        0
## Discarded                0.        2

Estimate Difference Between Mean grade in BIO 22 of PAL and non-PAL students

The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.5987 . This result is statistically significant with a P-value of \(2.556x10^{-6}\) and is based on 326 PAL students and 523 non-PAL student matches(332 total non-PAL students). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.

summary(match.bio22)
## 
## Estimate...  0.5987 
## AI SE......  0.12728 
## T-stat.....  4.7036 
## p.val......  2.556e-06 
## 
## Original number of observations..............  937 
## Original number of treated obs...............  328 
## Matched number of observations...............  326 
## Matched number of observations  (unweighted).  523 
## 
## Caliper (SDs)........................................   0.25 
## Number of obs dropped by 'exact' or 'caliper'  2

Sensitivity Analysis

psens(match.bio22, Gamma=2.0, GammaInc = 0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0000
##    1.2           0      0.0001
##    1.3           0      0.0010
##    1.4           0      0.0078
##    1.5           0      0.0363
##    1.6           0      0.1123
##    1.7           0      0.2517
##    1.8           0      0.4380
##    1.9           0      0.6290
##    2.0           0      0.7852
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 

Note that in the above table \(\Gamma=1.6\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.6 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “cMaj” or “csus.gpa.start” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.

kable(sort(exp(abs(bio22.first.order.prop.model$coefficients))))
x
sat.math.score 1.000640
sat.verbal.score 1.002151
eth.erssHispanic 1.060104
AP_CALAB.flgNot Missing 1.084602
eth.erssUnknown 1.140201
term.units.attemptedCensus 1.152535
delay.from.hs 1.154547
cMajOTHER 1.219164
prevPASS 1.241804
cMajChemistry 1.243820
genderMale 1.332799
sac.county.flg1 1.358187
sat.math.flgold 1.435618
prevPAL 1.437757
(Intercept) 1.458636
AP_CALAB 1.460014
eth.erssWhite 1.580549
eth.erssAsian 1.589501
cMajChild Devel/Early Childhood Ed 1.619797
cMajUndeclared 1.627665
eth.erssTwo or More Races 1.676715
csus.gpa.start 1.719944
hous.coh.term.flgOn-Campus Housing 1.769760
eth.erssForeign 1.852737
cMajKinesiology/Physical Education 2.311386
eth.erssPacific Islander 2.714234
cum.percent.units.passed 2.951287
cMajNursing 4.477056

Chemistry classes

Subset data for chemistry classes

chem.classes <- paste("CHEM", c(4, '1A', '1B', 24))
chem.dat <- PALdatafull %>%
  filter(base.time.course == 1, course %in% chem.classes) %>%
  mutate(course = factor(course, levels = chem.classes)) 
dim(chem.dat) #  18948   174
## [1] 18948   174
num.stu <- dim(chem.dat)[1]
num.vars <- dim(chem.dat)[2]

There are 18948 rows and 174 variables. Each row is a chemistry student. So, there is a total of 18948 chemistry students.

Update CHEM 24 Spring 2019 for chemistry data

There are 83 first attempt only Chem 24 Spring 2019 students. Some of them are incorrectly labeled as non-PAL and need to be relabeled.

with(chem.dat %>% 
       filter(base.time.course == 1, pass.term.flg == "PASS Term",  course == "CHEM 24",  term == "Spring 2019", course.seq == 0), 
     table(palN))
## palN
##  0 
## 83
chem.dat <- update.chem24s19(chem.dat)

with(chem.dat %>% 
       filter(base.time.course == 1, pass.term.flg == "PASS Term",  course == "CHEM 24",  term == "Spring 2019", course.seq == 0), 
     table(palN))
## palN
##  0  1  2 
## 28  9 46
#  0  1  2 
# 28  9 46 

After relabeling, there are 28 non-PAL students, 9 incomplete PAL students, and 46 PAL students for Chem 24 Spring 2019.

Compare the mean gpa for PAL and non-PAL students by Course without Propensity Score Adjustment

The course.seq variable indicate how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.

Note: Excludes incomplete PAL students

get.raw.tab(chem.classes, chem.dat)
Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)
class nonPALavg PALavg Diff NonPAL_Num PAL_Num CompletePAL TermPALStart
CHEM 4 2.03 2.39 0.36 1929 759 0.28 2123
CHEM 1A 1.63 2.04 0.41 1717 1055 0.37 2128
CHEM 1B 1.70 2.11 0.41 1090 769 0.40 2138
CHEM 24 1.63 2.06 0.43 224 177 0.43 2178

Data Cleaning & Feature Engineering

Create new variables.
delay.from.hs: delay since high school
cum.percent.units.passed: cumulative percent of units passed
cMaj: census majors without concentrations/specializations/tracks/etc.
county: which county did the student live in at the time of application to Sac state
sac.county.flg: did the student live in Sacramento county at the time of application to Sac State

Collapse sparse categories and other miscellaneous clean up of data. Sparse categories can cause complete separation in logistic regression and are only predictive for a few students.

# Check how many students did not complete PAL
sum(chem.dat$palN==1) # 226 
## [1] 226
incl.pal.stu <- sum(chem.dat$palN==1)
chem.dat <- clean.data(chem.dat)
dim(chem.dat) # 18722   179
## [1] 18722   179

There were 226 chemistry students who did not complete PAL and were removed from the analysis. There are now 18722 chemistry students instead of 18948.

There were originally 174 variables in the data set, 5 variables were added, so there are now 179 total variables in the data set.

CHEM 1A

Executive Summary

Based on data for 1055 PAL students and 1717 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.41 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.

After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.50\pm 0.07\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 1A, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.

Detailed Summary

A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 1A course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.

For Chem 1A, 19 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: Hispanic ethnicity, female gender, lower SAT scores, has an AP Calculus exam score, has an AP Biology exam score, has an AP Chemistry exam score, CSUS GPA at start of term, enrollment in PAL in the past, academic major, higher term units attempted, lower high school GPA, and fewer years between first term at CSUS and high school graduation.

Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0627. Consequently, adequate balance appears to have been achieved.

The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.50\pm 0.07\) or between 0.43 and 0.57 on a 4.0 grade scale. This result is statistically significant with a P-value of \(2.43x10^{-13}\) and is based on 757 PAL students and 711 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.41.

The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 1A. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 1A indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of CSUS GPA at start of term, major, or has an AP Chemistry exam score observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.

Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 1055 PAL students in Chem 1A being reduced to 757. Also, 711 non-PAL students were selected out of 1717 original non-PAL students.

When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 1561 non-PAL matches. Of the 757 PAL students, 337 were matched one-to-one with non-PAL students and 420 were matched one-to-many with non-PAL students.

Extract CHEM 1A Data

The non-PAL and PAL groups will include students with only first attempts at CHEM 1A.They will also include students with previous PAL participation and/or are currently in a PAL for another course.

# Excludes course repeats
chem1a.dat <- chem.dat %>%
  filter(course=="CHEM 1A", pass.term.flg == "PASS Term", course.seq== 0) 
dim(chem1a.dat) # 2772  179
## [1] 2772  179

There are 2,772 CHEM 1A first attempt only students.

Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.

# Collapsed cMaj categories to Biology and Other majors at 0.09
with(chem1a.dat, table(cMaj, palN))
##                                          palN
## cMaj                                        0   1
##   Anthropology                              3   5
##   Biology                                 659 547
##   Business                                  8   4
##   Chemistry                               265 104
##   Child Devel/Early Childhood Ed            9   2
##   Civil Engineering                        50  16
##   Communications                            4   2
##   Computer Engineering                     13   2
##   Computer Science                         15   4
##   Criminal Justice                          5   6
##   Dance                                     0   1
##   Electrical Engineering                   21   4
##   English                                   3   1
##   Environmental Studies                    28  15
##   Film                                      0   1
##   French                                    1   0
##   Geography                                 0   1
##   Geology                                  39  16
##   Gerontology                               2   0
##   Health Science                           22   8
##   History                                   3   2
##   Interdisciplinary Studies/Special Major   1   0
##   Kinesiology/Physical Education          226 164
##   Liberal Studies                           1   2
##   Mathematics                               7   1
##   Mechanical Engineering                   73  17
##   Music                                     4   1
##   Nursing                                  40  22
##   Nutrition                                77  55
##   Philosophy                                3   0
##   Physical Science                          0   1
##   Physics                                  42   7
##   Political Science                         0   1
##   Psychology                               24   9
##   Social Science                            2   1
##   Sociology                                 2   1
##   Spanish                                   3   1
##   Speech Pathology                          2   2
##   Undeclared                               45  27
chem1a.dat <- group_category(data = chem1a.dat, feature = "cMaj", threshold = 0.09,  update = TRUE)
with(chem1a.dat, table(cMaj, palN))
##                                 palN
## cMaj                               0   1
##   Biology                        659 547
##   Chemistry                      265 104
##   Civil Engineering               50  16
##   Geology                         39  16
##   Kinesiology/Physical Education 226 164
##   Mechanical Engineering          73  17
##   Nursing                         40  22
##   Nutrition                       77  55
##   OTHER                          201  80
##   Physics                         42   7
##   Undeclared                      45  27

Analyze missingness

Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.

## [1] 38
##                        feature num_missing pct_missing
## 17               Instructor_02        2772   1.0000000
## 22                   deg.plan3        2772   1.0000000
## 23                   deg.plan4        2772   1.0000000
## 24                   deg.plan5        2772   1.0000000
## 25                   deg.plan6        2772   1.0000000
## 19             withdraw_reason        2762   0.9963925
## 21                   deg.plan2        2739   0.9880952
## 4                  pledge.term        2366   0.8535354
## 11                  trf.gpaADM        2308   0.8326118
## 1                fys.term.code        1630   0.5880231
## 2                      fys.grd        1630   0.5880231
## 3                  fys.rpt.flg        1630   0.5880231
## 27                grad.termERS        1453   0.5241703
## 20                   deg.plan1        1415   0.5104618
## 26                   grad.term        1415   0.5104618
## 28                         ttg        1415   0.5104618
## 18               treat.section        1237   0.4462482
## 31                plan.college         925   0.3336941
## 32           plan.college.desc         925   0.3336941
## 33                   plan.dept         925   0.3336941
## 34               plan.deptAbbr         925   0.3336941
## 35                 plan.degree         925   0.3336941
## 36                   plan.type         925   0.3336941
## 5               sat.math.score         616   0.2222222
## 6                 sat.math.flg         616   0.2222222
## 7             sat.verbal.score         616   0.2222222
## 8               sat.verbal.flg         616   0.2222222
## 9                sat.test.date         616   0.2222222
## 13 ge.critical.thinking.status         524   0.1890332
## 14      ge.english.comp.status         524   0.1890332
## 15              ge.math.status         524   0.1890332
## 16         ge.oral.comm.status         524   0.1890332
## 12                  admit.term         511   0.1843434
## 10                      hs.gpa         420   0.1515152
## 38                      county         355   0.1280664
## 29      tot.passd.prgrss.start         325   0.1172439
## 30      tot.taken.prgrss.start         325   0.1172439
## 37    cum.percent.units.passed         325   0.1172439

## [1] 2772  146

38 variables missing >10%
5 out of 38 variables were important and force included, even though they were missing >10%
So, 33 variables were removed due to missingness and there are now 146 variables instead of 179 variables.

Subset on Complete Cases only in CHEM 1A Data

chem1a.dat <- chem1a.dat[complete.cases(chem1a.dat), ]
dim(chem1a.dat) # 1769  146
## [1] 1769  146

1769 out of 2772 students are kept
1043 students were removed due to missingness of variables

single.vars <- chem1a.dat %>%
  summarise(across(everything(), ~ n_distinct(.x))) %>%
  select_if(. == 1)

# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem1a.dat)
##                                            
##                                             Overall      
##   n                                         1769         
##   country = USA (%)                         1769 (100.0) 
##   career.course = UGRD (%)                  1769 (100.0) 
##   acad.prog.course = UGD (%)                1769 (100.0) 
##   course (%)                                             
##      CHEM 4                                    0 (  0.0) 
##      CHEM 1A                                1769 (100.0) 
##      CHEM 1B                                   0 (  0.0) 
##      CHEM 24                                   0 (  0.0) 
##   component = LEC (%)                       1769 (100.0) 
##   units (mean (SD))                         5.00 (0.00)  
##   course.numeric (mean (SD))                1.00 (0.00)  
##   div = Lower Division (%)                  1769 (100.0) 
##   Instructor_01 (%)                                      
##      1                                         0 (  0.0) 
##      2                                         0 (  0.0) 
##      3                                         0 (  0.0) 
##      4                                         0 (  0.0) 
##      5                                         0 (  0.0) 
##      6                                      1769 (100.0) 
##      7                                         0 (  0.0) 
##      8                                         0 (  0.0) 
##      9                                         0 (  0.0) 
##      10                                        0 (  0.0) 
##      11                                        0 (  0.0) 
##      12                                        0 (  0.0) 
##      13                                        0 (  0.0) 
##      14                                        0 (  0.0) 
##      15                                        0 (  0.0) 
##      16                                        0 (  0.0) 
##      17                                        0 (  0.0) 
##      18                                        0 (  0.0) 
##      19                                        0 (  0.0) 
##      20                                        0 (  0.0) 
##      21                                        0 (  0.0) 
##      22                                        0 (  0.0) 
##   course.seq (mean (SD))                    0.00 (0.00)  
##   rpt.flg = First Attempt (%)               1769 (100.0) 
##   c2s = Non-C2S (%)                         1769 (100.0) 
##   base.time.course (mean (SD))              1.00 (0.00)  
##   years (mean (SD))                         0.50 (0.00)  
##   withdraw_code = NWD (%)                   1769 (100.0) 
##   enrl.flg = Enrolled (%)                   1769 (100.0) 
##   enrl.flgERS = Enrolled (%)                1769 (100.0) 
##   rtn.flg = Retained (%)                    1769 (100.0) 
##   rtn.flgERS = Retained (%)                 1769 (100.0) 
##   pass.term.flg = PASS Term (%)             1769 (100.0) 
##   csus.gpa.start.flg = Not Missing (%)      1769 (100.0) 
##   higher.ed.gpa.start.flg = Not Missing (%) 1769 (100.0)
sum(single.vars) # 22
## [1] 22
# remove single-valued variables
chem1a.dat<- chem1a.dat %>%
  dplyr::select(-names(single.vars))
dim(chem1a.dat) # 1769  124
## [1] 1769  124

124 out of 146 variables are kept
22 variables removed due to single values

Identify variables causing complete separation in logistic regression

# Remove non chem1a instructors
chem1a.dat <- chem1a.dat %>%
  droplevels(chem1a.dat$Instructor_01)

# Combine sparse ethnicity categories to Other
chem1a.dat <- chem1a.dat %>%
  mutate(eth.erss = fct_collapse(eth.erss, `Other` = c("Foreign", "Native American", "Pacific Islander")))
with(chem1a.dat, table(eth.erss, palN))
##                    palN
## eth.erss              0   1
##   African American   44  30
##   Asian             336 237
##   Other              20  21
##   Hispanic          250 242
##   Two or More Races  81  40
##   Unknown            38  15
##   White             238 177
# Collapse sparse categories for acad.stand 
# Other:  Academic Dismissal, Academic Disqualification 
chem1a.dat <- chem1a.dat %>%
  mutate(acad.stand = fct_other(acad.stand, keep = c("Good Standing")))
with(chem1a.dat, table(acad.stand, palN))
##                palN
## acad.stand        0   1
##   Good Standing 955 748
##   Other          52  14

Filter to relevant variables

Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.

chem1a.final <- chem1a.step.vars(chem1a.dat)
kable(names(chem1a.final))
x
acad.stand
adm.area
bot.level
cMaj
coh
course.age
csus.gpa.start
cum.percent.units.passed
delay.from.hs
e.rmd
eth.erss
father.ed
fys.flg
gender
hous.coh.term.flg
hs.gpa
median.income
m.rmd
mother.ed
pct.female.head
pell.coh.term.flg
prevPAL
prevPASS
reason
sac.county.flg
term.units.attemptedCensus
palN
grd.pt.unt
sat.math.score
sat.math.flg
sat.verbal.score
AP_BIOL
AP_CALAB
AP_CALBC
AP_CHEM
AP_BIOL.flg
AP_CALAB.flg
AP_CALBC.flg
AP_CHEM.flg
pct.female.head.flg
med.inc.flg

Build a Logistic Regression Model for Propensity Score

Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, sat.math.score, sat.verbal.score, and sat.math.flg (same as sat.verbal.flg). Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.

chem1a.first.order.prop.model <- chem1a.step(chem1a.final)
summary(chem1a.first.order.prop.model)
## 
## Call:
## glm(formula = model.first.order, family = binomial, data = chem1a.final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8322  -1.0126  -0.6089   1.0904   2.4921  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         2.2066673  1.3077056   1.687  0.09152 .  
## cum.percent.units.passed            0.0185532  0.7023697   0.026  0.97893    
## eth.erssAsian                       0.2006993  0.2729443   0.735  0.46215    
## eth.erssOther                       0.4196394  0.4155541   1.010  0.31258    
## eth.erssHispanic                    0.4779929  0.2730791   1.750  0.08005 .  
## eth.erssTwo or More Races          -0.1297781  0.3288717  -0.395  0.69313    
## eth.erssUnknown                    -0.2929460  0.4188639  -0.699  0.48431    
## eth.erssWhite                       0.3413619  0.2798446   1.220  0.22253    
## genderMale                         -0.3338347  0.1175127  -2.841  0.00450 ** 
## sat.math.score                     -0.0031376  0.0009867  -3.180  0.00147 ** 
## sat.verbal.score                   -0.0024583  0.0009013  -2.728  0.00638 ** 
## sat.math.flgold                    -0.1286384  0.2106295  -0.611  0.54138    
## AP_CALAB                           -0.0296383  0.1061827  -0.279  0.78015    
## AP_CALAB.flgNot Missing            -0.3448884  0.1673607  -2.061  0.03933 *  
## AP_BIOL                             0.0113165  0.1876565   0.060  0.95191    
## AP_BIOL.flgNot Missing             -0.3273270  0.1949865  -1.679  0.09321 .  
## AP_CHEM                            -0.7434323  0.4691420  -1.585  0.11304    
## AP_CHEM.flgNot Missing             -1.1385896  0.4174880  -2.727  0.00639 ** 
## csus.gpa.start                      0.7188766  0.1466119   4.903 9.43e-07 ***
## prevPAL                             0.5319611  0.1061938   5.009 5.46e-07 ***
## cMajChemistry                      -0.7074529  0.1725795  -4.099 4.14e-05 ***
## cMajCivil Engineering              -0.5976542  0.3925320  -1.523  0.12787    
## cMajGeology                        -0.0987511  0.4921359  -0.201  0.84097    
## cMajKinesiology/Physical Education -0.1260763  0.1522197  -0.828  0.40753    
## cMajMechanical Engineering         -0.8508806  0.3391102  -2.509  0.01210 *  
## cMajNursing                         0.0066722  0.3863916   0.017  0.98622    
## cMajNutrition                      -0.0288961  0.2734756  -0.106  0.91585    
## cMajOTHER                          -0.6094946  0.2157415  -2.825  0.00473 ** 
## cMajPhysics                        -0.7988319  0.4622694  -1.728  0.08398 .  
## cMajUndeclared                     -0.4627973  0.3003148  -1.541  0.12331    
## term.units.attemptedCensus          0.0710439  0.0279899   2.538  0.01114 *  
## hs.gpa                             -0.2970032  0.1446650  -2.053  0.04007 *  
## delay.from.hs                      -0.1154636  0.0537763  -2.147  0.03178 *  
## acad.standOther                    -0.4841617  0.3451585  -1.403  0.16070    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2418.3  on 1768  degrees of freedom
## Residual deviance: 2168.3  on 1735  degrees of freedom
## AIC: 2236.3
## 
## Number of Fisher Scoring iterations: 5
p.score <- chem1a.first.order.prop.model$fitted.values
chem1a.covs <-  names(chem1a.first.order.prop.model %>%  pluck("model") %>% dplyr::select(-palN)) 

Propensity Score Matching

Before matching

# Unadjusted mean grades
get.unadj.means(chem1a.final)
Unadjusted Mean Grades
Non-PAL PAL Diff.
1.601986 1.974409 0.3724234
##                                         Stratified by palN
##                                          0              1              SMD   
##   n                                        1007            762               
##   cum.percent.units.passed (mean (SD))     0.91 (0.10)    0.91 (0.09)   0.005
##   eth.erss (%)                                                          0.213
##      African American                        44 ( 4.4)      30 ( 3.9)        
##      Asian                                  336 (33.4)     237 (31.1)        
##      Other                                   20 ( 2.0)      21 ( 2.8)        
##      Hispanic                               250 (24.8)     242 (31.8)        
##      Two or More Races                       81 ( 8.0)      40 ( 5.2)        
##      Unknown                                 38 ( 3.8)      15 ( 2.0)        
##      White                                  238 (23.6)     177 (23.2)        
##   gender = Male (%)                         471 (46.8)     241 (31.6)   0.314
##   sat.math.score (mean (SD))             529.74 (78.45) 499.23 (74.18)  0.400
##   sat.verbal.score (mean (SD))           499.95 (82.51) 477.60 (74.57)  0.284
##   sat.math.flg = old (%)                    942 (93.5)     702 (92.1)   0.055
##   AP_CALAB (mean (SD))                     2.57 (0.62)    2.54 (0.47)   0.053
##   AP_CALAB.flg = Not Missing (%)            197 (19.6)      94 (12.3)   0.198
##   AP_BIOL (mean (SD))                      2.51 (0.31)    2.50 (0.29)   0.040
##   AP_BIOL.flg = Not Missing (%)             109 (10.8)      61 ( 8.0)   0.097
##   AP_CHEM (mean (SD))                      1.96 (0.23)    1.96 (0.13)   0.030
##   AP_CHEM.flg = Not Missing (%)              54 ( 5.4)      16 ( 2.1)   0.173
##   csus.gpa.start (mean (SD))               3.11 (0.53)    3.22 (0.46)   0.237
##   prevPAL (mean (SD))                      0.22 (0.46)    0.37 (0.56)   0.308
##   cMaj (%)                                                              0.365
##      Biology                                412 (40.9)     405 (53.1)        
##      Chemistry                              145 (14.4)      76 (10.0)        
##      Civil Engineering                       28 ( 2.8)      12 ( 1.6)        
##      Geology                                 15 ( 1.5)       7 ( 0.9)        
##      Kinesiology/Physical Education         150 (14.9)     129 (16.9)        
##      Mechanical Engineering                  48 ( 4.8)      14 ( 1.8)        
##      Nursing                                 16 ( 1.6)      16 ( 2.1)        
##      Nutrition                               35 ( 3.5)      34 ( 4.5)        
##      OTHER                                   90 ( 8.9)      43 ( 5.6)        
##      Physics                                 26 ( 2.6)       7 ( 0.9)        
##      Undeclared                              42 ( 4.2)      19 ( 2.5)        
##   term.units.attemptedCensus (mean (SD))  13.65 (2.10)   13.90 (1.91)   0.125
##   hs.gpa (mean (SD))                       3.42 (0.44)    3.40 (0.42)   0.030
##   delay.from.hs (mean (SD))                2.23 (1.28)    2.16 (1.08)   0.052
##   acad.stand = Other (%)                     52 ( 5.2)      14 ( 1.8)   0.182

Check how many variables have SMD > 0.1

addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
## 
## FALSE  TRUE   Sum 
##     8    11    19
get.imbal.vars(unmatched.tab)
Variables with SMD > 0.1
Variable Before Matching SMD
sat.math.score 0.3997009
cMaj 0.3649218
gender 0.3140303
prevPAL 0.3077470
sat.verbal.score 0.2842378
csus.gpa.start 0.2374740
eth.erss 0.2125984
AP_CALAB.flg 0.1983562
acad.stand 0.1817409
AP_CHEM.flg 0.1727961
term.units.attemptedCensus 0.1253328

11 out of 19 variables have SMD >0.1

Implement a propensity score matching method.

match.chem1a <- with(chem1a.final, Match(
  Y=chem1a.final$grd.pt.unt, Tr = chem1a.final$palN, X = p.score, 
  BiasAdjust = F, estimand = "ATT",  M=1, caliper=0.25, replace = TRUE, ties = TRUE))

After matching

Standardized mean differences for continuous variables and categorical variables.

# Needed for match table
chem1a.final <- chem1a.final %>%
  rownames_to_column(var = "id")

# Matched data
chem1a.matched.dat <- chem1a.final[unlist(match.chem1a[c("index.treated", "index.control")]), ]
chem1a.matched.dat$match.weights<-  c(match.chem1a$weights, match.chem1a$weights)

# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem1a.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem1a.covs,  strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
##                                         Stratified by palN
##                                          0              1              SMD   
##   n                                      757.00         757.00               
##   cum.percent.units.passed (mean (SD))     0.91 (0.09)    0.91 (0.09)   0.053
##   eth.erss (%)                                                          0.116
##      African American                      21.2 ( 2.8)    29.0 ( 3.8)        
##      Asian                                230.7 (30.5)   237.0 (31.3)        
##      Other                                 18.1 ( 2.4)    21.0 ( 2.8)        
##      Hispanic                             224.1 (29.6)   238.0 (31.4)        
##      Two or More Races                     37.0 ( 4.9)    40.0 ( 5.3)        
##      Unknown                               14.7 ( 1.9)    15.0 ( 2.0)        
##      White                                211.1 (27.9)   177.0 (23.4)        
##   gender = Male (%)                       226.1 (29.9)   240.0 (31.7)   0.040
##   sat.math.score (mean (SD))             498.04 (73.29) 500.17 (73.25)  0.029
##   sat.verbal.score (mean (SD))           478.74 (75.32) 478.10 (74.42)  0.009
##   sat.math.flg = old (%)                  697.2 (92.1)   697.0 (92.1)   0.001
##   AP_CALAB (mean (SD))                     2.50 (0.49)    2.54 (0.47)   0.069
##   AP_CALAB.flg = Not Missing (%)           91.8 (12.1)    94.0 (12.4)   0.009
##   AP_BIOL (mean (SD))                      2.52 (0.21)    2.50 (0.30)   0.059
##   AP_BIOL.flg = Not Missing (%)            57.4 ( 7.6)    61.0 ( 8.1)   0.018
##   AP_CHEM (mean (SD))                      1.96 (0.12)    1.96 (0.13)   0.009
##   AP_CHEM.flg = Not Missing (%)            13.8 ( 1.8)    16.0 ( 2.1)   0.021
##   csus.gpa.start (mean (SD))               3.20 (0.47)    3.22 (0.46)   0.039
##   prevPAL (mean (SD))                      0.38 (0.61)    0.36 (0.54)   0.041
##   cMaj (%)                                                              0.123
##      Biology                              390.5 (51.6)   401.0 (53.0)        
##      Chemistry                             92.9 (12.3)    76.0 (10.0)        
##      Civil Engineering                     13.0 ( 1.7)    12.0 ( 1.6)        
##      Geology                                9.6 ( 1.3)     7.0 ( 0.9)        
##      Kinesiology/Physical Education       117.2 (15.5)   128.0 (16.9)        
##      Mechanical Engineering                11.0 ( 1.5)    14.0 ( 1.8)        
##      Nursing                               13.7 ( 1.8)    16.0 ( 2.1)        
##      Nutrition                             32.0 ( 4.2)    34.0 ( 4.5)        
##      OTHER                                 56.3 ( 7.4)    43.0 ( 5.7)        
##      Physics                                5.4 ( 0.7)     7.0 ( 0.9)        
##      Undeclared                            15.2 ( 2.0)    19.0 ( 2.5)        
##   term.units.attemptedCensus (mean (SD))  13.96 (1.83)   13.90 (1.91)   0.030
##   hs.gpa (mean (SD))                       3.40 (0.42)    3.41 (0.42)   0.013
##   delay.from.hs (mean (SD))                2.20 (1.21)    2.16 (1.08)   0.039
##   acad.stand = Other (%)                   22.8 ( 3.0)    14.0 ( 1.8)   0.075

Balance Check

Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group
Binary variables: Raw differences in proportion

All variables are balanced and under the <0.1 mean threshold.

chem1a.bal <- bal.tab(match.chem1a, formula = f.build("palN", chem1a.covs), data = chem1a.final,
        distance = ~ p.score, thresholds = c(m = .1), un = TRUE, imbalanced.only = TRUE)
chem1a.bal
## Balance Measures
## All covariates are balanced.
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        36
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##  Variable Diff.Adj    M.Threshold
##  AP_CALAB     0.07 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                  1007.       762
## Matched (ESS)         313.69     757
## Matched (Unweighted)  711.       757
## Unmatched             296.         0
## Discarded               0.         5

Check variable percent improvement

get.var.perc.tab(chem1a.bal)
##                               Variable      Diff.Un      Diff.Adj % Improvement
## 1                              p.score  0.813473150  0.0009607886           100
## 2                     eth.erss_Unknown -0.018050810  0.0004010191            98
## 3                     sat.math.flg_old -0.014191995 -0.0002642008            98
## 4                     sat.verbal.score -0.299750402 -0.0086395931            97
## 5             AP_CALAB.flg_Not Missing -0.072271006  0.0028952004            96
## 6                       sat.math.score -0.411373506  0.0286721308            93
## 7              AP_CHEM.flg_Not Missing -0.032627252  0.0029502422            91
## 8                         cMaj_Biology  0.122360015  0.0138422344            89
## 9               cMaj_Civil Engineering -0.012057331 -0.0013099956            89
## 10                         gender_Male -0.151452953  0.0183855444            88
## 11                        cMaj_Physics -0.016632913  0.0020695729            88
## 12         cMaj_Mechanical Engineering -0.029293632  0.0039189784            87
## 13          eth.erss_Two or More Races -0.027943503  0.0038969617            86
## 14                             prevPAL  0.281028450 -0.0422296608            85
## 15                      csus.gpa.start  0.255366543  0.0396354928            84
## 16             AP_BIOL.flg_Not Missing -0.028189810  0.0047225892            83
## 17                             AP_CHEM -0.043044496  0.0088420199            79
## 18          term.units.attemptedCensus  0.131689543 -0.0297896263            77
## 19                   eth.erss_Hispanic  0.069323137  0.0183478015            74
## 20                      cMaj_Nutrition  0.009862719  0.0025869661            74
## 21                     cMaj_Undeclared -0.016773660  0.0050166698            70
## 22                    acad.stand_Other -0.033265827 -0.0115918098            65
## 23                      eth.erss_Asian -0.022640728  0.0082609926            64
## 24                              hs.gpa -0.030258269  0.0130619500            57
## 25                      eth.erss_Other  0.007698082  0.0037837328            51
## 26                      cMaj_Chemistry -0.044254523 -0.0223469837            50
## 27                          cMaj_OTHER -0.032943933 -0.0175803611            47
## 28                        cMaj_Geology -0.005709378 -0.0033937221            41
## 29                        cMaj_Nursing  0.005108597  0.0029942756            41
## 30 cMaj_Kinesiology/Physical Education  0.020334040  0.0142023652            30
## 31                       delay.from.hs -0.057063209 -0.0409351417            28
## 32                            AP_CALAB -0.062685060  0.0700268159           -12
## 33                             AP_BIOL -0.041283872 -0.0516391001           -25
## 34           eth.erss_African American -0.004324062  0.0103478644          -139
## 35                      eth.erss_White -0.004062116 -0.0450383720         -1009
## 36            cum.percent.units.passed  0.004657179  0.0537413078         -1054

Check covariate balance visually

get.bal.plot(unmatched.tab, matched.tab)

love.plot(chem1a.bal,binary = "raw", stars = "std", var.order = "unadjusted", 
            thresholds = c(m = .1), abs = F)

Compare single and multiple matches for PAL and non-PAL

create.match.tab(chem1a.matched.dat)
PAL and Non-PAL Matches
Non-PAL PAL
Single Matches 278 337
Multiple Matches 433 420
Total Students 711 757

Out of 762 PAL students, 757 were matched and 5 were unable to be matched. Out of 757 PAL student matches, 337 PAL students were matched to one non-PAL student and 420 PAL students were matched to multiple non-PAL students.

Out of 1561 non-PAL student matches, there were 711 non-PAL students, 278 of the non-PAL students were matched to one PAL student and 433 of the non-PAL students were matched to multiple PAL students.

Propensity scores for average treatment effect on the treated (ATT)

get.att.plot(chem1a.final, match.chem1a)

Assess balance with prognostic score

The standardized mean differences of the prognostic scores is 0.0335, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.

## 
## Call:
## glm(formula = f.build("grd.pt.unt", chem1a.covs), data = ctrl.data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.39006  -0.58465   0.02405   0.57994   2.76737  
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -5.2196838  0.5382662  -9.697  < 2e-16 ***
## cum.percent.units.passed            0.2918154  0.3625120   0.805  0.42103    
## eth.erssAsian                       0.0601629  0.1430710   0.421  0.67421    
## eth.erssOther                       0.0209315  0.2374072   0.088  0.92976    
## eth.erssHispanic                   -0.0631047  0.1453120  -0.434  0.66419    
## eth.erssTwo or More Races          -0.0504216  0.1665614  -0.303  0.76217    
## eth.erssUnknown                     0.2034702  0.1984971   1.025  0.30559    
## eth.erssWhite                       0.0313683  0.1474357   0.213  0.83156    
## genderMale                          0.1350774  0.0621716   2.173  0.03005 *  
## sat.math.score                      0.0024302  0.0005127   4.740 2.45e-06 ***
## sat.verbal.score                   -0.0008151  0.0004630  -1.760  0.07866 .  
## sat.math.flgold                     0.1244919  0.1187407   1.048  0.29470    
## AP_CALAB                            0.0791501  0.0472506   1.675  0.09423 .  
## AP_CALAB.flgNot Missing             0.0481646  0.0796500   0.605  0.54552    
## AP_BIOL                             0.1680120  0.0935846   1.795  0.07292 .  
## AP_BIOL.flgNot Missing              0.0269552  0.0953309   0.283  0.77743    
## AP_CHEM                             0.3201416  0.1258924   2.543  0.01115 *  
## AP_CHEM.flgNot Missing              0.3035225  0.1284433   2.363  0.01832 *  
## csus.gpa.start                      0.9656280  0.0758034  12.739  < 2e-16 ***
## prevPAL                            -0.0695985  0.0624447  -1.115  0.26531    
## cMajChemistry                       0.1194315  0.0869728   1.373  0.17000    
## cMajCivil Engineering               0.3998032  0.1789340   2.234  0.02569 *  
## cMajGeology                        -0.3463892  0.2371614  -1.461  0.14446    
## cMajKinesiology/Physical Education  0.0310270  0.0855213   0.363  0.71683    
## cMajMechanical Engineering          0.1324651  0.1417757   0.934  0.35037    
## cMajNursing                        -0.4427774  0.2251455  -1.967  0.04951 *  
## cMajNutrition                      -0.0323645  0.1578557  -0.205  0.83759    
## cMajOTHER                           0.1995494  0.1050010   1.900  0.05767 .  
## cMajPhysics                        -0.0867119  0.1849268  -0.469  0.63925    
## cMajUndeclared                     -0.1265802  0.1437063  -0.881  0.37863    
## term.units.attemptedCensus          0.0079314  0.0141139   0.562  0.57427    
## hs.gpa                              0.2435173  0.0756390   3.219  0.00133 ** 
## delay.from.hs                       0.1102145  0.0257807   4.275 2.10e-05 ***
## acad.standOther                    -0.1641247  0.1412638  -1.162  0.24559    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.7558023)
## 
##     Null deviance: 1209.9  on 1006  degrees of freedom
## Residual deviance:  735.4  on  973  degrees of freedom
## AIC: 2611.2
## 
## Number of Fisher Scoring iterations: 2
## Balance Measures
##                                         Type Diff.Adj    M.Threshold
## prog.score                          Distance   0.0335 Balanced, <0.1
## cum.percent.units.passed             Contin.   0.0537 Balanced, <0.1
## eth.erss_African American             Binary   0.0103 Balanced, <0.1
## eth.erss_Asian                        Binary   0.0083 Balanced, <0.1
## eth.erss_Other                        Binary   0.0038 Balanced, <0.1
## eth.erss_Hispanic                     Binary   0.0183 Balanced, <0.1
## eth.erss_Two or More Races            Binary   0.0039 Balanced, <0.1
## eth.erss_Unknown                      Binary   0.0004 Balanced, <0.1
## eth.erss_White                        Binary  -0.0450 Balanced, <0.1
## gender_Male                           Binary   0.0184 Balanced, <0.1
## sat.math.score                       Contin.   0.0287 Balanced, <0.1
## sat.verbal.score                     Contin.  -0.0086 Balanced, <0.1
## sat.math.flg_old                      Binary  -0.0003 Balanced, <0.1
## AP_CALAB                             Contin.   0.0700 Balanced, <0.1
## AP_CALAB.flg_Not Missing              Binary   0.0029 Balanced, <0.1
## AP_BIOL                              Contin.  -0.0516 Balanced, <0.1
## AP_BIOL.flg_Not Missing               Binary   0.0047 Balanced, <0.1
## AP_CHEM                              Contin.   0.0088 Balanced, <0.1
## AP_CHEM.flg_Not Missing               Binary   0.0030 Balanced, <0.1
## csus.gpa.start                       Contin.   0.0396 Balanced, <0.1
## prevPAL                              Contin.  -0.0422 Balanced, <0.1
## cMaj_Biology                          Binary   0.0138 Balanced, <0.1
## cMaj_Chemistry                        Binary  -0.0223 Balanced, <0.1
## cMaj_Civil Engineering                Binary  -0.0013 Balanced, <0.1
## cMaj_Geology                          Binary  -0.0034 Balanced, <0.1
## cMaj_Kinesiology/Physical Education   Binary   0.0142 Balanced, <0.1
## cMaj_Mechanical Engineering           Binary   0.0039 Balanced, <0.1
## cMaj_Nursing                          Binary   0.0030 Balanced, <0.1
## cMaj_Nutrition                        Binary   0.0026 Balanced, <0.1
## cMaj_OTHER                            Binary  -0.0176 Balanced, <0.1
## cMaj_Physics                          Binary   0.0021 Balanced, <0.1
## cMaj_Undeclared                       Binary   0.0050 Balanced, <0.1
## term.units.attemptedCensus           Contin.  -0.0298 Balanced, <0.1
## hs.gpa                               Contin.   0.0131 Balanced, <0.1
## delay.from.hs                        Contin.  -0.0409 Balanced, <0.1
## acad.stand_Other                      Binary  -0.0116 Balanced, <0.1
## p.score                              Contin.   0.0010 Balanced, <0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        37
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##  Variable Diff.Adj    M.Threshold
##  AP_CALAB     0.07 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                  1007.       762
## Matched (ESS)         313.69     757
## Matched (Unweighted)  711.       757
## Unmatched             296.         0
## Discarded               0.         5

Estimate Difference Between Mean grade in CHEM 1A of PAL and non-PAL students

The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.4952651 . This result is statistically significant with a P-value of \(4.8561x10^{-13}\) and is based on 757 PAL students and 1561 non-PAL student matches(711 total non-PAL students). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.

summary(match.chem1a)
## 
## Estimate...  0.49527 
## AI SE......  0.068509 
## T-stat.....  7.2292 
## p.val......  4.8561e-13 
## 
## Original number of observations..............  1769 
## Original number of treated obs...............  762 
## Matched number of observations...............  757 
## Matched number of observations  (unweighted).  1561 
## 
## Caliper (SDs)........................................   0.25 
## Number of obs dropped by 'exact' or 'caliper'  5

Sensitivity Analysis

psens(match.chem1a, Gamma=2.0, GammaInc = 0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0000
##    1.2           0      0.0000
##    1.3           0      0.0000
##    1.4           0      0.0000
##    1.5           0      0.0000
##    1.6           0      0.0006
##    1.7           0      0.0106
##    1.8           0      0.0764
##    1.9           0      0.2732
##    2.0           0      0.5716
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 

Note that in the above table \(\Gamma=1.8\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than1.8 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “cMaj” or “csus.gpa.start” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.

kable(sort(exp(abs(chem1a.first.order.prop.model$coefficients))))
x
sat.verbal.score 1.002461
sat.math.score 1.003143
cMajNursing 1.006695
AP_BIOL 1.011381
cum.percent.units.passed 1.018726
cMajNutrition 1.029318
AP_CALAB 1.030082
term.units.attemptedCensus 1.073628
cMajGeology 1.103792
delay.from.hs 1.122394
cMajKinesiology/Physical Education 1.134369
sat.math.flgold 1.137279
eth.erssTwo or More Races 1.138576
eth.erssAsian 1.222257
eth.erssUnknown 1.340370
hs.gpa 1.345820
AP_BIOL.flgNot Missing 1.387255
genderMale 1.396312
eth.erssWhite 1.406862
AP_CALAB.flgNot Missing 1.411832
eth.erssOther 1.521413
cMajUndeclared 1.588511
eth.erssHispanic 1.612834
acad.standOther 1.622814
prevPAL 1.702267
cMajCivil Engineering 1.817850
cMajOTHER 1.839501
cMajChemistry 2.028817
csus.gpa.start 2.052126
AP_CHEM 2.103142
cMajPhysics 2.222943
cMajMechanical Engineering 2.341708
AP_CHEM.flgNot Missing 3.122362
(Intercept) 9.085387

CHEM 1B

Executive Summary

Based on data for 769 PAL students and 1090 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.41 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.

After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.43\pm 0.08\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 1B, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.

Detailed Summary

A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 1B course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.

For Chem 1B, 13 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: ethnicity, has an AP Chemistry exam score, enrollment in PAL in the past, class level, being remedial in math, being eligible for a Pell grant when entering CSUS, academic major, CSUS GPA at start of term, and from CSUS local admission area.

Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0733. Consequently, adequate balance appears to have been achieved.

The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.43\pm 0.08\) or between 0.35 and 0.51 on a 4.0 grade scale. This result is statistically significant with a P-value of \(8.59x10^{-9}\) and is based on 573 PAL students and 457 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.41.

The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 1B. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 1B indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of being remedial in math, class level, or has an AP Chemistry exam score observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.

Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 769 PAL students in Chem 1B being reduced to 573. Also, 457 non-PAL students were selected out of 1090 original non-PAL students.

When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 991 non-PAL matches. Of the 573 PAL students, 331 were matched one-to-one with non-PAL students and 242 were matched one-to-many with non-PAL students.

Extract CHEM 1B Data and Prerequisite Course CHEM 1A

The non-PAL and PAL groups will include students with only first attempts at CHEM 1B.They will also include students with previous PAL participation and/or are currently in a PAL for another course.

For the prerequisite course CHEM 1A, the grade for the student’s last attempt and the number of times it was taken are added to the CHEM 1B data set. Only 1416 out of 1859 CHEM 1B students have CHEM 1A grades.
However, few students retook CHEM 1A so there was inadequate balance on number of times Chem 1B was taken, and it was removed from the propensity model after balance checks.

# Excludes course repeats
chem1b.dat <- chem.dat %>%
  filter(course=="CHEM 1B", pass.term.flg == "PASS Term", course.seq== 0)
dim(chem1b.dat) #  1859  179
## [1] 1859  179
prereq <- "CHEM 1A"
chem1b.dat <- get.prereq.grades(chem1b.dat, chem.dat, prereq)

# Rename CHEM 1A variables
chem1b.dat <- chem1b.dat %>%
  rename(chem1a.course.seq= prereq.course.seq, chem1a.grd.pt.unt= prereq.grd.pt.unt, chem1a.grade = prereq.grade)
dim(chem1b.dat) # 1416  182
## [1] 1416  182

There are 1,416 CHEM 1B first attempt only students with prerequisite CHEM 1A grades.

The variables below were added to added to the CHEM 1B data, so there are now 182 variables instead of 179 variables.
chem1a.course.seq: Is the student taking CHEM 1A for the first time or is it the second attempt, third attempt, etc.
chem1a.grd.pt.unt: Numeric grade on 0 to 4 scale (0=F, 4=A) for CHEM 1A
chem1a.grade: Course grade for CHEM 1A

Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.

# # Collapsed cMaj categories to Biology and Other majors at 0.04
# with(chem1b.dat, table(cMaj, palN))
chem1b.dat <- group_category(data = chem1b.dat, feature = "cMaj", threshold = 0.04,  update = TRUE)
with(chem1b.dat, table(cMaj, palN))
##                                 palN
## cMaj                               0   1
##   Biology                        433 338
##   Chemistry                      119  83
##   Geology                          7   5
##   Health Science                   7   4
##   Kinesiology/Physical Education 129 107
##   Nutrition                       48  19
##   OTHER                           36  22
##   Physics                         33   2
##   Undeclared                      12  12

Analyze missingness

Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.

## [1] 35
##                        feature num_missing pct_missing
## 22                   deg.plan3        1416   1.0000000
## 23                   deg.plan4        1416   1.0000000
## 24                   deg.plan5        1416   1.0000000
## 25                   deg.plan6        1416   1.0000000
## 19             withdraw_reason        1414   0.9985876
## 21                   deg.plan2        1393   0.9837571
## 4                  pledge.term        1251   0.8834746
## 11                  trf.gpaADM        1203   0.8495763
## 18               treat.section         824   0.5819209
## 1                fys.term.code         778   0.5494350
## 2                      fys.grd         778   0.5494350
## 3                  fys.rpt.flg         778   0.5494350
## 17               Instructor_02         723   0.5105932
## 27                grad.termERS         670   0.4731638
## 20                   deg.plan1         656   0.4632768
## 26                   grad.term         656   0.4632768
## 28                         ttg         656   0.4632768
## 29                plan.college         607   0.4286723
## 30           plan.college.desc         607   0.4286723
## 31                   plan.dept         607   0.4286723
## 32               plan.deptAbbr         607   0.4286723
## 33                 plan.degree         607   0.4286723
## 34                   plan.type         607   0.4286723
## 5               sat.math.score         277   0.1956215
## 6                 sat.math.flg         277   0.1956215
## 7             sat.verbal.score         277   0.1956215
## 8               sat.verbal.flg         277   0.1956215
## 9                sat.test.date         277   0.1956215
## 13 ge.critical.thinking.status         260   0.1836158
## 14      ge.english.comp.status         260   0.1836158
## 15              ge.math.status         260   0.1836158
## 16         ge.oral.comm.status         260   0.1836158
## 12                  admit.term         256   0.1807910
## 10                      hs.gpa         176   0.1242938
## 35                      county         166   0.1172316

## [1] 1416  147

35 variables missing >10%
So, 35 variables were removed due to missingness and there are now 147 variables instead of 182 variables.

Subset on Complete Cases only in CHEM 1B Data

chem1b.dat <- chem1b.dat[complete.cases(chem1b.dat), ]
dim(chem1b.dat) # 1337  147
## [1] 1337  147

1337 out of 1416 students are kept
79 students were removed due to missingness of variables

single.vars <- chem1b.dat %>%
  summarise(across(everything(), ~ n_distinct(.x))) %>%
  select_if(. == 1)

# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem1b.dat)
##                                            
##                                             Overall      
##   n                                         1337         
##   country = USA (%)                         1337 (100.0) 
##   career.course = UGRD (%)                  1337 (100.0) 
##   acad.prog.course = UGD (%)                1337 (100.0) 
##   course (%)                                             
##      CHEM 4                                    0 (  0.0) 
##      CHEM 1A                                   0 (  0.0) 
##      CHEM 1B                                1337 (100.0) 
##      CHEM 24                                   0 (  0.0) 
##   component = LEC (%)                       1337 (100.0) 
##   units (mean (SD))                         5.00 (0.00)  
##   course.numeric (mean (SD))                1.00 (0.00)  
##   div = Lower Division (%)                  1337 (100.0) 
##   course.seq (mean (SD))                    0.00 (0.00)  
##   rpt.flg = First Attempt (%)               1337 (100.0) 
##   pass = Non-PASS (%)                       1337 (100.0) 
##   c2s = Non-C2S (%)                         1337 (100.0) 
##   base.time.course (mean (SD))              1.00 (0.00)  
##   years (mean (SD))                         0.50 (0.00)  
##   withdraw_code = NWD (%)                   1337 (100.0) 
##   enrl.flg = Enrolled (%)                   1337 (100.0) 
##   enrl.flgERS = Enrolled (%)                1337 (100.0) 
##   rtn.flg = Retained (%)                    1337 (100.0) 
##   rtn.flgERS = Retained (%)                 1337 (100.0) 
##   pass.term.flg = PASS Term (%)             1337 (100.0) 
##   passN (mean (SD))                         0.00 (0.00)  
##   csus.gpa.start.flg = Not Missing (%)      1337 (100.0) 
##   higher.ed.gpa.start.flg = Not Missing (%) 1337 (100.0)
sum(single.vars) # 23
## [1] 23
# remove single-valued variables
chem1b.dat<- chem1b.dat %>%
  dplyr::select(-names(single.vars))
dim(chem1b.dat) # 1337  124
## [1] 1337  124

124 out of 147 variables are kept
23 variables removed due to single values

Identify variables causing complete separation in logistic regression

# Remove non chem1b instructors
chem1b.dat <- chem1b.dat %>%
  droplevels(chem1b.dat$Instructor_01)

# Combine sparse ethnicity categories to Other
chem1b.dat <- chem1b.dat %>%
  mutate(eth.erss = fct_other(eth.erss, drop = c("Native American", "Pacific Islander")))
with(chem1b.dat, table(eth.erss, palN))
##                    palN
## eth.erss              0   1
##   African American   31  21
##   Asian             233 208
##   Foreign             8   6
##   Hispanic          167 154
##   Two or More Races  52  31
##   Unknown            34  13
##   White             227 137
##   Other              12   3

Filter to relevant variables

Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.

chem1b.final <- chem1b.step.vars(chem1b.dat)
kable(names(chem1b.final))
x
acad.stand
adm.area
bot.level
cMaj
coh
course.age
csus.gpa.start
cum.percent.units.passed
delay.from.hs
e.rmd
eth.erss
father.ed
fys.flg
gender
hous.coh.term.flg
Instructor_01
median.income
m.rmd
mother.ed
pct.female.head
pell.coh.term.flg
prevPAL
prevPASS
reason
sac.county.flg
term.units.attemptedCensus
palN
grd.pt.unt
chem1a.grd.pt.unt
AP_BIOL
AP_CALAB
AP_CALBC
AP_CHEM
AP_BIOL.flg
AP_CALAB.flg
AP_CALBC.flg
AP_CHEM.flg
pct.female.head.flg
med.inc.flg

Build a Logistic Regression Model for Propensity Score

Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, and chem1a.grd.pt.unt. Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.

chem1b.final <- chem1b.step.vars(chem1b.dat)
chem1b.first.order.prop.model <- chem1b.step(chem1b.final)
summary(chem1b.first.order.prop.model)
## 
## Call:
## glm(formula = model.first.order, family = binomial, data = chem1b.final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0704  -1.0105  -0.6102   1.0971   2.5795  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -2.4187425  1.1236815  -2.153 0.031357 *  
## chem1a.grd.pt.unt                  -0.0952206  0.1183413  -0.805 0.421035    
## cum.percent.units.passed            0.4253555  1.0331478   0.412 0.680553    
## eth.erssAsian                       0.2258778  0.3168833   0.713 0.475963    
## eth.erssForeign                     0.1193382  0.6437159   0.185 0.852923    
## eth.erssHispanic                    0.2176552  0.3213975   0.677 0.498269    
## eth.erssTwo or More Races          -0.0008854  0.3852899  -0.002 0.998166    
## eth.erssUnknown                    -0.4839073  0.4644237  -1.042 0.297434    
## eth.erssWhite                      -0.0732071  0.3225052  -0.227 0.820427    
## eth.erssOther                      -1.5590171  0.7498221  -2.079 0.037601 *  
## genderMale                          0.1921808  0.1281980   1.499 0.133849    
## AP_CHEM                             0.0172767  0.3201902   0.054 0.956969    
## AP_CHEM.flgNot Missing             -0.8156336  0.2959788  -2.756 0.005856 ** 
## prevPAL                             0.5954449  0.0787251   7.564 3.92e-14 ***
## bot.levelJunior                    -0.5557096  0.3064733  -1.813 0.069795 .  
## bot.levelSenior                    -0.7147519  0.3160084  -2.262 0.023709 *  
## bot.levelSophomore                 -0.0971246  0.3014493  -0.322 0.747307    
## m.rmdRemedial in Math               0.6764007  0.1892499   3.574 0.000351 ***
## pell.coh.term.flgPell               0.4699412  0.1222645   3.844 0.000121 ***
## cMajChemistry                      -0.0751801  0.1856516  -0.405 0.685512    
## cMajGeology                         0.2741545  0.6425678   0.427 0.669631    
## cMajHealth Science                 -0.1534683  0.6711026  -0.229 0.819117    
## cMajKinesiology/Physical Education  0.2702222  0.1694312   1.595 0.110739    
## cMajNutrition                      -0.3721911  0.3149924  -1.182 0.237369    
## cMajOTHER                           0.0981896  0.3051662   0.322 0.747636    
## cMajPhysics                        -2.2262195  0.7492173  -2.971 0.002965 ** 
## cMajUndeclared                      0.2144541  0.4430973   0.484 0.628394    
## csus.gpa.start                      0.4743598  0.2143101   2.213 0.026868 *  
## adm.areanonlocal                   -0.2446215  0.1375927  -1.778 0.075426 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1826.1  on 1336  degrees of freedom
## Residual deviance: 1641.8  on 1308  degrees of freedom
## AIC: 1699.8
## 
## Number of Fisher Scoring iterations: 5
p.score <- chem1b.first.order.prop.model$fitted.values
chem1b.covs <-  names(chem1b.first.order.prop.model %>%  pluck("model") %>% dplyr::select(-palN)) 

Propensity Score Matching

Before matching

# Unadjusted mean grades
get.unadj.means(chem1b.final)
Unadjusted Mean Grades
Non-PAL PAL Diff.
1.859686 2.187609 0.3279232

Standardized mean differences for continuous variables and categorical variables.

unmatched.tab <- CreateTableOne(vars = chem1b.covs, strata = "palN", 
                             data = chem1b.final, smd = TRUE, test = FALSE)
print(unmatched.tab, smd = TRUE)
##                                       Stratified by palN
##                                        0            1            SMD   
##   n                                     764          573               
##   chem1a.grd.pt.unt (mean (SD))        2.59 (0.68)  2.58 (0.64)   0.006
##   cum.percent.units.passed (mean (SD)) 0.92 (0.08)  0.92 (0.08)   0.008
##   eth.erss (%)                                                    0.249
##      African American                    31 ( 4.1)    21 ( 3.7)        
##      Asian                              233 (30.5)   208 (36.3)        
##      Foreign                              8 ( 1.0)     6 ( 1.0)        
##      Hispanic                           167 (21.9)   154 (26.9)        
##      Two or More Races                   52 ( 6.8)    31 ( 5.4)        
##      Unknown                             34 ( 4.5)    13 ( 2.3)        
##      White                              227 (29.7)   137 (23.9)        
##      Other                               12 ( 1.6)     3 ( 0.5)        
##   gender = Male (%)                     308 (40.3)   223 (38.9)   0.029
##   AP_CHEM (mean (SD))                  1.97 (0.26)  1.96 (0.18)   0.037
##   AP_CHEM.flg = Not Missing (%)          55 ( 7.2)    19 ( 3.3)   0.175
##   prevPAL (mean (SD))                  0.63 (0.77)  1.02 (0.80)   0.499
##   bot.level (%)                                                   0.210
##      Freshman                            32 ( 4.2)    29 ( 5.1)        
##      Junior                             270 (35.3)   211 (36.8)        
##      Senior                             256 (33.5)   141 (24.6)        
##      Sophomore                          206 (27.0)   192 (33.5)        
##   m.rmd = Remedial in Math (%)           75 ( 9.8)   106 (18.5)   0.251
##   pell.coh.term.flg = Pell (%)          358 (46.9)   343 (59.9)   0.263
##   cMaj (%)                                                        0.299
##      Biology                            399 (52.2)   324 (56.5)        
##      Chemistry                          114 (14.9)    79 (13.8)        
##      Geology                              6 ( 0.8)     5 ( 0.9)        
##      Health Science                       7 ( 0.9)     4 ( 0.7)        
##      Kinesiology/Physical Education     120 (15.7)   107 (18.7)        
##      Nutrition                           44 ( 5.8)    18 ( 3.1)        
##      OTHER                               32 ( 4.2)    22 ( 3.8)        
##      Physics                             30 ( 3.9)     2 ( 0.3)        
##      Undeclared                          12 ( 1.6)    12 ( 2.1)        
##   csus.gpa.start (mean (SD))           3.16 (0.45)  3.21 (0.41)   0.105
##   adm.area = nonlocal (%)               231 (30.2)   149 (26.0)   0.094

Check how many variables have SMD > 0.1

addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
## 
## FALSE  TRUE   Sum 
##     5     8    13
get.imbal.vars(unmatched.tab)
Variables with SMD > 0.1
Variable Before Matching SMD
prevPAL 0.4992792
cMaj 0.2985656
pell.coh.term.flg 0.2628653
m.rmd 0.2510048
eth.erss 0.2493008
bot.level 0.2101597
AP_CHEM.flg 0.1746490
csus.gpa.start 0.1050944

8 variables have SMD >0.1

Implement a propensity score matching method.

match.chem1b <- with(chem1b.final, Match(
  Y=chem1b.final$grd.pt.unt, Tr = chem1b.final$palN, X = p.score, 
  BiasAdjust = F, estimand = "ATT",  M=1, caliper=0.25, replace = TRUE, ties = TRUE))

After matching

Standardized mean differences for continuous variables and categorical variables.

# Needed for match table
chem1b.final <- chem1b.final %>%
  rownames_to_column(var = "id")

# Matched data
chem1b.matched.dat <- chem1b.final[unlist(match.chem1b[c("index.treated", "index.control")]), ]
chem1b.matched.dat$match.weights<-  c(match.chem1b$weights, match.chem1b$weights)

# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem1b.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem1b.covs,  strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
##                                       Stratified by palN
##                                        0              1              SMD   
##   n                                    573.00         573.00               
##   chem1a.grd.pt.unt (mean (SD))          2.54 (0.67)    2.58 (0.64)   0.071
##   cum.percent.units.passed (mean (SD))   0.92 (0.08)    0.92 (0.08)   0.002
##   eth.erss (%)                                                        0.176
##      African American                    33.9 ( 5.9)    21.0 ( 3.7)        
##      Asian                              189.9 (33.1)   208.0 (36.3)        
##      Foreign                              7.5 ( 1.3)     6.0 ( 1.0)        
##      Hispanic                           153.9 (26.9)   154.0 (26.9)        
##      Two or More Races                   48.8 ( 8.5)    31.0 ( 5.4)        
##      Unknown                             11.0 ( 1.9)    13.0 ( 2.3)        
##      White                              125.7 (21.9)   137.0 (23.9)        
##      Other                                2.2 ( 0.4)     3.0 ( 0.5)        
##   gender = Male (%)                     195.8 (34.2)   223.0 (38.9)   0.099
##   AP_CHEM (mean (SD))                    1.96 (0.11)    1.96 (0.18)   0.003
##   AP_CHEM.flg = Not Missing (%)          10.9 ( 1.9)    19.0 ( 3.3)   0.089
##   prevPAL (mean (SD))                    1.04 (0.92)    1.02 (0.80)   0.025
##   bot.level (%)                                                       0.078
##      Freshman                            37.9 ( 6.6)    29.0 ( 5.1)        
##      Junior                             218.6 (38.2)   211.0 (36.8)        
##      Senior                             135.4 (23.6)   141.0 (24.6)        
##      Sophomore                          181.2 (31.6)   192.0 (33.5)        
##   m.rmd = Remedial in Math (%)          106.2 (18.5)   106.0 (18.5)   0.001
##   pell.coh.term.flg = Pell (%)          325.4 (56.8)   343.0 (59.9)   0.062
##   cMaj (%)                                                            0.118
##      Biology                            298.0 (52.0)   324.0 (56.5)        
##      Chemistry                           85.5 (14.9)    79.0 (13.8)        
##      Geology                              3.6 ( 0.6)     5.0 ( 0.9)        
##      Health Science                       6.8 ( 1.2)     4.0 ( 0.7)        
##      Kinesiology/Physical Education     119.0 (20.8)   107.0 (18.7)        
##      Nutrition                           19.1 ( 3.3)    18.0 ( 3.1)        
##      OTHER                               28.7 ( 5.0)    22.0 ( 3.8)        
##      Physics                              2.0 ( 0.3)     2.0 ( 0.3)        
##      Undeclared                          10.2 ( 1.8)    12.0 ( 2.1)        
##   csus.gpa.start (mean (SD))             3.18 (0.42)    3.21 (0.41)   0.055
##   adm.area = nonlocal (%)               147.6 (25.8)   149.0 (26.0)   0.005

Balance Check

Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group
Binary variables: Raw differences in proportion

All variables are balanced and under the <0.1 mean threshold.

## Balance Measures
## All covariates are balanced.
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        32
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##           Variable Diff.Adj    M.Threshold
##  chem1a.grd.pt.unt   0.0733 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                   764.       573
## Matched (ESS)         214.85     573
## Matched (Unweighted)  457.       573
## Unmatched             307.         0

Check variable percent improvement

get.var.perc.tab(chem1b.bal)
##                               Variable       Diff.Un      Diff.Adj % Improvement
## 1                              p.score  0.7985494927  0.0016913096           100
## 2                    eth.erss_Hispanic  0.0501745201  0.0002326934           100
## 3               m.rmd_Remedial in Math  0.0868237347 -0.0002908668           100
## 4                         cMaj_Physics -0.0357766143  0.0000000000           100
## 5                              prevPAL  0.4896307024 -0.0266280848            95
## 6                              AP_CHEM -0.0460365334  0.0026776474            94
## 7                    adm.area_nonlocal -0.0423211169  0.0023851076            94
## 8                       cMaj_Nutrition -0.0261780105 -0.0020069808            92
## 9                     bot.level_Senior -0.0890052356  0.0098312973            89
## 10                      eth.erss_Other -0.0104712042  0.0013089005            88
## 11                    eth.erss_Unknown -0.0218150087  0.0034322280            84
## 12            cum.percent.units.passed -0.0084163831  0.0017311130            79
## 13              pell.coh.term.flg_Pell  0.1300174520  0.0307737056            76
## 14                 bot.level_Sophomore  0.0654450262  0.0189063409            71
## 15                      eth.erss_White -0.0580279232  0.0197498546            66
## 16             AP_CHEM.flg_Not Missing -0.0388307155  0.0141942990            63
## 17                      csus.gpa.start  0.1110693721  0.0559114629            50
## 18                      eth.erss_Asian  0.0580279232  0.0315299593            46
## 19                     cMaj_Undeclared  0.0052356021  0.0031413613            40
## 20 cMaj_Kinesiology/Physical Education  0.0296684119 -0.0209714951            29
## 21                    bot.level_Junior  0.0148342059 -0.0132635253            11
## 22                    eth.erss_Foreign  0.0000000000 -0.0026178010             0
## 23                      cMaj_Chemistry -0.0113438045 -0.0113728912             0
## 24                        cMaj_Biology  0.0431937173  0.0454043048            -5
## 25                  bot.level_Freshman  0.0087260035 -0.0154741129           -77
## 26                 cMaj_Health Science -0.0021815009 -0.0047993019          -120
## 27          eth.erss_Two or More Races -0.0139616056 -0.0310936591          -123
## 28                        cMaj_Geology  0.0008726003  0.0023560209          -170
## 29                          cMaj_OTHER -0.0034904014 -0.0117510180          -237
## 30                         gender_Male -0.0139616056  0.0474694590          -240
## 31           eth.erss_African American -0.0039267016 -0.0225421757          -474
## 32                   chem1a.grd.pt.unt -0.0065835834  0.0732743689         -1013

Check covariate balance visually

get.bal.plot(unmatched.tab, matched.tab)

love.plot(chem1b.bal,binary = "raw", stars = "std", var.order = "unadjusted", 
            thresholds = c(m = .1), abs = F)

Compare single and multiple matches for PAL and non-PAL

create.match.tab(chem1b.matched.dat)
PAL and Non-PAL Matches
Non-PAL PAL
Single Matches 168 331
Multiple Matches 289 242
Total Students 457 573

Out of 573 PAL students, 331 PAL students were matched to one non-PAL student and 242 PAL students were matched to multiple non-PAL students.

Out of 991 non-PAL student matches, there were 457 non-PAL students, 168 of the non-PAL students were matched to one PAL student and 289 of the non-PAL students were matched to multiple PAL students.

Plot of Propensity Scores for Average Treatment Effect Among the Treated (ATT)

get.att.plot(chem1b.final, match.chem1b)

Assess balance with prognostic score

The standardized mean differences of the prognostic scores is 0.0751, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.

## 
## Call:
## glm(formula = f.build("grd.pt.unt", chem1b.covs), data = ctrl.data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.38485  -0.44720   0.07171   0.50204   2.90517  
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -2.560100   0.506831  -5.051 5.54e-07 ***
## chem1a.grd.pt.unt                   0.383550   0.056839   6.748 3.04e-11 ***
## cum.percent.units.passed            0.263816   0.493095   0.535   0.5928    
## eth.erssAsian                       0.173864   0.153593   1.132   0.2580    
## eth.erssForeign                     0.230472   0.316260   0.729   0.4664    
## eth.erssHispanic                    0.120339   0.154148   0.781   0.4352    
## eth.erssTwo or More Races           0.224773   0.180187   1.247   0.2126    
## eth.erssUnknown                     0.213163   0.199194   1.070   0.2849    
## eth.erssWhite                       0.209831   0.154464   1.358   0.1747    
## eth.erssOther                       0.663180   0.269936   2.457   0.0142 *  
## genderMale                          0.137315   0.062484   2.198   0.0283 *  
## AP_CHEM                             0.112366   0.113839   0.987   0.3239    
## AP_CHEM.flgNot Missing              0.062124   0.113912   0.545   0.5857    
## prevPAL                            -0.161055   0.039232  -4.105 4.49e-05 ***
## bot.levelJunior                     0.054140   0.158553   0.341   0.7329    
## bot.levelSenior                     0.032432   0.160796   0.202   0.8402    
## bot.levelSophomore                 -0.021684   0.155768  -0.139   0.8893    
## m.rmdRemedial in Math              -0.077182   0.104382  -0.739   0.4599    
## pell.coh.term.flgPell              -0.079351   0.059006  -1.345   0.1791    
## cMajChemistry                       0.148660   0.090582   1.641   0.1012    
## cMajGeology                         0.131348   0.324729   0.404   0.6860    
## cMajHealth Science                  0.535358   0.304482   1.758   0.0791 .  
## cMajKinesiology/Physical Education  0.091306   0.083899   1.088   0.2768    
## cMajNutrition                      -0.289441   0.129282  -2.239   0.0255 *  
## cMajOTHER                          -0.084585   0.146957  -0.576   0.5651    
## cMajPhysics                        -0.009565   0.155174  -0.062   0.9509    
## cMajUndeclared                      0.241947   0.231955   1.043   0.2973    
## csus.gpa.start                      0.895163   0.101919   8.783  < 2e-16 ***
## adm.areanonlocal                   -0.036492   0.064317  -0.567   0.5706    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.6128765)
## 
##     Null deviance: 802.88  on 763  degrees of freedom
## Residual deviance: 450.46  on 735  degrees of freedom
## AIC: 1824.5
## 
## Number of Fisher Scoring iterations: 2
## Balance Measures
##                                         Type Diff.Adj    M.Threshold
## prog.score                          Distance   0.0751 Balanced, <0.1
## chem1a.grd.pt.unt                    Contin.   0.0733 Balanced, <0.1
## cum.percent.units.passed             Contin.   0.0017 Balanced, <0.1
## eth.erss_African American             Binary  -0.0225 Balanced, <0.1
## eth.erss_Asian                        Binary   0.0315 Balanced, <0.1
## eth.erss_Foreign                      Binary  -0.0026 Balanced, <0.1
## eth.erss_Hispanic                     Binary   0.0002 Balanced, <0.1
## eth.erss_Two or More Races            Binary  -0.0311 Balanced, <0.1
## eth.erss_Unknown                      Binary   0.0034 Balanced, <0.1
## eth.erss_White                        Binary   0.0197 Balanced, <0.1
## eth.erss_Other                        Binary   0.0013 Balanced, <0.1
## gender_Male                           Binary   0.0475 Balanced, <0.1
## AP_CHEM                              Contin.   0.0027 Balanced, <0.1
## AP_CHEM.flg_Not Missing               Binary   0.0142 Balanced, <0.1
## prevPAL                              Contin.  -0.0266 Balanced, <0.1
## bot.level_Freshman                    Binary  -0.0155 Balanced, <0.1
## bot.level_Junior                      Binary  -0.0133 Balanced, <0.1
## bot.level_Senior                      Binary   0.0098 Balanced, <0.1
## bot.level_Sophomore                   Binary   0.0189 Balanced, <0.1
## m.rmd_Remedial in Math                Binary  -0.0003 Balanced, <0.1
## pell.coh.term.flg_Pell                Binary   0.0308 Balanced, <0.1
## cMaj_Biology                          Binary   0.0454 Balanced, <0.1
## cMaj_Chemistry                        Binary  -0.0114 Balanced, <0.1
## cMaj_Geology                          Binary   0.0024 Balanced, <0.1
## cMaj_Health Science                   Binary  -0.0048 Balanced, <0.1
## cMaj_Kinesiology/Physical Education   Binary  -0.0210 Balanced, <0.1
## cMaj_Nutrition                        Binary  -0.0020 Balanced, <0.1
## cMaj_OTHER                            Binary  -0.0118 Balanced, <0.1
## cMaj_Physics                          Binary   0.0000 Balanced, <0.1
## cMaj_Undeclared                       Binary   0.0031 Balanced, <0.1
## csus.gpa.start                       Contin.   0.0559 Balanced, <0.1
## adm.area_nonlocal                     Binary   0.0024 Balanced, <0.1
## p.score                              Contin.   0.0017 Balanced, <0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        33
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##           Variable Diff.Adj    M.Threshold
##  chem1a.grd.pt.unt   0.0733 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                   764.       573
## Matched (ESS)         214.85     573
## Matched (Unweighted)  457.       573
## Unmatched             307.         0

Estimate Difference Between Mean grade in CHEM 1B of PAL and non-PAL students

The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.43171. This result is statistically significant with a P-value of \(1.7177x10^{-8}\) and is based on 573 PAL students and 991 non-PAL student matches(457 total non-PAL students) Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.

summary(match.chem1b)
## 
## Estimate...  0.43171 
## AI SE......  0.076567 
## T-stat.....  5.6383 
## p.val......  1.7177e-08 
## 
## Original number of observations..............  1337 
## Original number of treated obs...............  573 
## Matched number of observations...............  573 
## Matched number of observations  (unweighted).  991 
## 
## Caliper (SDs)........................................   0.25 
## Number of obs dropped by 'exact' or 'caliper'  0

Sensitivity Analysis

psens(match.chem1b, Gamma=2.0, GammaInc = 0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0000
##    1.2           0      0.0000
##    1.3           0      0.0000
##    1.4           0      0.0000
##    1.5           0      0.0005
##    1.6           0      0.0058
##    1.7           0      0.0363
##    1.8           0      0.1335
##    1.9           0      0.3217
##    2.0           0      0.5599
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 

Note that in the above table \(\Gamma=1.8\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “instructor”bot.level" or “eth.erss” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.

kable(sort(exp(abs(chem1b.first.order.prop.model$coefficients))))
x
eth.erssTwo or More Races 1.000886
AP_CHEM 1.017427
eth.erssWhite 1.075953
cMajChemistry 1.078078
chem1a.grd.pt.unt 1.099901
bot.levelSophomore 1.101998
cMajOTHER 1.103172
eth.erssForeign 1.126751
cMajHealth Science 1.165871
genderMale 1.211890
cMajUndeclared 1.239185
eth.erssHispanic 1.243158
eth.erssAsian 1.253423
adm.areanonlocal 1.277138
cMajKinesiology/Physical Education 1.310256
cMajGeology 1.315418
cMajNutrition 1.450910
cum.percent.units.passed 1.530134
pell.coh.term.flgPell 1.599900
csus.gpa.start 1.606985
eth.erssUnknown 1.622401
bot.levelJunior 1.743177
prevPAL 1.813838
m.rmdRemedial in Math 1.966786
bot.levelSenior 2.043680
AP_CHEM.flgNot Missing 2.260607
eth.erssOther 4.754146
cMajPhysics 9.264774
(Intercept) 11.231726

CHEM 4

Executive Summary

Based on data for 759 PAL students and 1929 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.36 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.

After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.44\pm 0.09\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 4, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.

Detailed Summary

A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 4 course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.

For Chem 4, 13 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: female gender, lower SAT scores, lower AP Calculus exam scores, higher term units attempted, academic major, class level, and CSUS GPA at start of term.

Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0794. Consequently, adequate balance appears to have been achieved.

The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.44\pm 0.09\) or between 0.35 and 0.53 on a 4.0 grade scale. This result is statistically significant with a P-value of \(2.19x10^{-7}\) and is based on 530 PAL students and 680 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.36.

The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 4. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.7 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 4 indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of class level, instructor, or cohort (Transfer or Native Freshmen) observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.

Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 759 PAL students in Chem 4 being reduced to 530. Also, 680 non-PAL students were selected out of 1929 original non-PAL students.

When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 1238 non-PAL matches. Of the 530 PAL students, 206 were matched one-to-one with non-PAL students and 324 were matched one-to-many with non-PAL students.

Extract CHEM 4 Data

The non-PAL and PAL groups will include students with only first attempts at CHEM 4. They will also include students with previous PAL participation and/or are currently in a PAL for another course.

# Excludes course repeats
chem4.dat <- chem.dat %>%
  filter(course=="CHEM 4", pass.term.flg == "PASS Term", course.seq== 0) 
dim(chem4.dat) # 2688  179
## [1] 2688  179

There are 2688 first attempt CHEM 4 students.

Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.

# Collapsed cMaj categories to Biology and Other majors at 0.09
with(chem4.dat, table(cMaj, palN))
##                                 palN
## cMaj                               0   1
##   Anthropology                     4   2
##   Art                              1   0
##   Biology                        590 324
##   Business                        14   2
##   Chemistry                      168  66
##   Child Devel/Early Childhood Ed   8   6
##   Civil Engineering              220  42
##   Communications                   5   1
##   Computer Engineering            21   8
##   Computer Science                17   7
##   Construction Management          2   2
##   Criminal Justice                16   8
##   Deaf Studies                     3   0
##   Economics                        1   0
##   Electrical Engineering          86  13
##   English                          3   3
##   Environmental Studies           24   6
##   Family & Consumer Sciences       1   0
##   Film                             1   0
##   Finance                          3   1
##   French                           1   0
##   Geology                         27   8
##   Gerontology                      2   0
##   Graphic Design                   3   0
##   Health Science                  17   6
##   History                          2   3
##   Humanities                       0   1
##   Interior Design                  1   0
##   International Business           1   2
##   Journalism                       1   1
##   Kinesiology/Physical Education 202  81
##   Liberal Studies                  8   0
##   Mathematics                      7   2
##   Mechanical Engineering         212  62
##   Music                            0   1
##   Nursing                         29  25
##   Nutrition                       89  35
##   Philosophy                       3   1
##   Photography                      0   1
##   Physical Science                 1   0
##   Physics                         13   3
##   Political Science                2   0
##   Psychology                      42   8
##   Recreation Administration        0   1
##   Social Science                   1   1
##   Social Work                      1   3
##   Sociology                        1   0
##   Spanish                          1   2
##   Speech Pathology                 8   0
##   Theatre Arts                     1   1
##   Undeclared                      61  20
chem4.dat <- group_category(data = chem4.dat, feature = "cMaj", threshold = 0.09,  update = TRUE)
with(chem4.dat, table(cMaj, palN))
##                                 palN
## cMaj                               0   1
##   Biology                        590 324
##   Chemistry                      168  66
##   Civil Engineering              220  42
##   Electrical Engineering          86  13
##   Environmental Studies           24   6
##   Geology                         27   8
##   Kinesiology/Physical Education 202  81
##   Mechanical Engineering         212  62
##   Nursing                         29  25
##   Nutrition                       89  35
##   OTHER                          179  69
##   Psychology                      42   8
##   Undeclared                      61  20

Analyze missingness

Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.

## [1] 40
##                        feature num_missing pct_missing
## 19               Instructor_02        2688  1.00000000
## 24                   deg.plan3        2688  1.00000000
## 25                   deg.plan4        2688  1.00000000
## 26                   deg.plan5        2688  1.00000000
## 27                   deg.plan6        2688  1.00000000
## 21             withdraw_reason        2671  0.99367560
## 23                   deg.plan2        2667  0.99218750
## 13                  trf.gpaADM        2330  0.86681548
## 4                  pledge.term        2157  0.80245536
## 29                grad.termERS        1738  0.64657738
## 22                   deg.plan1        1713  0.63727679
## 28                   grad.term        1713  0.63727679
## 30                         ttg        1713  0.63727679
## 1                fys.term.code        1579  0.58742560
## 2                      fys.grd        1579  0.58742560
## 3                  fys.rpt.flg        1579  0.58742560
## 20               treat.section        1367  0.50855655
## 15 ge.critical.thinking.status         813  0.30245536
## 17              ge.math.status         813  0.30245536
## 18         ge.oral.comm.status         813  0.30245536
## 16      ge.english.comp.status         812  0.30208333
## 14                  admit.term         790  0.29389881
## 33                plan.college         769  0.28608631
## 34           plan.college.desc         769  0.28608631
## 35                   plan.dept         769  0.28608631
## 36               plan.deptAbbr         769  0.28608631
## 37                 plan.degree         769  0.28608631
## 38                   plan.type         769  0.28608631
## 7               sat.math.score         519  0.19308036
## 8                 sat.math.flg         519  0.19308036
## 9             sat.verbal.score         519  0.19308036
## 10              sat.verbal.flg         519  0.19308036
## 11               sat.test.date         519  0.19308036
## 31      tot.passd.prgrss.start         419  0.15587798
## 32      tot.taken.prgrss.start         419  0.15587798
## 39    cum.percent.units.passed         419  0.15587798
## 40                      county         406  0.15104167
## 12                      hs.gpa         321  0.11941964
## 5                  m.rmd.admin         146  0.05431548
## 6           m.rmd.admin.detail         146  0.05431548

## [1] 2688  144

40 variables missing >5%
5 out of 40 variables were important and force included, even though they were missing >5%
So, 35 variables were removed due to missingness and there are now 144 variables instead of 179 variables.

Subset on Complete Cases only in CHEM 4 Data

chem4.dat <- chem4.dat[complete.cases(chem4.dat), ]
dim(chem4.dat) # 1723  144
## [1] 1723  144

1723 out of 2688 students are kept
965 students were removed due to missingness of variables

single.vars <- chem4.dat %>%
  summarise(across(everything(), ~ n_distinct(.x))) %>%
  select_if(. == 1)

# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem4.dat)
##                                            
##                                             Overall      
##   n                                         1723         
##   country = USA (%)                         1723 (100.0) 
##   career.course = UGRD (%)                  1723 (100.0) 
##   acad.prog.course = UGD (%)                1723 (100.0) 
##   course (%)                                             
##      CHEM 4                                 1723 (100.0) 
##      CHEM 1A                                   0 (  0.0) 
##      CHEM 1B                                   0 (  0.0) 
##      CHEM 24                                   0 (  0.0) 
##   component = LEC (%)                       1723 (100.0) 
##   units (mean (SD))                         3.00 (0.00)  
##   course.numeric (mean (SD))                4.00 (0.00)  
##   div = Lower Division (%)                  1723 (100.0) 
##   course.seq (mean (SD))                    0.00 (0.00)  
##   rpt.flg = First Attempt (%)               1723 (100.0) 
##   base.time.course (mean (SD))              1.00 (0.00)  
##   years (mean (SD))                         0.50 (0.00)  
##   enroll.status = Continuing (%)            1723 (100.0) 
##   withdraw_code = NWD (%)                   1723 (100.0) 
##   enrl.flg = Enrolled (%)                   1723 (100.0) 
##   enrl.flgERS = Enrolled (%)                1723 (100.0) 
##   rtn.flg = Retained (%)                    1723 (100.0) 
##   rtn.flgERS = Retained (%)                 1723 (100.0) 
##   pass.term.flg = PASS Term (%)             1723 (100.0) 
##   csus.gpa.start.flg = Not Missing (%)      1723 (100.0) 
##   higher.ed.gpa.start.flg = Not Missing (%) 1723 (100.0)
sum(single.vars) # 21
## [1] 21
# remove single-valued variables
chem4.dat<- chem4.dat %>%
  dplyr::select(-names(single.vars))
dim(chem4.dat) # 1723  123
## [1] 1723  123

123 out of 144 variables are kept
21 variables removed due to single values

Identify variables causing complete separation in logistic regression

# Remove non chem4 instructors
chem4.dat <- chem4.dat %>%
  droplevels(chem4.dat$Instructor_01)

# Combine sparse ethnicity categories to Other
chem4.dat <- chem4.dat %>%
  mutate(eth.erss = fct_collapse(eth.erss, `N.A/P.I` = c("Pacific Islander", "Native American")))
with(chem4.dat, table(eth.erss, palN))
##                    palN
## eth.erss              0   1
##   African American   70  32
##   Asian             361 132
##   Foreign            26   9
##   Hispanic          404 234
##   N.A/P.I            15  11
##   Two or More Races  64  31
##   Unknown            35   9
##   White             217  73
# Collapse sparse categories for acad.stand 
# Other:  Academic Dismissal, Academic Disqualification 
chem4.dat <- chem4.dat %>%
  mutate(acad.stand = fct_other(acad.stand, keep = c("Good Standing")))
with(chem4.dat, table(acad.stand, palN))
##                palN
## acad.stand         0    1
##   Good Standing 1076  498
##   Other          116   33

Filter to relevant variables

Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.

chem4.final <- chem4.step.vars(chem4.dat)
kable(names(chem4.final))
x
acad.stand
adm.area
bot.level
cMaj
coh
course.age
csus.gpa.start
cum.percent.units.passed
delay.from.hs
e.rmd
eth.erss
father.ed
fys.flg
gender
hous.coh.term.flg
hs.gpa
Instructor_01
median.income
m.rmd
mother.ed
pct.female.head
pell.coh.term.flg
prevPAL
prevPASS
reason
sac.county.flg
term.units.attemptedCensus
palN
grd.pt.unt
sat.math.score
sat.math.flg
sat.verbal.score
AP_BIOL
AP_CALAB
AP_CALBC
AP_CHEM
AP_BIOL.flg
AP_CALAB.flg
AP_CALBC.flg
AP_CHEM.flg
pct.female.head.flg
med.inc.flg

Build a Logistic Regression Model for Propensity Score

Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, sat.math.score, sat.verbal.score, and sat.math.flg (same as sat.verbal.flg). Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.

chem4.first.order.prop.model <- chem4.step(chem4.final)

summary(chem4.first.order.prop.model)
## 
## Call:
## glm(formula = model.first.order, family = binomial, data = chem4.final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0928  -0.8582  -0.5962   1.0823   2.5189  
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -0.313546   0.984757  -0.318 0.750182    
## cum.percent.units.passed           -0.874892   0.563737  -1.552 0.120674    
## eth.erssAsian                      -0.183902   0.257877  -0.713 0.475761    
## eth.erssForeign                     0.097238   0.470616   0.207 0.836309    
## eth.erssHispanic                    0.389267   0.249464   1.560 0.118663    
## eth.erssN.A/P.I                     0.426014   0.492404   0.865 0.386944    
## eth.erssTwo or More Races           0.108771   0.329229   0.330 0.741113    
## eth.erssUnknown                    -0.448564   0.460291  -0.975 0.329796    
## eth.erssWhite                      -0.124588   0.276471  -0.451 0.652252    
## genderMale                         -0.363496   0.130878  -2.777 0.005480 ** 
## sat.math.score                     -0.002785   0.001077  -2.586 0.009715 ** 
## sat.verbal.score                   -0.001747   0.001014  -1.723 0.084824 .  
## sat.math.flgold                     0.392623   0.243203   1.614 0.106444    
## AP_CALAB                           -0.411235   0.206074  -1.996 0.045981 *  
## AP_CALAB.flgNot Missing            -0.664814   0.309910  -2.145 0.031938 *  
## term.units.attemptedCensus          0.147671   0.027876   5.298 1.17e-07 ***
## Instructor_012                     -0.330922   0.600790  -0.551 0.581763    
## Instructor_013                     -0.720154   0.559506  -1.287 0.198051    
## Instructor_016                     -0.893954   0.785129  -1.139 0.254867    
## Instructor_017                     -0.395600   0.747739  -0.529 0.596762    
## Instructor_0110                    -0.181681   0.617853  -0.294 0.768718    
## Instructor_0112                     0.293936   0.508475   0.578 0.563214    
## Instructor_0120                     0.615611   0.549709   1.120 0.262763    
## Instructor_0122                    -0.816065   0.636632  -1.282 0.199896    
## cMajChemistry                      -0.405909   0.229873  -1.766 0.077429 .  
## cMajCivil Engineering              -0.870867   0.239665  -3.634 0.000279 ***
## cMajElectrical Engineering         -0.787875   0.378779  -2.080 0.037522 *  
## cMajEnvironmental Studies           0.232229   0.636605   0.365 0.715266    
## cMajGeology                        -1.415123   0.793576  -1.783 0.074550 .  
## cMajKinesiology/Physical Education -0.436497   0.185440  -2.354 0.018580 *  
## cMajMechanical Engineering         -0.657946   0.229622  -2.865 0.004166 ** 
## cMajNursing                         0.367788   0.367903   1.000 0.317463    
## cMajNutrition                      -0.730409   0.297964  -2.451 0.014233 *  
## cMajOTHER                          -0.444793   0.221100  -2.012 0.044248 *  
## cMajPsychology                     -1.636895   0.571105  -2.866 0.004154 ** 
## cMajUndeclared                     -0.518958   0.307354  -1.688 0.091321 .  
## bot.levelJunior                     0.475672   0.194301   2.448 0.014360 *  
## bot.levelSenior                     0.636372   0.428091   1.487 0.137138    
## bot.levelSophomore                  0.515050   0.145909   3.530 0.000416 ***
## csus.gpa.start                      0.332728   0.123350   2.697 0.006988 ** 
## cohTransfers                       -0.679730   0.456568  -1.489 0.136545    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2128.4  on 1722  degrees of freedom
## Residual deviance: 1892.6  on 1682  degrees of freedom
## AIC: 1974.6
## 
## Number of Fisher Scoring iterations: 4
p.score <- chem4.first.order.prop.model$fitted.values
chem4.covs <-  names(chem4.first.order.prop.model %>%  pluck("model") %>% dplyr::select(-palN)) 

Propensity Score Matching

Before matching

# Unadjusted mean grades
get.unadj.means(chem4.final)
Unadjusted Mean Grades
Non-PAL PAL Diff.
2.01552 2.343503 0.3279827
##                                         Stratified by palN
##                                          0              1              SMD   
##   n                                        1192            531               
##   cum.percent.units.passed (mean (SD))     0.87 (0.13)    0.85 (0.13)   0.125
##   eth.erss (%)                                                          0.251
##      African American                        70 ( 5.9)      32 ( 6.0)        
##      Asian                                  361 (30.3)     132 (24.9)        
##      Foreign                                 26 ( 2.2)       9 ( 1.7)        
##      Hispanic                               404 (33.9)     234 (44.1)        
##      N.A/P.I                                 15 ( 1.3)      11 ( 2.1)        
##      Two or More Races                       64 ( 5.4)      31 ( 5.8)        
##      Unknown                                 35 ( 2.9)       9 ( 1.7)        
##      White                                  217 (18.2)      73 (13.7)        
##   gender = Male (%)                         589 (49.4)     185 (34.8)   0.298
##   sat.math.score (mean (SD))             492.77 (71.49) 465.20 (69.12)  0.392
##   sat.verbal.score (mean (SD))           470.15 (73.09) 451.90 (72.72)  0.250
##   sat.math.flg = old (%)                   1042 (87.4)     495 (93.2)   0.197
##   AP_CALAB (mean (SD))                     2.51 (0.49)    2.51 (0.41)   0.003
##   AP_CALAB.flg = Not Missing (%)            148 (12.4)      41 ( 7.7)   0.156
##   term.units.attemptedCensus (mean (SD))  13.67 (2.19)   14.16 (2.23)   0.226
##   Instructor_01 (%)                                                     0.348
##      1                                       25 ( 2.1)       6 ( 1.1)        
##      2                                       43 ( 3.6)      11 ( 2.1)        
##      3                                      116 ( 9.7)      22 ( 4.1)        
##      6                                       19 ( 1.6)       4 ( 0.8)        
##      7                                       24 ( 2.0)       4 ( 0.8)        
##      10                                      37 ( 3.1)      12 ( 2.3)        
##      12                                     818 (68.6)     415 (78.2)        
##      20                                      69 ( 5.8)      48 ( 9.0)        
##      22                                      41 ( 3.4)       9 ( 1.7)        
##   cMaj (%)                                                              0.418
##      Biology                                361 (30.3)     232 (43.7)        
##      Chemistry                               85 ( 7.1)      36 ( 6.8)        
##      Civil Engineering                      139 (11.7)      31 ( 5.8)        
##      Electrical Engineering                  46 ( 3.9)      11 ( 2.1)        
##      Environmental Studies                    7 ( 0.6)       5 ( 0.9)        
##      Geology                                 15 ( 1.3)       2 ( 0.4)        
##      Kinesiology/Physical Education         142 (11.9)      70 (13.2)        
##      Mechanical Engineering                 150 (12.6)      39 ( 7.3)        
##      Nursing                                 18 ( 1.5)      18 ( 3.4)        
##      Nutrition                               52 ( 4.4)      20 ( 3.8)        
##      OTHER                                  102 ( 8.6)      45 ( 8.5)        
##      Psychology                              24 ( 2.0)       4 ( 0.8)        
##      Undeclared                              51 ( 4.3)      18 ( 3.4)        
##   bot.level (%)                                                         0.175
##      Freshman                               390 (32.7)     134 (25.2)        
##      Junior                                 210 (17.6)      98 (18.5)        
##      Senior                                  30 ( 2.5)      11 ( 2.1)        
##      Sophomore                              562 (47.1)     288 (54.2)        
##   csus.gpa.start (mean (SD))               2.94 (0.57)    3.01 (0.52)   0.129
##   coh = Transfers (%)                        33 ( 2.8)       7 ( 1.3)   0.103

Check how many variables have SMD > 0.1

addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
## 
## FALSE  TRUE   Sum 
##     1    13    14
get.imbal.vars(unmatched.tab)
Variables with SMD > 0.1
Variable Before Matching SMD
cMaj 0.4176251
sat.math.score 0.3920790
Instructor_01 0.3483413
gender 0.2984060
eth.erss 0.2508546
sat.verbal.score 0.2503217
term.units.attemptedCensus 0.2255303
sat.math.flg 0.1972337
bot.level 0.1752879
AP_CALAB.flg 0.1564959
csus.gpa.start 0.1288580
cum.percent.units.passed 0.1248668
coh 0.1026375

13 variables have SMD >0.1

Implement a propensity score matching method.

match.chem4 <- with(chem4.final, Match(
  Y=chem4.final$grd.pt.unt, Tr = chem4.final$palN, X = p.score, 
  BiasAdjust = F, estimand = "ATT",  M=1, caliper=0.25, replace = TRUE, ties = TRUE))

After matching

Standardized mean differences for continuous variables and categorical variables.

# Needed for match table
chem4.final <- chem4.final %>%
  rownames_to_column(var = "id")

# Matched data
chem4.matched.dat <- chem4.final[unlist(match.chem4[c("index.treated", "index.control")]), ]
chem4.matched.dat$match.weights<-  c(match.chem4$weights, match.chem4$weights)

# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem4.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem4.covs,  strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE, noSpaces = TRUE)
##                                         Stratified by palN
##                                          0              1              SMD   
##   n                                      530.00         530.00               
##   cum.percent.units.passed (mean (SD))   0.85 (0.13)    0.85 (0.13)    0.019 
##   eth.erss (%)                                                         0.147 
##      African American                    26.1 (4.9)     32.0 (6.0)           
##      Asian                               125.3 (23.6)   132.0 (24.9)         
##      Foreign                             11.8 (2.2)     9.0 (1.7)            
##      Hispanic                            243.9 (46.0)   233.0 (44.0)         
##      N.A/P.I                             4.4 (0.8)      11.0 (2.1)           
##      Two or More Races                   25.8 (4.9)     31.0 (5.8)           
##      Unknown                             12.7 (2.4)     9.0 (1.7)            
##      White                               80.1 (15.1)    73.0 (13.8)          
##   gender = Male (%)                      191.2 (36.1)   185.0 (34.9)   0.025 
##   sat.math.score (mean (SD))             465.40 (70.72) 465.40 (69.00) <0.001
##   sat.verbal.score (mean (SD))           453.36 (69.57) 452.25 (72.31) 0.016 
##   sat.math.flg = old (%)                 489.5 (92.3)   494.0 (93.2)   0.033 
##   AP_CALAB (mean (SD))                   2.51 (0.40)    2.51 (0.41)    0.004 
##   AP_CALAB.flg = Not Missing (%)         45.6 (8.6)     41.0 (7.7)     0.032 
##   term.units.attemptedCensus (mean (SD)) 14.19 (2.18)   14.16 (2.23)   0.014 
##   Instructor_01 (%)                                                    0.146 
##      1                                   11.7 (2.2)     6.0 (1.1)            
##      2                                   8.5 (1.6)      11.0 (2.1)           
##      3                                   20.7 (3.9)     22.0 (4.2)           
##      6                                   7.0 (1.3)      4.0 (0.8)            
##      7                                   7.1 (1.3)      4.0 (0.8)            
##      10                                  12.5 (2.4)     12.0 (2.3)           
##      12                                  418.0 (78.9)   414.0 (78.1)         
##      20                                  38.9 (7.3)     48.0 (9.1)           
##      22                                  5.6 (1.1)      9.0 (1.7)            
##   cMaj (%)                                                             0.134 
##      Biology                             238.6 (45.0)   231.0 (43.6)         
##      Chemistry                           33.2 (6.3)     36.0 (6.8)           
##      Civil Engineering                   25.4 (4.8)     31.0 (5.8)           
##      Electrical Engineering              16.4 (3.1)     11.0 (2.1)           
##      Environmental Studies               6.6 (1.2)      5.0 (0.9)            
##      Geology                             4.0 (0.7)      2.0 (0.4)            
##      Kinesiology/Physical Education      64.3 (12.1)    70.0 (13.2)          
##      Mechanical Engineering              35.3 (6.7)     39.0 (7.4)           
##      Nursing                             18.2 (3.4)     18.0 (3.4)           
##      Nutrition                           16.9 (3.2)     20.0 (3.8)           
##      OTHER                               48.4 (9.1)     45.0 (8.5)           
##      Psychology                          1.7 (0.3)      4.0 (0.8)            
##      Undeclared                          21.0 (4.0)     18.0 (3.4)           
##   bot.level (%)                                                        0.074 
##      Freshman                            119.0 (22.5)   134.0 (25.3)         
##      Junior                              95.6 (18.0)    98.0 (18.5)          
##      Senior                              12.6 (2.4)     11.0 (2.1)           
##      Sophomore                           302.7 (57.1)   287.0 (54.2)         
##   csus.gpa.start (mean (SD))             2.97 (0.50)    3.01 (0.52)    0.081 
##   coh = Transfers (%)                    5.7 (1.1)      7.0 (1.3)      0.023

Balance Check

Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group Binary variables: Raw differences in proportion

All variables are balanced and under the <0.1 mean threshold.

chem4.bal <- bal.tab(match.chem4, formula = f.build("palN", chem4.covs), data = chem4.final,
        distance = ~ p.score, thresholds = c(m = .1), un = TRUE, imbalanced.only = TRUE)
chem4.bal
## Balance Measures
## All covariates are balanced.
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        45
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##        Variable Diff.Adj    M.Threshold
##  csus.gpa.start   0.0794 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                  1192.       531
## Matched (ESS)         318.44     530
## Matched (Unweighted)  680.       530
## Unmatched             512.         0
## Discarded               0.         1

Check variable percent improvement

get.var.perc.tab(chem4.bal)
##                               Variable       Diff.Un      Diff.Adj % Improvement
## 1                              p.score  0.7912852849  2.660153e-04           100
## 2                       sat.math.score -0.3988551284 -9.585924e-05           100
## 3                         cMaj_Nursing  0.0187976339 -3.773585e-04            98
## 4                      Instructor_01_3 -0.0558841745  2.493261e-03            96
## 5                     sat.verbal.score -0.2509643508 -1.536474e-02            94
## 6           term.units.attemptedCensus  0.2235767269 -1.400638e-02            94
## 7                          gender_Male -0.1457282701 -1.173405e-02            92
## 8                     Instructor_01_12  0.0953026454 -7.520216e-03            92
## 9                         cMaj_Biology  0.1340591388 -1.434187e-02            89
## 10                    Instructor_01_10 -0.0084413984 -1.013028e-03            88
## 11         cMaj_Mechanical Engineering -0.0523925985  6.889039e-03            87
## 12            cum.percent.units.passed -0.1231213537  1.902176e-02            85
## 13                    sat.math.flg_old  0.0580423160  8.580413e-03            85
## 14                       coh_Transfers -0.0145018896  2.493261e-03            83
## 15              cMaj_Civil Engineering -0.0582303239  1.052785e-02            82
## 16            AP_CALAB.flg_Not Missing -0.0469482678 -8.728661e-03            81
## 17                   eth.erss_Hispanic  0.1017517916 -2.060872e-02            80
## 18                      eth.erss_Asian -0.0542647784  1.268643e-02            77
## 19                      eth.erss_White -0.0445705204 -1.332210e-02            70
## 20                     Instructor_01_2 -0.0153581946  4.683288e-03            70
## 21                     cMaj_Psychology -0.0126012715  4.429470e-03            65
## 22                    Instructor_01_22 -0.0174468206  6.331986e-03            64
## 23                  bot.level_Freshman -0.0748271591  2.823001e-02            62
## 24                        cMaj_Geology -0.0088174143 -3.679245e-03            58
## 25                 bot.level_Sophomore  0.0708963713 -2.965633e-02            58
## 26                     Instructor_01_7 -0.0126012715 -5.864780e-03            53
## 27                    Instructor_01_20  0.0325095742  1.725966e-02            47
## 28                    bot.level_Junior  0.0083829422  4.451932e-03            47
## 29                    eth.erss_Unknown -0.0124132636 -6.951932e-03            44
## 30         cMaj_Electrical Engineering -0.0178749731 -1.019317e-02            43
## 31                      csus.gpa.start  0.1357473318  7.943121e-02            41
## 32                     cMaj_Undeclared -0.0088869298 -5.754717e-03            35
## 33                     Instructor_01_6 -0.0084066406 -5.566038e-03            34
## 34                    bot.level_Senior -0.0044521543 -3.025606e-03            32
## 35          cMaj_Environmental Studies  0.0035437126 -2.955975e-03            17
## 36 cMaj_Kinesiology/Physical Education  0.0126992252  1.075921e-02            15
## 37                      cMaj_Nutrition -0.0059593776  5.824349e-03             2
## 38                            AP_CALAB  0.0034431498 -3.675201e-03            -7
## 39                    eth.erss_Foreign -0.0048629280 -5.332435e-03           -10
## 40                     Instructor_01_1 -0.0096737193 -1.080413e-02           -12
## 41                      cMaj_Chemistry -0.0035121147  5.370620e-03           -53
## 42                    eth.erss_N.A/P.I  0.0081317383  1.250000e-02           -54
## 43          eth.erss_Two or More Races  0.0046891391  9.838275e-03          -110
## 44           eth.erss_African American  0.0015388213  1.119048e-02          -627
## 45                          cMaj_OTHER -0.0008247071 -6.498203e-03          -688

Check covariate balance visually

get.bal.plot(unmatched.tab, matched.tab)

love.plot(chem4.bal,binary = "raw", stars = "std", var.order = "unadjusted", 
            thresholds = c(m = .1), abs = F)

Compare single and multiple matches for PAL and non-PAL

create.match.tab(chem4.matched.dat)
PAL and Non-PAL Matches
Non-PAL PAL
Single Matches 343 206
Multiple Matches 337 324
Total Students 680 530

Out of 531 PAL students, 530 were matched and 1 was unable to be matched. Out of 530 PAL student matches, 206 PAL students were matched to one non-PAL student and 324 PAL students were matched to multiple non-PAL students.

Out of 1238 non-PAL student matches, there were 680 non-PAL students, 343 of the non-PAL students were matched to one PAL student and 337 of the non-PAL students were matched to multiple PAL students.

Plot of Propensity Scores for Average Treatment Effect Among the Treated (ATT)

get.att.plot(chem4.final, match.chem4)

Assess balance with prognostic score

The standardized mean differences of the prognostic scores is 0.0900, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.

## 
## Call:
## glm(formula = f.build("grd.pt.unt", chem4.covs), data = ctrl.data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -3.09610  -0.71453   0.09259   0.72726   2.81319  
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -3.1551135  0.4687028  -6.732 2.64e-11 ***
## cum.percent.units.passed            0.5280403  0.3058402   1.727  0.08452 .  
## eth.erssAsian                       0.6156161  0.1398936   4.401 1.18e-05 ***
## eth.erssForeign                     0.4445376  0.2444358   1.819  0.06923 .  
## eth.erssHispanic                    0.4145461  0.1384392   2.994  0.00281 ** 
## eth.erssN.A/P.I                     1.1880648  0.3029530   3.922 9.32e-05 ***
## eth.erssTwo or More Races           0.5392068  0.1848077   2.918  0.00360 ** 
## eth.erssUnknown                     0.6844153  0.2210742   3.096  0.00201 ** 
## eth.erssWhite                       0.4857870  0.1486030   3.269  0.00111 ** 
## genderMale                         -0.0452349  0.0698444  -0.648  0.51734    
## sat.math.score                      0.0034058  0.0005773   5.900 4.78e-09 ***
## sat.verbal.score                   -0.0008005  0.0005374  -1.490  0.13660    
## sat.math.flgold                    -0.0266766  0.1141820  -0.234  0.81531    
## AP_CALAB                            0.0838340  0.0740435   1.132  0.25777    
## AP_CALAB.flgNot Missing             0.3171721  0.1142736   2.776  0.00560 ** 
## term.units.attemptedCensus         -0.0210416  0.0148051  -1.421  0.15552    
## Instructor_012                      0.6256332  0.2704276   2.313  0.02087 *  
## Instructor_013                     -0.0692320  0.2442575  -0.283  0.77689    
## Instructor_016                     -0.3435549  0.3370825  -1.019  0.30832    
## Instructor_017                      0.2694129  0.3038283   0.887  0.37541    
## Instructor_0110                    -0.0389665  0.2877060  -0.135  0.89229    
## Instructor_0112                     0.1022879  0.2288037   0.447  0.65492    
## Instructor_0120                    -0.0277218  0.2644355  -0.105  0.91653    
## Instructor_0122                     0.0430879  0.2833216   0.152  0.87915    
## cMajChemistry                       0.1952620  0.1279747   1.526  0.12734    
## cMajCivil Engineering               0.1616398  0.1126137   1.435  0.15146    
## cMajElectrical Engineering          0.0486928  0.1720735   0.283  0.77725    
## cMajEnvironmental Studies          -0.8540900  0.4077684  -2.095  0.03643 *  
## cMajGeology                        -0.5551327  0.2853390  -1.946  0.05196 .  
## cMajKinesiology/Physical Education  0.1843223  0.1093237   1.686  0.09206 .  
## cMajMechanical Engineering         -0.1065179  0.1122961  -0.949  0.34305    
## cMajNursing                        -0.5533050  0.2563063  -2.159  0.03107 *  
## cMajNutrition                      -0.0422353  0.1648899  -0.256  0.79789    
## cMajOTHER                           0.0852959  0.1240686   0.687  0.49191    
## cMajPsychology                      0.0750013  0.2259877   0.332  0.74004    
## cMajUndeclared                      0.0353021  0.1595944   0.221  0.82498    
## bot.levelJunior                     0.0487889  0.1048907   0.465  0.64192    
## bot.levelSenior                     0.0325395  0.2267044   0.144  0.88589    
## bot.levelSophomore                 -0.1937920  0.0758361  -2.555  0.01073 *  
## csus.gpa.start                      1.0094300  0.0660161  15.291  < 2e-16 ***
## cohTransfers                        0.0724838  0.2024442   0.358  0.72038    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.095629)
## 
##     Null deviance: 1984.6  on 1191  degrees of freedom
## Residual deviance: 1261.1  on 1151  degrees of freedom
## AIC: 3533.9
## 
## Number of Fisher Scoring iterations: 2
## Balance Measures
##                                         Type Diff.Adj    M.Threshold
## prog.score                          Distance   0.0900 Balanced, <0.1
## cum.percent.units.passed             Contin.   0.0190 Balanced, <0.1
## eth.erss_African American             Binary   0.0112 Balanced, <0.1
## eth.erss_Asian                        Binary   0.0127 Balanced, <0.1
## eth.erss_Foreign                      Binary  -0.0053 Balanced, <0.1
## eth.erss_Hispanic                     Binary  -0.0206 Balanced, <0.1
## eth.erss_N.A/P.I                      Binary   0.0125 Balanced, <0.1
## eth.erss_Two or More Races            Binary   0.0098 Balanced, <0.1
## eth.erss_Unknown                      Binary  -0.0070 Balanced, <0.1
## eth.erss_White                        Binary  -0.0133 Balanced, <0.1
## gender_Male                           Binary  -0.0117 Balanced, <0.1
## sat.math.score                       Contin.  -0.0001 Balanced, <0.1
## sat.verbal.score                     Contin.  -0.0154 Balanced, <0.1
## sat.math.flg_old                      Binary   0.0086 Balanced, <0.1
## AP_CALAB                             Contin.  -0.0037 Balanced, <0.1
## AP_CALAB.flg_Not Missing              Binary  -0.0087 Balanced, <0.1
## term.units.attemptedCensus           Contin.  -0.0140 Balanced, <0.1
## Instructor_01_1                       Binary  -0.0108 Balanced, <0.1
## Instructor_01_2                       Binary   0.0047 Balanced, <0.1
## Instructor_01_3                       Binary   0.0025 Balanced, <0.1
## Instructor_01_6                       Binary  -0.0056 Balanced, <0.1
## Instructor_01_7                       Binary  -0.0059 Balanced, <0.1
## Instructor_01_10                      Binary  -0.0010 Balanced, <0.1
## Instructor_01_12                      Binary  -0.0075 Balanced, <0.1
## Instructor_01_20                      Binary   0.0173 Balanced, <0.1
## Instructor_01_22                      Binary   0.0063 Balanced, <0.1
## cMaj_Biology                          Binary  -0.0143 Balanced, <0.1
## cMaj_Chemistry                        Binary   0.0054 Balanced, <0.1
## cMaj_Civil Engineering                Binary   0.0105 Balanced, <0.1
## cMaj_Electrical Engineering           Binary  -0.0102 Balanced, <0.1
## cMaj_Environmental Studies            Binary  -0.0030 Balanced, <0.1
## cMaj_Geology                          Binary  -0.0037 Balanced, <0.1
## cMaj_Kinesiology/Physical Education   Binary   0.0108 Balanced, <0.1
## cMaj_Mechanical Engineering           Binary   0.0069 Balanced, <0.1
## cMaj_Nursing                          Binary  -0.0004 Balanced, <0.1
## cMaj_Nutrition                        Binary   0.0058 Balanced, <0.1
## cMaj_OTHER                            Binary  -0.0065 Balanced, <0.1
## cMaj_Psychology                       Binary   0.0044 Balanced, <0.1
## cMaj_Undeclared                       Binary  -0.0058 Balanced, <0.1
## bot.level_Freshman                    Binary   0.0282 Balanced, <0.1
## bot.level_Junior                      Binary   0.0045 Balanced, <0.1
## bot.level_Senior                      Binary  -0.0030 Balanced, <0.1
## bot.level_Sophomore                   Binary  -0.0297 Balanced, <0.1
## csus.gpa.start                       Contin.   0.0794 Balanced, <0.1
## coh_Transfers                         Binary   0.0025 Balanced, <0.1
## p.score                              Contin.   0.0003 Balanced, <0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        46
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##        Variable Diff.Adj    M.Threshold
##  csus.gpa.start   0.0794 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                  1192.       531
## Matched (ESS)         318.44     530
## Matched (Unweighted)  680.       530
## Unmatched             512.         0
## Discarded               0.         1

Estimate Difference Between Mean grade in CHEM 4 of PAL and non-PAL students

The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.44347. This result is statistically significant with a P-value of \(4.3873x10^{-7}\) and is based on 530 PAL students and 1238 non-PAL student matches(680 total non-PAL students). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.

summary(match.chem4)
## 
## Estimate...  0.44347 
## AI SE......  0.087792 
## T-stat.....  5.0513 
## p.val......  4.3873e-07 
## 
## Original number of observations..............  1723 
## Original number of treated obs...............  531 
## Matched number of observations...............  530 
## Matched number of observations  (unweighted).  1238 
## 
## Caliper (SDs)........................................   0.25 
## Number of obs dropped by 'exact' or 'caliper'  1

Sensitivity Analysis

psens(match.chem4, Gamma=2.5, GammaInc = 0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0000
##    1.2           0      0.0000
##    1.3           0      0.0000
##    1.4           0      0.0001
##    1.5           0      0.0019
##    1.6           0      0.0228
##    1.7           0      0.1219
##    1.8           0      0.3517
##    1.9           0      0.6410
##    2.0           0      0.8566
##    2.1           0      0.9587
##    2.2           0      0.9912
##    2.3           0      0.9986
##    2.4           0      0.9998
##    2.5           0      1.0000
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 

Note that in the above table \(\Gamma=1.7\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.7 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “cMaj” or “coh” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.

kable(sort(exp(abs(chem4.first.order.prop.model$coefficients))))
x
sat.verbal.score 1.001748
sat.math.score 1.002788
eth.erssForeign 1.102122
eth.erssTwo or More Races 1.114907
eth.erssWhite 1.132682
term.units.attemptedCensus 1.159132
Instructor_0110 1.199232
eth.erssAsian 1.201897
cMajEnvironmental Studies 1.261409
Instructor_0112 1.341698
(Intercept) 1.368269
Instructor_012 1.392251
csus.gpa.start 1.394769
genderMale 1.438349
cMajNursing 1.444535
eth.erssHispanic 1.475899
sat.math.flgold 1.480860
Instructor_017 1.485276
cMajChemistry 1.500667
AP_CALAB 1.508679
eth.erssN.A/P.I 1.531142
cMajKinesiology/Physical Education 1.547277
cMajOTHER 1.560167
eth.erssUnknown 1.566062
bot.levelJunior 1.609096
bot.levelSophomore 1.673723
cMajUndeclared 1.680276
Instructor_0120 1.850787
bot.levelSenior 1.889612
cMajMechanical Engineering 1.930822
AP_CALAB.flgNot Missing 1.944130
cohTransfers 1.973346
Instructor_013 2.054749
cMajNutrition 2.075929
cMajElectrical Engineering 2.198720
Instructor_0122 2.261584
cMajCivil Engineering 2.388982
cum.percent.units.passed 2.398617
Instructor_016 2.444778
cMajGeology 4.116994
cMajPsychology 5.139186

CHEM 24

Executive Summary

Based on data for 177 PAL students and 224 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.43 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.

After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.71\pm 0.26\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 24, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.

Detailed Summary

A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 24 course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.

For Chem 24, 13 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: ethnicity, enrollment in PAL in the past, instructor, Biology major, being remedial in English, class level, lower percent of households with a female head in the zip code where the student lived at the time of application to CSUS, and lower median income where the student lived at the time of application to CSUS.

Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.145. Consequently, adequate balance appears to have been achieved.

The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.71\pm 0.26\) or between 0.45 and 0.97 on a 4.0 grade scale. This result is statistically significant with a P-value of \(3.16x10^{-3}\) and is based on 126 PAL students and 50 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.43.

The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 24. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 24 indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of enrollment in PAL in the past, CSUS GPA at start of term, or remedial in English observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.

Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 177 PAL students in Chem 24 being reduced to 126. Also, 50 non-PAL students were selected out of 224 original non-PAL students.

When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 130 non-PAL matches. Of the 126 PAL students, 122 were matched one-to-one with non-PAL students and 4 were matched one-to-many with non-PAL students.

Extract Chem 24 Data and Prerequisite Course Chem 1B

The non-PAL and PAL groups will include students with only first attempts at CHEM 24.They will also include students with previous PAL participation and/or are currently in a PAL for another course.

For the prerequisite course CHEM 1B, the grade for the student’s last attempt and the number of times it was taken are added to the CHEM 1B data set. Only 275 out of 401 CHEM 24 students have CHEM 1B grades.

However, few students retook Chem 1B so there was inadequate balance on number of times Chem 1B was taken, and it was removed from the propensity model after balance checks.

# Excludes course repeats
chem24.dat <- chem.dat %>%
  filter(course=="CHEM 24", pass.term.flg == "PASS Term", course.seq== 0)
dim(chem24.dat) # 401 179
## [1] 401 179
prereq <- "CHEM 1B"
chem24.dat <- get.prereq.grades(chem24.dat, chem.dat, prereq)
# Rename Chem 1B variables
chem24.dat <- chem24.dat %>%
  rename(chem1b.course.seq= prereq.course.seq, chem1b.grd.pt.unt= prereq.grd.pt.unt, chem1b.grade = prereq.grade)
dim(chem24.dat) # 275 182
## [1] 275 182

There is a total of 275 CHEM 24 first attempt only students with prerequisite CHEM 1B grades.

The variables below were added to the CHEM 24 data, so there are now 182 variables instead of 179 variables.
chem1b.course.seq: Is the student taking CHEM 1B for the first time or is it the second attempt, third attempt, etc.?
chem1b.grd.pt.unt: Numeric grade on 0 to 4 scale (0=F, 4=A) for CHEM 1B
chem1b.grade: Course grade for CHEM 1B

Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.

# Collapsed cMaj categories to Biology and Other majors at 0.10
with(chem24.dat, table(cMaj, palN))
##                                 palN
## cMaj                               0   1
##   Biology                        103 122
##   Business                         1   0
##   Chemistry                       23   5
##   Child Devel/Early Childhood Ed   1   0
##   Criminal Justice                 2   0
##   Dance                            1   0
##   Geology                          1   0
##   Gerontology                      1   0
##   Health Science                   1   0
##   Kinesiology/Physical Education   5   3
##   Liberal Studies                  1   0
##   Mechanical Engineering           2   0
##   Nutrition                        0   1
##   Psychology                       0   1
##   Sociology                        1   0
chem24.dat <- group_category(data = chem24.dat, feature = "cMaj", threshold = 0.10,  update = TRUE)
with(chem24.dat, table(cMaj, palN))
##          palN
## cMaj        0   1
##   Biology 103 122
##   OTHER    40  10

Analyze missingness

Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.

## [1] 35
##                        feature num_missing pct_missing
## 17               Instructor_02         275   1.0000000
## 19             withdraw_reason         275   1.0000000
## 22                   deg.plan3         275   1.0000000
## 23                   deg.plan4         275   1.0000000
## 24                   deg.plan5         275   1.0000000
## 25                   deg.plan6         275   1.0000000
## 21                   deg.plan2         274   0.9963636
## 20                   deg.plan1         243   0.8836364
## 26                   grad.term         243   0.8836364
## 27                grad.termERS         243   0.8836364
## 28                         ttg         243   0.8836364
## 11                  trf.gpaADM         242   0.8800000
## 4                  pledge.term         199   0.7236364
## 18               treat.section         181   0.6581818
## 1                fys.term.code         134   0.4872727
## 2                      fys.grd         134   0.4872727
## 3                  fys.rpt.flg         134   0.4872727
## 13 ge.critical.thinking.status          91   0.3309091
## 14      ge.english.comp.status          91   0.3309091
## 15              ge.math.status          91   0.3309091
## 16         ge.oral.comm.status          91   0.3309091
## 12                  admit.term          89   0.3236364
## 5               sat.math.score          47   0.1709091
## 6                 sat.math.flg          47   0.1709091
## 7             sat.verbal.score          47   0.1709091
## 8               sat.verbal.flg          47   0.1709091
## 9                sat.test.date          47   0.1709091
## 29                plan.college          41   0.1490909
## 30           plan.college.desc          41   0.1490909
## 31                   plan.dept          41   0.1490909
## 32               plan.deptAbbr          41   0.1490909
## 33                 plan.degree          41   0.1490909
## 34                   plan.type          41   0.1490909
## 35                      county          39   0.1418182
## 10                      hs.gpa          30   0.1090909

## [1] 275 147

35 variables missing >10%
So, 35 variables were removed due to missingness and there are now 147 variables instead of 182 variables.

Subset on Complete Cases only in Chem 24 Data

chem24.dat <- chem24.dat[complete.cases(chem24.dat), ]
dim(chem24.dat) # 265 147
## [1] 265 147

265 out of 275 students are kept
10 students were removed due to missingness of variables

single.vars <- chem24.dat %>%
  summarise(across(everything(), ~ n_distinct(.x))) %>%
  select_if(. == 1)

# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem24.dat)
##                                            
##                                             Overall       
##   n                                           265         
##   AP_PHYSC (mean (SD))                       2.40 (0.00)  
##   AP_PHYSM (mean (SD))                       2.47 (0.00)  
##   AP_PHYSC.flg = Missing (%)                  265 (100.0) 
##   AP_PHYSM.flg = Missing (%)                  265 (100.0) 
##   country = USA (%)                           265 (100.0) 
##   med.inc.flg = Not Missing (%)               265 (100.0) 
##   career.course = UGRD (%)                    265 (100.0) 
##   acad.prog.course = UGD (%)                  265 (100.0) 
##   course (%)                                              
##      CHEM 4                                     0 (  0.0) 
##      CHEM 1A                                    0 (  0.0) 
##      CHEM 1B                                    0 (  0.0) 
##      CHEM 24                                  265 (100.0) 
##   component = LEC (%)                         265 (100.0) 
##   units (mean (SD))                          3.00 (0.00)  
##   course.numeric (mean (SD))                24.00 (0.00)  
##   div = Lower Division (%)                    265 (100.0) 
##   course.seq (mean (SD))                     0.00 (0.00)  
##   rpt.flg = First Attempt (%)                 265 (100.0) 
##   c2s = Non-C2S (%)                           265 (100.0) 
##   base.time.course (mean (SD))               1.00 (0.00)  
##   years (mean (SD))                          0.50 (0.00)  
##   enroll.status = Continuing (%)              265 (100.0) 
##   withdraw_code = NWD (%)                     265 (100.0) 
##   enrl.flg = Enrolled (%)                     265 (100.0) 
##   enrl.flgERS = Enrolled (%)                  265 (100.0) 
##   pst.flg = Enrolled (%)                      265 (100.0) 
##   pst.flgERS = Enrolled (%)                   265 (100.0) 
##   grad.flg = Not Graduated (%)                265 (100.0) 
##   grad.flgERS = Not Graduated (%)             265 (100.0) 
##   rtn.flg = Retained (%)                      265 (100.0) 
##   rtn.flgERS = Retained (%)                   265 (100.0) 
##   pass.term.flg = PASS Term (%)               265 (100.0) 
##   csus.gpa.start.flg = Not Missing (%)        265 (100.0) 
##   higher.ed.gpa.start.flg = Not Missing (%)   265 (100.0)
sum(single.vars) # 1
## [1] 31
# remove single-valued variables
chem24.dat<- chem24.dat %>%
  dplyr::select(-names(single.vars))
dim(chem24.dat) # 265 116
## [1] 265 116

116 out of 147 variables are kept
31 variables removed due to single values

Identify variables causing complete separation in logistic regression

# Remove non chem24 instructors
chem24.dat <- chem24.dat %>%
  droplevels(chem24.dat$Instructor_01)

# Combine sparse ethnicity categories to Other
chem24.dat <- chem24.dat %>%
  mutate(eth.erss = fct_other(eth.erss, drop = c("Foreign", "Pacific Islander", "Unknown")))
with(chem24.dat, table(eth.erss, palN))
##                    palN
## eth.erss             0  1
##   African American   5 10
##   Asian             50 35
##   Hispanic          30 38
##   Two or More Races  9  9
##   White             38 28
##   Other              7  6
# Only one Freshman student, so combined with Sophomore
chem24.dat <- chem24.dat %>%
  mutate(bot.level = fct_collapse(bot.level, Sophomore = c("Sophomore", "Freshman")))
with(chem24.dat, table(bot.level, palN))
##            palN
## bot.level    0  1
##   Sophomore 22 27
##   Junior    50 60
##   Senior    67 39

Filter to relevant variables

Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.

chem24.final <- chem24.step.vars(chem24.dat)
kable(names(chem24.final))
x
acad.stand
adm.area
bot.level
cMaj
coh
course.age
csus.gpa.start
cum.percent.units.passed
delay.from.hs
e.rmd
eth.erss
father.ed
fys.flg
gender
hous.coh.term.flg
Instructor_01
median.income
m.rmd
mother.ed
pct.female.head
pell.coh.term.flg
prevPAL
prevPASS
reason
sac.county.flg
term.units.attemptedCensus
palN
grd.pt.unt
chem1b.grd.pt.unt
AP_BIOL
AP_CALAB
AP_CALBC
AP_CHEM
AP_BIOL.flg
AP_CALAB.flg
AP_CALBC.flg
AP_CHEM.flg
pct.female.head.flg

Build a Logistic Regression Model for Propensity Score

Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, and chem1b.grd.pt.unt . Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.

## 
## Call:
## glm(formula = model.first.order, family = binomial, data = chem24.final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3386  -0.8796  -0.1799   0.8482   2.2670  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -7.508e-01  2.893e+00  -0.260  0.79521    
## chem1b.grd.pt.unt         -3.697e-01  2.564e-01  -1.442  0.14940    
## cum.percent.units.passed   3.443e+00  2.976e+00   1.157  0.24738    
## eth.erssAsian             -1.543e+00  7.270e-01  -2.123  0.03377 *  
## eth.erssHispanic          -1.155e+00  7.313e-01  -1.579  0.11438    
## eth.erssTwo or More Races -1.209e+00  8.740e-01  -1.383  0.16657    
## eth.erssWhite             -1.817e+00  7.544e-01  -2.409  0.01599 *  
## eth.erssOther             -1.906e+00  9.454e-01  -2.016  0.04380 *  
## genderMale                 6.725e-02  3.340e-01   0.201  0.84043    
## prevPAL                    8.030e-01  1.464e-01   5.485 4.13e-08 ***
## Instructor_015            -4.481e-01  4.879e-01  -0.918  0.35839    
## Instructor_0110           -1.730e+00  4.521e-01  -3.827  0.00013 ***
## Instructor_0114           -4.912e-01  4.002e-01  -1.227  0.21967    
## cMajOTHER                 -1.572e+00  4.910e-01  -3.202  0.00137 ** 
## e.rmdRemedial in English   1.013e+00  4.046e-01   2.504  0.01227 *  
## bot.levelJunior           -2.517e-01  4.541e-01  -0.554  0.57942    
## bot.levelSenior           -1.353e+00  5.719e-01  -2.365  0.01801 *  
## delay.from.hs              1.506e-01  1.121e-01   1.343  0.17919    
## csus.gpa.start             8.065e-01  5.858e-01   1.377  0.16859    
## pct.female.head           -8.324e-02  3.506e-02  -2.374  0.01759 *  
## median.income             -2.480e-05  1.264e-05  -1.962  0.04979 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 366.73  on 264  degrees of freedom
## Residual deviance: 270.15  on 244  degrees of freedom
## AIC: 312.15
## 
## Number of Fisher Scoring iterations: 5

Propensity Score Matching

Before matching

# Unadjusted mean grades
get.unadj.means(chem24.final)
Unadjusted Mean Grades
Non-PAL PAL Diff.
1.689209 2.101587 0.4123787

Standardized mean differences for continuous variables and categorical variables.

unmatched.tab <- CreateTableOne(vars = chem24.covs, strata = "palN", 
                             data = chem24.final, smd = TRUE, test = FALSE)
print(unmatched.tab, smd = TRUE)
##                                       Stratified by palN
##                                        0                   1                   SMD   
##   n                                         139                 126                  
##   chem1b.grd.pt.unt (mean (SD))            2.28 (0.86)         2.32 (0.82)      0.048
##   cum.percent.units.passed (mean (SD))     0.92 (0.09)         0.92 (0.07)      0.091
##   eth.erss (%)                                                                  0.309
##      African American                         5 ( 3.6)           10 ( 7.9)           
##      Asian                                   50 (36.0)           35 (27.8)           
##      Hispanic                                30 (21.6)           38 (30.2)           
##      Two or More Races                        9 ( 6.5)            9 ( 7.1)           
##      White                                   38 (27.3)           28 (22.2)           
##      Other                                    7 ( 5.0)            6 ( 4.8)           
##   gender = Male (%)                          56 (40.3)           41 (32.5)      0.162
##   prevPAL (mean (SD))                      1.03 (1.23)         1.87 (1.12)      0.715
##   Instructor_01 (%)                                                             0.527
##      4                                       36 (25.9)           52 (41.3)           
##      5                                       14 (10.1)           21 (16.7)           
##      10                                      50 (36.0)           20 (15.9)           
##      14                                      39 (28.1)           33 (26.2)           
##   cMaj = OTHER (%)                           37 (26.6)           10 ( 7.9)      0.510
##   e.rmd = Remedial in English (%)            20 (14.4)           34 (27.0)      0.315
##   bot.level (%)                                                                 0.358
##      Sophomore                               22 (15.8)           27 (21.4)           
##      Junior                                  50 (36.0)           60 (47.6)           
##      Senior                                  67 (48.2)           39 (31.0)           
##   delay.from.hs (mean (SD))                3.72 (2.07)         3.40 (2.67)      0.132
##   csus.gpa.start (mean (SD))               3.11 (0.48)         3.19 (0.42)      0.185
##   pct.female.head (mean (SD))             19.44 (6.32)        19.43 (7.09)      0.001
##   median.income (mean (SD))            66770.71 (19689.75) 64023.91 (18940.87)  0.142

Check how many variables have SMD > 0.1

addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
## 
## FALSE  TRUE   Sum 
##     3    10    13
get.imbal.vars(unmatched.tab)
Variables with SMD > 0.1
Variable Before Matching SMD
prevPAL 0.7154758
Instructor_01 0.5273159
cMaj 0.5099804
bot.level 0.3584403
e.rmd 0.3147882
eth.erss 0.3094389
csus.gpa.start 0.1849083
gender 0.1615446
median.income 0.1421813
delay.from.hs 0.1317131

10 variables have SMD >0.1

Implement a propensity score matching method.

match.chem24 <- with(chem24.final, Match(
  Y=chem24.final$grd.pt.unt, Tr = chem24.final$palN, X = p.score, 
  BiasAdjust = F, estimand = "ATT",  M=1, caliper=0.25, replace = TRUE, ties = TRUE))

After matching

Standardized mean differences for continuous variables and categorical variables.

# Needed for match table
chem24.final <- chem24.final %>%
  rownames_to_column(var = "id")

# Matched data
chem24.matched.dat <- chem24.final[unlist(match.chem24[c("index.treated", "index.control")]), ]
chem24.matched.dat$match.weights<-  c(match.chem24$weights, match.chem24$weights)

# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem24.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem24.covs,  strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
##                                       Stratified by palN
##                                        0                   1                   SMD   
##   n                                      126.00              126.00                  
##   chem1b.grd.pt.unt (mean (SD))            2.35 (0.85)         2.32 (0.82)      0.037
##   cum.percent.units.passed (mean (SD))     0.93 (0.09)         0.92 (0.07)      0.028
##   eth.erss (%)                                                                  0.330
##      African American                      14.5 (11.5)         10.0 ( 7.9)           
##      Asian                                 45.0 (35.7)         35.0 (27.8)           
##      Hispanic                              36.0 (28.6)         38.0 (30.2)           
##      Two or More Races                      6.0 ( 4.8)          9.0 ( 7.1)           
##      White                                 23.5 (18.7)         28.0 (22.2)           
##      Other                                  1.0 ( 0.8)          6.0 ( 4.8)           
##   gender = Male (%)                        50.5 (40.1)         41.0 (32.5)      0.157
##   prevPAL (mean (SD))                      1.82 (1.34)         1.87 (1.12)      0.042
##   Instructor_01 (%)                                                             0.066
##      4                                     55.0 (43.7)         52.0 (41.3)           
##      5                                     22.0 (17.5)         21.0 (16.7)           
##      10                                    19.0 (15.1)         20.0 (15.9)           
##      14                                    30.0 (23.8)         33.0 (26.2)           
##   cMaj = OTHER (%)                         19.0 (15.1)         10.0 ( 7.9)      0.225
##   e.rmd = Remedial in English (%)          24.0 (19.0)         34.0 (27.0)      0.189
##   bot.level (%)                                                                 0.116
##      Sophomore                             28.0 (22.2)         27.0 (21.4)           
##      Junior                                65.5 (52.0)         60.0 (47.6)           
##      Senior                                32.5 (25.8)         39.0 (31.0)           
##   delay.from.hs (mean (SD))                3.00 (1.21)         3.40 (2.67)      0.193
##   csus.gpa.start (mean (SD))               3.20 (0.41)         3.19 (0.42)      0.017
##   pct.female.head (mean (SD))             18.76 (6.39)        19.43 (7.09)      0.100
##   median.income (mean (SD))            61841.54 (13978.07) 64023.91 (18938.54)  0.131

Balance Check

Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group
Binary variables: Raw differences in proportion

The variables below, “delay.from.hs” and “median.income” are not under the <0.1 mean threshold, but they are under the <0.25 mean threshold.

## Balance Measures
##                  Type Diff.Un Diff.Adj        M.Threshold
## delay.from.hs Contin. -0.1178   0.1500 Not Balanced, >0.1
## median.income Contin. -0.1450   0.1152 Not Balanced, >0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        22
## Not Balanced, >0.1     2
## 
## Variable with the greatest mean difference
##       Variable Diff.Adj        M.Threshold
##  delay.from.hs     0.15 Not Balanced, >0.1
## 
## Sample sizes
##                      Control Treated
## All                   139.       126
## Matched (ESS)          25.48     126
## Matched (Unweighted)   50.       126
## Unmatched              89.         0

Check variable percent improvement

get.var.perc.tab(chem24.bal)
##                      Variable       Diff.Un     Diff.Adj % Improvement
## 1                     p.score  1.4150085757  0.002828720           100
## 2            Instructor_01_10 -0.2009820715  0.007936508            96
## 3                     prevPAL  0.7509922479  0.045889456            94
## 4              csus.gpa.start  0.1977011453 -0.016216685            92
## 5             Instructor_01_5  0.0659472422 -0.007936508            88
## 6         bot.level_Sophomore  0.0560123330 -0.007936508            86
## 7             Instructor_01_4  0.1537056069 -0.023809524            85
## 8           eth.erss_Hispanic  0.0857599635  0.015873016            81
## 9            bot.level_Senior -0.1724905790  0.051587302            70
## 10   cum.percent.units.passed  0.1012499852 -0.030907174            69
## 11           bot.level_Junior  0.1164782460 -0.043650794            63
## 12                 cMaj_OTHER -0.1868219710 -0.071428571            62
## 13  e.rmd_Remedial in English  0.1259563778  0.079365079            37
## 14             eth.erss_White -0.0511590727  0.035714286            30
## 15          chem1b.grd.pt.unt  0.0493920155 -0.037396912            24
## 16              median.income -0.1450193421  0.115220532            21
## 17  eth.erss_African American  0.0433938563 -0.035714286            18
## 18             eth.erss_Asian -0.0819344524 -0.079365079             3
## 19                gender_Male -0.0774808724 -0.075396825             3
## 20              delay.from.hs -0.1177689275  0.150005260           -27
## 21           Instructor_01_14 -0.0186707777  0.023809524           -28
## 22 eth.erss_Two or More Races  0.0066803700  0.023809524          -256
## 23             eth.erss_Other -0.0027406646  0.039682540         -1348
## 24            pct.female.head -0.0007108937  0.094801544        -13236

Check covariate balance visually

get.bal.plot(unmatched.tab, matched.tab)

love.plot(chem24.bal,binary = "raw", stars = "std", var.order = "unadjusted", 
            thresholds = c(m = .1), abs = F)

Compare single and multiple matches for PAL and non-PAL

create.match.tab(chem24.matched.dat)
PAL and Non-PAL Matches
Non-PAL PAL
Single Matches 23 122
Multiple Matches 27 4
Total Students 50 126

Out of 126 PAL students, 122 PAL students were matched to one non-PAL student and 4 PAL students were matched to multiple non-PAL students.

Out of 130 non-PAL student matches, there were 50 non-PAL students, 23 of the non-PAL students were matched to one PAL student and 27 of the non-PAL students were matched to multiple PAL students.

Plot of Propensity Scores for Average Treatment Effect Among the Treated (ATT)

get.att.plot(chem24.final, match.chem24)

Assess balance with prognostic score

The standardized mean differences of the prognostic scores is 0.0194, which indicates balance. The variables, “delay.from.hs” and “median.income”, are not under the 0.10 mean difference threshold, but they are under the 0.25 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the imbalanced variables are not highly prognostic and the estimated prognostic score is balanced.

## 
## Call:
## glm(formula = f.build("grd.pt.unt", chem24.covs), data = ctrl.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4293  -0.5496   0.1083   0.6382   2.5388  
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -2.151e+00  1.772e+00  -1.214    0.227    
## chem1b.grd.pt.unt          7.545e-01  1.440e-01   5.238 7.19e-07 ***
## cum.percent.units.passed   2.546e-01  1.688e+00   0.151    0.880    
## eth.erssAsian             -4.020e-01  4.818e-01  -0.834    0.406    
## eth.erssHispanic          -1.580e-01  4.970e-01  -0.318    0.751    
## eth.erssTwo or More Races -7.777e-01  5.815e-01  -1.338    0.184    
## eth.erssWhite             -1.290e-01  4.959e-01  -0.260    0.795    
## eth.erssOther              4.655e-01  6.339e-01   0.734    0.464    
## genderMale                -1.011e-01  1.916e-01  -0.528    0.599    
## prevPAL                   -5.747e-02  7.795e-02  -0.737    0.462    
## Instructor_015             1.694e+00  3.509e-01   4.826 4.21e-06 ***
## Instructor_0110            1.558e+00  2.444e-01   6.373 3.73e-09 ***
## Instructor_0114            1.081e+00  2.400e-01   4.503 1.59e-05 ***
## cMajOTHER                  2.367e-01  2.236e-01   1.058    0.292    
## e.rmdRemedial in English   2.513e-02  2.837e-01   0.089    0.930    
## bot.levelJunior            9.747e-02  2.922e-01   0.334    0.739    
## bot.levelSenior           -3.210e-01  3.450e-01  -0.930    0.354    
## delay.from.hs              1.016e-02  7.022e-02   0.145    0.885    
## csus.gpa.start             1.764e-01  3.344e-01   0.528    0.599    
## pct.female.head            6.096e-03  2.337e-02   0.261    0.795    
## median.income              8.076e-06  7.239e-06   1.116    0.267    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.9597258)
## 
##     Null deviance: 240.75  on 138  degrees of freedom
## Residual deviance: 113.25  on 118  degrees of freedom
## AIC: 409.98
## 
## Number of Fisher Scoring iterations: 2
## Balance Measures
##                                Type Diff.Adj        M.Threshold
## prog.score                 Distance   0.0194     Balanced, <0.1
## chem1b.grd.pt.unt           Contin.  -0.0374     Balanced, <0.1
## cum.percent.units.passed    Contin.  -0.0309     Balanced, <0.1
## eth.erss_African American    Binary  -0.0357     Balanced, <0.1
## eth.erss_Asian               Binary  -0.0794     Balanced, <0.1
## eth.erss_Hispanic            Binary   0.0159     Balanced, <0.1
## eth.erss_Two or More Races   Binary   0.0238     Balanced, <0.1
## eth.erss_White               Binary   0.0357     Balanced, <0.1
## eth.erss_Other               Binary   0.0397     Balanced, <0.1
## gender_Male                  Binary  -0.0754     Balanced, <0.1
## prevPAL                     Contin.   0.0459     Balanced, <0.1
## Instructor_01_4              Binary  -0.0238     Balanced, <0.1
## Instructor_01_5              Binary  -0.0079     Balanced, <0.1
## Instructor_01_10             Binary   0.0079     Balanced, <0.1
## Instructor_01_14             Binary   0.0238     Balanced, <0.1
## cMaj_OTHER                   Binary  -0.0714     Balanced, <0.1
## e.rmd_Remedial in English    Binary   0.0794     Balanced, <0.1
## bot.level_Sophomore          Binary  -0.0079     Balanced, <0.1
## bot.level_Junior             Binary  -0.0437     Balanced, <0.1
## bot.level_Senior             Binary   0.0516     Balanced, <0.1
## delay.from.hs               Contin.   0.1500 Not Balanced, >0.1
## csus.gpa.start              Contin.  -0.0162     Balanced, <0.1
## pct.female.head             Contin.   0.0948     Balanced, <0.1
## median.income               Contin.   0.1152 Not Balanced, >0.1
## p.score                     Contin.   0.0028     Balanced, <0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        23
## Not Balanced, >0.1     2
## 
## Variable with the greatest mean difference
##       Variable Diff.Adj        M.Threshold
##  delay.from.hs     0.15 Not Balanced, >0.1
## 
## Sample sizes
##                      Control Treated
## All                   139.       126
## Matched (ESS)          25.48     126
## Matched (Unweighted)   50.       126
## Unmatched              89.         0

Estimate Difference Between Mean grade in Chem 24 of PAL and non-PAL students

The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.70754 . This result is statistically significant with a P-value of 0.0063135. and is based on 126 PAL students and 130 non-PAL student matches(50 non-PAL students total). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.

summary(match.chem24)
## 
## Estimate...  0.70754 
## AI SE......  0.25907 
## T-stat.....  2.731 
## p.val......  0.0063135 
## 
## Original number of observations..............  265 
## Original number of treated obs...............  126 
## Matched number of observations...............  126 
## Matched number of observations  (unweighted).  130 
## 
## Caliper (SDs)........................................   0.25 
## Number of obs dropped by 'exact' or 'caliper'  0

Sensitivity Analysis

psens(match.chem24, Gamma=2.0, GammaInc = 0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0001
##    1.2           0      0.0004
##    1.3           0      0.0015
##    1.4           0      0.0041
##    1.5           0      0.0095
##    1.6           0      0.0193
##    1.7           0      0.0352
##    1.8           0      0.0586
##    1.9           0      0.0905
##    2.0           0      0.1310
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 

Note that in the above table \(\Gamma=1.8\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to csus.gpa.start or ethnicity the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.

kable(sort(exp(abs(chem24.first.order.prop.model$coefficients))))
x
median.income 1.000025
genderMale 1.069562
pct.female.head 1.086803
delay.from.hs 1.162585
bot.levelJunior 1.286195
chem1b.grd.pt.unt 1.447259
Instructor_015 1.565295
Instructor_0114 1.634353
(Intercept) 2.118759
prevPAL 2.232138
csus.gpa.start 2.240135
e.rmdRemedial in English 2.754383
eth.erssHispanic 3.172879
eth.erssTwo or More Races 3.350293
bot.levelSenior 3.867930
eth.erssAsian 4.679341
cMajOTHER 4.817476
Instructor_0110 5.641448
eth.erssWhite 6.155689
eth.erssOther 6.725577
cum.percent.units.passed 31.271288

Propensity Score Adjusted Mean Grades

Adjusted Mean Grades
Course Non-PAL PAL Diff. Std. error p-val Sensitivity N(non-PAL) N(PAL)
BIO 22 1.71 2.31 0.60 0.13 1.28e-06 1.6 332 326
CHEM 1A 1.48 1.98 0.50 0.07 2.43e-13 1.8 711 757
CHEM 1B 1.76 2.19 0.43 0.08 8.59e-09 1.8 457 573
CHEM 4 1.90 2.35 0.44 0.09 2.19e-07 1.7 680 530
CHEM 24 1.39 2.10 0.71 0.26 3.16e-03 1.8 50 126

References

Greifer, Noah. 2020. Cobalt: Covariate Balance Tables and Plots. https://CRAN.R-project.org/package=cobalt.

Leite, W. L. 2017. Practical Propensity Score Methods Using R. Thousand Oaks, CA: Sage Publishing. https://osf.io/nygb5/.

Sekhon, Jasjeet S. 2011. “Multivariate and Propensity Score Matching Software with Automated Balance Optimization: The Matching Package for R.” Journal of Statistical Software 42 (7): 1–52. http://www.jstatsoft.org/v42/i07/.

Yoshida, Kazuki, and Alexander Bartel. 2020. Tableone: Create ’Table 1’ to Describe Baseline Characteristics with or Without Propensity Score Weights. https://CRAN.R-project.org/package=tableone.

Zhang, Z., H. J. Kim, G. Lonjon, Y. Zhu, and written on behalf of AME Big-Data Clinical Trial Collaborative Group. 2019. “Balance Diagnostics After Propensity Score Matching.” Annals of Translational Medicine 7 (1): 16. https://doi.org/10.21037/atm.2018.12.10.