Load packages

First Add Some R Packages to the Workspace.
Caution: warning messages are suppressed to reduce clutter in the output.

tidyverse: Importing data, cleaning data, data manipulation, & data visualization
kableExtra: Build HTML tables
DataExplorer: Exploratory Data Analysis & Feature Engineering
tableone: Standardized mean differences for before and after matching
survey: Matched data with match weights
Matching: Propensity score matching
cobalt: Covariate balance
reshape2: Covariate balance plot
rbounds: Rosenbaum Sensitivity test

library(tidyverse)
library(kableExtra)
library(DataExplorer) 
library(tableone)
library(survey)
library(Matching)
library(cobalt)
library(reshape2)
library(rbounds)

select <- dplyr::select # Resolves package conflicts with select
options(width = 120) # Format print width

Load functions

General functions used throughout the analysis.

# Update palN for Chem 24 Spring 2019 ------------------------------------------
update.chem24s19 <- function(chem.dat) {
  PAL.course.data <- read_rds("palCourseData.rds")
  chem24.S19 <- PAL.course.data  %>%
    filter(term == "Spring 2019", course == "CHEM 24")
  # Add a palN indicator for Chem 24 Spring 2019
  chem24.S19 <- chem24.S19 %>%
    mutate(palN.chem24.S19 = case_when(
      pal.grade == "CR" ~ 2,
      is.na(pal.grade) ~ 0,
      TRUE ~ 1
    )) %>%
    select(emplid, palN.chem24.S19) 
  
  # Check how many student are non-PAL, incomplete PAL, and PAL
  table(chem24.S19$palN.chem24.S19)
  # 0  1  2 
  # 51 10 52 
  
  chem.dat <- left_join(chem.dat, chem24.S19, by= "emplid" )
  
  chem.dat  <- chem.dat %>%
    mutate(palN = case_when(
      course == "CHEM 24" & term == "Spring 2019" ~ palN.chem24.S19,
      TRUE ~ palN
    )) %>%
    select(-palN.chem24.S19)
  
  return(chem.dat)
}

# Get raw table of mean gpa for PAL and non-PAL  -------------------------------
get.raw.tab <- function(classes, df)
{ 
 raw.table = data.frame(class=character(),
                         nonPALavg=numeric(),
                         PALavg=numeric(), 
                         Diff=numeric(), 
                         NonPAL_Num= integer(),
                         PAL_Num=integer(),
                         CompletePAL=numeric(),
                         TermPALStart=integer(),
                         row.names=NULL,
                         stringsAsFactors = FALSE)
  
  
  for (i in 1:length(classes))
  {
    curr.class = classes[i]
    temp = subset(df, course==curr.class & course.seq==0)
    pal.start=min(unique(temp$term.code[temp$palN==2]))
    # only include terms after PAL start term
    temp = subset(temp, term.code>= pal.start)
    
    x=tapply(temp$grd.pt.unt,temp$palN, 
             mean, na.rm=T) %>% 
      as.numeric %>% 
      round(2)
    
    y=table(temp$palN) %>% as.numeric
    
    raw.table[i, 'class' ] = curr.class
    raw.table[i, c(2:4,7)]=c(x[1], x[3],x[3]-x[1],
                             round(y[3]/sum(y),2))
    raw.table[i, c(5,6,8)]= c(y[1], y[3], pal.start)
    
  }
  
  # formatted table
  raw.table <- kable(raw.table, caption = "Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)") %>%
    kable_styling(full_width= T, position = "left")
 
   return(raw.table)
}

# Data cleaning ----------------------------------------------------------------
clean.data <- function(df)
{
  # Replaced coh.term with coh.term.course
  yr.course.taken = as.numeric(gsub(".*([0-9]{4})","\\1",df$coh.term.course))
  df$delay.from.hs = ifelse(!is.na(yr.course.taken) & !is.na(df$hs.grad.yr),
                                  yr.course.taken-df$hs.grad.yr, NA)
  
  sum(is.na(df$delay.from.hs)) 
  
  # remove students who did not complete PAL 
  df=subset(df, palN!=1) 
  
  #recode palN to factor with 0/1 levels
  df$palN = ifelse(df$palN==2, 1, 0)
  
  #clean up category names in m.rmd and e.rmd
  df$m.rmd[df$m.rmd=="Not Remedial\nin Math"]="Not Remedial in Math"
  df$m.rmd[df$m.rmd=="Remedial\nin Math"]="Remedial in Math"
  df$e.rmd[df$e.rmd=="Not Remedial\nin English"]="Not Remedial in English"
  df$e.rmd[df$e.rmd=="Remedial\nin English"]="Remedial in English"
  
  df <- df %>% mutate(m.rmd = factor(m.rmd), e.rmd = factor(e.rmd))
  # table(df$e.rmd)
  
  # Create feature, proportion of cumulative units taken that were passes
  # To distinguish the students who have taken 0 units from the students who 
  #   have passed 0  units they have taken, students who have taken 0 units are 
  #   labeled as -1. Then the -1 is replaced by the mean of cum.percent.units.passed
  df <- df %>%
    mutate(cum.percent.units.passed = ifelse(tot.taken.prgrss.start == 0, -1,
                                             tot.passd.prgrss.start / tot.taken.prgrss.start)) %>%
    mutate(cum.percent.units.passed = ifelse(cum.percent.units.passed  == -1, mean(cum.percent.units.passed,  na.rm =TRUE),
                                             cum.percent.units.passed  ))
  
  # code instructor as alphabetic letter for anonymity
  df$Instructor_01=droplevels(factor(df$Instructor_01))
  
  instructor.vec = sort(unique(df$Instructor_01))
  num.instr = length(instructor.vec)
  
  df$Instructor_01 = factor(
    df$Instructor_01, levels = instructor.vec, labels=as.character(1:num.instr)
  )
  
  key.instr.code = cbind(as.character(instructor.vec), 1:num.instr)

  # Add "cMaj", census majors without concentrations/specializations/tracks/etc. 
  major_lookup <- read.csv("Census Major Lookup.csv", header = TRUE, 
                           stringsAsFactors = FALSE)
  df <- merge(df, major_lookup %>% select(censusMajor, cMaj),
              by = "censusMajor", all.x = TRUE)

  # Recode mother's education and father's education variables.
  non.hs.grad= c("No High School","Some High School")
  hs.grad= c("High School Graduate","Some College","2-Year College Graduate")
  coll.grad= c("4-Year College Graduate","Postgraduate")
  parent.ed.levels= c(
    "Non-HS Graduate","HS Graduate", "College Graduate", "Unknown"
  )
  
  df <- df %>%
    mutate(
      mother.ed = ifelse(mother.ed %in% non.hs.grad, "Non-HS Graduate",
        ifelse(mother.ed %in% hs.grad, "HS Graduate", 
          ifelse(mother.ed %in% coll.grad, "College Graduate", "Unknown"))),
      mother.ed= factor(mother.ed, levels= parent.ed.levels),
      father.ed = ifelse(father.ed %in% non.hs.grad,"Non-HS Graduate",
        ifelse(father.ed %in% hs.grad, "HS Graduate", 
          ifelse(father.ed %in% coll.grad, "College Graduate", "Unknown"))),
      father.ed= factor(father.ed, levels= parent.ed.levels))
  
  # Recoded adm.area with these counties as local: 'El Dorado', 'Nevada', 
  #   'Placer', 'Sacramento', 'San Joaquin', 'Solano', 'Yolo'.
  counties.rad <- read_csv(
    "countiesRadius120mi.csv", 
    col_types = cols(
      state = col_skip(), city = col_skip(), distance.km = col_skip()
    )
  )                                     
  
  df <- left_join(df, counties.rad, by = "zip")
  
  local.adm.counties <- c(
    'El Dorado', 'Nevada', 'Placer', 'Sacramento', 'San Joaquin', 'Solano', 
    'Yolo'
  )
  
  # County will be NA if the zip code is not within 120 mile radius of 
  #   CSUS zip code(95819) 
  df <- df %>%
    mutate(
      adm.area = 
        if_else(!(county %in% local.adm.counties) | is.na(county), 
                         "nonlocal", "local")
    ) %>%
    mutate(sac.county.flg =
             if_else(!(county == "Sacramento") | is.na(county), 0, 1)
    ) %>%
    mutate(sac.county.flg = factor(sac.county.flg))

return(df)
}

# Extract prerequisite course grade ---------------------------------------------
get.prereq.grades <- function(course.df, df, prereq) {
  # Get student's recent Chem 1B grade
  course.stu <- course.df$emplid
  prereq.df <- df %>%
    select(emplid, course, course.seq, grd.pt.unt, grade) %>%
    filter(emplid %in% course.stu, course== prereq) %>% 
    group_by(emplid) %>%
    filter(course.seq == max(course.seq)) %>%
    rename(
      prereq.course.seq = course.seq, prereq.grd.pt.unt = grd.pt.unt, 
      prereq.grade = grade
    ) %>% 
    select(-course)
  
  dim(prereq.df) # [1] 275   6
  prereq.stu <- prereq.df$emplid
  
  course.df <- course.df %>%
    filter(emplid %in% prereq.stu)
  course.df <- left_join(course.df, prereq.df, by = "emplid")
  
  return(course.df)
}

# Get only the variables that have missing values ---------------------------------------------
get.missing.only <- function(course.df) {
  get.missing.only <- course.df %>% 
    summarise(across(everything(), ~ sum(is.na(.x)))) %>%
    gather() %>%
    filter(value != 0) 
  get.missing.only <- course.df %>%
    dplyr::select(all_of(get.missing.only$key)) 

  return(get.missing.only)
}

# Get imbalanced variables with SMD > 0.1------------------------------------
get.imbal.vars <- function(tab)
{
  get.imbal.vars <- as.data.frame(ExtractSmd(tab))
  get.imbal.vars <- get.imbal.vars %>%
    rownames_to_column(var = "Variable") %>%
    rename(`Before Matching SMD` = `1 vs 2`) %>%
    filter(`Before Matching SMD` > 0.1) %>% 
    arrange(desc(`Before Matching SMD`))
  get.imbal.vars <- kable(
    get.imbal.vars, caption = "Variables with SMD > 0.1"
    ) %>%
    kable_styling(full_width= F)
  
  return(get.imbal.vars)
}
# Unadjusted means -------------------------------------------------------------
get.unadj.means <- function(df.final)
{
  get.unadj.means <- df.final %>%
    group_by(palN) %>% summarise(unadj.means = mean(grd.pt.unt)) %>%
    pivot_wider(names_from = "palN", values_from = "unadj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Diff. = `PAL`-`Non-PAL`)
  
  get.unadj.means<- kable(
    get.unadj.means, caption = "Unadjusted Mean Grades"
    ) %>%
    kable_styling(full_width= F)
  
  return(get.unadj.means)
}
# Adjusted means  --------------------------------------------------------------
adj.means <- function(match.list, matched.dat) {
  get.adj.means <- matched.dat %>%
    group_by(palN) %>% 
    summarise(adj.means = weighted.mean(grd.pt.unt, match.list$weights)) %>%
    pivot_wider(names_from = "palN", values_from = "adj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Diff. = `PAL`-`Non-PAL`)
  
  # formatted table
  get.adj.means<- kable(get.adj.means, caption = "Adjusted Mean Grades") %>%
    kable_styling(full_width= F)
  
  return(get.adj.means)
}

# Match Table ------------------------------------------------------------------
create.match.tab <- function(matched.dat) {
  matched.dat <- matched.dat %>%
    mutate(pal = if_else(palN == 0, "Non-PAL", "PAL"))
  pal.flg <- c('Non-PAL', 'PAL')
  
  for (i in seq_along(pal.flg)) {
    multiple.matches <- matched.dat %>%
      filter(pal ==pal.flg[i]) %>%
      count(id) %>%
      filter(n> 1) %>%
      summarise(n())
    single.matches <- matched.dat %>%
      filter(pal == pal.flg[i]) %>%
      count(id) %>%
      filter(n==1) %>%
      summarise(n())
    if(pal.flg[i] == 'Non-PAL') {
      match.tab <- bind_rows(single.matches,  multiple.matches)
      match.tab <- match.tab %>%
        rename('Non-PAL'= 'n()')
    }
    pal.matches <- bind_rows(single.matches, multiple.matches)
    match.tab$PAL <- pal.matches$`n()`
    row.names(match.tab) <- c("Single Matches", "Multiple Matches")
  } 
  match.tab <-rbind(
    match.tab, "Total Students" = c(sum(match.tab$`Non-PAL`), sum(match.tab$`PAL`))
  )
  match.tab <- kable(match.tab, caption = "PAL and Non-PAL Matches") %>%
    kable_styling(full_width= F)
  
  return(match.tab)
}

# ATT plot ---------------------------------------------------------------------
# https://livefreeordichotomize.com/2019/01/17/understanding-propensity-score-weighting/
# https://www.csus.edu/brand/colors.html
get.att.plot <- function(df.final, match.list)
{
  df.final$p.score <- p.score
  
  df.final <- df.final %>%
    select(-id) %>%
    rownames_to_column(var = "id")

  ps.dat <- df.final %>%
    select(id, palN, p.score) %>%
    pivot_wider(
      names_from = "palN", values_from = "p.score", names_prefix = "b.pal."
    )
  before.match <- ps.dat %>%
    select(b.pal.0, b.pal.1)
  
  matched.dat <- df.final[unlist(match.list[c("index.treated", "index.control")]), ]
  matched.dat$match.weights<-  c(match.list$weights, match.list$weights)

  after.match <-matched.dat %>% 
    select(-id) %>%
    rownames_to_column(var = "id")
  after.match <- after.match %>%
    pivot_wider(names_from = "palN", values_from = "p.score", names_prefix = "pal.")
  after.match <- after.match %>%
    select(pal.0, pal.1, match.weights)
  
  get.att.plot <- ggplot() +
    geom_histogram(data = before.match, bins = 50, aes(b.pal.1), alpha = 0.5) + 
    geom_histogram(data = after.match,bins = 50, aes(pal.1, weight = match.weights), 
                   fill = "#043927", alpha = 0.5) + 
    geom_histogram(data = before.match, bins = 50, alpha = 0.5, 
                   aes(x = b.pal.0, y = -..count..)) + 
    geom_histogram(data = after.match, bins = 50, 
                   aes(x = pal.0, weight = match.weights, y = -..count..), 
                   fill = "#c4b581", alpha = 0.5) + 
    ylab("Count") + xlab("Propensity Scores") +
    geom_hline(yintercept = 0, lwd = 0.5) +
    scale_y_continuous(label = abs) 

return(get.att.plot)
}

# Variable Percent Improvement -------------------------------------------------
get.var.perc.tab <- function(list.bal) {
  get.var.perc.tab <- list.bal %>%
    pluck("Balance") %>%
    rownames_to_column("Variable") %>%
    dplyr::select("Variable", "Type", "Diff.Un","Diff.Adj") %>%
    mutate(`% Improvement` = if_else(Diff.Un == 0, 0, round(((abs(Diff.Un) - abs(Diff.Adj))/ abs(Diff.Un)) * 100 , 0))) %>%
    arrange(desc(`% Improvement`))
  get.var.perc.tab <- get.var.perc.tab %>% dplyr::select("Variable", "Diff.Un", "Diff.Adj", `% Improvement`)
  
  return(get.var.perc.tab)
}

# Covariate Balance Plots -------------------------------------------------------
# https://cran.r-project.org/web/packages/tableone/vignettes/smd.html
# https://www.csus.edu/brand/colors.html
get.bal.plot <- function(unmatched.tab, matched.tab) {
  ## Construct a data frame containing variable name and SMD from all methods
  dataPlot <- data.frame(variable  = rownames(ExtractSmd(unmatched.tab)),
                         Unmatched = as.numeric(ExtractSmd(unmatched.tab)),
                         Matched   = as.numeric(ExtractSmd(matched.tab))  )
  
  ## Create long-format data for ggplot2
  dataPlotMelt <- melt(data          = dataPlot,
                       id.vars       = c("variable"),
                       variable.name = "Method",
                       value.name    = "SMD")
  
  ## Order variable names by magnitude of SMD
  varNames <- as.character(dataPlot$variable)[order(dataPlot$Unmatched)]
  
  ## Order factor levels in the same order
  dataPlotMelt$variable <- factor(dataPlotMelt$variable,
                                  levels = varNames)
  
  ## Plot using ggplot2
  # Sac State colors and dashed line
  get.bal.plot <-ggplot(
    data = dataPlotMelt, mapping = 
      aes(x = variable, y = SMD, group = Method, color= Method)) +
    scale_color_manual(values = c("#043927", "#c4b581")) +
    geom_line(aes(linetype = Method)) +
    geom_point() +
    scale_linetype_manual(values= c("dashed", "solid")) +
    geom_hline(yintercept = 0.1, color = "black", size = 0.1) +
    coord_flip() +
    theme_bw() + theme(legend.key = element_blank())
  
  return(get.bal.plot)
}

# PAL Effect -------------------------------------------------------------------
get.pal.effect <- function(match.list, matched.dat, course) {  
 get.gamma <- psens(match.list, Gamma=2.0, GammaInc = 0.1)[["bounds"]] %>%
    filter(`Lower bound` < 0.05 & 0.05 < `Upper bound`) %>%
    slice_min(Gamma) %>% 
    select(Gamma) 
  
  get.pal.effect <-  matched.dat %>%
    group_by(palN) %>% 
    summarise(adj.means = weighted.mean(grd.pt.unt, match.list$weights)) %>%
    pivot_wider(names_from = "palN", values_from = "adj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Course= course, .before= "Non-PAL") %>%
    mutate(Diff. = `PAL`-`Non-PAL`) %>%
    mutate(`Std. error`= match.list$se, .after= "Diff.") %>%
    mutate(
      `p-val`= formatC( 1-pnorm(Diff./`Std. error`), format = "e", digits = 2), 
      Sensitivity= get.gamma$Gamma, 
      `N(non-PAL)`= length(unique(match.list$index.control)),
      `N(PAL)`= match.list$wnobs
    )

  return(get.pal.effect)
  }

Specialized functions for each course.

## BIO 22 ====================================================================
## Filter to relevant variables 
bio22.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 'course.count',
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa',
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'sat.math.score',
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',    
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg',    'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(bio22.dat))
  bio22.final <- bio22.dat[ ,new.vars]
  
  return(bio22.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
bio22.step <- function(final.df) {
  # AP_CALAB 
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender + sat.math.score +
      sat.verbal.score + sat.math.flg + AP_CALAB + AP_CALAB.flg, 
    data= bio22.final, family=binomial
  )
  summary(min.model)
  
  biggest <- formula(glm(palN ~. - grd.pt.unt,  data=bio22.final, family=binomial))
  bio22.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2)
  summary(bio22.step.first.order)
  bio22.step.first.order$anova
  
  model.first.order <- formula(bio22.step.first.order)
  bio22.first.order.prop.model <- glm(
    model.first.order, data=bio22.final, family=binomial
  )
  
  return(bio22.first.order.prop.model)
}


## CHEM 1A ====================================================================
## Filter to relevant variables 
chem1a.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age',   
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
    'eth.erss', 'father.ed','fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg',
    'term.units.attemptedCensus','palN', 'grd.pt.unt', 'sat.math.score', 
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem1a.dat))
  chem1a.final <- chem1a.dat[ ,new.vars]
  
  return(chem1a.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem1a.step <- function(final.df) {
  # Stepwise selection selected AP_CALAB.flg, AP_BIOL.flg, AP_CHEM, and
  # AP_CHEM.flg
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender + sat.math.score + 
      sat.verbal.score + sat.math.flg + AP_CALAB + AP_CALAB.flg + AP_BIOL +
      AP_BIOL.flg + AP_CHEM + AP_CHEM.flg, data= chem1a.final, family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt, data=chem1a.final, family=binomial)
  )

  chem1a.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2
  )
  summary(chem1a.step.first.order)
  chem1a.step.first.order$anova
  
  model.first.order <- formula(chem1a.step.first.order)
  chem1a.first.order.prop.model <- glm(
    model.first.order, data=chem1a.final, family=binomial
  )
  
  return(chem1a.first.order.prop.model)
}

## CHEM 1B ====================================================================
## Filter to relevant variables
chem1b.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01','median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'chem1a.grd.pt.unt',
    'AP_BIOL',  'AP_CALAB', 'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg',
    'AP_CALBC.flg', 'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem1b.dat))
  chem1b.final <- chem1b.dat[ ,new.vars]

  return(chem1b.final)
}

## Build a Logistic Regression Model for Propensity Score
## Fit Propensity Score model (linear terms only)
chem1b.step <- function(final.df) {
  # Stepwise selection selected AP_BIOL.flg and AP_CHEM.flg
  # Removed AP_BIOL.flg. Then stepwise selection selected AP_CALAB.flg.
  # Removed AP_CALAB.flg and pct.female.head.flg
  min.model <- glm(
    palN ~ chem1a.grd.pt.unt + cum.percent.units.passed + eth.erss + gender +
      AP_CHEM + AP_CHEM.flg, data= chem1b.final, family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt - AP_BIOL.flg - AP_CALAB.flg - pct.female.head.flg,  
        data=chem1b.final, family=binomial)
  )

  chem1b.step.first.order <- step(min.model,
                                  direction="forward",scope = biggest,
                                  trace=FALSE, k=2)
  summary(chem1b.step.first.order)
  chem1b.step.first.order$anova

  model.first.order <- formula(chem1b.step.first.order)
  chem1b.first.order.prop.model <- glm(model.first.order, data=chem1b.final, family=binomial)

  return(chem1b.first.order.prop.model)
}

## CHEM 4 ====================================================================
## Filter to relevant variables 
chem4.step.vars <- function(course.df)
{
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
    'eth.erss', 'father.ed','fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01','median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt','sat.math.score',
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  
  new.vars <- intersect(vars.to.keep, names(chem4.dat))
  chem4.final <- chem4.dat[ ,new.vars]
  
  return(chem4.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem4.step <- function(final.df)
{
 # "AP_BIOL"
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender+ sat.math.score + 
      sat.verbal.score+sat.math.flg + AP_CALAB+AP_CALAB.flg, data= chem4.final, 
    family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt - AP_BIOL, data=chem4.final, family=binomial)
  )

  chem4.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2)
  summary(chem4.step.first.order)
  chem4.step.first.order$anova
  
  model.first.order <- formula(chem4.step.first.order)
  chem4.first.order.prop.model <- glm(
    model.first.order, data=chem4.final, family=binomial
    )
  
  return(chem4.first.order.prop.model)
}

## CHEM 24 ====================================================================
## Filter to relevant variables 
chem24.step.vars <- function(course.df)
{
  vars.to.keep <- c(  
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'chem1b.grd.pt.unt', 
    'chem1b.term.gpa', 'chem1b.units.attempted', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem24.dat))
  chem24.final <- chem24.dat[ ,new.vars]
  
  return(chem24.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem24.step <- function(final.df) {
  min.model <- glm(
    palN ~ chem1b.grd.pt.unt + cum.percent.units.passed + eth.erss + gender,
    data= chem24.final, family=binomial
  )
  summary(min.model)
  
  biggest <- formula(
    glm(palN ~.- grd.pt.unt - acad.stand - reason - pct.female.head.flg, 
        data=chem24.final, family=binomial)
  )
  
  chem24.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2
    )
  summary(chem24.step.first.order)
  chem24.step.first.order$anova
  
  model.first.order <- formula(chem24.step.first.order)
  chem24.first.order.prop.model <- glm(
    model.first.order, data=chem24.final, family=binomial
  )
 
  return(chem24.first.order.prop.model)
}

Import the Data

Make sure the PAL datafile in the same directory as this RMarkdown file.

PALdatafull <- read_rds("paldatafull_csv.rds")
dim(PALdatafull)
## [1] 1099371     174
sum(PALdatafull$grd.pt.unt)
## [1] 2237555

The files which includes data through the Spring 2019 semester has 1099371 rows and 174 columns. The total of the grd.pt.unt column is 2237555.

Chemistry classes

Subset data for chemistry classes

chem.classes <- paste("CHEM", c(4, '1A', '1B', 24))
chem.dat <- PALdatafull %>%
  filter(base.time.course == 1, course %in% chem.classes) %>%
  mutate(course = factor(course, levels = chem.classes)) 
dim(chem.dat) #  18948   174
## [1] 18948   174
num.stu <- dim(chem.dat)[1]
num.vars <- dim(chem.dat)[2]

There are 18948 rows and 174 variables. Each row is a chemistry student. So, there is a total of 18948 chemistry students.

Update CHEM 24 Spring 2019 for chemistry data

There are 83 first attempt only Chem 24 Spring 2019 students. Some of them are incorrectly labeled as non-PAL and need to be relabeled.

with(chem.dat %>% 
       filter(base.time.course == 1, pass.term.flg == "PASS Term",  course == "CHEM 24",  term == "Spring 2019", course.seq == 0), 
     table(palN))
## palN
##  0 
## 83
chem.dat <- update.chem24s19(chem.dat)

with(chem.dat %>% 
       filter(base.time.course == 1, pass.term.flg == "PASS Term",  course == "CHEM 24",  term == "Spring 2019", course.seq == 0), 
     table(palN))
## palN
##  0  1  2 
## 28  9 46
#  0  1  2 
# 28  9 46 

After relabeling, there are 28 non-PAL students, 9 incomplete PAL students, and 46 PAL students for Chem 24 Spring 2019.

Compare the mean gpa for PAL and non-PAL students by Course without Propensity Score Adjustment

The course.seq variable indicate how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.

Note: Excludes incomplete PAL students

get.raw.tab(chem.classes, chem.dat)
Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)
class nonPALavg PALavg Diff NonPAL_Num PAL_Num CompletePAL TermPALStart
CHEM 4 2.03 2.39 0.36 1929 759 0.28 2123
CHEM 1A 1.63 2.04 0.41 1717 1055 0.37 2128
CHEM 1B 1.70 2.11 0.41 1090 769 0.40 2138
CHEM 24 1.63 2.06 0.43 224 177 0.43 2178

Data Cleaning & Feature Engineering

Create new variables.
delay.from.hs: delay since high school
cum.percent.units.passed: cumulative percent of units passed
cMaj: census majors without concentrations/specializations/tracks/etc.
county: which county did the student live in at the time of application to Sac state
sac.county.flg: did the student live in Sacramento county at the time of application to Sac State

Collapse sparse categories and other miscellaneous clean up of data. Sparse categories can cause complete separation in logistic regression and are only predictive for a few students.

# Check how many students did not complete PAL
sum(chem.dat$palN==1) # 226 
## [1] 226
incl.pal.stu <- sum(chem.dat$palN==1)
chem.dat <- clean.data(chem.dat)
dim(chem.dat) # 18722   179
## [1] 18722   179

There were 226 chemistry students who did not complete PAL and were removed from the analysis. There are now 18722 chemistry students instead of 18948.

There were originally 174 variables in the data set, 5 variables were added, so there are now 179 total variables in the data set.

CHEM 24

Executive Summary

Based on data for 177 PAL students and 224 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.43 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.

After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.71\pm 0.26\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Chem 24, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.

Detailed Summary

A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Chem 24 course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.

For Chem 24, 13 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: ethnicity, enrollment in PAL in the past, instructor, Biology major, being remedial in English, class level, lower percent of households with a female head in the zip code where the student lived at the time of application to CSUS, and lower median income where the student lived at the time of application to CSUS.

Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.145. Consequently, adequate balance appears to have been achieved.

The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.71\pm 0.26\) or between 0.45 and 0.97 on a 4.0 grade scale. This result is statistically significant with a P-value of \(3.16x10^{-3}\) and is based on 126 PAL students and 50 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.43.

The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Chem 24. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Chem 24 indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of enrollment in PAL in the past, CSUS GPA at start of term, or remedial in English observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.

Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 177 PAL students in Chem 24 being reduced to 126. Also, 50 non-PAL students were selected out of 224 original non-PAL students.

When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 130 non-PAL matches. Of the 126 PAL students, 122 were matched one-to-one with non-PAL students and 4 were matched one-to-many with non-PAL students.

Extract Chem 24 Data and Prerequisite Course Chem 1B

The non-PAL and PAL groups will include students with only first attempts at CHEM 24.They will also include students with previous PAL participation and/or are currently in a PAL for another course.

For the prerequisite course CHEM 1B, the grade for the student’s last attempt and the number of times it was taken are added to the CHEM 1B data set. Only 275 out of 401 CHEM 24 students have CHEM 1B grades.

However, few students retook Chem 1B so there was inadequate balance on number of times Chem 1B was taken, and it was removed from the propensity model after balance checks.

# Excludes course repeats
chem24.dat <- chem.dat %>%
  filter(course=="CHEM 24", pass.term.flg == "PASS Term", course.seq== 0)
dim(chem24.dat) # 401 179
## [1] 401 179
prereq <- "CHEM 1B"
chem24.dat <- get.prereq.grades(chem24.dat, chem.dat, prereq)
# Rename Chem 1B variables
chem24.dat <- chem24.dat %>%
  rename(chem1b.course.seq= prereq.course.seq, chem1b.grd.pt.unt= prereq.grd.pt.unt, chem1b.grade = prereq.grade)
dim(chem24.dat) # 275 182
## [1] 275 182

There is a total of 275 CHEM 24 first attempt only students with prerequisite CHEM 1B grades.

The variables below were added to the CHEM 24 data, so there are now 182 variables instead of 179 variables.
chem1b.course.seq: Is the student taking CHEM 1B for the first time or is it the second attempt, third attempt, etc.?
chem1b.grd.pt.unt: Numeric grade on 0 to 4 scale (0=F, 4=A) for CHEM 1B
chem1b.grade: Course grade for CHEM 1B

Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.

# Collapsed cMaj categories to Biology and Other majors at 0.10
with(chem24.dat, table(cMaj, palN))
##                                 palN
## cMaj                               0   1
##   Biology                        103 122
##   Business                         1   0
##   Chemistry                       23   5
##   Child Devel/Early Childhood Ed   1   0
##   Criminal Justice                 2   0
##   Dance                            1   0
##   Geology                          1   0
##   Gerontology                      1   0
##   Health Science                   1   0
##   Kinesiology/Physical Education   5   3
##   Liberal Studies                  1   0
##   Mechanical Engineering           2   0
##   Nutrition                        0   1
##   Psychology                       0   1
##   Sociology                        1   0
chem24.dat <- group_category(data = chem24.dat, feature = "cMaj", threshold = 0.10,  update = TRUE)
with(chem24.dat, table(cMaj, palN))
##          palN
## cMaj        0   1
##   Biology 103 122
##   OTHER    40  10

Analyze missingness

Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.

## [1] 35
##                        feature num_missing pct_missing
## 17               Instructor_02         275   1.0000000
## 19             withdraw_reason         275   1.0000000
## 22                   deg.plan3         275   1.0000000
## 23                   deg.plan4         275   1.0000000
## 24                   deg.plan5         275   1.0000000
## 25                   deg.plan6         275   1.0000000
## 21                   deg.plan2         274   0.9963636
## 20                   deg.plan1         243   0.8836364
## 26                   grad.term         243   0.8836364
## 27                grad.termERS         243   0.8836364
## 28                         ttg         243   0.8836364
## 11                  trf.gpaADM         242   0.8800000
## 4                  pledge.term         199   0.7236364
## 18               treat.section         181   0.6581818
## 1                fys.term.code         134   0.4872727
## 2                      fys.grd         134   0.4872727
## 3                  fys.rpt.flg         134   0.4872727
## 13 ge.critical.thinking.status          91   0.3309091
## 14      ge.english.comp.status          91   0.3309091
## 15              ge.math.status          91   0.3309091
## 16         ge.oral.comm.status          91   0.3309091
## 12                  admit.term          89   0.3236364
## 5               sat.math.score          47   0.1709091
## 6                 sat.math.flg          47   0.1709091
## 7             sat.verbal.score          47   0.1709091
## 8               sat.verbal.flg          47   0.1709091
## 9                sat.test.date          47   0.1709091
## 29                plan.college          41   0.1490909
## 30           plan.college.desc          41   0.1490909
## 31                   plan.dept          41   0.1490909
## 32               plan.deptAbbr          41   0.1490909
## 33                 plan.degree          41   0.1490909
## 34                   plan.type          41   0.1490909
## 35                      county          39   0.1418182
## 10                      hs.gpa          30   0.1090909

## [1] 275 147

35 variables missing >10%
So, 35 variables were removed due to missingness and there are now 147 variables instead of 182 variables.

Subset on Complete Cases only in Chem 24 Data

chem24.dat <- chem24.dat[complete.cases(chem24.dat), ]
dim(chem24.dat) # 265 147
## [1] 265 147

265 out of 275 students are kept
10 students were removed due to missingness of variables

single.vars <- chem24.dat %>%
  summarise(across(everything(), ~ n_distinct(.x))) %>%
  select_if(. == 1)

# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = chem24.dat)
##                                            
##                                             Overall       
##   n                                           265         
##   AP_PHYSC (mean (SD))                       2.40 (0.00)  
##   AP_PHYSM (mean (SD))                       2.47 (0.00)  
##   AP_PHYSC.flg = Missing (%)                  265 (100.0) 
##   AP_PHYSM.flg = Missing (%)                  265 (100.0) 
##   country = USA (%)                           265 (100.0) 
##   med.inc.flg = Not Missing (%)               265 (100.0) 
##   career.course = UGRD (%)                    265 (100.0) 
##   acad.prog.course = UGD (%)                  265 (100.0) 
##   course (%)                                              
##      CHEM 4                                     0 (  0.0) 
##      CHEM 1A                                    0 (  0.0) 
##      CHEM 1B                                    0 (  0.0) 
##      CHEM 24                                  265 (100.0) 
##   component = LEC (%)                         265 (100.0) 
##   units (mean (SD))                          3.00 (0.00)  
##   course.numeric (mean (SD))                24.00 (0.00)  
##   div = Lower Division (%)                    265 (100.0) 
##   course.seq (mean (SD))                     0.00 (0.00)  
##   rpt.flg = First Attempt (%)                 265 (100.0) 
##   c2s = Non-C2S (%)                           265 (100.0) 
##   base.time.course (mean (SD))               1.00 (0.00)  
##   years (mean (SD))                          0.50 (0.00)  
##   enroll.status = Continuing (%)              265 (100.0) 
##   withdraw_code = NWD (%)                     265 (100.0) 
##   enrl.flg = Enrolled (%)                     265 (100.0) 
##   enrl.flgERS = Enrolled (%)                  265 (100.0) 
##   pst.flg = Enrolled (%)                      265 (100.0) 
##   pst.flgERS = Enrolled (%)                   265 (100.0) 
##   grad.flg = Not Graduated (%)                265 (100.0) 
##   grad.flgERS = Not Graduated (%)             265 (100.0) 
##   rtn.flg = Retained (%)                      265 (100.0) 
##   rtn.flgERS = Retained (%)                   265 (100.0) 
##   pass.term.flg = PASS Term (%)               265 (100.0) 
##   csus.gpa.start.flg = Not Missing (%)        265 (100.0) 
##   higher.ed.gpa.start.flg = Not Missing (%)   265 (100.0)
sum(single.vars) # 1
## [1] 31
# remove single-valued variables
chem24.dat<- chem24.dat %>%
  dplyr::select(-names(single.vars))
dim(chem24.dat) # 265 116
## [1] 265 116

116 out of 147 variables are kept
31 variables removed due to single values

Identify variables causing complete separation in logistic regression

# Remove non chem24 instructors
chem24.dat <- chem24.dat %>%
  droplevels(chem24.dat$Instructor_01)

# Combine sparse ethnicity categories to Other
chem24.dat <- chem24.dat %>%
  mutate(eth.erss = fct_other(eth.erss, drop = c("Foreign", "Pacific Islander", "Unknown")))
with(chem24.dat, table(eth.erss, palN))
##                    palN
## eth.erss             0  1
##   African American   5 10
##   Asian             50 35
##   Hispanic          30 38
##   Two or More Races  9  9
##   White             38 28
##   Other              7  6
# Only one Freshman student, so combined with Sophomore
chem24.dat <- chem24.dat %>%
  mutate(bot.level = fct_collapse(bot.level, Sophomore = c("Sophomore", "Freshman")))
with(chem24.dat, table(bot.level, palN))
##            palN
## bot.level    0  1
##   Sophomore 22 27
##   Junior    50 60
##   Senior    67 39

Filter to relevant variables

Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.

chem24.final <- chem24.step.vars(chem24.dat)
kable(names(chem24.final))
x
acad.stand
adm.area
bot.level
cMaj
coh
course.age
csus.gpa.start
cum.percent.units.passed
delay.from.hs
e.rmd
eth.erss
father.ed
fys.flg
gender
hous.coh.term.flg
Instructor_01
median.income
m.rmd
mother.ed
pct.female.head
pell.coh.term.flg
prevPAL
prevPASS
reason
sac.county.flg
term.units.attemptedCensus
palN
grd.pt.unt
chem1b.grd.pt.unt
AP_BIOL
AP_CALAB
AP_CALBC
AP_CHEM
AP_BIOL.flg
AP_CALAB.flg
AP_CALBC.flg
AP_CHEM.flg
pct.female.head.flg

Build a Logistic Regression Model for Propensity Score

Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, and chem1b.grd.pt.unt . Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.

## 
## Call:
## glm(formula = model.first.order, family = binomial, data = chem24.final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3386  -0.8796  -0.1799   0.8482   2.2670  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -7.508e-01  2.893e+00  -0.260  0.79521    
## chem1b.grd.pt.unt         -3.697e-01  2.564e-01  -1.442  0.14940    
## cum.percent.units.passed   3.443e+00  2.976e+00   1.157  0.24738    
## eth.erssAsian             -1.543e+00  7.270e-01  -2.123  0.03377 *  
## eth.erssHispanic          -1.155e+00  7.313e-01  -1.579  0.11438    
## eth.erssTwo or More Races -1.209e+00  8.740e-01  -1.383  0.16657    
## eth.erssWhite             -1.817e+00  7.544e-01  -2.409  0.01599 *  
## eth.erssOther             -1.906e+00  9.454e-01  -2.016  0.04380 *  
## genderMale                 6.725e-02  3.340e-01   0.201  0.84043    
## prevPAL                    8.030e-01  1.464e-01   5.485 4.13e-08 ***
## Instructor_015            -4.481e-01  4.879e-01  -0.918  0.35839    
## Instructor_0110           -1.730e+00  4.521e-01  -3.827  0.00013 ***
## Instructor_0114           -4.912e-01  4.002e-01  -1.227  0.21967    
## cMajOTHER                 -1.572e+00  4.910e-01  -3.202  0.00137 ** 
## e.rmdRemedial in English   1.013e+00  4.046e-01   2.504  0.01227 *  
## bot.levelJunior           -2.517e-01  4.541e-01  -0.554  0.57942    
## bot.levelSenior           -1.353e+00  5.719e-01  -2.365  0.01801 *  
## delay.from.hs              1.506e-01  1.121e-01   1.343  0.17919    
## csus.gpa.start             8.065e-01  5.858e-01   1.377  0.16859    
## pct.female.head           -8.324e-02  3.506e-02  -2.374  0.01759 *  
## median.income             -2.480e-05  1.264e-05  -1.962  0.04979 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 366.73  on 264  degrees of freedom
## Residual deviance: 270.15  on 244  degrees of freedom
## AIC: 312.15
## 
## Number of Fisher Scoring iterations: 5

Propensity Score Matching

Before matching

# Unadjusted mean grades
get.unadj.means(chem24.final)
Unadjusted Mean Grades
Non-PAL PAL Diff.
1.689209 2.101587 0.4123787

Standardized mean differences for continuous variables and categorical variables.

unmatched.tab <- CreateTableOne(vars = chem24.covs, strata = "palN", 
                             data = chem24.final, smd = TRUE, test = FALSE)
print(unmatched.tab, smd = TRUE)
##                                       Stratified by palN
##                                        0                   1                   SMD   
##   n                                         139                 126                  
##   chem1b.grd.pt.unt (mean (SD))            2.28 (0.86)         2.32 (0.82)      0.048
##   cum.percent.units.passed (mean (SD))     0.92 (0.09)         0.92 (0.07)      0.091
##   eth.erss (%)                                                                  0.309
##      African American                         5 ( 3.6)           10 ( 7.9)           
##      Asian                                   50 (36.0)           35 (27.8)           
##      Hispanic                                30 (21.6)           38 (30.2)           
##      Two or More Races                        9 ( 6.5)            9 ( 7.1)           
##      White                                   38 (27.3)           28 (22.2)           
##      Other                                    7 ( 5.0)            6 ( 4.8)           
##   gender = Male (%)                          56 (40.3)           41 (32.5)      0.162
##   prevPAL (mean (SD))                      1.03 (1.23)         1.87 (1.12)      0.715
##   Instructor_01 (%)                                                             0.527
##      4                                       36 (25.9)           52 (41.3)           
##      5                                       14 (10.1)           21 (16.7)           
##      10                                      50 (36.0)           20 (15.9)           
##      14                                      39 (28.1)           33 (26.2)           
##   cMaj = OTHER (%)                           37 (26.6)           10 ( 7.9)      0.510
##   e.rmd = Remedial in English (%)            20 (14.4)           34 (27.0)      0.315
##   bot.level (%)                                                                 0.358
##      Sophomore                               22 (15.8)           27 (21.4)           
##      Junior                                  50 (36.0)           60 (47.6)           
##      Senior                                  67 (48.2)           39 (31.0)           
##   delay.from.hs (mean (SD))                3.72 (2.07)         3.40 (2.67)      0.132
##   csus.gpa.start (mean (SD))               3.11 (0.48)         3.19 (0.42)      0.185
##   pct.female.head (mean (SD))             19.44 (6.32)        19.43 (7.09)      0.001
##   median.income (mean (SD))            66770.71 (19689.75) 64023.91 (18940.87)  0.142

Check how many variables have SMD > 0.1

addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
## 
## FALSE  TRUE   Sum 
##     3    10    13
get.imbal.vars(unmatched.tab)
Variables with SMD > 0.1
Variable Before Matching SMD
prevPAL 0.7154758
Instructor_01 0.5273159
cMaj 0.5099804
bot.level 0.3584403
e.rmd 0.3147882
eth.erss 0.3094389
csus.gpa.start 0.1849083
gender 0.1615446
median.income 0.1421813
delay.from.hs 0.1317131

10 variables have SMD >0.1

Implement a propensity score matching method.

match.chem24 <- with(chem24.final, Match(
  Y=chem24.final$grd.pt.unt, Tr = chem24.final$palN, X = p.score, 
  BiasAdjust = F, estimand = "ATT",  M=1, caliper=0.25, replace = TRUE, ties = TRUE))

After matching

Standardized mean differences for continuous variables and categorical variables.

# Needed for match table
chem24.final <- chem24.final %>%
  rownames_to_column(var = "id")

# Matched data
chem24.matched.dat <- chem24.final[unlist(match.chem24[c("index.treated", "index.control")]), ]
chem24.matched.dat$match.weights<-  c(match.chem24$weights, match.chem24$weights)

# Add match weights to match data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = chem24.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = chem24.covs,  strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
##                                       Stratified by palN
##                                        0                   1                   SMD   
##   n                                      126.00              126.00                  
##   chem1b.grd.pt.unt (mean (SD))            2.35 (0.85)         2.32 (0.82)      0.037
##   cum.percent.units.passed (mean (SD))     0.93 (0.09)         0.92 (0.07)      0.028
##   eth.erss (%)                                                                  0.330
##      African American                      14.5 (11.5)         10.0 ( 7.9)           
##      Asian                                 45.0 (35.7)         35.0 (27.8)           
##      Hispanic                              36.0 (28.6)         38.0 (30.2)           
##      Two or More Races                      6.0 ( 4.8)          9.0 ( 7.1)           
##      White                                 23.5 (18.7)         28.0 (22.2)           
##      Other                                  1.0 ( 0.8)          6.0 ( 4.8)           
##   gender = Male (%)                        50.5 (40.1)         41.0 (32.5)      0.157
##   prevPAL (mean (SD))                      1.82 (1.34)         1.87 (1.12)      0.042
##   Instructor_01 (%)                                                             0.066
##      4                                     55.0 (43.7)         52.0 (41.3)           
##      5                                     22.0 (17.5)         21.0 (16.7)           
##      10                                    19.0 (15.1)         20.0 (15.9)           
##      14                                    30.0 (23.8)         33.0 (26.2)           
##   cMaj = OTHER (%)                         19.0 (15.1)         10.0 ( 7.9)      0.225
##   e.rmd = Remedial in English (%)          24.0 (19.0)         34.0 (27.0)      0.189
##   bot.level (%)                                                                 0.116
##      Sophomore                             28.0 (22.2)         27.0 (21.4)           
##      Junior                                65.5 (52.0)         60.0 (47.6)           
##      Senior                                32.5 (25.8)         39.0 (31.0)           
##   delay.from.hs (mean (SD))                3.00 (1.21)         3.40 (2.67)      0.193
##   csus.gpa.start (mean (SD))               3.20 (0.41)         3.19 (0.42)      0.017
##   pct.female.head (mean (SD))             18.76 (6.39)        19.43 (7.09)      0.100
##   median.income (mean (SD))            61841.54 (13978.07) 64023.91 (18938.54)  0.131

Balance Check

Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group
Binary variables: Raw differences in proportion

The variables below, “delay.from.hs” and “median.income” are not under the <0.1 mean threshold, but they are under the <0.25 mean threshold.

## Balance Measures
##                  Type Diff.Un Diff.Adj        M.Threshold
## delay.from.hs Contin. -0.1178   0.1500 Not Balanced, >0.1
## median.income Contin. -0.1450   0.1152 Not Balanced, >0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        22
## Not Balanced, >0.1     2
## 
## Variable with the greatest mean difference
##       Variable Diff.Adj        M.Threshold
##  delay.from.hs     0.15 Not Balanced, >0.1
## 
## Sample sizes
##                      Control Treated
## All                   139.       126
## Matched (ESS)          25.48     126
## Matched (Unweighted)   50.       126
## Unmatched              89.         0

Check variable percent improvement

get.var.perc.tab(chem24.bal)
##                      Variable       Diff.Un     Diff.Adj % Improvement
## 1                     p.score  1.4150085757  0.002828720           100
## 2            Instructor_01_10 -0.2009820715  0.007936508            96
## 3                     prevPAL  0.7509922479  0.045889456            94
## 4              csus.gpa.start  0.1977011453 -0.016216685            92
## 5             Instructor_01_5  0.0659472422 -0.007936508            88
## 6         bot.level_Sophomore  0.0560123330 -0.007936508            86
## 7             Instructor_01_4  0.1537056069 -0.023809524            85
## 8           eth.erss_Hispanic  0.0857599635  0.015873016            81
## 9            bot.level_Senior -0.1724905790  0.051587302            70
## 10   cum.percent.units.passed  0.1012499852 -0.030907174            69
## 11           bot.level_Junior  0.1164782460 -0.043650794            63
## 12                 cMaj_OTHER -0.1868219710 -0.071428571            62
## 13  e.rmd_Remedial in English  0.1259563778  0.079365079            37
## 14             eth.erss_White -0.0511590727  0.035714286            30
## 15          chem1b.grd.pt.unt  0.0493920155 -0.037396912            24
## 16              median.income -0.1450193421  0.115220532            21
## 17  eth.erss_African American  0.0433938563 -0.035714286            18
## 18             eth.erss_Asian -0.0819344524 -0.079365079             3
## 19                gender_Male -0.0774808724 -0.075396825             3
## 20              delay.from.hs -0.1177689275  0.150005260           -27
## 21           Instructor_01_14 -0.0186707777  0.023809524           -28
## 22 eth.erss_Two or More Races  0.0066803700  0.023809524          -256
## 23             eth.erss_Other -0.0027406646  0.039682540         -1348
## 24            pct.female.head -0.0007108937  0.094801544        -13236

Check covariate balance visually

get.bal.plot(unmatched.tab, matched.tab)

love.plot(chem24.bal,binary = "raw", stars = "std", var.order = "unadjusted", 
            thresholds = c(m = .1), abs = F)

Compare single and multiple matches for PAL and non-PAL

create.match.tab(chem24.matched.dat)
PAL and Non-PAL Matches
Non-PAL PAL
Single Matches 23 122
Multiple Matches 27 4
Total Students 50 126

Out of 126 PAL students, 122 PAL students were matched to one non-PAL student and 4 PAL students were matched to multiple non-PAL students.

Out of 130 non-PAL student matches, there were 50 non-PAL students, 23 of the non-PAL students were matched to one PAL student and 27 of the non-PAL students were matched to multiple PAL students.

Plot of Propensity Scores for Average Treatment Effect Among the Treated (ATT)

get.att.plot(chem24.final, match.chem24)

Assess balance with prognostic score

The standardized mean differences of the prognostic scores is 0.0194, which indicates balance. The variables, “delay.from.hs” and “median.income”, are not under the 0.10 mean difference threshold, but they are under the 0.25 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the imbalanced variables are not highly prognostic and the estimated prognostic score is balanced.

## 
## Call:
## glm(formula = f.build("grd.pt.unt", chem24.covs), data = ctrl.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4293  -0.5496   0.1083   0.6382   2.5388  
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -2.151e+00  1.772e+00  -1.214    0.227    
## chem1b.grd.pt.unt          7.545e-01  1.440e-01   5.238 7.19e-07 ***
## cum.percent.units.passed   2.546e-01  1.688e+00   0.151    0.880    
## eth.erssAsian             -4.020e-01  4.818e-01  -0.834    0.406    
## eth.erssHispanic          -1.580e-01  4.970e-01  -0.318    0.751    
## eth.erssTwo or More Races -7.777e-01  5.815e-01  -1.338    0.184    
## eth.erssWhite             -1.290e-01  4.959e-01  -0.260    0.795    
## eth.erssOther              4.655e-01  6.339e-01   0.734    0.464    
## genderMale                -1.011e-01  1.916e-01  -0.528    0.599    
## prevPAL                   -5.747e-02  7.795e-02  -0.737    0.462    
## Instructor_015             1.694e+00  3.509e-01   4.826 4.21e-06 ***
## Instructor_0110            1.558e+00  2.444e-01   6.373 3.73e-09 ***
## Instructor_0114            1.081e+00  2.400e-01   4.503 1.59e-05 ***
## cMajOTHER                  2.367e-01  2.236e-01   1.058    0.292    
## e.rmdRemedial in English   2.513e-02  2.837e-01   0.089    0.930    
## bot.levelJunior            9.747e-02  2.922e-01   0.334    0.739    
## bot.levelSenior           -3.210e-01  3.450e-01  -0.930    0.354    
## delay.from.hs              1.016e-02  7.022e-02   0.145    0.885    
## csus.gpa.start             1.764e-01  3.344e-01   0.528    0.599    
## pct.female.head            6.096e-03  2.337e-02   0.261    0.795    
## median.income              8.076e-06  7.239e-06   1.116    0.267    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.9597258)
## 
##     Null deviance: 240.75  on 138  degrees of freedom
## Residual deviance: 113.25  on 118  degrees of freedom
## AIC: 409.98
## 
## Number of Fisher Scoring iterations: 2
## Balance Measures
##                                Type Diff.Adj        M.Threshold
## prog.score                 Distance   0.0194     Balanced, <0.1
## chem1b.grd.pt.unt           Contin.  -0.0374     Balanced, <0.1
## cum.percent.units.passed    Contin.  -0.0309     Balanced, <0.1
## eth.erss_African American    Binary  -0.0357     Balanced, <0.1
## eth.erss_Asian               Binary  -0.0794     Balanced, <0.1
## eth.erss_Hispanic            Binary   0.0159     Balanced, <0.1
## eth.erss_Two or More Races   Binary   0.0238     Balanced, <0.1
## eth.erss_White               Binary   0.0357     Balanced, <0.1
## eth.erss_Other               Binary   0.0397     Balanced, <0.1
## gender_Male                  Binary  -0.0754     Balanced, <0.1
## prevPAL                     Contin.   0.0459     Balanced, <0.1
## Instructor_01_4              Binary  -0.0238     Balanced, <0.1
## Instructor_01_5              Binary  -0.0079     Balanced, <0.1
## Instructor_01_10             Binary   0.0079     Balanced, <0.1
## Instructor_01_14             Binary   0.0238     Balanced, <0.1
## cMaj_OTHER                   Binary  -0.0714     Balanced, <0.1
## e.rmd_Remedial in English    Binary   0.0794     Balanced, <0.1
## bot.level_Sophomore          Binary  -0.0079     Balanced, <0.1
## bot.level_Junior             Binary  -0.0437     Balanced, <0.1
## bot.level_Senior             Binary   0.0516     Balanced, <0.1
## delay.from.hs               Contin.   0.1500 Not Balanced, >0.1
## csus.gpa.start              Contin.  -0.0162     Balanced, <0.1
## pct.female.head             Contin.   0.0948     Balanced, <0.1
## median.income               Contin.   0.1152 Not Balanced, >0.1
## p.score                     Contin.   0.0028     Balanced, <0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        23
## Not Balanced, >0.1     2
## 
## Variable with the greatest mean difference
##       Variable Diff.Adj        M.Threshold
##  delay.from.hs     0.15 Not Balanced, >0.1
## 
## Sample sizes
##                      Control Treated
## All                   139.       126
## Matched (ESS)          25.48     126
## Matched (Unweighted)   50.       126
## Unmatched              89.         0

Estimate Difference Between Mean grade in Chem 24 of PAL and non-PAL students

The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.70754 . This result is statistically significant with a P-value of 0.0063135. and is based on 126 PAL students and 130 non-PAL student matches(50 non-PAL students total). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.

summary(match.chem24)
## 
## Estimate...  0.70754 
## AI SE......  0.25907 
## T-stat.....  2.731 
## p.val......  0.0063135 
## 
## Original number of observations..............  265 
## Original number of treated obs...............  126 
## Matched number of observations...............  126 
## Matched number of observations  (unweighted).  130 
## 
## Caliper (SDs)........................................   0.25 
## Number of obs dropped by 'exact' or 'caliper'  0

Sensitivity Analysis

psens(match.chem24, Gamma=2.0, GammaInc = 0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0001
##    1.2           0      0.0004
##    1.3           0      0.0015
##    1.4           0      0.0041
##    1.5           0      0.0095
##    1.6           0      0.0193
##    1.7           0      0.0352
##    1.8           0      0.0586
##    1.9           0      0.0905
##    2.0           0      0.1310
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 

Note that in the above table \(\Gamma=1.8\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.8 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to csus.gpa.start or ethnicity the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.

kable(sort(exp(abs(chem24.first.order.prop.model$coefficients))))
x
median.income 1.000025
genderMale 1.069562
pct.female.head 1.086803
delay.from.hs 1.162585
bot.levelJunior 1.286195
chem1b.grd.pt.unt 1.447259
Instructor_015 1.565295
Instructor_0114 1.634353
(Intercept) 2.118759
prevPAL 2.232138
csus.gpa.start 2.240135
e.rmdRemedial in English 2.754383
eth.erssHispanic 3.172879
eth.erssTwo or More Races 3.350293
bot.levelSenior 3.867930
eth.erssAsian 4.679341
cMajOTHER 4.817476
Instructor_0110 5.641448
eth.erssWhite 6.155689
eth.erssOther 6.725577
cum.percent.units.passed 31.271288

Propensity Score Adjusted Mean Grades

Adjusted Mean Grades
Course Non-PAL PAL Diff. Std. error p-val Sensitivity N(non-PAL) N(PAL)
CHEM 24 1.39 2.1 0.71 0.26 3.16e-03 1.8 50 126

References

Greifer, Noah. 2020. Cobalt: Covariate Balance Tables and Plots. https://CRAN.R-project.org/package=cobalt.

Leite, W. L. 2017. Practical Propensity Score Methods Using R. Thousand Oaks, CA: Sage Publishing. https://osf.io/nygb5/.

Sekhon, Jasjeet S. 2011. “Multivariate and Propensity Score Matching Software with Automated Balance Optimization: The Matching Package for R.” Journal of Statistical Software 42 (7): 1–52. http://www.jstatsoft.org/v42/i07/.

Yoshida, Kazuki, and Alexander Bartel. 2020. Tableone: Create ’Table 1’ to Describe Baseline Characteristics with or Without Propensity Score Weights. https://CRAN.R-project.org/package=tableone.

Zhang, Z., H. J. Kim, G. Lonjon, Y. Zhu, and written on behalf of AME Big-Data Clinical Trial Collaborative Group. 2019. “Balance Diagnostics After Propensity Score Matching.” Annals of Translational Medicine 7 (1): 16. https://doi.org/10.21037/atm.2018.12.10.