Load packages

First Add Some R Packages to the Workspace.
Caution: warning messages are suppressed to reduce clutter in the output.

tidyverse: Importing data, cleaning data, data manipulation, & data visualization
kableExtra: Build HTML tables
DataExplorer: Exploratory Data Analysis & Feature Engineering
tableone: Standardized mean differences for before and after matching
survey: Matched data with match weights
Matching: Propensity score matching
cobalt: Covariate balance
reshape2: Covariate balance plot
rbounds: Rosenbaum Sensitivity test

library(tidyverse)
library(kableExtra)
library(DataExplorer) 
library(tableone)
library(survey)
library(Matching)
library(cobalt)
library(reshape2)
library(rbounds)

select <- dplyr::select # Resolves package conflicts with select
options(width = 120) # Format print width

Load functions

General functions used throughout the analysis.

# Update palN for Chem 24 Spring 2019 ------------------------------------------
update.chem24s19 <- function(chem.dat) {
  PAL.course.data <- read_rds("palCourseData.rds")
  chem24.S19 <- PAL.course.data  %>%
    filter(term == "Spring 2019", course == "CHEM 24")
  # Add a palN indicator for Chem 24 Spring 2019
  chem24.S19 <- chem24.S19 %>%
    mutate(palN.chem24.S19 = case_when(
      pal.grade == "CR" ~ 2,
      is.na(pal.grade) ~ 0,
      TRUE ~ 1
    )) %>%
    select(emplid, palN.chem24.S19) 
  
  # Check how many student are non-PAL, incomplete PAL, and PAL
  table(chem24.S19$palN.chem24.S19)
  # 0  1  2 
  # 51 10 52 
  
  chem.dat <- left_join(chem.dat, chem24.S19, by= "emplid" )
  
  chem.dat  <- chem.dat %>%
    mutate(palN = case_when(
      course == "CHEM 24" & term == "Spring 2019" ~ palN.chem24.S19,
      TRUE ~ palN
    )) %>%
    select(-palN.chem24.S19)
  
  return(chem.dat)
}

# Get raw table of mean gpa for PAL and non-PAL  -------------------------------
get.raw.tab <- function(classes, df)
{ 
 raw.table = data.frame(class=character(),
                         nonPALavg=numeric(),
                         PALavg=numeric(), 
                         Diff=numeric(), 
                         NonPAL_Num= integer(),
                         PAL_Num=integer(),
                         CompletePAL=numeric(),
                         TermPALStart=integer(),
                         row.names=NULL,
                         stringsAsFactors = FALSE)
  
  
  for (i in 1:length(classes))
  {
    curr.class = classes[i]
    temp = subset(df, course==curr.class & course.seq==0)
    pal.start=min(unique(temp$term.code[temp$palN==2]))
    # only include terms after PAL start term
    temp = subset(temp, term.code>= pal.start)
    
    x=tapply(temp$grd.pt.unt,temp$palN, 
             mean, na.rm=T) %>% 
      as.numeric %>% 
      round(2)
    
    y=table(temp$palN) %>% as.numeric
    
    raw.table[i, 'class' ] = curr.class
    raw.table[i, c(2:4,7)]=c(x[1], x[3],x[3]-x[1],
                             round(y[3]/sum(y),2))
    raw.table[i, c(5,6,8)]= c(y[1], y[3], pal.start)
    
  }
  
  # formatted table
  raw.table <- kable(raw.table, caption = "Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)") %>%
    kable_styling(full_width= T, position = "left")
 
   return(raw.table)
}

# Data cleaning ----------------------------------------------------------------
clean.data <- function(df)
{
  # Replaced coh.term with coh.term.course
  yr.course.taken = as.numeric(gsub(".*([0-9]{4})","\\1",df$coh.term.course))
  df$delay.from.hs = ifelse(!is.na(yr.course.taken) & !is.na(df$hs.grad.yr),
                                  yr.course.taken-df$hs.grad.yr, NA)
  
  sum(is.na(df$delay.from.hs)) 
  
  # remove students who did not complete PAL 
  df=subset(df, palN!=1) 
  
  #recode palN to factor with 0/1 levels
  df$palN = ifelse(df$palN==2, 1, 0)
  
  #clean up category names in m.rmd and e.rmd
  df$m.rmd[df$m.rmd=="Not Remedial\nin Math"]="Not Remedial in Math"
  df$m.rmd[df$m.rmd=="Remedial\nin Math"]="Remedial in Math"
  df$e.rmd[df$e.rmd=="Not Remedial\nin English"]="Not Remedial in English"
  df$e.rmd[df$e.rmd=="Remedial\nin English"]="Remedial in English"
  
  df <- df %>% mutate(m.rmd = factor(m.rmd), e.rmd = factor(e.rmd))
  # table(df$e.rmd)
  
  # Create feature, proportion of cumulative units taken that were passes
  # To distinguish the students who have taken 0 units from the students who 
  #   have passed 0  units they have taken, students who have taken 0 units are 
  #   labeled as -1. Then the -1 is replaced by the mean of cum.percent.units.passed
  df <- df %>%
    mutate(cum.percent.units.passed = ifelse(tot.taken.prgrss.start == 0, -1,
                                             tot.passd.prgrss.start / tot.taken.prgrss.start)) %>%
    mutate(cum.percent.units.passed = ifelse(cum.percent.units.passed  == -1, mean(cum.percent.units.passed,  na.rm =TRUE),
                                             cum.percent.units.passed  ))
  
  # code instructor as alphabetic letter for anonymity
  df$Instructor_01=droplevels(factor(df$Instructor_01))
  
  instructor.vec = sort(unique(df$Instructor_01))
  num.instr = length(instructor.vec)
  
  df$Instructor_01 = factor(
    df$Instructor_01, levels = instructor.vec, labels=as.character(1:num.instr)
  )
  
  key.instr.code = cbind(as.character(instructor.vec), 1:num.instr)

  # Add "cMaj", census majors without concentrations/specializations/tracks/etc. 
  major_lookup <- read.csv("Census Major Lookup.csv", header = TRUE, 
                           stringsAsFactors = FALSE)
  df <- merge(df, major_lookup %>% select(censusMajor, cMaj),
              by = "censusMajor", all.x = TRUE)

  # Recode mother's education and father's education variables.
  non.hs.grad= c("No High School","Some High School")
  hs.grad= c("High School Graduate","Some College","2-Year College Graduate")
  coll.grad= c("4-Year College Graduate","Postgraduate")
  parent.ed.levels= c(
    "Non-HS Graduate","HS Graduate", "College Graduate", "Unknown"
  )
  
  df <- df %>%
    mutate(
      mother.ed = ifelse(mother.ed %in% non.hs.grad, "Non-HS Graduate",
        ifelse(mother.ed %in% hs.grad, "HS Graduate", 
          ifelse(mother.ed %in% coll.grad, "College Graduate", "Unknown"))),
      mother.ed= factor(mother.ed, levels= parent.ed.levels),
      father.ed = ifelse(father.ed %in% non.hs.grad,"Non-HS Graduate",
        ifelse(father.ed %in% hs.grad, "HS Graduate", 
          ifelse(father.ed %in% coll.grad, "College Graduate", "Unknown"))),
      father.ed= factor(father.ed, levels= parent.ed.levels))
  
  # Recoded adm.area with these counties as local: 'El Dorado', 'Nevada', 
  #   'Placer', 'Sacramento', 'San Joaquin', 'Solano', 'Yolo'.
  counties.rad <- read_csv(
    "countiesRadius120mi.csv", 
    col_types = cols(
      state = col_skip(), city = col_skip(), distance.km = col_skip()
    )
  )                                     
  
  df <- left_join(df, counties.rad, by = "zip")
  
  local.adm.counties <- c(
    'El Dorado', 'Nevada', 'Placer', 'Sacramento', 'San Joaquin', 'Solano', 
    'Yolo'
  )
  
  # County will be NA if the zip code is not within 120 mile radius of 
  #   CSUS zip code(95819) 
  df <- df %>%
    mutate(
      adm.area = 
        if_else(!(county %in% local.adm.counties) | is.na(county), 
                         "nonlocal", "local")
    ) %>%
    mutate(sac.county.flg =
             if_else(!(county == "Sacramento") | is.na(county), 0, 1)
    ) %>%
    mutate(sac.county.flg = factor(sac.county.flg))

return(df)
}

# Extract prerequisite course grade ---------------------------------------------
get.prereq.grades <- function(course.df, df, prereq) {
  # Get student's recent Chem 1B grade
  course.stu <- course.df$emplid
  prereq.df <- df %>%
    select(emplid, course, course.seq, grd.pt.unt, grade) %>%
    filter(emplid %in% course.stu, course== prereq) %>% 
    group_by(emplid) %>%
    filter(course.seq == max(course.seq)) %>%
    rename(
      prereq.course.seq = course.seq, prereq.grd.pt.unt = grd.pt.unt, 
      prereq.grade = grade
    ) %>% 
    select(-course)
  
  dim(prereq.df) # [1] 275   6
  prereq.stu <- prereq.df$emplid
  
  course.df <- course.df %>%
    filter(emplid %in% prereq.stu)
  course.df <- left_join(course.df, prereq.df, by = "emplid")
  
  return(course.df)
}

# Get only the variables that have missing values ---------------------------------------------
get.missing.only <- function(course.df) {
  get.missing.only <- course.df %>% 
    summarise(across(everything(), ~ sum(is.na(.x)))) %>%
    gather() %>%
    filter(value != 0) 
  get.missing.only <- course.df %>%
    dplyr::select(all_of(get.missing.only$key)) 

  return(get.missing.only)
}

# Get imbalanced variables with SMD > 0.1------------------------------------
get.imbal.vars <- function(tab)
{
  get.imbal.vars <- as.data.frame(ExtractSmd(tab))
  get.imbal.vars <- get.imbal.vars %>%
    rownames_to_column(var = "Variable") %>%
    rename(`Before Matching SMD` = `1 vs 2`) %>%
    filter(`Before Matching SMD` > 0.1) %>% 
    arrange(desc(`Before Matching SMD`))
  get.imbal.vars <- kable(
    get.imbal.vars, caption = "Variables with SMD > 0.1"
    ) %>%
    kable_styling(full_width= F)
  
  return(get.imbal.vars)
}
# Unadjusted means -------------------------------------------------------------
get.unadj.means <- function(df.final)
{
  get.unadj.means <- df.final %>%
    group_by(palN) %>% summarise(unadj.means = mean(grd.pt.unt)) %>%
    pivot_wider(names_from = "palN", values_from = "unadj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Diff. = `PAL`-`Non-PAL`)
  
  get.unadj.means<- kable(
    get.unadj.means, caption = "Unadjusted Mean Grades"
    ) %>%
    kable_styling(full_width= F)
  
  return(get.unadj.means)
}
# Adjusted means  --------------------------------------------------------------
adj.means <- function(match.list, matched.dat) {
  get.adj.means <- matched.dat %>%
    group_by(palN) %>% 
    summarise(adj.means = weighted.mean(grd.pt.unt, match.list$weights)) %>%
    pivot_wider(names_from = "palN", values_from = "adj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Diff. = `PAL`-`Non-PAL`)
  
  # formatted table
  get.adj.means<- kable(get.adj.means, caption = "Adjusted Mean Grades") %>%
    kable_styling(full_width= F)
  
  return(get.adj.means)
}

# Match Table ------------------------------------------------------------------
create.match.tab <- function(matched.dat) {
  matched.dat <- matched.dat %>%
    mutate(pal = if_else(palN == 0, "Non-PAL", "PAL"))
  pal.flg <- c('Non-PAL', 'PAL')
  
  for (i in seq_along(pal.flg)) {
    multiple.matches <- matched.dat %>%
      filter(pal ==pal.flg[i]) %>%
      count(id) %>%
      filter(n> 1) %>%
      summarise(n())
    single.matches <- matched.dat %>%
      filter(pal == pal.flg[i]) %>%
      count(id) %>%
      filter(n==1) %>%
      summarise(n())
    if(pal.flg[i] == 'Non-PAL') {
      match.tab <- bind_rows(single.matches,  multiple.matches)
      match.tab <- match.tab %>%
        rename('Non-PAL'= 'n()')
    }
    pal.matches <- bind_rows(single.matches, multiple.matches)
    match.tab$PAL <- pal.matches$`n()`
    row.names(match.tab) <- c("Single Matches", "Multiple Matches")
  } 
  match.tab <-rbind(
    match.tab, "Total Students" = c(sum(match.tab$`Non-PAL`), sum(match.tab$`PAL`))
  )
  match.tab <- kable(match.tab, caption = "PAL and Non-PAL Matches") %>%
    kable_styling(full_width= F)
  
  return(match.tab)
}

# ATT plot ---------------------------------------------------------------------
# https://livefreeordichotomize.com/2019/01/17/understanding-propensity-score-weighting/
# https://www.csus.edu/brand/colors.html
get.att.plot <- function(df.final, match.list)
{
  df.final$p.score <- p.score
  
  df.final <- df.final %>%
    select(-id) %>%
    rownames_to_column(var = "id")

  ps.dat <- df.final %>%
    select(id, palN, p.score) %>%
    pivot_wider(
      names_from = "palN", values_from = "p.score", names_prefix = "b.pal."
    )
  before.match <- ps.dat %>%
    select(b.pal.0, b.pal.1)
  
  matched.dat <- df.final[unlist(match.list[c("index.treated", "index.control")]), ]
  matched.dat$match.weights<-  c(match.list$weights, match.list$weights)

  after.match <-matched.dat %>% 
    select(-id) %>%
    rownames_to_column(var = "id")
  after.match <- after.match %>%
    pivot_wider(names_from = "palN", values_from = "p.score", names_prefix = "pal.")
  after.match <- after.match %>%
    select(pal.0, pal.1, match.weights)
  
  get.att.plot <- ggplot() +
    geom_histogram(data = before.match, bins = 50, aes(b.pal.1), alpha = 0.5) + 
    geom_histogram(data = after.match,bins = 50, aes(pal.1, weight = match.weights), 
                   fill = "#043927", alpha = 0.5) + 
    geom_histogram(data = before.match, bins = 50, alpha = 0.5, 
                   aes(x = b.pal.0, y = -..count..)) + 
    geom_histogram(data = after.match, bins = 50, 
                   aes(x = pal.0, weight = match.weights, y = -..count..), 
                   fill = "#c4b581", alpha = 0.5) + 
    ylab("Count") + xlab("Propensity Scores") +
    geom_hline(yintercept = 0, lwd = 0.5) +
    scale_y_continuous(label = abs) 

return(get.att.plot)
}

# Variable Percent Improvement -------------------------------------------------
get.var.perc.tab <- function(list.bal) {
  get.var.perc.tab <- list.bal %>%
    pluck("Balance") %>%
    rownames_to_column("Variable") %>%
    dplyr::select("Variable", "Type", "Diff.Un","Diff.Adj") %>%
    mutate(`% Improvement` = if_else(Diff.Un == 0, 0, round(((abs(Diff.Un) - abs(Diff.Adj))/ abs(Diff.Un)) * 100 , 0))) %>%
    arrange(desc(`% Improvement`))
  get.var.perc.tab <- get.var.perc.tab %>% dplyr::select("Variable", "Diff.Un", "Diff.Adj", `% Improvement`)
  
  return(get.var.perc.tab)
}

# Covariate Balance Plots -------------------------------------------------------
# https://cran.r-project.org/web/packages/tableone/vignettes/smd.html
# https://www.csus.edu/brand/colors.html
get.bal.plot <- function(unmatched.tab, matched.tab) {
  ## Construct a data frame containing variable name and SMD from all methods
  dataPlot <- data.frame(variable  = rownames(ExtractSmd(unmatched.tab)),
                         Unmatched = as.numeric(ExtractSmd(unmatched.tab)),
                         Matched   = as.numeric(ExtractSmd(matched.tab))  )
  
  ## Create long-format data for ggplot2
  dataPlotMelt <- melt(data          = dataPlot,
                       id.vars       = c("variable"),
                       variable.name = "Method",
                       value.name    = "SMD")
  
  ## Order variable names by magnitude of SMD
  varNames <- as.character(dataPlot$variable)[order(dataPlot$Unmatched)]
  
  ## Order factor levels in the same order
  dataPlotMelt$variable <- factor(dataPlotMelt$variable,
                                  levels = varNames)
  
  ## Plot using ggplot2
  # Sac State colors and dashed line
  get.bal.plot <-ggplot(
    data = dataPlotMelt, mapping = 
      aes(x = variable, y = SMD, group = Method, color= Method)) +
    scale_color_manual(values = c("#043927", "#c4b581")) +
    geom_line(aes(linetype = Method)) +
    geom_point() +
    scale_linetype_manual(values= c("dashed", "solid")) +
    geom_hline(yintercept = 0.1, color = "black", size = 0.1) +
    coord_flip() +
    theme_bw() + theme(legend.key = element_blank())
  
  return(get.bal.plot)
}

# PAL Effect -------------------------------------------------------------------
get.pal.effect <- function(match.list, matched.dat, course) {  
 get.gamma <- psens(match.list, Gamma=2.0, GammaInc = 0.1)[["bounds"]] %>%
    filter(`Lower bound` < 0.05 & 0.05 < `Upper bound`) %>%
    slice_min(Gamma) %>% 
    select(Gamma) 
  
  get.pal.effect <-  matched.dat %>%
    group_by(palN) %>% 
    summarise(adj.means = weighted.mean(grd.pt.unt, match.list$weights)) %>%
    pivot_wider(names_from = "palN", values_from = "adj.means") %>%
    rename(`Non-PAL`= `0`, `PAL`= `1`) %>%
    mutate(Course= course, .before= "Non-PAL") %>%
    mutate(Diff. = `PAL`-`Non-PAL`) %>%
    mutate(`Std. error`= match.list$se, .after= "Diff.") %>%
    mutate(
      `p-val`= formatC( 1-pnorm(Diff./`Std. error`), format = "e", digits = 2), 
      Sensitivity= get.gamma$Gamma, 
      `N(non-PAL)`= length(unique(match.list$index.control)),
      `N(PAL)`= match.list$wnobs
    )

  return(get.pal.effect)
  }

Specialized functions for each course.

## BIO 22 ====================================================================
## Filter to relevant variables 
bio22.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 'course.count',
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa',
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'sat.math.score',
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',    
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg',    'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(bio22.dat))
  bio22.final <- bio22.dat[ ,new.vars]
  
  return(bio22.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
bio22.step <- function(final.df) {
  # AP_CALAB 
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender + sat.math.score +
      sat.verbal.score + sat.math.flg + AP_CALAB + AP_CALAB.flg, 
    data= bio22.final, family=binomial
  )
  summary(min.model)
  
  biggest <- formula(glm(palN ~. - grd.pt.unt,  data=bio22.final, family=binomial))
  bio22.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2)
  summary(bio22.step.first.order)
  bio22.step.first.order$anova
  
  model.first.order <- formula(bio22.step.first.order)
  bio22.first.order.prop.model <- glm(
    model.first.order, data=bio22.final, family=binomial
  )
  
  return(bio22.first.order.prop.model)
}


## CHEM 1A ====================================================================
## Filter to relevant variables 
chem1a.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age',   
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
    'eth.erss', 'father.ed','fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg',
    'term.units.attemptedCensus','palN', 'grd.pt.unt', 'sat.math.score', 
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem1a.dat))
  chem1a.final <- chem1a.dat[ ,new.vars]
  
  return(chem1a.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem1a.step <- function(final.df) {
  # Stepwise selection selected AP_CALAB.flg, AP_BIOL.flg, AP_CHEM, and
  # AP_CHEM.flg
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender + sat.math.score + 
      sat.verbal.score + sat.math.flg + AP_CALAB + AP_CALAB.flg + AP_BIOL +
      AP_BIOL.flg + AP_CHEM + AP_CHEM.flg, data= chem1a.final, family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt, data=chem1a.final, family=binomial)
  )

  chem1a.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2
  )
  summary(chem1a.step.first.order)
  chem1a.step.first.order$anova
  
  model.first.order <- formula(chem1a.step.first.order)
  chem1a.first.order.prop.model <- glm(
    model.first.order, data=chem1a.final, family=binomial
  )
  
  return(chem1a.first.order.prop.model)
}

## CHEM 1B ====================================================================
## Filter to relevant variables
chem1b.step.vars <- function(course.df) {
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01','median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'chem1a.grd.pt.unt',
    'AP_BIOL',  'AP_CALAB', 'AP_CALBC', 'AP_CHEM','AP_BIOL.flg', 'AP_CALAB.flg',
    'AP_CALBC.flg', 'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem1b.dat))
  chem1b.final <- chem1b.dat[ ,new.vars]

  return(chem1b.final)
}

## Build a Logistic Regression Model for Propensity Score
## Fit Propensity Score model (linear terms only)
chem1b.step <- function(final.df) {
  # Stepwise selection selected AP_BIOL.flg and AP_CHEM.flg
  # Removed AP_BIOL.flg. Then stepwise selection selected AP_CALAB.flg.
  # Removed AP_CALAB.flg and pct.female.head.flg
  min.model <- glm(
    palN ~ chem1a.grd.pt.unt + cum.percent.units.passed + eth.erss + gender +
      AP_CHEM + AP_CHEM.flg, data= chem1b.final, family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt - AP_BIOL.flg - AP_CALAB.flg - pct.female.head.flg,  
        data=chem1b.final, family=binomial)
  )

  chem1b.step.first.order <- step(min.model,
                                  direction="forward",scope = biggest,
                                  trace=FALSE, k=2)
  summary(chem1b.step.first.order)
  chem1b.step.first.order$anova

  model.first.order <- formula(chem1b.step.first.order)
  chem1b.first.order.prop.model <- glm(model.first.order, data=chem1b.final, family=binomial)

  return(chem1b.first.order.prop.model)
}

## CHEM 4 ====================================================================
## Filter to relevant variables 
chem4.step.vars <- function(course.df)
{
  vars.to.keep <- c(
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd',
    'eth.erss', 'father.ed','fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01','median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt','sat.math.score',
    'sat.math.flg', 'sat.verbal.score', 'sat.verbal.flg', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  
  new.vars <- intersect(vars.to.keep, names(chem4.dat))
  chem4.final <- chem4.dat[ ,new.vars]
  
  return(chem4.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem4.step <- function(final.df)
{
 # "AP_BIOL"
  min.model <- glm(
    palN ~ cum.percent.units.passed + eth.erss + gender+ sat.math.score + 
      sat.verbal.score+sat.math.flg + AP_CALAB+AP_CALAB.flg, data= chem4.final, 
    family=binomial
  )
  summary(min.model)

  biggest <- formula(
    glm(palN ~. - grd.pt.unt - AP_BIOL, data=chem4.final, family=binomial)
  )

  chem4.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2)
  summary(chem4.step.first.order)
  chem4.step.first.order$anova
  
  model.first.order <- formula(chem4.step.first.order)
  chem4.first.order.prop.model <- glm(
    model.first.order, data=chem4.final, family=binomial
    )
  
  return(chem4.first.order.prop.model)
}

## CHEM 24 ====================================================================
## Filter to relevant variables 
chem24.step.vars <- function(course.df)
{
  vars.to.keep <- c(  
    'acad.stand', 'adm.area', 'bot.level','cMaj', 'coh', 'course.age', 
    'csus.gpa.start', 'cum.percent.units.passed', 'delay.from.hs', 'e.rmd', 
    'eth.erss', 'father.ed', 'fys.flg','gender', 'hous.coh.term.flg', 'hs.gpa', 
    'Instructor_01', 'median.income','m.rmd', 'mother.ed', 'pct.female.head',
    'pell.coh.term.flg', 'prevPAL', 'prevPASS', 'reason',  'sac.county.flg', 
    'term.units.attemptedCensus', 'palN', 'grd.pt.unt', 'chem1b.grd.pt.unt', 
    'chem1b.term.gpa', 'chem1b.units.attempted', 'AP_BIOL', 'AP_CALAB',
    'AP_CALBC', 'AP_CHEM','AP_BIOL.flg',    'AP_CALAB.flg', 'AP_CALBC.flg', 
    'AP_CHEM.flg', 'pct.female.head.flg', 'med.inc.flg'
  )
  new.vars <- intersect(vars.to.keep, names(chem24.dat))
  chem24.final <- chem24.dat[ ,new.vars]
  
  return(chem24.final)
}

## Build a Logistic Regression Model for Propensity Score 
## Fit Propensity Score model (linear terms only)
chem24.step <- function(final.df) {
  min.model <- glm(
    palN ~ chem1b.grd.pt.unt + cum.percent.units.passed + eth.erss + gender,
    data= chem24.final, family=binomial
  )
  summary(min.model)
  
  biggest <- formula(
    glm(palN ~.- grd.pt.unt - acad.stand - reason - pct.female.head.flg, 
        data=chem24.final, family=binomial)
  )
  
  chem24.step.first.order <- step(
    min.model, direction="forward", scope = biggest, trace=FALSE, k=2
    )
  summary(chem24.step.first.order)
  chem24.step.first.order$anova
  
  model.first.order <- formula(chem24.step.first.order)
  chem24.first.order.prop.model <- glm(
    model.first.order, data=chem24.final, family=binomial
  )
 
  return(chem24.first.order.prop.model)
}

Import the Data

Make sure the PAL datafile in the same directory as this RMarkdown file.

PALdatafull <- read_rds("paldatafull_csv.rds")
dim(PALdatafull)
## [1] 1099371     174
sum(PALdatafull$grd.pt.unt)
## [1] 2237555

The files which includes data through the Spring 2019 semester has 1099371 rows and 174 columns. The total of the grd.pt.unt column is 2237555.

Biology classes

Subset data for biology classes

bio.classes <- paste("BIO", c(22, 121, 131, 139, 184))
bio.dat <- PALdatafull %>%
  filter(base.time.course == 1, course %in% bio.classes) %>%
  mutate(course = factor(course, levels = bio.classes)) 
dim(bio.dat)
## [1] 19652   174
num.stu <- dim(bio.dat)[1]
num.vars <- dim(bio.dat)[2]

There are 19652 rows and 174 variables. Each row is a biology student. So, there is a total of 19652 biology students.

Compare the mean gpa for PAL and non-PAL students by Course without Propensity Score Adjustment

The course.seq variables indicate how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.

Note: Excludes incomplete PAL students

get.raw.tab(bio.classes, bio.dat)
Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)
class nonPALavg PALavg Diff NonPAL_Num PAL_Num CompletePAL TermPALStart
BIO 22 1.81 2.33 0.52 949 413 0.29 2158
BIO 121 2.18 2.39 0.21 888 390 0.30 2148
BIO 131 2.33 2.76 0.43 1516 544 0.26 2148
BIO 139 2.41 2.61 0.20 308 98 0.24 2178
BIO 184 2.51 2.73 0.22 509 149 0.22 2178

Data Cleaning & Feature Engineering

Create new variables.
delay.from.hs: delay since high school
cum.percent.units.passed: cumulative percent of units passed
cMaj: census majors without concentrations/specializations/tracks/etc.
county: which county did the student live in at the time of application to Sac state
sac.county.flg: did the student live in Sacramento county at the time of application to Sac State

Collapse sparse categories and other miscellaneous clean up of data. Sparse categories can cause complete separation in logistic regression and are only predictive for a few students.

# Check how many students did not complete PAL
sum(bio.dat$palN==1)
## [1] 106
incl.pal.stu <- sum(bio.dat$palN==1)      
bio.dat <- clean.data(bio.dat)
dim(bio.dat)
## [1] 19546   179

There were 106 biology students who did not complete PAL and were removed from the analysis. There are now 19546 biology students instead of 19652.

There were originally 174 variables in the data set, 5 variables were added, so there are now 179 total variables in the data set.

BIO 22

Executive Summary

Based on data for 413 PAL students and 949 non-PAL students, the unadjusted, raw difference in average grade for PAL and non-PAL students was 0.52 on a A=4.0 grade scale. However, since students self-select into supplemental PAL instruction, it is possible that the resulting PAL and non-PAL groups were not balanced with respect to other characteristics which could impact course grade. For example, if students with better study habits tend to enroll in PAL, all else being equal, the PAL mean grade would be higher than non-PAL– even if PAL had no effect on course grade. Consequently, we also performed a propensity score analysis to adjust the estimated effect of PAL on course grade for potential self-selection biases.

After adjusting for self-selection bias, we found that PAL students earned an average grade \(0.60\pm 0.13\) higher than non-PAL students. A sensitivity analysis indicates that this analysis is moderately sensitive to unknown confounders. Although the data give us sufficient evidence to conclude that PAL increases students’ grades in Bio 22, the existence of an unknown confounder similar in magnitude to living in on-campus housing during their first year, ethnicity, or major would nullify that conclusion.

Detailed Summary

A propensity score analysis was conducted to assess the effect of PAL supplemental instruction on Bio 22 course grade. Propensity score adjustment was necessary since the data are observational and the characteristics of students who voluntarily enroll in PAL may differ in ways that may, independently of PAL, impact course grade compared to students who do not enroll in PAL. In propensity score analysis, variables related to both likelihood of PAL enrollment and course grade (confounders) are used in a logistic regression model to obtain a propensity score, which is a student’s likelihood of enrolling in PAL.

For Bio 22, 16 covariates were found to have a statistically significant relationship to likelihood of enrolling in PAL. Variables related to increased likelihood of enrolling were: female gender, lower AP Calculus exam scores, higher term units attempted, CSUS GPA at start of term, enrollment in PAL in the past, academic major, living in off-campus housing during their first year, fewer years between first term at CSUS and high school graduation, and not living in Sacramento county at the time of application.

Using the propensity score model, all students in the dataset, PAL and non-PAL, are assigned a propensity score. Then, each PAL student is matched to one or more non-PAL students who have similar propensity score(s). After matching, the PAL and matched non-PAL groups are compared to determine if the distribution of each covariate is similar between the two groups. This is called a balance check. If the standardized difference between the non-PAL and PAL means is less than 0.10 then the strong criteria in (Leite 2017, p.10) is met for covariate balance. If the standardized difference is under 0.25, then a more lenient criteria is met. The highest absolute value standardized mean difference in this analysis is 0.0951. Consequently, adequate balance appears to have been achieved.

The difference in the average grade for the matched PAL and non-PAL data is then calculated. The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is \(0.60\pm 0.13\) or between 0.47 and 0.73 on a 4.0 grade scale. This result is statistically significant with a P-value of \(1.28x10^{-6}\) and is based on 326 PAL students and 332 non-PAL students. For comparison, the non-propensity score adjusted difference in average grade for PAL and non-PAL students was 0.52.

The estimated PAL effect is based on the assumption that the propensity model includes all potential confounders for PAL enrollment and grade in Bio 22. However, it is possible that unknown confounders exist. A sensitivity analysis was conducted to determine how strong an unknown confounder must be to nullify the statistically significant PAL effect that was found in this analysis. The sensitivity analysis (Rosenbaum, 2002) indicated that an unknown confounder which increases the odds of being in PAL by more than 1.6 is enough to change the treatment effect from significant to non-significant. Inspection of the covariates in the estimated propensity model for Bio 22 indicates that if there is an unknown confounder that has an effect on the propensity score similar to the effect of living in on-campus housing during their first year, ethnicity, or major observed in this analysis, the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works (which is not in our dataset) is an unknown confounder which could reverse the statistical significance of this analysis.

Additionally, a number of variables were removed from this analysis due to large amounts of missingness. Since all students who had missing information on any included covariate were eliminated from the analysis, a balance had to be struck between retaining a sufficiently large pool of PAL and non-PAL students and retaining a sufficient number of important covariates. Variables which were eliminated from this analysis had substantial missing data or were subjectively judged as unlikely to be confounding. The choices about which variables to retain resulted in the original pool of 413 PAL students in Bio 22 being reduced to 326. Also, 332 non-PAL students were selected out of 949 original non-PAL students.

When a PAL student had more than one suitable match among the non-PAL students, all non-PAL students were taken as matches and weighted appropriately in the final estimated PAL effect. There were 523 non-PAL matches. Of the 326 PAL students, 196 were matched one-to-one with non-PAL students and 130 were matched one-to-many with non-PAL students.

Extract BIO 22 Data

The non-PAL and PAL groups will include students with only first attempts at BIO 22. They will also include students with previous PAL participation and/or are currently in a PAL for another course.

# Excludes course repeats
bio22.dat <- bio.dat %>%
  filter(course=="BIO 22", pass.term.flg == "PASS Term", course.seq== 0)
dim(bio22.dat) # 1362  179
## [1] 1362  179

There are 1,362 BIO 22 first attempt only students.

Collapse ‘cMaj’ variable separately for each course since the amount of collapsing necessary will vary by course.

# Collapsed cMaj categories to Biology and Other majors at 0.07
with(bio22.dat, table(cMaj, palN))
##                                          palN
## cMaj                                        0   1
##   Anthropology                              2   0
##   Biology                                 333 125
##   Business                                  8   1
##   Chemistry                                27   7
##   Child Devel/Early Childhood Ed           14   4
##   Civil Engineering                         1   0
##   Communications                            3   1
##   Computer Engineering                      1   0
##   Computer Science                          6   0
##   Construction Management                   0   1
##   Criminal Justice                          6   2
##   Dance                                     1   1
##   Economics                                 1   0
##   English                                   1   1
##   Family & Consumer Sciences                2   0
##   Film                                      0   1
##   French                                    1   0
##   Gender/Ethnic/Women's Studies             1   0
##   Gerontology                               0   3
##   Graphic Design                            1   2
##   Health Science                            8   5
##   History                                   2   0
##   Interdisciplinary Studies/Special Major   2   0
##   Interior Design                           0   2
##   International Business                    1   0
##   Journalism                                1   0
##   Kinesiology/Physical Education          422 222
##   Liberal Studies                           1   0
##   Mechanical Engineering                    1   0
##   Nursing                                  23  17
##   Nutrition                                 8   2
##   Philosophy                                1   1
##   Physics                                   1   0
##   Psychology                                7   1
##   Recreation Administration                 1   0
##   Social Science                            2   0
##   Sociology                                 2   0
##   Spanish                                   1   0
##   Speech Pathology                          1   2
##   Undeclared                               35  11
bio22.dat <- group_category(data = bio22.dat, feature = "cMaj", threshold = 0.07,  update = TRUE)
with(bio22.dat, table(cMaj, palN))
##                                 palN
## cMaj                               0   1
##   Biology                        333 125
##   Chemistry                       27   7
##   Child Devel/Early Childhood Ed  14   4
##   Kinesiology/Physical Education 422 222
##   Nursing                         23  17
##   OTHER                           75  26
##   Undeclared                      35  11

Analyze missingness

Remove variables having too many missing values in order to retain a larger pool of PAL and non-PAL students.

# Include only variables that are missing values
vars.missing <- get.missing.only(bio22.dat)

missingness.rpt <- profile_missing(vars.missing)
# missingness.rpt[order(missingness.rpt$pct_missing, decreasing = TRUE), ]

# check which variables are missing >5% values
vars.missing.data <- missingness.rpt %>%
  filter(pct_missing > 0.05)
sum(missingness.rpt$pct_missing>0.05)
## [1] 37
vars.missing.data [order(vars.missing.data $pct_missing, decreasing = TRUE), ]
##                        feature num_missing pct_missing
## 19               Instructor_02        1362  1.00000000
## 25                   deg.plan4        1362  1.00000000
## 26                   deg.plan5        1362  1.00000000
## 27                   deg.plan6        1362  1.00000000
## 24                   deg.plan3        1361  0.99926579
## 21             withdraw_reason        1357  0.99632893
## 23                   deg.plan2        1356  0.99559471
## 13                  trf.gpaADM        1083  0.79515419
## 4                  pledge.term        1050  0.77092511
## 20               treat.section         938  0.68869310
## 29                grad.termERS         876  0.64317181
## 22                   deg.plan1         874  0.64170338
## 28                   grad.term         874  0.64170338
## 30                         ttg         874  0.64170338
## 1                fys.term.code         839  0.61600587
## 2                      fys.grd         839  0.61600587
## 3                  fys.rpt.flg         839  0.61600587
## 15 ge.critical.thinking.status         442  0.32452276
## 16      ge.english.comp.status         442  0.32452276
## 17              ge.math.status         442  0.32452276
## 18         ge.oral.comm.status         442  0.32452276
## 14                  admit.term         433  0.31791483
## 7               sat.math.score         361  0.26505140
## 8                 sat.math.flg         361  0.26505140
## 9             sat.verbal.score         361  0.26505140
## 10              sat.verbal.flg         361  0.26505140
## 11               sat.test.date         361  0.26505140
## 12                      hs.gpa         260  0.19089574
## 31                plan.college         244  0.17914831
## 32           plan.college.desc         244  0.17914831
## 33                   plan.dept         244  0.17914831
## 34               plan.deptAbbr         244  0.17914831
## 35                 plan.degree         244  0.17914831
## 36                   plan.type         244  0.17914831
## 37                      county         189  0.13876652
## 5                  m.rmd.admin          82  0.06020558
## 6           m.rmd.admin.detail          82  0.06020558
plot_missing(bio22.dat %>% select(all_of(vars.missing.data$feature)))

# Remove variables missing >5%, except for "sat.math.score", "sat.verbal.score", "hs.gpa", "sat.math.flg"

bio22.dat <- bio22.dat %>%
  select(!all_of(vars.missing.data$feature),
                c("sat.math.score", "sat.verbal.score", "hs.gpa","sat.math.flg"))
dim(bio22.dat) # 1362  146
## [1] 1362  146

37 variables missing >5%
The variables below are important and force included, even though they are missing >5%
“sat.math.score”, “sat.verbal.score”, “hs.gpa”,“sat.math.flg”
Only 33 variables were removed due to missingness and there are now 146 variables instead of 179 variables.

Subset on Complete Cases only in BIO 22 Data

bio22.dat <- bio22.dat[complete.cases(bio22.dat), ]
dim(bio22.dat) # 937 146
## [1] 937 146

937 out of 1362 students are kept
425 students were removed due to missingness of variables

Removed single-valued variables

single.vars <- bio22.dat %>%
  summarise(across(everything(), ~ n_distinct(.x))) %>%
  select_if(. == 1)

# Table of variables with single values
CreateTableOne(vars = names(single.vars), data = bio22.dat)
##                                            
##                                             Overall       
##   n                                           937         
##   country = USA (%)                           937 (100.0) 
##   career.course = UGRD (%)                    937 (100.0) 
##   acad.prog.course = UGD (%)                  937 (100.0) 
##   course (%)                                              
##      BIO 22                                   937 (100.0) 
##      BIO 121                                    0 (  0.0) 
##      BIO 131                                    0 (  0.0) 
##      BIO 139                                    0 (  0.0) 
##      BIO 184                                    0 (  0.0) 
##   component = LEC (%)                         937 (100.0) 
##   units (mean (SD))                          4.00 (0.00)  
##   course.numeric (mean (SD))                22.00 (0.00)  
##   div = Lower Division (%)                    937 (100.0) 
##   course.seq (mean (SD))                     0.00 (0.00)  
##   rpt.flg = First Attempt (%)                 937 (100.0) 
##   c2s = Non-C2S (%)                           937 (100.0) 
##   base.time.course (mean (SD))               1.00 (0.00)  
##   years (mean (SD))                          0.50 (0.00)  
##   withdraw_code = NWD (%)                     937 (100.0) 
##   enrl.flg = Enrolled (%)                     937 (100.0) 
##   enrl.flgERS = Enrolled (%)                  937 (100.0) 
##   rtn.flg = Retained (%)                      937 (100.0) 
##   rtn.flgERS = Retained (%)                   937 (100.0) 
##   pass.term.flg = PASS Term (%)               937 (100.0) 
##   csus.gpa.start.flg = Not Missing (%)        937 (100.0) 
##   higher.ed.gpa.start.flg = Not Missing (%)   937 (100.0)
sum(single.vars) # 21
## [1] 21
# remove single-valued variables
bio22.dat<- bio22.dat %>%
  dplyr::select(-names(single.vars))
dim(bio22.dat) # 937  125
## [1] 937 125

125 out of 146 variables are kept
21 variables removed due to single values

Filter to relevant variables

Sujective judgment was used to narrow the pool of variables down to those likely to be confounders. It’s important to include all variables correlated with outcome even if it is uncertain whether they are related to likeihood of enrolling in PAL. This allows for a more precise estimate of the treatment effect.

bio22.final <- bio22.step.vars(bio22.dat)
names(bio22.final)
##  [1] "acad.stand"                 "adm.area"                   "bot.level"                  "cMaj"                      
##  [5] "coh"                        "course.age"                 "csus.gpa.start"             "cum.percent.units.passed"  
##  [9] "delay.from.hs"              "e.rmd"                      "eth.erss"                   "father.ed"                 
## [13] "fys.flg"                    "gender"                     "hous.coh.term.flg"          "hs.gpa"                    
## [17] "Instructor_01"              "median.income"              "m.rmd"                      "mother.ed"                 
## [21] "pct.female.head"            "pell.coh.term.flg"          "prevPAL"                    "prevPASS"                  
## [25] "reason"                     "sac.county.flg"             "term.units.attemptedCensus" "palN"                      
## [29] "grd.pt.unt"                 "sat.math.score"             "sat.math.flg"               "sat.verbal.score"          
## [33] "AP_BIOL"                    "AP_CALAB"                   "AP_CALBC"                   "AP_CHEM"                   
## [37] "AP_BIOL.flg"                "AP_CALAB.flg"               "AP_CALBC.flg"               "AP_CHEM.flg"               
## [41] "pct.female.head.flg"        "med.inc.flg"

Build a Logistic Regression Model for Propensity Score

Subjectively identified four potential confounders to force the model to retain: cum.percent.units.passed, gender, eth.erss, sat.math.score, sat.verbal.score, and sat.math.flg (same as sat.verbal.flg). Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.

bio22.first.order.prop.model <- bio22.step(bio22.final)
summary(bio22.first.order.prop.model)
## 
## Call:
## glm(formula = model.first.order, family = binomial, data = bio22.final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7671  -0.9137  -0.6395   1.1352   2.3038  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -0.3775015  1.2040989  -0.314 0.753890    
## cum.percent.units.passed           -1.0822415  1.0554624  -1.025 0.305188    
## eth.erssAsian                      -0.4634201  0.3721018  -1.245 0.212980    
## eth.erssForeign                    -0.6166642  0.8106835  -0.761 0.446853    
## eth.erssHispanic                   -0.0583669  0.3641114  -0.160 0.872645    
## eth.erssPacific Islander            0.9985099  0.7949037   1.256 0.209065    
## eth.erssTwo or More Races          -0.5168364  0.4579780  -1.129 0.259101    
## eth.erssUnknown                    -0.1312048  0.5574675  -0.235 0.813930    
## eth.erssWhite                      -0.4577725  0.3921462  -1.167 0.243068    
## genderMale                         -0.2872814  0.1562340  -1.839 0.065946 .  
## sat.math.score                     -0.0006397  0.0014134  -0.453 0.650813    
## sat.verbal.score                   -0.0021488  0.0013662  -1.573 0.115751    
## sat.math.flgold                    -0.3615956  0.2329028  -1.553 0.120528    
## AP_CALAB                           -0.3784461  0.1622482  -2.333 0.019674 *  
## AP_CALAB.flgNot Missing            -0.0812130  0.2570992  -0.316 0.752092    
## term.units.attemptedCensus          0.1419638  0.0383030   3.706 0.000210 ***
## csus.gpa.start                      0.5422919  0.1893888   2.863 0.004191 ** 
## prevPAL                             0.3630843  0.0838341   4.331 1.48e-05 ***
## cMajChemistry                       0.2181870  0.5406120   0.404 0.686512    
## cMajChild Devel/Early Childhood Ed -0.4823010  0.8039725  -0.600 0.548575    
## cMajKinesiology/Physical Education  0.8378476  0.2289283   3.660 0.000252 ***
## cMajNursing                         1.4989657  0.4443443   3.373 0.000742 ***
## cMajOTHER                           0.1981652  0.3601025   0.550 0.582112    
## cMajUndeclared                      0.4871465  0.4576017   1.065 0.287073    
## hous.coh.term.flgOn-Campus Housing -0.5708439  0.2003577  -2.849 0.004384 ** 
## delay.from.hs                      -0.1437080  0.0719453  -1.997 0.045775 *  
## sac.county.flg1                    -0.3061509  0.1718863  -1.781 0.074892 .  
## prevPASS                            0.2165654  0.1462875   1.480 0.138764    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1213.4  on 936  degrees of freedom
## Residual deviance: 1092.9  on 909  degrees of freedom
## AIC: 1148.9
## 
## Number of Fisher Scoring iterations: 4
p.score <- bio22.first.order.prop.model$fitted.values
bio22.covs <-  names(bio22.first.order.prop.model %>%  pluck("model") %>% dplyr::select(-palN))

Propensity Score Matching

Before matching

# Unadjusted mean grades
get.unadj.means(bio22.final)
Unadjusted Mean Grades
Non-PAL PAL Diff.
1.760099 2.306402 0.5463039
# Variable Summary Table for unmatched data
unmatched.tab <- CreateTableOne(vars = bio22.covs, strata = "palN",
                             data = bio22.final, smd = TRUE, test= FALSE)
print(unmatched.tab, smd = TRUE)
##                                            Stratified by palN
##                                             0              1              SMD   
##   n                                            609            328               
##   cum.percent.units.passed (mean (SD))        0.91 (0.10)    0.91 (0.09)   0.007
##   eth.erss (%)                                                             0.259
##      African American                           26 ( 4.3)      16 ( 4.9)        
##      Asian                                     218 (35.8)      96 (29.3)        
##      Foreign                                     8 ( 1.3)       3 ( 0.9)        
##      Hispanic                                  178 (29.2)     130 (39.6)        
##      Pacific Islander                            4 ( 0.7)       5 ( 1.5)        
##      Two or More Races                          44 ( 7.2)      17 ( 5.2)        
##      Unknown                                    16 ( 2.6)       9 ( 2.7)        
##      White                                     115 (18.9)      52 (15.9)        
##   gender = Male (%)                            273 (44.8)     115 (35.1)   0.200
##   sat.math.score (mean (SD))                504.93 (79.84) 495.43 (79.20)  0.119
##   sat.verbal.score (mean (SD))              487.27 (78.52) 478.84 (72.64)  0.111
##   sat.math.flg = old (%)                       538 (88.3)     270 (82.3)   0.171
##   AP_CALAB (mean (SD))                        2.58 (0.52)    2.48 (0.53)   0.200
##   AP_CALAB.flg = Not Missing (%)                79 (13.0)      49 (14.9)   0.057
##   term.units.attemptedCensus (mean (SD))     13.57 (2.24)   14.27 (1.93)   0.330
##   csus.gpa.start (mean (SD))                  3.01 (0.53)    3.12 (0.49)   0.212
##   prevPAL (mean (SD))                         0.52 (1.06)    0.71 (1.29)   0.158
##   cMaj (%)                                                                 0.291
##      Biology                                   215 (35.3)      90 (27.4)        
##      Chemistry                                  14 ( 2.3)       6 ( 1.8)        
##      Child Devel/Early Childhood Ed             13 ( 2.1)       2 ( 0.6)        
##      Kinesiology/Physical Education            283 (46.5)     188 (57.3)        
##      Nursing                                    15 ( 2.5)      15 ( 4.6)        
##      OTHER                                      45 ( 7.4)      17 ( 5.2)        
##      Undeclared                                 24 ( 3.9)      10 ( 3.0)        
##   hous.coh.term.flg = On-Campus Housing (%)    160 (26.3)      75 (22.9)   0.079
##   delay.from.hs (mean (SD))                   2.83 (1.56)    2.47 (1.26)   0.255
##   sac.county.flg = 1 (%)                       294 (48.3)     140 (42.7)   0.112
##   prevPASS (mean (SD))                        0.23 (0.58)    0.26 (0.58)   0.040

Check how many variables have SMD > 0.1

addmargins(table(ExtractSmd(unmatched.tab) > 0.1))
## 
## FALSE  TRUE   Sum 
##     4    12    16
get.imbal.vars(unmatched.tab)
Variables with SMD > 0.1
Variable Before Matching SMD
term.units.attemptedCensus 0.3299348
cMaj 0.2912810
eth.erss 0.2585443
delay.from.hs 0.2553547
csus.gpa.start 0.2118672
gender 0.2004051
AP_CALAB 0.1998329
sat.math.flg 0.1708929
prevPAL 0.1582117
sat.math.score 0.1194512
sac.county.flg 0.1124963
sat.verbal.score 0.1114884

12 variables have SMD >0.1

Implement a propensity score matching method.

match.bio22 <- with(bio22.final, Match(
  Y=bio22.final$grd.pt.unt, Tr = bio22.final$palN, X = p.score,
  BiasAdjust = F, estimand = "ATT",  M=1, caliper=0.25, replace = TRUE, ties = TRUE))

After matching

Standardized mean differences for continuous variables and categorical variables.

# Needed for match table
bio22.final <- bio22.final %>%
  rownames_to_column(var = "id")

# Matched data
bio22.matched.dat <- bio22.final[unlist(match.bio22[c("index.treated", "index.control")]), ]
bio22.matched.dat$match.weights<-  c(match.bio22$weights, match.bio22$weights)

# Add match weights to matched data
weighted.dat<-svydesign(id=~1,weights=~match.weights, data = bio22.matched.dat)
# Variable Summary Table for matched data with match weights
matched.tab <-svyCreateTableOne(vars = bio22.covs,  strata = "palN", data= weighted.dat, smd = TRUE, test = FALSE)
print(matched.tab, smd = TRUE)
##                                            Stratified by palN
##                                             0              1              SMD   
##   n                                         326.00         326.00               
##   cum.percent.units.passed (mean (SD))        0.92 (0.09)    0.91 (0.09)   0.094
##   eth.erss (%)                                                             0.131
##      African American                         17.6 ( 5.4)    16.0 ( 4.9)        
##      Asian                                   111.8 (34.3)    96.0 (29.4)        
##      Foreign                                   3.2 ( 1.0)     3.0 ( 0.9)        
##      Hispanic                                112.2 (34.4)   128.0 (39.3)        
##      Pacific Islander                          3.8 ( 1.2)     5.0 ( 1.5)        
##      Two or More Races                        19.4 ( 6.0)    17.0 ( 5.2)        
##      Unknown                                   9.7 ( 3.0)     9.0 ( 2.8)        
##      White                                    48.3 (14.8)    52.0 (16.0)        
##   gender = Male (%)                          119.2 (36.6)   115.0 (35.3)   0.027
##   sat.math.score (mean (SD))                502.55 (79.44) 495.64 (79.29)  0.087
##   sat.verbal.score (mean (SD))              484.86 (78.53) 479.51 (72.31)  0.071
##   sat.math.flg = old (%)                     261.2 (80.1)   268.0 (82.2)   0.053
##   AP_CALAB (mean (SD))                        2.47 (0.60)    2.48 (0.53)   0.009
##   AP_CALAB.flg = Not Missing (%)              59.0 (18.1)    49.0 (15.0)   0.083
##   term.units.attemptedCensus (mean (SD))     14.44 (2.06)   14.26 (1.94)   0.088
##   csus.gpa.start (mean (SD))                  3.11 (0.48)    3.12 (0.49)   0.014
##   prevPAL (mean (SD))                         0.70 (1.31)    0.68 (1.24)   0.013
##   cMaj (%)                                                                 0.124
##      Biology                                  88.5 (27.2)    89.0 (27.3)        
##      Chemistry                                 4.6 ( 1.4)     6.0 ( 1.8)        
##      Child Devel/Early Childhood Ed            1.5 ( 0.5)     2.0 ( 0.6)        
##      Kinesiology/Physical Education          181.4 (55.7)   187.0 (57.4)        
##      Nursing                                  24.0 ( 7.4)    15.0 ( 4.6)        
##      OTHER                                    17.1 ( 5.3)    17.0 ( 5.2)        
##      Undeclared                                8.8 ( 2.7)    10.0 ( 3.1)        
##   hous.coh.term.flg = On-Campus Housing (%)   80.2 (24.6)    75.0 (23.0)   0.038
##   delay.from.hs (mean (SD))                   2.45 (1.39)    2.46 (1.26)   0.010
##   sac.county.flg = 1 (%)                     136.9 (42.0)   140.0 (42.9)   0.019
##   prevPASS (mean (SD))                        0.23 (0.59)    0.25 (0.57)   0.030

Balance Check

Continuous variables: Standardized mean differences are computed by using the standard deviation of treated group Binary variables: Raw differences in proportion

All variables are balanced and under the <0.1 mean threshold.

bio22.bal <- bal.tab(match.bio22, formula = f.build("palN", bio22.covs), data = bio22.final,
        distance = ~ p.score, thresholds = c(m = .1), un = TRUE, imbalanced.only = TRUE)
bio22.bal
## Balance Measures
## All covariates are balanced.
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        30
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##                  Variable Diff.Adj    M.Threshold
##  cum.percent.units.passed  -0.0951 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                    609.      328
## Matched (ESS)          169.3     326
## Matched (Unweighted)   332.      326
## Unmatched              277.        0
## Discarded                0.        2

Check variable percent improvement

get.var.perc.tab(bio22.bal)
##                               Variable      Diff.Un      Diff.Adj % Improvement
## 1                              p.score  0.757140033 -5.483664e-05           100
## 2                         cMaj_Biology -0.078647523  1.431493e-03            98
## 3                           cMaj_OTHER -0.022062357 -4.601227e-04            98
## 4                        delay.from.hs -0.287221743  1.036247e-02            96
## 5                             AP_CALAB -0.198548550  1.004536e-02            95
## 6                       csus.gpa.start  0.220446905  1.400119e-02            94
## 7                              prevPAL  0.144568525 -1.297841e-02            91
## 8  cMaj_Child Devel/Early Childhood Ed -0.015248909  1.533742e-03            90
## 9                     eth.erss_Foreign -0.003989948 -5.112474e-04            87
## 10                         gender_Male -0.097666106 -1.303681e-02            87
## 11 cMaj_Kinesiology/Physical Education  0.108474508  1.702454e-02            84
## 12                      sac.county.flg -0.055929352  9.406953e-03            83
## 13          term.units.attemptedCensus  0.356927519 -9.032332e-02            75
## 14                    sat.math.flg_old -0.060244703  2.091002e-02            65
## 15          eth.erss_Two or More Races -0.020420321 -7.413088e-03            64
## 16                      eth.erss_White -0.030297569  1.140082e-02            62
## 17           eth.erss_Pacific Islander  0.008675758  3.578732e-03            59
## 18                     cMaj_Undeclared -0.008921062  3.834356e-03            57
## 19                   eth.erss_Hispanic  0.104059033  4.836401e-02            54
## 20 hous.coh.term.flg_On-Campus Housing -0.034067243 -1.610429e-02            53
## 21                    sat.verbal.score -0.116085624 -7.372148e-02            36
## 22                      sat.math.score -0.119934927 -8.720503e-02            27
## 23                      eth.erss_Asian -0.065280948 -4.831288e-02            26
## 24                            prevPASS  0.039806845  3.026697e-02            24
## 25           eth.erss_African American  0.006087549 -5.061350e-03            17
## 26                      cMaj_Chemistry -0.004695823  4.243354e-03            10
## 27                        cMaj_Nursing  0.021101165 -2.760736e-02           -31
## 28            AP_CALAB.flg_Not Missing  0.019669390 -3.072597e-02           -56
## 29                    eth.erss_Unknown  0.001166446 -2.044990e-03           -75
## 30            cum.percent.units.passed  0.007863796 -9.507585e-02         -1109

Check covariate balance visually

get.bal.plot(unmatched.tab, matched.tab)

love.plot(bio22.bal,binary = "raw", stars = "std", var.order = "unadjusted",
            thresholds = c(m = .1), abs = F)

Compare single and multiple matches for PAL and non-PAL

create.match.tab(bio22.matched.dat)
PAL and Non-PAL Matches
Non-PAL PAL
Single Matches 213 196
Multiple Matches 119 130
Total Students 332 326

Out of 328 PAL students, 326 were matched and 2 were unable to be matched. Out of 326 PAL student matches, 196 PAL students were matched to one non-PAL student and 130 PAL students were matched to multiple non-PAL students.

Out of 523 non-PAL student matches, there were 332 non-PAL students, 213 of the non-PAL students were matched to one PAL student and 119 of the non-PAL students were matched to multiple PAL students.

Plot of Propensity Scores for Average Treatment Effect Among the Treated (ATT)

get.att.plot(bio22.final, match.bio22)

Assess balance with prognostic score

The standardized mean differences of the prognostic scores is -0.0171, which indicates balance. All variables are under the 0.01 mean difference threshold. It is likely that the effect estimate will be relatively unbiased, since the estimated prognostic score is balanced.

# Outcome model of non-PAL group
ctrl.data <- bio22.final %>% filter(palN == 0)
ctrl.fit <- glm(f.build("grd.pt.unt", bio22.covs), data = ctrl.data)
summary(ctrl.fit)
## 
## Call:
## glm(formula = f.build("grd.pt.unt", bio22.covs), data = ctrl.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7560  -0.8716   0.0303   0.8395   4.1864  
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        -3.1402550  0.7201409  -4.361 1.53e-05 ***
## cum.percent.units.passed           -0.1622929  0.6399986  -0.254 0.799907    
## eth.erssAsian                      -0.0779431  0.2438373  -0.320 0.749347    
## eth.erssForeign                     0.3085674  0.4645496   0.664 0.506807    
## eth.erssHispanic                   -0.1654707  0.2422559  -0.683 0.494853    
## eth.erssPacific Islander           -0.2477167  0.6156367  -0.402 0.687556    
## eth.erssTwo or More Races           0.0802446  0.2883046   0.278 0.780856    
## eth.erssUnknown                     0.3015452  0.3699353   0.815 0.415332    
## eth.erssWhite                       0.2307871  0.2558751   0.902 0.367456    
## genderMale                          0.0716462  0.0978066   0.733 0.464141    
## sat.math.score                      0.0019424  0.0008767   2.216 0.027112 *  
## sat.verbal.score                    0.0002403  0.0008407   0.286 0.775085    
## sat.math.flgold                    -0.0241787  0.1607446  -0.150 0.880488    
## AP_CALAB                           -0.0216909  0.0911965  -0.238 0.812083    
## AP_CALAB.flgNot Missing             0.0779702  0.1512937   0.515 0.606500    
## term.units.attemptedCensus          0.0214280  0.0217106   0.987 0.324062    
## csus.gpa.start                      1.1731403  0.1167655  10.047  < 2e-16 ***
## prevPAL                             0.1710735  0.0535187   3.197 0.001466 ** 
## cMajChemistry                       0.2927482  0.3206663   0.913 0.361654    
## cMajChild Devel/Early Childhood Ed  0.3579739  0.3331423   1.075 0.283028    
## cMajKinesiology/Physical Education -0.5104136  0.1382659  -3.692 0.000244 ***
## cMajNursing                        -0.3789272  0.3221003  -1.176 0.239906    
## cMajOTHER                          -0.3813397  0.2029681  -1.879 0.060770 .  
## cMajUndeclared                     -0.0800524  0.2693843  -0.297 0.766445    
## hous.coh.term.flgOn-Campus Housing -0.0613837  0.1248750  -0.492 0.623215    
## delay.from.hs                       0.1497663  0.0376607   3.977 7.87e-05 ***
## sac.county.flg1                    -0.1025577  0.1086385  -0.944 0.345548    
## prevPASS                           -0.0261742  0.0910270  -0.288 0.773799    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.264761)
## 
##     Null deviance: 1275.90  on 608  degrees of freedom
## Residual deviance:  734.83  on 581  degrees of freedom
## AIC: 1900.6
## 
## Number of Fisher Scoring iterations: 2
# Predict values for both non-PAL and PAL groups
bio22.final$prog.score <- predict(ctrl.fit, bio22.final)

# Compare the standardized mean differences of the prognostic scores in the non-PAL and PAL groups
# p.score is at the bottom of the table
prog.bal <-bal.tab(match.bio22, formula = f.build("palN", bio22.covs), data = bio22.final, distance = bio22.final["prog.score"], sd.denom = "treated", addl = "p.score", thresholds = c(m = .1))
prog.bal
## Balance Measures
##                                         Type Diff.Adj    M.Threshold
## prog.score                          Distance  -0.0171 Balanced, <0.1
## cum.percent.units.passed             Contin.  -0.0951 Balanced, <0.1
## eth.erss_African American             Binary  -0.0051 Balanced, <0.1
## eth.erss_Asian                        Binary  -0.0483 Balanced, <0.1
## eth.erss_Foreign                      Binary  -0.0005 Balanced, <0.1
## eth.erss_Hispanic                     Binary   0.0484 Balanced, <0.1
## eth.erss_Pacific Islander             Binary   0.0036 Balanced, <0.1
## eth.erss_Two or More Races            Binary  -0.0074 Balanced, <0.1
## eth.erss_Unknown                      Binary  -0.0020 Balanced, <0.1
## eth.erss_White                        Binary   0.0114 Balanced, <0.1
## gender_Male                           Binary  -0.0130 Balanced, <0.1
## sat.math.score                       Contin.  -0.0872 Balanced, <0.1
## sat.verbal.score                     Contin.  -0.0737 Balanced, <0.1
## sat.math.flg_old                      Binary   0.0209 Balanced, <0.1
## AP_CALAB                             Contin.   0.0100 Balanced, <0.1
## AP_CALAB.flg_Not Missing              Binary  -0.0307 Balanced, <0.1
## term.units.attemptedCensus           Contin.  -0.0903 Balanced, <0.1
## csus.gpa.start                       Contin.   0.0140 Balanced, <0.1
## prevPAL                              Contin.  -0.0130 Balanced, <0.1
## cMaj_Biology                          Binary   0.0014 Balanced, <0.1
## cMaj_Chemistry                        Binary   0.0042 Balanced, <0.1
## cMaj_Child Devel/Early Childhood Ed   Binary   0.0015 Balanced, <0.1
## cMaj_Kinesiology/Physical Education   Binary   0.0170 Balanced, <0.1
## cMaj_Nursing                          Binary  -0.0276 Balanced, <0.1
## cMaj_OTHER                            Binary  -0.0005 Balanced, <0.1
## cMaj_Undeclared                       Binary   0.0038 Balanced, <0.1
## hous.coh.term.flg_On-Campus Housing   Binary  -0.0161 Balanced, <0.1
## delay.from.hs                        Contin.   0.0104 Balanced, <0.1
## sac.county.flg                        Binary   0.0094 Balanced, <0.1
## prevPASS                             Contin.   0.0303 Balanced, <0.1
## p.score                              Contin.  -0.0001 Balanced, <0.1
## 
## Balance tally for mean differences
##                    count
## Balanced, <0.1        31
## Not Balanced, >0.1     0
## 
## Variable with the greatest mean difference
##                  Variable Diff.Adj    M.Threshold
##  cum.percent.units.passed  -0.0951 Balanced, <0.1
## 
## Sample sizes
##                      Control Treated
## All                    609.      328
## Matched (ESS)          169.3     326
## Matched (Unweighted)   332.      326
## Unmatched              277.        0
## Discarded                0.        2

Estimate Difference Between Mean grade in BIO 22 of PAL and non-PAL students

The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is 0.5987 . This result is statistically significant with a P-value of \(2.556x10^{-6}\) and is based on 326 PAL students and 523 non-PAL student matches(332 total non-PAL students). Note this P-value is for a two-tailed test, but it will be corrected to a one-tailed test (halves the P-value) in the final table output summarizing the effect of PAL across chemistry courses.

summary(match.bio22)
## 
## Estimate...  0.5987 
## AI SE......  0.12728 
## T-stat.....  4.7036 
## p.val......  2.556e-06 
## 
## Original number of observations..............  937 
## Original number of treated obs...............  328 
## Matched number of observations...............  326 
## Matched number of observations  (unweighted).  523 
## 
## Caliper (SDs)........................................   0.25 
## Number of obs dropped by 'exact' or 'caliper'  2

Sensitivity Analysis

psens(match.bio22, Gamma=2.0, GammaInc = 0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0000
##    1.2           0      0.0001
##    1.3           0      0.0010
##    1.4           0      0.0078
##    1.5           0      0.0363
##    1.6           0      0.1123
##    1.7           0      0.2517
##    1.8           0      0.4380
##    1.9           0      0.6290
##    2.0           0      0.7852
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 

Note that in the above table \(\Gamma=1.6\) in the first column is the first row where 0.05 is between the Lower and Upper bounds. This means that an unknown confounder which increases the odds of being in PAL by more than 1.6 is enough to change the treatment effect from significant to non-significant. The next code block generates the effect on the odds ratio of each variable in the propensity score. Thus, if there is an unknown confounder that has an effect on the propensity score similar to “cMaj” or “csus.gpa.start” the PAL effect would become non-significant. Thus, this finding is sensitive to unknown confounders. It is possible a variable like the number of hours per week a student works which is not in our dataset is a confounder which could reverse the statistical significance of this analysis.

kable(sort(exp(abs(bio22.first.order.prop.model$coefficients))))
x
sat.math.score 1.000640
sat.verbal.score 1.002151
eth.erssHispanic 1.060104
AP_CALAB.flgNot Missing 1.084602
eth.erssUnknown 1.140201
term.units.attemptedCensus 1.152535
delay.from.hs 1.154547
cMajOTHER 1.219164
prevPASS 1.241804
cMajChemistry 1.243820
genderMale 1.332799
sac.county.flg1 1.358187
sat.math.flgold 1.435618
prevPAL 1.437757
(Intercept) 1.458636
AP_CALAB 1.460014
eth.erssWhite 1.580549
eth.erssAsian 1.589501
cMajChild Devel/Early Childhood Ed 1.619797
cMajUndeclared 1.627665
eth.erssTwo or More Races 1.676715
csus.gpa.start 1.719944
hous.coh.term.flgOn-Campus Housing 1.769760
eth.erssForeign 1.852737
cMajKinesiology/Physical Education 2.311386
eth.erssPacific Islander 2.714234
cum.percent.units.passed 2.951287
cMajNursing 4.477056

Propensity Score Adjusted Mean Grades

Adjusted Mean Grades
Course Non-PAL PAL Diff. Std. error p-val Sensitivity N(non-PAL) N(PAL)
BIO 22 1.71 2.31 0.6 0.13 1.28e-06 1.6 332 326

References

Greifer, Noah. 2020. Cobalt: Covariate Balance Tables and Plots. https://CRAN.R-project.org/package=cobalt.

Leite, W. L. 2017. Practical Propensity Score Methods Using R. Thousand Oaks, CA: Sage Publishing. https://osf.io/nygb5/.

Sekhon, Jasjeet S. 2011. “Multivariate and Propensity Score Matching Software with Automated Balance Optimization: The Matching Package for R.” Journal of Statistical Software 42 (7): 1–52. http://www.jstatsoft.org/v42/i07/.

Yoshida, Kazuki, and Alexander Bartel. 2020. Tableone: Create ’Table 1’ to Describe Baseline Characteristics with or Without Propensity Score Weights. https://CRAN.R-project.org/package=tableone.

Zhang, Z., H. J. Kim, G. Lonjon, Y. Zhu, and written on behalf of AME Big-Data Clinical Trial Collaborative Group. 2019. “Balance Diagnostics After Propensity Score Matching.” Annals of Translational Medicine 7 (1): 16. https://doi.org/10.21037/atm.2018.12.10.