Data Analysis

First Add Some R Packages to the Workspace\ Caution: warning messages are suppressed to reduce clutter in the output.

defaultW <- getOption("warn") 
options(warn = -1) 


library(rbounds)
library(readr)
library(tidyr)
library(tidyverse)
library(MatchIt)
library(cobalt)
library(backports)
library(DataExplorer)
library(readxl)
library(Matching)

options(warn = defaultW)

Source a script

The script contains a user-defined function called id.vars.zero.freq to identify variables causing complete separation in logistic regression.

source("IdentifyCompleteSepVars2.R")

Import the Data

Make sure the PAL datafile in the same directory as this RMarkdown file. The files which includes data through the Spring 2019 semester has 1,099,371 rows and 174 columns. The total of the grd.pt.unt column is 2,237,555.

load("~/PAL data analysis/PALdatafull.Rdata")
dim(PALdatafull)
## [1] 1099371     174
sum(PALdatafull$grd.pt.unt)
## [1] 2237555

Subset Data for Biology

Each instance of a course taken by a student is included in the data file for the semester during which it was taken and every semester thereafter. 'test1' below filters so the dataset only includes the semester during which the course was taken.

bio.classes <- paste("BIO", c(22, 121, 131, 139, 184))

# test1 filters on the semester in which the course was taken; courses are included in the data file for the semester in which it was taken and every semester thereafter

test1 <- PALdatafull$base.time.course==1
test.bio <- PALdatafull$course %in% bio.classes

bio.indx <- test1 & test.bio

bio.dat <- PALdatafull[bio.indx, ]
dim(bio.dat)
## [1] 19652   174
# [1] 19652   174

Compare the mean gpa for PAL and non-PAL students for Biology classes without Propensity Score Adjustment

The course.seq variables indicates how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.

raw.table = data.frame(class=character(),
                       nonPALavg=numeric(),
                       PALavg=numeric(), 
                       Diff=numeric(), 
                       NonPAL_Num= integer(),
                       PAL_Num=integer(),
                       CompletePAL=numeric(),
                       TermPALStart=integer(),
                       row.names=NULL,
                       stringsAsFactors = FALSE)
eth.tables = list()

for (i in 1:length(bio.classes))
    {
      curr.class = bio.classes[i]
      temp = subset(bio.dat, course==curr.class & course.seq==0)
      pal.start=min(unique(temp$term.code[temp$palN==2]))
      # only include terms after PAL start term
      temp = subset(temp, term.code>= pal.start)
      
      x=tapply(temp$grd.pt.unt,temp$palN, 
               mean, na.rm=T) %>% 
        as.numeric %>% 
        round(2)
      
      y=table(temp$palN) %>% as.numeric
      
      raw.table[i, 'class' ] = curr.class
      raw.table[i, c(2:4,7)]=c(x[1], x[3],x[3]-x[1],
                        round(y[3]/sum(y),2))
      raw.table[i, c(5,6,8)]= c(y[1], y[3], pal.start)
      
      #participation by ethnicity
      eth.tables[[i]] <- with(temp, table(eth.erss, palN)[,-c(2)]) #remove a few students who started PAL but did not finish
      colnames(eth.tables[[i]])<- c("nonPAL","PAL")
     
}
names(eth.tables) <- bio.classes
#counts
eth.tables
## $`BIO 22`
##                    palN
## eth.erss            nonPAL PAL
##   African American      47  23
##   Asian                302 110
##   Foreign               15   7
##   Hispanic             253 154
##   Native American        1   0
##   Pacific Islander       5   5
##   Two or More Races     62  22
##   Unknown               33  15
##   White                207  75
## 
## $`BIO 121`
##                    palN
## eth.erss            nonPAL PAL
##   African American      28  24
##   Asian                287 110
##   Foreign               17  12
##   Hispanic             181  96
##   Native American        1   0
##   Pacific Islander       5   7
##   Two or More Races     68  16
##   Unknown               44  15
##   White                222  96
## 
## $`BIO 131`
##                    palN
## eth.erss            nonPAL PAL
##   African American      60  15
##   Asian                438 163
##   Foreign               19  13
##   Hispanic             346 152
##   Native American        4   0
##   Pacific Islander       6   3
##   Two or More Races    105  33
##   Unknown               70  20
##   White                418 133
## 
## $`BIO 139`
##                    palN
## eth.erss            nonPAL PAL
##   African American      16   2
##   Asian                105  31
##   Foreign                6   4
##   Hispanic              66  25
##   Native American        0   0
##   Pacific Islander       1   0
##   Two or More Races     11   9
##   Unknown               15   6
##   White                 77  21
## 
## $`BIO 184`
##                    palN
## eth.erss            nonPAL PAL
##   African American      24   6
##   Asian                147  33
##   Foreign               17   4
##   Hispanic             114  55
##   Native American        0   0
##   Pacific Islander       1   0
##   Two or More Races     30   8
##   Unknown               22  10
##   White                146  33
# each column is 100%
# percent.eth.within.PAL.and.within.non = lapply(eth.tables, prop.table, margin=2) %>% lapply(round, digits=2)

# percent.eth.within.PAL.and.within.non

# each row is 100%
 percent.PAL.and.non.within.ethnicity = lapply(eth.tables, prop.table, margin=1) %>% lapply(round, digits=2)

 percent.PAL.and.non.within.ethnicity
## $`BIO 22`
##                    palN
## eth.erss            nonPAL  PAL
##   African American    0.67 0.33
##   Asian               0.73 0.27
##   Foreign             0.68 0.32
##   Hispanic            0.62 0.38
##   Native American     1.00 0.00
##   Pacific Islander    0.50 0.50
##   Two or More Races   0.74 0.26
##   Unknown             0.69 0.31
##   White               0.73 0.27
## 
## $`BIO 121`
##                    palN
## eth.erss            nonPAL  PAL
##   African American    0.54 0.46
##   Asian               0.72 0.28
##   Foreign             0.59 0.41
##   Hispanic            0.65 0.35
##   Native American     1.00 0.00
##   Pacific Islander    0.42 0.58
##   Two or More Races   0.81 0.19
##   Unknown             0.75 0.25
##   White               0.70 0.30
## 
## $`BIO 131`
##                    palN
## eth.erss            nonPAL  PAL
##   African American    0.80 0.20
##   Asian               0.73 0.27
##   Foreign             0.59 0.41
##   Hispanic            0.69 0.31
##   Native American     1.00 0.00
##   Pacific Islander    0.67 0.33
##   Two or More Races   0.76 0.24
##   Unknown             0.78 0.22
##   White               0.76 0.24
## 
## $`BIO 139`
##                    palN
## eth.erss            nonPAL  PAL
##   African American    0.89 0.11
##   Asian               0.77 0.23
##   Foreign             0.60 0.40
##   Hispanic            0.73 0.27
##   Native American              
##   Pacific Islander    1.00 0.00
##   Two or More Races   0.55 0.45
##   Unknown             0.71 0.29
##   White               0.79 0.21
## 
## $`BIO 184`
##                    palN
## eth.erss            nonPAL  PAL
##   African American    0.80 0.20
##   Asian               0.82 0.18
##   Foreign             0.81 0.19
##   Hispanic            0.67 0.33
##   Native American              
##   Pacific Islander    1.00 0.00
##   Two or More Races   0.79 0.21
##   Unknown             0.69 0.31
##   White               0.82 0.18
# formatted table
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
kable(raw.table, caption = "Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)")
Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)
class nonPALavg PALavg Diff NonPAL_Num PAL_Num CompletePAL TermPALStart
BIO 22 1.81 2.33 0.52 949 413 0.29 2158
BIO 121 2.18 2.39 0.21 888 390 0.30 2148
BIO 131 2.33 2.76 0.43 1516 544 0.26 2148
BIO 139 2.41 2.61 0.20 308 98 0.24 2178
BIO 184 2.51 2.73 0.22 509 149 0.22 2178

Clean up data and engineer additional features

yr.course.taken = as.numeric(gsub(".*([0-9]{4})","\\1",bio.dat$coh.term))

bio.dat$delay.from.hs = ifelse(!is.na(yr.course.taken) & !is.na(bio.dat$hs.grad.yr), yr.course.taken-bio.dat$hs.grad.yr, NA)

sum(is.na(bio.dat$delay.from.hs))  #1864 missing values
## [1] 1864
# remove 106 students who did not complete PAL
table(bio.dat$palN)
## 
##     0     1     2 
## 17778   106  1768
bio.dat=subset(bio.dat, palN!=1)
table(bio.dat$palN)
## 
##     0     2 
## 17778  1768
#recode palN to factor with 0/1 levels
bio.dat$palN = ifelse(bio.dat$palN==2, 1, 0)

#clean up category names in m.rmd and e.rmd
bio.dat$m.rmd[bio.dat$m.rmd=="Not Remedial\nin Math"]="Not Remedial in Math"

bio.dat$m.rmd[bio.dat$m.rmd=="Remedial\nin Math"]="Remedial in Math"

bio.dat$m.rmd = droplevels(bio.dat$m.rmd)

bio.dat$e.rmd[bio.dat$e.rmd=="Not Remedial\nin English"]="Not Remedial in English"

bio.dat$e.rmd[bio.dat$e.rmd=="Remedial\nin English"]="Remedial in English"

bio.dat$e.rmd = droplevels(bio.dat$e.rmd)
table(bio.dat$e.rmd)
## 
## Not Remedial in English     Remedial in English 
##                   15474                    3704
# create feature, proportion of cumulative units taken that were passes
bio.dat$cum.percent.units.passed =  bio.dat$tot.passd.prgrss.start/bio.dat$tot.taken.prgrss.start

# collapse sparse categories

bio.dat=group_category(data = bio.dat, feature = "load.code", threshold = 0.05, update = TRUE)

table(bio.dat$load.code, bio.dat$palN, droplevels(bio.dat$course))
## , ,  = BIO 121
## 
##        
##            0    1
##   F     2104  383
##   H      428   13
##   OTHER  308   61
## 
## , ,  = BIO 131
## 
##        
##            0    1
##   F     3127  491
##   H      486   19
##   OTHER  462   68
## 
## , ,  = BIO 139
## 
##        
##            0    1
##   F     2315   87
##   H      402    1
##   OTHER  340   15
## 
## , ,  = BIO 184
## 
##        
##            0    1
##   F     2751  144
##   H      417    5
##   OTHER  339    6
## 
## , ,  = BIO 22
## 
##        
##            0    1
##   F     3332  430
##   H      496    9
##   OTHER  471   36
bio.dat=group_category(data = bio.dat, feature = "addr.type", threshold = 0.05, update = TRUE)
table(bio.dat$addr.type, bio.dat$palN, droplevels(bio.dat$course))
## , ,  = BIO 121
## 
##        
##            0    1
##   OTHER 1263  101
##   PERM  1577  356
## 
## , ,  = BIO 131
## 
##        
##            0    1
##   OTHER 1608  103
##   PERM  2467  475
## 
## , ,  = BIO 139
## 
##        
##            0    1
##   OTHER 1530   19
##   PERM  1527   84
## 
## , ,  = BIO 184
## 
##        
##            0    1
##   OTHER 1411   20
##   PERM  2096  135
## 
## , ,  = BIO 22
## 
##        
##            0    1
##   OTHER 1627   70
##   PERM  2672  405
# code instructor as number for anonymity ----------------------------------
bio.dat$Instructor_01=droplevels(factor(bio.dat$Instructor_01))

instructor.vec = sort(unique(bio.dat$Instructor_01))
num.instr = length(instructor.vec)

bio.dat$Instructor_01 = factor(bio.dat$Instructor_01, levels = instructor.vec, labels=as.character(1:num.instr))

key.instr.code = cbind(as.character(instructor.vec),
                       1:num.instr)

Subset Data for Bio 131 (physiology)

Subset Bio 131 dat and remove variables with too many missing values

physio <- subset(bio.dat, course=="BIO 131" & term.code>=2148)

physio <- subset(physio, course.seq==0)

Analyze missingness

missingness.rpt = profile_missing(physio)

missingness.rpt[order(missingness.rpt$pct_missing),]
##                         feature num_missing  pct_missing
## 1                        emplid           0 0.0000000000
## 2                     term.code           0 0.0000000000
## 3                          term           0 0.0000000000
## 11                      fys.flg           0 0.0000000000
## 18                   pledge.flg           0 0.0000000000
## 50                      AP_BIOL           0 0.0000000000
## 51                     AP_CALAB           0 0.0000000000
## 52                     AP_CALBC           0 0.0000000000
## 53                      AP_CHEM           0 0.0000000000
## 54                      AP_PHYS           0 0.0000000000
## 55                     AP_PHYS2           0 0.0000000000
## 56                     AP_PHYSB           0 0.0000000000
## 57                     AP_PHYSC           0 0.0000000000
## 58                     AP_PHYSM           0 0.0000000000
## 59                      AP_STAT           0 0.0000000000
## 60                  AP_BIOL.flg           0 0.0000000000
## 61                 AP_CALAB.flg           0 0.0000000000
## 62                 AP_CALBC.flg           0 0.0000000000
## 63                  AP_CHEM.flg           0 0.0000000000
## 64                  AP_PHYS.flg           0 0.0000000000
## 65                 AP_PHYS2.flg           0 0.0000000000
## 66                 AP_PHYSB.flg           0 0.0000000000
## 67                 AP_PHYSC.flg           0 0.0000000000
## 68                 AP_PHYSM.flg           0 0.0000000000
## 69                  AP_STAT.flg           0 0.0000000000
## 70                    addr.type           0 0.0000000000
## 73                      country           0 0.0000000000
## 74                median.income           0 0.0000000000
## 75              pct.female.head           0 0.0000000000
## 76          pct.female.head.flg           0 0.0000000000
## 77                  med.inc.flg           0 0.0000000000
## 78              coh.term.course           0 0.0000000000
## 79         coh.term.code.course           0 0.0000000000
## 80                career.course           0 0.0000000000
## 81             acad.prog.course           0 0.0000000000
## 82                       reason           0 0.0000000000
## 83                       course           0 0.0000000000
## 84                      section           0 0.0000000000
## 85                    component           0 0.0000000000
## 86                        grade           0 0.0000000000
## 87                   grd.pt.unt           0 0.0000000000
## 88                   grade.base           0 0.0000000000
## 89                        units           0 0.0000000000
## 90               course.numeric           0 0.0000000000
## 91                          div           0 0.0000000000
## 92                Instructor_01           0 0.0000000000
## 94                   course.seq           0 0.0000000000
## 95                      rpt.flg           0 0.0000000000
## 96                  uniq.course           0 0.0000000000
## 97                    uniq.sect           0 0.0000000000
## 98                         pass           0 0.0000000000
## 99                          pal           0 0.0000000000
## 100                         c2s           0 0.0000000000
## 102            base.time.course           0 0.0000000000
## 103                       years           0 0.0000000000
## 105                   load.code           0 0.0000000000
## 142              csus.gpa.start           0 0.0000000000
## 143         higher.ed.gpa.start           0 0.0000000000
## 157                    termAbbr           0 0.0000000000
## 158               pass.term.flg           0 0.0000000000
## 159                        palN           0 0.0000000000
## 160                       passN           0 0.0000000000
## 161                   treat.flg           0 0.0000000000
## 162                  course.age           0 0.0000000000
## 163                    stem.flg           0 0.0000000000
## 166             stem.degree.flg           0 0.0000000000
## 169                     currPAL           0 0.0000000000
## 170                    currPASS           0 0.0000000000
## 171                     prevPAL           0 0.0000000000
## 172                    prevPASS           0 0.0000000000
## 173          csus.gpa.start.flg           0 0.0000000000
## 174     higher.ed.gpa.start.flg           0 0.0000000000
## 6                        gender           2 0.0009708738
## 72                        state           3 0.0014563107
## 14            hous.coh.term.flg           5 0.0024271845
## 71                          zip           8 0.0038834951
## 35                     adm.area          10 0.0048543689
## 36                      veteran          10 0.0048543689
## 37                 hs.eng.units          10 0.0048543689
## 38                hs.math.units          10 0.0048543689
## 39                hs.phys.units          10 0.0048543689
## 40                 hs.bio.units          10 0.0048543689
## 41                 hs.lab.units          10 0.0048543689
## 42             hs.forLang.units          10 0.0048543689
## 43             hs.socstud.units          10 0.0048543689
## 44            hs.perfarts.units          10 0.0048543689
## 45            hs.elective.units          10 0.0048543689
## 46  ge.critical.thinking.status          10 0.0048543689
## 47       ge.english.comp.status          10 0.0048543689
## 48               ge.math.status          10 0.0048543689
## 49          ge.oral.comm.status          10 0.0048543689
## 19                        m.rmd          11 0.0053398058
## 20                 m.rmd.detail          11 0.0053398058
## 23                        e.rmd          11 0.0053398058
## 24                 e.rmd.detail          11 0.0053398058
## 25                      rmd.flg          11 0.0053398058
## 26                 rmd.subj.flg          11 0.0053398058
## 167                    pass.flg          16 0.0077669903
## 168                   pass.flgN          16 0.0077669903
## 104                  career.num          18 0.0087378641
## 106                   bot.level          18 0.0087378641
## 107               enroll.status          18 0.0087378641
## 108                    term.gpa          18 0.0087378641
## 109               higher.ed.gpa          18 0.0087378641
## 110                    csus.gpa          18 0.0087378641
## 111               withdraw_code          18 0.0087378641
## 113            unt_taken_prgrss          18 0.0087378641
## 114            unt_passd_prgrss          18 0.0087378641
## 115              tot_cumulative          18 0.0087378641
## 116          cur_resident_terms          18 0.0087378641
## 117          cum_resident_terms          18 0.0087378641
## 149                   acad.plan          18 0.0087378641
## 118                 censusMajor          21 0.0101941748
## 119                  campus.gpa          21 0.0101941748
## 120                   total.gpa          21 0.0101941748
## 121  term.units.attemptedCensus          21 0.0101941748
## 147                  acad.stand          22 0.0106796117
## 4                           coh          36 0.0174757282
## 5                      coh.term          36 0.0174757282
## 122               base.time.coh          36 0.0174757282
## 123                    enrl.flg          36 0.0174757282
## 124                 enrl.flgERS          36 0.0174757282
## 125                     pst.flg          36 0.0174757282
## 126                  pst.flgERS          36 0.0174757282
## 127                    grad.flg          36 0.0174757282
## 128                 grad.flgERS          36 0.0174757282
## 129                     rtn.flg          36 0.0174757282
## 130                  rtn.flgERS          36 0.0174757282
## 140                        ttgF          36 0.0174757282
## 141                    maxUnits          36 0.0174757282
## 146        tot_cumulative.start          36 0.0174757282
## 165              stem.final.flg          36 0.0174757282
## 154                  hs.grad.yr          37 0.0179611650
## 175               delay.from.hs          37 0.0179611650
## 148                  admis.plan          39 0.0189320388
## 13            pell.coh.term.flg          41 0.0199029126
## 164               stem.base.flg          41 0.0199029126
## 7                      eth.erss          62 0.0300970874
## 8                    urm.flgERS          62 0.0300970874
## 21                  m.rmd.admin          71 0.0344660194
## 22           m.rmd.admin.detail          71 0.0344660194
## 15                    mother.ed         112 0.0543689320
## 144      tot.passd.prgrss.start         160 0.0776699029
## 145      tot.taken.prgrss.start         160 0.0776699029
## 176    cum.percent.units.passed         160 0.0776699029
## 16                    father.ed         164 0.0796116505
## 34                   admit.term         225 0.1092233010
## 150                plan.college         495 0.2402912621
## 151           plan.college.desc         495 0.2402912621
## 152                   plan.dept         495 0.2402912621
## 153               plan.deptAbbr         495 0.2402912621
## 155                 plan.degree         495 0.2402912621
## 156                   plan.type         495 0.2402912621
## 131                   deg.plan1         680 0.3300970874
## 137                   grad.term         680 0.3300970874
## 139                         ttg         680 0.3300970874
## 138                grad.termERS         717 0.3480582524
## 32                       hs.gpa         943 0.4577669903
## 33                   trf.gpaADM        1004 0.4873786408
## 27               sat.math.score        1040 0.5048543689
## 28                 sat.math.flg        1040 0.5048543689
## 29             sat.verbal.score        1040 0.5048543689
## 30               sat.verbal.flg        1040 0.5048543689
## 31                sat.test.date        1040 0.5048543689
## 101               treat.section        1516 0.7359223301
## 9                 fys.term.code        1636 0.7941747573
## 10                      fys.grd        1636 0.7941747573
## 12                  fys.rpt.flg        1636 0.7941747573
## 17                  pledge.term        1875 0.9101941748
## 132                   deg.plan2        2036 0.9883495146
## 112             withdraw_reason        2056 0.9980582524
## 133                   deg.plan3        2058 0.9990291262
## 93                Instructor_02        2060 1.0000000000
## 134                   deg.plan4        2060 1.0000000000
## 135                   deg.plan5        2060 1.0000000000
## 136                   deg.plan6        2060 1.0000000000
sum(missingness.rpt$pct_missing>0.08)  # remove 30 variables with >8% missing values
## [1] 30
vars.missing.data = as.character(missingness.rpt[missingness.rpt$pct_missing>0.08, "feature"])


physio = physio[,!(names(physio) %in% vars.missing.data)]

Remove single-valued variables

single.category.vars = sapply(physio, function(x) length(unique(x))==1)  

sum(single.category.vars)#15 single category vars
## [1] 15
keep.vars = names(which(single.category.vars==FALSE))
physio = physio[, keep.vars]

sep.cols = id.vars.zero.freq(physio, treat.col= which(names(physio)=="palN"))

sum(sep.cols)  # 25 variables causing complete separation
## [1] 25
names(physio)[sep.cols]
##  [1] "coh.term"               "eth.erss"               "rmd.subj.flg"          
##  [4] "ge.english.comp.status" "AP_PHYS.flg"            "AP_PHYSC.flg"          
##  [7] "state"                  "acad.prog.course"       "grade"                 
## [10] "grade.base"             "pal"                    "enroll.status"         
## [13] "withdraw_code"          "censusMajor"            "enrl.flg"              
## [16] "enrl.flgERS"            "pst.flg"                "pst.flgERS"            
## [19] "rtn.flg"                "rtn.flgERS"             "ttgF"                  
## [22] "acad.stand"             "admis.plan"             "acad.plan"             
## [25] "treat.flg"
# collapse categories to keep censusMajor; use urm.flgERS instead of eth.erss, discard other vars

physio=  group_category(data = physio, feature = "censusMajor", threshold = 0.07, update = TRUE)

table(physio$censusMajor, physio$palN)
##                                                      
##                                                         0   1
##   Biology                                             485 230
##   Chemistry                                            52  19
##   Dietetics & Food Administration/Nutritional Science  92  31
##   Kinesiology/Physical Education                      722 217
##   OTHER                                               165  47

Filter to relevant variables

prevPAL left out, no UD prereq courses for Bio 131 so transfer students would not have a chance to take PAL

vars.to.keep = c("grd.pt.unt","palN","coh","urm.flgERS","fys.flg","pell.coh.term.flg","hous.coh.term.flg","mother.ed","father.ed","m.rmd","e.rmd","adm.area","addr.type","median.income","pct.female.head","Instructor_01","bot.level","censusMajor","term.units.attemptedCensus","base.time.coh","higher.ed.gpa.start","course.age","stem.flg","prevPASS","cum.percent.units.passed","gender", "load.code")

new.vars = intersect(vars.to.keep, names(physio))
physio = physio[ ,new.vars]

dim(physio)
## [1] 2060   27
sum(complete.cases(physio))  # 2060 to 1740
## [1] 1740
physio = physio[complete.cases(physio), ]

Check for complete separation again

sep.cols = id.vars.zero.freq(physio, treat.col= which(names(physio)=="palN"))

sum(sep.cols) 
## [1] 1
names(physio)[sep.cols]  # bot.level causing complete separation
## [1] "bot.level"
physio=  group_category(data = physio, feature = "bot.level", threshold = 0.01, update = TRUE)

table(physio$bot.level, physio$palN)
##         
##            0   1
##   OTHER  278 110
##   Senior 987 365

Propensity physiology

min.model4 = glm(palN ~cum.percent.units.passed+gender+urm.flgERS+higher.ed.gpa.start, data=physio, family=binomial)

#summary(min.model4)
biggest4 = formula(glm(palN ~.-grd.pt.unt, data=physio, family=binomial))

physio.step = step(min.model4, 
               direction="forward",scope = biggest4, k=2, trace=F) 

summary(physio.step)
## 
## Call:
## glm(formula = palN ~ cum.percent.units.passed + gender + urm.flgERS + 
##     higher.ed.gpa.start + coh + Instructor_01 + term.units.attemptedCensus + 
##     adm.area + addr.type + prevPASS + e.rmd, family = binomial, 
##     data = physio)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3553  -0.8305  -0.6497   1.1793   2.4142  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -3.49733    0.97126  -3.601 0.000317 ***
## cum.percent.units.passed     0.51092    1.08310   0.472 0.637127    
## genderMale                  -0.22663    0.11504  -1.970 0.048850 *  
## urm.flgERSNon-URM           -0.89482    0.39283  -2.278 0.022733 *  
## urm.flgERSTwo or More Races -1.04353    0.44057  -2.369 0.017855 *  
## urm.flgERSUnknown           -1.38454    0.49931  -2.773 0.005556 ** 
## urm.flgERSURM               -0.64744    0.39778  -1.628 0.103603    
## higher.ed.gpa.start          0.20885    0.19387   1.077 0.281352    
## cohTransfers                -0.25811    0.13928  -1.853 0.063851 .  
## Instructor_019               0.81635    0.39040   2.091 0.036523 *  
## Instructor_0118              1.38220    0.27951   4.945 7.61e-07 ***
## Instructor_0123              1.09108    0.25031   4.359 1.31e-05 ***
## Instructor_0126              0.68791    0.26622   2.584 0.009767 ** 
## Instructor_0131              1.00026    0.26352   3.796 0.000147 ***
## term.units.attemptedCensus   0.08980    0.02231   4.025 5.70e-05 ***
## adm.areanonlocal            -0.18265    0.13127  -1.391 0.164110    
## adm.areaunknown              0.46136    0.18656   2.473 0.013402 *  
## addr.typePERM                0.29377    0.15172   1.936 0.052827 .  
## prevPASS                     0.19025    0.09916   1.919 0.055044 .  
## e.rmdRemedial in English     0.26207    0.17445   1.502 0.133028    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2040.0  on 1739  degrees of freedom
## Residual deviance: 1917.3  on 1720  degrees of freedom
## AIC: 1957.3
## 
## Number of Fisher Scoring iterations: 4
physio.step$anova
##                           Step Df  Deviance Resid. Df Resid. Dev      AIC
## 1                              NA        NA      1732   2020.045 2036.045
## 2                        + coh -1 30.532896      1731   1989.512 2007.512
## 3              + Instructor_01 -5 34.220770      1726   1955.291 1983.291
## 4 + term.units.attemptedCensus -1 17.234388      1725   1938.057 1968.057
## 5                   + adm.area -2 11.249234      1723   1926.808 1960.808
## 6                  + addr.type -1  3.838403      1722   1922.969 1958.969
## 7                   + prevPASS -1  3.389816      1721   1919.580 1957.580
## 8                      + e.rmd -1  2.239564      1720   1917.340 1957.340
model.physio=formula(physio.step)


# Calc propensity scores 

physio.propensity.model = glm(model.physio, data=physio, family = binomial)


summary(physio.propensity.model)
## 
## Call:
## glm(formula = model.physio, family = binomial, data = physio)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3553  -0.8305  -0.6497   1.1793   2.4142  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -3.49733    0.97126  -3.601 0.000317 ***
## cum.percent.units.passed     0.51092    1.08310   0.472 0.637127    
## genderMale                  -0.22663    0.11504  -1.970 0.048850 *  
## urm.flgERSNon-URM           -0.89482    0.39283  -2.278 0.022733 *  
## urm.flgERSTwo or More Races -1.04353    0.44057  -2.369 0.017855 *  
## urm.flgERSUnknown           -1.38454    0.49931  -2.773 0.005556 ** 
## urm.flgERSURM               -0.64744    0.39778  -1.628 0.103603    
## higher.ed.gpa.start          0.20885    0.19387   1.077 0.281352    
## cohTransfers                -0.25811    0.13928  -1.853 0.063851 .  
## Instructor_019               0.81635    0.39040   2.091 0.036523 *  
## Instructor_0118              1.38220    0.27951   4.945 7.61e-07 ***
## Instructor_0123              1.09108    0.25031   4.359 1.31e-05 ***
## Instructor_0126              0.68791    0.26622   2.584 0.009767 ** 
## Instructor_0131              1.00026    0.26352   3.796 0.000147 ***
## term.units.attemptedCensus   0.08980    0.02231   4.025 5.70e-05 ***
## adm.areanonlocal            -0.18265    0.13127  -1.391 0.164110    
## adm.areaunknown              0.46136    0.18656   2.473 0.013402 *  
## addr.typePERM                0.29377    0.15172   1.936 0.052827 .  
## prevPASS                     0.19025    0.09916   1.919 0.055044 .  
## e.rmdRemedial in English     0.26207    0.17445   1.502 0.133028    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2040.0  on 1739  degrees of freedom
## Residual deviance: 1917.3  on 1720  degrees of freedom
## AIC: 1957.3
## 
## Number of Fisher Scoring iterations: 4
physio$propensity=physio.propensity.model$fitted.values

Matching and balance check

match.physio = Match(Y=physio$grd.pt.unt, Tr=physio$palN, X=physio$propensity, BiasAdjust = F, estimand = "ATT", M=1, caliper=0.25)

summary(match.physio)
## 
## Estimate...  0.34873 
## AI SE......  0.070823 
## T-stat.....  4.9239 
## p.val......  8.4817e-07 
## 
## Original number of observations..............  1740 
## Original number of treated obs...............  475 
## Matched number of observations...............  469 
## Matched number of observations  (unweighted).  1200 
## 
## Caliper (SDs)........................................   0.25 
## Number of obs dropped by 'exact' or 'caliper'  6
num.matched.physio = length(match.physio$weights)
mean.gpa.PAL.physio = with(match.physio, 
                           sum(weights*mdata$Y[1:num.matched.physio])/sum(weights))

mean.gpa.nonPAL.physio=with(match.physio,
                            sum(weights*mdata$Y[(num.matched.physio+1):(2*num.matched.physio)])/sum(weights))

mean.gpa.PAL.physio-mean.gpa.nonPAL.physio
## [1] 0.34873
psens(match.physio, Gamma=2, GammaInc=0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0000
##    1.2           0      0.0000
##    1.3           0      0.0002
##    1.4           0      0.0055
##    1.5           0      0.0529
##    1.6           0      0.2258
##    1.7           0      0.5231
##    1.8           0      0.7946
##    1.9           0      0.9390
##    2.0           0      0.9873
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 
sort(round(exp(physio.propensity.model$coefficients),2))
##                 (Intercept)           urm.flgERSUnknown 
##                        0.03                        0.25 
## urm.flgERSTwo or More Races           urm.flgERSNon-URM 
##                        0.35                        0.41 
##               urm.flgERSURM                cohTransfers 
##                        0.52                        0.77 
##                  genderMale            adm.areanonlocal 
##                        0.80                        0.83 
##  term.units.attemptedCensus                    prevPASS 
##                        1.09                        1.21 
##         higher.ed.gpa.start    e.rmdRemedial in English 
##                        1.23                        1.30 
##               addr.typePERM             adm.areaunknown 
##                        1.34                        1.59 
##    cum.percent.units.passed             Instructor_0126 
##                        1.67                        1.99 
##              Instructor_019             Instructor_0131 
##                        2.26                        2.72 
##             Instructor_0123             Instructor_0118 
##                        2.98                        3.98
physio.bal = bal.tab(match.physio, physio.propensity.model$formula, data=physio, un=TRUE, quick=FALSE)

physio.bal
## Balance Measures
##                                 Type Diff.Un Diff.Adj
## cum.percent.units.passed     Contin.  0.0384   0.0321
## gender_Male                   Binary -0.0695  -0.0044
## urm.flgERS_Foreign            Binary  0.0110  -0.0061
## urm.flgERS_Non-URM            Binary -0.0260   0.0143
## urm.flgERS_Two or More Races  Binary -0.0098   0.0062
## urm.flgERS_Unknown            Binary -0.0185  -0.0024
## urm.flgERS_URM                Binary  0.0433  -0.0120
## higher.ed.gpa.start          Contin.  0.0997   0.0917
## coh_Transfers                 Binary -0.1566  -0.0120
## Instructor_01_1               Binary -0.0754  -0.0015
## Instructor_01_9               Binary -0.0045   0.0016
## Instructor_01_18              Binary  0.0341   0.0032
## Instructor_01_23              Binary  0.0662  -0.0229
## Instructor_01_26              Binary -0.0453  -0.0017
## Instructor_01_31              Binary  0.0250   0.0213
## term.units.attemptedCensus   Contin.  0.3264  -0.0748
## adm.area_local                Binary -0.0358  -0.0126
## adm.area_nonlocal             Binary -0.0559  -0.0146
## adm.area_unknown              Binary  0.0917   0.0272
## addr.type_PERM                Binary  0.0224  -0.0187
## prevPASS                     Contin.  0.1508  -0.0102
## e.rmd_Remedial in English     Binary  0.0751   0.0098
## 
## Sample sizes
##                      Control Treated
## All                  1265.       475
## Matched (ESS)         396.32     469
## Matched (Unweighted)  751.       469
## Unmatched             514.         0
## Discarded               0.         6
love.plot(physio.bal, binary = "raw", stars = "std", var.order = "unadjusted", 
          thresholds = c(m = .1), abs = F)

Summary table

bio.list =  list( bio131=match.physio)

mean.grd.PAL = c( mean.gpa.PAL.physio)
mean.grd.nonPAL = c( mean.gpa.nonPAL.physio)
est.diff = sapply(bio.list, '[[', 1)
est.se = sapply(bio.list, '[[', 2)
test.stat = est.diff/est.se
num.cntrl = sapply(bio.list,  function(x) length(unique(x[["index.control"]])))
num.trt= sapply(bio.list, function(x) sum(x[["weights"]]))
approxPVal = (1- pnorm(test.stat))  # one-tailed test to see if PAL improves grades

results = data.frame(course=names(bio.list), mean.PAL=mean.grd.PAL, mean.nonPAL=mean.grd.nonPAL, difference.mean.gpa=round(est.diff,3), standard.error=round(est.se,3),p.val=round(approxPVal,4),number.of.nonPAL=num.cntrl, number.PAL=num.trt, row.names=NULL)

kable(results)  
course mean.PAL mean.nonPAL difference.mean.gpa standard.error p.val number.of.nonPAL number.PAL
bio131 2.724307 2.375577 0.349 0.071 0 751 469

Sensitivity Analysis Bio131

 psens(match.physio, Gamma=2, GammaInc=0.1)
## 
##  Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value 
##  
## Unconfounded estimate ....  0 
## 
##  Gamma Lower bound Upper bound
##    1.0           0      0.0000
##    1.1           0      0.0000
##    1.2           0      0.0000
##    1.3           0      0.0002
##    1.4           0      0.0055
##    1.5           0      0.0529
##    1.6           0      0.2258
##    1.7           0      0.5231
##    1.8           0      0.7946
##    1.9           0      0.9390
##    2.0           0      0.9873
## 
##  Note: Gamma is Odds of Differential Assignment To
##  Treatment Due to Unobserved Factors 
## 
  print(sort(exp(abs(physio.propensity.model$coefficients))))
##  term.units.attemptedCensus            adm.areanonlocal 
##                    1.093952                    1.200391 
##                    prevPASS         higher.ed.gpa.start 
##                    1.209547                    1.232262 
##                  genderMale                cohTransfers 
##                    1.254361                    1.294482 
##    e.rmdRemedial in English               addr.typePERM 
##                    1.299616                    1.341482 
##             adm.areaunknown    cum.percent.units.passed 
##                    1.586227                    1.666822 
##               urm.flgERSURM             Instructor_0126 
##                    1.910641                    1.989549 
##              Instructor_019           urm.flgERSNon-URM 
##                    2.262219                    2.446899 
##             Instructor_0131 urm.flgERSTwo or More Races 
##                    2.718977                    2.839227 
##             Instructor_0123             Instructor_0118 
##                    2.977481                    3.983663 
##           urm.flgERSUnknown                 (Intercept) 
##                    3.992980                   33.027218