First Add Some R Packages to the Workspace
Caution: warning messages are suppressed to reduce clutter in the output.
defaultW <- getOption("warn")
options(warn = -1)
library(rbounds)
library(readr)
library(tidyr)
library(tidyverse)
library(MatchIt)
library(cobalt)
library(backports)
library(DataExplorer)
library(readxl)
library(Matching)
options(warn = defaultW)
The sourced script contains a user-defined function, id.vars.zero.freq, which identifies variables that cause complete separation in logistic regression.
source("IdentifyCompleteSepVars2.R")
Make sure the PAL datafile is in the same directory as this RMarkdown file. The file, which includes data through the Spring 2019 semester, has 1,099,371 rows and 174 columns. The total of the grd.pt.unt column is 2,237,555.
load("~/PAL data analysis/PALdatafull.Rdata")
dim(PALdatafull)
## [1] 1099371 174
sum(PALdatafull$grd.pt.unt)
## [1] 2237555
Each instance of a course taken by a student is included in the data file for the semester during which it was taken and for every semester thereafter. The filter 'test1' below restricts the dataset to only the semester during which each course was taken.
bio.classes <- paste("BIO", c(22, 121, 131, 139, 184))
# test1 keeps only the semester in which the course was actually taken; each course appears in the data file for that semester and every semester thereafter
test1 <- PALdatafull$base.time.course==1
test.bio <- PALdatafull$course %in% bio.classes
bio.indx <- test1 & test.bio
bio.dat <- PALdatafull[bio.indx, ]
dim(bio.dat)
## [1] 19652 174
The course.seq variable indicates how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.
raw.table = data.frame(class=character(),
nonPALavg=numeric(),
PALavg=numeric(),
Diff=numeric(),
NonPAL_Num= integer(),
PAL_Num=integer(),
CompletePAL=numeric(),
TermPALStart=integer(),
row.names=NULL,
stringsAsFactors = FALSE)
eth.tables = list()
for (i in 1:length(bio.classes))
{
curr.class = bio.classes[i]
temp = subset(bio.dat, course==curr.class & course.seq==0)
pal.start=min(unique(temp$term.code[temp$palN==2]))
# only include terms after PAL start term
temp = subset(temp, term.code>= pal.start)
x=tapply(temp$grd.pt.unt,temp$palN,
mean, na.rm=T) %>%
as.numeric %>%
round(2)
y=table(temp$palN) %>% as.numeric
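# palN coding: 0 = non-PAL, 1 = enrolled in PAL but did not complete, 2 = completed PAL,
# so x[1]/y[1] refer to non-PAL students and x[3]/y[3] to PAL completers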
raw.table[i, 'class' ] = curr.class
raw.table[i, c(2:4,7)]=c(x[1], x[3],x[3]-x[1],
round(y[3]/sum(y),2))
raw.table[i, c(5,6,8)]= c(y[1], y[3], pal.start)
#participation by ethnicity
eth.tables[[i]] <- with(temp, table(eth.erss, palN)[,-c(2)]) #remove a few students who started PAL but did not finish
colnames(eth.tables[[i]])<- c("nonPAL","PAL")
}
names(eth.tables) <- bio.classes
#counts
eth.tables
## $`BIO 22`
## palN
## eth.erss nonPAL PAL
## African American 47 23
## Asian 302 110
## Foreign 15 7
## Hispanic 253 154
## Native American 1 0
## Pacific Islander 5 5
## Two or More Races 62 22
## Unknown 33 15
## White 207 75
##
## $`BIO 121`
## palN
## eth.erss nonPAL PAL
## African American 28 24
## Asian 287 110
## Foreign 17 12
## Hispanic 181 96
## Native American 1 0
## Pacific Islander 5 7
## Two or More Races 68 16
## Unknown 44 15
## White 222 96
##
## $`BIO 131`
## palN
## eth.erss nonPAL PAL
## African American 60 15
## Asian 438 163
## Foreign 19 13
## Hispanic 346 152
## Native American 4 0
## Pacific Islander 6 3
## Two or More Races 105 33
## Unknown 70 20
## White 418 133
##
## $`BIO 139`
## palN
## eth.erss nonPAL PAL
## African American 16 2
## Asian 105 31
## Foreign 6 4
## Hispanic 66 25
## Native American 0 0
## Pacific Islander 1 0
## Two or More Races 11 9
## Unknown 15 6
## White 77 21
##
## $`BIO 184`
## palN
## eth.erss nonPAL PAL
## African American 24 6
## Asian 147 33
## Foreign 17 4
## Hispanic 114 55
## Native American 0 0
## Pacific Islander 1 0
## Two or More Races 30 8
## Unknown 22 10
## White 146 33
# each column is 100%
# percent.eth.within.PAL.and.within.non = lapply(eth.tables, prop.table, margin=2) %>% lapply(round, digits=2)
# percent.eth.within.PAL.and.within.non
# each row is 100%
percent.PAL.and.non.within.ethnicity = lapply(eth.tables, prop.table, margin=1) %>% lapply(round, digits=2)
percent.PAL.and.non.within.ethnicity
## $`BIO 22`
## palN
## eth.erss nonPAL PAL
## African American 0.67 0.33
## Asian 0.73 0.27
## Foreign 0.68 0.32
## Hispanic 0.62 0.38
## Native American 1.00 0.00
## Pacific Islander 0.50 0.50
## Two or More Races 0.74 0.26
## Unknown 0.69 0.31
## White 0.73 0.27
##
## $`BIO 121`
## palN
## eth.erss nonPAL PAL
## African American 0.54 0.46
## Asian 0.72 0.28
## Foreign 0.59 0.41
## Hispanic 0.65 0.35
## Native American 1.00 0.00
## Pacific Islander 0.42 0.58
## Two or More Races 0.81 0.19
## Unknown 0.75 0.25
## White 0.70 0.30
##
## $`BIO 131`
## palN
## eth.erss nonPAL PAL
## African American 0.80 0.20
## Asian 0.73 0.27
## Foreign 0.59 0.41
## Hispanic 0.69 0.31
## Native American 1.00 0.00
## Pacific Islander 0.67 0.33
## Two or More Races 0.76 0.24
## Unknown 0.78 0.22
## White 0.76 0.24
##
## $`BIO 139`
## palN
## eth.erss nonPAL PAL
## African American 0.89 0.11
## Asian 0.77 0.23
## Foreign 0.60 0.40
## Hispanic 0.73 0.27
## Native American
## Pacific Islander 1.00 0.00
## Two or More Races 0.55 0.45
## Unknown 0.71 0.29
## White 0.79 0.21
##
## $`BIO 184`
## palN
## eth.erss nonPAL PAL
## African American 0.80 0.20
## Asian 0.82 0.18
## Foreign 0.81 0.19
## Hispanic 0.67 0.33
## Native American
## Pacific Islander 1.00 0.00
## Two or More Races 0.79 0.21
## Unknown 0.69 0.31
## White 0.82 0.18
# formatted table
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
kable(raw.table, caption = "Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)")
class | nonPALavg | PALavg | Diff | NonPAL_Num | PAL_Num | CompletePAL | TermPALStart |
---|---|---|---|---|---|---|---|
BIO 22 | 1.81 | 2.33 | 0.52 | 949 | 413 | 0.29 | 2158 |
BIO 121 | 2.18 | 2.39 | 0.21 | 888 | 390 | 0.30 | 2148 |
BIO 131 | 2.33 | 2.76 | 0.43 | 1516 | 544 | 0.26 | 2148 |
BIO 139 | 2.41 | 2.61 | 0.20 | 308 | 98 | 0.24 | 2178 |
BIO 184 | 2.51 | 2.73 | 0.22 | 509 | 149 | 0.22 | 2178 |
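# extract the trailing four-digit year from coh.term, then compute the delay (in years)
# between high-school graduation and taking the course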
yr.course.taken = as.numeric(gsub(".*([0-9]{4})","\\1",bio.dat$coh.term))
bio.dat$delay.from.hs = ifelse(!is.na(yr.course.taken) & !is.na(bio.dat$hs.grad.yr), yr.course.taken-bio.dat$hs.grad.yr, NA)
sum(is.na(bio.dat$delay.from.hs)) #1864 missing values
## [1] 1864
# remove 106 students who did not complete PAL
table(bio.dat$palN)
##
## 0 1 2
## 17778 106 1768
bio.dat=subset(bio.dat, palN!=1)
table(bio.dat$palN)
##
## 0 2
## 17778 1768
# recode palN from 0/2 to a 0/1 indicator (1 = completed PAL)
bio.dat$palN = ifelse(bio.dat$palN==2, 1, 0)
#clean up category names in m.rmd and e.rmd
bio.dat$m.rmd[bio.dat$m.rmd=="Not Remedial\nin Math"]="Not Remedial in Math"
bio.dat$m.rmd[bio.dat$m.rmd=="Remedial\nin Math"]="Remedial in Math"
bio.dat$m.rmd = droplevels(bio.dat$m.rmd)
bio.dat$e.rmd[bio.dat$e.rmd=="Not Remedial\nin English"]="Not Remedial in English"
bio.dat$e.rmd[bio.dat$e.rmd=="Remedial\nin English"]="Remedial in English"
bio.dat$e.rmd = droplevels(bio.dat$e.rmd)
table(bio.dat$e.rmd)
##
## Not Remedial in English Remedial in English
## 15474 3704
# create feature, proportion of cumulative units taken that were passes
bio.dat$cum.percent.units.passed = bio.dat$tot.passd.prgrss.start/bio.dat$tot.taken.prgrss.start
# collapse sparse categories: group_category replaces infrequent load.code levels with "OTHER"
bio.dat=group_category(data = bio.dat, feature = "load.code", threshold = 0.05, update = TRUE)
table(bio.dat$load.code, bio.dat$palN, droplevels(bio.dat$course))
## , , = BIO 121
##
##
## 0 1
## F 2104 383
## H 428 13
## OTHER 308 61
##
## , , = BIO 131
##
##
## 0 1
## F 3127 491
## H 486 19
## OTHER 462 68
##
## , , = BIO 139
##
##
## 0 1
## F 2315 87
## H 402 1
## OTHER 340 15
##
## , , = BIO 184
##
##
## 0 1
## F 2751 144
## H 417 5
## OTHER 339 6
##
## , , = BIO 22
##
##
## 0 1
## F 3332 430
## H 496 9
## OTHER 471 36
bio.dat=group_category(data = bio.dat, feature = "addr.type", threshold = 0.05, update = TRUE)
table(bio.dat$addr.type, bio.dat$palN, droplevels(bio.dat$course))
## , , = BIO 121
##
##
## 0 1
## OTHER 1263 101
## PERM 1577 356
##
## , , = BIO 131
##
##
## 0 1
## OTHER 1608 103
## PERM 2467 475
##
## , , = BIO 139
##
##
## 0 1
## OTHER 1530 19
## PERM 1527 84
##
## , , = BIO 184
##
##
## 0 1
## OTHER 1411 20
## PERM 2096 135
##
## , , = BIO 22
##
##
## 0 1
## OTHER 1627 70
## PERM 2672 405
# code instructor as number for anonymity ----------------------------------
bio.dat$Instructor_01=droplevels(factor(bio.dat$Instructor_01))
instructor.vec = sort(unique(bio.dat$Instructor_01))
num.instr = length(instructor.vec)
bio.dat$Instructor_01 = factor(bio.dat$Instructor_01, levels = instructor.vec, labels=as.character(1:num.instr))
key.instr.code = cbind(as.character(instructor.vec),
1:num.instr)
Subset BIO 131 data and remove variables with too many missing values
physio <- subset(bio.dat, course=="BIO 131" & term.code>=2148)
physio <- subset(physio, course.seq==0)
missingness.rpt = profile_missing(physio)
missingness.rpt[order(missingness.rpt$pct_missing),]
## feature num_missing pct_missing
## 1 emplid 0 0.0000000000
## 2 term.code 0 0.0000000000
## 3 term 0 0.0000000000
## 11 fys.flg 0 0.0000000000
## 18 pledge.flg 0 0.0000000000
## 50 AP_BIOL 0 0.0000000000
## 51 AP_CALAB 0 0.0000000000
## 52 AP_CALBC 0 0.0000000000
## 53 AP_CHEM 0 0.0000000000
## 54 AP_PHYS 0 0.0000000000
## 55 AP_PHYS2 0 0.0000000000
## 56 AP_PHYSB 0 0.0000000000
## 57 AP_PHYSC 0 0.0000000000
## 58 AP_PHYSM 0 0.0000000000
## 59 AP_STAT 0 0.0000000000
## 60 AP_BIOL.flg 0 0.0000000000
## 61 AP_CALAB.flg 0 0.0000000000
## 62 AP_CALBC.flg 0 0.0000000000
## 63 AP_CHEM.flg 0 0.0000000000
## 64 AP_PHYS.flg 0 0.0000000000
## 65 AP_PHYS2.flg 0 0.0000000000
## 66 AP_PHYSB.flg 0 0.0000000000
## 67 AP_PHYSC.flg 0 0.0000000000
## 68 AP_PHYSM.flg 0 0.0000000000
## 69 AP_STAT.flg 0 0.0000000000
## 70 addr.type 0 0.0000000000
## 73 country 0 0.0000000000
## 74 median.income 0 0.0000000000
## 75 pct.female.head 0 0.0000000000
## 76 pct.female.head.flg 0 0.0000000000
## 77 med.inc.flg 0 0.0000000000
## 78 coh.term.course 0 0.0000000000
## 79 coh.term.code.course 0 0.0000000000
## 80 career.course 0 0.0000000000
## 81 acad.prog.course 0 0.0000000000
## 82 reason 0 0.0000000000
## 83 course 0 0.0000000000
## 84 section 0 0.0000000000
## 85 component 0 0.0000000000
## 86 grade 0 0.0000000000
## 87 grd.pt.unt 0 0.0000000000
## 88 grade.base 0 0.0000000000
## 89 units 0 0.0000000000
## 90 course.numeric 0 0.0000000000
## 91 div 0 0.0000000000
## 92 Instructor_01 0 0.0000000000
## 94 course.seq 0 0.0000000000
## 95 rpt.flg 0 0.0000000000
## 96 uniq.course 0 0.0000000000
## 97 uniq.sect 0 0.0000000000
## 98 pass 0 0.0000000000
## 99 pal 0 0.0000000000
## 100 c2s 0 0.0000000000
## 102 base.time.course 0 0.0000000000
## 103 years 0 0.0000000000
## 105 load.code 0 0.0000000000
## 142 csus.gpa.start 0 0.0000000000
## 143 higher.ed.gpa.start 0 0.0000000000
## 157 termAbbr 0 0.0000000000
## 158 pass.term.flg 0 0.0000000000
## 159 palN 0 0.0000000000
## 160 passN 0 0.0000000000
## 161 treat.flg 0 0.0000000000
## 162 course.age 0 0.0000000000
## 163 stem.flg 0 0.0000000000
## 166 stem.degree.flg 0 0.0000000000
## 169 currPAL 0 0.0000000000
## 170 currPASS 0 0.0000000000
## 171 prevPAL 0 0.0000000000
## 172 prevPASS 0 0.0000000000
## 173 csus.gpa.start.flg 0 0.0000000000
## 174 higher.ed.gpa.start.flg 0 0.0000000000
## 6 gender 2 0.0009708738
## 72 state 3 0.0014563107
## 14 hous.coh.term.flg 5 0.0024271845
## 71 zip 8 0.0038834951
## 35 adm.area 10 0.0048543689
## 36 veteran 10 0.0048543689
## 37 hs.eng.units 10 0.0048543689
## 38 hs.math.units 10 0.0048543689
## 39 hs.phys.units 10 0.0048543689
## 40 hs.bio.units 10 0.0048543689
## 41 hs.lab.units 10 0.0048543689
## 42 hs.forLang.units 10 0.0048543689
## 43 hs.socstud.units 10 0.0048543689
## 44 hs.perfarts.units 10 0.0048543689
## 45 hs.elective.units 10 0.0048543689
## 46 ge.critical.thinking.status 10 0.0048543689
## 47 ge.english.comp.status 10 0.0048543689
## 48 ge.math.status 10 0.0048543689
## 49 ge.oral.comm.status 10 0.0048543689
## 19 m.rmd 11 0.0053398058
## 20 m.rmd.detail 11 0.0053398058
## 23 e.rmd 11 0.0053398058
## 24 e.rmd.detail 11 0.0053398058
## 25 rmd.flg 11 0.0053398058
## 26 rmd.subj.flg 11 0.0053398058
## 167 pass.flg 16 0.0077669903
## 168 pass.flgN 16 0.0077669903
## 104 career.num 18 0.0087378641
## 106 bot.level 18 0.0087378641
## 107 enroll.status 18 0.0087378641
## 108 term.gpa 18 0.0087378641
## 109 higher.ed.gpa 18 0.0087378641
## 110 csus.gpa 18 0.0087378641
## 111 withdraw_code 18 0.0087378641
## 113 unt_taken_prgrss 18 0.0087378641
## 114 unt_passd_prgrss 18 0.0087378641
## 115 tot_cumulative 18 0.0087378641
## 116 cur_resident_terms 18 0.0087378641
## 117 cum_resident_terms 18 0.0087378641
## 149 acad.plan 18 0.0087378641
## 118 censusMajor 21 0.0101941748
## 119 campus.gpa 21 0.0101941748
## 120 total.gpa 21 0.0101941748
## 121 term.units.attemptedCensus 21 0.0101941748
## 147 acad.stand 22 0.0106796117
## 4 coh 36 0.0174757282
## 5 coh.term 36 0.0174757282
## 122 base.time.coh 36 0.0174757282
## 123 enrl.flg 36 0.0174757282
## 124 enrl.flgERS 36 0.0174757282
## 125 pst.flg 36 0.0174757282
## 126 pst.flgERS 36 0.0174757282
## 127 grad.flg 36 0.0174757282
## 128 grad.flgERS 36 0.0174757282
## 129 rtn.flg 36 0.0174757282
## 130 rtn.flgERS 36 0.0174757282
## 140 ttgF 36 0.0174757282
## 141 maxUnits 36 0.0174757282
## 146 tot_cumulative.start 36 0.0174757282
## 165 stem.final.flg 36 0.0174757282
## 154 hs.grad.yr 37 0.0179611650
## 175 delay.from.hs 37 0.0179611650
## 148 admis.plan 39 0.0189320388
## 13 pell.coh.term.flg 41 0.0199029126
## 164 stem.base.flg 41 0.0199029126
## 7 eth.erss 62 0.0300970874
## 8 urm.flgERS 62 0.0300970874
## 21 m.rmd.admin 71 0.0344660194
## 22 m.rmd.admin.detail 71 0.0344660194
## 15 mother.ed 112 0.0543689320
## 144 tot.passd.prgrss.start 160 0.0776699029
## 145 tot.taken.prgrss.start 160 0.0776699029
## 176 cum.percent.units.passed 160 0.0776699029
## 16 father.ed 164 0.0796116505
## 34 admit.term 225 0.1092233010
## 150 plan.college 495 0.2402912621
## 151 plan.college.desc 495 0.2402912621
## 152 plan.dept 495 0.2402912621
## 153 plan.deptAbbr 495 0.2402912621
## 155 plan.degree 495 0.2402912621
## 156 plan.type 495 0.2402912621
## 131 deg.plan1 680 0.3300970874
## 137 grad.term 680 0.3300970874
## 139 ttg 680 0.3300970874
## 138 grad.termERS 717 0.3480582524
## 32 hs.gpa 943 0.4577669903
## 33 trf.gpaADM 1004 0.4873786408
## 27 sat.math.score 1040 0.5048543689
## 28 sat.math.flg 1040 0.5048543689
## 29 sat.verbal.score 1040 0.5048543689
## 30 sat.verbal.flg 1040 0.5048543689
## 31 sat.test.date 1040 0.5048543689
## 101 treat.section 1516 0.7359223301
## 9 fys.term.code 1636 0.7941747573
## 10 fys.grd 1636 0.7941747573
## 12 fys.rpt.flg 1636 0.7941747573
## 17 pledge.term 1875 0.9101941748
## 132 deg.plan2 2036 0.9883495146
## 112 withdraw_reason 2056 0.9980582524
## 133 deg.plan3 2058 0.9990291262
## 93 Instructor_02 2060 1.0000000000
## 134 deg.plan4 2060 1.0000000000
## 135 deg.plan5 2060 1.0000000000
## 136 deg.plan6 2060 1.0000000000
sum(missingness.rpt$pct_missing>0.08) # remove 30 variables with >8% missing values
## [1] 30
vars.missing.data = as.character(missingness.rpt[missingness.rpt$pct_missing>0.08, "feature"])
physio = physio[,!(names(physio) %in% vars.missing.data)]
single.category.vars = sapply(physio, function(x) length(unique(x))==1)
sum(single.category.vars)#15 single category vars
## [1] 15
keep.vars = names(which(single.category.vars==FALSE))
physio = physio[, keep.vars]
sep.cols = id.vars.zero.freq(physio, treat.col= which(names(physio)=="palN"))
sum(sep.cols) # 25 variables causing complete separation
## [1] 25
names(physio)[sep.cols]
## [1] "coh.term" "eth.erss" "rmd.subj.flg"
## [4] "ge.english.comp.status" "AP_PHYS.flg" "AP_PHYSC.flg"
## [7] "state" "acad.prog.course" "grade"
## [10] "grade.base" "pal" "enroll.status"
## [13] "withdraw_code" "censusMajor" "enrl.flg"
## [16] "enrl.flgERS" "pst.flg" "pst.flgERS"
## [19] "rtn.flg" "rtn.flgERS" "ttgF"
## [22] "acad.stand" "admis.plan" "acad.plan"
## [25] "treat.flg"
# collapse categories to keep censusMajor; use urm.flgERS instead of eth.erss, discard other vars
physio= group_category(data = physio, feature = "censusMajor", threshold = 0.07, update = TRUE)
table(physio$censusMajor, physio$palN)
##
## 0 1
## Biology 485 230
## Chemistry 52 19
## Dietetics & Food Administration/Nutritional Science 92 31
## Kinesiology/Physical Education 722 217
## OTHER 165 47
prevPAL is left out: BIO 131 has no upper-division (UD) prerequisite courses, so transfer students would not have had a chance to take PAL previously.
vars.to.keep = c("grd.pt.unt","palN","coh","urm.flgERS","fys.flg","pell.coh.term.flg","hous.coh.term.flg","mother.ed","father.ed","m.rmd","e.rmd","adm.area","addr.type","median.income","pct.female.head","Instructor_01","bot.level","censusMajor","term.units.attemptedCensus","base.time.coh","higher.ed.gpa.start","course.age","stem.flg","prevPASS","cum.percent.units.passed","gender", "load.code")
new.vars = intersect(vars.to.keep, names(physio))
physio = physio[ ,new.vars]
dim(physio)
## [1] 2060 27
sum(complete.cases(physio)) # 2060 to 1740
## [1] 1740
physio = physio[complete.cases(physio), ]
sep.cols = id.vars.zero.freq(physio, treat.col= which(names(physio)=="palN"))
sum(sep.cols)
## [1] 1
names(physio)[sep.cols] # bot.level causing complete separation
## [1] "bot.level"
physio= group_category(data = physio, feature = "bot.level", threshold = 0.01, update = TRUE)
table(physio$bot.level, physio$palN)
##
## 0 1
## OTHER 278 110
## Senior 987 365
min.model4 = glm(palN ~cum.percent.units.passed+gender+urm.flgERS+higher.ed.gpa.start, data=physio, family=binomial)
#summary(min.model4)
biggest4 = formula(glm(palN ~.-grd.pt.unt, data=physio, family=binomial))
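# forward stepwise selection from the minimal model toward the full model (all covariates
# except the outcome grd.pt.unt); k = 2 gives the usual AIC penalty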
physio.step = step(min.model4,
direction="forward",scope = biggest4, k=2, trace=F)
summary(physio.step)
##
## Call:
## glm(formula = palN ~ cum.percent.units.passed + gender + urm.flgERS +
## higher.ed.gpa.start + coh + Instructor_01 + term.units.attemptedCensus +
## adm.area + addr.type + prevPASS + e.rmd, family = binomial,
## data = physio)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3553 -0.8305 -0.6497 1.1793 2.4142
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.49733 0.97126 -3.601 0.000317 ***
## cum.percent.units.passed 0.51092 1.08310 0.472 0.637127
## genderMale -0.22663 0.11504 -1.970 0.048850 *
## urm.flgERSNon-URM -0.89482 0.39283 -2.278 0.022733 *
## urm.flgERSTwo or More Races -1.04353 0.44057 -2.369 0.017855 *
## urm.flgERSUnknown -1.38454 0.49931 -2.773 0.005556 **
## urm.flgERSURM -0.64744 0.39778 -1.628 0.103603
## higher.ed.gpa.start 0.20885 0.19387 1.077 0.281352
## cohTransfers -0.25811 0.13928 -1.853 0.063851 .
## Instructor_019 0.81635 0.39040 2.091 0.036523 *
## Instructor_0118 1.38220 0.27951 4.945 7.61e-07 ***
## Instructor_0123 1.09108 0.25031 4.359 1.31e-05 ***
## Instructor_0126 0.68791 0.26622 2.584 0.009767 **
## Instructor_0131 1.00026 0.26352 3.796 0.000147 ***
## term.units.attemptedCensus 0.08980 0.02231 4.025 5.70e-05 ***
## adm.areanonlocal -0.18265 0.13127 -1.391 0.164110
## adm.areaunknown 0.46136 0.18656 2.473 0.013402 *
## addr.typePERM 0.29377 0.15172 1.936 0.052827 .
## prevPASS 0.19025 0.09916 1.919 0.055044 .
## e.rmdRemedial in English 0.26207 0.17445 1.502 0.133028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2040.0 on 1739 degrees of freedom
## Residual deviance: 1917.3 on 1720 degrees of freedom
## AIC: 1957.3
##
## Number of Fisher Scoring iterations: 4
physio.step$anova
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 NA NA 1732 2020.045 2036.045
## 2 + coh -1 30.532896 1731 1989.512 2007.512
## 3 + Instructor_01 -5 34.220770 1726 1955.291 1983.291
## 4 + term.units.attemptedCensus -1 17.234388 1725 1938.057 1968.057
## 5 + adm.area -2 11.249234 1723 1926.808 1960.808
## 6 + addr.type -1 3.838403 1722 1922.969 1958.969
## 7 + prevPASS -1 3.389816 1721 1919.580 1957.580
## 8 + e.rmd -1 2.239564 1720 1917.340 1957.340
model.physio=formula(physio.step)
# Calc propensity scores
physio.propensity.model = glm(model.physio, data=physio, family = binomial)
summary(physio.propensity.model)
##
## Call:
## glm(formula = model.physio, family = binomial, data = physio)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3553 -0.8305 -0.6497 1.1793 2.4142
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.49733 0.97126 -3.601 0.000317 ***
## cum.percent.units.passed 0.51092 1.08310 0.472 0.637127
## genderMale -0.22663 0.11504 -1.970 0.048850 *
## urm.flgERSNon-URM -0.89482 0.39283 -2.278 0.022733 *
## urm.flgERSTwo or More Races -1.04353 0.44057 -2.369 0.017855 *
## urm.flgERSUnknown -1.38454 0.49931 -2.773 0.005556 **
## urm.flgERSURM -0.64744 0.39778 -1.628 0.103603
## higher.ed.gpa.start 0.20885 0.19387 1.077 0.281352
## cohTransfers -0.25811 0.13928 -1.853 0.063851 .
## Instructor_019 0.81635 0.39040 2.091 0.036523 *
## Instructor_0118 1.38220 0.27951 4.945 7.61e-07 ***
## Instructor_0123 1.09108 0.25031 4.359 1.31e-05 ***
## Instructor_0126 0.68791 0.26622 2.584 0.009767 **
## Instructor_0131 1.00026 0.26352 3.796 0.000147 ***
## term.units.attemptedCensus 0.08980 0.02231 4.025 5.70e-05 ***
## adm.areanonlocal -0.18265 0.13127 -1.391 0.164110
## adm.areaunknown 0.46136 0.18656 2.473 0.013402 *
## addr.typePERM 0.29377 0.15172 1.936 0.052827 .
## prevPASS 0.19025 0.09916 1.919 0.055044 .
## e.rmdRemedial in English 0.26207 0.17445 1.502 0.133028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2040.0 on 1739 degrees of freedom
## Residual deviance: 1917.3 on 1720 degrees of freedom
## AIC: 1957.3
##
## Number of Fisher Scoring iterations: 4
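# propensity score = fitted probability of PAL participation from the logistic model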
physio$propensity=physio.propensity.model$fitted.values
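# 1-to-1 propensity-score matching with replacement (the Match default), estimating the ATT;
# caliper = 0.25 SDs of the propensity score, no bias adjustment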
match.physio = Match(Y=physio$grd.pt.unt, Tr=physio$palN, X=physio$propensity, BiasAdjust = F, estimand = "ATT", M=1, caliper=0.25)
summary(match.physio)
##
## Estimate... 0.34873
## AI SE...... 0.070823
## T-stat..... 4.9239
## p.val...... 8.4817e-07
##
## Original number of observations.............. 1740
## Original number of treated obs............... 475
## Matched number of observations............... 469
## Matched number of observations (unweighted). 1200
##
## Caliper (SDs)........................................ 0.25
## Number of obs dropped by 'exact' or 'caliper' 6
num.matched.physio = length(match.physio$weights)
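# mdata$Y from Match stacks the matched treated (PAL) outcomes first, then their matched
# controls; the weights account for ties, so these weighted means reproduce the estimate above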
mean.gpa.PAL.physio = with(match.physio,
sum(weights*mdata$Y[1:num.matched.physio])/sum(weights))
mean.gpa.nonPAL.physio=with(match.physio,
sum(weights*mdata$Y[(num.matched.physio+1):(2*num.matched.physio)])/sum(weights))
mean.gpa.PAL.physio-mean.gpa.nonPAL.physio
## [1] 0.34873
psens(match.physio, Gamma=2, GammaInc=0.1)
##
## Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value
##
## Unconfounded estimate .... 0
##
## Gamma Lower bound Upper bound
## 1.0 0 0.0000
## 1.1 0 0.0000
## 1.2 0 0.0000
## 1.3 0 0.0002
## 1.4 0 0.0055
## 1.5 0 0.0529
## 1.6 0 0.2258
## 1.7 0 0.5231
## 1.8 0 0.7946
## 1.9 0 0.9390
## 2.0 0 0.9873
##
## Note: Gamma is Odds of Differential Assignment To
## Treatment Due to Unobserved Factors
##
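# odds ratios (exponentiated coefficients) from the propensity model, sorted from smallest to largest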
sort(round(exp(physio.propensity.model$coefficients),2))
## (Intercept) urm.flgERSUnknown
## 0.03 0.25
## urm.flgERSTwo or More Races urm.flgERSNon-URM
## 0.35 0.41
## urm.flgERSURM cohTransfers
## 0.52 0.77
## genderMale adm.areanonlocal
## 0.80 0.83
## term.units.attemptedCensus prevPASS
## 1.09 1.21
## higher.ed.gpa.start e.rmdRemedial in English
## 1.23 1.30
## addr.typePERM adm.areaunknown
## 1.34 1.59
## cum.percent.units.passed Instructor_0126
## 1.67 1.99
## Instructor_019 Instructor_0131
## 2.26 2.72
## Instructor_0123 Instructor_0118
## 2.98 3.98
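# covariate balance before (Diff.Un) and after (Diff.Adj) matching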
physio.bal = bal.tab(match.physio, physio.propensity.model$formula, data=physio, un=TRUE, quick=FALSE)
physio.bal
## Balance Measures
## Type Diff.Un Diff.Adj
## cum.percent.units.passed Contin. 0.0384 0.0321
## gender_Male Binary -0.0695 -0.0044
## urm.flgERS_Foreign Binary 0.0110 -0.0061
## urm.flgERS_Non-URM Binary -0.0260 0.0143
## urm.flgERS_Two or More Races Binary -0.0098 0.0062
## urm.flgERS_Unknown Binary -0.0185 -0.0024
## urm.flgERS_URM Binary 0.0433 -0.0120
## higher.ed.gpa.start Contin. 0.0997 0.0917
## coh_Transfers Binary -0.1566 -0.0120
## Instructor_01_1 Binary -0.0754 -0.0015
## Instructor_01_9 Binary -0.0045 0.0016
## Instructor_01_18 Binary 0.0341 0.0032
## Instructor_01_23 Binary 0.0662 -0.0229
## Instructor_01_26 Binary -0.0453 -0.0017
## Instructor_01_31 Binary 0.0250 0.0213
## term.units.attemptedCensus Contin. 0.3264 -0.0748
## adm.area_local Binary -0.0358 -0.0126
## adm.area_nonlocal Binary -0.0559 -0.0146
## adm.area_unknown Binary 0.0917 0.0272
## addr.type_PERM Binary 0.0224 -0.0187
## prevPASS Contin. 0.1508 -0.0102
## e.rmd_Remedial in English Binary 0.0751 0.0098
##
## Sample sizes
## Control Treated
## All 1265. 475
## Matched (ESS) 396.32 469
## Matched (Unweighted) 751. 469
## Unmatched 514. 0
## Discarded 0. 6
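# Love plot: visual summary of covariate balance before vs. after matching, with threshold lines at 0.1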
love.plot(physio.bal, binary = "raw", stars = "std", var.order = "unadjusted",
thresholds = c(m = .1), abs = F)
bio.list = list( bio131=match.physio)
mean.grd.PAL = c( mean.gpa.PAL.physio)
mean.grd.nonPAL = c( mean.gpa.nonPAL.physio)
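# elements 1 and 2 of a Match object are the ATT estimate and its Abadie-Imbens standard error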
est.diff = sapply(bio.list, '[[', 1)
est.se = sapply(bio.list, '[[', 2)
test.stat = est.diff/est.se
num.cntrl = sapply(bio.list, function(x) length(unique(x[["index.control"]])))
num.trt= sapply(bio.list, function(x) sum(x[["weights"]]))
approxPVal = (1- pnorm(test.stat)) # one-tailed test to see if PAL improves grades
results = data.frame(course=names(bio.list), mean.PAL=mean.grd.PAL, mean.nonPAL=mean.grd.nonPAL, difference.mean.gpa=round(est.diff,3), standard.error=round(est.se,3),p.val=round(approxPVal,4),number.of.nonPAL=num.cntrl, number.PAL=num.trt, row.names=NULL)
kable(results)
course | mean.PAL | mean.nonPAL | difference.mean.gpa | standard.error | p.val | number.of.nonPAL | number.PAL |
---|---|---|---|---|---|---|---|
bio131 | 2.724307 | 2.375577 | 0.349 | 0.071 | 0 | 751 | 469 |
psens(match.physio, Gamma=2, GammaInc=0.1)
##
## Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value
##
## Unconfounded estimate .... 0
##
## Gamma Lower bound Upper bound
## 1.0 0 0.0000
## 1.1 0 0.0000
## 1.2 0 0.0000
## 1.3 0 0.0002
## 1.4 0 0.0055
## 1.5 0 0.0529
## 1.6 0 0.2258
## 1.7 0 0.5231
## 1.8 0 0.7946
## 1.9 0 0.9390
## 2.0 0 0.9873
##
## Note: Gamma is Odds of Differential Assignment To
## Treatment Due to Unobserved Factors
##
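# effect magnitudes on the odds scale, exp(|coefficient|), ignoring the direction of each effect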
print(sort(exp(abs(physio.propensity.model$coefficients))))
## term.units.attemptedCensus adm.areanonlocal
## 1.093952 1.200391
## prevPASS higher.ed.gpa.start
## 1.209547 1.232262
## genderMale cohTransfers
## 1.254361 1.294482
## e.rmdRemedial in English addr.typePERM
## 1.299616 1.341482
## adm.areaunknown cum.percent.units.passed
## 1.586227 1.666822
## urm.flgERSURM Instructor_0126
## 1.910641 1.989549
## Instructor_019 urm.flgERSNon-URM
## 2.262219 2.446899
## Instructor_0131 urm.flgERSTwo or More Races
## 2.718977 2.839227
## Instructor_0123 Instructor_0118
## 2.977481 3.983663
## urm.flgERSUnknown (Intercept)
## 3.992980 33.027218