First Add Some R Packages to the Workspace\ Caution: warning messages are suppressed to reduce clutter in the output.
defaultW <- getOption("warn")
options(warn = -1)
library(rbounds)
library(readr)
library(tidyr)
library(tidyverse)
library(MatchIt)
library(cobalt)
library(backports)
library(DataExplorer)
library(readxl)
library(Matching)
options(warn = defaultW)
The script contains a user-defined function called id.vars.zero.freq to identify variables causing complete separation in logistic regression.
source("IdentifyCompleteSepVars2.R")
Make sure the PAL datafile in the same directory as this RMarkdown file. The files which includes data through the Spring 2019 semester has 1,099,371 rows and 174 columns. The total of the grd.pt.unt column is 2,237,555.
load("~/PAL data analysis/PALdatafull.Rdata")
dim(PALdatafull)
## [1] 1099371 174
sum(PALdatafull$grd.pt.unt)
## [1] 2237555
Each instance of a course taken by a student is included in the data file for the semester during which it was taken and every semester thereafter. 'test1' below filters so the dataset only includes the semester during which the course was taken.
bio.classes <- paste("BIO", c(22, 121, 131, 139, 184))
# test1 filters on the semester in which the course was taken; courses are included in the data file for the semester in which it was taken and every semester thereafter
test1 <- PALdatafull$base.time.course==1
test.bio <- PALdatafull$course %in% bio.classes
bio.indx <- test1 & test.bio
bio.dat <- PALdatafull[bio.indx, ]
dim(bio.dat)
## [1] 19652 174
# [1] 19652 174
The course.seq variables indicates how many times a student has taken a course prior to the current attempt. To filter on the first attempt at a course, we set course.seq to 0.
raw.table = data.frame(class=character(),
nonPALavg=numeric(),
PALavg=numeric(),
Diff=numeric(),
NonPAL_Num= integer(),
PAL_Num=integer(),
CompletePAL=numeric(),
TermPALStart=integer(),
row.names=NULL,
stringsAsFactors = FALSE)
eth.tables = list()
for (i in 1:length(bio.classes))
{
curr.class = bio.classes[i]
temp = subset(bio.dat, course==curr.class & course.seq==0)
pal.start=min(unique(temp$term.code[temp$palN==2]))
# only include terms after PAL start term
temp = subset(temp, term.code>= pal.start)
x=tapply(temp$grd.pt.unt,temp$palN,
mean, na.rm=T) %>%
as.numeric %>%
round(2)
y=table(temp$palN) %>% as.numeric
raw.table[i, 'class' ] = curr.class
raw.table[i, c(2:4,7)]=c(x[1], x[3],x[3]-x[1],
round(y[3]/sum(y),2))
raw.table[i, c(5,6,8)]= c(y[1], y[3], pal.start)
#participation by ethnicity
eth.tables[[i]] <- with(temp, table(eth.erss, palN)[,-c(2)]) #remove a few students who started PAL but did not finish
colnames(eth.tables[[i]])<- c("nonPAL","PAL")
}
names(eth.tables) <- bio.classes
#counts
eth.tables
## $`BIO 22`
## palN
## eth.erss nonPAL PAL
## African American 47 23
## Asian 302 110
## Foreign 15 7
## Hispanic 253 154
## Native American 1 0
## Pacific Islander 5 5
## Two or More Races 62 22
## Unknown 33 15
## White 207 75
##
## $`BIO 121`
## palN
## eth.erss nonPAL PAL
## African American 28 24
## Asian 287 110
## Foreign 17 12
## Hispanic 181 96
## Native American 1 0
## Pacific Islander 5 7
## Two or More Races 68 16
## Unknown 44 15
## White 222 96
##
## $`BIO 131`
## palN
## eth.erss nonPAL PAL
## African American 60 15
## Asian 438 163
## Foreign 19 13
## Hispanic 346 152
## Native American 4 0
## Pacific Islander 6 3
## Two or More Races 105 33
## Unknown 70 20
## White 418 133
##
## $`BIO 139`
## palN
## eth.erss nonPAL PAL
## African American 16 2
## Asian 105 31
## Foreign 6 4
## Hispanic 66 25
## Native American 0 0
## Pacific Islander 1 0
## Two or More Races 11 9
## Unknown 15 6
## White 77 21
##
## $`BIO 184`
## palN
## eth.erss nonPAL PAL
## African American 24 6
## Asian 147 33
## Foreign 17 4
## Hispanic 114 55
## Native American 0 0
## Pacific Islander 1 0
## Two or More Races 30 8
## Unknown 22 10
## White 146 33
# each column is 100%
# percent.eth.within.PAL.and.within.non = lapply(eth.tables, prop.table, margin=2) %>% lapply(round, digits=2)
# percent.eth.within.PAL.and.within.non
# each row is 100%
percent.PAL.and.non.within.ethnicity = lapply(eth.tables, prop.table, margin=1) %>% lapply(round, digits=2)
percent.PAL.and.non.within.ethnicity
## $`BIO 22`
## palN
## eth.erss nonPAL PAL
## African American 0.67 0.33
## Asian 0.73 0.27
## Foreign 0.68 0.32
## Hispanic 0.62 0.38
## Native American 1.00 0.00
## Pacific Islander 0.50 0.50
## Two or More Races 0.74 0.26
## Unknown 0.69 0.31
## White 0.73 0.27
##
## $`BIO 121`
## palN
## eth.erss nonPAL PAL
## African American 0.54 0.46
## Asian 0.72 0.28
## Foreign 0.59 0.41
## Hispanic 0.65 0.35
## Native American 1.00 0.00
## Pacific Islander 0.42 0.58
## Two or More Races 0.81 0.19
## Unknown 0.75 0.25
## White 0.70 0.30
##
## $`BIO 131`
## palN
## eth.erss nonPAL PAL
## African American 0.80 0.20
## Asian 0.73 0.27
## Foreign 0.59 0.41
## Hispanic 0.69 0.31
## Native American 1.00 0.00
## Pacific Islander 0.67 0.33
## Two or More Races 0.76 0.24
## Unknown 0.78 0.22
## White 0.76 0.24
##
## $`BIO 139`
## palN
## eth.erss nonPAL PAL
## African American 0.89 0.11
## Asian 0.77 0.23
## Foreign 0.60 0.40
## Hispanic 0.73 0.27
## Native American
## Pacific Islander 1.00 0.00
## Two or More Races 0.55 0.45
## Unknown 0.71 0.29
## White 0.79 0.21
##
## $`BIO 184`
## palN
## eth.erss nonPAL PAL
## African American 0.80 0.20
## Asian 0.82 0.18
## Foreign 0.81 0.19
## Hispanic 0.67 0.33
## Native American
## Pacific Islander 1.00 0.00
## Two or More Races 0.79 0.21
## Unknown 0.69 0.31
## White 0.82 0.18
# formatted table
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
kable(raw.table, caption = "Raw Comparison of PAL and non-PAL Grades (No Propensity Adjustment)")
class | nonPALavg | PALavg | Diff | NonPAL_Num | PAL_Num | CompletePAL | TermPALStart |
---|---|---|---|---|---|---|---|
BIO 22 | 1.81 | 2.33 | 0.52 | 949 | 413 | 0.29 | 2158 |
BIO 121 | 2.18 | 2.39 | 0.21 | 888 | 390 | 0.30 | 2148 |
BIO 131 | 2.33 | 2.76 | 0.43 | 1516 | 544 | 0.26 | 2148 |
BIO 139 | 2.41 | 2.61 | 0.20 | 308 | 98 | 0.24 | 2178 |
BIO 184 | 2.51 | 2.73 | 0.22 | 509 | 149 | 0.22 | 2178 |
yr.course.taken = as.numeric(gsub(".*([0-9]{4})","\\1",bio.dat$coh.term))
bio.dat$delay.from.hs = ifelse(!is.na(yr.course.taken) & !is.na(bio.dat$hs.grad.yr), yr.course.taken-bio.dat$hs.grad.yr, NA)
sum(is.na(bio.dat$delay.from.hs)) #1864 missing values
## [1] 1864
# remove 106 students who did not complete PAL
table(bio.dat$palN)
##
## 0 1 2
## 17778 106 1768
bio.dat=subset(bio.dat, palN!=1)
table(bio.dat$palN)
##
## 0 2
## 17778 1768
#recode palN to factor with 0/1 levels
bio.dat$palN = ifelse(bio.dat$palN==2, 1, 0)
#clean up category names in m.rmd and e.rmd
bio.dat$m.rmd[bio.dat$m.rmd=="Not Remedial\nin Math"]="Not Remedial in Math"
bio.dat$m.rmd[bio.dat$m.rmd=="Remedial\nin Math"]="Remedial in Math"
bio.dat$m.rmd = droplevels(bio.dat$m.rmd)
bio.dat$e.rmd[bio.dat$e.rmd=="Not Remedial\nin English"]="Not Remedial in English"
bio.dat$e.rmd[bio.dat$e.rmd=="Remedial\nin English"]="Remedial in English"
bio.dat$e.rmd = droplevels(bio.dat$e.rmd)
table(bio.dat$e.rmd)
##
## Not Remedial in English Remedial in English
## 15474 3704
# create feature, proportion of cumulative units taken that were passes
bio.dat$cum.percent.units.passed = bio.dat$tot.passd.prgrss.start/bio.dat$tot.taken.prgrss.start
# collapse sparse categories
bio.dat=group_category(data = bio.dat, feature = "load.code", threshold = 0.05, update = TRUE)
table(bio.dat$load.code, bio.dat$palN, droplevels(bio.dat$course))
## , , = BIO 121
##
##
## 0 1
## F 2104 383
## H 428 13
## OTHER 308 61
##
## , , = BIO 131
##
##
## 0 1
## F 3127 491
## H 486 19
## OTHER 462 68
##
## , , = BIO 139
##
##
## 0 1
## F 2315 87
## H 402 1
## OTHER 340 15
##
## , , = BIO 184
##
##
## 0 1
## F 2751 144
## H 417 5
## OTHER 339 6
##
## , , = BIO 22
##
##
## 0 1
## F 3332 430
## H 496 9
## OTHER 471 36
bio.dat=group_category(data = bio.dat, feature = "addr.type", threshold = 0.05, update = TRUE)
table(bio.dat$addr.type, bio.dat$palN, droplevels(bio.dat$course))
## , , = BIO 121
##
##
## 0 1
## OTHER 1263 101
## PERM 1577 356
##
## , , = BIO 131
##
##
## 0 1
## OTHER 1608 103
## PERM 2467 475
##
## , , = BIO 139
##
##
## 0 1
## OTHER 1530 19
## PERM 1527 84
##
## , , = BIO 184
##
##
## 0 1
## OTHER 1411 20
## PERM 2096 135
##
## , , = BIO 22
##
##
## 0 1
## OTHER 1627 70
## PERM 2672 405
# code instructor as number for anonymity ----------------------------------
bio.dat$Instructor_01=droplevels(factor(bio.dat$Instructor_01))
instructor.vec = sort(unique(bio.dat$Instructor_01))
num.instr = length(instructor.vec)
bio.dat$Instructor_01 = factor(bio.dat$Instructor_01, levels = instructor.vec, labels=as.character(1:num.instr))
key.instr.code = cbind(as.character(instructor.vec),
1:num.instr)
bio184.dat <- subset(bio.dat, course=="BIO 184" & term.code>=2178)
table(bio184.dat$palN)
##
## 0 1
## 568 155
bio184.dat = subset(bio184.dat, course.seq==0)
table(bio184.dat$palN)
##
## 0 1
## 509 149
Remove 36 variables having over 5% of values missing in order to retain a larger pool of PAL and non-PAL students. None of the removed variables seem likely to be confounders. We will only include students with no missing values for the subset of variables specified in the 'Filter to relevant variables' section below.
plot_missing(bio184.dat, missing_only=TRUE)
missingness.rpt = profile_missing(bio184.dat)
missingness.rpt[order(missingness.rpt$pct_missing),]
## feature num_missing pct_missing
## 1 emplid 0 0.000000000
## 2 term.code 0 0.000000000
## 3 term 0 0.000000000
## 11 fys.flg 0 0.000000000
## 14 hous.coh.term.flg 0 0.000000000
## 18 pledge.flg 0 0.000000000
## 50 AP_BIOL 0 0.000000000
## 51 AP_CALAB 0 0.000000000
## 52 AP_CALBC 0 0.000000000
## 53 AP_CHEM 0 0.000000000
## 54 AP_PHYS 0 0.000000000
## 55 AP_PHYS2 0 0.000000000
## 56 AP_PHYSB 0 0.000000000
## 57 AP_PHYSC 0 0.000000000
## 58 AP_PHYSM 0 0.000000000
## 59 AP_STAT 0 0.000000000
## 60 AP_BIOL.flg 0 0.000000000
## 61 AP_CALAB.flg 0 0.000000000
## 62 AP_CALBC.flg 0 0.000000000
## 63 AP_CHEM.flg 0 0.000000000
## 64 AP_PHYS.flg 0 0.000000000
## 65 AP_PHYS2.flg 0 0.000000000
## 66 AP_PHYSB.flg 0 0.000000000
## 67 AP_PHYSC.flg 0 0.000000000
## 68 AP_PHYSM.flg 0 0.000000000
## 69 AP_STAT.flg 0 0.000000000
## 70 addr.type 0 0.000000000
## 73 country 0 0.000000000
## 74 median.income 0 0.000000000
## 75 pct.female.head 0 0.000000000
## 76 pct.female.head.flg 0 0.000000000
## 77 med.inc.flg 0 0.000000000
## 78 coh.term.course 0 0.000000000
## 79 coh.term.code.course 0 0.000000000
## 80 career.course 0 0.000000000
## 81 acad.prog.course 0 0.000000000
## 82 reason 0 0.000000000
## 83 course 0 0.000000000
## 84 section 0 0.000000000
## 85 component 0 0.000000000
## 86 grade 0 0.000000000
## 87 grd.pt.unt 0 0.000000000
## 88 grade.base 0 0.000000000
## 89 units 0 0.000000000
## 90 course.numeric 0 0.000000000
## 91 div 0 0.000000000
## 92 Instructor_01 0 0.000000000
## 94 course.seq 0 0.000000000
## 95 rpt.flg 0 0.000000000
## 96 uniq.course 0 0.000000000
## 97 uniq.sect 0 0.000000000
## 98 pass 0 0.000000000
## 99 pal 0 0.000000000
## 100 c2s 0 0.000000000
## 102 base.time.course 0 0.000000000
## 103 years 0 0.000000000
## 105 load.code 0 0.000000000
## 142 csus.gpa.start 0 0.000000000
## 143 higher.ed.gpa.start 0 0.000000000
## 157 termAbbr 0 0.000000000
## 158 pass.term.flg 0 0.000000000
## 159 palN 0 0.000000000
## 160 passN 0 0.000000000
## 161 treat.flg 0 0.000000000
## 162 course.age 0 0.000000000
## 163 stem.flg 0 0.000000000
## 166 stem.degree.flg 0 0.000000000
## 169 currPAL 0 0.000000000
## 170 currPASS 0 0.000000000
## 171 prevPAL 0 0.000000000
## 172 prevPASS 0 0.000000000
## 173 csus.gpa.start.flg 0 0.000000000
## 174 higher.ed.gpa.start.flg 0 0.000000000
## 6 gender 2 0.003039514
## 19 m.rmd 2 0.003039514
## 20 m.rmd.detail 2 0.003039514
## 23 e.rmd 2 0.003039514
## 24 e.rmd.detail 2 0.003039514
## 25 rmd.flg 2 0.003039514
## 26 rmd.subj.flg 2 0.003039514
## 35 adm.area 2 0.003039514
## 36 veteran 2 0.003039514
## 37 hs.eng.units 2 0.003039514
## 38 hs.math.units 2 0.003039514
## 39 hs.phys.units 2 0.003039514
## 40 hs.bio.units 2 0.003039514
## 41 hs.lab.units 2 0.003039514
## 42 hs.forLang.units 2 0.003039514
## 43 hs.socstud.units 2 0.003039514
## 44 hs.perfarts.units 2 0.003039514
## 45 hs.elective.units 2 0.003039514
## 46 ge.critical.thinking.status 2 0.003039514
## 47 ge.english.comp.status 2 0.003039514
## 48 ge.math.status 2 0.003039514
## 49 ge.oral.comm.status 2 0.003039514
## 72 state 2 0.003039514
## 104 career.num 3 0.004559271
## 106 bot.level 3 0.004559271
## 107 enroll.status 3 0.004559271
## 108 term.gpa 3 0.004559271
## 109 higher.ed.gpa 3 0.004559271
## 110 csus.gpa 3 0.004559271
## 111 withdraw_code 3 0.004559271
## 113 unt_taken_prgrss 3 0.004559271
## 114 unt_passd_prgrss 3 0.004559271
## 115 tot_cumulative 3 0.004559271
## 116 cur_resident_terms 3 0.004559271
## 117 cum_resident_terms 3 0.004559271
## 118 censusMajor 3 0.004559271
## 119 campus.gpa 3 0.004559271
## 120 total.gpa 3 0.004559271
## 121 term.units.attemptedCensus 3 0.004559271
## 149 acad.plan 3 0.004559271
## 147 acad.stand 6 0.009118541
## 167 pass.flg 6 0.009118541
## 168 pass.flgN 6 0.009118541
## 4 coh 7 0.010638298
## 5 coh.term 7 0.010638298
## 13 pell.coh.term.flg 7 0.010638298
## 71 zip 7 0.010638298
## 122 base.time.coh 7 0.010638298
## 123 enrl.flg 7 0.010638298
## 124 enrl.flgERS 7 0.010638298
## 125 pst.flg 7 0.010638298
## 126 pst.flgERS 7 0.010638298
## 127 grad.flg 7 0.010638298
## 128 grad.flgERS 7 0.010638298
## 129 rtn.flg 7 0.010638298
## 130 rtn.flgERS 7 0.010638298
## 140 ttgF 7 0.010638298
## 141 maxUnits 7 0.010638298
## 146 tot_cumulative.start 7 0.010638298
## 154 hs.grad.yr 7 0.010638298
## 165 stem.final.flg 7 0.010638298
## 175 delay.from.hs 7 0.010638298
## 7 eth.erss 8 0.012158055
## 8 urm.flgERS 8 0.012158055
## 148 admis.plan 8 0.012158055
## 164 stem.base.flg 8 0.012158055
## 15 mother.ed 30 0.045592705
## 16 father.ed 62 0.094224924
## 21 m.rmd.admin 112 0.170212766
## 22 m.rmd.admin.detail 112 0.170212766
## 150 plan.college 124 0.188449848
## 151 plan.college.desc 124 0.188449848
## 152 plan.dept 124 0.188449848
## 153 plan.deptAbbr 124 0.188449848
## 155 plan.degree 124 0.188449848
## 156 plan.type 124 0.188449848
## 34 admit.term 135 0.205167173
## 144 tot.passd.prgrss.start 139 0.211246201
## 145 tot.taken.prgrss.start 139 0.211246201
## 176 cum.percent.units.passed 139 0.211246201
## 32 hs.gpa 268 0.407294833
## 27 sat.math.score 294 0.446808511
## 28 sat.math.flg 294 0.446808511
## 29 sat.verbal.score 294 0.446808511
## 30 sat.verbal.flg 294 0.446808511
## 31 sat.test.date 294 0.446808511
## 17 pledge.term 448 0.680851064
## 9 fys.term.code 465 0.706686930
## 10 fys.grd 465 0.706686930
## 12 fys.rpt.flg 465 0.706686930
## 33 trf.gpaADM 467 0.709726444
## 101 treat.section 509 0.773556231
## 131 deg.plan1 561 0.852583587
## 137 grad.term 561 0.852583587
## 138 grad.termERS 561 0.852583587
## 139 ttg 561 0.852583587
## 112 withdraw_reason 655 0.995440729
## 132 deg.plan2 656 0.996960486
## 93 Instructor_02 658 1.000000000
## 133 deg.plan3 658 1.000000000
## 134 deg.plan4 658 1.000000000
## 135 deg.plan5 658 1.000000000
## 136 deg.plan6 658 1.000000000
#************************************************************
# 45% of SAT's missing, 41% of hs.gpa missing, higher.ed.gpa.start is used as a proxy for level of preparation/ability, prereq data may be more closely related to outcome, unable to obtain prereq data
#**********************************************************
sum(missingness.rpt$pct_missing>0.05) # remove 36 variables with >5% missing values
## [1] 36
vars.missing.data = as.character(missingness.rpt[missingness.rpt$pct_missing>0.05, "feature"])
# tried keeping bio 1 grade, p=0.39 not significant in prop model and reduces PAL number from 142 to 90. Add back when there is more complete data
bio184.dat = bio184.dat[,!(names(bio184.dat) %in% vars.missing.data)]
# Missingness of reduced Bio 184 data -------------------------------------
introduce(bio184.dat) # 658 rows
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 658 140 81 59 0
## total_missing_values complete_rows total_observations memory_usage
## 1 310 608 92120 734424
red.missingness.rpt = profile_missing(bio184.dat)
# red.missingness.rpt[order(red.missingness.rpt$pct_missing),]
Some variables may be single-values and are, thus, unrelated to propensity score. There were 10 single-valued variables in this dataset which were removed.
single.category.vars = sapply(bio184.dat, function(x) length(unique(x))==1)
sum(single.category.vars)
## [1] 15
keep.vars = names(which(single.category.vars==FALSE))
bio184.dat = bio184.dat[, keep.vars]
sep.cols = id.vars.zero.freq(bio184.dat, treat.col= which(names(bio184.dat)=="palN"))
sum(sep.cols)
## [1] 27
# collapse categories and keep censusMajor. use urm.flgERS instead of eth.erss
names(bio184.dat)[sep.cols]
## [1] "coh.term" "eth.erss"
## [3] "rmd.subj.flg" "veteran"
## [5] "ge.english.comp.status" "AP_PHYSM.flg"
## [7] "state" "career.course"
## [9] "acad.prog.course" "reason"
## [11] "grade" "grade.base"
## [13] "pal" "withdraw_code"
## [15] "censusMajor" "enrl.flg"
## [17] "pst.flg" "pst.flgERS"
## [19] "grad.flg" "grad.flgERS"
## [21] "rtn.flg" "ttgF"
## [23] "acad.stand" "admis.plan"
## [25] "acad.plan" "treat.flg"
## [27] "higher.ed.gpa.start.flg"
bio184.dat= group_category(data = bio184.dat, feature = "censusMajor", threshold = 0.05, update = TRUE)
table(bio184.dat$censusMajor, bio184.dat$palN)
##
## 0 1
## Biochemistry 41 17
## Biology 386 100
## Chemistry 27 12
## Clinical Science/Biomedical Laboratory Science 23 11
## OTHER 32 9
with(bio184.dat, table(urm.flgERS, palN))
## palN
## urm.flgERS 0 1
## Foreign 17 4
## Non-URM 293 66
## Two or More Races 30 8
## Unknown 22 10
## URM 139 61
vars.to.keep = c("grd.pt.unt","palN","coh","urm.flgERS","fys.flg","pell.coh.term.flg","hous.coh.term.flg","mother.ed","father.ed","m.rmd","e.rmd","adm.area","ge.math.status","addr.type","median.income","pct.female.head","Instructor_01","bot.level","censusMajor","term.units.attemptedCensus","base.time.coh","higher.ed.gpa.start","course.age","stem.flg","prevPASS","cum.percent.units.passed","gender","load.code","prevPAL")
# father.ed and cum.percent.units.passed were dropped since they have over 5% missing. Due to small number of PAL students, trying to retain as many students as possible
new.vars = intersect(vars.to.keep, names(bio184.dat))
bio184.dat = bio184.dat[ ,new.vars]
dim(bio184.dat)
## [1] 658 27
bio184.dat = bio184.dat[complete.cases(bio184.dat), ]
dim(bio184.dat)
## [1] 622 27
sep.cols = id.vars.zero.freq(bio184.dat, treat.col= which(names(bio184.dat)=="palN"))
sum(sep.cols)
## [1] 0
Subjectively identified two potential confounders to force the model to retain: gender and urm.flgERS. Stepwise variable selection will be used to select which of the variables currently in the PAL dataset to include in the propensity model.
# Bio 184 propensity score model ------------------------------------------
min.model3 = glm(palN ~gender+urm.flgERS, data=bio184.dat, family=binomial)
summary(min.model3) # cum.percent.units.passed p=.541 when included in model
##
## Call:
## glm(formula = palN ~ gender + urm.flgERS, family = binomial,
## data = bio184.dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0137 -0.7182 -0.6100 -0.6100 2.0062
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8690 0.6348 -2.944 0.00324 **
## genderMale 0.2832 0.1988 1.424 0.15435
## urm.flgERSNon-URM 0.2819 0.6425 0.439 0.66081
## urm.flgERSTwo or More Races 0.3887 0.7576 0.513 0.60790
## urm.flgERSUnknown 1.1878 0.7419 1.601 0.10936
## urm.flgERSURM 0.9395 0.6466 1.453 0.14624
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 670.71 on 621 degrees of freedom
## Residual deviance: 655.44 on 616 degrees of freedom
## AIC: 667.44
##
## Number of Fisher Scoring iterations: 4
# Transfers many not have had chance to enroll in PAL, this looks like the first UD class for Bio with PAL
biggest3 = formula(glm(palN ~.-grd.pt.unt-prevPAL, data=bio184.dat, family=binomial))
bio184.step.first.order = step(min.model3,
direction="forward",scope = biggest3, trace=F)
summary(bio184.step.first.order)
##
## Call:
## glm(formula = palN ~ gender + urm.flgERS + load.code + mother.ed +
## e.rmd + term.units.attemptedCensus + stem.flg + bot.level +
## censusMajor + higher.ed.gpa.start + course.age, family = binomial,
## data = bio184.dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5534 -0.7335 -0.5169 -0.2399 2.5899
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -5.06331 1.73853
## genderMale 0.27322 0.21806
## urm.flgERSNon-URM 0.52603 0.68393
## urm.flgERSTwo or More Races 0.52510 0.81182
## urm.flgERSUnknown 1.80314 0.80014
## urm.flgERSURM 1.15642 0.69040
## load.codeH -0.31061 0.66990
## load.codeOTHER -0.95492 0.52631
## mother.ed4-Year College Graduate -0.24117 0.45260
## mother.edHigh School Graduate 0.28675 0.43471
## mother.edNo High School 1.18171 0.45136
## mother.edPostgraduate 0.38760 0.48844
## mother.edSome College 0.35076 0.43679
## mother.edSome High School 0.69931 0.47356
## e.rmdRemedial in English 0.63589 0.26748
## term.units.attemptedCensus 0.09362 0.06206
## stem.flgSTEM -0.67748 0.26667
## bot.levelSenior -0.69012 0.23756
## bot.levelSophomore 0.04390 0.45049
## censusMajorBiology -0.96645 0.35401
## censusMajorChemistry -0.32545 0.50915
## censusMajorClinical Science/Biomedical Laboratory Science -0.13214 0.52794
## censusMajorOTHER -0.82534 0.53836
## higher.ed.gpa.start 0.53484 0.27459
## course.age 0.05350 0.03366
## z value Pr(>|z|)
## (Intercept) -2.912 0.00359 **
## genderMale 1.253 0.21023
## urm.flgERSNon-URM 0.769 0.44182
## urm.flgERSTwo or More Races 0.647 0.51775
## urm.flgERSUnknown 2.254 0.02423 *
## urm.flgERSURM 1.675 0.09393 .
## load.codeH -0.464 0.64288
## load.codeOTHER -1.814 0.06962 .
## mother.ed4-Year College Graduate -0.533 0.59414
## mother.edHigh School Graduate 0.660 0.50948
## mother.edNo High School 2.618 0.00884 **
## mother.edPostgraduate 0.794 0.42747
## mother.edSome College 0.803 0.42196
## mother.edSome High School 1.477 0.13975
## e.rmdRemedial in English 2.377 0.01744 *
## term.units.attemptedCensus 1.509 0.13139
## stem.flgSTEM -2.541 0.01107 *
## bot.levelSenior -2.905 0.00367 **
## bot.levelSophomore 0.097 0.92237
## censusMajorBiology -2.730 0.00633 **
## censusMajorChemistry -0.639 0.52270
## censusMajorClinical Science/Biomedical Laboratory Science -0.250 0.80236
## censusMajorOTHER -1.533 0.12526
## higher.ed.gpa.start 1.948 0.05144 .
## course.age 1.590 0.11189
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 670.71 on 621 degrees of freedom
## Residual deviance: 596.21 on 597 degrees of freedom
## AIC: 646.21
##
## Number of Fisher Scoring iterations: 5
model.first.order184=formula(bio184.step.first.order)
# Calc propensity scores --------------------------------------------------
bio184.propensity.model = glm(model.first.order184, data=bio184.dat, family = binomial)
summary(bio184.propensity.model)
##
## Call:
## glm(formula = model.first.order184, family = binomial, data = bio184.dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5534 -0.7335 -0.5169 -0.2399 2.5899
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -5.06331 1.73853
## genderMale 0.27322 0.21806
## urm.flgERSNon-URM 0.52603 0.68393
## urm.flgERSTwo or More Races 0.52510 0.81182
## urm.flgERSUnknown 1.80314 0.80014
## urm.flgERSURM 1.15642 0.69040
## load.codeH -0.31061 0.66990
## load.codeOTHER -0.95492 0.52631
## mother.ed4-Year College Graduate -0.24117 0.45260
## mother.edHigh School Graduate 0.28675 0.43471
## mother.edNo High School 1.18171 0.45136
## mother.edPostgraduate 0.38760 0.48844
## mother.edSome College 0.35076 0.43679
## mother.edSome High School 0.69931 0.47356
## e.rmdRemedial in English 0.63589 0.26748
## term.units.attemptedCensus 0.09362 0.06206
## stem.flgSTEM -0.67748 0.26667
## bot.levelSenior -0.69012 0.23756
## bot.levelSophomore 0.04390 0.45049
## censusMajorBiology -0.96645 0.35401
## censusMajorChemistry -0.32545 0.50915
## censusMajorClinical Science/Biomedical Laboratory Science -0.13214 0.52794
## censusMajorOTHER -0.82534 0.53836
## higher.ed.gpa.start 0.53484 0.27459
## course.age 0.05350 0.03366
## z value Pr(>|z|)
## (Intercept) -2.912 0.00359 **
## genderMale 1.253 0.21023
## urm.flgERSNon-URM 0.769 0.44182
## urm.flgERSTwo or More Races 0.647 0.51775
## urm.flgERSUnknown 2.254 0.02423 *
## urm.flgERSURM 1.675 0.09393 .
## load.codeH -0.464 0.64288
## load.codeOTHER -1.814 0.06962 .
## mother.ed4-Year College Graduate -0.533 0.59414
## mother.edHigh School Graduate 0.660 0.50948
## mother.edNo High School 2.618 0.00884 **
## mother.edPostgraduate 0.794 0.42747
## mother.edSome College 0.803 0.42196
## mother.edSome High School 1.477 0.13975
## e.rmdRemedial in English 2.377 0.01744 *
## term.units.attemptedCensus 1.509 0.13139
## stem.flgSTEM -2.541 0.01107 *
## bot.levelSenior -2.905 0.00367 **
## bot.levelSophomore 0.097 0.92237
## censusMajorBiology -2.730 0.00633 **
## censusMajorChemistry -0.639 0.52270
## censusMajorClinical Science/Biomedical Laboratory Science -0.250 0.80236
## censusMajorOTHER -1.533 0.12526
## higher.ed.gpa.start 1.948 0.05144 .
## course.age 1.590 0.11189
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 670.71 on 621 degrees of freedom
## Residual deviance: 596.21 on 597 degrees of freedom
## AIC: 646.21
##
## Number of Fisher Scoring iterations: 5
bio184.dat$propensity=bio184.propensity.model$fitted.values
match.bio184 = Match(Y=bio184.dat$grd.pt.unt, Tr=bio184.dat$palN, X=bio184.dat$propensity, BiasAdjust = F, estimand = "ATT", M=1, caliper=0.25)
Check to see if matching resulted in good covariate balance between the PAL and nonPAL groups. Covariate balance is not perfect, but reasonable for this courses.
bio184.bal = bal.tab(match.bio184, bio184.propensity.model$formula, data=bio184.dat, un=TRUE, quick=FALSE)
bio184.bal
## Balance Measures
## Type Diff.Un
## gender_Male Binary 0.0653
## urm.flgERS_Foreign Binary -0.0145
## urm.flgERS_Non-URM Binary -0.1391
## urm.flgERS_Two or More Races Binary -0.0116
## urm.flgERS_Unknown Binary 0.0324
## urm.flgERS_URM Binary 0.1328
## load.code_F Binary 0.1172
## load.code_H Binary -0.0235
## load.code_OTHER Binary -0.0937
## mother.ed_2-Year College Graduate Binary -0.0365
## mother.ed_4-Year College Graduate Binary -0.1059
## mother.ed_High School Graduate Binary -0.0040
## mother.ed_No High School Binary 0.1298
## mother.ed_Postgraduate Binary -0.0065
## mother.ed_Some College Binary -0.0026
## mother.ed_Some High School Binary 0.0257
## e.rmd_Remedial in English Binary 0.0825
## term.units.attemptedCensus Contin. 0.2981
## stem.flg_STEM Binary -0.0569
## bot.level_Junior Binary 0.0625
## bot.level_Senior Binary -0.0816
## bot.level_Sophomore Binary 0.0191
## censusMajor_Biochemistry Binary 0.0333
## censusMajor_Biology Binary -0.0844
## censusMajor_Chemistry Binary 0.0157
## censusMajor_Clinical Science/Biomedical Laboratory Science Binary 0.0310
## censusMajor_OTHER Binary 0.0045
## higher.ed.gpa.start Contin. 0.0547
## course.age Contin. -0.0246
## Diff.Adj
## gender_Male -0.0622
## urm.flgERS_Foreign 0.0099
## urm.flgERS_Non-URM -0.0054
## urm.flgERS_Two or More Races 0.0163
## urm.flgERS_Unknown -0.0017
## urm.flgERS_URM -0.0191
## load.code_F 0.0280
## load.code_H -0.0181
## load.code_OTHER -0.0099
## mother.ed_2-Year College Graduate 0.0186
## mother.ed_4-Year College Graduate 0.0024
## mother.ed_High School Graduate 0.0429
## mother.ed_No High School -0.0029
## mother.ed_Postgraduate 0.0103
## mother.ed_Some College -0.0573
## mother.ed_Some High School -0.0140
## e.rmd_Remedial in English -0.0338
## term.units.attemptedCensus -0.0326
## stem.flg_STEM -0.0728
## bot.level_Junior 0.0783
## bot.level_Senior -0.0690
## bot.level_Sophomore -0.0093
## censusMajor_Biochemistry 0.0227
## censusMajor_Biology -0.0484
## censusMajor_Chemistry 0.0233
## censusMajor_Clinical Science/Biomedical Laboratory Science 0.0035
## censusMajor_OTHER -0.0012
## higher.ed.gpa.start -0.0482
## course.age -0.2898
##
## Sample sizes
## Control Treated
## All 479. 143
## Matched (ESS) 75.19 143
## Matched (Unweighted) 153. 143
## Unmatched 326. 0
love.plot(bio184.bal, binary = "raw", stars = "std", var.order = "unadjusted",
thresholds = c(m = .1), abs = F)
The estimated increase in the mean grade of students in PAL over those not in PAL after correcting for self-selection biases is .
summary(match.bio184)
##
## Estimate... 0.2278
## AI SE...... 0.13877
## T-stat..... 1.6415
## p.val...... 0.10069
##
## Original number of observations.............. 622
## Original number of treated obs............... 143
## Matched number of observations............... 143
## Matched number of observations (unweighted). 207
##
## Caliper (SDs)........................................ 0.25
## Number of obs dropped by 'exact' or 'caliper' 0
num.matched84 = length(match.bio184$weights)
mean.gpa.PAL.bio184 = with(match.bio184,
sum(weights*mdata$Y[1:num.matched84])/sum(weights))
mean.gpa.nonPAL.bio184=with(match.bio184,
sum(weights*mdata$Y[(num.matched84+1):(2*num.matched84)])/sum(weights))
paste("mean.gpa.PAL=", mean.gpa.PAL.bio184)
## [1] "mean.gpa.PAL= 2.72027972027972"
paste("mean.gpa.nonPAL=", mean.gpa.nonPAL.bio184)
## [1] "mean.gpa.nonPAL= 2.49248251748252"
mean.gpa.PAL.bio184-mean.gpa.nonPAL.bio184
## [1] 0.2277972
psens(match.bio184, Gamma=2, GammaInc=0.1)
##
## Rosenbaum Sensitivity Test for Wilcoxon Signed Rank P-Value
##
## Unconfounded estimate .... 0.0206
##
## Gamma Lower bound Upper bound
## 1.0 0.0206 0.0206
## 1.1 0.0046 0.0687
## 1.2 0.0009 0.1633
## 1.3 0.0002 0.3021
## 1.4 0.0000 0.4638
## 1.5 0.0000 0.6205
## 1.6 0.0000 0.7515
## 1.7 0.0000 0.8485
## 1.8 0.0000 0.9133
## 1.9 0.0000 0.9531
## 2.0 0.0000 0.9759
##
## Note: Gamma is Odds of Differential Assignment To
## Treatment Due to Unobserved Factors
##
print(sort(exp(abs(bio184.propensity.model$coefficients))))
## bot.levelSophomore
## 1.044880
## course.age
## 1.054961
## term.units.attemptedCensus
## 1.098142
## censusMajorClinical Science/Biomedical Laboratory Science
## 1.141267
## mother.ed4-Year College Graduate
## 1.272735
## genderMale
## 1.314190
## mother.edHigh School Graduate
## 1.332095
## load.codeH
## 1.364263
## censusMajorChemistry
## 1.384651
## mother.edSome College
## 1.420140
## mother.edPostgraduate
## 1.473437
## urm.flgERSTwo or More Races
## 1.690625
## urm.flgERSNon-URM
## 1.692200
## higher.ed.gpa.start
## 1.707183
## e.rmdRemedial in English
## 1.888704
## stem.flgSTEM
## 1.968916
## bot.levelSenior
## 1.993964
## mother.edSome High School
## 2.012374
## censusMajorOTHER
## 2.282658
## load.codeOTHER
## 2.598457
## censusMajorBiology
## 2.628595
## urm.flgERSURM
## 3.178521
## mother.edNo High School
## 3.259943
## urm.flgERSUnknown
## 6.068665
## (Intercept)
## 158.112255