## R Code for Organic Chemistry PAL Dosing Effect ##
library(readr)
library(tidyverse)
library(cobalt)
library(WeightIt)
library(weights)
library(DataExplorer)
library(car)
library(glmnet)
library(rbounds)
library(EValue)
setwd("Course Outcomes")
dat <- read_rds("pal_data_25.rds")
dat[sapply(dat, is.character)] <- lapply(dat[sapply(dat, is.character)], as.factor)
dat$pal.flg2 <- rep("Non-PAL", length=length(dat$pal.flg))
temp <- which(dat$pal.grade == "CR")
dat$pal.flg2[temp] <- "PAL"
dat$pal.flg2 <- as.factor(dat$pal.flg2)
temp <- which(dat$ssr.cum.tr.gpa == 0)
dat$ssr.cum.tr.gpa[temp] <- NA
## Remove course repeats
temp <- which(dat$rpt.seq == 1)
dat.up <- dat[temp,]
dat.up <- dat.up[,-which(names(dat.up) %in% c("rpt.seq"))]
# Remove variables not of interest:
dat.up <- dat.up[,-which(names(dat.up) %in%
c("course.descr", "session.code", "enrl.add.dt",
"acad.group", "dept.abbr", "grade", "dfw.flg",
"instructor1", "acad.career", "acad.prog",
"Asian", "Black", "Hispanic", "Native.American",
"Pacific.Islander", "SWANA", "White",
"last.school.attended.ext.org.id", "acad.plan1st",
"plan.desc1st", "plan.degree1st", "plan.dept1st",
"plan.college1st", "acad.plan1", "plan.desc1",
"plan.dept1", "plan.college1", "plan.degree1",
"class.section", "last.school.attended",
"acad.group", "unt.taken", "coh.term", "term",
"units.passed.remedial", "cum.previous.pal",
"pal.flg","num.pal","cum.pal","cum.pal.this.course",
"cum.previous.pal.this.course","learning.mode",
"enrl.tot"))]
chem.ind <- which(dat.up$course == "CHEM 24" |
dat.up$course == "CHEM 124")
data.chem <- dat.up[chem.ind, ]
# Remove variables with >10% missing
prop.na <- function(x) sum(is.na(x))/length(x)
miss <- apply(data.chem, MARGIN=2, FUN=prop.na)
missGr10 <- which(miss > 0.1)
dat.chem <- data.chem[,-missGr10]
# Want to keep "hs.gpa"
data.chem <- data.frame(dat.chem, data.chem$hs.gpa)
dat.chem24 <- data.chem[data.chem$course == "CHEM 24",]
dat.chem124 <- data.chem[data.chem$course == "CHEM 124",]
chem.merge <- merge(dat.chem24,
dat.chem124[c("emplid","course","grade.num","pal.flg2")], by.x = "emplid", by.y = "emplid",
suffixes = c(".1",".2"))
chem.merge[sapply(chem.merge, is.factor)] <- lapply(chem.merge[sapply(chem.merge, is.factor)], droplevels)
data.chem <- chem.merge[,-which(names(chem.merge) %in%
c("emplid","course.1","course.2"))]
dat1 <- data.chem
dat1 <- dat1[complete.cases(dat1), ]
## Unweighted Means ##
# Chem 124
mean(dat1$grade.num.2[dat1$pal.flg2.1 == "PAL" & dat1$pal.flg2.2 == "PAL"])
mean(dat1$grade.num.2[dat1$pal.flg2.1 == "Non-PAL" & dat1$pal.flg2.2 == "PAL"])
# Chem 24
mean(dat1$grade.num.1[dat1$pal.flg2.1 == "PAL" & dat1$pal.flg2.2 == "PAL"])
mean(dat1$grade.num.1[dat1$pal.flg2.1 == "Non-PAL" & dat1$pal.flg2.2 == "PAL"])
## Identify variables causing complete separation in logistic regression
## Identify any variables that are constants
# lapply(dat1, unique)
# table(dat1$pal.flg2.1, dat1$last.school.attended.local.flg)
dat1$gender <- droplevels(dat1$gender)
dat1$foreign.flg <- droplevels(dat1$foreign.flg)
# collapse sparse categories
dat1=group_category(data = dat1, feature = "coh",
threshold = 0.05, update = TRUE)
dat1$coh <- as.factor(dat1$coh)
dat1$prop.pssd <- dat1$unt.passd.prgrss/(dat1$unt.taken.prgrss - dat1$units.taken.remedial)
dat1 <- dat1[,-which(names(dat1) %in%
c("unt.passd.prgrss","unt.taken.prgrss","term.units"))]
# Combine sparse ethnicity categories to Other
dat1 <- dat1 %>%
mutate(eth.ipeds = fct_collapse(eth.ipeds,
`Other` = c("Native American", "Pacific Islander",
"Two or More Races", "Unknown")))
# Collapse sparse categories for acad.stand
# Other: Academic Dismissal, Academic Disqualification
dat1 <- dat1 %>%
mutate(acad.stndng.stat.desc = fct_other(acad.stndng.stat.desc, keep = c("Good Standing")))
# Response variable
response <- "grade.num.2"
stopifnot(response %in% names(dat1))
# Predictors
predictors <- setdiff(names(dat1), response)
# Safe model summary extractor
analyze_predictor <- function(var) {
form <- as.formula(paste(response, "~", var))
mod <- lm(form, data = dat1)
mod_summary <- summary(mod)
# Get F-statistic and compute overall model p-value
if (!is.null(mod_summary$fstatistic)) {
fstat <- mod_summary$fstatistic
f_val <- as.numeric(fstat["value"])
df1 <- as.numeric(fstat["numdf"])
df2 <- as.numeric(fstat["dendf"])
p_val <- pf(f_val, df1, df2, lower.tail = FALSE)
} else {
p_val <- NA # Could not compute
}
data.frame(
predictor = var,
r_squared = round(mod_summary$r.squared,3),
p_value = round(p_val,4),
stringsAsFactors = FALSE
)
}
# Run on all predictors
results <- do.call(rbind, lapply(predictors, analyze_predictor))
# View output
View(results)
# Removed remedials units because it was aliased
# Removed last.school.attended.local.flg, unt.taken.prgrss due to high vif
mod.final <- pal.flg2.1 ~ . -pal.flg2.2 -grade.num.2 -units.taken.remedial -
last.school.attended.local.flg -base.time
mod1 <- glm(mod.final, data=dat1, family="binomial")
vif(mod1)
summary(mod1)
plot(mod1)
wts.full <- weightit(mod.final, data=dat1, estimand="ATE", method="glm")
temp <- bal.tab(wts.full, un=TRUE)
write.csv(temp$Balance, "BalanceTable.csv")
love.plot(wts.full, abs=TRUE, threshold=0.1, stars="none")
wts <- wts.full$weights
dat1$wts <- wts
plot(density(wts.full$ps[dat1$pal.flg2.2 == 'Non-PAL']), col = "blue", main = "Propensity Score Overlap")
lines(density(wts.full$ps[dat1$pal.flg2.2 == 'PAL']), col = "red")
legend("topright", legend = c("Non-PAL", "PAL"), col = c("blue", "red"), lty = 1)
# Chem 124 PAL students
length(dat1$grade.num.2[dat1$pal.flg2.1 == "PAL" & dat1$pal.flg2.2 == "PAL"])
weighted.mean(dat1$grade.num.2[dat1$pal.flg2.1 == "PAL" & dat1$pal.flg2.2 == "PAL"],
w = dat1$wts[dat1$pal.flg2.1 == "PAL" & dat1$pal.flg2.2 == "PAL"])
var.wt1 <- wtd.var(dat1$grade.num.2[dat1$pal.flg2.1 == "PAL" & dat1$pal.flg2.2 == "PAL"],
dat1$wts[dat1$pal.flg2.1 == "PAL" & dat1$pal.flg2.2 == "PAL"])
sqrt(var.wt1/46)
length(dat1$grade.num.2[dat1$pal.flg2.1 == "Non-PAL" & dat1$pal.flg2.2 == "PAL"])
weighted.mean(dat1$grade.num.2[dat1$pal.flg2.1 == "Non-PAL" & dat1$pal.flg2.2 == "PAL"],
w = dat1$wts[dat1$pal.flg2.1 == "Non-PAL" & dat1$pal.flg2.2 == "PAL"])
var.wt2 <- wtd.var(dat1$grade.num.2[dat1$pal.flg2.1 == "Non-PAL" & dat1$pal.flg2.2 == "PAL"],
dat1$wts[dat1$pal.flg2.1 == "Non-PAL" & dat1$pal.flg2.2 == "PAL"])
sqrt(var.wt2/31)
df <- data.frame(
group = c("NSM 12N", "No NSM 12N"),
mean = c(2.823134, 2.256758),
se = c(0.1538862, 0.213329)
)
ggplot(df, aes(x = group, y = mean)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black", width = 0.7) +
geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.2) +
scale_y_continuous(breaks = seq(0, 3, by = 0.5)) +
theme_minimal() +
labs(x = "NSM 12S", y = "ORGANIC II GPA", title = "")
add.eff <- which((dat1$pal.flg2.1 == "PAL" & dat1$pal.flg2.2 == "PAL") |
(dat1$pal.flg2.1 == "Non-PAL" & dat1$pal.flg2.2 == "PAL"))
dat2 <- dat1[add.eff,]
dat2$ind2 <- rep("both", length(add.eff))
dat2$ind2[which(dat2$pal.flg2.1 == "Non-PAL" & dat2$pal.flg2.2 == "PAL")] <- "Second Only"
dat2$ind2 <- as.factor(dat2$ind2)
temp <- wtd.t.test(dat2$grade.num.2[dat2$ind2=="both"],
dat2$grade.num.2[dat2$ind2=="Second Only"],
weight=dat2$wts[dat2$ind2=="both"],
weighty=dat2$wts[dat2$ind2=="Second Only"])
temp
temp$additional[1] + c(-1,1)*qt(0.975, df=temp$coefficients[2])*temp$additional[4]
evalues.OLS(est = temp$additional[1], se = temp$additional[4],
sd = sd(dat2$grade.num.2))