## Analysis on the PGLW dataset. ## The Code runs on the Software R Version 2.7.2, the Version of R when this research ## was conducted. The changes of packages in R since then may cause compatibility issue. To avoid ## the compatibility issue, it is recommended that the user downloads R 2.7.2 and run the ## code under the version. The first half of the code works on data organization, fitting ## ignorable models, and may take time to finish. The latter part for computing ISNI is very quick. rm(list=ls()) library(nnet) library(numDeriv) library(rgenoud) library(nlme) source("funcintermiss.R") ## read in dataset pglw= read.table('pglwlong.dat',header=F, col.names=c("id","time","age","birth","education","racesex","y"), na.strings = "." ) ## create missingness indicator pglw = fun.table(data=pglw, time=c(0,2,5,7)) ## create the missingness statu at the 2nd visit prior to the current time. ## ig, igp, igp2 recodes the dropout to intermittent missingness for alternative specification ## of missingness types. pglw$dg= pglw$g pglw$dgp= pglw$gp pglw$ig= ifelse(pglw$g==2, 1, pglw$g) pglw$igp = ifelse(pglw$gp==2, 1, pglw$gp) ## now create igp2 and dgp2 for the 2nd visit before the current time. pglw$igp2=pglw$dgp2=-99 uid=unique(pglw$id) pglw$ambig=0 for (i in 1: length(uid)) { print(i) datai=pglw[pglw$id==uid[i],] if (all(is.na(datai$y)==c(F,F,F,T)) | all(is.na(datai$y)==c(F,F,T,T)) | all(is.na(datai$y)==c(F,T,T,T)) |all(is.na(datai$y)==c(F,T,F,T))) { datai$ambig=rep(1,4) } datai$igp2=ifelse(datai$time==0 | datai$time==2, -1, ifelse(datai$time== 5, 0, datai$ig[datai$time==2])) pglw$igp2[pglw$id==uid[i]]= datai$igp2 datai$dgp2=ifelse(datai$time==0 | datai$time==2, -1, ifelse(datai$time== 5, 0, datai$dg[datai$time==2])) pglw$dgp2[pglw$id==uid[i]]= datai$dgp2 pglw$ambig[pglw$id==uid[i]]= datai$ambig } rm(datai); rm(uid) pglw$gp2=pglw$dgp2 length(unique(pglw$id[pglw$g==2])) ## create categorical variables pglw$educ1 = ifelse(pglw$education==1,1,0) pglw$educ2 = ifelse(pglw$education==2,1,0) pglw$educ3 = ifelse(pglw$education==3,1,0) pglw$interc1 = ifelse(pglw$racesex==1,1,0) pglw$interc2 = ifelse(pglw$racesex==2,1,0) pglw$interc3 = ifelse(pglw$racesex==3,1,0) pglw$interc4 = ifelse(pglw$racesex==4,1,0) pglw$slope1 = pglw$interc1 * pglw$time pglw$slope2 = pglw$interc2 * pglw$time pglw$slope3 = pglw$interc3 * pglw$time pglw$slope4 = pglw$interc4 * pglw$time pglw$year5= ifelse(pglw$time==5,1,0) pglw$year7= ifelse(pglw$time==7,1,0) pglw$year52= pglw$interc2*pglw$year5 pglw$year53= pglw$interc3*pglw$year5 pglw$year54= pglw$interc4*pglw$year5 pglw$year72= pglw$interc2*pglw$year7 pglw$year73= pglw$interc3*pglw$year7 pglw$year74= pglw$interc4*pglw$year7 pglw$yp2= pglw$interc2*pglw$yp pglw$yp3= pglw$interc3*pglw$yp pglw$yp4= pglw$interc4*pglw$yp pglw$ypage=pglw$yp*pglw$age pglw$ypeduc1=pglw$yp*pglw$educ1 pglw$ypeduc2=pglw$yp*pglw$educ2 pglw$ypt5=pglw$yp*pglw$year5 pglw$ypt7=pglw$yp*pglw$year7 pglw$yp2t7=pglw$yp2*pglw$year7 pglw$yp3t7=pglw$yp3*pglw$year7 pglw$yp4t7=pglw$yp4*pglw$year7 pglw$female= ifelse(pglw$racesex==2 | pglw$racesex==4,1,0) pglw$black=ifelse(pglw$racesex==1 | pglw$racesex==2,1,0) pglw$ypfemale=pglw$yp*pglw$female pglw$ypblack=pglw$yp*pglw$black pglw$agestd=(pglw$age-mean(pglw$age))/10 pglw$ypagestd=pglw$yp*pglw$agestd pglw$blackt5=pglw$black* pglw$year5 pglw$blackt7=pglw$black* pglw$year7 pglw$blackfemale=pglw$black*pglw$female pglw.gen=pglw ## fit missing-data model gmodel0= as.factor(g)~ black+ female+blackfemale +year5+year7+blackt5+blackt7 + yp ##+agestd+educ2+educ3 ##+year52+year53+year54+year72+year73+year74+ agestd+educ1+educ2 ##+ ypfemale+ypblack+ypeduc1+ypeduc2+ypt5+ypt7 gmodel1= as.factor(g)~ black+female+blackfemale+year5+ year7+blackt5+ blackt7+yp ##+ agestd+educ2+educ3 ##+ ypblack+ypfemale+ ypeduc1+ypeduc2+ypt7 +agestd+ educ1+educ2 ##agestd+educ1+educ2+ blackt7+ yp + ypmale+ypblack +ypagestd+ypeduc1+ypeduc2+ypt7-1 gmodel=list(gmodel0=gmodel0,gmodel1=gmodel1) ## (1) This is for calclation with three missingness types. To run (1), comment out code under (2) and rerun the code ##pglw.gres = fun.gfit(data=pglw, gmodel= gmodel, lag=2) ## (2) for only two missingness type. To run (2), comment out code under (1) and rerun the code. ## recode G to have only two missingness types pglw$g=pglw$ig; pglw$gp=pglw$igp; pglw$gp2=pglw$igp2 pglw.gres= fun.gfit2(data=pglw, gmodel=gmodel,lag=2) pglw=pglw.gres$data ## generate the sumerization table pglw.n1<-aggregate(pglw$y, by=list(racesex=pglw$racesex,time=pglw$time),sum,na.rm=TRUE) pglw.n<-aggregate(pglw$y[!is.na(pglw$y)], by=list(racesex=pglw$racesex[!is.na(pglw$y)], time=pglw$time[!is.na(pglw$y)]),length) pglw.p<-pglw.n1$x/pglw.n$x ## do the ignorable model estimation yfix.form<-as.formula(y ~ slope1+slope2+slope3+slope4+interc1+interc2+interc3+interc4-1) ##yfix.form<-as.formula(y ~ slope1+slope2+slope3+slope4+interc1+interc2+interc3+interc4-1+educ1+educ2) yrdm.form1<-as.formula(y~1) fity.init<-glm(yfix.form,family="binomial",data=pglw, na.action=na.exclude) pary.init<- c(fity.init$coef,5) fun.glmmgrad<- function(par,data,fix.form,rdm.form,transform) fun.glmmall(par,data,fix.form, rdm.form, transform, case=2) pglw.ig<-nlminb(pary.init,fun.glmmall,gradient=fun.glmmgrad,data=pglw,fix.form=yfix.form, rdm.form=yrdm.form1,transform=FALSE,control=list(trace=TRUE,iter.max=200)) pglw.hess = fdHess(pglw.ig$par,fun.glmmall,data=pglw,fix.form=yfix.form,rdm.form=yrdm.form1,transform=FALSE)$Hessian ##pglw.hess = hessian(fun.glmmall,pglw.ig$par,data=pglw,fix.form=yfix.form,rdm.form=yrdm.form1,transform=FALSE) pglw.nabla11 = solve(pglw.hess) pglw.se = sqrt(diag(pglw.nabla11)) ## now do four dimensional ISNI calculation pglw.nabla12 =fun.glmmall(pglw.ig$par,data=pglw,fix.form=yfix.form,rdm.form=yrdm.form1,transform=FALSE, r1grp="racesex", case=3) pglw.isni= pglw.nabla11%*%pglw.nabla12 pglw.isni= abs(pglw.isni[,1])+abs(pglw.isni[,2])+ abs(pglw.isni[,3]) + abs(pglw.isni[,4]) pglw.c= pglw.se/pglw.isni pglwp1=pglw.ig$par+pglw.isni pglwm1= pglw.ig$par - pglw.isni ## now use genetic algorithm to find the bound for ISNI values ## The GA algorithm may take time to do because a large number of ISNI values needs to be evaluated to obtain the bounds ## The exact time depends on the population size with a large population size requires longer time to run. gen.dmean=fun.dermean4gen(pglw.gen,rep(1,1036), pglw.ig, fix.form=yfix.form, rdm.form=yrdm.form1, r1grp="racesex") ##(1) USe GA to find the minimum gvec= rep(2,1036) ismax=FALSE pop.size=1000 max.generation=300 slope=1 ## slope=1,2,3 or 4 indicating black male, black female, white male and white female respectively. pglw.ga<-genoud(fun.gen, nvars=length(gvec), max=ismax,starting.values=gvec, data.type.int=TRUE, Domains=cbind(rep(1, length(gvec)), rep(2,length(gvec))), boundary.enforcement=2, gendata=pglw.gen,pglw.nabla11=pglw.nabla11, para=pglw.ig$par,yfix.form=yfix.form, yrdm.form1=yrdm.form1, gen.dmean=gen.dmean$dmean, r1group=gen.dmean$r1group, slope=slope) pglw.ga ## now use the genetic algorithm to find the maximum. gvec= rep(1,1036) ismax=TRUE pop.size=1000 max.generation=300 slope=1 ## slope=1,2,3 or 4 indicating black male, black female, white male and white female respectively. pglw.ga<-genoud(fun.gen, nvars=length(gvec), max=ismax,starting.values=gvec, data.type.int=TRUE, Domains=cbind(rep(1, length(gvec)), rep(2,length(gvec))), boundary.enforcement=2, gendata=pglw.gen,pglw.nabla11=pglw.nabla11, para=pglw.ig$par,yfix.form=yfix.form, yrdm.form1=yrdm.form1, gen.dmean=gen.dmean$dmean, r1group=gen.dmean$r1group, slope=slope) pglw.ga