/***********************************************************/ /* INTRODUCTION TO STATA - toys using brain-IQ data */ /***********************************************************/ /*input data using `point and click'*/ /*clear the memory*/ clear /*same thing using code*/ insheet using "/scratch/OhioFamilyHealthSurvey/brainIQ.dat" /*look at the data using the data editor*/ /*look at the data using `list'*/ list set more off /*tells STATA to not wait for scrolling*/ /*variable summaries*/ summarize /*summarize all the variables*/ summarize gender fsiq mri_count /*summarize just some of the variables*/ help summarize summarize fsiq, detail /*use options to show more detail*/ su f, d /*Stata allows extreme abbreviation*/ /*find means*/ mean fsiq /*find proportions*/ prop gender /*make graphs*/ scatter f m scatter f m, by(gender) graph box f, over(g) /*make new variables as functions of old variables*/ gen weightkg = weight/2.2 gen heightm = height*0.0254 gen BMI = weightkg/heightm^2 summarize w h BMI summarize w* h* BMI /*drop extra variables*/ drop weightkg heightm /*make new categorical variables*/ gen IQcat = 0 replace IQcat = 1 if fsiq>116.5 summarize IQcat, d graph box fsiq, over(IQcat) /*Using results of functions*/ summarize f, d return list display r(p50) gen IQcat2 = 0 replace IQcat2 = 1 if fsiq > r(p50) tab IQcat IQcat2 clear /***********************************************************/ /* STATA ANALYSIS of OFHS */ /***********************************************************/ set memory 1g /*increase available memory, since the data is huge*/ use "ofhs_virgin_file.dta" /*read in the data*/ /*recode some missing data*/ replace s10 = . if s10>9 /*maximum recorded adults = 9 */ replace s12 = . if s12>12 /*maximum number of children = 12 */ /*create a new variable - yes/no for children in hh*/ gen kids = 0 replace kids = 1 if s12>0 /*************************************************************/ /* Estimation with SRS */ /*************************************************************/ /*****************************************/ /*create a pseudo-SRS from Butler County*/ keep if stratum==9 /*keep only one county - this is the coded number for Butler*/ keep if s10!=. /*keep only observations with no missing values - DON'T DO THIS IN PRACTICE!*/ keep if s12!=. gen N=123018 gen n=1082 save "/scratch/OhioFamilyHealthSurvey/ButlerCounty.dta", replace /*save the reduced data set*/ clear set memory 8000 /*don't need as much memory for this reduced data set*/ use "/scratch/OhioFamilyHealthSurvey/ButlerCounty.dta" /*read in the Butler County data set*/ /**************************************************/ /*Estimate the Mean Number of Adults per Household*/ /*s10 is the variable that stores the number of Adults*/ tab s10, missing /*look at the data, the "missing" option shows any missing values*/ /*****************/ /*First by "hand"*/ mean s10 /*calculate the mean*/ ereturn list /*list the variables that the estimator produces*/ matrix list e(V) /*report the variance*/ matrix define varmat = e(V) /*save the variance matrix with name `varmat'*/ matrix list varmat scalar define myvar = varmat[1,1] /*extract the variance into a scalar*/ display myvar scalar define myvar = myvar*(1-e(N)/123082) /*adjust via the fpc*/ display myvar scalar define myse = sqrt(myvar) /*find the standard error*/ display myse matrix define meanmat = e(b) /*extract the mean*/ matrix list meanmat scalar define mymean = meanmat[1,1] /*save the mean as a scalar*/ display mymean /*Calculate CI*/ /* Stata uses a t statistic with n-1 df*/ scalar define tval = abs(invttail(e(N)-1, 0.025)) /*find the critical value*/ display tval scalar define lower = mymean - tval*myse /*calculate the lower CI*/ scalar define upper = mymean + tval*myse /*calculate the upper CI*/ scalar list mymean myse lower upper /***************************/ /*Next by built-in command */ capture gen N=123082 /*make a variable to hold the population size*/ svyset masterid, fpc(N) /*make survey definition straight up with knowledge of N*/ svy: mean s10 /*survey calculate the mean*/ scalar list mymean myse lower upper /*compare with hand calculations*/ /**************************************************/ /*Estimate the proportion of households with kids */ svy: proportion kids svy: mean kids /***************************************************/ /*Estimate the total number of households with kids*/ svy: total kids /*doesn't work*/ /****************************************/ /* now do estimation with exact weights */ svyset, clear /*clear the old definitions*/ gen wt = 123082/1082 /*define the weight equal to the inverse probability*/ svyset masterid [pweight=wt], fpc(N) /*make survey definition*/ /*mean*/ svy: mean s10 scalar list mymean myse lower upper /*compare with hand calculations*/ /*total*/ svy: total kids /*********************************/ /* weights that are proportional */ svyset, clear /*clear the old definitions*/ gen wtprop = 1/1082 svyset masterid [pweight=wtprop], fpc(N) /*mean*/ svy: mean s10 scalar list mymean myse lower upper /*compare with hand calculations*/ /*total*/ svy: total kids /*doesn't work*/ /*********************************/ /* Ratios */ capture gen wt = 123082/1082 /*define the wt if it hasn't already*/ capture gen N = 123082 svyset masterid [pweight=wt], fpc(N) /*Ratio of children to total*/ gen tothhmembers = s10 + s12 gen prophhkids = s12 / tothhmembers /*Unit level*/ svy: mean prophhkids /*population level*/ svy: ratio s12 / tothhmembers /*Ratio for unknown pop size*/ svyset, clear capture gen wtprop = 1/1082 svyset masterid [pweight=wtprop] /*assume FPC is approx 1*/ svy: ratio kidsratio: s12 / tothhmembers /*estimate ratio, store in kidsratio*/ lincom _b[kidsratio] * 332807 /*multiply estimate by total Population in Butler County*/ /*Ratio for reduced variance*/ svy: ratio kidsratio: s12 / tothhmembers /*estimate ratio, store in kidsratio*/ lincom _b[kidsratio] * 2.61 /*ratio estimate, 2.61 is the average #members/household in Butler County*/ matrix ybarr = r(estimate) /*lincom stores stuff in r() */ matrix Vybarr = r(se)^2 svy: mean s12 /*usual estimate*/ matrix ybar = e(b) matrix Vybar = e(V) /*display results*/ scalar define tval = abs(invttail(e(N)-1, 0.025)) /*t critical value*/ /*srs - consolidate results*/ matrix srs = (ybar, Vybar, ybar - tval*cholesky(Vybar), ybar + tval*cholesky(Vybar)) matrix colnames srs = estimate variance lowerCI upperCI matrix rownames srs = srs /*ratio - consolidate results*/ matrix ratio = (ybarr, Vybarr, ybarr - tval*cholesky(Vybarr), ybarr + tval*cholesky(Vybarr)) matrix colnames ratio = estimate variance lowerCI upperCI matrix rownames ratio = ratio matrix comparemeans = srs\ratio /*combine the srs and ratio into one matrix*/ matrix list comparemeans correlate s12 tothhmembers scatter s12 tothhmembers, jitter(10) title(The correlation is `r(rho)') bysort tothhmembers: summarize s12 /*********************************/ /* Domain */ svyset, clear capture gen wtprop = 1 svyset masterid [pweight=wtprop] /*assume FPC is approx 1*/ tab kids svy: mean s10, subpop(kids) /*only households with kids*/ svy: mean s10, over(kids) /*households with and without kids, separately*/ /*!!!THE WRONG WAY!!!*/ drop if kids==0 svy: mean s10 /*Get the right data back*/ clear use "/scratch/OhioFamilyHealthSurvey/ButlerCounty.dta" capture gen wtprop = 1/1082 svyset masterid [pweight=wtprop] /*assume FPC is approx 1*/ /*difference between means in two domains*/ svy: mean s10, over(kids) lincom [s10]1 - [s10]0 /*********************************/ /* Regression */ svyset, clear capture gen wtprop = 1/1082 capture gen tothhmembers = s10 + s12 capture gen prophhkids = s12 / tothhmembers svyset masterid [pweight=wtprop] /*assume FPC is approx 1*/ svy: regress s12 tothhmembers lincom _cons + tothhmembers * 2.61 /*store the results*/ matrix ybarreg = r(estimate) /*lincom stores stuff in r() */ matrix Vybarreg = r(se)^2 /*display the results in a matrix*/ scalar define tval = abs(invttail(e(N)-1, 0.025)) matrix reg = (ybarreg, Vybarreg, ybarreg - tval*cholesky(Vybarreg), ybarreg + tval*cholesky(Vybarreg)) matrix colnames reg = estimate variance lowerCI upperCI matrix rownames reg = regression matrix comparemeans = srs\ratio\reg matrix list comparemeans graph twoway scatter s12 tothhmembers, jitter(10) || lfit s12 tothhmembers /**DIFFERENT EXAMPLE -- income vs. hours worked**/ capture gen myincome = h85y capture replace myincome = . if myincome>=999997 /*get rid of missing data*/ capture replace myincome = h85m * 12 if myincome = . scatter myincome g73 /* income vs. hours worked */ /**DIFFERENT EXAMPLE -- premiums vs. income**/ capture gen mypremiums = b8 capture replace mypremiums = b8 * 52 if b8b==1 capture replace mypremiums = b8 * 52 / 2 if b8b==2 capture replace mypremiums = b8 * 12 if b8b==3 capture replace mypremiums = b8 * 12 *2 if b8b==4 capture replace mypremiums = b8 * 12 / 2 if b8b==5 capture replace mypremiums = b8 * 4 if b8b==6 capture replace mypremiums = b8 * 2 if b8b==7 capture replace mypremiums = . if mypremiums >= 99998 capture replace mypremiums = . if b8b >= 97 scatter mypremiums myincome if myincome!=. & mypremiums!=. clear /*************************************************************/ /* Estimation with STRATA */ /*************************************************************/ /******************************************/ /*create a pseudo-SRS from Mahoning County*/ set memory 1g /*increase available memory, since the data is huge*/ use "ofhs_virgin_file.dta" /*read in the data*/ /*recode some missing data*/ replace s10 = . if s10>9 /*maximum recorded adults = 9 */ replace s12 = . if s12>12 /*maximum number of children = 12 */ /*create a new variable - yes/no for children in hh*/ gen kids = 0 replace kids = 1 if s12>0 keep if stratum==50 /*keep only one county - this is the coded number for Mahoning*/ keep if s10!=. /*keep only observations with no missing values - DON'T DO THIS IN PRACTICE!*/ keep if s12!=. gen N=102587 gen n=1023 save "/scratch/OhioFamilyHealthSurvey/MahoningCounty.dta", replace /*save the reduced data set*/ clear /*******************************************/ /* Append the two county data sets */ use "MahoningCounty.dta" append using "ButlerCounty.dta" set more off tab stratum /******************************************/ /* Calculate a Mean by hand */ su s10 if stratum==9 /*calculate the mean and standard error*/ scalar define N9 = 123082 scalar define n9 = r(N) /* sample size*/ scalar define mean9 = r(mean) /*mean */ scalar define stdev9 = r(sd) /* variance */ su s10 if stratum==50 /*calculate the mean and standard error*/ scalar define N50 = 102587 scalar define n50 = r(N) /* sample size*/ scalar define mean50 = r(mean) /*mean */ scalar define stdev50 = r(sd) /* variance */ scalar define Ntot=N9+N50 /*stratified mean*/ scalar define meanstr = N9/(Ntot)*mean9 + N50/(Ntot)*mean50 display meanstr /*stratified variance*/ scalar define varstr = N9^2/Ntot^2*(1-n9/N9)*stdev9^2/n9 + N50^2/Ntot^2*(1-n50/N50)*stdev50^2/n50 display sqrt(varstr) scalar define varstrnofpc = (N9^2)/(Ntot^2)*stdev9^2/n9 + (N50^2)/(Ntot^2)*stdev50^2/n50 display sqrt(varstrnofpc) /*******************************************/ /* Calculate a Mean using weights */ /*make a weight variable*/ gen mywt = N/n svyset, clear svyset masterid [pweight=mywt], strata(stratum) fpc(N) svy: mean s10 /* compare to the ``by hand" results*/ display meanstr display sqrt(varstr) /*******No FPC********/ svyset, clear svyset masterid [pweight=mywt], strata(stratum) svy: mean s10 /* compare to the ``by hand" results*/ display meanstr display sqrt(varstrnofpc) /*******************************************/ /* Now include all counties as strata */ clear set memory 1g /*increase available memory, since the data is huge*/ use "ofhs_virgin_file.dta" /*read in the data*/ /*recode some missing data*/ replace s10 = . if s10>9 /*maximum recorded adults = 9 */ replace s12 = . if s12>12 /*maximum number of children = 12 */ /*create a new variable - yes/no for children in hh*/ gen kids = 0 replace kids = 1 if s12>0 /*create a new variable - total number of people in a household */ capture gen tothhmembers = s10 + s12 keep if stratum<=88 /*keep only the counties (not the oversample)*/ keep if s10!=. /*keep only observations with no missing values - DON'T DO THIS IN PRACTICE!*/ keep if s12!=. run StrWts.do /*this file assigns naive weights to units in each stratum to a variable called mystrwt*/ svyset masterid [pweight=mystrwt], strata(stratum) /*DESCRIBE THE SURVEY DESIGN*/ set more off svydes svy: mean s10 s12 svy: ratio s12 / tothhmembers /*************************************************************/ /* Estimation with CLUSTERS */ /*************************************************************/ set more off clear set memory 1g fdause "NIS05.xpt", novallabels sort seqnumhh /*Look only at Franklin County == approximately an SRS of phone numbers */ keep if estiap == 43 /*Look at polio vaccines*/ tab full_pol, missing /*this variable containes yes/no polio series*/ drop if full_pol > 2 /*cheat by dropping the missing data*/ replace full_pol = 0 if full_pol == 2 /*recode the nos to `0' so that the proportions work out*/ /*Look at the households */ tab seqnumhh /* there are 4 households with 2 children, rest have only one */ return list /* 196-192 confirms this */ duplicates report seqnumhh /*yet another way to get this*/ /*identify houshohld with more than one child*/ duplicates tag seqnumhh, generate(numkids) tab numkids /*look at the generated variable*/ replace numkids = numkids + 1 /*this is the number of kids in the child's household*/ /*look at the polio of the kids in multi-kid households*/ list seqnumhh full_pol if numkids==2 /* record the number of households in the sample*/ scalar nn = 192 /*we use nn because `n' is ambiguous*/ /*Cheat and use the weights to find the number of households*/ total rddwt if numkids==1 matrix define totonekid = e(b) total rddwt if numkids==2 matrix define tottwokids = e(b) scalar N_est = round(totonekid[1,1] + tottwokids[1,1]/2) /*This estimates the number of households*/ gen N = N_est gen KK = N_est * 1.05 /*generate the number of children per household (in the sample)*/ generate M = numkids /*********************************************************/ /* to do estimation, it's easier to collapse the data set*/ /* into totals with one row per household */ by seqnumhh: egen tot_pol = total(full_pol) /* sum the polio for the household */ list seqnumhh full_pol tot_pol if numkids==2 /*save the current state first!*/ save "/scratch/OhioFamilyHealthSurvey/NIS05.dta", replace /***now collapse***/ duplicates drop seqnumhh, force /******unbiased estimate******/ duplicates drop seqnumhh, force display N_est display nn summarize tot_pol scalar t_bar = r(mean) scalar t_hat_unb = N_est * t_bar scalar st_sq = r(Var) scalar t_unb_var = N_est^2*(1-nn/N)*st_sq/nn display t_hat_unb display sqrt(t_unb_var) scalar ybar_unb = t_hat_unb / KK scalar ybar_unb_var = t_unb_var / (KK^2) display ybar_unb display sqrt(ybar_unb_var) /*******ratio estimate*******/ total tot_pol M matrix totests = e(b) scalar ybar_r = totests[1,1]/totests[1,2] display ybar_r summarize M scalar mbar_U = r(mean) gen ybar_i = tot_pol/M gen numerator = M^2 * (ybar_i - ybar_r)^2 summarize numerator scalar nummean=r(mean) scalar ybar_r_var = (1- nn/N) * 1 / (mbar_U^2) / nn * nummean * nn / (nn-1) display ybar_r display sqrt(ybar_r_var) /******Use built-in Functions*****/ clear use "/scratch/OhioFamilyHealthSurvey/NIS05.dta" svyset seqnumhh, fpc(N) svy: mean full_pol /******Use Weights ******/ gen clustwt = N_est / nn svyset seqnumhh [pweight = clustwt], fpc(N) svy: total full_pol svy: mean full_pol display ybar_r display sqrt(ybar_r_var) clear /************************************************************/ /** Back to the OFHS **/ /************************************************************/ set memory 8000 /*don't need as much memory for this reduced data set*/ use "/scratch/OhioFamilyHealthSurvey/ButlerCounty.dta" /*read in the Butler County data set*/ capture gen N=123082 /*make a variable to hold the population size*/ capture gen n=1082 /*make a variable to hold the sample size*/ /* Weight the data using the simplistic 2-stage sample */ gen p1 = n/N /*probability of household being selected*/ gen p2 = 1/s10 /*probability of adult being selected */ gen totprob = p1 * p2 gen twostagewt = 1/totprob /* Define the design [two stage cluster]*/ svyset masterid [pweight = twostagewt], fpc(N) svy: prop a1 /* a1 is "do you have insurance" */ clear /*********************************************************/ /** Exploratory Data Analysis **/ /*********************************************************/ clear set memory 1g /*increase available memory, since the data is huge*/ use "ofhs_virgin_file.dta" /*read in the data*/ /*recode some missing data*/ replace s10 = . if s10>9 /*maximum recorded adults in HH = 9 */ replace s11 = . if s11>9 /*maximum recorded adults in family */ replace s12 = . if s12>12 /*maximum number of children = 12 */ /*consider number of people in the family: h84 */ replace h84 = . if h84 == 99 /*consider yearly family income: h85y*/ replace h85y = . if h85y > 999997 gen sqrtincome = sqrt(h85y) /*transform to the square root scale*/ /*consider number of hours worked: g73*/ replace g73 = . if g73 > 85 gen hoursworked = g73 /*consider health care premiums: b8 */ replace b8 = . if b8 > 99997 gen sqrtpremiums = sqrt(b8) /*transform to the square root scale*/ /*b8b == 3 means the premium is paid monthly */ /*consider the race of the adult*/ replace race_a = . if race_a>4 /*make pretty labels*/ label define race 1 "White" 2 "Black" 3 "Asian" 4 "Other" 98 "Dont Know" label values race_a race /*consider the ethnicity of the adult*/ replace raceth_a = . if raceth_a >4 /*make pretty labels*/ label define eth 1 "Hispanic" 2 "Black/NH" 3 "Asian" 4 "White/NH" 98 "Dont Know" label values raceth_a eth /*consider the age of the respondent*/ replace s14 = . if s14 >997 gen age = s14 /*generate variables for counting purposes*/ gen const=1 sum wtfinal scalar N_est = r(sum) gen wtSRS = r(sum)/r(N) /*bar plots*/ graph bar (sum) wtSRS, over(raceth_a) title("Equal Weights") blabel(bar) ytitle("Total") graph export "barracethSRS.ps", replace graph bar (sum) wtfinal, over(raceth_a) title("Probability Weights") blabel(bar) ytitle("Total") graph export "barracethWT.ps", replace /*box plots*/ graph box sqrtincome if h84==1, /// ytitle("Income") title("Income for Single-Adult Households") subtitle("Equal Weights") /// ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000" `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" ) graph export "boxincomeSRS.ps", replace graph box sqrtincome if h84==1 [pweight = wtfinal], /// ytitle("Income") title("Income for Single-Adult Households") subtitle("Probability Weights") /// ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000" `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" ) graph export "boxincomeWT.ps", replace /*box plots -- "zoomed" subset */ graph box sqrtincome if h84==1 & sqrtincome<500, /// ytitle("Income") title("Income for Single-Adult Households < 250,000") subtitle("Equal Weights") /// ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000") graph export "boxincomeSRS_trunc.ps", replace graph box sqrtincome if h84==1 & sqrtincome<500 [pweight = wtfinal], /// ytitle("Income") title("Income for Single-Adult Households < 250,000") subtitle("Probability Weights") /// ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000") graph export "boxincomeWT_trunc.ps", replace graph box sqrtincome if h84==1 [pweight = wtfinal], /// over(cluster, label(alternate)) /// ytitle("Income") title("Income for Single-Adult Households") subtitle("Probability Weights, Over County Groups") /// ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000" `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" ) graph export "boxincomeStrata.ps", replace /*scatter plots*/ scatter sqrtpremiums sqrtincome if h84==1, /// xlabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000" `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" ) /// ylabel(`=sqrt(0)' "0" `=sqrt(2500)' "2500" `=sqrt(10000)' "10,000" `=sqrt(22500)' "22,500") /// xtitle("Income") ytitle("Premiums") scatter sqrtpremiums sqrtincome if h84==1 [pweight = wtfinal], /// mfcolor(none) xlabel(`=sqrt(0)' "0" `=sqrt(20000)' "20000" `=sqrt(100000)' "100000" `=sqrt(300000)' "300000" ) || lfit sqrtpremiums sqrtincome if h84==1 [pweight = wtfinal] scatter sqrtincome hoursworked if s11==1, /// xtitle("# Hours Worked") ytitle("Income") title("Work vs. Income in Single Adult Families") /// ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000" `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" ) graph export "scatterincomehoursSRS.ps", replace scatter sqrtincome hoursworked if h84==1 [pweight = wtfinal], /// mfcolor(none) xtitle("# Hours Worked") ytitle("Income") title("Work vs. Income in Single Adult Families") /// ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000" `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" ) /// || lfit sqrtincome hoursworked [pweight=wtfinal], lwidth(vthick) graph export "scatterincomehoursWT.ps", replace /*scatter plots of ordinal variables*/ scatter s12 s10 if region2 == "A" graph export "scatterordinal1.ps", replace scatter s12 s10 if region2 == "A" [pweight = wtfinal], mfcolor(none) graph export "scatterordinal2.ps", replace scatter s12 s10 if region2 == "A" [pweight = wtfinal], mfcolor(none) jitter(5) graph export "scatterordinal3.ps", replace capture bysort s12 s10 region2: egen tot_wt = total(wtfinal) /* sum the weights by the categories */ scatter s12 s10 if region2 == "A" [pweight = tot_wt], mfcol(none) graph export "scatterordinal4.ps", replace /*********************************************************/ /** Linear Regression **/ /*********************************************************/ svyset masterid [pweight=wtfinal], strata(stratum) svy: regress sqrtincome hoursworked /*NOT THE SAME AS : no design information*/ regress sqrtincome hoursworked [pweight=wtfinal] /*NOT THE SAME AS : weighted least squares*/ regress sqrtincome hoursworked [aweight=wtfinal] /*********************************************************/ /** Logistic Regression **/ /*********************************************************/ gen uninsured = 0 replace uninsured = 1 if a1 == 2 replace uninsured = . if a1 > 2 replace uninsured = . if a1 == . svy: logistic uninsured hoursworked /********************************************************/ /** Small Area Estimation **/ /********************************************************/ clear set memory 1g /*increase available memory, since the data is huge*/ use "ofhs_virgin_file.dta" /*read in the data*/ /*Example: consider Minorities in Champaign County */ /*race_aim is the self-reported race*/ gen minority = 1 replace minority = 0 if race_aim == 1 replace minority = . if race_aim == . /*Make a new variable to identify this "small area"*/ /*s9 is self reported county != stratum, 21=Champaign Co.*/ gen smallarea = 0 replace smallarea = 1 if minority == 1 & s9 == 21 /*recode the insurance status*/ gen insured = a1 replace insured = 0 if insured == 2 replace insured = . if insured >= 98 tab insured gen uninsured = 1 - insured /*create a counter variable*/ gen const = 1 svyset masterid [pweight=wtfinal], strata(stratum) /******************/ /*Direct Estimator*/ svy: ratio insuredratiosmallarea: uninsured/const, subpop(smallarea) lincom _b[insuredratiosmallarea] * 1062 /*multiply estimate by the Minorioty Population in Champaign County*/ scalar t_hat_dir = r(estimate) scalar t_dir_var = r(se)^2 /******************/ /*Synthetic Estimator*/ /*assume proportion of insured is constant across the state, and uniform over minority subpop*/ svy: ratio insuredratio: uninsured/const lincom _b[insuredratio] * 1062 /*multiply estimate by the Minorioty Population in Champaign County*/ scalar t_hat_syn = r(estimate) scalar t_syn_var = r(se)^2 /*assume proportion of insured is constant in the minority population across the state*/ svy: ratio insuredratiominority: uninsured/const, subpop(minority) estimates store syntheticest lincom _b[insuredratiominority] * 1062 /*multiply estimate by the Minorioty Population in Champaign County*/ scalar t_hat_syn2 = r(estimate) scalar t_syn2_var = r(se)^2 /***********************/ /* Composite Estimator */ scalar alpha = 0.5 scalar t_hat_comp = alpha * t_hat_dir + (1-alpha) * t_hat_syn2 /***********************/ /* Compare Estimates */ display t_hat_dir display t_hat_syn2 display t_hat_comp /********************************************************/ /** Variance Estimation Using NHANES 99 **/ /********************************************************/ set more off clear set memory 1g /*increase available memory, since the data is huge*/ /**READ IN DEMOGRAPHIC AND SURVEY DESIGN INFO**/ fdause "/scratch/NHANES99/demo.xpt" sort seqn /*sort by unique ID*/ save "/scratch/NHANES99/demo", replace clear /**READ IN HEALTH INSURANCE QUESTIONS**/ fdause "/scratch/NHANES99/hiq.xpt" sort seqn /*sort by unique ID*/ save "/scratch/NHANES99/hiq", replace /*MERGE THE TWO DATA SETS TOGETHER BASED ON THE UNIQUE ID*/ merge seqn using "/scratch/NHANES99/demo" /*READ THE DOCUMENTATION FOR THIS SURVEY - IT'S COMPLICATED!*/ /*Question = Proportion of Americans Covered by Health Insurance*/ /*HID010 : Health Insurance? 1=y, 2=n, 7=refuse, 9=DK*/ gen insuranceMAR = hid010 replace insuranceMAR = . if hid010==7 | hid010==9 replace insuranceMAR = 0 if hid010==2 /*wtint2yr = The appropriate weights for this data set, and this question */ /*sdmvpsu = The `pseudo' PSUs (clusters) provided for this data */ /*sdmvstra = The `pseudo' Strata provided for this data */ /**************************************************************/ /* Design Effects - no design info, but published DEFF values */ /*HERE I AM PRETENDING TO BE THE AGENCY WITH THE DESIGN INFORMATION*/ /*THIS CODE CALCULATES THE DEFF*/ svyset, clear svyset sdmvpsu [pweight=wtint2yr], strata(sdmvstra) svy: mean insuranceMAR estat effects, deff /*PUBLISH THE DESIGN EFFECT = 11.78*/ /*NOW FORGET THE DESIGN INFORMATION*/ svyset, clear /*NOW WE GET THE SURVEY WITHOUT THE DESIGN INFORMATION*/ /*we still need the weights!*/ svyset, clear svyset [pweight=wtint2yr] /******estimate using assumption of SRS*****/ svy: mean insuranceMAR /*Find the appropriate t critical value*/ scalar define tval = abs(invttail(e(df_r), 0.025)) /*find the critical value*/ /*unadjusted CIs*/ matrix define lower = e(b) - tval * cholesky(e(V)) /*cholesky finds the square root*/ matrix define upper = e(b) + tval * cholesky(e(V)) matrix define se = cholesky(e(V)) /*adjusted CIs*/ matrix define lowerdeff = e(b) - tval * cholesky(e(V)*11.78) matrix define upperdeff = e(b) + tval * cholesky(e(V)*11.78) matrix define sedeff = cholesky(e(V)*11.78) matrix define info = (se, lower, upper) \ (sedeff, lowerdeff, upperdeff) matrix colnames info = Standard_Error LowerCI UpperCI matrix rownames info = SRS Deff_adjusted matrix list info /*********************************************/ /* Linearization - known design information */ svyset, clear svyset sdmvpsu [pweight=wtint2yr], strata(sdmvstra) svydes svy: mean insuranceMAR /*save the CIs, and compare to before*/ scalar define tval = abs(invttail(e(df_r), 0.025)) /*find the critical value*/ matrix define cislinear = cholesky(e(V)), e(b) - tval * cholesky(e(V)), e(b) + tval * cholesky(e(V)) matrix define info = (se, lower, upper) \ (sedeff, lowerdeff, upperdeff) \ cislinear matrix colnames info = Standard_Error LowerCI UpperCI matrix rownames info = SRS Deff_adjusted Linearization matrix list info /*********************************************/ /* Jackknife - have resampling weights */ svyset, clear svyset sdmvpsu [pweight=wtint2yr], strata(sdmvstra) vce(jackknife) jkrweight(wtirep01-wtirep52) svyset sdmvpsu [pweight=wtint2yr], strata(sdmvstra) vce(jackknife) jkrweight(wtirep*) /*same thing*/ svy: mean insuranceMAR /*save the CIs, and compare to before*/ scalar define tval = abs(invttail(e(df_r), 0.025)) /*find the critical value*/ matrix define cisjack = cholesky(e(V)), e(b) - tval * cholesky(e(V)), e(b) + tval * cholesky(e(V)) matrix define info = (se, lower, upper) \ (sedeff, lowerdeff, upperdeff) \ cislinear \ cisjack matrix colnames info = Standard_Error LowerCI UpperCI matrix rownames info = SRS Deff_adjusted Linearization Jackknife matrix list info /****Jackknife "guts"****/ /*estimate using the first weights*/ svyset, clear svyset sdmvpsu [pweight=wtirep01], strata(sdmvstra) svy: mean insuranceMAR matrix define j_ests = e(b) /*estimate using the other weights, store values in j_ests -- remove the `quietly's to see what's going on*/ foreach var of varlist wtirep02-wtirep52 { quietly: svyset sdmvpsu [pweight=`var'], strata(sdmvstra) quietly: svy: mean insuranceMAR quietly: matrix define j_ests = j_ests \ e(b) } matrix list j_ests svmat j_ests /*store the matrix as a variable*/ mean j_ests /*estimate the mean*/ sum j_ests /*summarize the replicates*/ /*save the CIs and compare*/ scalar define tval = abs(invttail(e(df_r), 0.025)) /*find the critical value*/ /*jackknife estimates of variance are n * sample variance over all the replicates*/ matrix define cisjackbyhand = sqrt(r(Var)*r(N)), r(mean) - tval * sqrt(r(Var)*r(N)), r(mean) + tval * sqrt(r(Var)*e(N)) matrix define info = (se, lower, upper) \ (sedeff, lowerdeff, upperdeff) \ cislinear \ cisjack \ cisjackbyhand matrix colnames info = Standard_Error LowerCI UpperCI matrix rownames info = SRS Deff_adjusted Linearization Jackknife Jackknife_by_hand matrix list info /********************************************************/ /** Poststratification using NHANES 99 **/ /********************************************************/ /*Suppose we'd like to post-stratify by gender: Male (1) and Female (0) */ generate male = riagendr replace male = 0 if male ==2 /*Total Census population size = 281,400,000*/ total wtint2yr /*too small*/ /*adjust for total population*/ sum wtint2yr generate adjwt = wtint2yr * 281400000 / r(sum) /*check*/ total adjwt /*Population Values are Male=136,100,000 Female = 145,300,000 */ /*first see if there's a problem*/ total adjwt, over(male) /*too little wt for females, too much for males*/ matrix totals = e(b) scalar totalfem = totals[1,1] scalar totalmale = totals[1,2] display totalfem display totalmale /*****TWO WAYS TO POSTSTRATIFY******/ /*********/ /*By Hand*/ gen genderadjwt = adjwt replace genderadjwt = genderadjwt * 136100000 / totalmale if male == 1 replace genderadjwt = genderadjwt * 145300000 / totalfem if male == 0 total genderadjwt, over(male) /*Just Right!*/ /*****************/ /*"Automatically"*/ /*create variables specifying the target*/ gen malepopsize = 145300000 replace malepopsize = 136100000 if male == 1 svyset [pweight = adjwt], poststrata(male) postweight(malepopsize) svy: total male /******************** RAKING **************************/ /*this is in the "svr" package that you must download and install into Stata*/ /*this is easy to do via the help menus, if you just search for svr*/ /*We also need a second variable by which to adjust*/ /*I'll use over/under 18 years old */ generate kid = 0 replace kid = 1 if ridageyr < 18 /*we again, need variables that store the "true" values */ generate kidpopsize = 70000000 replace kidpopsize = 281400000 - 70000000 if kid == 0 /*see if our unadjusted weights are off*/ total adjwt, over(kid) /*perform the raking*/ survwgt rake adjwt, by(male kid) totvar(malepopsize kidpopsize) generate(rakewt) /*check the results*/ total rakewt, over(male) total rakewt, over(kid)