/***********************************************************/
/*  INTRODUCTION TO STATA - toys using brain-IQ data       */
/***********************************************************/

/*input data using `point and click'*/

/*clear the memory*/
clear

/*same thing using code*/
insheet using "/scratch/OhioFamilyHealthSurvey/brainIQ.dat"


/*look at the data using the data editor*/
/*look at the data using `list'*/
list

set more off /*tells STATA to not wait for scrolling*/

/*variable summaries*/
summarize                         /*summarize all the variables*/
summarize gender fsiq mri_count   /*summarize just some of the variables*/

help summarize
summarize fsiq, detail            /*use options to show more detail*/
su f, d				  /*Stata allows extreme abbreviation*/

/*find means*/
mean fsiq

/*find proportions*/
prop gender

/*make graphs*/
scatter f m
scatter f m, by(gender)
graph box f, over(g)

/*make new variables as functions of old variables*/
gen weightkg = weight/2.2
gen heightm = height*0.0254
gen BMI = weightkg/heightm^2

summarize w h BMI
summarize w* h* BMI

/*drop extra variables*/
drop weightkg heightm

/*make new categorical variables*/
gen IQcat = 0
replace IQcat = 1 if fsiq>116.5
summarize IQcat, d
graph box fsiq, over(IQcat)

/*Using results of functions*/
summarize f, d
return list
display r(p50)

gen IQcat2 = 0
replace IQcat2 = 1 if fsiq > r(p50)

tab IQcat IQcat2

clear


/***********************************************************/
/*  STATA ANALYSIS of OFHS                                 */
/***********************************************************/

set memory 1g  /*increase available memory, since the data is huge*/
use "ofhs_virgin_file.dta"   /*read in the data*/

/*recode some missing data*/
replace s10 = . if s10>9  /*maximum recorded adults = 9 */
replace s12 = . if s12>12 /*maximum number of children = 12 */

/*create a new variable - yes/no for children in hh*/
gen kids = 0 
replace kids = 1 if s12>0


/*************************************************************/
/* Estimation with SRS                                       */
/*************************************************************/

/*****************************************/
/*create a pseudo-SRS from Butler County*/

keep if stratum==9  /*keep only one county - this is the coded number for Butler*/
keep if s10!=.      /*keep only observations with no missing values - DON'T DO THIS IN PRACTICE!*/
keep if s12!=.
gen N=123018
gen n=1082

save "/scratch/OhioFamilyHealthSurvey/ButlerCounty.dta", replace  /*save the reduced data set*/
clear

set memory 8000    /*don't need as much memory for this reduced data set*/
use "/scratch/OhioFamilyHealthSurvey/ButlerCounty.dta"  /*read in the Butler County data set*/

/**************************************************/
/*Estimate the Mean Number of Adults per Household*/
/*s10 is the variable that stores the number of Adults*/

tab s10, missing  /*look at the data, the "missing" option shows any missing values*/

/*****************/
/*First by "hand"*/

mean s10  /*calculate the mean*/
ereturn list  /*list the variables that the estimator produces*/

matrix list e(V)  /*report the variance*/
matrix define varmat = e(V)  /*save the variance matrix with name `varmat'*/
matrix list varmat
scalar define myvar = varmat[1,1]  /*extract the variance into a scalar*/
display myvar
scalar define myvar = myvar*(1-e(N)/123082)  /*adjust via the fpc*/
display myvar
scalar define myse = sqrt(myvar)  /*find the standard error*/
display myse

matrix define meanmat = e(b)  /*extract the mean*/
matrix list meanmat
scalar define mymean = meanmat[1,1]  /*save the mean as a scalar*/
display mymean

/*Calculate CI*/
/* Stata uses a t statistic with n-1 df*/
scalar define tval = abs(invttail(e(N)-1, 0.025))  /*find the critical value*/
display tval
scalar define lower = mymean - tval*myse   /*calculate the lower CI*/
scalar define upper = mymean + tval*myse   /*calculate the upper CI*/

scalar list mymean myse lower upper

/***************************/
/*Next by built-in command */

capture gen N=123082             /*make a variable to hold the population size*/
svyset masterid, fpc(N)  /*make survey definition straight up with knowledge of N*/
svy: mean s10            /*survey calculate the mean*/

scalar list mymean myse lower upper /*compare with hand calculations*/


/**************************************************/
/*Estimate the proportion of households with kids */

svy: proportion kids
svy: mean kids

/***************************************************/
/*Estimate the total number of households with kids*/

svy: total kids  /*doesn't work*/


/****************************************/
/* now do estimation with exact weights */

svyset, clear   /*clear the old definitions*/
gen wt = 123082/1082  /*define the weight equal to the inverse probability*/
svyset masterid [pweight=wt], fpc(N)  /*make survey definition*/

/*mean*/
svy: mean s10 
scalar list mymean myse lower upper /*compare with hand calculations*/

/*total*/
svy: total kids

/*********************************/
/* weights that are proportional */

svyset, clear  /*clear the old definitions*/
gen wtprop = 1/1082
svyset masterid [pweight=wtprop], fpc(N)

/*mean*/
svy: mean s10 
scalar list mymean myse lower upper /*compare with hand calculations*/

/*total*/
svy: total kids  /*doesn't work*/


/*********************************/
/* Ratios */

capture gen wt = 123082/1082  /*define the wt if it hasn't already*/
capture gen N = 123082
svyset masterid [pweight=wt], fpc(N)


/*Ratio of children to total*/
gen tothhmembers = s10 + s12
gen prophhkids = s12 / tothhmembers


/*Unit level*/
svy: mean prophhkids

/*population level*/
svy: ratio s12 / tothhmembers


/*Ratio for unknown pop size*/
svyset, clear
capture gen wtprop = 1/1082
svyset masterid [pweight=wtprop] /*assume FPC is approx 1*/

svy: ratio kidsratio: s12 / tothhmembers /*estimate ratio, store in kidsratio*/
lincom _b[kidsratio] * 332807  /*multiply estimate by total Population in Butler County*/


/*Ratio for reduced variance*/
svy: ratio kidsratio: s12 / tothhmembers /*estimate ratio, store in kidsratio*/
lincom _b[kidsratio] * 2.61     /*ratio estimate, 2.61 is the average #members/household in Butler County*/
matrix ybarr = r(estimate)	/*lincom stores stuff in r() */
matrix Vybarr = r(se)^2
svy: mean s12  			/*usual estimate*/
matrix ybar = e(b)
matrix Vybar = e(V)

/*display results*/
scalar define tval = abs(invttail(e(N)-1, 0.025))  /*t critical value*/
/*srs - consolidate results*/
matrix srs = (ybar, Vybar, ybar - tval*cholesky(Vybar), ybar + tval*cholesky(Vybar))
matrix colnames srs = estimate variance lowerCI upperCI
matrix rownames srs = srs
/*ratio - consolidate results*/
matrix ratio = (ybarr, Vybarr, ybarr - tval*cholesky(Vybarr), ybarr + tval*cholesky(Vybarr))
matrix colnames ratio = estimate variance lowerCI upperCI
matrix rownames ratio = ratio

matrix comparemeans = srs\ratio  /*combine the srs and ratio into one matrix*/

matrix list comparemeans


correlate s12 tothhmembers
scatter s12 tothhmembers, jitter(10) title(The correlation is `r(rho)')

bysort tothhmembers: summarize s12


/*********************************/
/* Domain */
svyset, clear
capture gen wtprop = 1
svyset masterid [pweight=wtprop] /*assume FPC is approx 1*/

tab kids

svy: mean s10, subpop(kids) /*only households with kids*/
svy: mean s10, over(kids)   /*households with and without kids, separately*/

/*!!!THE WRONG WAY!!!*/
drop if kids==0
svy: mean s10

/*Get the right data back*/
clear
use "/scratch/OhioFamilyHealthSurvey/ButlerCounty.dta"

capture gen wtprop = 1/1082
svyset masterid [pweight=wtprop] /*assume FPC is approx 1*/

/*difference between means in two domains*/
svy: mean s10, over(kids)
lincom [s10]1 - [s10]0


/*********************************/
/* Regression   */
svyset, clear
capture gen wtprop = 1/1082
capture gen tothhmembers = s10 + s12
capture gen prophhkids = s12 / tothhmembers
svyset masterid [pweight=wtprop] /*assume FPC is approx 1*/

svy: regress s12 tothhmembers
lincom _cons + tothhmembers * 2.61

/*store the results*/
matrix ybarreg = r(estimate)	/*lincom stores stuff in r() */
matrix Vybarreg = r(se)^2

/*display the results in a matrix*/
scalar define tval = abs(invttail(e(N)-1, 0.025))
matrix reg = (ybarreg, Vybarreg, ybarreg - tval*cholesky(Vybarreg), ybarreg + tval*cholesky(Vybarreg))
matrix colnames reg = estimate variance lowerCI upperCI
matrix rownames reg = regression

matrix comparemeans = srs\ratio\reg

matrix list comparemeans


graph twoway scatter s12 tothhmembers, jitter(10) || lfit s12 tothhmembers 


/**DIFFERENT EXAMPLE -- income vs. hours worked**/
capture gen myincome = h85y
capture replace myincome = . if myincome>=999997 /*get rid of missing data*/
capture replace myincome = h85m * 12 if myincome = .

scatter myincome g73  /* income vs. hours worked */

/**DIFFERENT EXAMPLE -- premiums vs. income**/
capture gen mypremiums = b8
capture replace mypremiums = b8 * 52 if b8b==1
capture replace mypremiums = b8 * 52 / 2 if b8b==2
capture replace mypremiums = b8 * 12 if b8b==3
capture replace mypremiums = b8 * 12 *2 if b8b==4
capture replace mypremiums = b8 * 12 / 2 if b8b==5
capture replace mypremiums = b8 * 4 if b8b==6
capture replace mypremiums = b8 * 2 if b8b==7
capture replace mypremiums = . if mypremiums >= 99998
capture replace mypremiums = . if b8b >= 97

scatter mypremiums myincome if myincome!=. & mypremiums!=.

clear


/*************************************************************/
/* Estimation with STRATA                                    */
/*************************************************************/


/******************************************/
/*create a pseudo-SRS from Mahoning County*/

set memory 1g  /*increase available memory, since the data is huge*/
use "ofhs_virgin_file.dta"   /*read in the data*/

/*recode some missing data*/
replace s10 = . if s10>9  /*maximum recorded adults = 9 */
replace s12 = . if s12>12 /*maximum number of children = 12 */

/*create a new variable - yes/no for children in hh*/
gen kids = 0 
replace kids = 1 if s12>0

keep if stratum==50  /*keep only one county - this is the coded number for Mahoning*/
keep if s10!=.      /*keep only observations with no missing values - DON'T DO THIS IN PRACTICE!*/
keep if s12!=.
gen N=102587
gen n=1023

save "/scratch/OhioFamilyHealthSurvey/MahoningCounty.dta", replace  /*save the reduced data set*/
clear

/*******************************************/
/* Append the two county data sets         */

use "MahoningCounty.dta"
append using "ButlerCounty.dta"

set more off
tab stratum


/******************************************/
/* Calculate a Mean by hand               */

su s10 if stratum==9           /*calculate the mean and standard error*/

scalar define N9 = 123082
scalar define n9 = r(N)  /* sample size*/
scalar define mean9 = r(mean)  /*mean */
scalar define stdev9 = r(sd)    /* variance */

su s10 if stratum==50           /*calculate the mean and standard error*/

scalar define N50 = 102587
scalar define n50 = r(N)  /* sample size*/
scalar define mean50 = r(mean)  /*mean */
scalar define stdev50 = r(sd)    /* variance */

scalar define Ntot=N9+N50

/*stratified mean*/
scalar define meanstr = N9/(Ntot)*mean9 + N50/(Ntot)*mean50
display meanstr

/*stratified variance*/
scalar define varstr = N9^2/Ntot^2*(1-n9/N9)*stdev9^2/n9 + N50^2/Ntot^2*(1-n50/N50)*stdev50^2/n50 
display sqrt(varstr)

scalar define varstrnofpc = (N9^2)/(Ntot^2)*stdev9^2/n9 + (N50^2)/(Ntot^2)*stdev50^2/n50 
display sqrt(varstrnofpc)


/*******************************************/
/* Calculate a Mean using weights          */


/*make a weight variable*/
gen mywt = N/n

svyset, clear
svyset masterid [pweight=mywt], strata(stratum) fpc(N)

svy: mean s10

/* compare to the ``by hand" results*/
display meanstr
display sqrt(varstr)


/*******No FPC********/
svyset, clear
svyset masterid [pweight=mywt], strata(stratum)

svy: mean s10

/* compare to the ``by hand" results*/
display meanstr
display sqrt(varstrnofpc)


/*******************************************/
/* Now include all counties as strata      */


clear

set memory 1g  /*increase available memory, since the data is huge*/
use "ofhs_virgin_file.dta"   /*read in the data*/

/*recode some missing data*/
replace s10 = . if s10>9  /*maximum recorded adults = 9 */
replace s12 = . if s12>12 /*maximum number of children = 12 */

/*create a new variable - yes/no for children in hh*/
gen kids = 0 
replace kids = 1 if s12>0

/*create a new variable - total number of people in a household */
capture gen tothhmembers = s10 + s12


keep if stratum<=88  /*keep only the counties (not the oversample)*/
keep if s10!=.       /*keep only observations with no missing values - DON'T DO THIS IN PRACTICE!*/
keep if s12!=.


run StrWts.do /*this file assigns naive weights to units in each stratum to a variable called mystrwt*/


svyset masterid [pweight=mystrwt], strata(stratum)

/*DESCRIBE THE SURVEY DESIGN*/
set more off
svydes

svy: mean s10 s12
svy: ratio s12 / tothhmembers


/*************************************************************/
/* Estimation with CLUSTERS                                  */
/*************************************************************/

set more off

clear

set memory 1g
fdause "NIS05.xpt", novallabels
sort seqnumhh

/*Look only at Franklin County == approximately an SRS of phone numbers */
keep if estiap == 43

/*Look at polio vaccines*/
tab full_pol, missing  /*this variable containes yes/no polio series*/

drop if full_pol > 2 /*cheat by dropping the missing data*/
replace full_pol = 0 if full_pol == 2 /*recode the nos to `0' 
                             so that the proportions work out*/
                             
/*Look at the households */

tab seqnumhh /* there are 4 households with 2 children, rest have only one */
return list  /* 196-192 confirms this */
duplicates report seqnumhh /*yet another way to get this*/

/*identify houshohld with more than one child*/
duplicates tag seqnumhh, generate(numkids)
tab numkids /*look at the generated variable*/
replace numkids = numkids + 1 /*this is the number of kids in the child's household*/

/*look at the polio of the kids in multi-kid households*/
list seqnumhh full_pol if numkids==2

/* record the number of households in the sample*/
scalar nn = 192 /*we use nn because `n' is ambiguous*/

/*Cheat and use the weights to find the number of households*/
total rddwt if numkids==1
matrix define totonekid = e(b)
total rddwt if numkids==2
matrix define tottwokids = e(b)
scalar N_est = round(totonekid[1,1] + tottwokids[1,1]/2)  
	/*This estimates the number of households*/
gen N = N_est
gen KK = N_est * 1.05

/*generate the number of children per household (in the sample)*/
generate M = numkids


/*********************************************************/
/* to do estimation, it's easier to collapse the data set*/ 
/* into totals with one row per household                */

by seqnumhh: egen tot_pol = total(full_pol) /* sum the polio for the household */
list seqnumhh full_pol tot_pol if numkids==2

/*save the current state first!*/
save "/scratch/OhioFamilyHealthSurvey/NIS05.dta", replace

/***now collapse***/
duplicates drop seqnumhh, force


/******unbiased estimate******/

duplicates drop seqnumhh, force

display N_est
display nn

summarize tot_pol
scalar t_bar = r(mean)
scalar t_hat_unb = N_est * t_bar
scalar st_sq = r(Var)
scalar t_unb_var = N_est^2*(1-nn/N)*st_sq/nn
display t_hat_unb
display sqrt(t_unb_var)

scalar ybar_unb = t_hat_unb / KK
scalar ybar_unb_var = t_unb_var / (KK^2)
display ybar_unb
display sqrt(ybar_unb_var)

/*******ratio estimate*******/

total tot_pol M
matrix totests = e(b)
scalar ybar_r = totests[1,1]/totests[1,2]
display ybar_r

summarize M
scalar mbar_U = r(mean)
gen ybar_i = tot_pol/M
gen numerator = M^2 * (ybar_i - ybar_r)^2
summarize numerator
scalar nummean=r(mean)
scalar ybar_r_var = (1- nn/N) * 1 / (mbar_U^2) / nn * nummean * nn / (nn-1)
display ybar_r
display sqrt(ybar_r_var)

/******Use built-in Functions*****/

clear
use "/scratch/OhioFamilyHealthSurvey/NIS05.dta"

svyset seqnumhh, fpc(N)
svy: mean full_pol


/******Use Weights ******/

gen clustwt = N_est / nn
svyset seqnumhh [pweight = clustwt], fpc(N)
svy: total full_pol

svy: mean full_pol
display ybar_r
display sqrt(ybar_r_var)


clear


/************************************************************/
/**  Back to the OFHS                                      **/
/************************************************************/

set memory 8000    /*don't need as much memory for this reduced data set*/
use "/scratch/OhioFamilyHealthSurvey/ButlerCounty.dta"  /*read in the Butler County data set*/

capture gen N=123082             /*make a variable to hold the population size*/
capture gen n=1082               /*make a variable to hold the sample size*/

/* Weight the data using the simplistic 2-stage sample */

gen p1 = n/N	/*probability of household being selected*/
gen p2 = 1/s10  /*probability of adult being selected */

gen totprob = p1 * p2
gen twostagewt = 1/totprob

/* Define the design [two stage cluster]*/

svyset masterid [pweight = twostagewt], fpc(N)

svy: prop a1  /* a1 is "do you have insurance" */


clear


/*********************************************************/
/** Exploratory Data Analysis                           **/
/*********************************************************/

clear

set memory 1g  /*increase available memory, since the data is huge*/
use "ofhs_virgin_file.dta"   /*read in the data*/

/*recode some missing data*/
replace s10 = . if s10>9  /*maximum recorded adults in HH = 9 */
replace s11 = . if s11>9  /*maximum recorded adults in family */
replace s12 = . if s12>12 /*maximum number of children = 12 */


/*consider number of people in the family: h84 */
replace h84 = . if h84 == 99

/*consider yearly family income:  h85y*/
replace h85y = . if h85y > 999997
gen sqrtincome = sqrt(h85y) /*transform to the square root scale*/

/*consider number of hours worked: g73*/
replace g73 = . if g73 > 85
gen hoursworked = g73

/*consider health care premiums: b8 */
replace b8 = . if b8 > 99997
gen sqrtpremiums = sqrt(b8)  /*transform to the square root scale*/

/*b8b == 3 means the premium is paid monthly */

/*consider the race of the adult*/
replace race_a = . if race_a>4
/*make pretty labels*/
label define race 1 "White" 2 "Black" 3 "Asian" 4 "Other" 98 "Dont Know"
label values race_a race

/*consider the ethnicity of the adult*/
replace raceth_a = . if raceth_a >4
/*make pretty labels*/
label define eth 1 "Hispanic" 2 "Black/NH" 3 "Asian" 4 "White/NH" 98 "Dont Know"
label values raceth_a eth


/*consider the age of the respondent*/
replace s14 = . if s14 >997
gen age = s14


/*generate variables for counting purposes*/
gen const=1
sum wtfinal
scalar N_est = r(sum)
gen wtSRS = r(sum)/r(N)


/*bar plots*/
graph bar (sum) wtSRS, over(raceth_a) title("Equal Weights") blabel(bar) ytitle("Total")
graph export "barracethSRS.ps", replace
graph bar (sum) wtfinal, over(raceth_a) title("Probability Weights") blabel(bar) ytitle("Total")
graph export "barracethWT.ps", replace


/*box plots*/
graph box sqrtincome if h84==1, ///
	ytitle("Income") title("Income for Single-Adult Households") subtitle("Equal Weights") ///
	ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000"  `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" )
graph export "boxincomeSRS.ps", replace
graph box sqrtincome if h84==1 [pweight = wtfinal], ///
	ytitle("Income") title("Income for Single-Adult Households") subtitle("Probability Weights") ///
	ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000"  `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" )
graph export "boxincomeWT.ps", replace

/*box plots -- "zoomed" subset */
graph box sqrtincome if h84==1 & sqrtincome<500, ///
	ytitle("Income") title("Income for Single-Adult Households < 250,000") subtitle("Equal Weights") ///
	ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000")
graph export "boxincomeSRS_trunc.ps", replace
graph box sqrtincome if h84==1  & sqrtincome<500 [pweight = wtfinal], ///
	ytitle("Income") title("Income for Single-Adult Households < 250,000") subtitle("Probability Weights") ///
	ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000")
graph export "boxincomeWT_trunc.ps", replace


graph box sqrtincome if h84==1 [pweight = wtfinal], ///
	over(cluster, label(alternate)) ///
	ytitle("Income") title("Income for Single-Adult Households") subtitle("Probability Weights, Over County Groups") ///
	ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000"  `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" )
graph export "boxincomeStrata.ps", replace


/*scatter plots*/
scatter sqrtpremiums sqrtincome if h84==1, ///
	xlabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000"  `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" ) ///
	ylabel(`=sqrt(0)' "0" `=sqrt(2500)' "2500" `=sqrt(10000)' "10,000" `=sqrt(22500)' "22,500") ///
	xtitle("Income") ytitle("Premiums")
	
scatter sqrtpremiums sqrtincome if h84==1 [pweight = wtfinal], ///
	mfcolor(none) xlabel(`=sqrt(0)' "0" `=sqrt(20000)' "20000" `=sqrt(100000)' "100000" `=sqrt(300000)' "300000" ) || lfit sqrtpremiums sqrtincome if h84==1 [pweight = wtfinal]


scatter sqrtincome hoursworked if s11==1, ///
	xtitle("# Hours Worked") ytitle("Income") title("Work vs. Income in Single Adult Families") ///
	ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000"  `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" )
graph export "scatterincomehoursSRS.ps", replace
scatter sqrtincome hoursworked if h84==1 [pweight = wtfinal], ///
	mfcolor(none) xtitle("# Hours Worked") ytitle("Income") title("Work vs. Income in Single Adult Families") ///
	ylabel(`=sqrt(0)' "0" `=sqrt(20000)' "20,000" `=sqrt(100000)' "100,000" `=sqrt(300000)' "300,000"  `=sqrt(600000)' "600,000" `=sqrt(1000000)' "1,000,000" ) ///
	|| lfit sqrtincome hoursworked [pweight=wtfinal], lwidth(vthick)
graph export "scatterincomehoursWT.ps", replace

/*scatter plots of ordinal variables*/
scatter s12 s10 if region2 == "A"
graph export "scatterordinal1.ps", replace
scatter s12 s10 if region2 == "A" [pweight = wtfinal], mfcolor(none)
graph export "scatterordinal2.ps", replace
scatter s12 s10 if region2 == "A" [pweight = wtfinal], mfcolor(none) jitter(5)
graph export "scatterordinal3.ps", replace

capture bysort s12 s10 region2: egen tot_wt = total(wtfinal) /* sum the weights by the categories */
scatter s12 s10 if region2 == "A" [pweight = tot_wt], mfcol(none)
graph export "scatterordinal4.ps", replace

/*********************************************************/
/** Linear Regression                                   **/
/*********************************************************/


svyset masterid [pweight=wtfinal], strata(stratum)
svy: regress sqrtincome hoursworked

/*NOT THE SAME AS : no design information*/
regress sqrtincome hoursworked [pweight=wtfinal]

/*NOT THE SAME AS : weighted least squares*/
regress sqrtincome hoursworked [aweight=wtfinal]


/*********************************************************/
/** Logistic Regression                                 **/
/*********************************************************/

gen uninsured = 0
replace uninsured = 1 if a1 == 2
replace uninsured = . if a1 > 2
replace uninsured = . if a1 == .

svy: logistic uninsured hoursworked


/********************************************************/
/** Small Area Estimation                              **/
/********************************************************/

clear

set memory 1g  /*increase available memory, since the data is huge*/
use "ofhs_virgin_file.dta"   /*read in the data*/

/*Example:  consider Minorities in Champaign County */
/*race_aim is the self-reported race*/
gen minority = 1
replace minority = 0 if race_aim == 1
replace minority = . if race_aim == .

/*Make a new variable to identify this "small area"*/
/*s9 is self reported county != stratum, 21=Champaign Co.*/

gen smallarea = 0
replace smallarea = 1 if minority == 1 & s9 == 21

/*recode the insurance status*/
gen insured = a1
replace insured = 0 if insured == 2
replace insured = . if insured >= 98
tab insured

gen uninsured = 1 - insured

/*create a counter variable*/
gen const = 1

svyset masterid [pweight=wtfinal], strata(stratum)

/******************/
/*Direct Estimator*/

svy: ratio insuredratiosmallarea: uninsured/const, subpop(smallarea)
lincom _b[insuredratiosmallarea] * 1062 /*multiply estimate by the Minorioty Population in Champaign County*/
scalar t_hat_dir = r(estimate)
scalar t_dir_var = r(se)^2


/******************/
/*Synthetic Estimator*/

/*assume proportion of insured is constant across the state, and uniform over minority subpop*/

svy: ratio insuredratio: uninsured/const
lincom _b[insuredratio] * 1062 /*multiply estimate by the Minorioty Population in Champaign County*/
scalar t_hat_syn = r(estimate)
scalar t_syn_var = r(se)^2

/*assume proportion of insured is constant in the minority population across the state*/

svy: ratio insuredratiominority: uninsured/const, subpop(minority)
estimates store syntheticest

lincom _b[insuredratiominority] * 1062 /*multiply estimate by the Minorioty Population in Champaign County*/
scalar t_hat_syn2 = r(estimate)
scalar t_syn2_var = r(se)^2


/***********************/
/* Composite Estimator */

scalar alpha = 0.5
scalar t_hat_comp = alpha * t_hat_dir + (1-alpha) * t_hat_syn2


/***********************/
/* Compare Estimates   */

display t_hat_dir 
display t_hat_syn2 
display t_hat_comp


/********************************************************/
/** Variance Estimation Using NHANES 99                **/
/********************************************************/

set more off
clear

set memory 1g  /*increase available memory, since the data is huge*/

/**READ IN DEMOGRAPHIC AND SURVEY DESIGN INFO**/
fdause "/scratch/NHANES99/demo.xpt"
sort seqn  /*sort by unique ID*/
save "/scratch/NHANES99/demo", replace

clear

/**READ IN HEALTH INSURANCE QUESTIONS**/
fdause "/scratch/NHANES99/hiq.xpt"
sort seqn  /*sort by unique ID*/
save "/scratch/NHANES99/hiq", replace

/*MERGE THE TWO DATA SETS TOGETHER BASED ON THE UNIQUE ID*/
merge seqn using "/scratch/NHANES99/demo"

/*READ THE DOCUMENTATION FOR THIS SURVEY - IT'S COMPLICATED!*/
/*Question = Proportion of Americans Covered by Health Insurance*/
/*HID010 : Health Insurance?  1=y, 2=n, 7=refuse, 9=DK*/
gen insuranceMAR = hid010
replace insuranceMAR = . if hid010==7 | hid010==9
replace insuranceMAR = 0 if hid010==2

/*wtint2yr = The appropriate weights for this data set, and this question */
/*sdmvpsu  = The `pseudo' PSUs (clusters) provided for this data */
/*sdmvstra = The `pseudo' Strata provided for this data */


/**************************************************************/
/* Design Effects - no design info, but published DEFF values */


/*HERE I AM PRETENDING TO BE THE AGENCY WITH THE DESIGN INFORMATION*/
/*THIS CODE CALCULATES THE DEFF*/
svyset, clear
svyset sdmvpsu [pweight=wtint2yr], strata(sdmvstra)
svy: mean insuranceMAR
estat effects, deff
/*PUBLISH THE DESIGN EFFECT = 11.78*/
/*NOW FORGET THE DESIGN INFORMATION*/
svyset, clear


/*NOW WE GET THE SURVEY WITHOUT THE DESIGN INFORMATION*/
/*we still need the weights!*/
svyset, clear
svyset [pweight=wtint2yr]

/******estimate using assumption of SRS*****/
svy: mean insuranceMAR

/*Find the appropriate t critical value*/
scalar define tval = abs(invttail(e(df_r), 0.025))  /*find the critical value*/

/*unadjusted CIs*/
matrix define lower = e(b) - tval * cholesky(e(V))  /*cholesky finds the square root*/ 
matrix define upper = e(b) + tval * cholesky(e(V))
matrix define se = cholesky(e(V))

/*adjusted CIs*/
matrix define lowerdeff = e(b) - tval * cholesky(e(V)*11.78)
matrix define upperdeff = e(b) + tval * cholesky(e(V)*11.78)
matrix define sedeff = cholesky(e(V)*11.78)

matrix define info = (se, lower, upper) \ (sedeff, lowerdeff, upperdeff)
matrix colnames info = Standard_Error LowerCI UpperCI
matrix rownames info = SRS Deff_adjusted

matrix list info


/*********************************************/
/* Linearization - known design information  */

svyset, clear
svyset sdmvpsu [pweight=wtint2yr], strata(sdmvstra)
svydes

svy: mean insuranceMAR

/*save the CIs, and compare to before*/
scalar define tval = abs(invttail(e(df_r), 0.025))  /*find the critical value*/
matrix define cislinear = cholesky(e(V)), e(b) - tval * cholesky(e(V)), e(b) + tval * cholesky(e(V))
matrix define info = (se, lower, upper) \ (sedeff, lowerdeff, upperdeff) \ cislinear
matrix colnames info = Standard_Error LowerCI UpperCI
matrix rownames info = SRS Deff_adjusted Linearization

matrix list info


/*********************************************/
/* Jackknife - have resampling weights       */

svyset, clear
svyset sdmvpsu [pweight=wtint2yr], strata(sdmvstra) vce(jackknife) jkrweight(wtirep01-wtirep52)
svyset sdmvpsu [pweight=wtint2yr], strata(sdmvstra) vce(jackknife) jkrweight(wtirep*)   /*same thing*/

svy: mean insuranceMAR

/*save the CIs, and compare to before*/
scalar define tval = abs(invttail(e(df_r), 0.025))  /*find the critical value*/
matrix define cisjack = cholesky(e(V)), e(b) - tval * cholesky(e(V)), e(b) + tval * cholesky(e(V))
matrix define info = (se, lower, upper) \ (sedeff, lowerdeff, upperdeff) \ cislinear \ cisjack
matrix colnames info = Standard_Error LowerCI UpperCI
matrix rownames info = SRS Deff_adjusted Linearization Jackknife

matrix list info

/****Jackknife "guts"****/

/*estimate using the first weights*/
	svyset, clear
	svyset sdmvpsu [pweight=wtirep01], strata(sdmvstra)
	svy: mean insuranceMAR
	matrix define j_ests = e(b)

/*estimate using the other weights, store values in j_ests -- remove the `quietly's to see what's going on*/
foreach var of varlist wtirep02-wtirep52 {
	quietly: svyset sdmvpsu [pweight=`var'], strata(sdmvstra)
	quietly: svy: mean insuranceMAR
	quietly: matrix define j_ests = j_ests \ e(b)
}

matrix list j_ests

svmat j_ests /*store the matrix as a variable*/
mean j_ests  /*estimate the mean*/
sum j_ests   /*summarize the replicates*/

/*save the CIs and compare*/
scalar define tval = abs(invttail(e(df_r), 0.025))  /*find the critical value*/
/*jackknife estimates of variance are n * sample variance over all the replicates*/
matrix define cisjackbyhand = sqrt(r(Var)*r(N)), r(mean) - tval * sqrt(r(Var)*r(N)), r(mean) + tval * sqrt(r(Var)*e(N))

matrix define info = (se, lower, upper) \ (sedeff, lowerdeff, upperdeff) \ cislinear \ cisjack \ cisjackbyhand
matrix colnames info = Standard_Error LowerCI UpperCI
matrix rownames info = SRS Deff_adjusted Linearization Jackknife Jackknife_by_hand

matrix list info


/********************************************************/
/** Poststratification using NHANES 99                 **/
/********************************************************/

/*Suppose we'd like to post-stratify by gender:  Male (1) and Female (0) */
generate male = riagendr
replace male = 0 if male ==2

/*Total Census population size = 281,400,000*/

total wtint2yr /*too small*/

/*adjust for total population*/
sum wtint2yr
generate adjwt = wtint2yr * 281400000 / r(sum) 
/*check*/
total adjwt

/*Population Values are Male=136,100,000  Female = 145,300,000           */
/*first see if there's a problem*/

total adjwt, over(male) /*too little wt for females, too much for males*/
matrix totals = e(b)
scalar totalfem = totals[1,1]
scalar totalmale = totals[1,2]
display totalfem
display totalmale

/*****TWO WAYS TO POSTSTRATIFY******/

/*********/
/*By Hand*/
gen genderadjwt = adjwt
replace genderadjwt = genderadjwt * 136100000 / totalmale if male == 1
replace genderadjwt = genderadjwt * 145300000 / totalfem if male == 0
 
total genderadjwt, over(male) /*Just Right!*/

/*****************/
/*"Automatically"*/
/*create variables specifying the target*/
gen malepopsize = 145300000
replace malepopsize = 136100000 if male == 1

svyset [pweight = adjwt], poststrata(male) postweight(malepopsize)
svy: total male


/********************  RAKING  **************************/
/*this is in the "svr" package that you must download and install into Stata*/
/*this is easy to do via the help menus, if you just search for svr*/

/*We also need a second variable by which to adjust*/
/*I'll use over/under 18 years old                 */
generate kid = 0
replace kid = 1 if ridageyr < 18

/*we again, need variables that store the "true" values */
generate kidpopsize = 70000000
replace kidpopsize = 281400000 - 70000000  if kid == 0  

/*see if our unadjusted weights are off*/
total adjwt, over(kid)

/*perform the raking*/
survwgt rake adjwt, by(male kid) totvar(malepopsize kidpopsize) generate(rakewt)

/*check the results*/
total rakewt, over(male)
total rakewt, over(kid)