capture log close capture program drop _all capture macro drop _all drop _all set mem 500m set more off #delimit; /*** Set Server Directory Structure ***/ global dofiles "/homes/nber/jscottc/marcps/dofiles/" ; global raw "/homes/nber/jscottc/marcps/data/"; global data "/homes/nber/jscottc/marcps/data/" ; /*** Open log file ***/ log using "${dofiles}mcps03_s0.log", replace ; /********************************************************************** *stata 8.0 * PROJECT: Prepping file for TaxSim - match to October CPS data * FOR: 2007 TPE Paper * PROGRAMMER: Judy Scott-Clayton * PROGNAME: mcps03_s0.do * PURPOSE: *Read in entire raw 2003 Unicon file, select vars * *Create tax-unit level vars and attach to all in the tax unit * **NOTE** Variables needed for taxsim are labeled TS0-TS21, so * search for TS# to find relevant code. * **NOTE** You will also need to change directory settings above. * **NOTE** SOME OF THE CODING IS SPECIFIC TO YEAR 2003. IF YOU * WANT TO USE ANOTHER YEAR YOU WILL NEED TO EDIT THE CODE - PAY * PARTICULAR ATTENTION TO ANY PROBLEMS IN "dpstat" AND "flstat" * IN THE RAW DATA. ALSO, NAMES OF IMPUTED TAX VARS MAY CHANGE. * * USES DATA: mar03.dta - March CPS 2003, published by Unicon * Obtain Unicon CD from Jean Roth, upload mcps03_tax.req to * select vars, then run extract to obtain mar03.dta * * SAVES DATA: mcps03_s0.dta - includes tax + other variables * * UPDATED: 04/24/2007 ***********************************************************************/ use "${raw}mar03.dta" ; *Check record count: should be 216,424 ; desc ; /*** basic vars ***/ gen cpsyear=2003 ; label var cpsyear "March CPS Survey Year" ; gen year=2002 ; label var year "[TS1] Year for income reports" ; *March income supplement weight = wgt ; replace wgt=wgt/100 ; *Basic weight for core vars = wgtfnl ; replace wgtfnl=wgtfnl/100 ; *Household weight ; replace hhwgt=hhwgt/100 ; *Family weight ; replace famwgt=famwgt/100 ; rename state stcps ; gen state= . ; replace state=20 if stcps==11 ; /*Maine */ replace state=30 if stcps==12 ; /*NewHampshire */ replace state=46 if stcps==13 ; /*Vermont */ replace state=22 if stcps==14 ; /*Massachusetts*/ replace state=40 if stcps==15 ; /*RhodeIsland */ replace state=7 if stcps==16 ; /*Connecticut */ replace state=33 if stcps==21 ; /*NewYork */ replace state=31 if stcps==22 ; /*NewJersey */ replace state=39 if stcps==23 ; /*Pennsylvania */ replace state=36 if stcps==31 ; /*Ohio */ replace state=15 if stcps==32 ; /*Indiana */ replace state=14 if stcps==33 ; /*Illinois */ replace state=23 if stcps==34 ; /*Michigan */ replace state=50 if stcps==35 ; /*Wisconsin */ replace state=24 if stcps==41 ; /*Minnesota */ replace state=16 if stcps==42 ; /*Iowa */ replace state=26 if stcps==43 ; /*Missouri */ replace state=35 if stcps==44 ; /*NorthDakota */ replace state=42 if stcps==45 ; /*SouthDakota */ replace state=28 if stcps==46 ; /*Nebraska */ replace state=17 if stcps==47 ; /*Kansas */ replace state=8 if stcps==51 ; /*Delaware */ replace state=21 if stcps==52 ; /*Maryland */ replace state=9 if stcps==53 ; /*DC */ replace state=47 if stcps==54 ; /*Virginia */ replace state=49 if stcps==55 ; /*WestVirginia */ replace state=34 if stcps==56 ; /*NorthCarolina*/ replace state=41 if stcps==57 ; /*SouthCarolina*/ replace state=11 if stcps==58 ; /*Georgia */ replace state=10 if stcps==59 ; /*Florida */ replace state=18 if stcps==61 ; /*Kentucky */ replace state=43 if stcps==62 ; /*Tennessee */ replace state=1 if stcps==63 ; /*Alabama */ replace state=25 if stcps==64 ; /*Mississippi */ replace state=4 if stcps==71 ; /*Arkansas */ replace state=19 if stcps==72 ; /*Louisiana */ replace state=37 if stcps==73 ; /*Oklahoma */ replace state=44 if stcps==74 ; /*Texas */ replace state=27 if stcps==81 ; /*Montana */ replace state=13 if stcps==82 ; /*Idaho */ replace state=51 if stcps==83 ; /*Wyoming */ replace state=6 if stcps==84 ; /*Colorado */ replace state=32 if stcps==85 ; /*NewMexico */ replace state=3 if stcps==86 ; /*Arizona */ replace state=45 if stcps==87 ; /*Utah */ replace state=29 if stcps==88 ; /*Nevada */ replace state=48 if stcps==91 ; /*Washington */ replace state=38 if stcps==92 ; /*Oregon */ replace state=5 if stcps==93 ; /*California */ replace state=2 if stcps==94 ; /*Alaska */ replace state=12 if stcps==95 ; /*Hawaii */ label var state "[TS2] State (SOI codes)" ; *Race and ethnicity ; gen race6=race if race<=5 ; replace race6=6 if race>=6 & race<=21 ; label def race6 1 "White" 2 "Black" 3 "Amer.Ind." 4 "Asian" 5 "Haw/PacIsl" 6 "Multiple" ; label val race6 race6 ; label var race6 "CPS Race, including multiple" ; rename hisp hispcps ; gen raceeth=1 if race6==1 & hispcps==2 ; replace raceeth=2 if race6==2 & hispcps==2 ; replace raceeth=3 if race6>=3 & race6<=6 & hispcps==2 ; replace raceeth=4 if hispcps==1 ; label def raceeth 1 "White, NH" 2 "Black, NH" 3 "Other NH" 4 "Hispanic" ; gen whitenh=(raceeth==1) if raceeth<. ; gen blacknh=(raceeth==2) if raceeth<. ; gen othernh=(raceeth==3) if raceeth<. ; gen hisp=(raceeth==4) if raceeth< . ; label var whitenh "White, non-hispanic" ; label var blacknh "Black, non-hispanic" ; label var othernh "Other race, non-hispanic" ; label var hisp "Hispanic" ; *Age and gender ; gen agelt17=(age>=0 & age<=16) if age<. ; label var agelt17 "Person is age 0-16" ; gen agegt64=(age>=65) if age<. ; label var agegt64 "Person is age 65 or older" ; gen female=(sex==2) if sex< .; label var female "Female" ; gen male=(sex==1) if sex<. ; label var male "Male" ; *Marital status ; gen married=(marstat>=1 & marstat<=3) if marstat<. ; label var married "Currently married" ; *Educational attainment; gen educatn=1 if grdatn>=0 & grdatn<=38 ; replace educatn=2 if grdatn==39 ; replace educatn=3 if grdatn==40 ; replace educatn=4 if grdatn==41 | grdatn==42 ; replace educatn=5 if grdatn==43 ; replace educatn=6 if grdatn>=44 & grdatn<=46 ; label def educatn 1 "Less than HS dip/equiv" 2 "HS dip/equiv" 3 "Some coll, no deg" 4 "AS deg" 5 "BA deg" 6 "MA,Prof,PhD" ; label val educatn educatn ; label var educatn "Educ. attainment: 1=Lt HS, 6=More than BA" ; gen colatt=(schlvl==2) if age>=16 & age<=24 ; label var colatt "Currently enrolled in college (age 16-24 only)" ; gen colft=(colatt==1 & schft==1) if colatt< . ; label var colft "Currently enrolled full-time in college (16-24 only)" ; gen colpt=(colatt==1 & schft==2) if colatt<. ; label var colpt "Currently enrolled part-time in college (16-24 only)" ; /* CODES 04-92 Children 00 Less than 1st grade 31 1st, 2nd, 3rd or 4th grade 32 5th or 6th grade 33 7th or 8th grade 34 9th grade 35 10th grade 36 11th grade 37 12th grade or no diploma 38 High school graduate - High school diploma or equivalent 39 Some college but no degree 40 Associate's degree in college - occupational/vocational 41 Associate's degree in college - academic 42 Bachelor's degree (e.g., BA, BS, AB) 43 Master's degree (e.g., MA, MS, MEng, MEd, MSW, MBA) 44 Professional school degree (e.g.: MD, DDS, DVM, LLB, JD) 45 Doctorate degree (e.g., PhD, EdD) 46 */ *Individual income (these will aggregate in next program) ; gen wages_i= incwag /*Income: wage & salary */ + incse /*Income: nonfarm self-employ */ + incfrm ; /*Income: farm/nonincorporated SE */ label var wages_i "Individual wages" ; gen dividends_i=incdv2 ; /*Income: dividends */ label var dividends_i "Individual dividends" ; gen otherprop_i=incint /*Income: interest */ + incrnt ; /*Income: rent */ label var otherprop_i "Individual otherprop" ; gen pensions_i= incret ; /*Income: retirement funds */ label var pensions_i "Individual pensions" ; gen gssi_i= incss /*Income: social security */ + incsec /*Income: supplemental security */ + incsrv /*Income: survivor's benefits */ + incdis ;/*Income: disability */ label var gssi_i "Individual gross social security" ; gen transfers_i=incpa /*Income: public assist/welfare */ + incwcp /*Income: workers comp */ + incvet /*Income: veteran payments */ + inchld ; /*Income: child support */ label var transfers_i "Individual transfer income" ; gen ui_i= incuc ; /*Income: unemployment comp */ label var ui_i "Individual UI income" ; /*Ignored sources of income inced Income: educational assistance incalm Income: alimony incont Income: contrib/financial asst incoth Income: other sources NOTE: cpgain and cploss are imputed by CPS - attached to "tax filing head" prptax imputation is also available and attached to every member of a household BOTH OF THESE VARS WILL BE PICKED UP WHEN DATA ARE AGGREGATED TO TAX FILING UNIT */ *Any self-employment income - pos or neg (a tag for manipulability of AGI) ; gen selfemp=(incse!=0) ; label var selfemp "Person had any self-emp income (pos or neg)" ; gen selfemp2=(incse!=0 | incfrm!=0) ; label var selfemp2 "Person had any self-emp or farm income (pos or neg)" ; *Tax filing status - based on CPS imputation. Case check of ages, gender, marital status looks good; gen mstat=1 if flstat==5 ; replace mstat=2 if flstat>=1 & flstat<=3 ; replace mstat=3 if flstat==4 ; replace mstat=8 if flstat==1 & dpstat>0 ; /* dependent taxpayers */ replace mstat=0 if flstat==6 ; /* nonfilers*/ label def mstat 1 "Single" 2 "Joint" 3 "Head of HHold" 8 "Dep Taxpayer" 0 "Nonfiler" ; label val mstat mstat ; label var mstat "[TS3] Tax filing marital status (created by CPS)" ; *Number of age 65+ taxpayers ; gen agex=0 if flstat>=4 & age<65 ; replace agex=1 if flstat>=4 & age>=65 ; replace agex=0 if flstat==1 ; replace agex=1 if flstat==2 ; replace agex=2 if flstat==3 ; label var agex "[TS5] Number of age 65+ taxpayers" ; /* FLSTAT CODES (1-6) Tax filer | status | Freq. Percent Cum. ------------+----------------------------------- Jtbth<65 | 72,788 33.63 33.63 J1<1+65 | 3,456 1.60 35.23 Jtbth65+ | 6,550 3.03 38.26 HeadofHH | 10,538 4.87 43.12 Single | 38,284 17.69 60.81 Nonfiler | 84,808 39.19 100.00 ------------+----------------------------------- Total | 216,424 100.00 */ *Relationship to head ; *Unique family identifier (Note: 85% have only one family w/in household, 93% are in primary family); gen double uniqfam=(hhseq*10000)+(fampos*100) ; label var uniqfam "Unique family identifier" ; *Generate unique tax unit identifiers (caseid) ; gen taxid=0 if flstat==6 ; /*non filers */ replace taxid=1 if flstat<=3 ; /* joint filers - no more than two per family */ *Check that no more than 2 joint filers per family ; by uniqfam,sort: egen njoint=sum(taxid) ; tab njoint ; drop njoint ; gen othfiler=(flstat>=4 & flstat<=5) ; by uniqfam othfiler, sort: gen taxnum=_n ; replace taxid=taxnum+2 if taxid==. ; /* the number is not meaningful but will be unique within families */ list hhseq fampos age marstat flstat dpstat taxid mstat if hfmcnt>1 in 1/1000, sepby(uniqfam) ; label var taxid "Tax unit id within families (0=nonfiler,1=joint,oth=single/hh)" ; drop othfiler taxnum ; *Note: caseid uniquely identifies tax units, but joint filers have same id - keep only one record for taxsim? ; gen double uniqtax=uniqfam+taxid ; label var uniqtax "Unique tax unit identifier" ; by uniqtax, sort: gen taxpers=_n ; gen double caseid=(uniqtax*10)+taxpers ; label var caseid "[TS0] Unique case identifier" ; *Number of dependents per tax form ; *dpstat var = 0 if not a dependent, otherwise lists lineno of person claiming (imputed by CPS) ; tab lineno ; *Loop should run until the maximum of lineno - WILL NEED TO CHECK THIS EACH YEAR ; for X in num 1/16: gen depX=(dpstat==X) ; for X in num 1/16: by uniqfam,sort: egen ndepX=sum(depX) ; gen numdep=0 ; for X in num 1/16: replace numdep=ndepX if lineno==X ; by uniqtax, sort: egen depx=max(numdep) ; /* gets dependents listed for both joint filers */ label var depx "[TS5] Number of dependents on tax form" ; list hhseq age flstat dpstat depx lineno perid if hfmcnt>1 in 1/1000 , sepby(uniqfam) ; /* NOTE: The code above works but reveals problems with dpstat var, created by CPS ; In some cases dependents are associated to young children or not associated to a lineno that does not appear ; dpstat is supposed to refer to lineno of taxpayer claiming the dependent, but in at least one case it appears wrong ; Seems to be associated with discrepencies between lineno and perid. Usually lineno=perid-40 but not always ; In the obviously wrong cases, dpstat seems to refer to perid instead of lineno. ; */ *Try alternate construction, see if any better ; gen altnumdep=0 ; for X in num 1/16: replace altnumdep=ndepX if perid==X+40 ; by uniqtax, sort: egen altdepx=max(altnumdep) ; /* gets dependents listed for both joint filers */ gen prob=(depx!=altdepx) ; by uniqfam, sort: egen famprob=max(prob) ; /* identifies all members of families in which there is any conflict */ tab famprob ; list hhseq age flstat dpstat depx altdepx lineno perid if famprob==1, sepby(uniqfam) ; *In what proportion of conflicts is depx clearly wrong ; by uniqfam, sort: egen anydep=sum(depx) ; tab anydep if famprob==1 ; /* if anydep==0 this is clearly wrong, dependents are assoc to lineno not listed */ gen ageprob=depx>0 & age<15 ; by uniqfam, sort: egen anyageprob=max(ageprob) ; tab anyageprob if famprob==1 & anydep>0 ; /* Of 3124 cases of conflicts, only 123 remain unexplained */ list hhseq age flstat dpstat depx altdepx lineno perid if famprob==1 & anydep>0 & anyageprob==0, sepby(uniqfam) ; *These remaining cases also appear to be wrong originally: eg deps associated to very young single filers */ *I conclude my variable is better ; replace depx=altdepx ; drop dep1-dep16 ndep1-ndep16 numdep altnumdep altdepx prob famprob anydep ageprob anyageprob ; *Use same code to count number of dependent children under 17 for child credit ; for X in num 1/16: gen depchildX=(dpstat==X & age<17) ; for X in num 1/16: by uniqfam,sort: egen ndepchildX=sum(depchildX) ; gen numdepchild=0 ; for X in num 1/16: replace numdepchild=ndepchildX if perid==X+40 ; by uniqtax, sort: egen depchild=max(numdepchild) ; label var depchild "[TS18] Number of dependents under age 17" ; drop depchild1-depchild16 ndepchild1-ndepchild16 numdepchild ; *Income tax items ; gen pwages=wages_i ; gen swages=0 if mstat!=2 ; by uniqtax, sort: gen spwages1=wages_i[_n+1] if mstat[_n+1]==2 ; by uniqtax, sort: gen spwages2=wages_i[_n-1] if mstat[_n-1]==2 ; replace swages=spwages1 if swages==. ; replace swages=spwages2 if swages==. ; list hhseq fampos perid mstat wages_i pwages swages in 1/100, sepby(hhseq) ; label var pwages "[TS6] Primary taxpayer wage income" ; label var swages "[TS7] Secondary taxpayer wage income" ; drop spwages1 spwages2 ; *Note: In a joint filing tax unit, there are two records, one for each as a primary taxpayer ; *The following items are added for those in joint filing unit ; by uniqtax, sort: gen dividends=sum(dividends_i) ; by uniqtax, sort: gen otherprop=sum(otherprop_i) ; by uniqtax, sort: gen pensions=sum(pensions_i) ; by uniqtax, sort: gen gssi=sum(gssi_i) ; by uniqtax, sort: gen transfers=(transfers_i) ; by uniqtax, sort: gen ui=sum(ui_i) ; label var dividends "[TS8] Dividend income" ; label var otherprop "[TS9] Interest, other prop income, no alimony" ; label var pensions "[TS10] Pensions/retirement income" ; label var gssi "[TS11] Social sec, supp sec, survivors, disability bens" ; label var transfers "[TS12] Welfare, WC, Vets, child supp" ; label var ui "[TS17] Unemployment compensation" ; *The following vars are unavailable ; gen rentpaid=0 ; gen otheritem=0 ; gen childcare=0 ; gen mortgage=0 ; label var rentpaid "[TS13] Rent paid - NOT AVAIL FOR CPS" ; label var otheritem "[TS15] Oth itemized deductions - NOT AVAIL FOR CPS" ; label var childcare "[TS16] Child care expenses - NOT AVAIL FOR CPS" ; label var mortgage "[TS19] Mortgage interest paid - NOT AVAIL FOR CPS" ; *The following vars are CPS imputations ; *Property taxes paid. ; *These are originally attached to all persons in the HOUSEHOLD ; *Fix so that values are assigned only to one tax unit within hhold ; *Order of assignment: joint filers, head of hhold, single filers ; *If there are multiple singles in a hhhold with no joint or hhead filers, then split proptax across singles ; gen joint=(flstat<=3) ; gen hhead=(flstat==4) ; gen single=(flstat==5) ; by hhseq, sort: egen anyjoint=max(joint) ; by hhseq, sort: egen anyhhead=max(hhead) ; by hhseq, sort: egen numsingle=sum(single) ; gen proptax=prptax ; replace proptax=0 if flstat==6 ; /* nonfilers */ replace proptax=0 if flstat>=4 & anyjoint==1 ; /* hheads or single filers in hholds with joint filers */ replace proptax=0 if flstat==5 & anyhhead==1 ; /* single filers in hholds with hhhead filers */ replace proptax=proptax/numsingle if flstat==5 & anyjoint==0 & anyhhead==0 ; /* single filers in hhold with no hheads or joint filers */ label var proptax "[TS14] Property tax paid (imputed by CPS)" ; *Capital gains/losses: only one person in joint filing unit has the info - need to attach to both ; /*CHECK DOC: What is up with 99999? Appears to be a topcode - those with 99999 have much higher family income, but there are 22 obs with values above 99999. Approx 12% of individuals are in filing units with positive cap gains imputed. Among these 12%, 90% range from $128 to $20,000, 2% are between $20K and $60K, 8% have 99999, and less than 1% are above 99999. */ *NOTE: all cap gains/losses are assumed to be long-term ; gen stcg=0 ; label var stcg "[TS20] Short-term cap gains - NOT AVAIL FOR CPS" ; by uniqtax, sort: gen famcapg=sum(cpgain) ; by uniqtax, sort: gen famcapl=sum(cploss) ; gen ltcg=famcapg-famcapl ; label var ltcg "[TS21] Long-term cap gains (imputed by CPS)" ; drop famcapg famcapl ; save "${data}mcps03_s0.dta", replace ; ****RUN TAXSIM**** ; taxsim7, full replace ; *Check some of taxsim vars against CPS imputations (note: CPS imputations only applied to one member of joint filers) ; corr agi v10 if agi>0 ; corr agi v10 if agi>0 & agi!=99999 ; corr eitcrd v25 if eitcrd>0 & eitcrd<99999; corr fedtax fiitax if fedtax>0 & fedtax<99999; corr sttax siitax if sttax>0 & sttax<99999; corr taxinc v18 if taxinc>0 & taxinc!=99999; save "${data}mcps03_s0.dta", replace ; log close ;