capture log close capture program drop _all capture macro drop _all drop _all set mem 500m set more off #delimit; /*** Set Server Directory Structure ***/ global dofiles "/homes/nber/jscottc/marcps/dofiles/" ; global raw "/homes/nber/jscottc/marcps/data/"; global data "/homes/nber/jscottc/bulk/marcps/" ; /*** Set NBER desktop directory structure ***/ *global dofiles "C:\documents and settings\jscottc\my documents\marcps\dofiles\" ; *global raw "C:\documents and settings\jscottc\my documents\marcps\mcps03_tax\"; *global data "C:\documents and settings\jscottc\my documents\marcps\data\" ; /*** Open log file ***/ log using "${dofiles}mcps06_s0.log", replace ; /********************************************************************** *stata 8.0 * PROJECT: Prepping file for TaxSim - match to October CPS data * FOR: 2007 TPE Paper * PROGRAMMER: Judy Scott-Clayton * PROGNAME: mcps06_s0.do * PURPOSE: *Read in relevant 2006 March Unicon file * *Create tax-unit level vars and attach to all in the tax unit * **NOTE** Variables needed for taxsim are labeled TS0-TS21, so * search for TS# to find relevant code. * **NOTE** THIS CODE WILL NEED TO BE EDITED/CHECK IF USED FOR * OTHER YEARS. DIFFERENT PROBLEMS MAY ARISE IN ANOTHER FILE. * * USES DATA: mar06.dta - March CPS 2006, published by Unicon * **NOTE** See mar06.rpt for variable descriptions, original CPS locations * * SAVES DATA: mcps06_s0.dta - includes tax + other variables * * UPDATED: 04/17/2007 ***********************************************************************/ use "${raw}mar06.dta" ; *Check record count: should be 208,562 ; desc ; /*** basic vars ***/ rename year cpsyear ; label var cpsyear "March CPS Survey Year" ; gen year=cpsyear-1 ; label var year "[TS1] Year for income reports" ; *March income supplement weight = wgt ; replace wgt=wgt/100 ; *Basic weight for core vars = wgtfnl ; replace wgtfnl=wgtfnl/100 ; *Household weight ; replace hhwgt=hhwgt/100 ; *Family weight ; replace famwgt=famwgt/100 ; rename state stcps ; gen state= . ; replace state=20 if stcps==11 ; /*Maine */ replace state=30 if stcps==12 ; /*NewHampshire */ replace state=46 if stcps==13 ; /*Vermont */ replace state=22 if stcps==14 ; /*Massachusetts*/ replace state=40 if stcps==15 ; /*RhodeIsland */ replace state=7 if stcps==16 ; /*Connecticut */ replace state=33 if stcps==21 ; /*NewYork */ replace state=31 if stcps==22 ; /*NewJersey */ replace state=39 if stcps==23 ; /*Pennsylvania */ replace state=36 if stcps==31 ; /*Ohio */ replace state=15 if stcps==32 ; /*Indiana */ replace state=14 if stcps==33 ; /*Illinois */ replace state=23 if stcps==34 ; /*Michigan */ replace state=50 if stcps==35 ; /*Wisconsin */ replace state=24 if stcps==41 ; /*Minnesota */ replace state=16 if stcps==42 ; /*Iowa */ replace state=26 if stcps==43 ; /*Missouri */ replace state=35 if stcps==44 ; /*NorthDakota */ replace state=42 if stcps==45 ; /*SouthDakota */ replace state=28 if stcps==46 ; /*Nebraska */ replace state=17 if stcps==47 ; /*Kansas */ replace state=8 if stcps==51 ; /*Delaware */ replace state=21 if stcps==52 ; /*Maryland */ replace state=9 if stcps==53 ; /*DC */ replace state=47 if stcps==54 ; /*Virginia */ replace state=49 if stcps==55 ; /*WestVirginia */ replace state=34 if stcps==56 ; /*NorthCarolina*/ replace state=41 if stcps==57 ; /*SouthCarolina*/ replace state=11 if stcps==58 ; /*Georgia */ replace state=10 if stcps==59 ; /*Florida */ replace state=18 if stcps==61 ; /*Kentucky */ replace state=43 if stcps==62 ; /*Tennessee */ replace state=1 if stcps==63 ; /*Alabama */ replace state=25 if stcps==64 ; /*Mississippi */ replace state=4 if stcps==71 ; /*Arkansas */ replace state=19 if stcps==72 ; /*Louisiana */ replace state=37 if stcps==73 ; /*Oklahoma */ replace state=44 if stcps==74 ; /*Texas */ replace state=27 if stcps==81 ; /*Montana */ replace state=13 if stcps==82 ; /*Idaho */ replace state=51 if stcps==83 ; /*Wyoming */ replace state=6 if stcps==84 ; /*Colorado */ replace state=32 if stcps==85 ; /*NewMexico */ replace state=3 if stcps==86 ; /*Arizona */ replace state=45 if stcps==87 ; /*Utah */ replace state=29 if stcps==88 ; /*Nevada */ replace state=48 if stcps==91 ; /*Washington */ replace state=38 if stcps==92 ; /*Oregon */ replace state=5 if stcps==93 ; /*California */ replace state=2 if stcps==94 ; /*Alaska */ replace state=12 if stcps==95 ; /*Hawaii */ label var state "[TS2] State (SOI codes)" ; *Race and ethnicity ; gen race6=race if race<=5 ; replace race6=6 if race>=6 & race<=21 ; label def race6 1 "White" 2 "Black" 3 "Amer.Ind." 4 "Asian" 5 "Haw/PacIsl" 6 "Multiple" ; label val race6 race6 ; label var race6 "CPS Race, including multiple" ; rename hisp hispcps ; gen raceeth=1 if race6==1 & hispcps==2 ; replace raceeth=2 if race6==2 & hispcps==2 ; replace raceeth=3 if race6>=3 & race6<=6 & hispcps==2 ; replace raceeth=4 if hispcps==1 ; label def raceeth 1 "White, NH" 2 "Black, NH" 3 "Other NH" 4 "Hispanic" ; gen whitenh=(raceeth==1) if raceeth<. ; gen blacknh=(raceeth==2) if raceeth<. ; gen othernh=(raceeth==3) if raceeth<. ; gen hisp=(raceeth==4) if raceeth< . ; label var whitenh "White, non-hispanic" ; label var blacknh "Black, non-hispanic" ; label var othernh "Other race, non-hispanic" ; label var hisp "Hispanic" ; *Age and gender ; gen agelt17=(age>=0 & age<=16) if age<. ; label var agelt17 "Person is age 0-16" ; gen agegt64=(age>=65) if age<. ; label var agegt64 "Person is age 65 or older" ; gen female=(sex==2) if sex< .; label var female "Female" ; gen male=(sex==1) if sex<. ; label var male "Male" ; *Marital status ; gen married=(marstat>=1 & marstat<=3) if marstat<. ; label var married "Currently married" ; *Educational attainment; gen educatn=1 if grdatn>=0 & grdatn<=38 ; replace educatn=2 if grdatn==39 ; replace educatn=3 if grdatn==40 ; replace educatn=4 if grdatn==41 | grdatn==42 ; replace educatn=5 if grdatn==43 ; replace educatn=6 if grdatn>=44 & grdatn<=46 ; label def educatn 1 "Less than HS dip/equiv" 2 "HS dip/equiv" 3 "Some coll, no deg" 4 "AS deg" 5 "BA deg" 6 "MA,Prof,PhD" ; label val educatn educatn ; label var educatn "Educ. attainment: 1=Lt HS, 6=More than BA" ; gen colatt=(schlvl==2) if age>=16 & age<=24 ; label var colatt "Currently enrolled in college (age 16-24 only)" ; gen colft=(colatt==1 & schft==1) if colatt< . ; label var colft "Currently enrolled full-time in college (16-24 only)" ; gen colpt=(colatt==1 & schft==2) if colatt<. ; label var colpt "Currently enrolled part-time in college (16-24 only)" ; /* CODES 04-92 Children 00 Less than 1st grade 31 1st, 2nd, 3rd or 4th grade 32 5th or 6th grade 33 7th or 8th grade 34 9th grade 35 10th grade 36 11th grade 37 12th grade or no diploma 38 High school graduate - High school diploma or equivalent 39 Some college but no degree 40 Associate's degree in college - occupational/vocational 41 Associate's degree in college - academic 42 Bachelor's degree (e.g., BA, BS, AB) 43 Master's degree (e.g., MA, MS, MEng, MEd, MSW, MBA) 44 Professional school degree (e.g.: MD, DDS, DVM, LLB, JD) 45 Doctorate degree (e.g., PhD, EdD) 46 */ *Individual income (these will aggregate in next program) ; gen wages_i= incwag /*Income: wage & salary */ + incse /*Income: nonfarm self-employ */ + incfrm /*Income: farm/nonincorporated SE */ if incse>=0 & incfrm>=0 ; replace wages_i=incwag if incse<0 | incfrm<0 ; /* taxsim will not accept negative wages - put negatives in otherprop */ label var wages_i "Individual wages" ; gen dividends_i=incdv2 ; /*Income: dividends */ label var dividends_i "Individual dividends" ; gen otherprop_i=incint /*Income: interest */ + incrnt /*Income: rent */ if incse>=0 & incfrm>=0 ; replace otherprop_i=incint+incrnt+incse+incfrm if incse<0|incfrm<0 ; label var otherprop_i "Individual otherprop" ; gen pensions_i= incret ; /*Income: retirement funds */ label var pensions_i "Individual pensions" ; gen gssi_i= incss /*Income: social security */ + incsec /*Income: supplemental security */ + incsrv /*Income: survivor's benefits */ + incdis ;/*Income: disability */ label var gssi_i "Individual gross social security" ; gen transfers_i=incpa /*Income: public assist/welfare */ + incwcp /*Income: workers comp */ + incvet /*Income: veteran payments */ + inchld ; /*Income: child support */ label var transfers_i "Individual transfer income" ; gen ui_i= incuc ; /*Income: unemployment comp */ label var ui_i "Individual UI income" ; /*Ignored sources of income inced Income: educational assistance incalm Income: alimony incont Income: contrib/financial asst incoth Income: other sources NOTE: cpgain and cploss are imputed by CPS - attached to "tax filing head" prptax imputation is also available and attached to every member of a household BOTH OF THESE VARS WILL BE PICKED UP WHEN DATA ARE AGGREGATED TO TAX FILING UNIT */ *Any self-employment income - pos or neg ; gen selfemp=(incse!=0) ; label var selfemp "Person had any self-emp income (pos or neg)" ; gen selfemp2=(incse!=0 | incfrm!=0) ; label var selfemp2 "Person had any self-emp or farm income (pos or neg)" ; *NOTE: IN 2006 FILE, DPSTAT VAR APPEARS INCORRECT - LINENOs LISTED FOR CLEAR NON-DEPENDENTS ; tab dpstat ; /* only 11% of sample NOT dependent? */ tab dpstat flstat, row col ; /* dpstat appears valid only for non-filers */ gen dpstat_raw=dpstat ; label var dpstat_raw "Original CPS-provided dpstat var (problematic)" ; replace dpstat=0 if flstat<6 ; /* anyone who files own taxes is not a dependent */ *Tax filing status - based on CPS imputation. Case check of ages, gender, marital status looks good; *NOTE: IN 2006 FILE, FLSTAT VAR APPEARS INCORRECT FOR MANY JOINT FILERS - AGE IS WRONG ; *This doesn't affect creation of mstat but will affect creation of agex ; gen mstat=1 if flstat==5 ; replace mstat=2 if flstat>=1 & flstat<=3 ; replace mstat=3 if flstat==4 ; replace mstat=8 if flstat==5 & dpstat>0 ; /* dependent taxpayers */ replace mstat=0 if flstat==6 ; /* nonfilers*/ label def mstat 1 "Single" 2 "Joint" 3 "Head of HHold" 8 "Dep Taxpayer" 0 "Nonfiler" ; label val mstat mstat ; label var mstat "[TS3] Tax filing marital status (created by CPS)" ; *LATER IN PROGRAM: Want to check non-filers - may want to send them through taxsim anyway ; *Check: how are married/spouse absent or separated individuals assigned to tax status? ; *Married, Armedforces-Spouse absent are treated as joint filers. Other marr,spouse-abs and separated are single or Hhhead ; tab marstat flstat, row col ; /* FLSTAT CODES (1-6) from 2003 file Tax filer | status | Freq. Percent Cum. ------------+----------------------------------- Jtbth<65 | 72,788 33.63 33.63 J1<1+65 | 3,456 1.60 35.23 Jtbth65+ | 6,550 3.03 38.26 HeadofHH | 10,538 4.87 43.12 Single | 38,284 17.69 60.81 Nonfiler | 84,808 39.19 100.00 ------------+----------------------------------- Total | 216,424 100.00 */ *Unique family identifier (Note: 85% have only one family w/in household, 93% are in primary family); gen double uniqfam=(hhseq*10000)+(fampos*100) ; label var uniqfam "Unique family identifier" ; set linesize 125 ; *Generate unique tax unit identifiers ; gen taxid=0 if flstat==6 ; /*non filers */ replace taxid=1 if flstat<=3 ; /* joint filers - no more than two per family */ *Check that no more than 2 joint filers per family ; by uniqfam,sort: egen njoint=sum(taxid) ; tab njoint ; drop njoint ; gen othfiler=(flstat>=4 & flstat<=5) ; by uniqfam othfiler, sort: gen taxnum=_n ; replace taxid=taxnum+2 if taxid==. ; /* the number is not meaningful but will be unique within families */ list hhseq fampos age marstat flstat dpstat taxid mstat if hfmcnt>1 in 1/1000, sepby(uniqfam) ; label var taxid "Tax unit id within families (0=nonfiler,1=joint,oth=single/hh)" ; drop othfiler taxnum ; *Note: caseid uniquely identifies tax units, but joint filers have same id - keep only one record for taxsim? ; gen double uniqtax=uniqfam+taxid ; label var uniqtax "Unique tax unit identifier" ; by uniqtax, sort: gen taxpers=_n ; gen double caseid=(uniqtax*10)+taxpers ; label var caseid "[TS0] Unique case identifier" ; *Number of age 65+ taxpayers ; by uniqtax, sort: egen agex=sum(agegt64) ; label var agex "[TS5] Number of age 65+ taxpayers" ; /* note: all nonfilers w/in hhold have same taxunit id */ ***FIX FLSTAT VAR *** ; *Look at a few problem cases: ; list hhseq age marstat relhd flstat if hhseq==56|hhseq==146|hhseq==58|hhseq==705 ; gen flstat_raw=flstat ; /* preserves raw flstat codes */ replace flstat=1 if flstat<=3 & agex==0 ; replace flstat=2 if flstat<=3 & agex==1 ; replace flstat=3 if flstat<=3 & agex==2 ; *Number of dependents per tax form ; *dpstat var = 0 if not a dependent, otherwise lists lineno of person claiming (imputed by CPS) ; tab lineno ; *Loop should run until the maximum of lineno - WILL NEED TO CHECK THIS EACH YEAR ; for X in num 1/16: gen depX=(dpstat==X) ; for X in num 1/16: by uniqfam,sort: egen ndepX=sum(depX) ; gen depx=0 ; for X in num 1/16: replace depx=ndepX if lineno==X ; by uniqtax, sort: egen maxdep=max(depx) ; replace depx=maxdep if flstat<=3 ; /* gets dependents listed for both joint filers */ label var depx "[TS5] Number of dependents on tax form" ; list hhseq age sex marstat relhd flstat dpstat depx lineno perid if hfmcnt>1 in 1/1000 , sepby(uniqfam) ; *Check how many non-filers have children - about 3%; tab depx flstat, col ; *Check how many young children are not classified as dependents (ie not associated to anyone in hhold) - about 5%; *Of these about 45% are grandchildren of hhhead. Others are "other relatives", foster kids, secondary individs, etc ; tab dpstat if age<16 ; tab relhd if age<16 & dpstat==0 ; tab dpstat if age<18 & flstat==6 ; tab relhd if age<18 & dpstat==0 & flstat==6; /* NOTE: The code above works but I have some reservations about whether identification of non-filers is correct. Some non-filers are married, have income, and/or have children. HOWEVER the majority look okay: */ tab mstat ; tab dpstat if mstat==0 ; /* how many non-filers are dependents of someone else - about 2/3 */ tab depx if mstat==0 & dpstat==0 ; /* how many non-filers are not dependents and have their own kids - only 10% */ sum incern if mstat==0 & dpstat==0 ; /* what is earned income of non-filers who are not dependents? 90% have zero */ sum incern if mstat==0 & dpstat==0 & incern>0 ; /* of 10% that do have income, median is $8000 */ sum incwag if mstat==0 & dpstat==0 & incern>0 ; /* of 10% that do have income, wage/salary median is $5000 */ *Use same code to count number of dependent children under 17 for child credit ; for X in num 1/16: gen depchildX=(dpstat==X & age<17) ; for X in num 1/16: by uniqfam,sort: egen ndepchildX=sum(depchildX) ; gen numdepchild=0 ; for X in num 1/16: replace numdepchild=ndepchildX if lineno==X ; by uniqtax, sort: egen depchild=max(numdepchild) ; label var depchild "[TS18] Number of dependents under age 17" ; drop depchild1-depchild16 ndepchild1-ndepchild16 numdepchild ; *Income tax items ; gen pwages=wages_i ; gen swages=0 if mstat!=2 ; by uniqtax, sort: gen spwages1=wages_i[_n+1] if mstat[_n+1]==2 ; by uniqtax, sort: gen spwages2=wages_i[_n-1] if mstat[_n-1]==2 ; replace swages=spwages1 if swages==. ; replace swages=spwages2 if swages==. ; list hhseq fampos perid mstat wages_i pwages swages in 1/100, sepby(hhseq) ; label var pwages "[TS6] Primary taxpayer wage income" ; label var swages "[TS7] Secondary taxpayer wage income" ; drop spwages1 spwages2 ; *Note: In a joint filing tax unit, there are two records, one for each as a primary taxpayer ; *The following items are added for those in joint filing unit ; by uniqtax, sort: gen dividends=sum(dividends_i) ; by uniqtax, sort: gen otherprop=sum(otherprop_i) ; by uniqtax, sort: gen pensions=sum(pensions_i) ; by uniqtax, sort: gen gssi=sum(gssi_i) ; by uniqtax, sort: gen transfers=(transfers_i) ; by uniqtax, sort: gen ui=sum(ui_i) ; label var dividends "[TS8] Dividend income" ; label var otherprop "[TS9] Interest, other prop income, no alimony" ; label var pensions "[TS10] Pensions/retirement income" ; label var gssi "[TS11] Social sec, supp sec, survivors, disability bens" ; label var transfers "[TS12] Welfare, WC, Vets, child supp" ; label var ui "[TS17] Unemployment compensation" ; *The following vars are unavailable ; gen rentpaid=0 ; gen otheritem=0 ; gen childcare=0 ; gen mortgage=0 ; label var rentpaid "[TS13] Rent paid - NOT AVAIL FOR CPS" ; label var otheritem "[TS15] Oth itemized deductions - NOT AVAIL FOR CPS" ; label var childcare "[TS16] Child care expenses - NOT AVAIL FOR CPS" ; label var mortgage "[TS19] Mortgage interest paid - NOT AVAIL FOR CPS" ; *The following vars are CPS imputations ; *Property taxes paid. ; *These are originally attached to all persons in the HOUSEHOLD ; *Fix so that values are assigned only to one tax unit within hhold ; *Order of assignment: joint filers, head of hhold, single filers ; *If there are multiple singles in a hhhold with no joint or hhead filers, then split proptax across singles ; gen joint=(flstat<=3) ; gen hhead=(flstat==4) ; gen single=(flstat==5) ; by hhseq, sort: egen anyjoint=max(joint) ; by hhseq, sort: egen anyhhead=max(hhead) ; by hhseq, sort: egen numsingle=sum(single) ; gen proptax=prptax ; replace proptax=0 if flstat==6 ; /* nonfilers */ replace proptax=0 if flstat>=4 & anyjoint==1 ; /* hheads or single filers in hholds with joint filers */ replace proptax=0 if flstat==5 & anyhhead==1 ; /* single filers in hholds with hhhead filers */ replace proptax=proptax/numsingle if flstat==5 & anyjoint==0 & anyhhead==0 ; /* single filers in hhold with no hheads or joint filers */ label var proptax "[TS14] Property tax paid (imputed by CPS)" ; *Capital gains/losses: only one person in joint filing unit has the info - need to attach to both ; /*CHECK DOC: What is up with 99999? Appears to be a topcode - those with 99999 have much higher family income, but there are 22 obs with values above 99999. Approx 12% of individuals are in filing units with positive cap gains imputed. Among these 12%, 90% range from $128 to $20,000, 2% are between $20K and $60K, 8% have 99999, and less than 1% are above 99999. */ *NOTE: all cap gains/losses are assumed to be long-term ; gen stcg=0 ; label var stcg "[TS20] Short-term cap gains - NOT AVAIL FOR CPS" ; by uniqtax, sort: gen famcapg=sum(cpgain) ; by uniqtax, sort: gen famcapl=sum(cploss) ; gen ltcg=famcapg-famcapl ; label var ltcg "[TS21] Long-term cap gains (imputed by CPS)" ; drop famcapg famcapl ; save "${data}mcps06_s0.dta", replace ; ****RUN TAXSIM**** ; taxsim7, full replace ; *Check some of taxsim vars against CPS imputations (note: CPS imputations only applied to one member of joint filers) ; corr agi v10 if agi>0 ; corr agi v10 if agi>0 & agi!=99999 ; corr eitcrd v25 if eitcrd>0 & eitcrd<99999; corr fedtaxbc fiitax if fedtaxbc>0 & fedtaxbc<99999; corr sttaxbc siitax if sttaxbc>0 & sttaxbc<99999; corr taxinc v18 if taxinc>0 & taxinc!=99999; save "${data}mcps06_s0.dta", replace ; log close ;