-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpop_housing.R
451 lines (406 loc) · 23.7 KB
/
pop_housing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
# Script to get total and occupied housing units, and population, by type and county, for 2019
rm(list=ls()) # clear workspace i.e. remove saved variables
cat("\014") # clear console
# make sure plyr is not loaded
library(dplyr)
library(tidyr)
library(ggplot2)
library(RColorBrewer)
setwd("C:/Users/pb637/Documents/Yale Courses/Research/Final Paper/HSM_github")
load('~/Yale Courses/Research/US Housing/Popn Survey/2017 USCB Pop Projections/Hauer files/SSP_asrc/Hauer.RData') # Hauer data, based on 'SSP_asrc.csv' file, accessible at https://osf.io/uh5sj/
ssp1<-tapply(d$SSP1,d$YEAR,sum) # summary populations each 5 years 2020-2010, ssp1
ssp2<-tapply(d$SSP2,d$YEAR,sum) # summary populations each 5 years 2020-2010, ssp2
ssp3<-tapply(d$SSP3,d$YEAR,sum) # summary populations each 5 years 2020-2010, ssp3
ssp4<-tapply(d$SSP4,d$YEAR,sum) # summary populations each 5 years 2020-2010, ssp4
# census bureau total population projection, main series, accessible at https://www2.census.gov/programs-surveys/popproj/datasets/2017/2017-popproj/np2017_d1_mid.csv
cb<-read.csv('~/Yale Courses/Research/US Housing/Popn Survey/2017 USCB Pop Projections/np2017_d1_mid.csv')
cb<-cb[1:45,] # only interested in national totals
cb<-select(cb,YEAR, TOTAL_POP)
cbp<-cb[c(seq(5,45,5)),] # census bureau population projections for same years as Hauer
# census bureau total population projection, main series, accessible at https://www2.census.gov/programs-surveys/popproj/datasets/2017/2017-popproj/np2017_d1_low.csv
cbl<-read.csv('~/Yale Courses/Research/US Housing/Popn Survey/2017 USCB Pop Projections/np2017_d1_low.csv')
cbl<-select(cbl,YEAR, TOTAL_POP)
cblp<-cbl[c(seq(5,45,5)),] # census bureau population projections for same years as Hauer
# actual US population in Jul1 2020 is ~ 329.5 million, close to the USCB low immigration scenario of 330.6, lower than the midrange scenario of 332.6, and lower than the Hauer SSP projections
# SSP1 and SSP2 2020 valuer are 336.7, 335.7. Also lower than the ssp3 ssp4 values of 333.6 and 328.8.
# The Hauer-SSP scenarios most comparable to USCB projections are Hauer ssp2 for USCB mid range, and Hauer ssp4 for USCB low immigration.
s1<-data.frame(Year=cbp$YEAR,Pop= ssp1[1:9],Projection="SSP1")
s2<-data.frame(Year=cbp$YEAR,Pop= ssp2[1:9],Projection="SSP2")
s4<-data.frame(Year=cbp$YEAR,Pop= ssp4[1:9],Projection="SSP4")
cm<-data.frame(Year=cbp$YEAR,Pop=cbp$TOTAL_POP,Projection="USCB-mid")
cl<-data.frame(Year=cblp$YEAR,Pop=cblp$TOTAL_POP,Projection="USCB-low")
pp<-rbind(s1,s2,s4,cm,cl)
cb$Growth<-0
for(i in 2:45) {cb$Growth[i]<-cb$TOTAL_POP[i]-cb$TOTAL_POP[i-1]}
cb$Growth[1]<-NA
cb$delG<-0
for(i in 3:45) {cb$delG[i]<-cb$Growth[i]-cb$Growth[i-1]}
windows()
ggplot(pp,aes(Year,1e-6*Pop,shape=Projection)) + geom_line(aes(col=Projection)) + geom_point(aes(col=Projection),size=2) + #ylim(320,440) +
theme_bw() + scale_shape_manual(values=c(19,19,19,5,5)) + scale_color_brewer(palette = "Dark2") +
labs(title = "(a) US Population Projections", y = "Population (mill)") +
theme(axis.text=element_text(size=11),axis.title=element_text(size=11,face = "bold"),plot.title = element_text(size = 12, face = "bold"),legend.text=element_text(size=11))
windows()
ggplot(cb,aes(YEAR,1e-6*Growth)) + geom_line(col=brewer.pal(n = 5, name = "Dark2")[5],size=1) + theme_bw() + ylim(1,2.5) + xlim(2020,2060) +
labs(title = "(b) Annual Population Growth, USCB Mid-Range", y = "Population (mill)",x="Year") +
theme(axis.text=element_text(size=11),axis.title=element_text(size=11,face = "bold"),plot.title = element_text(size = 12, face = "bold"),legend.text=element_text(size=11))
######### household population by house type, from ACS Table B25033 ############### downloaded from data.census.gov
# 5-year estimate, contains data for all counties in US and Puerto Rico
pocc2019<-read.csv("~/Yale Courses/Research/US Housing/ACS/B25033_2019/ACSDT5Y2019.B25033_data_with_overlays_2020-12-22T132625.csv")
pocc2019$id<-substr(pocc2019$GEO_ID,nchar(pocc2019$GEO_ID)-4,nchar(pocc2019$GEO_ID)) # create the 5-digit county FIPS codes
pocc2019$StID<-substr(pocc2019$id,1,2) # extract the 2-digit State FIPS codes
# clean the data frame column names and remove margin of error columns
name<-names(pocc2019)
rmmar<-which(substr(name,nchar(name),nchar(name))=="M")
pocc2019<-pocc2019[,-rmmar]
pocc2019<-pocc2019[,c(dim(pocc2019)[2]-1,dim(pocc2019)[2],2:(dim(pocc2019)[2]-2))] # reorder columns and remove the redundant long GeoID
n<-as.character(pocc2019[1,])
pocc2019[1,5:16]<-substr(n[5:16],19,nchar(n[5:16]))
n<-as.character(pocc2019[1,])
pocc2019[1,]<-sub("1, detached or attached","1",n)
n<-as.character(pocc2019[1,])
pocc2019[1,]<-gsub(":!!","_",n)
n<-as.character(pocc2019[1,])
pocc2019[1,]<-gsub(":","",n)
n<-as.character(pocc2019[1,])
names(pocc2019)<-n
pocc2019<-pocc2019[-1,]
names(pocc2019)[1:3]<-c("GeoID","StID","County,State")
pocc2019<-pocc2019[1:3142,] # remove counties in Puerto Rico
pocc2019[,4:16]<-as.numeric(unlist(pocc2019[,4:16])) # convert numeric columns to numeric format
# this pocc2019 dataframe gives total household population in 2019 based on the 5yr ACS survey.
# Household population is approx. 97.5% of the resident population, which includes population living in group quarters, and population without homes
# Hauer data represent total resident population, see the description under 'Group Quarters' in Hauer (2019) https://www.nature.com/articles/sdata20195
pocc2019<-pocc2019[order(pocc2019$GeoID),] # order by GeoID
#sum up populations by house type by adding together owner and renter occupied housing populations
pocc2019$SF<-pocc2019$`Owner occupied_1`+pocc2019$`Renter occupied_1`
pocc2019$MF<-pocc2019$`Owner occupied_2 to 4`+pocc2019$`Owner occupied_5 or more`+
pocc2019$`Renter occupied_2 to 4`+pocc2019$`Renter occupied_5 or more`
pocc2019$MH<-pocc2019$`Owner occupied_Mobile home`+pocc2019$`Owner occupied_Boat, RV, van, etc.`+pocc2019$`Renter occupied_Mobile home`+pocc2019$`Renter occupied_Boat, RV, van, etc.`
# pocc2019$Year<-2019
pocc2019<-pocc2019[,c(1:4,17:19)]
colnames(pocc2019)[4:7]<-c("HhPop","SF_Pop","MF_Pop","MH_Pop") # household population by house type
pocc2019[,8:10]<-pocc2019[,5:7]/pocc2019[,4] # express housing units in percentages
colnames(pocc2019)[8:10]<-c("SFpc","MFpc","MHpc")
cid<-unique(d$GEOID)
cid2<-as.numeric(unique(pocc2019$GeoID))
Hmiss<-cid2[!cid2 %in% cid] # which counties does Hauer miss? 5 counties in Alaska. These are also not included by ResStock
# load in census estimates of total resident population, which is a consistent definition with Hauer (including GQ) and the census projections
# census population totals available at https://www2.census.gov/programs-surveys/popest/tables/2010-2019/counties/totals/co-est2019-annres.xlsx
pres1019<-read.csv("~/Yale Courses/Research/US Housing/Census/co-est2019-annres.csv") # converted to csv before reading
pres1019$County.State<-as.character(pres1019$County.State)
# are there string mismatches? (If TRUE, no)
all.equal(pres1019$County.State,pocc2019$County,State)
# locate string mismatches
s<-pres1019$County.State
s2<-pocc2019$`County,State`
# s2<-pocc2019$County.State
s2[!s2 %in% s] # check if any differences
pocc2019[pocc2019$GeoID=="35013",]$`County,State`<-"Dona Ana County, New Mexico"
# combine data frames of household and resident population
p19<-merge(pres1019,pocc2019,by.x = "County.State",by.y="County,State")
p19<-p19[,c(14,1,15,13,16:22)] # extract only GeoID, County.State, StateID, Resident pop in 2019, then hh population total, by type and percentages
p19<-p19[order(p19$GeoID),] # order by GeoID
# adjust spelling of La Salle, LA
p19[p19$GeoID==22059,]$County.State<-"La Salle Parish, Louisiana"
# check population ratios to see if there are any unusual differences, or to see if the matching by county name failed
# generally we should see that the census residential population is larger than the household population
p19$popr<-p19$X2019/p19$HhPop
rownames(p19)<-1:3142
large_under<-p19$County.State[which(p19$popr>1.4)] # counties with ACS estimate of HH pop much less than census estimate of resident pop
# big underestimate in Alaskan counties, OK
# big underestimate in Lincoln Cty AR, two prisons there (Cummins Unit and the Varner Unit)
# big underestimate in Lassen Cty CA, two state prisons
# same in Bent County CO
# Lincoln County CO tiny pop
# Union Cty FL, prison
# Chattahoochee County GA military base
# Stweary GA small pop
# East Carroll Parish, Louisiana, famring population, also small population
# Pershing Cty NV desert county small population
# Harding Cty NM tiny pop
# big differences in household and resident population in Forest Cty PA likely due to the prison there https://en.wikipedia.org/wiki/Forest_County,_Pennsylvania
# Lake Cty TN, prison
# Jones Cty TX, prison
# Williamsburg VA college town
# big difference in Dagget Utah (49009) likely due to the low population there
# big difference in Lexington city VA (51678) likely due to the existence of the Virginia Military Institute there
# otherwise, looks ok
# some of the counties with unusually low estimates of resident population includ"
# Cottle County Texas (low pop)
# Clark County, Idaho (tiny pop)
large_over<-p19$County.State[which(p19$popr<0.85)] # counties with ACS estimate of HH pop much more than census estimate of resident pop
# Kenedy County Texas (tiny pop)
# Piute County, Utah (low pop)
# Logan County, Nebraska (tiny pop)
# scale population by house type, based on resident population
p19$SF_rPop<-p19$SFpc*p19$X2019
p19$MF_rPop<-p19$MFpc*p19$X2019
p19$MH_rPop<-p19$MHpc*p19$X2019
p19$geoid_num<-as.numeric(p19$GeoID)
pocc2019$geoid_num<-as.numeric(pocc2019$GeoID)
# make Hauer ssp2 data consistent with census midrange projections, then further divide all data so 2020 population is equal to 329.5 million, roughly equal to July 1 2020 population from https://www.census.gov/popclock/
# first put Hauer data into county total format
# missing counties,. seems like all related to the creation of https://en.wikipedia.org/wiki/Petersburg_Borough,_Alaska
pocc2019[pocc2019$geoid_num==Hmiss[1],3]
pocc2019[pocc2019$geoid_num==Hmiss[2],3]
pocc2019[pocc2019$geoid_num==Hmiss[3],3]
pocc2019[pocc2019$geoid_num==Hmiss[4],3]
pocc2019[pocc2019$geoid_num==Hmiss[5],3]
ssp2Cty<-d %>%
group_by(GEOID,YEAR) %>%
summarize(pop = sum(SSP2, na.rm = TRUE))
ssp2Cty<-as.data.frame(ssp2Cty) # get rid of annoying list properties
ssp4Cty<-d %>%
group_by(GEOID,YEAR) %>%
summarize(pop = sum(SSP4, na.rm = TRUE))
ssp4Cty<-as.data.frame(ssp4Cty) # get rid of annoying list properties
# To compare end-points of projections in SSP2 SSP4, and CB mid and low-ranage projections:
# SSP2 - 420.1 million, SSP4 - 386.1 million, CBmid - 404.5, CBlow - 376.2.
# The mid-range projection adjusted downward by actual 2020 population is 401.1 million
# I stick with SSP2, adjusted to census mid-range
# population growth ratios 2050-2060 wrt 2020, for introduced AK counties, based on 2100, Haines Borough
prat<-ssp2Cty[ssp2Cty$GEOID==2100,3][2:9]/ssp2Cty[ssp2Cty$GEOID==2100,3][1]
# add in 5 missing counties
r<-matrix(0,9,5)
r[,]<-dim(ssp2Cty)[1]+1:45
# add in data for every 5 years for the AK missing counties, 2020-2060, this adds 45 rows to ssp2cty
for (i in 1:5) {
ssp2Cty[r[1,i],]<-c(Hmiss[i],2020, pres1019[pres1019$County.State==pocc2019[pocc2019$geoid_num==Hmiss[i],3],]$X2019) # cty, year, pop in 2020
ssp2Cty[r[2:9,i],1]<-Hmiss[i] # cty code, for 2025-2060
ssp2Cty[r[2:9,i],2]<-seq(2025,2060,5) # year for 2025-2060
ssp2Cty[r[2:9,i],3]<-ssp2Cty[r[1,i],3]*prat # population, 2025-2060
}
ssp2Cty<-ssp2Cty[order(ssp2Cty$GEOID),] # order by GeoID
# scale census burea mid range population by actual 2020 population recently reported https://www.census.gov/programs-surveys/popest/technical-documentation/research/evaluation-estimates.html
sf<-cbp[1,2]/329484123# scaling factor, census bureau 2020 projection divided by actual 2020 pop
cbp_scale<-cbp
cbp_scale$TOTAL_POP<-cbp$TOTAL_POP/sf # divide cb projection by sf between 2020 projection and 2020 actual, this prodocues a 2060 pop of 400mill
# scaling factor per 5 year period to bring Hauer SSP2 totals in line with mid-range census projections, adjusted to actual 2020 value
sf2<-tapply(ssp2Cty$pop,ssp2Cty$YEAR,sum)[1:9]/cbp_scale$TOTAL_POP
# get data frame of Hauer SSP2 data to 2060 only
ssp2Cty60<-ssp2Cty[ssp2Cty$YEAR<2061,]
# scale Hauer SSP2 to match 2020-adjusted census projections
ssp2Cty60$pop_adj<-ssp2Cty60$pop/as.numeric(sf2)
cbp$TOTAL_POP/ssp2[1:9] # this shows how much the scaled census bureau population projection is lower than Hauer SSP2.
# adjusted population projection per each county for each 5 year
s2t<-ssp2Cty60[,c(1,2,4)]
s2t$GEOID<-as.character(s2t$GEOID)
s2t[nchar(s2t$GEOID)<5,]$GEOID<-paste("0",s2t[nchar(s2t$GEOID)<5,]$GEOID,sep = "")
id9<-rep(p19$GeoID,each=9) # rep each geoID 0 times
# check that the geoids match
all.equal(id9,s2t$GEOID,tolerance=1.0)
s2t$SFp<-s2t$pop_adj*rep(p19$SFpc,each=9) # population in SF homes
s2t$MFp<-s2t$pop_adj*rep(p19$MFpc,each=9) # population in MF homes
s2t$MHp<-s2t$pop_adj*rep(p19$MHpc,each=9) # population in MH homes
## load data of occupied houses, calculate 2019 household size by house type and county #########
load('Intermediate_results/OccHousing2019.RData') # produced by script housing_stock.R
ho19t$sf<-ho19t$Tot_OU/(ho19t$SF+ho19t$MF+ho19t$MH) # scaling factor, ensure house types add up to the total
ho19t$SF<-ho19t$sf*ho19t$SF
ho19t$MF<-ho19t$sf*ho19t$MF
ho19t$MH<-ho19t$sf*ho19t$MH
# extract data for totals by type and cohort, and by type only
ho19<-ho19t[,c(1,5:25)]
names(ho19)[2:22]<-paste(names(ho19)[2:22],"ho",sep="") # housing, occupied
# merge population and occupied housing data
hp19<-merge(p19,ho19,by = 'GeoID')
# load in data of total occupied stock in 2019, and calculate vacancy rate by type for each county
load('Intermediate_results/TotalHousing2019.RData') # produced by script housing_stock.R
h19t<-h19tb_new[,c(1:8,52,10,16:33)] # extract required columns
#make sure things match up, i.e. that totals are the sum of their parts
h19t$SF<-rowSums(h19t[,11:16]) # sf is the sum of sf by cohort
h19t$MF<-rowSums(h19t[,17:22]) # mf is the sum of mf by cohort
h19t$MH<-rowSums(h19t[,23:28]) # mh is the sum of mh by cohort
h19t$Total_HUht<-h19t$SF+h19t$MF+h19t$MH
colnames(h19t)[5:28]<-paste(names(h19t[5:28]),"ht",sep = "")# house, total
# merge occupied housing and population data with total housing data
hpt19<-merge(hp19,h19t,by = 'GeoID')
# colnames(hpt19)[20:22]<-c("SFho","MFho","MHho") # house, occupied
# define occupancy/vacancy rates by type
hpt19$SFvr<-hpt19$SFho/hpt19$SFht
hpt19$MFvr<-hpt19$MFho/hpt19$MFht
hpt19$MFvr[is.nan(hpt19$MFvr)]<-0
hpt19$MHvr<-hpt19$MHho/hpt19$MHht
hpt19$MHvr[is.nan(hpt19$MHvr)]<-0
# Divide resident (_r) population by occupied units to estimate avg household size
hpt19$SFhhs<-hpt19$SF_rPop/hpt19$SFho
hpt19$MFhhs<-hpt19$MF_rPop/hpt19$MFho
hpt19$MHhhs<-hpt19$MH_rPop/hpt19$MHho
# convert NAs to 1. Problem in later calculations if 0.
# also convert any hhs<1 to 1
hpt19$SFhhs[is.nan(hpt19$SFhhs)|hpt19$SFhhs<1]<-1
hpt19$MFhhs[is.nan(hpt19$MFhhs)|hpt19$MFhhs<1]<-1
hpt19$MHhhs[is.nan(hpt19$MHhhs)|hpt19$MHhhs<1]<-1
hpt19$MH_Pop[is.infinite(hpt19$MHhhs)]<-hpt19$MH_rPop[is.infinite(hpt19$MHhhs)]<-0 # exception for Alexandria City VA, which had population of 108 in MH, but no MH units
hpt19$MHhhs[is.infinite(hpt19$MHhhs)]<-1 # exception for Alexandria City VA, which had population of 108 in MH, but no MH units
# recalc occupancy/vacancy rates by type
hpt19$SFvr<-hpt19$SFho/hpt19$SFht
hpt19$MFvr<-hpt19$MFho/hpt19$MFht
hpt19$MFvr[is.nan(hpt19$MFvr)]<-0
hpt19$MHvr<-hpt19$MHho/hpt19$MHht
hpt19$MHvr[is.nan(hpt19$MHvr)]<-0
# split total housing units into percentages by type, cohort, and occ/vac
h19pc<-hpt19[,1:2]
h19pc$Tot_Hous_Units<-hpt19$SFht+hpt19$MFht+hpt19$MHht
h19pc[,c("Tot_HU_SF","Tot_HU_MF","Tot_HU_MH")]<-hpt19[,c("SFht","MFht","MHht")]
h19pc[,c("pcTot_HU_SF_Occ_p1940","pcTot_HU_SF_Occ_1940_59","pcTot_HU_SF_Occ_1960_79","pcTot_HU_SF_Occ_1980_99",
"pcTot_HU_SF_Occ_2000_09","pcTot_HU_SF_Occ_2010_19")]<-hpt19[,17:22]/h19pc$Tot_HU_SF
h19pc[,c("pcTot_HU_SF_Occ_2020_29","pcTot_HU_SF_Occ_2030_39","pcTot_HU_SF_Occ_2040_49","pcTot_HU_SF_Occ_2050_60")]<-0
sfvac<-hpt19[,47:52]-hpt19[,17:22] # vacant units = total units (by cohort) - occupied units (by cohort)
sfvact<-rowSums(sfvac)
sfvac[sfvac<0]<-0 # no negative vacancy values
sfvacpc<-sfvac/sfvact
sfvacpcsum<-rowSums(sfvacpc)
# fix potential infeasible percentage vacancy values
for (i in 1:length(sfvacpcsum)){
if (sfvacpcsum[i]<0) {sfvacpc[i,]<-0} # none less than zero
if (sfvacpcsum[i]>1) {sfvacpc[i,]<-sfvacpc[i,]/sfvacpcsum[i]} # none greater than 1 (100%)
}
sfvac<-sfvacpc*sfvact
h19pc[,c("pcTot_HU_SF_Vac_p1940","pcTot_HU_SF_Vac_1940_59","pcTot_HU_SF_Vac_1960_79","pcTot_HU_SF_Vac_1980_99",
"pcTot_HU_SF_Vac_2000_09","pcTot_HU_SF_Vac_2010_19")]<-sfvac/h19pc$Tot_HU_SF
h19pc[,c("pcTot_HU_SF_Vac_2020_29","pcTot_HU_SF_Vac_2030_39","pcTot_HU_SF_Vac_2040_49","pcTot_HU_SF_Vac_2050_60")]<-0
# MF
h19pc[,c("pcTot_HU_MF_Occ_p1940","pcTot_HU_MF_Occ_1940_59","pcTot_HU_MF_Occ_1960_79","pcTot_HU_MF_Occ_1980_99",
"pcTot_HU_MF_Occ_2000_09","pcTot_HU_MF_Occ_2010_19")]<-hpt19[,23:28]/h19pc$Tot_HU_MF
h19pc[,c("pcTot_HU_MF_Occ_2020_29","pcTot_HU_MF_Occ_2030_39","pcTot_HU_MF_Occ_2040_49","pcTot_HU_MF_Occ_2050_60")]<-0
mfvac<-hpt19[,53:58]-hpt19[,23:28] # vacant units = total units (by cohort) - occupied units (by cohort)
mfvact<-rowSums(mfvac)
mfvac[mfvac<0]<-0
mfvacpc<-mfvac/mfvact
mfvacpc[is.na(mfvacpc)]<-0
mfvacpcsum<-rowSums(mfvacpc)
# fix potential infeasible percentage vacancy values
for (i in 1:length(mfvacpcsum)){
if (mfvacpcsum[i]<0) {mfvacpc[i,]<-0} # none less than zero
if (mfvacpcsum[i]>1) {mfvacpc[i,]<-mfvacpc[i,]/mfvacpcsum[i]} # none greater than 1 (100%)
}
mfvac<-mfvacpc*mfvact
h19pc[,c("pcTot_HU_MF_Vac_p1940","pcTot_HU_MF_Vac_1940_59","pcTot_HU_MF_Vac_1960_79","pcTot_HU_MF_Vac_1980_99",
"pcTot_HU_MF_Vac_2000_09","pcTot_HU_MF_Vac_2010_19")]<-mfvac/h19pc$Tot_HU_MF
h19pc[,c("pcTot_HU_MF_Vac_2020_29","pcTot_HU_MF_Vac_2030_39","pcTot_HU_MF_Vac_2040_49","pcTot_HU_MF_Vac_2050_60")]<-0
# MH
h19pc[,c("pcTot_HU_MH_Occ_p1940","pcTot_HU_MH_Occ_1940_59","pcTot_HU_MH_Occ_1960_79","pcTot_HU_MH_Occ_1980_99",
"pcTot_HU_MH_Occ_2000_09","pcTot_HU_MH_Occ_2010_19")]<-hpt19[,29:34]/h19pc$Tot_HU_MH
h19pc[,c("pcTot_HU_MH_Occ_2020_29","pcTot_HU_MH_Occ_2030_39","pcTot_HU_MH_Occ_2040_49","pcTot_HU_MH_Occ_2050_60")]<-0
mhvac<-hpt19[,59:64]-hpt19[,29:34] # vacant units = total units (by cohort) - occupied units (by cohort)
mhvact<-rowSums(mhvac)
mhvac[mhvac<0]<-0
mhvacpc<-mhvac/mhvact
mhvacpc[is.na(mhvacpc)]<-0
mhvacpcsum<-rowSums(mhvacpc)
# fix potential infeasible percentage vacancy values
for (i in 1:length(mhvacpcsum)){
if (mhvacpcsum[i]<0) {mhvacpc[i,]<-0} # none less than zero
if (mhvacpcsum[i]>1) {mhvacpc[i,]<-mhvacpc[i,]/mhvacpcsum[i]} # none greater than 1 (100%)
}
mhvac<-mhvacpc*mhvact
h19pc[,c("pcTot_HU_MH_Vac_p1940","pcTot_HU_MH_Vac_1940_59","pcTot_HU_MH_Vac_1960_79","pcTot_HU_MH_Vac_1980_99",
"pcTot_HU_MH_Vac_2000_09","pcTot_HU_MH_Vac_2010_19")]<-mhvac/h19pc$Tot_HU_MH
h19pc[,c("pcTot_HU_MH_Vac_2020_29","pcTot_HU_MH_Vac_2030_39","pcTot_HU_MH_Vac_2040_49","pcTot_HU_MH_Vac_2050_60")]<-0
# remove nas
h19pc[7:66][is.na(h19pc[7:66])]<-0
# add in vacancy rates and population data to h19pc
h19pc$VR_SF<-rowSums(h19pc[,7:26])/rowSums(h19pc[,7:16])
h19pc$VR_SF[is.na(h19pc$VR_SF)]<-1
h19pc$VR_MF<-rowSums(h19pc[,27:46])/rowSums(h19pc[,27:36])
h19pc$VR_MF[is.na(h19pc$VR_MF)]<-1
h19pc$VR_MF[is.infinite(h19pc$VR_MF)]<-1
h19pc$VR_MH<-rowSums(h19pc[,47:66])/rowSums(h19pc[,47:56])
h19pc$VR_MH[is.na(h19pc$VR_MH)]<-1
h19pc$VR_MH[is.infinite(h19pc$VR_MH)]<-1
h19pc$Pop_SF<-hpt19$SF_rPop
h19pc$Pop_MF<-hpt19$MF_rPop
h19pc$Pop_MH<-hpt19$MH_rPop
h19pc$HHS_SF<-hpt19$SFhhs
h19pc$HHS_MF<-hpt19$MFhhs
h19pc$HHS_MH<-hpt19$MHhhs
# start to define the housing stock and population data based on the 2019 percentage values scaled by 2020 absolute values
h20pc<-h19pc
s2t20<-s2t[s2t$YEAR==2020,] # 2020 adjusted county population data
h20pc$Pop_SF<-s2t20$SFp
h20pc$Pop_MF<-s2t20$MFp
h20pc$Pop_MH<-s2t20$MHp
# in many counties (around 2,253 out of 3,142) the population shows a decline between 2019 and 2020: length(which(h20pc$Pop_SF<h19pc$Pop_SF))
# this method for estimating total units in 2020 assumes that vacancy rates stay the same (and thus total units decline), rather than an increase in vacancy rates and/or a decrease in household size
h20pc$Tot_HU_SF<-h20pc$Pop_SF/h20pc$HHS_SF*h20pc$VR_SF
h20pc$Tot_HU_MF<-h20pc$Pop_MF/h20pc$HHS_MF*h20pc$VR_MF
h20pc$Tot_HU_MH<-h20pc$Pop_MH/h20pc$HHS_MH*h20pc$VR_MH
h20pc$Tot_Hous_Units<-h20pc$Tot_HU_SF+h20pc$Tot_HU_MF+h20pc$Tot_HU_MH
save(h20pc,file = "Intermediate_results/InitStock20.RData") # this is used as an input to hsm_cty.R
# This will serve as the data for initial HS by type, cohort, and vacancy status in the county HSM
# next step is to create population and HHS every year by type and cohort, for each county. I think that's already partially/fully done in the script below
s2t[,c("Pop_Share_SF","Pop_Share_MF","Pop_Share_MH")]<-s2t[,c("SFp","MFp","MHp")]/s2t$pop_adj
# average national household size reductions, from JHCS, see Table 1 in Yale Courses\Research\US Housing\Popn Survey\JCHS\Harvard_JCHS_McCue_Household_Projections_AppendixTables.xlsx
h<-data.frame("Year" = seq(2020,2060,5), hhs =c(2.5515,2.5213,2.4984,2.4822,2.4723,2.4679,2.465,2.4625,2.460))
h$hr<-1 # househod size ratio, based on 2020 value
h$hr[2:9]<-h$hhs[2:9]/h$hhs[1]
# apply average reductions in household size to each house type in all counties, but don't let household size go below 1.
s2t$HHS_SF<-rep(h20pc$HHS_SF,each=9)*h$hr
s2t$HHS_SF[s2t$HHS_SF<1]<-1 # don't go below HHS of 1
s2t$HHS_MF<-rep(h20pc$HHS_MF,each=9)*h$hr
s2t$HHS_MF[s2t$HHS_MF<1]<-1 # don't go below HHS of 1
s2t$HHS_MH<-rep(h20pc$HHS_MH,each=9)*h$hr
s2t$HHS_MH[s2t$HHS_MH<1]<-1 # don't go below HHS of 1
# interpolate with spline function to create population and hhs for every year for each county ############
cid2<-as.character(cid2)
cid2[nchar(cid2)<5]<-paste("0",cid2[nchar(cid2)<5],sep = "")
for (k in 1:3142) { print(k)
re<-s2t[s2t$GEOID==cid2[k],][1,]
re[2:41,1]<-rep(re[1,1],40) # copy geoid
re$YEAR[2:41]<-2021:2060 # insert year
y5<-c(seq(6,41,5)) # extract the 5 years
re$pop_adj[y5]<-s2t[s2t$GEOID==cid2[k],]$pop_adj[2:9]
re$HHS_SF[y5]<-s2t[s2t$GEOID==cid2[k],]$HHS_SF[2:9]
re$HHS_MF[y5]<-s2t[s2t$GEOID==cid2[k],]$HHS_MF[2:9]
re$HHS_MH[y5]<-s2t[s2t$GEOID==cid2[k],]$HHS_MH[2:9]
# loop to fill in between 5 years for population and household size. Will need to add in vacancy rate here too when/if it changes
# fill in total pop
sd <- data.frame(
with(select(s2t[s2t$GEOID==cid2[k],],YEAR,pop_adj),
spline(YEAR, pop_adj, xout = seq(2020, 2060, by = 1))
),
method = "spline()"
)
re$pop_adj<-sd$y
# now for SF hhs
sd <- data.frame(
with(select(s2t[s2t$GEOID==cid2[k],],YEAR,HHS_SF),
spline(YEAR, HHS_SF, xout = seq(2020, 2060, by = 1))
),
method = "spline()"
)
re$HHS_SF<-sd$y
# now for MF hhs
sd <- data.frame(
with(select(s2t[s2t$GEOID==cid2[k],],YEAR,HHS_MF),
spline(YEAR, HHS_MF, xout = seq(2020, 2060, by = 1))
),
method = "spline()"
)
re$HHS_MF<-sd$y
# now for MH hhs
sd <- data.frame(
with(select(s2t[s2t$GEOID==cid2[k],],YEAR,HHS_MH),
spline(YEAR, HHS_MH, xout = seq(2020, 2060, by = 1))
),
method = "spline()"
)
re$HHS_MH<-sd$y
# Assuming static share of households among types
re$Pop_Share_SF<-re$Pop_Share_SF[1]
re$Pop_Share_MF<-re$Pop_Share_MF[1]
re$Pop_Share_MH<-re$Pop_Share_MH[1]
re$SFp[2:41]<-re$pop_adj[2:41]*re$Pop_Share_SF[2:41]
re$MFp[2:41]<-re$pop_adj[2:41]*re$MFp[1]/re$pop_adj[1]
re$MHp[2:41]<-re$pop_adj[2:41]*re$MHp[1]/re$pop_adj[1]
if (k==1) {
sall<-re
} else { sall<-bind_rows(sall,re)
}
}
# reorder, rename, nest and save #######
names(sall)[3:6]<-c("Population","Pop_SF","Pop_MF","Pop_MH")
sall<-sall[,c(1:3,7:9,4:6,10:12)]
ss<-sall %>% group_by(GEOID) %>% nest()
save(ss,file= "Intermediate_results/CountyProjections.RData") # this is used as an input to hsm_cty.R