model_experiments.py
from covid_constants_and_util import *
from disease_model import Model
import matplotlib.ticker as ticker
from matplotlib import cm
import helper_methods_for_aggregate_data_analysis as helper
import seaborn as sns
import copy
from collections import Counter
import pickle
import re
import sys
import getpass
from traceback import print_exc
import socket
import psutil
import json
import subprocess
import multiprocessing
import IPython
from scipy.stats import scoreatpercentile, poisson, binom
from scipy.special import logsumexp
from psutil._common import bytes2human
from scipy.stats import ttest_ind, rankdata
from scipy.sparse import hstack, csr_matrix
import argparse
###################################################
# Loss functions
###################################################
def MRE(y_true, y_pred):
'''
Computes the median relative error (MRE). y_true and y_pred should
both be numpy arrays.
If y_true and y_pred are 1D, the MRE is returned.
If y_true and y_pred are 2D, e.g., predictions over multiple seeds,
the MRE is computed per row, then averaged.
'''
abs_err = np.absolute(y_true - y_pred)
rel_err = abs_err / y_true
if len(abs_err.shape) == 1: # this implies y_true and y_pred are 1D
mre = np.median(rel_err)
else: # this implies at least one of them is 2D
mre = np.mean(np.median(rel_err, axis=1))
return mre
def RMSE(y_true, y_pred):
'''
Computes the root mean squared error (RMSE). y_true and y_pred should
both be numpy arrays.
If y_true and y_pred are 1D, the RMSE is returned.
If y_true and y_pred are 2D, e.g., predictions over multiple seeds,
the RMSE is computed per row, then averaged.
'''
sq_err = (y_true - y_pred) ** 2
if len(sq_err.shape) == 1: # this implies y_true and y_pred are 1D
rmse = np.sqrt(np.mean(sq_err))
else: # this implies at least one of them is 2D
rmse = np.sqrt(np.mean(sq_err, axis=1))
rmse = np.mean(rmse)
return rmse
def MSE(y_true, y_pred):
'''
Computes the mean squared error (MSE). y_true and y_pred should
both be numpy arrays.
'''
return np.mean((y_true - y_pred) ** 2)
def poisson_NLL(y_true, y_pred, sum_or_logsumexp):
# We clip variance to a min of 4, similar to Li et al. (2020)
# First sum log-likelihoods over days
variance = np.clip(y_pred, 4, None)
ll = np.sum(poisson.logpmf(y_true, variance), axis=1)
# Then sum or logsumexp over seeds
ll = sum_or_logsumexp(ll)
return -ll
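# Illustrative sketch (not part of the original pipeline): shows how the loss functions above
# handle 2D inputs of shape (num_seeds, num_days). Assumes np is in scope via the
# covid_constants_and_util star import and uses scipy's logsumexp imported above.
def _example_loss_functions():
    y_true = np.array([[100, 200, 300],
                       [100, 200, 300]])          # observed counts, repeated for each seed
    y_pred = np.array([[110., 180., 330.],
                       [ 90., 210., 270.]])       # two seeds' predictions over three days
    print('MRE = %.3f' % MRE(y_true, y_pred))     # per-seed median relative error, averaged over seeds
    print('RMSE = %.2f' % RMSE(y_true, y_pred))   # per-seed RMSE, averaged over seeds
    # poisson_NLL sums log-likelihoods over days, then aggregates over seeds with np.sum or logsumexp
    print('NLL (sum) = %.2f' % poisson_NLL(y_true, y_pred, np.sum))
    print('NLL (logsumexp) = %.2f' % poisson_NLL(y_true, y_pred, logsumexp))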
###################################################
# Code for running one model
###################################################
def fit_disease_model_on_real_data(d,
min_datetime,
max_datetime,
exogenous_model_kwargs,
poi_attributes_to_clip,
msa_name=None,
preload_poi_visits_list_filename=None,
poi_cbg_visits_list=None,
poi_ids=None,
cbg_ids=None,
cbg_init_mode='cases',
correct_poi_visits=True,
multiply_poi_visit_counts_by_census_ratio=True,
aggregate_home_cbg_col='aggregated_cbg_population_adjusted_visitor_home_cbgs',
poi_hourly_visits_cutoff='all',
cbg_count_cutoff=10,
cbgs_to_filter_for=None,
cbg_groups_to_track=None,
counties_to_track=None,
include_cbg_prop_out=False,
include_inter_cbg_travel=False,
include_mask_use=True,
model_init_kwargs=None,
simulation_kwargs=None,
counterfactual_poi_opening_experiment_kwargs=None,
counterfactual_retrospective_experiment_kwargs=None,
return_model_without_fitting=False,
attach_data_to_model=False,
model_quality_dict=None,
verbose=True):
"""
Function to prepare data as input for the disease model, and to run the disease simulation on formatted data.
d: pandas DataFrame; POI data from SafeGraph
min_datetime, max_datetime: DateTime objects; the first and last hour to simulate
exogenous_model_kwargs: dict; extra arguments for Model.init_exogenous_variables()
        required keys: poi_psi and home_beta; p_sick_at_t0 may be omitted or None, in which case initial
        E/I/R counts per CBG are loaded from the SEIR initialization files under PATH_TO_SEIR_INIT
poi_attributes_to_clip: dict; which POI attributes to clip
required keys: clip_areas, clip_dwell_times, clip_visits
preload_poi_visits_list_filename: str; name of file from which to load precomputed hourly networks
poi_cbg_visits_list: list of sparse matrices; precomputed hourly networks
correct_poi_visits: bool; whether to correct hourly visit counts with dwell time
multiply_poi_visit_counts_by_census_ratio: bool; whether to upscale visit counts by a constant factor
derived using Census data to try to get real visit volumes
    aggregate_home_cbg_col: str; the field that holds the aggregated CBG proportions for each POI
cbg_count_cutoff: int; the minimum number of POIs a CBG must visit to be included in the model
cbgs_to_filter_for: list; only model CBGs in this list
cbg_groups_to_track: dict; maps group name to CBGs, will track their disease trajectories during simulation
counties_to_track: list; names of counties, will track their disease trajectories during simulation
include_cbg_prop_out: bool; whether to adjust the POI-CBG network based on Social Distancing Metrics (SDM);
should only be used if precomputed poi_cbg_visits_list is not in use
model_init_kwargs: dict; extra arguments for initializing Model
simulation_kwargs: dict; extra arguments for Model.simulate_disease_spread()
counterfactual_poi_opening_experiment_kwargs: dict; arguments for POI category reopening experiments
counterfactual_retrospective_experiment_kwargs: dict; arguments for counterfactual mobility reduction experiment
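    msa_name: str; name of the MSA; required for the v2 counterfactual reopening experiments
    poi_ids, cbg_ids: lists; pre-specified POI and CBG ids to model, bypassing the filtering steps below
    cbg_init_mode: str; prefix of the columns in the SEIR initialization file used for initial E/I/R counts
    poi_hourly_visits_cutoff: 'all' or int; minimum number of non-missing hourly visit values a POI must have
    include_inter_cbg_travel: bool; whether to give the model daily per-capita inter-CBG travel from SDM
    include_mask_use: bool; whether to give the model daily mask use data for the most common state in d
    return_model_without_fitting: bool; if True, return the initialized model without simulating disease spread
    attach_data_to_model: bool; if True, attach the processed POI dataframe to the model as m.d
    verbose: bool; whether to print detailed progress information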
"""
assert min_datetime <= max_datetime
assert all([k in exogenous_model_kwargs for k in ['poi_psi', 'home_beta']])
assert all([k in poi_attributes_to_clip for k in ['clip_areas', 'clip_dwell_times', 'clip_visits']])
assert all([k in d.columns for k in ['region', 'sub_category', 'safegraph_computed_area_in_square_feet']])
assert aggregate_home_cbg_col in ['aggregated_cbg_population_adjusted_visitor_home_cbgs',
'aggregated_visitor_home_cbgs']
if cbg_groups_to_track is None:
cbg_groups_to_track = {}
if model_init_kwargs is None:
model_init_kwargs = {}
if simulation_kwargs is None:
simulation_kwargs = {}
if preload_poi_visits_list_filename is not None:
f = open(preload_poi_visits_list_filename, 'rb')
poi_cbg_visits_list = pickle.load(f)
f.close()
t0 = time.time()
print('1. Processing SafeGraph data...')
# get hours and check hourly visit info
all_hours = helper.list_hours_in_range(min_datetime, max_datetime)
print("Found %d hours in all (%s to %s)" % (len(all_hours),
get_datetime_hour_as_string(min_datetime),
get_datetime_hour_as_string(max_datetime)))
if poi_cbg_visits_list is not None:
assert len(poi_cbg_visits_list) == len(all_hours)
hour_cols = ['hourly_visits_%s' % get_datetime_hour_as_string(dt) for dt in all_hours]
if poi_cbg_visits_list is None: # don't need hourly visits in dataframe otherwise
assert(all([col in d.columns for col in hour_cols]))
model_days = helper.list_datetimes_in_range(min_datetime, max_datetime)
home_beta = exogenous_model_kwargs['home_beta']
if type(home_beta) in {np.ndarray, list}:
if len(home_beta) == 2: # start and end points
home_beta = np.linspace(home_beta[0], home_beta[1], len(model_days)) # increment daily
exogenous_model_kwargs['home_beta'] = home_beta
else: # should be daily
assert len(home_beta) == len(model_days)
# aggregate median_dwell time over weeks
if 'avg_median_dwell' not in d.columns:
weekly_median_dwell_pattern = re.compile('2020-\d\d-\d\d.median_dwell')
median_dwell_cols = [col for col in d.columns if re.match(weekly_median_dwell_pattern, col)]
print('Taking median over median_dwell from %s to %s' % (median_dwell_cols[0], median_dwell_cols[-1]))
# note: this may trigger "RuntimeWarning: All-NaN slice encountered" if a POI has all nans for median_dwell;
# this is not a problem and will be addressed during clipping and/or POI dropping
d['avg_median_dwell'] = d[median_dwell_cols].median(axis=1).values
# clip before dropping data so we have more POIs as basis for percentiles
# this will also drop POIs whose sub and top categories are too small for clipping
poi_attributes_to_clip = poi_attributes_to_clip.copy() # copy in case we need to modify
if poi_cbg_visits_list is not None:
poi_attributes_to_clip['clip_visits'] = False
print('Precomputed POI-CBG networks were passed in; will NOT be clipping hourly visits in dataframe')
if poi_attributes_to_clip['clip_areas'] or poi_attributes_to_clip['clip_dwell_times'] or poi_attributes_to_clip['clip_visits']:
d, categories_to_clip, cols_to_clip, thresholds, medians = clip_poi_attributes_in_msa_df(
d, min_datetime, max_datetime, **poi_attributes_to_clip)
print('After clipping, %i POIs' % len(d))
# filter POIs
if poi_ids is None:
d = d.loc[d[aggregate_home_cbg_col].map(lambda x:len(x.keys()) > 0)]
if verbose: print("After dropping for missing CBG home data, %i POIs" % len(d))
d = d.dropna(subset=['avg_median_dwell'])
if verbose: print("After dropping for missing avg_median_dwell, %i POIs" % len(d))
d = d.dropna(subset=['safegraph_computed_area_in_square_feet'])
if verbose: print("After dropping for missing area, %i POIs" % len(d))
curr_num_visits = np.nansum(d[hour_cols].values)
if poi_hourly_visits_cutoff == 'all' or poi_hourly_visits_cutoff >= len(hour_cols): # POI must have non-missing hourly visits data for every hour to be included
d = d.dropna(subset=hour_cols)
new_num_visits = np.sum(d[hour_cols].values)
if verbose: print("After dropping for missing any hours, %i POIs; kept %.2f%% of visits" %
(len(d), 100. * new_num_visits / curr_num_visits))
else: # cutoff based on simulation hours
assert poi_hourly_visits_cutoff >= 0
num_nonnan_hours = np.sum(~pd.isnull(d[hour_cols]), axis=1)
poi_passes = num_nonnan_hours >= poi_hourly_visits_cutoff
d = d.loc[poi_passes]
fill_with_0 = {k:0 for k in hour_cols}
d = d.fillna(value=fill_with_0)
new_num_visits = np.sum(d[hour_cols].values)
if verbose: print("After dropping for having less than %d hours of data, %i POIs; kept %.2f%% of visits" %
(poi_hourly_visits_cutoff, len(d), 100. * new_num_visits / curr_num_visits))
else:
pois_in_df = set(d.index)
n_missing = len(set(poi_ids) - pois_in_df)
print('Received %d pre-specified POI ids -> missing %d in dataframe' % (len(poi_ids), n_missing))
assert n_missing == 0 # all poi_ids should be in df
d = d.loc[poi_ids]
assert len(d) == len(poi_ids)
if poi_cbg_visits_list is None:
is_null = pd.isnull(d[hour_cols]).values
print('%d / %d hours are null -> filling with 0' % (np.sum(is_null), len(hour_cols) * len(d)))
fill_with_0 = {k:0 for k in hour_cols}
d = d.fillna(value=fill_with_0)
else:
assert poi_cbg_visits_list[0].shape[0] == len(poi_ids)
M = len(d)
# filter CBGs
poi_cbg_proportions = d[aggregate_home_cbg_col].values # an array of dicts; each dict represents CBG distribution for POI
acs_d = helper.load_and_reconcile_multiple_acs_data()
cbgs_to_census_pops = dict(zip(acs_d['census_block_group'].values,
acs_d['total_cbg_population_2018_1YR'].values)) # use most recent population data
if cbg_ids is None:
all_cbgs = [a for b in poi_cbg_proportions for a in b.keys()]
cbg_counts = Counter(all_cbgs).most_common()
all_unique_cbgs = [cbg for cbg, count in cbg_counts if count >= cbg_count_cutoff] # only keep CBGs that have visited at least this many POIs
if verbose: print("After dropping CBGs that appear in < %i POIs, %i CBGs (%2.1f%%)" %
(cbg_count_cutoff, len(all_unique_cbgs), 100.*len(all_unique_cbgs)/len(cbg_counts)))
if cbgs_to_filter_for is not None:
all_unique_cbgs = [a for a in all_unique_cbgs if a in cbgs_to_filter_for]
print("After filtering for CBGs in MSA, %i CBGs" % len(all_unique_cbgs))
all_unique_cbgs = [cbg for cbg in all_unique_cbgs if cbgs_to_census_pops[cbg] > 0]
if verbose: print('After dropping CBGs with population size 0 in ACS data, %i CBGs' % len(all_unique_cbgs))
all_unique_cbgs = sorted(all_unique_cbgs) # order CBGs lexicographically
else:
print('Received %d pre-specified CBG ids' % len(cbg_ids))
all_unique_cbgs = cbg_ids
N = len(all_unique_cbgs)
cbgs_to_idxs = dict(zip(all_unique_cbgs, range(N)))
print('FINAL: number of CBGs (N) = %d, number of POIs (M) = %d' % (N, M))
# convert data structures with CBG names to CBG indices
poi_cbg_proportions_mat = np.zeros((M, N))
for poi_idx, old_dict in enumerate(poi_cbg_proportions):
for string_key, prop in old_dict.items():
if string_key in cbgs_to_idxs:
int_key = cbgs_to_idxs[string_key]
poi_cbg_proportions_mat[poi_idx, int_key] = prop
E = np.sum(poi_cbg_proportions_mat > 0)
print('Num connected POI-CBG pairs (E) = %d, network density (E/N) = %.3f' %
(E, E / N)) # avg num adjacent POIs per CBG
if poi_cbg_visits_list is not None:
expected_M, expected_N = poi_cbg_visits_list[0].shape
assert M == expected_M
assert N == expected_N
cbg_idx_groups_to_track = {}
for group in cbg_groups_to_track:
cbg_idx_groups_to_track[group] = [
cbgs_to_idxs[a] for a in cbg_groups_to_track[group] if a in cbgs_to_idxs]
if verbose: print(f'{len(cbg_groups_to_track[group])} CBGs in {group} -> matched {len(cbg_idx_groups_to_track[group])} ({(len(cbg_idx_groups_to_track[group]) / len(cbg_groups_to_track[group])):.3f})')
# get POI-related variables
all_states = sorted(list(set(d['region'].dropna())))
poi_subcategory_types = d['sub_category'].values
poi_areas = d['safegraph_computed_area_in_square_feet'].values
poi_dwell_times = d['avg_median_dwell'].values
poi_dwell_time_correction_factors = (poi_dwell_times / (poi_dwell_times+60)) ** 2
print('Dwell time correction factors: mean = %.2f, min = %.2f, max = %.2f' %
(np.mean(poi_dwell_time_correction_factors), min(poi_dwell_time_correction_factors), max(poi_dwell_time_correction_factors)))
if poi_cbg_visits_list is None:
poi_time_counts = d[hour_cols].values
else:
poi_time_counts = None # don't need poi_time_counts if precomputed is provided
if correct_poi_visits: # applying correction to visits so that they represent number of visitors present per hour,
# not number of visits arriving per hour
if poi_cbg_visits_list is not None:
print('Precomputed POI-CBG networks were passed in; will NOT be applying dwell-time-based correction to hourly visits in dataframe')
else:
print('Correcting POI hourly visit vectors...')
new_poi_time_counts = []
for i, (visit_vector, dwell_time) in enumerate(list(zip(poi_time_counts, poi_dwell_times))):
new_poi_time_counts.append(correct_visit_vector(visit_vector, dwell_time))
poi_time_counts = np.array(new_poi_time_counts)
d[hour_cols] = poi_time_counts
new_hourly_visit_count = np.sum(poi_time_counts)
print('After correcting, %.2f hourly visits' % new_hourly_visit_count)
if multiply_poi_visit_counts_by_census_ratio: # scale visits based on undersampling
if poi_cbg_visits_list is not None:
print('Precomputed POI-CBG networks were passed in; will NOT be applying undersampling correction to hourly visits in dataframe')
else:
# Get overall undersampling factor.
# Basically we take ratio of ACS US population to SafeGraph population in Feb 2020.
# SafeGraph thinks this is reasonable.
# https://safegraphcovid19.slack.com/archives/C0109NPA543/p1586801883190800?thread_ts=1585770817.335800&cid=C0109NPA543
total_us_population_in_50_states_plus_dc = acs_d.loc[acs_d['state_code'].map(lambda x:x in FIPS_CODES_FOR_50_STATES_PLUS_DC), 'total_cbg_population_2018_1YR'].sum()
safegraph_visitor_count_df = pd.read_csv('/dfs/scratch1/safegraph_homes/all_aggregate_data/20191213-safegraph-aggregate-longitudinal-data-to-unzip-to/SearchofAllRecords-CORE_POI-GEOMETRY-PATTERNS-2020_02-2020-03-16/visit_panel_summary.csv')
safegraph_visitor_count = safegraph_visitor_count_df.loc[safegraph_visitor_count_df['state'] == 'ALL_STATES', 'num_unique_visitors'].iloc[0]
# remove a few safegraph visitors from non-US states.
two_letter_codes_for_states = set([a.lower() for a in codes_to_states if codes_to_states[a] in JUST_50_STATES_PLUS_DC])
safegraph_visitor_count_to_non_states = safegraph_visitor_count_df.loc[safegraph_visitor_count_df['state'].map(lambda x:x not in two_letter_codes_for_states and x != 'ALL_STATES'), 'num_unique_visitors'].sum()
if verbose:
print("Removing %2.3f%% of people from SafeGraph count who are not in 50 states or DC" %
(100. * safegraph_visitor_count_to_non_states/safegraph_visitor_count))
safegraph_visitor_count = safegraph_visitor_count - safegraph_visitor_count_to_non_states
correction_factor = 1. * total_us_population_in_50_states_plus_dc / safegraph_visitor_count
if verbose:
print("Total US population from ACS: %i; total safegraph visitor count: %i; correction factor for POI visits is %2.3f" %
(total_us_population_in_50_states_plus_dc,
safegraph_visitor_count,
correction_factor))
poi_time_counts = poi_time_counts * correction_factor
d[hour_cols] = poi_time_counts
# get CBG-related variables from census data
print('2. Processing ACS data...')
cbg_sizes = np.array([cbgs_to_census_pops[a] for a in all_unique_cbgs])
assert np.sum(np.isnan(cbg_sizes)) == 0
if verbose:
print('CBGs: median population size = %d, sum of population sizes = %d' %
(np.median(cbg_sizes), np.sum(cbg_sizes)))
if counties_to_track is not None:
print('Found %d counties to track...' % len(counties_to_track))
county2cbgs = {}
for county in counties_to_track:
county_cbgs = acs_d[acs_d['county_code'] == county]['census_block_group'].values
orig_len = len(county_cbgs)
county_cbgs = sorted(set(county_cbgs).intersection(set(all_unique_cbgs)))
if orig_len > 0:
coverage = len(county_cbgs) / orig_len
if coverage < 0.8:
print('Low coverage warning: only modeling %d/%d (%.1f%%) of the CBGs in %s' %
(len(county_cbgs), orig_len, 100. * coverage, county))
if len(county_cbgs) > 0:
county_cbg_idx = np.array([cbgs_to_idxs[a] for a in county_cbgs])
county2cbgs[county] = (county_cbgs, county_cbg_idx)
cbg_idx_groups_to_track[county] = county_cbg_idx
print('Tracking infection trajectories from %d of the counties' % len(county2cbgs))
else:
county2cbgs = None
# turn off warnings temporarily so that using > or <= on np.nan does not cause warnings
np.warnings.filterwarnings('ignore')
cbg_idx_to_track = set(range(N)) # include all CBGs
for attribute in ['p_black', 'p_white', 'median_household_income']:
attr_col_name = '%s_2017_5YR' % attribute # using 5-year ACS data for attributes bc less noisy
assert attr_col_name in acs_d.columns
mapper_d = dict(zip(acs_d['census_block_group'].values, acs_d[attr_col_name].values))
attribute_vals = np.array([mapper_d[a] if a in mapper_d and cbgs_to_idxs[a] in cbg_idx_to_track else np.nan for a in all_unique_cbgs])
non_nan_vals = attribute_vals[~np.isnan(attribute_vals)]
median_cutoff = np.median(non_nan_vals)
if verbose:
print("Attribute %s: was able to compute for %2.1f%% out of %i CBGs, median is %2.3f" %
(attribute, 100. * len(non_nan_vals) / len(cbg_idx_to_track),
len(cbg_idx_to_track), median_cutoff))
cbg_idx_groups_to_track[f'{attribute}_above_median'] = list(set(np.where(attribute_vals > median_cutoff)[0]).intersection(cbg_idx_to_track))
cbg_idx_groups_to_track[f'{attribute}_below_median'] = list(set(np.where(attribute_vals <= median_cutoff)[0]).intersection(cbg_idx_to_track))
top_decile = scoreatpercentile(non_nan_vals, 90)
bottom_decile = scoreatpercentile(non_nan_vals, 10)
cbg_idx_groups_to_track[f'{attribute}_top_decile'] = list(set(np.where(attribute_vals >= top_decile)[0]).intersection(cbg_idx_to_track))
cbg_idx_groups_to_track[f'{attribute}_bottom_decile'] = list(set(np.where(attribute_vals <= bottom_decile)[0]).intersection(cbg_idx_to_track))
if county2cbgs is not None:
above_median_in_county = []
below_median_in_county = []
for county in county2cbgs:
county_cbgs, cbg_idx = county2cbgs[county]
attribute_vals = np.array([mapper_d[a] if a in mapper_d and cbgs_to_idxs[a] in cbg_idx_to_track else np.nan for a in county_cbgs])
non_nan_vals = attribute_vals[~np.isnan(attribute_vals)]
median_cutoff = np.median(non_nan_vals)
above_median_idx = cbg_idx[np.where(attribute_vals > median_cutoff)[0]]
above_median_idx = list(set(above_median_idx).intersection(cbg_idx_to_track))
above_median_in_county.extend(above_median_idx)
below_median_idx = cbg_idx[np.where(attribute_vals <= median_cutoff)[0]]
below_median_idx = list(set(below_median_idx).intersection(cbg_idx_to_track))
below_median_in_county.extend(below_median_idx)
cbg_idx_groups_to_track[f'{attribute}_above_median_in_own_county'] = above_median_in_county
cbg_idx_groups_to_track[f'{attribute}_below_median_in_own_county'] = below_median_in_county
np.warnings.resetwarnings()
cbg_day_prop_out = None
inter_cbg_travel = None
if include_cbg_prop_out or include_inter_cbg_travel:
sdm_df = helper.load_social_distancing_metrics(model_days)
sdm_df = sdm_df.reindex(all_unique_cbgs) # reindex can handle possible missing keys
cols_to_keep = ['%s.%s.%s' % (dt.year, dt.month, dt.day) for dt in model_days]
if include_cbg_prop_out:
# missing values are filled in with median in helper.compute_cbg_day_prop_out
print('Giving model daily proportion out for %s to %s' % (cols_to_keep[0], cols_to_keep[-1]))
cbg_day_prop_out = helper.compute_cbg_day_prop_out(sdm_df)
assert all([c1 == c2 for c1, c2 in zip(cbg_day_prop_out['census_block_group'].values, all_unique_cbgs)])
assert((len(cols_to_keep) * 24) == len(hour_cols))
cbg_day_prop_out = cbg_day_prop_out[cols_to_keep].values
if include_inter_cbg_travel:
print('Giving model inter-CBG travel for %s to %s' % (cols_to_keep[0], cols_to_keep[-1]))
inter_cbg_travel = helper.compute_daily_inter_cbg_travel(sdm_df, cbg_sizes, model_days)
# num_cbgs x num_days; avg num visits to other CBGs per capita
inter_cbg_travel = (inter_cbg_travel.values.T / (cbg_sizes+1)).T
if include_mask_use:
day_strs = [dt.strftime('%Y-%m-%d') for dt in model_days]
most_common_state = d['region'].value_counts().idxmax()
print('Loading mask use data for state=%s' % most_common_state)
mask_df = helper.load_mask_use_data(most_common_state)
mask_df = mask_df[mask_df.date.isin(day_strs)]
assert len(mask_df) == len(model_days)
mask_data = mask_df['mask use'].values / 100
assert all((mask_data >= 0) & (mask_data <= 1))
else:
mask_data = None
if 'p_sick_at_t0' not in exogenous_model_kwargs or exogenous_model_kwargs['p_sick_at_t0'] is None:
fn = os.path.join(PATH_TO_SEIR_INIT, 'all_cbgs_s=%s.csv' % (min_datetime.strftime('%Y-%m-%d')))
assert os.path.isfile(fn)
cbg_init_shrinkage_alpha = 0.5 if min_datetime < datetime.datetime(2020, 4, 1) else 0.1 # if early, we trust estimates less, want to shrink more
print('Loading CBG init data; basing inferred SEIR on %s and applying shrinkage of %s' % (cbg_init_mode, cbg_init_shrinkage_alpha))
init_df = pd.read_csv(fn)
init_df = init_df.set_index('census_block_group')
init_df = init_df.loc[all_unique_cbgs]
is_null = pd.isnull(init_df['county_fips']).values
assert np.sum(is_null) == 0
states_to_init = ['E', 'I', 'R']
eir_cols = ['%s_%s' % (cbg_init_mode, state) for state in states_to_init]
initial_conditions = init_df[eir_cols].values
for idx, state in enumerate(states_to_init):
curr_prop = initial_conditions[:, idx] / cbg_sizes
mean_prop = np.mean(curr_prop)
shrunken_prop = (cbg_init_shrinkage_alpha * mean_prop) + ((1 - cbg_init_shrinkage_alpha) * curr_prop) # shrink to mean
invalid_prop = shrunken_prop > 1
print('Found %d CBGs with inferred proportion in %s > 1 -> clipping' % (np.sum(invalid_prop), state))
shrunken_prop = np.clip(shrunken_prop, None, 1)
print('Proportion in %s: min = %.4f, 25th = %.4f, median = %.4f, 75th = %.4f, max = %.4f' %
(state, np.min(shrunken_prop), np.percentile(shrunken_prop, 25),
np.percentile(shrunken_prop, 50), np.percentile(shrunken_prop, 75),
np.max(shrunken_prop)))
initial_conditions[:, idx] = np.round(shrunken_prop * cbg_sizes, 0).astype(int)
else:
initial_conditions = None
# If trying to get the counterfactual where social activity doesn't change, just repeat first week of dataset.
# We put this in exogenous_model_kwargs because it actually affects how the model runs, not just the data input.
if 'just_compute_r0' in exogenous_model_kwargs and exogenous_model_kwargs['just_compute_r0']:
print('Running model to compute r0 -> looping first week visit counts')
# simulate out 15 weeks just so we are sure all cases are gone.
max_datetime = min_datetime + datetime.timedelta(hours=(168*15)-1)
all_hours = helper.list_hours_in_range(min_datetime, max_datetime)
print("Extending time period; simulation now ends at %s (%d hours)" % (max(all_hours), len(all_hours)))
if poi_cbg_visits_list is not None:
assert len(poi_cbg_visits_list) >= 168 # ensure that we have at least a week to model
new_visits_list = []
for i in range(168 * 15):
first_week_idx = i % 168 # map to corresponding hour in first week
new_visits_list.append(poi_cbg_visits_list[first_week_idx].copy())
poi_cbg_visits_list = new_visits_list
assert len(poi_cbg_visits_list) == len(all_hours)
else:
assert poi_time_counts.shape[1] >= 168 # ensure that we have at least a week to model
first_week = poi_time_counts[:, :168]
poi_time_counts = np.tile(first_week, (1, 15))
if cbg_day_prop_out is not None:
assert cbg_day_prop_out.shape[1] >= 7
first_week = cbg_day_prop_out[:, :7]
cbg_day_prop_out = np.tile(first_week, (1, 15))
assert poi_time_counts.shape[1] == len(all_hours)
assert cbg_day_prop_out is None # R0 calibration should be simplest version of model
assert inter_cbg_travel is None
assert not type(home_beta) in {np.ndarray, list} # should run R0 calibration with constant beta (only first week)
if mask_data is not None: # shape: 1 x num_days
avg_first_week_mask = np.mean(mask_data[:7])
print('Average mask use in first week of March: %.3f' % avg_first_week_mask)
mask_data = np.ones(7 * 15) * avg_first_week_mask # use constant mask wearing for single week
# If we want to run counterfactual reopening simulations
intervention_cost = None
if counterfactual_poi_opening_experiment_kwargs is not None:
if poi_cbg_visits_list is None:
raise Exception('Missing poi_cbg_visits_list; reopening experiments should be run with IPF output')
extra_weeks_to_simulate = counterfactual_poi_opening_experiment_kwargs['extra_weeks_to_simulate']
assert extra_weeks_to_simulate >= 0
intervention_datetime = counterfactual_poi_opening_experiment_kwargs['intervention_datetime']
version = counterfactual_poi_opening_experiment_kwargs['version']
if cbg_day_prop_out is not None: # shape: num_cbgs x num_days
to_concat = [cbg_day_prop_out.copy()]
for w in range(extra_weeks_to_simulate):
to_concat.append(cbg_day_prop_out[:, -7:].copy()) # loop final week
cbg_day_prop_out = np.concatenate(to_concat, axis=1) # concatenate along rows
if inter_cbg_travel is not None: # shape: num_cbgs x num_days
to_concat = [inter_cbg_travel.copy()]
for w in range(extra_weeks_to_simulate):
to_concat.append(inter_cbg_travel[:, -7:].copy()) # loop final week
inter_cbg_travel = np.concatenate(to_concat, axis=1) # concatenate along rows
if mask_data is not None: # shape: 1 x num_days
to_concat = [mask_data.copy()]
for w in range(extra_weeks_to_simulate):
to_concat.append(mask_data[-7:].copy()) # loop final week
mask_data = np.concatenate(to_concat, axis=0) # concatenate along rows
if type(home_beta) in {np.ndarray, list}:
additional_home_beta = np.ones(7 * extra_weeks_to_simulate) * home_beta[-1] # keep final home beta, repeat
home_beta = np.concatenate([home_beta, additional_home_beta], axis=0)
exogenous_model_kwargs['home_beta'] = home_beta
# v1 is from Nature paper, uses beginning of March as full reopening, only allows one category to be
# modified at a time
if version == 'v1':
orig_num_hours = len(all_hours)
all_hours = helper.list_hours_in_range(min_datetime, max_datetime + datetime.timedelta(hours=168 * extra_weeks_to_simulate))
print("Extending time period; simulation now ends at %s (%d hours)" % (max(all_hours), len(all_hours)))
assert(intervention_datetime in all_hours)
intervention_hour_idx = all_hours.index(intervention_datetime)
if 'top_category' in counterfactual_poi_opening_experiment_kwargs:
top_category = counterfactual_poi_opening_experiment_kwargs['top_category']
else:
top_category = None
if 'sub_category' in counterfactual_poi_opening_experiment_kwargs:
sub_category = counterfactual_poi_opening_experiment_kwargs['sub_category']
else:
sub_category = None
poi_categories = d[['top_category', 'sub_category']]
# must have one but not both of these arguments
assert (('alpha' in counterfactual_poi_opening_experiment_kwargs) + ('full_activity_alpha' in counterfactual_poi_opening_experiment_kwargs)) == 1
# the original alpha - post-intervention is interpolation between no reopening and full activity
if 'alpha' in counterfactual_poi_opening_experiment_kwargs:
alpha = counterfactual_poi_opening_experiment_kwargs['alpha']
assert alpha >= 0 and alpha <= 1
poi_cbg_visits_list, intervention_cost = apply_interventions_to_poi_cbg_matrices(poi_cbg_visits_list,
poi_categories, poi_areas, all_hours, intervention_hour_idx,
alpha, extra_weeks_to_simulate, top_category, sub_category, interpolate=True)
# post-intervention is alpha-percent of full activity (no interpolation)
else:
alpha = counterfactual_poi_opening_experiment_kwargs['full_activity_alpha']
assert alpha >= 0 and alpha <= 1
poi_cbg_visits_list, intervention_cost = apply_interventions_to_poi_cbg_matrices(poi_cbg_visits_list,
poi_categories, poi_areas, all_hours, intervention_hour_idx,
alpha, extra_weeks_to_simulate, top_category, sub_category, interpolate=False)
# should be used in tandem with alpha or full_activity_alpha, since the timeseries is extended
# in those blocks; this part just caps post-intervention visits to alpha-percent of max capacity
if 'max_capacity_alpha' in counterfactual_poi_opening_experiment_kwargs:
max_capacity_alpha = counterfactual_poi_opening_experiment_kwargs['max_capacity_alpha']
assert max_capacity_alpha >= 0 and max_capacity_alpha <= 1
poi_visits = np.zeros((M, orig_num_hours)) # num pois x num hours
for t, poi_cbg_visits in enumerate(poi_cbg_visits_list[:orig_num_hours]):
poi_visits[:, t] = poi_cbg_visits @ np.ones(N)
max_per_poi = np.max(poi_visits, axis=1) # get historical max capacity per POI
alpha_max_per_poi = np.clip(max_capacity_alpha * max_per_poi, 1e-10, None) # so that we don't divide by 0
orig_total_activity = 0
capped_total_activity = 0
for t in range(intervention_hour_idx, len(poi_cbg_visits_list)):
poi_cbg_visits = poi_cbg_visits_list[t]
num_visits_per_poi = poi_cbg_visits @ np.ones(N)
orig_total_activity += np.sum(num_visits_per_poi)
ratio_per_poi = num_visits_per_poi / alpha_max_per_poi
clipping_idx = ratio_per_poi > 1 # identify which POIs need to be clipped
poi_multipliers = np.ones(M)
poi_multipliers[clipping_idx] = 1 / ratio_per_poi[clipping_idx]
adjusted_poi_cbg_visits = poi_cbg_visits.transpose().multiply(poi_multipliers).transpose().tocsr()
capped_total_activity += np.sum(adjusted_poi_cbg_visits @ np.ones(N))
poi_cbg_visits_list[t] = adjusted_poi_cbg_visits
print('Finished capping visits at %.1f%% of max capacity -> kept %.4f%% of visits' %
(100. * max_capacity_alpha, 100 * capped_total_activity / orig_total_activity))
intervention_cost['total_activity_after_max_capacity_capping'] = capped_total_activity
# v2 was implemented post-Nature paper, uses 2019 IPF output as full reopening, takes in dictionary of category
# to alpha where alpha represents the percentage of 2019 activity to keep for category
else:
assert msa_name is not None
category2alpha = counterfactual_poi_opening_experiment_kwargs['category_to_alpha']
poi_categories = d.sub_category.values
all_hours, poi_cbg_visits_list, total_post_intervention_visits = apply_different_percentages_of_2019_levels(
msa_name, category2alpha, poi_cbg_visits_list, poi_categories, all_hours, intervention_datetime,
extra_weeks_to_simulate, agg_poi_cbg_visits=poi_cbg_proportions_mat)
print('Total post intervention visits: %.3fM' % (total_post_intervention_visits / 1000000))
intervention_cost = {}
intervention_cost['total_num_visits_post_intervention'] = total_post_intervention_visits
if counterfactual_retrospective_experiment_kwargs is not None:
# must have one but not both of these arguments
assert (('distancing_degree' in counterfactual_retrospective_experiment_kwargs) + ('shift_in_days' in counterfactual_retrospective_experiment_kwargs)) == 1
if poi_cbg_visits_list is None:
raise Exception('Retrospective experiments are only implemented for when poi_cbg_visits_list is precomputed')
if 'distancing_degree' in counterfactual_retrospective_experiment_kwargs:
distancing_degree = counterfactual_retrospective_experiment_kwargs['distancing_degree']
poi_cbg_visits_list = apply_distancing_degree(poi_cbg_visits_list, distancing_degree)
print('Modified poi_cbg_visits_list for retrospective experiment: distancing_degree = %s.' % distancing_degree)
else:
shift_in_days = counterfactual_retrospective_experiment_kwargs['shift_in_days']
poi_cbg_visits_list = apply_shift_in_days(poi_cbg_visits_list, shift_in_days)
print('Modified poi_cbg_visits_list for retrospective experiment: shifted by %d days.' % shift_in_days)
print('Total time to prep data: %.3fs' % (time.time() - t0))
# feed everything into model.
m = Model(**model_init_kwargs)
m.init_exogenous_variables(poi_cbg_proportions=poi_cbg_proportions_mat,
poi_time_counts=poi_time_counts,
poi_areas=poi_areas,
poi_dwell_time_correction_factors=poi_dwell_time_correction_factors,
cbg_sizes=cbg_sizes,
all_unique_cbgs=all_unique_cbgs,
cbgs_to_idxs=cbgs_to_idxs,
all_states=all_states,
poi_cbg_visits_list=poi_cbg_visits_list,
all_hours=all_hours,
initial_conditions=initial_conditions,
cbg_idx_groups_to_track=cbg_idx_groups_to_track,
cbg_day_prop_out=cbg_day_prop_out,
inter_cbg_travel=inter_cbg_travel,
daily_mask_use=mask_data,
intervention_cost=intervention_cost,
poi_subcategory_types=poi_subcategory_types,
**exogenous_model_kwargs)
m.init_endogenous_variables()
if attach_data_to_model:
m.d = d
if return_model_without_fitting:
return m
m.simulate_disease_spread(**simulation_kwargs)
return m
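# Illustrative sketch (not part of the original pipeline): a minimal call to
# fit_disease_model_on_real_data. `d` is assumed to be a SafeGraph POI dataframe as described in the
# docstring above; the date range and kwarg values below are placeholders, not calibrated settings.
def _example_fit_disease_model(d):
    model = fit_disease_model_on_real_data(
        d,
        min_datetime=datetime.datetime(2020, 3, 1, 0),
        max_datetime=datetime.datetime(2020, 4, 4, 23),
        exogenous_model_kwargs={'home_beta': 0.005,      # placeholder within-household transmission rate
                                'poi_psi': 500,          # placeholder POI transmission scaling
                                'p_sick_at_t0': 1e-4},   # placeholder initial infection proportion
        poi_attributes_to_clip={'clip_areas': True,
                                'clip_dwell_times': True,
                                'clip_visits': True},
        return_model_without_fitting=False,  # set True to inspect the prepared Model without simulating
        verbose=True)
    return model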
def correct_visit_vector(v, median_dwell_in_minutes):
"""
Given an original hourly visit vector v and a dwell time in minutes,
return a new hourly visit vector which accounts for spillover.
"""
v = np.array(v)
d = median_dwell_in_minutes/60.
new_v = v.copy().astype(float)
max_shift = math.floor(d + 1) # maximum hours we can spill over to.
for i in range(1, max_shift + 1):
if i < max_shift:
new_v[i:] += v[:-i] # this hour is fully occupied
else:
new_v[i:] += (d - math.floor(d)) * v[:-i] # this hour only gets part of the visits.
return new_v
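# Illustrative sketch (not part of the original pipeline): with a 90-minute median dwell time
# (1.5 hours), visitors arriving in hour 0 are also counted fully in hour 1 and at weight 0.5 in hour 2.
def _example_correct_visit_vector():
    v = [10, 0, 0, 0]                             # 10 visits arrive in hour 0, none afterwards
    corrected = correct_visit_vector(v, 90)       # expected: array([10., 10., 5., 0.])
    return corrected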
def clip_poi_attributes_in_msa_df(d, min_datetime, max_datetime,
clip_areas, clip_dwell_times, clip_visits,
area_below=AREA_CLIPPING_BELOW,
area_above=AREA_CLIPPING_ABOVE,
dwell_time_above=DWELL_TIME_CLIPPING_ABOVE,
visits_above=HOURLY_VISITS_CLIPPING_ABOVE,
subcat_cutoff=SUBCATEGORY_CLIPPING_THRESH,
topcat_cutoff=TOPCATEGORY_CLIPPING_THRESH):
'''
Deal with POI outliers by clipping their hourly visits, dwell times, and physical areas
to some percentile of the corresponding distribution for each POI category.
'''
attr_cols = []
if clip_areas:
attr_cols.append('safegraph_computed_area_in_square_feet')
if clip_dwell_times:
attr_cols.append('avg_median_dwell')
if clip_visits:
all_hours = helper.list_hours_in_range(min_datetime, max_datetime)
hour_cols = ['hourly_visits_%s' % get_datetime_hour_as_string(dt) for dt in all_hours]
attr_cols.extend(hour_cols)
assert all([col in d.columns for col in attr_cols])
print('Clipping areas: %s (below=%d, above=%d), clipping dwell times: %s (above=%d), clipping visits: %s (above=%d)' %
(clip_areas, area_below, area_above, clip_dwell_times, dwell_time_above, clip_visits, visits_above))
indices_covered = []
subcats = []
subcategory2idx = d.groupby('sub_category').indices
for cat, idx in subcategory2idx.items():
if len(idx) >= subcat_cutoff:
subcats.append(cat)
indices_covered.extend(idx)
# group by top_category for POIs whose sub_category's are too small
topcats = []
topcategory2idx = d.groupby('top_category').indices
for cat, idx in topcategory2idx.items():
if len(idx) >= topcat_cutoff:
new_idx = np.array(list(set(idx) - set(indices_covered))) # POIs that are not covered by sub_category clipping
if len(new_idx) > 0:
topcats.append(cat)
topcategory2idx[cat] = (idx, new_idx)
indices_covered.extend(new_idx)
print('Found %d sub-categories with >= %d POIs and %d top categories with >= %d POIs -> covers %d POIs' %
(len(subcats), subcat_cutoff, len(topcats), topcat_cutoff, len(indices_covered)))
lost_pois = len(d) - len(indices_covered)
print('Could not cover %d/%d POIs (%.1f%% POIs) -> dropping these POIs' %
(lost_pois, len(d), 100. * lost_pois/len(d)))
if lost_pois / len(d) > .05:
raise Exception('Dropping too many POIs during clipping phase')
all_cats = topcats + subcats # process top categories first so sub categories will compute percentiles on raw data
new_data = np.array(d[attr_cols].copy().values) # n_pois x n_cols_to_clip
thresholds = np.zeros((len(all_cats), len(attr_cols)+1)) # clipping thresholds for category x attribute
medians = np.zeros((len(all_cats), len(attr_cols))) # medians for category x attribute
indices_processed = []
for i, cat in enumerate(all_cats):
if i < len(topcats):
cat_idx, new_idx = topcategory2idx[cat]
else:
cat_idx = subcategory2idx[cat]
new_idx = cat_idx
indices_processed.extend(new_idx)
first_col_idx = 0 # index of first column for this attribute
if clip_areas:
cat_areas = new_data[cat_idx, first_col_idx] # compute percentiles on entire category
min_area = np.nanpercentile(cat_areas, area_below)
max_area = np.nanpercentile(cat_areas, area_above)
median_area = np.nanmedian(cat_areas)
thresholds[i][first_col_idx] = min_area
thresholds[i][first_col_idx+1] = max_area
medians[i][first_col_idx] = median_area
new_data[new_idx, first_col_idx] = np.clip(new_data[new_idx, first_col_idx], min_area, max_area)
first_col_idx += 1
if clip_dwell_times:
cat_dwell_times = new_data[cat_idx, first_col_idx]
max_dwell_time = np.nanpercentile(cat_dwell_times, dwell_time_above)
median_dwell_time = np.nanmedian(cat_dwell_times)
thresholds[i][first_col_idx+1] = max_dwell_time
medians[i][first_col_idx] = median_dwell_time
new_data[new_idx, first_col_idx] = np.clip(new_data[new_idx, first_col_idx], None, max_dwell_time)
first_col_idx += 1
if clip_visits:
col_idx = np.arange(first_col_idx, first_col_idx+len(hour_cols))
assert col_idx[-1] == (len(attr_cols)-1)
orig_visits = new_data[cat_idx][:, col_idx].copy() # need to copy bc will modify
orig_visits[orig_visits == 0] = np.nan # want percentile over positive visits
# can't take percentile of col if it is all 0's or all nan's
cols_to_process = col_idx[np.sum(~np.isnan(orig_visits), axis=0) > 0]
max_visits_per_hour = np.nanpercentile(orig_visits[:, cols_to_process-first_col_idx], visits_above, axis=0)
assert np.sum(np.isnan(max_visits_per_hour)) == 0
thresholds[i][cols_to_process + 1] = max_visits_per_hour
medians[i][cols_to_process] = np.nanmedian(orig_visits[:, cols_to_process-first_col_idx], axis=0)
orig_visit_sum = np.nansum(new_data[new_idx][:, col_idx])
orig_attributes = new_data[new_idx] # return to un-modified version
orig_attributes[:, cols_to_process] = np.clip(orig_attributes[:, cols_to_process], None, max_visits_per_hour)
new_data[new_idx] = orig_attributes
new_visit_sum = np.nansum(new_data[new_idx][:, col_idx])
print('%s -> has %d POIs, processed %d POIs, %d visits before clipping, %d visits after clipping' %
(cat, len(cat_idx), len(new_idx), orig_visit_sum, new_visit_sum))
else:
print('%s -> has %d POIs, processed %d POIs' % (cat, len(cat_idx), len(new_idx)))
assert len(indices_processed) == len(set(indices_processed)) # double check that we only processed each POI once
assert set(indices_processed) == set(indices_covered) # double check that we processed the POIs we expected to process
new_d = d.iloc[indices_covered].copy()
new_d[attr_cols] = new_data[indices_covered]
return new_d, all_cats, attr_cols, thresholds, medians
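# Illustrative sketch (not part of the original pipeline): the core clipping operation above caps each
# attribute at a per-category percentile computed with np.nanpercentile; the 95th percentile below is an
# arbitrary example value, not necessarily the module's HOURLY_VISITS_CLIPPING_ABOVE setting.
def _example_percentile_clip():
    cat_visits = np.array([1., 2., 3., 4., np.nan, 100.])  # one category's hourly visits, with an outlier and a missing value
    max_visits = np.nanpercentile(cat_visits, 95)           # category-level threshold, ignoring NaNs
    return np.clip(cat_visits, None, max_visits)            # outlier capped at the threshold; NaN stays NaN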
def apply_interventions_to_poi_cbg_matrices(poi_cbg_visits_list, poi_categories, poi_areas,
new_all_hours, intervention_hour_idx,
alpha, extra_weeks_to_simulate,
top_category=None, sub_category=None,
interpolate=True):
'''
Simulates hypothetical mobility patterns by editing visit matrices.
'''
# find POIs of interest
if top_category is not None:
if type(top_category) == list:
top_category_poi_idx = np.zeros(len(poi_categories)).astype(bool)
for cat in top_category:
top_category_poi_idx = top_category_poi_idx | (poi_categories['top_category'] == cat).values
else:
top_category_poi_idx = (poi_categories['top_category'] == top_category).values
else:
top_category = 'any'
top_category_poi_idx = np.ones(len(poi_categories)).astype(bool)
if sub_category is not None:
if type(sub_category) == list:
sub_category_poi_idx = np.zeros(len(poi_categories)).astype(bool)
for cat in sub_category:
sub_category_poi_idx = sub_category_poi_idx | (poi_categories['sub_category'] == cat).values
else:
sub_category_poi_idx = (poi_categories['sub_category'] == sub_category).values
else:
sub_category = 'any'
sub_category_poi_idx = np.ones(len(poi_categories)).astype(bool)
intervened_poi_idx = top_category_poi_idx & sub_category_poi_idx # poi indices to intervene on
assert intervened_poi_idx.sum() > 0
print("Intervening on POIs with top_category=%s, sub_category=%s (n=%i)" % (top_category, sub_category, intervened_poi_idx.sum()))
# extend matrix list to extra weeks, loop final week for now
num_pois, num_cbgs = poi_cbg_visits_list[0].shape
new_matrix_list = [m.copy() for m in poi_cbg_visits_list]
for i in range(extra_weeks_to_simulate * 168):
matrix_idx = -168 + (i % 168) # get corresponding matrix from final week
new_matrix_list.append(poi_cbg_visits_list[matrix_idx].copy())
assert new_matrix_list[-1].shape == (num_pois, num_cbgs), len(new_matrix_list)-1
assert len(new_matrix_list) == len(new_all_hours)
if top_category == 'any' and sub_category == 'any': # apply intervention to all POIs
full_activity_sum = 0
simulated_activity_sum = 0
for i in range(intervention_hour_idx, len(new_all_hours)):
no_reopening = new_matrix_list[i]
full_reopening = new_matrix_list[i % 168]
full_activity_sum += full_reopening.sum()
if alpha == 1:
new_matrix_list[i] = full_reopening.copy()
simulated_activity_sum = full_activity_sum
else:
if interpolate:
new_matrix_list[i] = full_reopening.multiply(alpha) + no_reopening.multiply(1-alpha)
else:
new_matrix_list[i] = full_reopening.multiply(alpha)
simulated_activity_sum += new_matrix_list[i].sum()
diff = full_activity_sum - simulated_activity_sum
overall_cost = (100. * diff / full_activity_sum)
print('Overall Cost (%% of full activity): %2.3f%%' % overall_cost)
return new_matrix_list, {'overall_cost':overall_cost, 'cost_within_intervened_pois':overall_cost}
# full activity based on first week of visits
range_end = max(intervention_hour_idx + 168, len(poi_cbg_visits_list))
full_activity = [poi_cbg_visits_list[i % 168] for i in range(intervention_hour_idx, range_end)] # get corresponding matrix in first week
full_activity = hstack(full_activity, format='csr')
orig_activity = hstack(new_matrix_list[intervention_hour_idx:range_end], format='csr')
assert full_activity.shape == orig_activity.shape
print('Computed hstacks of sparse matrices [shape=(%d, %d)]' % full_activity.shape)
# take mixture of full activity and original activity for POIs of interest
indicator_vec = np.zeros(num_pois)
indicator_vec[intervened_poi_idx] = 1.0
alpha_vec = alpha * indicator_vec
scaled_full_activity = full_activity.transpose().multiply(alpha_vec).transpose()
if interpolate:
non_alpha_vec = 1.0 - alpha_vec # intervened POIs will have alpha*full + (1-alpha)*closed
else:
non_alpha_vec = 1.0 - indicator_vec # intervened POIs will have alpha*full
scaled_orig_activity = orig_activity.transpose().multiply(non_alpha_vec).transpose()
activity_mixture = scaled_full_activity + scaled_orig_activity
print('Computed mixture of full and original activity')
# compute costs
full_overall_sum = full_activity.sum()
mixture_overall_sum = activity_mixture.sum()
overall_diff = full_overall_sum - mixture_overall_sum
overall_cost = (100. * overall_diff / full_overall_sum)
print('Overall Cost (%% of full activity): %2.3f%%' % overall_cost)
full_intervened_sum = full_activity.transpose().multiply(indicator_vec).sum()
mixture_intervened_sum = activity_mixture.transpose().multiply(indicator_vec).sum()
intervened_diff = full_intervened_sum - mixture_intervened_sum
cost_within_intervened_pois = (100. * intervened_diff / full_intervened_sum)
print('Cost within intervened POIs: %2.3f%%' % cost_within_intervened_pois)
print('Redistributing stacked matrix into hourly pieces...')
ts = time.time()
looping = False
for i in range(intervention_hour_idx, len(new_all_hours)):
matrix_idx = i - intervention_hour_idx
if i >= len(poi_cbg_visits_list) and matrix_idx >= 168:
# once we are operating past the length of real data, the "original" matrix
# is just the matrix from the last week of the real data for the corresponding
# day, and if matrix_idx > 168, then the mixture for that corresponding day
# has been computed already
new_matrix_list[i] = new_matrix_list[i - 168].copy()
if looping is False:
print('Entering looping phase at matrix %d!' % matrix_idx)
looping = True
else:
matrix_start = matrix_idx * num_cbgs
matrix_end = matrix_start + num_cbgs
new_matrix_list[i] = activity_mixture[:, matrix_start:matrix_end]
assert new_matrix_list[i].shape == (num_pois, num_cbgs), 'intervention idx = %d, overall idx = %d [found size = (%d, %d)]' % (matrix_idx, i, new_matrix_list[i].shape[0], new_matrix_list[i].shape[1])
if matrix_idx % 24 == 0:
te = time.time()
print('Finished matrix %d: time so far per hourly matrix = %.2fs' % (matrix_idx, (te-ts)/(matrix_idx+1)))
return new_matrix_list, {'overall_cost':overall_cost, 'cost_within_intervened_pois':cost_within_intervened_pois}
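# Illustrative sketch (not part of the original pipeline): the interpolated reopening above mixes the
# "full reopening" and "no reopening" POI-CBG visit matrices element-wise as alpha*full + (1-alpha)*current.
def _example_alpha_mixture():
    full_reopening = csr_matrix(np.array([[4., 0.], [2., 2.]]))   # e.g., visits in the corresponding first-week hour
    no_reopening = csr_matrix(np.array([[1., 0.], [0., 1.]]))     # e.g., visits under observed (reduced) mobility
    alpha = 0.5
    mixture = full_reopening.multiply(alpha) + no_reopening.multiply(1 - alpha)
    return mixture.toarray()                                      # expected: [[2.5, 0.], [1., 1.5]]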
def get_corresponding_2019_datetime(date):
if date.year == 2019:
return date
exact_date_2019 = datetime.datetime(2019, date.month, date.day, date.hour)
if date.strftime('%m-%d') in FIXED_HOLIDAY_DATES: # is a holiday, need to return exact same date
return exact_date_2019
offset_date_2019 = datetime.datetime(2019, date.month, date.day, date.hour)
diff = date.weekday() - offset_date_2019.weekday()
if diff > 0:
if diff <= 3: # go forward 1-3 days
offset_date_2019 = offset_date_2019 + datetime.timedelta(days=diff)
else: # go back 1-3 days
assert diff <= 6
back_steps = 7-diff
offset_date_2019 = offset_date_2019 + datetime.timedelta(days=-back_steps)
elif diff < 0:
if diff >= -3: # go back 1-3 days
offset_date_2019 = offset_date_2019 + datetime.timedelta(days=diff)
else: # go forward 1-3 days
assert diff >= -6
forward_steps = 7 + diff
offset_date_2019 = offset_date_2019 + datetime.timedelta(days=forward_steps)
assert offset_date_2019.weekday() == date.weekday()
if offset_date_2019.strftime('%m-%d') in FIXED_HOLIDAY_DATES: # can't map to holiday
return exact_date_2019
return offset_date_2019
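# Illustrative sketch (not part of the original pipeline): 2020-03-02 is a Monday, so it maps to
# 2019-03-04 (also a Monday), assuming neither date falls in FIXED_HOLIDAY_DATES.
def _example_2019_mapping():
    date_2020 = datetime.datetime(2020, 3, 2, 12)
    date_2019 = get_corresponding_2019_datetime(date_2020)   # expected: datetime.datetime(2019, 3, 4, 12)
    assert date_2019.weekday() == date_2020.weekday()
    return date_2019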
def get_corresponding_2019_visits(all_poi_visits, all_hours, start_hour, end_hour):
print('Finding corresponding 2019 hours for %s to %s' % (start_hour.strftime('%Y-%m-%d'),
end_hour.strftime('%Y-%m-%d')))
hours = helper.list_hours_in_range(start_hour, end_hour)
corresponding_visits = np.zeros((all_poi_visits.shape[0], len(hours)))
for i, hr in enumerate(hours):
date_2019 = get_corresponding_2019_datetime(hr)
assert date_2019.hour == hr.hour
index_2019 = all_hours.index(date_2019)
corresponding_visits[:, i] = all_poi_visits[:, index_2019]
return corresponding_visits
def apply_different_percentages_of_2019_levels(msa_name, category2alpha, poi_cbg_visits_list, poi_categories,
orig_hours, intervention_datetime, extra_weeks_to_simulate,
all_poi_visits=None, all_poi_visits_hours=None,
agg_poi_cbg_visits=None):
'''
V2 of constructing hypothetical mobility patterns. category2alpha maps category to level of opening, as a fraction
    of 2019 mobility levels. POIs whose categories do not appear in category2alpha continue their current
    levels of mobility.
'''
num_pois, num_cbgs = poi_cbg_visits_list[0].shape
model_hours = helper.list_hours_in_range(orig_hours[0], orig_hours[-1] + datetime.timedelta(hours=168*extra_weeks_to_simulate))
assert intervention_datetime in model_hours
model_intervention_idx = model_hours.index(intervention_datetime)
assert model_intervention_idx <= len(poi_cbg_visits_list)
num_hours_post_intervention = len(model_hours[model_intervention_idx:])
print('Found %d hours post-intervention' % num_hours_post_intervention)
# get average proportion of home CBGs per POI
if agg_poi_cbg_visits is None:
agg_poi_cbg_visits = poi_cbg_visits_list[0]
for t in range(1, model_intervention_idx):
agg_poi_cbg_visits = agg_poi_cbg_visits + poi_cbg_visits_list[t]
assert agg_poi_cbg_visits.shape == (num_pois, num_cbgs)
agg_poi_cbg_visits = csr_matrix(agg_poi_cbg_visits)
row_sums = agg_poi_cbg_visits @ np.ones(num_cbgs)
zero_visits = row_sums == 0
print('Found %d POIs with 0 visits in aggregate POI CBG visits matrix' % np.sum(zero_visits))
row_sums[zero_visits] = 1e-10 # the proportions will remain 0 bc numerator
agg_poi_cbg_props = agg_poi_cbg_visits.transpose().multiply(1/row_sums).transpose().tocsr()
# make extended poi_cbg_visits_list with proportions matrix post-interventions
new_matrix_list = [m.copy() for m in poi_cbg_visits_list[:model_intervention_idx]]
post_intervention_list = [agg_poi_cbg_props.copy() for t in range(model_intervention_idx, len(model_hours))]
new_matrix_list += post_intervention_list
assert len(new_matrix_list) == len(model_hours)
# compute current vs 2019 scaling for each POI
if all_poi_visits is None or all_poi_visits_hours is None:
all_poi_visits, _, all_poi_visits_hours = helper.load_all_poi_visits_for_msa(msa_name)
assert len(all_poi_visits) == num_pois
# compare last 4 weeks before intervention to corresponding weeks in 2019
visits_intervention_idx = all_poi_visits_hours.index(intervention_datetime)
four_weeks_visits = all_poi_visits[:, visits_intervention_idx-(168*4):visits_intervention_idx]
start_hour = intervention_datetime + datetime.timedelta(hours=-(168*4))
end_hour = intervention_datetime + datetime.timedelta(hours=-1)
four_weeks_visits_2019 = get_corresponding_2019_visits(all_poi_visits, all_poi_visits_hours, start_hour, end_hour)
assert four_weeks_visits_2019.shape == four_weeks_visits.shape
scaling_factors = (np.sum(four_weeks_visits, axis=1)+1) / (np.sum(four_weeks_visits_2019, axis=1)+1)
# check which POIs should use computed POI scaling
valid_factors = (scaling_factors >= 0.1) & (scaling_factors <= 2) # scaling assumptions break at extremes
num_nonzero_hours_2019 = np.sum(four_weeks_visits_2019 > 0, axis=1)
num_nonzero_hours = np.sum(four_weeks_visits > 0, axis=1)
ratio_of_nonzero_hours = (num_nonzero_hours + 1) / (num_nonzero_hours_2019 + 1)
valid_ratios = (ratio_of_nonzero_hours >= 0.5) & (ratio_of_nonzero_hours <= 1.5)
use_scaled_2019 = valid_factors & valid_ratios
print('%.2f%% of POIs have valid scaling factors, %.2f%% have valid non-zero ratios, %.2f%% have both' %
(100. * np.sum(valid_factors) / num_pois, 100. * np.sum(valid_ratios) / num_pois,
100. * np.sum(use_scaled_2019) / num_pois))
print('Scaling factors over valid POIs: min = %.3f, 25th = %.3f, median = %.3f, 75th = %.3f, max = %.3f' %
(np.min(scaling_factors[use_scaled_2019]), np.percentile(scaling_factors[use_scaled_2019], 25),
np.median(scaling_factors[use_scaled_2019]), np.percentile(scaling_factors[use_scaled_2019], 75),
np.max(scaling_factors[use_scaled_2019])))
for cat, alpha in category2alpha.items():
in_cat = poi_categories == cat
assert np.sum(in_cat) > 0 # sanity check to make sure category names aren't misspelled
scaling_factors[in_cat] = alpha