# External dependencies:
import numpy as np
import scipy.stats
from scipy.sparse import coo_matrix
# From the Python standard library:
import bisect  # https://docs.python.org/3/library/bisect.html
import csv
import os
from empirical_distribution_outdegree import sample_outdegree, sample_empirical_outdegree
"""
This algorithm generates a network according to a preferential attachment mechanism with fitness and aging. It uses the continuous-time branching process (CTBP) formulation of Garavaglia2019 that grows a tree which is finally collapsed into a graph. Every node in the tree is a stochastic process with jump times generated by update_fn, dependent on the in-degree and age of the node. When the tree is fully generated, it can be collapsed into a graph as described in Garavaglia2019. To do this correctly birth order should be kept for each node. This is a list of nodes ordered by the earliest upcoming birth times in the tree.
The python implementation uses a number of optimizations that greatly improve speed on large graphs:
- A heap (with Heapq) is used to maintain the birth order. TreeNode implements __lt__ to do a sorted insert into the heap. This makes the algorithm scale loglinear O(nlog(M)) in the number of edges M.
"""
class TreeNode:
    # __slots__ saves a lot of RAM when the graph is very large.
    # Without it there is the overhead of a per-instance dict: ~100B per graph edge or tree node.
    # See: https://book.pythontips.com/en/latest/__slots__magic.html
    __slots__ = ("graph_idx", "arrival_time", "stationary", "rescaled", "next_arrival", "fitness", "pa_b", "num_children")

    def __init__(self, graph_idx, fitness, pa_b, arrival_time=0.0, stationary=0.0, rescaled=0.0, num_children=0):
        self.graph_idx = graph_idx        # The graph node this tree node belongs to in the collapsed graph
        self.arrival_time = arrival_time  # When the node was created (in rescaled (calendar) time)
        self.stationary = stationary      # Current stationary time (age without the age effect)
        self.rescaled = rescaled          # Rescaled time of the process (age in calendar time)
        self.next_arrival = arrival_time  # Time of the next birth event; set properly by update()
        self.fitness = fitness            # fitness
        self.pa_b = pa_b                  # the b parameter in the preferential attachment function "a*k + b"
        self.num_children = num_children  # Number of children in the BP tree.

    def update(self, t, rescaled):
        self.stationary = t
        self.next_arrival = self.arrival_time + rescaled

    def __lt__(self, other):
        # Required so that bisect.insort can keep the birth order sorted by next arrival time.
        return self.next_arrival < other.next_arrival
'''
Returns the appropriate rate function out of four possible combinations (with/without preferential attachment,
with/without aging). Note that if fitness is disabled, all eta's will be 1, so they do not break the method.
@param pa_a: the a parameter of the preferential attachment function f(k) = a*k + b. If None, no preferential attachment is used.
@param age: (G_mu, G_sigma) parameters of the lognormal rescaling. If None, no aging is used.
@returns the rate function
'''
def get_rate_function(pa_a=1, age=(1, 1)):
    if age is None:
        if pa_a is None:
            def inner_fn(k, t, eta, pa_b):
                # next birth event, no preferential attachment, no aging
                rate = eta
                dt = np.random.exponential(scale=rate ** -1)
                t_new = t + dt
                return t_new, t_new
        else:
            def inner_fn(k, t, eta, pa_b):
                # next birth event with preferential attachment, no aging
                rate = (pa_a * k + pa_b) * eta
                dt = np.random.exponential(scale=rate ** -1)
                t_new = t + dt
                return t_new, t_new
    else:  # apply age rescaling
        """The rate function uses the preferential attachment function to calculate a jump time for the
        branching process. The rate depends on the stationary time t, degree k and fitness eta.
        The aging function G(t) gives a time-rescaled version of the process (Garavaglia2019, section 1.9).
        """
        # The scipy version of the LogNormal distribution uses a different parameterization. See:
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html
        log_normal = scipy.stats.lognorm(age[1], scale=np.exp(age[0]))
        if pa_a is None:
            def inner_fn(k, t, eta, pa_b):
                # next birth event, no preferential attachment
                rate = eta
                dt = np.random.exponential(scale=rate ** -1)
                t_new = t + dt
                if t_new <= 1.0:
                    rescaled = log_normal.ppf(t_new)
                else:
                    rescaled = np.inf
                return t_new, rescaled
        else:
            def inner_fn(k, t, eta, pa_b):
                # next birth event with preferential attachment
                rate = (pa_a * k + pa_b) * eta
                dt = np.random.exponential(scale=rate ** -1)
                t_new = t + dt
                # rescale for the aging effect
                if t_new <= 1.0:
                    rescaled = log_normal.ppf(t_new)
                else:
                    rescaled = np.inf
                return t_new, rescaled
    return inner_fn
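
# Illustrative sketch (not part of the original simulation flow): how a rate function returned by
# get_rate_function is typically queried. The parameter values below are made up for demonstration only;
# this helper is defined but never called by the simulator.
def _demo_rate_function():
    # Preferential attachment f(k) = 1*k + b with lognormal(mu=2, sigma=1) aging.
    rate_fn = get_rate_function(pa_a=1, age=(2, 1))
    # Draw the next jump for a node with 3 children, stationary time 0.1, fitness 1.5 and b = 0.5.
    t_new, rescaled = rate_fn(k=3, t=0.1, eta=1.5, pa_b=0.5)
    print(f"new stationary time: {t_new:.3f}, rescaled (calendar) time: {rescaled:.3f}")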
class BirthOrder:
    """
    Maintains the birth order of the nodes in the BP tree.
    Has methods to retrieve nodes belonging to distinct graph indices.
    """
    def __init__(self):
        self.birth_order = []

    def non_empty(self) -> bool:
        return len(self.birth_order) > 0

    def add(self, node):
        bisect.insort(self.birth_order, node)

    def bulk_add(self, nodes):
        for node in nodes:
            bisect.insort(self.birth_order, node)

    def get_number_of_root_nodes(self):
        # TODO make this a maintained state variable
        root_nodes = set()
        for node in self.birth_order:
            if node.graph_idx not in root_nodes:
                root_nodes.add(node.graph_idx)
        return len(root_nodes)

    def pop_next_n_distinct_root_nodes(self, n):
        """
        Returns the next n nodes with distinct graph indices in the birth order.
        If there are fewer than n distinct graph indices, returns all of them.
        """
        root_nodes_yielded = set()
        nodes = []
        for node in self.birth_order:
            if node.graph_idx not in root_nodes_yielded:
                root_nodes_yielded.add(node.graph_idx)
                nodes.append(node)
                if len(nodes) == n:
                    break
        for node in nodes:
            self.birth_order.remove(node)
        return nodes
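
# Illustrative sketch (not in the original code path): how BirthOrder keeps tree nodes sorted by their
# next arrival time and pops the earliest nodes with distinct graph indices. All values are made up;
# this helper is defined but never called by the simulator.
def _demo_birth_order():
    order = BirthOrder()
    for graph_idx, rescaled in [(0, 0.4), (1, 0.1), (1, 0.9), (2, 0.2)]:
        node = TreeNode(graph_idx, fitness=1.0, pa_b=1.0)
        node.update(0.0, rescaled)  # sets next_arrival = arrival_time + rescaled
        order.add(node)
    popped = order.pop_next_n_distinct_root_nodes(2)
    # Earliest distinct graph indices come out first: graph_idx 1 (t=0.1), then graph_idx 2 (t=0.2).
    print([(n.graph_idx, n.next_arrival) for n in popped])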
class BranchingProcess:
    def __init__(self, pa_function, fitness=[], degree=[], pa_b=[]):
        self.fitness = fitness
        self.degree = degree
        self.pa_b = pa_b
        self.birth_order = BirthOrder()
        self.graph_edges = []  # edges in the collapsed graph
        self.graph_nodes = []  # nodes in the collapsed graph
        self.pa_function = pa_function

    def alive(self):
        return self.birth_order.non_empty()

    def seed(self):
        root = self.create_bp_node(0, arrival_time=0.)
        root.update(0, 0)
        self.birth_order.add(root)
    '''
    Generate a graph.
    # optional arguments
    @param save_to = (fp: str, name: str); if given, save the resulting edges and nodes to CSV files.
    @param debug = False; whether to store and print extra information about the simulation.
    '''
    def generate(self, save_to=None, debug=False):
        if save_to:
            assert len(save_to) == 2, "to save the graph, pass save_to=(filepath, name)"
        if debug:
            all_individuals = []
        N = len(self.degree)
        # Initialize the root (this is a dummy node).
        self.seed()
        # Main loop of the algorithm: grow the graph by N nodes.
        for n in range(N):
            if debug:
                print(n)
            # The algorithm stops if all nodes in the branching tree have died out.
            if self.alive():
                # out-degree of the new node
                M = self.degree[n]
                # Get the M collapsed nodes the new node connects to.
                # If there are fewer than M distinct nodes, connect to all of them.
                to_connect_to = self.birth_order.pop_next_n_distinct_root_nodes(M)
                # The arrival time of the new collapsed node is the next arrival of the earliest parent it connects to.
                name_node_arrival_time = to_connect_to[0].next_arrival
            else:
                return False
            # Events that need to be rescheduled after the new node is added.
            to_be_rescheduled = []
            # Add the new node to the collapsed-graph registry.
            self.graph_nodes.append((n, name_node_arrival_time))
            for parent in to_connect_to:
                """A new node is added as a child to a BP node with the earliest arrival time.
                After adding the child, both parent and child calculate new arrival times and re-enter the birth order.
                """
                # Create a child node whose birth time is based on the arrival time of the name node.
                child = self.create_bp_node(n, arrival_time=name_node_arrival_time)
                # Update the stationary times of parent and child and update the birth order.
                # Only alive nodes are added back to the birth order (i.e. stationary time < 1).
                # t is the stationary time, ts is the rescaled stationary time.
                t, ts = self.pa_function(k=child.num_children, t=child.stationary, eta=child.fitness, pa_b=child.pa_b)
                child.update(t, ts)
                if ts < np.inf:  # If the rescaled time reaches infinity the node has died.
                    to_be_rescheduled.append(child)
                parent.num_children += 1
                t, ts = self.pa_function(k=parent.num_children, t=parent.stationary, eta=parent.fitness, pa_b=parent.pa_b)
                parent.update(t, ts)
                if ts < np.inf:  # If the rescaled time reaches infinity the node has died.
                    to_be_rescheduled.append(parent)
                if debug:
                    all_individuals.append((child.graph_idx, parent.next_arrival, child.fitness, child.pa_b, child.arrival_time))
                # Add the corresponding edge in the collapsed graph.
                self.graph_edges.append((child.graph_idx, parent.graph_idx))
            # Add the birth events for all children and parents.
            self.birth_order.bulk_add(to_be_rescheduled)
        if save_to:
            name = save_to[1]
            fp = save_to[0]
            self.save_edges(os.path.join(fp, f"{name}_edges.csv"))
            self.save_nodes(os.path.join(fp, f"{name}_nodes.csv"))
        if debug:
            # Print the diagnostics of the simulation.
            print("==========DIAGNOSTIC OUTPUT============")
            print(f"In total, {len(all_individuals)} individuals were born.")
            print(f"From the initial out-degrees, we expected {sum(self.degree)}.")
            print(f"There were {len(self.graph_edges)} edges.")
            # filename = f"{name}_all_individuals.csv"
            # print(f"Saving a log of all individuals to {filename} in {fp}")
            # header = ["name_node", "actual birth time", "fitness", "PA_B", "stored birth time"]
            # with open(os.path.join(fp, filename), "w", newline="") as f:
            #     writer = csv.writer(f)
            #     writer.writerow(header)
            #     writer.writerows(all_individuals)
            # print("=======================================")
        # The simulation was successful.
        return True
    def create_bp_node(self, graph_idx, arrival_time=0.0):
        """Create a new branching-process node, corresponding to node graph_idx in the collapsed graph."""
        return TreeNode(
            graph_idx=graph_idx,
            arrival_time=arrival_time,
            fitness=self.fitness[graph_idx],
            pa_b=self.pa_b[graph_idx]
        )

    def as_sparse(self):
        """Return the collapsed graph as a sparse adjacency matrix."""
        edges = self.graph_edges
        if len(edges) == 0:
            return coo_matrix((0, 0))
        i, j = zip(*edges)
        ones = [1 for _ in i]
        N = len(self.graph_nodes)
        return coo_matrix((ones, (i, j)), shape=(N, N))
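
    # Illustrative sketch (an assumed helper, not part of the original class interface): the sparse matrix
    # returned by as_sparse() has one row per source (child) node and one column per target (parent) node,
    # so column sums give the in-degrees of the collapsed graph.
    def _demo_in_degrees(self):
        adj = self.as_sparse().tocsr()
        in_degrees = np.asarray(adj.sum(axis=0)).ravel()  # column sums = in-degrees
        return in_degrees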
    def save_edges(self, path):
        """Save the edges to CSV."""
        with open(path, 'w', newline="") as f:
            writer = csv.writer(f)
            for edge in self.graph_edges:
                writer.writerow(edge)

    def save_nodes(self, path):
        """Save the nodes to CSV."""
        with open(path, 'w', newline="") as f:
            writer = csv.writer(f)
            for node in self.graph_nodes:
                writer.writerow(node)
'''
Generate a graph from scratch. All @pre's are asserted in code, so the method is reasonably robust.
Note that the super-criticality condition is NOT checked.
# required arguments
@param N - size of the final graph
# optional arguments
@param sample_outdegree_method - function that samples an out-degree sequence of length N
@param pa - preferential attachment parameters
    - pa = None: no preferential attachment is used
    - pa = (a, b): corresponds to the preferential attachment sequence f_k = a*k + b
      @pre: a, b > 0
@param fitness - fitness parameter(s)
    - fitness = None: no fitness is used (= setting fitness to 1 for each vertex)
    - fitness = l: an exponentially distributed fitness with mean 1/l is used
      @pre: l > 0
    - fitness = ("uniform", lo, hi): a uniformly distributed fitness on [lo, hi]
      @pre: lo < hi
    - fitness = (min, tau): a power-law distributed fitness
      @pre: min > 0 AND tau > 1
@param aging
    - aging = None: no aging effect is used
    - aging = (mu, sigma): lognormal aging rescaling
      @pre: sigma > 0
'''
def simulator_from_parameters(N=100, sample_outdegree_method=sample_outdegree, pa=None, fitness=None, aging=None):
    description = f"N={N}, "
    # Sample the degree sequence.
    node_degree = sample_outdegree_method(N)
    # Sample the fitness sequence.
    if fitness is None:
        # No fitness: use a constant fitness of 1.
        description += "Fitness=None, "
        node_fitness = [1.0 for i in range(N)]
    elif isinstance(fitness, (int, float)):
        # Use exponential fitness.
        assert fitness > 0, "exponential parameter has to be larger than 0"
        description += f"Fitness=Exponential(lambda={fitness}), "
        node_fitness = list(scipy.stats.expon.rvs(scale=1/fitness, size=N))
    elif len(fitness) == 3:
        # Use uniform fitness, passed as ("uniform", lo, hi).
        assert fitness[0] == "uniform", "error in parameters for fitness"
        assert fitness[1] < fitness[2], "bounds for uniform fitness are incorrect"
        description += f"Fitness=Uniform({fitness[1]},{fitness[2]}), "
        node_fitness = list(scipy.stats.uniform.rvs(fitness[1], scale=(fitness[2] - fitness[1]), size=N))
    else:
        # Use power-law fitness via inverse-transform sampling.
        # https://stackoverflow.com/questions/17882907/python-scipy-stats-powerlaw-negative-exponent
        # This results in P(x) ∝ x^-τ, with τ = fitness[1] and minimum value fitness[0].
        assert fitness[1] > 1 and fitness[0] > 0, "incorrect power-law fitness parameters"
        description += f"Fitness=Powerlaw(min={fitness[0]}, τ={fitness[1]}), "
        xmin = fitness[0]
        def powerlaw(r):
            # fitness[1] is the negative exponent!
            return xmin * (1 - r) ** (-1 / (fitness[1] - 1))
        node_fitness = [powerlaw(r) for r in np.random.rand(N)]
    if aging is None:
        description += "Aging=None, "
    else:
        # Use the rate function with lognormal aging.
        assert aging[1] > 0, "σ in lognormal aging has to be positive"
        description += f"Aging=LogNormal(μ={aging[0]}, σ={aging[1]}), "
    if pa is None:
        # No PA: a = 0 and b_node = 1/m_i, so the effect cancels out.
        description += "PA=None."
        nodes_pa_b = [1/node_degree[i] for i in range(N)]
    else:
        assert (pa[0] > 0 and pa[1] > 0), "In f_k = a*k + b, a and b have to be > 0"
        description += f"PA=({pa[0]}*k + {pa[1]})."
        # Distribute the PA b parameter over the tree copies of each node.
        nodes_pa_b = [pa[1]/node_degree[i] for i in range(N)]
        pa = pa[0]
    # get_rate_function selects the appropriate rate function.
    rate_fn = get_rate_function(pa, aging)
    # Create a CTBP.
    ctbp = BranchingProcess(rate_fn, fitness=node_fitness, degree=node_degree, pa_b=nodes_pa_b)
    # print(description)
    return ctbp
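
# Illustrative sketch (an assumed convenience wrapper, not part of the original workflow): the sampling in
# simulator_from_parameters and get_rate_function draws from NumPy's global random state, so seeding it
# before constructing the simulator makes runs reproducible, provided sample_outdegree_method draws from
# the same global state.
def _reproducible_simulator(seed=42, **kwargs):
    np.random.seed(seed)
    return simulator_from_parameters(**kwargs)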
if __name__ == "__main__":
    # ctbp = simulator_from_parameters(N=20000, sample_outdegree_method=sample_empirical_outdegree, fitness=0.7, aging=(2, 1))
    ctbp = simulator_from_parameters(N=20000, sample_outdegree_method=sample_empirical_outdegree, fitness=0.7)
    # ctbp = simulator_from_parameters(N=2000, sample_outdegree_method=sample_empirical_outdegree, fitness=(0.5, 2.1), aging=(2, 1))
    # ctbp = simulator_from_parameters(N=1000, sample_outdegree_method=sample_empirical_outdegree, pa=(2, 3), fitness=(0.5, 2.1), aging=(2, 1))
    if ctbp.generate(debug=False):
        ctbp.save_edges('ctbp_edges.csv')
        ctbp.save_nodes('ctbp_nodes.csv')
    else:
        print('failed')