Skip to content

Commit 6376e49

Browse files
committed
attempt to improve reflist and deflist performance
1 parent 8b339a4 commit 6376e49

File tree

2 files changed

+59
-31
lines changed

2 files changed

+59
-31
lines changed

elixir/data.py

Lines changed: 51 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -64,24 +64,33 @@ class DefList:
6464
a line number and a file family.
6565
Also stores in which families the ident exists for faster tests.'''
6666
def __init__(self, data=b'#'):
67-
self.data, self.families = data.split(b'#')
67+
data, self.families = data.split(b'#')
68+
self.entries = [self.decode_entry(d) for d in deflist_regex.findall(data)]
69+
self.sorted = False
70+
71+
def decode_entry(self, entry):
72+
id = int(entry[0])
73+
type = defTypeR [entry[1].decode()]
74+
line = int(entry[2])
75+
family = entry[3].decode()
76+
return id, type, line, family
77+
78+
def encode_entry(self, entry):
79+
return str(entry[0]) + defTypeD[entry[1]] + str(entry[2]) + entry[3]
6880

6981
def iter(self, dummy=False):
7082
# Get all elements in a list of sublists and sort them
71-
entries = deflist_regex.findall(self.data)
72-
entries.sort(key=lambda x:int(x[0]))
73-
for id, type, line, family in entries:
74-
id = int(id)
75-
type = defTypeR [type.decode()]
76-
line = int(line)
77-
family = family.decode()
83+
if not self.sorted:
84+
self.entries.sort(key=lambda x:int(x[0]))
85+
self.sorted = True
86+
87+
for id, type, line, family in self.entries:
7888
yield id, type, line, family
7989
if dummy:
8090
yield maxId, None, None, None
8191

8292
def exists(self, idx, line_num):
83-
entries = deflist_regex.findall(self.data)
84-
for id, _, line, _ in entries:
93+
for id, _, line, _ in self.entries:
8594
if id == idx and int(line) == line_num:
8695
return True
8796

@@ -90,14 +99,18 @@ def exists(self, idx, line_num):
9099
def append(self, id, type, line, family):
91100
if type not in defTypeD:
92101
return
93-
p = str(id) + defTypeD[type] + str(line) + family
94-
if self.data != b'':
95-
p = ',' + p
96-
self.data += p.encode()
102+
103+
self.sorted = False
104+
self.entries.append((id, type, line, family))
97105
self.add_family(family)
98106

99107
def pack(self):
100-
return self.data + b'#' + self.families
108+
if not self.sorted:
109+
self.entries.sort(key=lambda x:int(x[0]))
110+
self.sorted = True
111+
112+
data = ",".join(self.encode_entry(entry) for entry in self.entries)
113+
return data.encode() + b'#' + self.families
101114

102115
def add_family(self, family):
103116
family = family.encode()
@@ -110,7 +123,7 @@ def get_families(self):
110123
return self.families.decode().split(',')
111124

112125
def get_macros(self):
113-
return deflist_macro_regex.findall(self.data.decode()) or ''
126+
return [entry[3] for entry in self.entries if entry[1] == 'macro']
114127

115128
class PathList:
116129
'''Stores associations between a blob ID and a file path.
@@ -139,25 +152,36 @@ class RefList:
139152
and the corresponding family.'''
140153
def __init__(self, data=b''):
141154
self.data = data
155+
self.entries = [self.decode_entry(x.split(b':')) for x in self.data.split(b'\n')[:-1]]
156+
self.sorted = False
157+
158+
def decode_entry(self, k):
159+
return (int(k[0].decode()), k[1].decode(), k[2].decode())
142160

143161
def iter(self, dummy=False):
144162
# Split all elements in a list of sublists and sort them
145-
entries = [x.split(b':') for x in self.data.split(b'\n')[:-1]]
146-
entries.sort(key=lambda x:int(x[0]))
147-
for b, c, d in entries:
148-
b = int(b.decode())
149-
c = c.decode()
150-
d = d.decode()
163+
if not self.sorted:
164+
self.sorted = True
165+
self.entries.sort(key=lambda x:int(x[0]))
166+
167+
for b, c, d in self.entries:
151168
yield b, c, d
152169
if dummy:
153170
yield maxId, None, None
154171

155172
def append(self, id, lines, family):
156-
p = str(id) + ':' + lines + ':' + family + '\n'
157-
self.data += p.encode()
173+
self.sorted = False
174+
self.entries.append((id, lines, family))
158175

159176
def pack(self):
160-
return self.data
177+
if not self.sorted:
178+
self.sorted = True
179+
self.entries.sort(key=lambda x:int(x[0]))
180+
181+
result = ""
182+
for id, lines, family in self.entries:
183+
result += str(id) + ":" + lines + ":" + family + "\n"
184+
return result.encode()
161185

162186
class BsdDB:
163187
def __init__(self, filename, readonly, contentType, shared=False, cachesize=None):
@@ -230,16 +254,14 @@ def exists(self, key):
230254
if key in self.cache:
231255
return True
232256

233-
key = autoBytes(key)
234-
return self.db.exists(key)
257+
return self.db.exists(autoBytes(key))
235258

236259
def get(self, key):
237260
if key in self.cache:
238261
self.cache.move_to_end(key)
239262
return self.cache[key]
240263

241-
key = autoBytes(key)
242-
p = self.db.get(key)
264+
p = self.db.get(autoBytes(key))
243265
if p is None:
244266
return None
245267
p = self.ctype(p)

elixir/update.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -306,7 +306,10 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool):
306306
logger.info("docs done")
307307

308308
if dts_comp_support:
309-
for result in pool.imap_unordered(get_comps, idxes, chunksize):
309+
comp_idxes = [idx for idx in idxes if getFileFamily(idx[2]) not in (None, 'K', 'M')]
310+
comp_chunksize = int(len(comp_idxes) / cpu_count())
311+
comp_chunksize = min(max(1, comp_chunksize), 100)
312+
for result in pool.imap_unordered(get_comps, comp_idxes, comp_chunksize):
310313
if result is not None:
311314
add_comps(db, *result)
312315

@@ -318,7 +321,10 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool):
318321

319322
logger.info("dts comps docs done")
320323

321-
for result in pool.imap_unordered(get_refs, idxes, chunksize):
324+
ref_idxes = [idx for idx in idxes if getFileFamily(idx[2]) is not None]
325+
ref_chunksize = int(len(ref_idxes) / cpu_count())
326+
ref_chunksize = min(max(1, ref_chunksize), 100)
327+
for result in pool.imap_unordered(get_refs, ref_idxes, ref_chunksize):
322328
if result is not None:
323329
add_refs(db, idx_to_hash_and_filename, result)
324330

0 commit comments

Comments
 (0)