-
Notifications
You must be signed in to change notification settings - Fork 1
/
cmphash.py
executable file
·83 lines (77 loc) · 3.26 KB
/
cmphash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/pypy -O
import rollsum
bc = 1000000
K = 1024
def dotest(src, bsize, bcount, sum):
f = open('data/%s.dat' % src, 'rb')
table = rollsum.HashTable(2**32, lambda k: k)
clust = rollsum.HashTable(2**16, lambda k: (k & (2**20 - 1)) >> 4)
datastats = rollsum.runtest(sum, f, bsize, bcount, (table, clust))
#print "%-52s: %s %s" % (sum, table, clust)
colls = table.stats()
clust = clust.stats()
collsw, clustw = colls.weight, clust.weight
# Score is geometric mean of collision and cluster performance,
# Weighted by their digits of accuracy.
score = (colls.perf**collsw * clust.perf**clustw)**(1/(collsw + clustw))
return src, bsize, bcount, sum, colls, clust, score
def fmtstats(stats):
return '%s/%s/%8.6f/%8.6f' % (stats.min, stats.max, stats.colls, stats.perf)
def printtable(results):
f = '='
hdr = '%3s %4s %7s %-52s %-25s %-25s %8s'
fmt = '%3s %4s %7s %-52s %-25s %-25s %8.6f'
frame = hdr % (3*f, 4*f, 7*f, 52*f, 25*f, 25*f, 8*f)
print frame
print hdr % ('dat', 'bs', 'count', 'rollsum', 'hash min/max/col/perf', 'cluster min/max/col/perf', 'score')
print frame
for src, bsize, bcount, sum, table, clust, score in results:
if bsize >= 1024:
bsize = '%sK' % (bsize/K)
bcount = table.count
table = fmtstats(table)
clust = fmtstats(clust)
print fmt % (src, bsize, bcount, sum, table, clust, score)
print frame
datas = ('csv', 'zip')
sizes = (16, 32, 1*K, 4*K, 16*K, 64*K) # 256*K)
ans = []
# Test for different sources.
for src in datas:
# Test for different blocksize.
for blocksize in sizes:
# test Gear variants with blocksize<=32 only.
if blocksize <= 32:
# Test Gear variants with different mappings.
for mapfunc in (ord, rollsum.mul, rollsum.mix, rollsum.ipfs):
sum = rollsum.Gear(map=mapfunc)
ans.append(dotest(src, blocksize, bc, sum))
sum = rollsum.RGear(map=mapfunc)
ans.append(dotest(src, blocksize, bc, sum))
sum = rollsum.UGear(map=mapfunc)
ans.append(dotest(src, blocksize, bc, sum))
# Test MGear with ord mapping only.
sum = rollsum.MGear(map=ord)
ans.append(dotest(src, blocksize, bc, sum))
# Test rsync rollsum seed and offs values.
sum = rollsum.RollSum(seed=0, offs=31, base=0x10000, map=ord)
ans.append(dotest(src, blocksize, bc, sum))
# Test RabinKarp with seed and offs.
sum = rollsum.RabinKarp(seed=1, mult=0x08104225, map=ord)
ans.append(dotest(src, blocksize, bc, sum))
sum = rollsum.RabinKarp(offs=1, mult=0x08104225, map=ord)
ans.append(dotest(src, blocksize, bc, sum))
# Test CyclicPoly with different mappings.
for mapfunc in (ord, rollsum.pow, rollsum.mul, rollsum.mix, rollsum.ipfs):
sum = rollsum.CyclicPoly(map=mapfunc)
ans.append(dotest(src, blocksize, bc, sum))
# Test RollSum and RabinKarp with different mappings.
for mapfunc in (ord, rollsum.pow, rollsum.mul):
sum = rollsum.RollSum(seed=1, offs=0, base=0x10000, map=mapfunc)
ans.append(dotest(src, blocksize, bc, sum))
for mult in (0xfffffffd, 0x55555555, 0x08104225, 0x41c64e6d):
sum = rollsum.RabinKarp(mult=mult, map=mapfunc)
ans.append(dotest(src, blocksize, bc, sum))
# Sort results by src, bsize, and then score.
ans = sorted(ans, key=lambda a: (a[0:2], -a[-1]))
printtable(ans)