-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfile-size-histogram
executable file
·445 lines (353 loc) · 16.4 KB
/
file-size-histogram
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#!/usr/bin/python3
import os, sys, math
import numpy
def sfloat_na(f, removetrail=1, digits=None, extradigits=0, estyleabove=100000):
    """Format `f` like sfloat(), but without any alignment padding.

    Calls sfloat() with fixedwidth=0 and aligndigit=0, passes the remaining
    arguments through unchanged, and strips surrounding whitespace from the
    result.
    """
    formatted = sfloat(
        f,
        fixedwidth=0,
        aligndigit=0,
        removetrail=removetrail,
        digits=digits,
        extradigits=extradigits,
        estyleabove=estyleabove,
    )
    return formatted.strip()
def sfloat(f, fixedwidth=10, aligndigit=4, removetrail=1, digits=None, extradigits=0, estyleabove=100000):
    """Format a number with few digits so that its printed shape hints at its scale.

    Meant for skimming a screen full of numbers: the amount of decimals is
    picked from the magnitude, and the result is padded so that the decimal
    point lands in the same column for most values. This is approximate on
    purpose and is not a significant-digits formatter.

    f            the number to format
    fixedwidth   right-justify the result in a field of this width
                 (values wider than the field are not truncated)
    aligndigit   pad on the right with spaces so that the decimal point is at
                 least this far from the right edge; falsy disables padding
    removetrail  if true, strip trailing zeroes after the decimal point, and
                 the point itself when nothing is left behind it
                 (0.50 -> 0.5, 0.00 -> 0)
    digits       force the number of decimals instead of deriving it from the
                 magnitude; this can break alignment and the fixed width
    extradigits  added to the number of decimals; may be negative, the total
                 is clamped at zero
    estyleabove  magnitudes above this are shown in '6.1e+07' style, and
                 removetrail is then ignored

    Examples (defaults):
        sfloat(6762.73408)   == '  6763    '
        sfloat(18.5608302)   == '    18.6  '
        sfloat(0.05)         == '     0.05 '
        sfloat(0.0717395048) == '     0.072'

    Very small numbers need more room and therefore move out of alignment;
    numbers below the shown precision collapse to 0.
    """
    # Exact type match on purpose: only plain int/float widths are converted.
    if type(fixedwidth) in (int, float):
        fixedwidth = str(int(fixedwidth))

    magnitude = abs(f)

    if magnitude > estyleabove:
        # Kept short so that a fixed width still makes sense.
        ret = '%.1e' % f
        # The mantissa may end in a zero that has to stay.
        removetrail = 0
    else:
        if digits is None:
            # (exclusive upper bound, decimals); anything at or above the
            # last bound is shown as an integer.
            decimals_below = (
                (.00001, 7),
                (.0001, 6),
                (.001, 5),
                (.01, 4),
                (.1, 3),
                (.5, 3),
                (7, 2),
                (100, 1),
            )
            digits = 0
            if magnitude != 0.:
                for upper, decimals in decimals_below:
                    if magnitude < upper:
                        digits = decimals
                        break
        # NOTE(review): the source's indentation was lost; extradigits is
        # assumed to apply to forced digit counts as well -- confirm.
        digits += extradigits
        digits = max(0, digits)  # negative extradigits can push this below zero
        ret = ('%% .%df' % digits) % f

    if removetrail and '.' in ret:
        ret = ret.rstrip('0')
        if ret.endswith('.'):
            ret = ret[:-1]

    if aligndigit:
        if 'e' in ret:
            occupied = len(ret) - ret.rindex('e') - 2
            wanted = aligndigit - 1
        elif '.' in ret:
            occupied = len(ret) - ret.rindex('.')
            wanted = aligndigit
        else:
            # Count from the last non-space character; one more column is
            # reserved to stand in for the missing decimal point.
            occupied = len(ret) - len(ret.rstrip(' ')) + 1
            wanted = aligndigit + 1
        if occupied < wanted:
            ret += ' ' * (wanted - occupied)

    if fixedwidth:
        ret = ('%% %ss' % fixedwidth) % ret

    return ret
def prepare(ary, bins=25, sigma=None, percentiles=None, debug=False):
    """Compute histogram data, optionally after cutting off outliers.

    Helper for print_histogram; can be called directly to reuse the same
    sigma / percentile cutoff logic elsewhere.

    ary          numbers to bin; anything numpy.array() accepts. It is
                 flattened before use.
    bins         passed on to numpy.histogram
    sigma        if truthy, keep only values strictly within this many
                 standard deviations of the mean
    percentiles  if truthy (and sigma is not), a pair (low, high) of
                 percentiles; only values strictly between the two
                 percentile values are kept
    debug        if true, print the low and high cutoff values

    Returns a dict with
        'range_full'   (min, max) of all values
        'reduction'    'none', or a description of the cutoff that was applied
        'meanval', 'sigval'   mean and standard deviation of all values
        'counts', 'edges'     as returned by numpy.histogram
    and, when a cutoff was applied,
        'count_data', 'count_selected'   number of values before / after
        'thresholds'                     (low, high) cutoff values
        'range_selected'                 (min, max) of the kept values,
                                         only present if any were kept
        'percentile<p>'                  the value at each requested percentile

    Raises ValueError if a cutoff is requested while all values are equal.
    """
    if not isinstance(ary, numpy.ndarray):
        ary = numpy.array(ary)
    ary = ary.flatten()

    mn = numpy.mean(ary)
    sg = numpy.std(ary)
    lowest = numpy.amin(ary)
    highest = numpy.amax(ary)

    ret = {
        'range_full': (lowest, highest),
        'reduction': 'none',
        'meanval': mn,
        'sigval': sg,
    }

    if sigma or percentiles:
        if lowest == highest:
            raise ValueError("No variance in values (all are ~%s)" % sfloat_na(lowest).lstrip())

        if sigma:
            ret['reduction'] = '%.1f sigma' % sigma
            lowthresh = mn - sg * sigma
            highthresh = mn + sg * sigma
        else:
            fp, sp = percentiles
            ret['reduction'] = '%.1f,%.1f percentiles' % (fp, sp)
            fp_v = numpy.percentile(ary, fp)
            sp_v = numpy.percentile(ary, sp)
            ret['percentile%.1f' % fp] = fp_v
            ret['percentile%.1f' % sp] = sp_v
            lowthresh = fp_v
            highthresh = sp_v

        if debug:
            print(lowthresh, highthresh)

        # Strict comparisons: values equal to a threshold are dropped.
        selected = ary[numpy.logical_and(ary > lowthresh, ary < highthresh)]
        counts, edges = numpy.histogram(selected, bins=bins)
        ret['count_data'] = len(ary)
        ret['count_selected'] = len(selected)
        ret['thresholds'] = (lowthresh, highthresh)
        if len(selected) > 0:
            ret['range_selected'] = (numpy.amin(selected), numpy.amax(selected))
    else:
        counts, edges = numpy.histogram(ary, bins=bins)

    ret['counts'] = counts
    ret['edges'] = edges
    return ret
def kmg(amount, kilo=1000, append='', thresh=15, nextup=0.9, rstrip0=True, extradigits=0, i_for_1024=True):
    """Format an amount with a K/M/G/T/P multiplier for easy skimming.

    Examples:
        kmg(3429873278462)                                     == '3.4T'
        kmg(342987327)                                         == '343M'
        kmg(34298)                                             == '34K'
        '%sB'%kmg(2342342324)                                  == '2.3GB'
        '%sB'%kmg(2342342324, kilo=1024)                       == '2.2GiB'
        '%sB'%kmg(2342342324, kilo=1024, extradigits=1)        == '2.18GiB'
        '%sB'%kmg(19342342324, kilo=1024)                      == '18GiB'
        '%sB'%kmg(19342342324, kilo=1024, extradigits=1)       == '18GiB'  (because of rstrip0)

    amount       the number to format
    kilo         size of a kilo. Decimal (1000) by default, so this is useful
                 beyond bytes; pass 1024 for binary multiples.
    append       text placed between the number and the multiplier letter,
                 mostly meant for an optional space
    thresh       scaled values below this get one decimal (1.3G but 16G).
                 None is replaced by 1000.
    nextup       fraction of the next multiplier at which to switch to it
                 already (700G but 0.96T). None is replaced by 1.0.
    rstrip0      strip trailing zeroes after the decimal point, and the point
                 itself if nothing is left behind it
    extradigits  added to the number of decimals shown for scaled values
                 (rstrip0 can still remove them again)
    i_for_1024   add an 'i' after the multiplier letter when kilo is 1024

    Amounts below nextup*kilo are shown without decimals and without a
    multiplier. Peta is the largest multiplier; anything larger is expressed
    in peta as well.
    """
    if nextup is None:
        nextup = 1.0
    if thresh is None:
        thresh = 1000
    nextup = float(nextup)

    # Fallback for small amounts (and NaN, which fails every comparison).
    showval = amount
    showdigits = 0
    magnitude = abs(amount)

    if magnitude >= nextup * kilo:
        for power, mchar in ((5, 'P'), (4, 'T'), (3, 'G'), (2, 'M'), (1, 'K')):
            csize = kilo ** power
            # '>=' rather than '>': an amount sitting exactly on a boundary
            # (e.g. 900 with the defaults) must still match a multiplier.
            if magnitude >= nextup * csize:
                showval = amount / float(csize)
                # Compare the magnitude so negative amounts round the same
                # way as positive ones.
                if abs(showval) < thresh:
                    showdigits = 1 + extradigits
                else:
                    showdigits = 0 + extradigits
                append += mchar
                if i_for_1024 and kilo == 1024:
                    append += 'i'
                break

    ret = ("%%.%df" % showdigits) % showval
    if rstrip0 and '.' in ret:
        ret = ret.rstrip('0').rstrip('.')
    return ret + append
def kmg_formatter(val, align='l', suffix='B'):
    """Format a large byte size as a right-justified, 10-character-wide string.

    The value goes through kmg() with decimal kilos, then `suffix` is
    appended. `align` is accepted so that callers can pass it, but it is
    currently ignored: the result is always right-justified.
    """
    label = kmg(val, kilo=1000) + suffix
    return label.rjust(10)
def machine_epsilon(func=float):
    """Return the largest power of 1/2 that leaves 1 unchanged when added to it.

    `func` is the numeric type to probe. For Python's 64-bit float the result
    is 1.1102230246251565e-16 (that is, 2**-53).
    """
    one = func(1)
    two = func(2)
    candidate = func(1)
    # Keep halving until adding the candidate to 1 no longer changes it.
    while one + func(candidate) != one:
        candidate = func(candidate) / two
    return candidate
def digits_for_range(ary):
    """Suggest a number of decimals for printing the values in `ary`.

    ary is a numpy array. Returns 0 for integer dtypes and when the maximum
    is a whole number; otherwise a heuristic count based on how small the
    minimum and maximum are, which is never lower than 2.
    """
    if ary.dtype.kind in 'ui':
        return 0

    # Smallest difference worth distinguishing from zero. This has to be a
    # number: comparing against a function object raises TypeError.
    eps = float(numpy.finfo(float).eps)

    ma = numpy.amax(ary)
    if numpy.mod(ma, 1) < eps:
        return 0
    mi = numpy.amin(ary)

    def decimals_for(value):
        # Grows as the magnitude drops below 1; the 0.52 factor is an
        # empirical choice. Zero is special-cased to keep log() defined.
        magnitude = abs(value)
        if magnitude < eps:
            return 0
        return max(0, 0.52 * -math.log(magnitude))

    return round(min(decimals_for(mi), decimals_for(ma)) + 2)
def print_histogram(ary, bins=25,
                    sigma=None, percentiles=None,
                    histwidth=None, color_by_sigma=False, use_unicode=False,
                    formatter=kmg_formatter):
    """Print a text histogram of `ary` to stdout.

    Does everything from data to printing; use prepare() directly if you
    only want the numbers.

    ary             values to bin, anything prepare() accepts
    bins            passed on to prepare()
    sigma           if a number, only values within that many standard
                    deviations of the mean are binned. 2.5 or 3 is usually
                    decent to remove outliers but keep most useful data.
    percentiles     alternative to sigma: a 2-tuple of percentiles to cut at.
                    This can break on very sharply peaked data, where both
                    percentile values may be basically identical and so
                    select no data.
    histwidth       total line width to aim for; defaults to 80. The text
                    left of the bars is assumed to take 40 columns of it.
    color_by_sigma  accepted but currently has no effect
    use_unicode     if true, finish each bar with a partial block character,
                    which makes its length accurate to eighths of a column.
                    Off by default because not every terminal can show it.
    formatter       called as formatter(edge, align='r') and
                    formatter(edge, align='l') to render the bin edges

    Raises ValueError when the shown value range is 1e-5 or narrower.
    Prints "no data" and returns when every bin is empty.
    """
    histdata = prepare(ary, bins=bins, sigma=sigma, percentiles=percentiles)
    if histwidth is None:
        histwidth = 80.

    counts = histdata['counts']
    edges = histdata['edges']
    mi, ma = histdata['range_full']
    print(' Full value range: (%s, %s)' % (mi, ma))

    if histdata['reduction'] != 'none' and histdata['count_selected'] > 0:
        mi, ma = histdata['range_selected']
        print(' Showing value range: (%f, %f) based on %s' % (
            mi,
            ma,
            histdata['reduction'],
        ))

    rng = ma - mi
    if rng <= 1.0E-5:  # in case we get handed a 32bit float numpy array
        raise ValueError("too little variance in values (all are %s~%s)" % (mi, ma))

    mc = float(max(counts))
    if mc == 0.0:
        # Happens when the cutoff selected no values at all; without this
        # guard both the log10 below and the bar scaling would fail.
        print("no data")
        return

    # Left-aligned partial blocks, from one eighth to seven eighths wide.
    FULL = u'\u2588'
    bitblocks = [u'\u258F', u'\u258E', u'\u258D', u'\u258C',
                 u'\u258B', u'\u258A', u'\u2589']

    print(" value range count")

    bar_area = histwidth - 40  # the width of the text left of the bars
    # The count column is at least 7 wide, and wide enough for the largest count.
    fs = '%%12s .. %%12s %%%ds |%%s%%s%%s%%s' % max(7, round(.1 + math.log(mc, 10)))

    for i, c in enumerate(counts):
        # Placeholders for color escape sequences around the bar.
        before = ''
        after = ''
        pw = bar_area * (c / mc)

        bitblock = ''
        if use_unicode:
            eighths = int(8 * (pw % 1))
            if eighths > 0:
                bitblock = bitblocks[eighths - 1]

        print(fs % (
            formatter(edges[i], align='r'),
            formatter(edges[i + 1], align='l'),
            c,
            before,
            FULL * int(pw),
            bitblock,
            after,
        ))
def collect_file_sizes(paths):
    """Return the sizes of all files found by walking each directory in `paths`.

    Sizes come from os.lstat, so a symbolic link counts with the size of the
    link itself. Files that cannot be examined are reported on stderr and
    skipped rather than aborting the whole scan.
    """
    sizes = []
    for top in paths:
        for dirpath, _dirnames, filenames in os.walk(top):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    sizes.append(os.lstat(full_path).st_size)
                except OSError as err:
                    # e.g. removed while we were walking, or permission denied
                    print("Skipping %s: %s" % (full_path, err), file=sys.stderr)
    return sizes


def main(argv=None):
    """Walk the directories named in `argv` and print a histogram of file sizes.

    argv defaults to the command-line arguments (sys.argv[1:]).
    """
    paths = sys.argv[1:] if argv is None else argv
    if not paths:
        print("Need the names of some directories to walk")
        sys.exit(0)

    sizes = collect_file_sizes(paths)
    if not sizes:
        print("Didn't see any files")
        sys.exit(0)

    print_histogram(sizes, formatter=kmg_formatter)


# Only run when executed as a script, so importing the module has no side effects.
if __name__ == '__main__':
    main()