Skip to content

Commit 5e70c8b

Browse files
author
piotrj
committed
optimisations, extension statistics in record header
1 parent 1f79570 commit 5e70c8b

File tree

3 files changed

+228
-126
lines changed

3 files changed

+228
-126
lines changed

src/core.py

+76-18
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,8 @@ def scan_rec(self, path, scan_like_data,filenames_set,check_dev=True,dev_call=No
386386
self_scan_rec = self.scan_rec
387387

388388
filenames_set_add = filenames_set.add
389+
self_header_ext_stats = self.header.ext_stats
390+
self_header_ext_stats_size = self.header.ext_stats_size
389391
try:
390392
with scandir(path) as res:
391393

@@ -402,7 +404,10 @@ def scan_rec(self, path, scan_like_data,filenames_set,check_dev=True,dev_call=No
402404

403405
is_dir,is_file,is_symlink = entry.is_dir(),entry.is_file(),entry.is_symlink()
404406

405-
self.ext_statistics[pathlib_Path(entry).suffix]+=1
407+
ext=pathlib_Path(entry).suffix
408+
409+
if is_file:
410+
self_header_ext_stats[ext]+=1
406411

407412
self.info_line_current = entry_name
408413

@@ -450,6 +455,7 @@ def scan_rec(self, path, scan_like_data,filenames_set,check_dev=True,dev_call=No
450455
else:
451456
has_files = False
452457
size = int(stat_res.st_size)
458+
self_header_ext_stats_size[ext]+=size
453459

454460
local_folder_size += size
455461

@@ -478,7 +484,8 @@ def scan(self,cde_list,check_dev=True):
478484

479485
self.header.sum_size = 0
480486

481-
self.ext_statistics=defaultdict(int)
487+
self.header.ext_stats=defaultdict(int)
488+
self.header.ext_stats_size=defaultdict(int)
482489
self.scan_data={}
483490

484491
#########################
@@ -508,9 +515,6 @@ def scan(self,cde_list,check_dev=True):
508515

509516
self.info_line = ''
510517

511-
#for ext,stat in sorted(self.ext_statistics.items(),key = lambda x : x[1],reverse=True):
512-
# print(ext,stat)
513-
514518
def prepare_customdata_pool_rec(self,scan_like_data,parent_path):
515519
scan_path = self.header.scan_path
516520
self_prepare_customdata_pool_rec = self.prepare_customdata_pool_rec
@@ -602,6 +606,13 @@ def threaded_cde(timeout_semi_list):
602606
time_start_all = perf_counter()
603607

604608
aborted_string = 'Custom data extraction was aborted.'
609+
610+
files_cde_errors_quant = defaultdict(int)
611+
612+
files_cde_quant = 0
613+
files_cde_size = 0
614+
files_cde_size_extracted = 0
615+
605616
for (scan_like_list,subpath,rule_nr,size) in self.customdata_pool.values():
606617

607618
self.killed=False
@@ -629,9 +640,7 @@ def threaded_cde(timeout_semi_list):
629640
subprocess = uni_popen(command,shell)
630641
timeout_semi_list[0]=(timeout_val,subprocess)
631642
except Exception as re:
632-
#print('threaded_cde error:',re)
633-
subprocess = None
634-
timeout_semi_list[0]=(timeout_val,subprocess)
643+
timeout_semi_list[0]=None
635644
returncode=201
636645
output = str(re)
637646
else:
@@ -655,20 +664,21 @@ def threaded_cde(timeout_semi_list):
655664
output_list_append('Killed.')
656665

657666
output = '\n'.join(output_list).strip()
667+
if not output:
668+
returncode=203
658669

659670
#####################################
660671

661672
time_end = perf_counter()
662673
customdata_stats_time[rule_nr]+=time_end-time_start
663674

664675
if returncode or self.killed or aborted:
665-
self_header.files_cde_errors_quant[rule_nr]+=1
666-
self_header.files_cde_errors_quant_all+=1
676+
files_cde_errors_quant[rule_nr]+=1
667677

668678
if not aborted:
669-
self_header.files_cde_quant += 1
670-
self_header.files_cde_size += size
671-
self_header.files_cde_size_extracted += asizeof(output)
679+
files_cde_quant += 1
680+
files_cde_size += size
681+
files_cde_size_extracted += asizeof(output)
672682

673683
new_elem={}
674684
new_elem['cd_ok']= bool(returncode==0 and not self.killed and not aborted)
@@ -694,6 +704,13 @@ def threaded_cde(timeout_semi_list):
694704

695705
time_end_all = perf_counter()
696706

707+
self_header.files_cde_errors_quant=files_cde_errors_quant
708+
self_header.files_cde_errors_quant_all = sum(files_cde_errors_quant.values())
709+
710+
self_header.files_cde_quant = files_cde_quant
711+
self_header.files_cde_size = files_cde_size
712+
self_header.files_cde_size_extracted = files_cde_size_extracted
713+
697714
customdata_stats_time_all[0]=time_end_all-time_start_all
698715
sys.exit() #thread
699716

@@ -709,7 +726,7 @@ def threaded_cde(timeout_semi_list):
709726
kill_subprocess(subprocess)
710727
self.killed=True
711728
else:
712-
sleep(0.2)
729+
sleep(0.1)
713730

714731
cde_thread.join()
715732

@@ -1142,6 +1159,28 @@ def prepare_info(self):
11421159
info_list.append(' ' + ' '.join(line_list))
11431160
self.txtinfo_basic = self.txtinfo_basic + f'\n\n{loaded_fs_info}\n{loaded_cd_info}'
11441161

1162+
try:
1163+
longest = max({len(ext) for ext in self.header.ext_stats})+2
1164+
1165+
sublist=[]
1166+
for ext,ext_stat in sorted(self.header.ext_stats.items(),key = lambda x : x[1],reverse=True):
1167+
sublist.append(f'{ext.ljust(longest)} {fnumber(ext_stat).rjust(12)} {bytes_to_str(self.header.ext_stats_size[ext]).rjust(12)}')
1168+
info_list.append('')
1169+
info_list.append('Files extensions statistics by quantity:')
1170+
info_list.append('========================================')
1171+
info_list.extend(sublist)
1172+
1173+
sublist_size=[]
1174+
for ext,ext_stat in sorted(self.header.ext_stats_size.items(),key = lambda x : x[1],reverse=True):
1175+
sublist_size.append(f'{ext.ljust(longest)} {bytes_to_str(self.header.ext_stats_size[ext]).rjust(12)} {fnumber(self.header.ext_stats[ext]).rjust(12)}')
1176+
info_list.append('')
1177+
info_list.append('Files extensions statistics by sum size:')
1178+
info_list.append('========================================')
1179+
info_list.extend(sublist_size)
1180+
except Exception as se:
1181+
#print(se)
1182+
pass
1183+
11451184
self.txtinfo = '\n'.join(info_list)
11461185

11471186
def has_cd(self):
@@ -1485,20 +1524,39 @@ def find_items_in_records(self,
14851524

14861525
records_to_process.sort(reverse = True,key = lambda x : x.header.quant_files)
14871526

1527+
params = (size_min,size_max,
1528+
t_min,t_max,
1529+
find_filename_search_kind,name_expr,name_case_sens,
1530+
find_cd_search_kind,cd_expr,cd_case_sens,
1531+
filename_fuzzy_threshold,cd_fuzzy_threshold)
1532+
1533+
searchinfofile = sep.join([self.db_dir,'searchinfo'])
1534+
try:
1535+
with open(searchinfofile, "wb") as f:
1536+
f.write(ZstdCompressor(level=8,threads=1).compress(dumps(params)))
1537+
except Exception as e:
1538+
print(e)
1539+
14881540
record_commnad_list={}
14891541
is_frozen = bool(getattr(sys, 'frozen', False))
14901542

14911543
for record_nr,record in enumerate(records_to_process):
1544+
curr_command_list = record_commnad_list[record_nr] = []
1545+
14921546
if windows:
14931547
if is_frozen:
1494-
curr_command_list = record_commnad_list[record_nr] = ['record.exe', 'load',record.file_path]
1548+
curr_command_list.append('record.exe')
14951549
else:
1496-
curr_command_list = record_commnad_list[record_nr] = ['python','src\\record.py', 'load',record.file_path]
1550+
curr_command_list.extend(['python','src\\record.py'])
14971551
else:
14981552
if is_frozen:
1499-
curr_command_list = record_commnad_list[record_nr] = ['./record', 'load',record.file_path]
1553+
curr_command_list.append('./record')
15001554
else:
1501-
curr_command_list = record_commnad_list[record_nr] = ['python3','./src/record.py', 'load',record.file_path]
1555+
curr_command_list.extend(['python3','./src/record.py'])
1556+
1557+
curr_command_list.extend(['search',record.file_path])
1558+
1559+
curr_command_list.append(searchinfofile)
15021560

15031561
if t_min:
15041562
curr_command_list.extend( ['--timestamp_min',str(t_min) ] )

src/librer.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -3096,7 +3096,9 @@ def scan(self,compression_level):
30963096

30973097
self.last_dir = path_to_scan_from_entry
30983098

3099-
new_record = librer_core.create(self.scan_label_entry_var.get(),path_to_scan_from_entry)
3099+
new_label = self.scan_label_entry_var.get()
3100+
3101+
new_record = librer_core.create(new_label,path_to_scan_from_entry)
31003102

31013103
self.main_update()
31023104

@@ -3210,6 +3212,12 @@ def scan(self,compression_level):
32103212

32113213
#############################
32123214

3215+
try:
3216+
with open(sep.join([DATA_DIR,'scaninfo']), "wb") as f:
3217+
f.write(ZstdCompressor(level=8,threads=1).compress(dumps([new_label,path_to_scan_from_entry,check_dev,cde_list])))
3218+
except Exception as e:
3219+
print(e)
3220+
32133221
scan_thread=Thread(target=lambda : new_record.scan(tuple(cde_list),check_dev),daemon=True)
32143222
scan_thread.start()
32153223
scan_thread_is_alive = scan_thread.is_alive

0 commit comments

Comments (0)