forked from mvdctop/Movie_Data_Capture
-
Notifications
You must be signed in to change notification settings - Fork 0
/
number_parser.py
executable file
·229 lines (201 loc) · 10.5 KB
/
number_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
import os
import re
import sys
import config
import typing
G_spat = re.compile(
"^\w+\.(cc|com|net|me|club|jp|tv|xyz|biz|wiki|info|tw|us|de)@|^22-sht\.me|"
"^(fhd|hd|sd|1080p|720p|4K)(-|_)|"
"(-|_)(fhd|hd|sd|1080p|720p|4K|x264|x265|uncensored|hack|leak)",
re.IGNORECASE)
def get_number(debug: bool, file_path: str) -> str:
"""
从文件路径中提取号码 from number_parser import get_number
"""
filepath = os.path.basename(file_path)
# debug True 和 False 两块代码块合并,原因是此模块及函数只涉及字符串计算,没有IO操作,debug on时输出导致异常信息即可
try:
# 先对自定义正则进行匹配
if config.getInstance().number_regexs().split().__len__() > 0:
for regex in config.getInstance().number_regexs().split():
try:
if re.search(regex, filepath):
return re.search(regex, filepath).group()
except Exception as e:
print(f'[-]custom regex exception: {e} [{regex}]')
file_number = get_number_by_dict(filepath)
if file_number:
return file_number
elif '字幕组' in filepath or 'SUB' in filepath.upper() or re.match(r'[\u30a0-\u30ff]+', filepath):
filepath = G_spat.sub("", filepath)
filepath = re.sub("\[.*?\]","",filepath)
filepath = filepath.replace(".chs", "").replace(".cht", "")
file_number = str(re.findall(r'(.+?)\.', filepath)).strip(" [']")
return file_number
elif '-' in filepath or '_' in filepath: # 普通提取番号 主要处理包含减号-和_的番号
filepath = G_spat.sub("", filepath)
filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # 去除文件名中时间
lower_check = filename.lower()
if 'fc2' in lower_check:
filename = lower_check.replace('--', '-').replace('_', '-').upper()
filename = re.sub("[-_]cd\d{1,2}", "", filename, flags=re.IGNORECASE)
if not re.search("-|_", filename): # 去掉-CD1之后再无-的情况,例如n1012-CD1.wmv
return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group())
file_number = os.path.splitext(filename)
filename = re.search(r'[\w\-_]+', filename, re.A)
if filename:
file_number = str(filename.group())
else:
file_number = file_number[0]
new_file_number = file_number
if re.search("-c", file_number, flags=re.IGNORECASE):
new_file_number = re.sub("(-|_)c$", "", file_number, flags=re.IGNORECASE)
elif re.search("-u$", file_number, flags=re.IGNORECASE):
new_file_number = re.sub("(-|_)u$", "", file_number, flags=re.IGNORECASE)
elif re.search("-uc$", file_number, flags=re.IGNORECASE):
new_file_number = re.sub("(-|_)uc$", "", file_number, flags=re.IGNORECASE)
elif re.search("\d+ch$", file_number, flags=re.I):
new_file_number = file_number[:-2]
return new_file_number.upper()
else: # 提取不含减号-的番号,FANZA CID
# 欧美番号匹配规则
oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
if oumei:
return oumei.group()
try:
return str(
re.findall(r'(.+?)\.',
str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
"['']").replace('_', '-')
except:
return str(re.search(r'(.+?)\.', filepath)[0])
except Exception as e:
if debug:
print(f'[-]Number Parser exception: {e} [{file_path}]')
return None
G_TAKE_NUM_RULES = {
'tokyo.*hot': lambda x: str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()),
'carib': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'),
'1pon|mura|paco': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('-', '_'),
'10mu': lambda x: str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'),
'x-art': lambda x: str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()),
'xxx-av': lambda x: ''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]),
'heydouga': lambda x: 'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0]),
'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0],
'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()),
'mdtm': lambda x: str(re.search(r'mdtm(-|_)(\d{4})', x, re.I).group()),
'caribpr': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'),
}
def get_number_by_dict(filename: str) -> typing.Optional[str]:
try:
for k, v in G_TAKE_NUM_RULES.items():
if re.search(k, filename, re.I):
return v(filename)
except:
pass
return None
class Cache_uncensored_conf:
prefix = None
def is_empty(self):
return bool(self.prefix is None)
def set(self, v: list):
if not v or not len(v) or not len(v[0]):
raise ValueError('input prefix list empty or None')
s = v[0]
if len(v) > 1:
for i in v[1:]:
s += f"|{i}.+"
self.prefix = re.compile(s, re.I)
def check(self, number):
if self.prefix is None:
raise ValueError('No init re compile')
return self.prefix.match(number)
G_cache_uncensored_conf = Cache_uncensored_conf()
# ========================================================================是否为无码
def is_uncensored(number) -> bool:
if re.match(
r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}',
number,
re.I
):
return True
if G_cache_uncensored_conf.is_empty():
G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
return bool(G_cache_uncensored_conf.check(number))
if __name__ == "__main__":
# import doctest
# doctest.testmod(raise_on_error=True)
test_use_cases = (
"ABC-123-C.mp4",
)
def evprint(evstr):
code = compile(evstr, "<string>", "eval")
print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code)))
for t in test_use_cases:
evprint(f'get_number(True, "{t}")')
if len(sys.argv) <= 1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE):
sys.exit(0)
# 使用Everything的ES命令行工具搜集全盘视频文件名作为用例测试number数据,参数为盘符 A .. Z 或带盘符路径
# https://www.voidtools.com/support/everything/command_line_interface/
# ES命令行工具需要Everything文件搜索引擎处于运行状态,es.exe单个执行文件需放入PATH路径中。
# Everything是免费软件
# 示例:
# python.exe .\number_parser.py ALL # 从所有磁盘搜索视频
# python.exe .\number_parser.py D # 从D盘搜索
# python.exe .\number_parser.py D: # 同上
# python.exe .\number_parser.py D:\download\JAVs # 搜索D盘的\download\JAVs目录,路径必须带盘符
# ==================
# Linux/WSL1|2 使用mlocate(Ubuntu/Debian)或plocate(Debian sid)搜集全盘视频文件名作为测试用例number数据
# 需安装'sudo apt install mlocate或plocate'并首次运行sudo updatedb建立全盘索引
# MAC OS X 使用findutils的glocate,需安装'sudo brew install findutils'并首次运行sudo gupdatedb建立全盘索引
# 示例:
# python3 ./number_parser.py ALL
import subprocess
ES_search_path = "ALL disks"
if sys.argv[1] == "ALL":
if sys.platform == "win32":
# ES_prog_path = 'C:/greensoft/es/es.exe'
ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内
ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v'
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失
out_list = out_text.splitlines()
elif sys.platform in ("linux", "darwin"):
ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate'
ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(
ES_prog_path)
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
out_text = out_bytes.decode('utf-8')
out_list = [os.path.basename(line) for line in out_text.splitlines()]
else:
print('[-]Unsupported platform! Please run on OS Windows/Linux/MacOSX. Exit.')
sys.exit(1)
else: # Windows single disk
if sys.platform != "win32":
print('[!]Usage: python3 ./number_parser.py ALL')
sys.exit(0)
# ES_prog_path = 'C:/greensoft/es/es.exe'
ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内
if os.path.isdir(sys.argv[1]):
ES_search_path = sys.argv[1]
else:
ES_search_path = sys.argv[1][0] + ':/'
if not os.path.isdir(ES_search_path):
ES_search_path = 'C:/'
ES_search_path = os.path.normcase(ES_search_path)
ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v'
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失
out_list = out_text.splitlines()
print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...')
print(f'[+]Find {len(out_list)} Movies.')
for filename in out_list:
try:
n = get_number(True, filename)
if n:
print(' [{0}] {2}# {1}'.format(n, filename, '#无码' if is_uncensored(n) else ''))
else:
print(f'[-]Number return None. # {filename}')
except Exception as e:
print(f'[-]Number Parser exception: {e} [{filename}]')
sys.exit(0)