-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy path6_trsf_vds2frs_ads.py
174 lines (148 loc) · 6.6 KB
/
6_trsf_vds2frs_ads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Convert videos into saved frame images; extract each video's audio track and save it as a wav file and a mel-spectrogram file
import shutil
import sys
import time
import uuid
from typing import List
from torch.cuda import OutOfMemoryError
# Compare the version as a tuple: testing major and minor separately with `and`
# let interpreters such as Python 2.7 (minor >= 2) slip past the guard.
if sys.version_info < (3, 2):
    raise Exception("Must be using >= Python 3.2")
from os import path
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import argparse, os, cv2, traceback
from tqdm import tqdm
import face_alignment
import torch
import audio
sys.path.append(os.path.abspath("./DTLN"))
from DTLN.DTLN_model import DTLN_model
from DTLN.run_evaluation import process_file
# template2 = 'ffmpeg -hide_banner -loglevel panic -threads 1 -y -i {} -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 {}'
def process_video_file(vfile: str, args, gpu_id, modelClass):
    """Crop the first detected face out of each frame of one video and export its audio.

    The output directory is ``vfile`` with the module-level ``videos_dir`` prefix
    replaced by ``output_root`` and the file extension removed. Face crops are
    written there as ``0.jpg``, ``1.jpg``, ... and the audio is handled by
    ``process_audio_file``.

    Args:
        vfile: Path of the video file; expected to start with ``videos_dir``.
        args: Parsed command-line namespace; ``batch_size`` and
            ``resolution_ratio`` are read from it.
        gpu_id: Index into the module-level ``fa`` list selecting the detector.
        modelClass: Denoising model wrapper, forwarded to ``process_audio_file``.

    Returns:
        None. Returns early, writing nothing, when ``audio_mel.npy`` already
        exists in the output directory or when a detected face is smaller than
        ``args.resolution_ratio`` in either dimension.
    """
    # Replace only the leading videos_dir prefix (count=1) and strip the real
    # extension instead of assuming it is exactly four characters long.
    output_dir = path.splitext(vfile.replace(videos_dir, output_root, 1))[0]

    # Skip videos that were already processed
    if os.path.exists(path.join(output_dir, 'audio_mel.npy')):
        print(f"'{output_dir}'之前已经完成,本次跳过。")
        return

    # Read every frame into memory; release the capture even if reading fails.
    video_stream = cv2.VideoCapture(vfile)
    frames = []
    try:
        while True:
            still_reading, frame = video_stream.read()
            if not still_reading:
                break
            frames.append(frame)
    finally:
        video_stream.release()

    detector = fa[gpu_id].face_detector
    face_rects = []
    for start in range(0, len(frames), args.batch_size):
        fb = frames[start:start + args.batch_size]
        # Reorder axes to (0, 3, 1, 2) — presumably NHWC -> NCHW for the detector.
        batch = torch.Tensor(np.asarray(fb).transpose(0, 3, 1, 2))
        preds = detector.detect_from_batch(batch)
        for frame, detections in zip(fb, preds):
            # "No face" may be reported as None or as an empty result;
            # indexing [0] on an empty result would raise IndexError.
            if detections is None or len(detections) == 0:
                continue
            # Only the first detection is used; its last element is dropped
            # (presumably the confidence score — TODO confirm).
            x1, y1, x2, y2 = map(int, detections[0][:-1])

            # Face resolution too low: discard the whole video
            if abs(x2 - x1) < args.resolution_ratio or abs(y2 - y1) < args.resolution_ratio:
                print(f"视频'{vfile}'因分辨率不达标被舍弃")
                return

            # Clamp the start indices: a negative index would wrap around in
            # NumPy slicing and yield a wrong or empty crop.
            face_rects.append(frame[max(0, y1):y2, max(0, x1):x2])

    os.makedirs(output_dir, exist_ok=True)

    # Save the crops as images, numbered in detection order
    for i, face_rect in enumerate(face_rects):
        cv2.imwrite(path.join(output_dir, '{}.jpg'.format(i)), face_rect)

    process_audio_file(modelClass, vfile, path.join(output_dir, 'audio.wav'))
def process_audio_file(modelClass, vfile, wav_path):
    """Extract a video's audio to wav, denoise it in place, and save its mel spectrogram.

    Args:
        modelClass: Object whose ``model`` attribute is passed to ``process_file``.
        vfile: Path of the source video.
        wav_path: Destination wav path; ``audio_mel.npy`` is written next to it.

    Raises:
        RuntimeError: If the ffmpeg command exits with a non-zero status.
    """
    import shlex  # local import: only needed to build the shell command

    # Quote both paths so spaces or shell metacharacters in file names cannot
    # break (or inject into) the command built from the module-level `template`.
    command = template.format(shlex.quote(vfile), shlex.quote(wav_path))
    status = os.system(command)
    if status != 0:
        # ffmpeg runs with its logging silenced, so fail loudly here instead of
        # later when the missing wav file is loaded.
        raise RuntimeError(f"ffmpeg failed with status {status} for '{vfile}'")

    # Denoise into a uniquely named temporary file, then move it over wav_path.
    noise_depressed_audio_path = f"/dev/shm/{uuid.uuid4()}.wav"
    try:
        # FIXME: this call triggers tf.function retracing, which is expensive.
        process_file(modelClass.model, wav_path, noise_depressed_audio_path)
    except Exception:
        # Denoising is best-effort: print the error and keep the original wav.
        traceback.print_exc()
    else:
        shutil.move(noise_depressed_audio_path, os.path.abspath(wav_path))
    finally:
        # Remove the temporary file if it is still around.
        if os.path.exists(noise_depressed_audio_path):
            os.remove(noise_depressed_audio_path)

    # Save the mel spectrogram
    wav = audio.load_wav(wav_path, 16000)
    mel = audio.melspectrogram(wav).T  # (T, 80) according to the original author's note
    mel_path = os.path.join(os.path.dirname(wav_path), "audio_mel.npy")
    np.save(mel_path, mel)
def mp_handler(job):
    """Worker entry point: process one video, retrying on CUDA out-of-memory.

    Args:
        job: Tuple ``(vfile, args, gpu_id, modelClass)`` forwarded unchanged to
            ``process_video_file``.

    An ``OutOfMemoryError`` is retried after a 3-second pause, for at most
    ``retry_count`` attempts in total; its traceback is printed only when the
    final attempt fails. Any other ``Exception`` is printed and swallowed
    without a retry.
    """
    vfile, args, gpu_id, modelClass = job
    retry_count = 5
    for attempt in range(retry_count):  # TODO: generalize this retry logic
        try:
            process_video_file(vfile, args, gpu_id, modelClass)
        except KeyboardInterrupt:
            # sys.exit instead of the `exit` helper, which the site module only
            # installs for interactive use.
            sys.exit(0)
        except OutOfMemoryError:
            if attempt >= retry_count - 1:  # out of retries
                traceback.print_exc()
            else:  # pause, then try again
                time.sleep(3)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit and similar
            # BaseExceptions are no longer swallowed here.
            traceback.print_exc()
            break
        else:
            break
if __name__ == '__main__':
    # Script entry point: parse options, collect the .mp4 files, build one face
    # detector per GPU plus the denoising model, then process all videos on a
    # thread pool with one worker per GPU.
    parser = argparse.ArgumentParser(description="将视频转为帧图片保存;提取视频的音轨保存为wav文件")
    parser.add_argument('--ngpu',
                        help='Number of GPUs across which to run in parallel',
                        default=torch.cuda.device_count(),
                        type=int)
    parser.add_argument('--batch_size', help='Single GPU Face detection batch size', default=8, type=int)
    parser.add_argument('--resolution_ratio',
                        help='Resolution ratio requirements on both x and y direction of a face',
                        default=288, type=int)
    parser.add_argument("--input_videos_dir",
                        help="Directory whose file tree contains mp4 files",
                        default="dataset/origin_noise_depressed_pieces",
                        type=str)
    # parser.add_argument("--output_dir", help="Directory which contains the preprocessed dataset", required=True)
    args = parser.parse_args()

    # Results go to a "frames_audios" directory next to the input directory.
    # videos_dir and output_root are module globals read by process_video_file.
    videos_dir = os.path.abspath(args.input_videos_dir)
    output_root = os.path.join(os.path.dirname(videos_dir), "frames_audios")
    if not os.path.isdir(videos_dir):
        raise ValueError("please input the path of a directory")

    # Collect the video file paths
    video_paths = []
    for root, _dirs, files in os.walk(videos_dir):
        for file in files:
            _name, extension = os.path.splitext(file)
            if extension != '.mp4':
                continue
            video_paths.append(os.path.join(root, file))
    if not video_paths:
        raise FileNotFoundError("Empty directory")

    # Fail early with a clear message: with zero GPUs the job split below would
    # otherwise die with ZeroDivisionError after the models have been loaded.
    if args.ngpu < 1:
        raise ValueError("--ngpu must be at least 1 (no CUDA device available?)")

    # One detector per GPU. TODO: blazeface crops differ in size from sfd.
    fa: List[face_alignment.FaceAlignment] = [
        face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_HALF_D,
                                     device='cuda:{}'.format(gpuid),
                                     face_detector='sfd')
        for gpuid in range(args.ngpu)
    ]

    # Denoising model
    model = "DTLN/pretrained_model/model.h5"
    # determine type of model from its file name
    norm_stft = '_norm_' in model
    # create class instance
    modelClass = DTLN_model()
    # build the model in default configuration
    modelClass.build_DTLN_model(norm_stft=norm_stft)
    # load weights of the .h5 file
    modelClass.model.load_weights(model)

    # ffmpeg command line used by process_audio_file (input path, output path)
    template = 'ffmpeg -loglevel panic -y -i {} -strict -2 {}'

    print('Started processing for {} with {} GPUs'.format(videos_dir, args.ngpu))

    # Assign videos to GPUs round-robin
    jobs = [(video_path, args, i % args.ngpu, modelClass) for i, video_path in enumerate(video_paths)]

    # The context manager shuts the pool down even if collecting a result raises.
    with ThreadPoolExecutor(args.ngpu) as imgs_pool_executor:
        futures = [imgs_pool_executor.submit(mp_handler, j) for j in jobs]
        for future in tqdm(as_completed(futures), total=len(futures)):
            future.result()

    print(f"result:{output_root}")