forked from stepfun-ai/Step-Audio
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
154 lines (127 loc) · 4.88 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import io
import base64
import librosa
import numpy as np
import math
import torch
import torchaudio
import torchaudio
import sox
import tempfile
def encode_wav(wav, sr, rep_format="wav"):
    """Serialize a waveform to an in-memory audio file and base64-encode it.

    Args:
        wav: Waveform passed straight to ``torchaudio.save``.
        sr: Sample rate written into the encoded file.
        rep_format: Container/codec name forwarded as ``format`` to
            ``torchaudio.save`` (defaults to ``"wav"``).

    Returns:
        str: ASCII base64 text of the encoded audio bytes.
    """
    with io.BytesIO() as buffer:
        torchaudio.save(buffer, wav, sr, format=rep_format)
        # Copy the bytes out before the buffer is closed by the ``with``.
        payload = buffer.getvalue()
    return str(base64.b64encode(payload), "ascii")
def trim_silence(audio, sr, keep_left_time=0.05, keep_right_time=0.22, hop_size=240):
    """Trim leading/trailing silence, keeping a fixed margin on each side.

    The non-silent interval is located with ``librosa.effects.trim``
    (``top_db=20``). The result starts ``keep_left_time`` seconds before that
    interval and has a total length of the interval rounded up to a whole
    number of ``hop_size`` frames plus both margins. Wherever the input does
    not contain enough samples, zeros are padded in instead.

    Args:
        audio: Waveform array handed to ``librosa.effects.trim``
            (presumably 1-D mono samples -- confirm against callers).
        sr: Sample rate in Hz, used to convert the margins to samples.
        keep_left_time: Seconds of audio kept before the detected start.
        keep_right_time: Seconds of audio kept after the detected end.
        hop_size: Frame size in samples that the voiced span is rounded up to.

    Returns:
        The trimmed (and possibly zero-padded) waveform.
    """
    _, (voiced_start, voiced_end) = librosa.effects.trim(
        audio, top_db=20, frame_length=512, hop_length=128
    )
    num_frames = int(math.ceil((voiced_end - voiced_start) / hop_size))

    # Left edge: move back by the left margin, zero-padding when the margin
    # would reach before the first sample.
    start_idx = voiced_start - int(keep_left_time * sr)
    if start_idx > 0:
        trim_wav = audio[start_idx:]
    else:
        trim_wav = np.pad(
            audio, (abs(start_idx), 0), mode="constant", constant_values=0.0
        )

    # Right edge: cut or zero-pad to the frame-aligned target length.
    wav_len = len(trim_wav)
    out_len = int(num_frames * hop_size + (keep_left_time + keep_right_time) * sr)
    if out_len < wav_len:
        return trim_wav[:out_len]
    return np.pad(
        trim_wav, (0, out_len - wav_len), mode="constant", constant_values=0.0
    )
def volumn_adjust(audio16bit_torch, sr, volumn_ratio):
    """Adjust audio volume using sox.

    The audio is round-tripped through temporary ``.wav`` files because the
    sox transformer is driven here via file paths.

    Args:
        audio16bit_torch (Tensor): Input audio tensor; documented upstream as
            shape [1, samples].
        sr (int): Sample rate used when writing the temporary input file.
        volumn_ratio (float): Volume ratio; >1 louder, <1 quieter.

    Returns:
        Tensor: Audio tensor after the volume change.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as src_file:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as dst_file:
            # Write the input where sox can read it.
            torchaudio.save(src_file.name, audio16bit_torch, sr)

            # Configure and run the sox volume effect.
            transformer = sox.Transformer()
            transformer.vol(volumn_ratio)
            transformer.build_file(src_file.name, dst_file.name)

            # Load the result before the temp files are removed.
            adjusted, _sample_rate = torchaudio.load(dst_file.name)

    return adjusted
def speech_adjust(audio16bit_torch, sr, speed_ratio):
    """Change audio playback speed using the sox ``tempo`` effect.

    The audio is round-tripped through temporary ``.wav`` files because the
    sox transformer is driven here via file paths.

    Args:
        audio16bit_torch (Tensor): Input audio tensor; documented upstream as
            shape [1, samples].
        sr (int): Sample rate used when writing the temporary input file.
        speed_ratio (float): Speed ratio; >1 faster, <1 slower.

    Returns:
        Tensor: Audio tensor after the speed change.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as src_file:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as dst_file:
            # Write the input where sox can read it.
            torchaudio.save(src_file.name, audio16bit_torch, sr)

            # Configure and run the sox tempo effect.
            transformer = sox.Transformer()
            transformer.tempo(speed_ratio)
            transformer.build_file(src_file.name, dst_file.name)

            # Load the result before the temp files are removed.
            adjusted, _sample_rate = torchaudio.load(dst_file.name)

    return adjusted
def audio_resample(audio16bit_torch, result_sr, target_sample_rate):
    """Resample audio from ``result_sr`` to ``target_sample_rate``.

    Args:
        audio16bit_torch: Audio tensor fed to ``torchaudio.transforms.Resample``.
        result_sr: Current sample rate of the audio.
        target_sample_rate: Desired sample rate.

    Returns:
        tuple: ``(resampled_audio, target_sample_rate)``.
    """
    resampler = torchaudio.transforms.Resample(
        orig_freq=result_sr, new_freq=target_sample_rate
    )
    return resampler(audio16bit_torch), target_sample_rate
def norm_audio(audio16bit_torch):
    """Peak-normalize audio and convert it to a 16-bit integer tensor.

    Samples are divided by the largest absolute value, scaled by 32767 and
    cast to ``int16`` (the cast truncates toward zero).

    Args:
        audio16bit_torch (Tensor): Input audio tensor. It is detached and
            moved to CPU before conversion, so tensors that track gradients
            or live on another device are accepted.

    Returns:
        Tensor: CPU ``torch.int16`` tensor with the same shape as the input.
        All-zero or empty input yields zeros / an empty tensor instead of
        dividing by a zero peak.
    """
    samples = audio16bit_torch.detach().cpu().numpy()
    peak = np.abs(samples).max() if samples.size else 0
    if peak == 0:
        # Silent or empty audio: dividing by the peak would give NaN, whose
        # cast to int16 is undefined.
        return torch.zeros(samples.shape, dtype=torch.int16)
    scaled = (samples / peak * 32767).astype(np.int16)
    return torch.from_numpy(scaled)
def resample_audio(wav, original_sample_rate, target_sample_rate):
    """Downsample ``wav`` to ``target_sample_rate`` when the rates differ.

    Args:
        wav: Audio passed to ``torchaudio.transforms.Resample``.
        original_sample_rate: Sample rate of ``wav``.
        target_sample_rate: Desired sample rate; must not exceed the original.

    Returns:
        ``wav`` unchanged when the rates already match, otherwise the
        resampled audio.

    Raises:
        AssertionError: If ``original_sample_rate`` is lower than
            ``target_sample_rate`` (upsampling is rejected).
    """
    if original_sample_rate == target_sample_rate:
        return wav

    assert (
        original_sample_rate > target_sample_rate
    ), "wav sample rate {} must be greater than {}".format(
        original_sample_rate, target_sample_rate
    )
    downsampler = torchaudio.transforms.Resample(
        orig_freq=original_sample_rate, new_freq=target_sample_rate
    )
    return downsampler(wav)
def energy_norm_fn(wav):
    """Scale a waveform so its peak absolute value becomes 0.999.

    The peak is floored at 0.01 before dividing, so near-silent input is
    amplified by at most a factor of 99.9 rather than blown up.

    Args:
        wav: A NumPy array (including ``ndarray`` subclasses) or a torch
            tensor of samples.

    Returns:
        The scaled waveform, of the same kind as the input.
    """
    # isinstance (not an exact type check) so ndarray subclasses take the
    # NumPy path instead of falling through to the torch calls.
    if isinstance(wav, np.ndarray):
        peak = np.max(np.abs(wav))
    else:
        peak = torch.max(torch.abs(wav))
    return wav / max(peak, 0.01) * 0.999
def get_audio_tokens(audio_tokens: str, offset: int = 65536) -> list[int]:
    """Parse a ``<audio_N><audio_M>...`` string into offset token ids.

    Args:
        audio_tokens: Concatenated tags of the form ``<audio_N>``.
        offset: Value added to every parsed index. Defaults to 65536
            (presumably where audio tokens start in the model vocabulary --
            confirm against the tokenizer).

    Returns:
        The parsed indices, each increased by ``offset``, in input order.
        An empty or whitespace-only string yields an empty list.

    Raises:
        ValueError: If a tag does not contain an integer index.
    """
    if not audio_tokens.strip():
        # Nothing to parse; without this guard int("") would raise.
        return []
    return [
        int(piece.replace("<audio_", "").replace(">", "")) + offset
        for piece in audio_tokens.split("><audio_")
    ]
def load_audio(audio_path: str):
    """Load an audio file and average its channels into one.

    Args:
        audio_path: Path handed to ``torchaudio.load``.

    Returns:
        tuple: ``(waveform, sample_rate)`` where ``waveform`` is the mean
        over dimension 0 of the loaded tensor, with that dimension kept.
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    # Average over dim 0 (the channel axis of torchaudio's output).
    mono = waveform.mean(dim=0, keepdim=True)
    return mono, sample_rate