#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Transcribe an audio file using the Whisper model from the whisper-timestamped library,
then clean up duplicate or redundant segments, and finally save the results as JSON.

Usage:
    1. Install dependencies:
           pip install -r requirements.txt
    2. Run the script:
           python transcribe.py \
               --audio_path="path/to/audio.wav" \
               --model_size="medium" \
               --output_path="transcript_cleaned.json"

Available command-line arguments:
    --audio_path   Path to the audio file to transcribe. (Required)
    --model_size   Which Whisper model variant to load (default: "medium").
    --device       Device to run inference on (e.g., "cpu", "cuda", or "mps").
                   If not specified, the script picks the best device
                   automatically (CUDA if available, otherwise CPU; MPS is
                   never auto-selected because whisper does not yet support it).
    --output_path  JSON file path to save the cleaned transcript
                   (default: "transcript_cleaned.json").
"""
import argparse
import json

import torch
import whisper_timestamped as whisper
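
# requirements.txt is not shown alongside this script; judging from the imports
# above, a minimal one would presumably contain at least:
#
#     torch
#     whisper-timestamped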


def parse_args():
    """
    Parse command-line arguments.

    Returns:
        argparse.Namespace: Object containing parsed command-line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe an audio file using Whisper and clean duplicates."
    )
    parser.add_argument(
        "--audio_path",
        type=str,
        required=True,
        help="Path to the audio file to transcribe.",
    )
    parser.add_argument(
        "--model_size",
        type=str,
        default="medium",
        help='Which Whisper model variant to load (e.g. "small", "medium", "large").',
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help=(
            'Device to run inference on (e.g., "cpu", "cuda", or "mps"). '
            "If not provided, the script will pick the best available device automatically."
        ),
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="transcript_cleaned.json",
        help="JSON file path to save the cleaned transcript.",
    )
    return parser.parse_args()


def pick_best_device():
    """
    Determine the best available device for PyTorch inference:
        1. CUDA, if available.
        2. MPS (Apple Silicon) is not yet supported by whisper, so it is skipped.
        3. Otherwise, CPU.

    Returns:
        str: The name of the best device ("cuda" or "cpu").
    """
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
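
# Note: although pick_best_device() never auto-selects "mps", the --device flag
# still lets you force it explicitly (e.g. --device="mps") if your torch and
# whisper builds happen to support it; treat that combination as untested here.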


def clean_transcription(transcription_result):
    """
    Remove consecutive duplicate segments from a transcription result.

    Args:
        transcription_result (dict): Dictionary containing transcription output
            with a "segments" key.

    Returns:
        dict: The same `transcription_result`, with its "segments" list
        replaced in place by the de-duplicated list.
    """
    segments = transcription_result.get("segments", [])
    cleaned_segments = []
    last_text = None
    for segment in segments:
        if segment["text"] != last_text:  # Skip consecutive duplicates
            cleaned_segments.append(segment)
        last_text = segment["text"]
    transcription_result["segments"] = cleaned_segments
    return transcription_result
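
# Worked example with hypothetical segments (only the "text" key shown): each
# segment is compared against the immediately preceding one, so repeats that
# are not adjacent are kept.
#
#     clean_transcription({"segments": [
#         {"text": "Hello."}, {"text": "Hello."}, {"text": "Bye."}, {"text": "Hello."},
#     ]})
#     # -> {"segments": [{"text": "Hello."}, {"text": "Bye."}, {"text": "Hello."}]}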


def main():
    """
    Main function to load the model, transcribe the audio, clean the result,
    and save the cleaned transcript to JSON.
    """
    # Parse command-line arguments
    args = parse_args()

    # Determine which device to use
    final_device = args.device if args.device else pick_best_device()
    print(f"Using device: {final_device}")

    # Load audio file
    print(f"Loading audio file: {args.audio_path}")
    audio = whisper.load_audio(args.audio_path)

    # Load the Whisper model
    print(f"Loading Whisper model: {args.model_size} on device '{final_device}'")
    model = whisper.load_model(args.model_size, device=final_device)

    # Configuration for transcription
    transcription_config = {
        "language": "en",  # Force English transcription
        "task": "transcribe",  # Speech recognition (not translation)
        "vad": "silero",  # Use Voice Activity Detection to remove non-speech segments
        "detect_disfluencies": True,  # Detect and mark disfluencies
        "trust_whisper_timestamps": True,  # Trust Whisper's native timestamps
        "compute_word_confidence": True,  # Compute word-level confidence scores
        "include_punctuation_in_confidence": True,  # Include punctuation in confidence
        "min_word_duration": 0.1,  # Minimum duration for word timestamps
        "plot_word_alignment": False,  # Disable plotting alignment
        "compression_ratio_threshold": 2.4,  # Avoid highly repetitive outputs
        "logprob_threshold": -1.0,  # Filter low-confidence outputs
        "no_speech_threshold": 0.6,  # Adjust silence sensitivity
        "beam_size": 5,  # Beam search size
        "best_of": 5,  # Number of candidates when sampling
        "temperature": (0.0, 0.2, 0.4, 0.6, 0.8),  # Fallback decoding temperatures
        "condition_on_previous_text": True,  # Context-aware transcription
        "refine_whisper_precision": 0.02,  # Precision (in seconds) for refining timestamps
        "remove_empty_words": True,  # Remove empty words
    }
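
    # How the fallback settings above interact (per the openai-whisper decoding
    # loop that whisper-timestamped builds on): decoding starts at temperature
    # 0.0, and if the output fails the compression_ratio_threshold or
    # logprob_threshold check, it is retried at the next temperature in the
    # tuple (0.2, 0.4, ...) until a result passes or the options run out.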

    # Transcribe the audio
    print("Transcribing audio...")
    result = whisper.transcribe(model, audio, **transcription_config)

    # Clean the transcription by removing duplicates
    print("Cleaning up transcript...")
    cleaned_result = clean_transcription(result)

    # Save the cleaned transcript to JSON
    print(f"Saving cleaned transcript to '{args.output_path}'")
    with open(args.output_path, "w", encoding="utf-8") as output_file:
        json.dump(cleaned_result, output_file, indent=2, ensure_ascii=False)
    print(f"Cleaned transcript saved as '{args.output_path}'")


if __name__ == "__main__":
    main()