upgrade faster-whisper to 1.0.2

CheshireCC · May 29, 2024 · 7f9f0b7 · 7f9f0b7
1 parent 3a0ca18
commit 7f9f0b7
Show file tree

Hide file tree

Showing 7 changed files with 309 additions and 13 deletions.
diff --git a/fasterWhisperGUIConfig.json b/fasterWhisperGUIConfig.json
@@ -16,12 +16,12 @@
         "preciese": 4,
         "thread_num": "4",
         "num_worker": "1",
-        "download_root": "",
+        "download_root": "C:/Users/12059/.cache/huggingface/hub",
         "local_files_only": false
     },
     "vad_param": {
         "use_VAD": true,
-        "threshold": 0.49999999999999994,
+        "threshold": 0.5,
         "minSpeechDuration": "250",
         "minSilenceDuration": "2000",
         "maxSpeechDuration": "inf",
@@ -61,7 +61,15 @@
         "append_punctuations": "\"'.。,，!！?？:：”)]}、",
         "repetition_penalty": "1.0",
         "no_repeat_ngram_size": "0",
-        "prompt_reset_on_temperature": "0.5"
+        "prompt_reset_on_temperature": "0.5",
+        "chunk_length": "30",
+        "clip_mode": 1,
+        "max_new_tokens": "448",
+        "clip_timestamps": "",
+        "hallucination_silence_threshold": "0",
+        "hotwords": "",
+        "language_detection_threshold": "",
+        "language_detection_segments": "1"
     },
     "output_whisperX": {
         "tabMovable": true,

diff --git a/faster_whisper_GUI/mainWindows.py b/faster_whisper_GUI/mainWindows.py
@@ -59,7 +59,12 @@
 # from .style_sheet import StyleSheet
 from .subtitleFileRead import readSRTFileToSegments, readJSONFileToSegments
 from .config import ENCODING_DICT
-from .util import outputWithDateTime
+from .util import (
+                    outputWithDateTime,
+                    HMSToSeconds,
+                    MSToSeconds,
+                    WhisperParameters
+                )
 from .split_audio import SplitAudioFileWithSpeakersWorker
 
 import opencc
@@ -662,7 +667,8 @@ def transcribeOver(self, segments_path_info:list):
 
 
     def getParamTranscribe(self) -> dict:
-        Transcribe_params = {}
+
+        Transcribe_params = WhisperParameters()
 
         # audio = self.page_process.LineEdit_audio_fileName.text().strip()
         # audio = audio.split(";;") if audio != "" else []
@@ -771,9 +777,76 @@ def getParamTranscribe(self) -> dict:
         prompt_reset_on_temperature = float(prompt_reset_on_temperature)
         Transcribe_params['prompt_reset_on_temperature']  = prompt_reset_on_temperature 
 
+        # ---- options introduced with the faster-whisper 1.0.2 upgrade ----
+        # Blank or malformed text is stored as None (or the default) so a half-filled form cannot raise here.
+
+        max_new_tokens = self.page_transcribes.LineEdit_max_new_tokens.text().strip()
+        # 448 is the value the GUI pre-fills; it is forwarded as None rather than as an explicit cap
+        if max_new_tokens.isdigit() and int(max_new_tokens) != 448:
+            max_new_tokens = int(max_new_tokens)
+        else:
+            max_new_tokens = None
+        Transcribe_params["max_new_tokens"] = max_new_tokens
+
+        chunk_length = self.page_transcribes.LineEdit_chunk_length.text().strip()
+        # isdigit() only admits whole numbers and faster-whisper annotates chunk_length as Optional[int],
+        # so the value stays an int instead of being widened to float
+        Transcribe_params["chunk_length"] = int(chunk_length) if chunk_length.isdigit() else None
+
+        clip_mode = self.page_transcribes.ComboBox_clip_mode.currentIndex()
+        Transcribe_params["clip_mode"] = clip_mode
+
+        clip_timestamps = self.page_transcribes.LineEdit_clip_timestamps.text().strip()
+        clip_timestamps = self.getClipTimestamps(clip_mode, clip_timestamps)
+        Transcribe_params["clip_timestamps"] = clip_timestamps
+
+        hallucination_silence_threshold = self.page_transcribes.lineEdit_hallucination_silence_threshold.text().strip()
+        try:
+            hallucination_silence_threshold = float(hallucination_silence_threshold)
+        except ValueError:
+            # blank / non-numeric field: leave the option unset
+            hallucination_silence_threshold = None
+        Transcribe_params["hallucination_silence_threshold"] = hallucination_silence_threshold
+
+        hotwords = self.page_transcribes.LineEdit_hotwords.text().strip()
+        Transcribe_params["hotwords"] = hotwords
+
+        language_detection_threshold = self.page_transcribes.LineEdit_language_detection_threshold.text().strip()
+        try:
+            language_detection_threshold = float(language_detection_threshold)
+        except ValueError:
+            # the field is empty by default, so "no threshold" is the normal case
+            language_detection_threshold = None
+        # stored under the same keys that getParam saves and TranscribeWorker.transcribe_file reads
+        Transcribe_params["language_detection_threshold"] = language_detection_threshold
+
+        language_detection_segments = self.page_transcribes.lienEdit_language_detection_segments.text().strip()
+        # fall back to 1 (the value the GUI pre-fills) when the field is blank or not a whole number
+        Transcribe_params["language_detection_segments"] = (
+            int(language_detection_segments) if language_detection_segments.isdigit() else 1
+        )
+
         return Transcribe_params
 
+    def getClipTimestamps(self, clip_mode:int, clip_timestamps:str):
+        """
+        Convert the text of the clip-timestamps field into the value handed to faster-whisper.
+
+        clip_mode: 0 = manual clipping disabled, 1 = plain seconds ("0.0-10.0;25.0-36.0"),
+                   2 = time stamps ("MM:SS" or "HH:MM:SS", same separators).
+        clip_timestamps: segments separated by ";", start and end separated by "-";
+                   the end point of the last segment may be omitted.
+
+        return: the string "0" when clipping is disabled or no time point was entered,
+                otherwise a flat list of floats [start, end, start, end, ...] in seconds.
+        raise: ValueError if a plain-seconds entry is not a number.
+        """
+        if clip_timestamps == "0" or clip_mode not in (1, 2):
+            # an unexpected mode is treated like "disabled" instead of failing on an unbound local
+            return "0"
 
+        time_points = []
+        for segment in clip_timestamps.split(";"):
+            for stamp in segment.split("-"):
+                stamp = stamp.strip()
+                if not stamp:
+                    # empty field, trailing ";" or an omitted end point
+                    continue
+                colon_parts = len(stamp.split(":"))
+                if clip_mode == 1 or colon_parts == 1:
+                    # always emit floats; the list form of clip_timestamps is numeric, not text
+                    time_points.append(float(stamp))
+                elif colon_parts == 2:
+                    time_points.append(float(MSToSeconds(stamp)))
+                elif colon_parts == 3:
+                    time_points.append(float(HMSToSeconds(stamp)))
+
+        return time_points if time_points else "0"
+
     def getVADparam(self) -> dict:
         """
         get param of VAD

diff --git a/faster_whisper_GUI/paramItemWidget.py b/faster_whisper_GUI/paramItemWidget.py
@@ -62,5 +62,5 @@ def setupUI(self):
         self.titleVLayout.setAlignment(Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignLeft)
         self.widgetVLayout.setAlignment(Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignLeft)
 
-        self.mainHLayout.setStretch(0,8)
+        # self.mainHLayout.setStretch(0,8)
         self.mainHLayout.setStretch(1,1)
diff --git a/faster_whisper_GUI/tranccribePageNavigationInterface.py b/faster_whisper_GUI/tranccribePageNavigationInterface.py
@@ -16,6 +16,7 @@
                             , InfoBarPosition
                             , TitleLabel
                             , SwitchButton
+                            , ComboBox
                         )
 
 from .config import Language_dict
@@ -24,7 +25,7 @@
 # import datetime
 import json
 
-from .util import outputWithDateTime
+from .util import outputWithDateTime, WhisperParameters
 from .style_sheet import StyleSheet
 
 from .paramItemWidget import ParamWidget
@@ -95,10 +96,23 @@ def saveParams(self):
 
         else:
             return
-
+
+    def setClipTimestampsStatus(self):
+        """
+        Sync the clip-timestamps line edit with the clip mode currently selected.
+
+        Index 0 clears the placeholder and disables the field; index 1 and 2 enable it
+        and show an example of the expected input. Any other index leaves the field untouched.
+        """
+        mode = self.ComboBox_clip_mode.currentIndex()
+        # per mode: (placeholder text, whether the field accepts input)
+        states = {
+            0: ("", False),
+            1: ("0.0-10.0;25.0-36.0;......", True),
+            2: ("00:00:10.0-00:00:20.0;00:00:25.0-00:00:36.0;......", True),
+        }
+        if mode not in states:
+            return
+        placeholder, editable = states[mode]
+        self.LineEdit_clip_timestamps.setPlaceholderText(placeholder)
+        self.LineEdit_clip_timestamps.setEnabled(editable)
+
     def SignalAndSlotConnect(self):
+        """
+        Wire this page's widget signals to their handler methods.
+
+        Connects the save/load buttons to saveParams / loadParamsFromFile and the
+        clip-mode combo box to setClipTimestampsStatus.
+        """
         self.saveParamButton.clicked.connect(self.saveParams)
         self.loadParamsButton.clicked.connect(self.loadParamsFromFile)
+        # refresh the clip-timestamps field (placeholder / enabled state) whenever the clip mode changes
+        self.ComboBox_clip_mode.currentIndexChanged.connect(self.setClipTimestampsStatus)
+        # NOTE(review): the disabled line below looks like an unfinished hook for the language combo box
+        # ("self.deta" is not a complete name) — finish it or drop it
+        # self.ComboBox_language.currentIndexChanged.connect(lambda:self.deta)
 
     def setupUI(self):
         # 使用网格布局存放参数列表
@@ -133,6 +147,25 @@ def setupUI(self):
 
         widget_list.append(self.language_param_widget)
 
+        # --------------------------------------------------------------------------------------------
+        self.LineEdit_language_detection_threshold = LineEdit()
+        self.language_detection_threshold_param_widget = ParamWidget(
+                                                                        self.__tr("语言检测阈值"),
+                                                                        self.__tr("自动检测音频时，语言检测的阈值。如果某种语言的最大概率高于此值，则会检测为该语言。"),
+                                                                        self.LineEdit_language_detection_threshold
+                                                                    )    
+        widget_list.append(self.language_detection_threshold_param_widget)
+
+        # --------------------------------------------------------------------------------------------
+        self.lienEdit_language_detection_segments:LineEdit =  LineEdit()
+        self.lienEdit_language_detection_segments.setText("1")
+        self.language_detection_segments_param_widget = ParamWidget(
+                                                                        self.__tr("语言检测段落数"),
+                                                                        self.__tr("自动检测音频时，语言检测需考虑的分段数。"),
+                                                                        self.lienEdit_language_detection_segments
+                                                                    )
+        widget_list.append(self.language_detection_segments_param_widget)
+
         # --------------------------------------------------------------------------------------------
 
         self.switchButton_Translate_to_English = SwitchButton()
@@ -175,12 +208,73 @@ def setupUI(self):
                                                             )
         widget_list.append(self.aggregate_contents_param_widget)
 
+        # =======================================================================================================
+        self.titleLabel_audio_segments = TitleLabel(self.__tr("音频分段设置"))
+        widget_list.append(self.titleLabel_audio_segments)
+
+        # --------------------------------------------------------------------------------------------
+        self.LineEdit_max_new_tokens:LineEdit = LineEdit()
+        self.LineEdit_max_new_tokens.setText("448")
+        self.max_new_tokens_param_widget = ParamWidget(self.__tr("最大新令牌数"),
+                                                    self.__tr("Whisper 为每个音频块生成的新令牌的最大数量。"),
+                                                    self.LineEdit_max_new_tokens
+                                                )
+
+        widget_list.append(self.max_new_tokens_param_widget)
+
+        # --------------------------------------------------------------------------------------------
+        self.LineEdit_chunk_length:LineEdit = LineEdit()
+        self.LineEdit_chunk_length.setText("30")
+        self.chunk_length_param_widget = ParamWidget(self.__tr("音频块长度"),
+                                                    self.__tr("音频段的长度，默认为 30 秒"),
+                                                    self.LineEdit_chunk_length
+                                                )
+
+        widget_list.append(self.chunk_length_param_widget)
+
+        # --------------------------------------------------------------------------------------------
+
+        self.ComboBox_clip_mode:ComboBox = ComboBox()
+
+        self.ComboBox_clip_mode.addItems([self.__tr("不启用手动分段"),self.__tr("按秒分割"),self.__tr("按时间按戳分割")])
+        self.ComboBox_clip_mode.setCurrentIndex(0)
+        self.clip_mode_param_widget = ParamWidget(self.__tr("音频分段模式"),
+                                                    self.__tr("手动输入音频分段时要使用的分段标记方式,启用的情况下可以输入分段起止时间戳、秒为单位的分段起止点。"),
+                                                    self.ComboBox_clip_mode
+                                                )
+
+        widget_list.append(self.clip_mode_param_widget)
+
+        # --------------------------------------------------------------------------------------------
+
+        self.LineEdit_clip_timestamps:LineEdit = LineEdit()
+        self.LineEdit_clip_timestamps.setClearButtonEnabled(True)
+        self.LineEdit_clip_timestamps.setEnabled(False)
+        self.clip_timestamps_param_widget = ParamWidget(
+                                                            self.__tr("分段时间戳"),
+                                                            self.__tr("手动输入音频分段，可输入分段时间戳，或者分段的起止秒数点，\n用\"-\"分隔起止点，用\";\"分隔不同段，最后一个结束时间戳默认为音频结尾。"),
+                                                            self.LineEdit_clip_timestamps
+                                                        )
+
+        self.clip_timestamps_param_widget.mainHLayout.setStretch(2,6)
+        widget_list.append(self.clip_timestamps_param_widget)
+
         # =======================================================================================================
         self.titleLabel_auditory_hallucination = TitleLabel(self.__tr("幻听参数"))
         widget_list.append(self.titleLabel_auditory_hallucination)
 
         # --------------------------------------------------------------------------------------------
 
+        self.lineEdit_hallucination_silence_threshold:LineEdit = LineEdit()
+        self.lineEdit_hallucination_silence_threshold.setText("0")
+        self.hallucination_silence_threshold_param_widget = ParamWidget(self.__tr("幻听静音阈值"),
+                                                    self.__tr("如果开启 单词级时间戳 ，当检测到可能的幻觉时，跳过长于此阈值（以秒为单位）的静默期。"),
+                                                    self.lineEdit_hallucination_silence_threshold                           
+        )
+        widget_list.append(self.hallucination_silence_threshold_param_widget)
+
+        # --------------------------------------------------------------------------------------------
+
         self.LineEdit_patience = LineEdit()
         self.LineEdit_patience.setText("1.0")
         self.patience_param_widget = ParamWidget(self.__tr("搜索耐心"),
@@ -355,6 +449,16 @@ def setupUI(self):
 
         widget_list.append(self.prefix_param_widget)
 
+        # --------------------------------------------------------------------------------------------
+        self.LineEdit_hotwords:LineEdit = LineEdit()
+        self.LineEdit_hotwords.setText("")
+        self.hotwords_param_widget = ParamWidget(
+                                                    self.__tr("热词/提示短语"), 
+                                                    self.__tr("为模型提供的热词/提示短语。如果给定了 初始文本前缀 则热词无效。"),
+                                                    self.LineEdit_hotwords
+                                                )
+        widget_list.append(self.hotwords_param_widget)
+
         # --------------------------------------------------------------------------------------------
         self.LineEdit_suppress_tokens = LineEdit()
         self.LineEdit_suppress_tokens.setText("-1")
@@ -552,9 +656,20 @@ def setParam(self, Transcribe_params:dict) -> None:
         self.LineEdit_prompt_reset_on_temperature.setText(str(Transcribe_params['prompt_reset_on_temperature']  ))
         # Transcribe_params['prompt_reset_on_temperature']  = prompt_reset_on_temperature 
 
+        # Options added with the faster-whisper 1.0.2 upgrade. Parameter files written by older
+        # versions do not contain these keys, so every field is restored on its own: a missing key
+        # keeps that widget's current value and does not stop the remaining fields from loading.
+        new_line_edits = {
+            "chunk_length": self.LineEdit_chunk_length,
+            "max_new_tokens": self.LineEdit_max_new_tokens,
+            "clip_timestamps": self.LineEdit_clip_timestamps,
+            "hallucination_silence_threshold": self.lineEdit_hallucination_silence_threshold,
+            "hotwords": self.LineEdit_hotwords,
+            "language_detection_threshold": self.LineEdit_language_detection_threshold,
+            # the line edit itself, not language_detection_segments_param_widget (its ParamWidget wrapper)
+            "language_detection_segments": self.lienEdit_language_detection_segments,
+        }
+        for key, line_edit in new_line_edits.items():
+            try:
+                value = Transcribe_params[key]
+            except KeyError:
+                continue
+            if value is not None:
+                # str() keeps setText happy if a file stores a number instead of text
+                line_edit.setText(str(value))
+
+        try:
+            self.ComboBox_clip_mode.setCurrentIndex(int(Transcribe_params["clip_mode"]))
+        except (KeyError, TypeError, ValueError):
+            # absent or unusable clip mode: keep the mode that is currently selected
+            pass
 
     def getParam(self) -> dict:
-        Transcribe_params = {}
+        Transcribe_params = WhisperParameters()
 
         # 从数据模型获取文件列表
 
@@ -651,5 +766,15 @@ def getParam(self) -> dict:
         # prompt_reset_on_temperature = float(prompt_reset_on_temperature)
         Transcribe_params['prompt_reset_on_temperature']  = prompt_reset_on_temperature 
 
+        Transcribe_params["chunk_length"] = self.LineEdit_chunk_length.text().strip()
+        Transcribe_params["clip_mode"] = self.ComboBox_clip_mode.currentIndex()
+        Transcribe_params["max_new_tokens"] = self.LineEdit_max_new_tokens.text().strip()
+        Transcribe_params["clip_timestamps"] = self.LineEdit_clip_timestamps.text().strip()
+        Transcribe_params["hallucination_silence_threshold"] = self.lineEdit_hallucination_silence_threshold.text().strip()
+        Transcribe_params["hotwords"] = self.LineEdit_hotwords.text().strip()
+        Transcribe_params["language_detection_threshold"] = self.LineEdit_language_detection_threshold.text().strip()
+        Transcribe_params["language_detection_segments"] = self.lienEdit_language_detection_segments.text().strip()
+
+
         return Transcribe_params
 
diff --git a/faster_whisper_GUI/transcribe.py b/faster_whisper_GUI/transcribe.py
@@ -29,7 +29,12 @@
                 )
 
 from .seg_ment import segment_Transcribe
-from .util import secondsToHMS, secondsToMS
+from .util import (
+                    secondsToHMS, 
+                    secondsToMS, 
+                    WhisperParameters
+                )
+
 from .config import ENCODING_DICT, Task_list
 
 
@@ -195,7 +200,7 @@ class TranscribeWorker(QThread):
     def __init__(self
                 ,parent=None
                 ,model : WhisperModel = None
-                ,parameters : dict = None
+                ,parameters : WhisperParameters = None
                 ,vad_filter : bool = False
                 ,vad_parameters : dict = None
                 ,num_workers : int = 1
@@ -250,6 +255,13 @@ def transcribe_file(self, file) -> (TranscriptionInfo, List): # type: ignore
                                                 word_timestamps=self.parameters["word_timestamps"],
                                                 prepend_punctuations=self.parameters["prepend_punctuations"],
                                                 append_punctuations=self.parameters["append_punctuations"],
+                                                max_new_tokens=self.parameters["max_new_tokens"],
+                                                chunk_length=self.parameters["chunk_length"],
+                                                clip_timestamps=self.parameters["clip_timestamps"],
+                                                hallucination_silence_threshold=self.parameters["hallucination_silence_threshold"],
+                                                hotwords = self.parameters["hotwords"],
+                                                language_detection_threshold = self.parameters["language_detection_threshold"],
+                                                language_detection_segments = self.parameters["language_detection_segments"],
                                                 vad_filter=self.vad_filter,
                                                 vad_parameters=self.vad_parameters
                                             )