Skip to content

Commit

Permalink
Revert "Fix support of English-only Whisper models (#1080)" (#1082)
Browse files Browse the repository at this point in the history
This reverts commit 71bc055.
  • Loading branch information
guillaumekln authored Feb 13, 2023
1 parent dfd4230 commit 7ee6a34
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 108 deletions.
4 changes: 0 additions & 4 deletions include/ctranslate2/models/whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ namespace ctranslate2 {

WhisperReplica(const std::shared_ptr<const WhisperModel>& model);

bool is_multilingual() const;

std::vector<WhisperGenerationResult>
generate(const StorageView& features,
const std::vector<std::vector<std::string>>& prompts,
Expand All @@ -110,8 +108,6 @@ namespace ctranslate2 {
public:
using ReplicaPool::ReplicaPool;

bool is_multilingual() const;

std::vector<std::future<WhisperGenerationResult>>
generate(StorageView features,
std::vector<std::vector<std::string>> prompts,
Expand Down
5 changes: 0 additions & 5 deletions include/ctranslate2/replica_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,6 @@ namespace ctranslate2 {
}

protected:
// Returns a const reference to the replica held by worker 0 of the thread
// pool. Used to query model properties without dispatching a job.
// NOTE(review): assumes the pool has at least one worker — TODO confirm
// the pool invariant guarantees this.
const Replica& get_first_replica() const {
// Workers are stored as the base worker type; downcast to access the replica.
auto& worker = static_cast<ReplicaWorker<Replica>&>(_thread_pool->get_worker(0));
return worker.replica();
}

template <typename Result, typename Func>
std::vector<std::future<Result>>
post_examples(const std::vector<Example>& examples,
Expand Down
10 changes: 0 additions & 10 deletions python/cpp/whisper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ namespace ctranslate2 {
public:
using ReplicaPoolHelper::ReplicaPoolHelper;

// Python-facing accessor: forwards to the underlying replica pool's
// is_multilingual() (exposed to Python as the read-only property below).
bool is_multilingual() const {
return _pool->is_multilingual();
}

std::variant<std::vector<models::WhisperGenerationResult>,
std::vector<AsyncResult<models::WhisperGenerationResult>>>
generate(StorageViewWrapper features,
Expand Down Expand Up @@ -101,9 +97,6 @@ namespace ctranslate2 {
https://github.com/openai/whisper
)pbdoc")

.def_property_readonly("is_multilingual", &WhisperWrapper::is_multilingual,
"Returns ``True`` if this model is multilingual.")

.def(py::init<const std::string&, const std::string&, const std::variant<int, std::vector<int>>&, const StringOrMap&, size_t, size_t, long, py::object>(),
py::arg("model_path"),
py::arg("device")="cpu",
Expand Down Expand Up @@ -193,9 +186,6 @@ namespace ctranslate2 {
Returns:
For each batch, a list of pairs (language, probability) ordered from
best to worst probability.
Raises:
RuntimeError: if the model is not multilingual.
)pbdoc")

;
Expand Down
41 changes: 7 additions & 34 deletions python/tests/test_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,10 +352,9 @@ def test_transformers_generator_suppress_sequences(tmpdir):
@test_utils.only_on_linux
@test_utils.on_available_devices
@pytest.mark.parametrize(
"model_name,prompts,expected_transcriptions,expected_no_speech_probs",
"prompts,expected_transcriptions,expected_no_speech_probs",
[
(
"openai/whisper-tiny",
[
[
"<|startoftranscript|>",
Expand All @@ -382,7 +381,6 @@ def test_transformers_generator_suppress_sequences(tmpdir):
],
),
(
"openai/whisper-tiny",
[
["<|startoftranscript|>", "<|en|>", "<|transcribe|>"],
["<|startoftranscript|>", "<|en|>", "<|transcribe|>"],
Expand All @@ -399,7 +397,6 @@ def test_transformers_generator_suppress_sequences(tmpdir):
],
),
(
"openai/whisper-tiny",
[
[
"<|startoftranscript|>",
Expand All @@ -425,32 +422,14 @@ def test_transformers_generator_suppress_sequences(tmpdir):
pytest.approx(0.06885894387960434, abs=1e-2),
],
),
(
"openai/whisper-tiny.en",
[["<|startoftranscript|>"], ["<|startoftranscript|>"]],
[
" Mr. Quilter is the apostle of the middle classes, and we are glad"
" to welcome his gospel.",
" And so, my fellow Americans ask not what your country can do for you"
" ask what you can do for your country.",
],
[
pytest.approx(0.02644546702504158, abs=1e-4),
pytest.approx(0.062380101531744, abs=1e-3),
],
),
],
)
def test_transformers_whisper(
tmpdir,
device,
model_name,
prompts,
expected_transcriptions,
expected_no_speech_probs,
tmpdir, device, prompts, expected_transcriptions, expected_no_speech_probs
):
import transformers

model_name = "openai/whisper-tiny"
converter = ctranslate2.converters.TransformersConverter(model_name)
output_dir = str(tmpdir.join("ctranslate2_model"))
output_dir = converter.convert(output_dir)
Expand All @@ -475,16 +454,10 @@ def _get_features(audio):

model = ctranslate2.models.Whisper(output_dir, device=device)

assert model.is_multilingual == (not model_name.endswith(".en"))

if model.is_multilingual:
for result in model.detect_language(features):
best_lang, best_prob = result[0]
assert best_lang == "<|en|>"
assert best_prob > 0.9
else:
with pytest.raises(RuntimeError, match="multilingual"):
model.detect_language(features)
for result in model.detect_language(features):
best_lang, best_prob = result[0]
assert best_lang == "<|en|>"
assert best_prob > 0.9

results = model.generate(
features,
Expand Down
3 changes: 1 addition & 2 deletions src/layers/transformer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -365,8 +365,7 @@ namespace ctranslate2 {
const auto it = state.find("memory_lengths");
const StorageView* memory_lengths = it != state.end() ? &it->second : nullptr;

const auto cached_memory_proj_it = state.find("memory_keys_0");
if (cached_memory_proj_it == state.end() || cached_memory_proj_it->second.empty()) {
if (step <= 0) {
memory = &state.at("memory");

if (memory_lengths && allow_padding_removal) {
Expand Down
73 changes: 20 additions & 53 deletions src/models/whisper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ namespace ctranslate2 {
{
}

// Heuristic multilingual check: decides by vocabulary size alone.
// 51865 is presumably the multilingual Whisper vocabulary size (the
// English-only ".en" models use a smaller vocabulary) — TODO confirm
// against the Whisper tokenizer definitions.
bool WhisperReplica::is_multilingual() const {
const auto& vocabulary = _model->get_vocabulary();
return vocabulary.size() == 51865;
}

StorageView WhisperReplica::encode(const StorageView& features) {
const Device device = _model->device();
const DataType dtype = _encoder->output_type();
Expand Down Expand Up @@ -183,8 +178,9 @@ namespace ctranslate2 {
"simply adapt the number of previous text tokens in each "
"batch.");

if (prompts[0].empty())
throw std::invalid_argument("The prompt cannot be empty");
if (prompts[0].size() < 3)
throw std::invalid_argument("The prompt should have at least 3 tokens: "
"START OF TRANSCRIPT, LANGUAGE TAG, and TRANSCRIBE/TRANSLATE");

const auto& vocabulary = _model->get_vocabulary();
const auto scoped_device_setter = _model->get_scoped_device_setter();
Expand All @@ -199,51 +195,30 @@ namespace ctranslate2 {
prefix_tokens.reserve(prompts.size());
start_tokens.reserve(prompts.size());
for (const auto& prompt : prompts) {
if (prompt.size() > 1)
prefix_tokens.emplace_back(prompt.begin(), prompt.end() - 1);
else if (options.return_no_speech_prob)
prefix_tokens.emplace_back(prompt);
prefix_tokens.emplace_back(prompt.begin(), prompt.end() - 1);
start_tokens.emplace_back(prompt.end() - 1, prompt.end());
}

std::vector<float> no_speech_probs;
dim_t start_step = 0;

if (!prefix_tokens.empty()) {
const Device device = _decoder->device();
const DataType dtype = _decoder->output_type();
StorageView inputs = layers::make_sequence_inputs(prefix_tokens, device);
StorageView outputs(dtype, device);

// Forward the prefix.
_decoder->forward_prompt(inputs, state, options.return_no_speech_prob ? &outputs : nullptr);

if (options.return_no_speech_prob) {
// Get the probability of the no speech token at the start of transcript step.
StorageView sot_index = get_sot_index(prefix_tokens, vocabulary.bos_id(), device);
size_t no_speech_id = vocabulary.to_id("<|nospeech|>");
if (no_speech_id == vocabulary.unk_id())
no_speech_id = vocabulary.to_id("<|nocaptions|>");
no_speech_probs = get_no_speech_probs(*_decoder, outputs, sot_index, no_speech_id);
}
const Device device = _decoder->device();
const DataType dtype = _decoder->output_type();
StorageView inputs = layers::make_sequence_inputs(prefix_tokens, device);
StorageView outputs(dtype, device);

if (prompts[0].size() > 1)
start_step = inputs.dim(1);
else {
// If the prompt only contains the start token, it means we only got here to retrieve
// the no speech probability. The decoding will start from this token again so we need
// to reset the decoder state.
for (auto& pair : state) {
const auto& name = pair.first;
auto& tensor = pair.second;
if (!starts_with(name, "memory"))
tensor.clear();
}
}
// Initialize the decoder state with the prompt.
_decoder->forward_prompt(inputs, state, options.return_no_speech_prob ? &outputs : nullptr);

std::vector<float> no_speech_probs;
if (options.return_no_speech_prob) {
// Get the probability of the no speech token at the start of transcript step.
StorageView sot_index = get_sot_index(prefix_tokens, vocabulary.bos_id(), device);
size_t no_speech_id = vocabulary.to_id("<|nospeech|>");
if (no_speech_id == vocabulary.unk_id())
no_speech_id = vocabulary.to_id("<|nocaptions|>");
no_speech_probs = get_no_speech_probs(*_decoder, outputs, sot_index, no_speech_id);
}

DecodingOptions decoding_options;
decoding_options.start_step = start_step;
decoding_options.start_step = inputs.dim(1);
decoding_options.beam_size = options.beam_size;
decoding_options.patience = options.patience;
decoding_options.length_penalty = options.length_penalty;
Expand Down Expand Up @@ -291,9 +266,6 @@ namespace ctranslate2 {

std::vector<std::vector<std::pair<std::string, float>>>
WhisperReplica::detect_language(const StorageView& features) {
if (!is_multilingual())
throw std::runtime_error("detect_language can only be called on multilingual models");

PROFILE("WhisperReplica::detect_language");

const auto scoped_device_setter = _model->get_scoped_device_setter();
Expand Down Expand Up @@ -357,11 +329,6 @@ namespace ctranslate2 {
}


// Pool-level accessor: queries the first replica for the multilingual flag.
// All replicas wrap the same model, so any one of them answers correctly.
bool Whisper::is_multilingual() const {
const auto& replica = get_first_replica();
return replica.is_multilingual();
}

std::vector<std::future<WhisperGenerationResult>>
Whisper::generate(StorageView features,
std::vector<std::vector<std::string>> prompts,
Expand Down

0 comments on commit 7ee6a34

Please sign in to comment.