Fixed leading white space stripping and added new options to iOS LlmInference #5635

Open · wants to merge 4 commits into base: master
@@ -159,20 +159,26 @@ extension LlmInference {
     try llmInference.shouldContinueWithResponseGeneration()
 
     /// Used to make a decision about whitespace stripping.
-    var receivedFirstToken = true
+    var receivedFirstNonEmptyToken = false
 
     llmSessionRunner.predictAsync(
       progress: { partialResponseStrings, error in
         guard let responseStrings = partialResponseStrings,
           let humanReadableLlmResponse = Session.humanReadableString(
-            llmResponses: responseStrings, stripLeadingWhitespaces: receivedFirstToken)
+            llmResponses: responseStrings, stripLeadingWhitespaces: !receivedFirstNonEmptyToken)
         else {
Collaborator (Author) commented:
@yishuangP I have renamed the state variable and changed the values it takes in different scenarios so that they map closely to the meaning of the variable name, for better readability. As a result, the condition check for leading-whitespace stripping is flipped, but the logic remains the same as the previous implementation.
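
A minimal standalone sketch (illustrative only, not part of this diff) of the flag behavior described above; `process` is a hypothetical stand-in for the real progress callback:

var receivedFirstNonEmptyToken = false

/// Strips leading whitespace only until the first non-empty token has been seen.
func process(_ token: String) -> String {
  let output =
    receivedFirstNonEmptyToken
    ? token
    : String(token.drop(while: { $0.isWhitespace }))
  // All-whitespace tokens collapse to empty and leave the flag false, so
  // stripping still applies to the first genuinely non-empty token.
  if !output.isEmpty {
    receivedFirstNonEmptyToken = true
  }
  return output
}

print(["", "  Hello", " world"].map(process))  // ["", "Hello", " world"]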

           progress(nil, GenAiInferenceError.invalidResponse)
           return
         }
 
-        /// Reset state after first response is processed.
-        receivedFirstToken = false
+        /// Some models emit a series of empty responses before producing a valid one.
+        /// Set the flag only once the first non-empty response has been processed, so
+        /// that leading whitespace is stripped from exactly that response.
+        if !humanReadableLlmResponse.isEmpty {
+          receivedFirstNonEmptyToken = true
+        }
 
         progress(humanReadableLlmResponse, nil)
       },
@@ -291,7 +297,8 @@ extension String {
       .replacingOccurrences(of: String.newLine, with: "\n")
     humanReadableString =
       stripLeadingWhitespaces
-      ? humanReadableString.trimmingCharacters(in: .whitespaces) : humanReadableString
+      ? String(humanReadableString.drop(while: { $0.isWhitespace }))
+      : humanReadableString
     return humanReadableString.components(separatedBy: String.eod).first
   }
 }
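
For context on the change above: `trimmingCharacters(in: .whitespaces)` trims both ends of the string, whereas `drop(while:)` removes only the leading whitespace run. A small comparison sketch (hypothetical values, assuming Foundation) of why this matters for streamed partial responses:

import Foundation

let partial = "  Hello "

// Trims BOTH ends: the trailing space separating this chunk from the next
// streamed token is lost ("Hello" + "world" becomes "Helloworld").
let trimmed = partial.trimmingCharacters(in: .whitespaces)  // "Hello"

// Drops only the LEADING run: the trailing space survives
// ("Hello " + "world" stays "Hello world").
let leadingStripped = String(partial.drop(while: { $0.isWhitespace }))  // "Hello "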
@@ -66,7 +66,7 @@ import MediaPipeTasksGenAIC
       supported_lora_ranks: supportedLoraRanks.baseAddress,
       max_top_k: options.maxTopk,
       llm_activation_data_type: options.activationDataType.activationDataTypeC,
-      num_draft_tokens: 0)
+      num_draft_tokens: options.draftTokenCount)
     return try LlmTaskRunner(modelSettings: modelSetting)
   }
 }
@@ -224,6 +224,10 @@ extension LlmInference {
   /// The activation data type for the model.
   @objc public var activationDataType: ActivationDataType = .default
 
+  /// The number of draft tokens to generate when using speculative decoding.
+  /// Setting this to 0 disables speculative decoding.
+  @objc public var draftTokenCount: Int = 0
+
   /// Creates a new instance of `Options` with the given `modelPath` and default values of
   /// `maxTokens`, `maxTopk`, `supportedLoraRanks` and `activationDataType`.
   /// This function is only intended to be used from Objective C.
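
A usage sketch for the new option, assuming the public API shape shown in this diff; the model path, prompt, and `generateResponse(inputText:)` call are illustrative assumptions rather than part of this change:

import MediaPipeTasksGenAI

let options = LlmInference.Options(modelPath: "/path/to/model.task")
options.maxTokens = 512
// New in this PR: generate 4 draft tokens for speculative decoding.
// The default of 0 leaves speculative decoding disabled.
options.draftTokenCount = 4

do {
  let llmInference = try LlmInference(options: options)
  let response = try llmInference.generateResponse(inputText: "Hello")
  print(response)
} catch {
  print("LLM inference failed: \(error)")
}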