[Text Generation][V2] NonKVCachePipeline #1483

Merged · 72 commits · Jan 2, 2024

Commits
3e00175
Pipelines Refactor - Initial Impl (#1287)
bfineran Oct 26, 2023
224e116
[Pipeline Refactor] Additional functionality, engine operator, linear…
dsikka Oct 31, 2023
58b0758
[v2] EngineOperator updates to make continuous batching easier (#1371)
bfineran Nov 1, 2023
e1ff108
[Pipeline Refactor] Update routes, text generation initial functional…
dsikka Nov 3, 2023
59457b7
[Pipeline Refactor] Additional Operators, Route update and completed …
dsikka Nov 3, 2023
f18d5f3
add split/join functionality
dsikka Nov 3, 2023
2c4d231
update router to include split/join in parent class, refactor pipelin…
dsikka Nov 7, 2023
672ca20
process multiple generations
dsikka Nov 7, 2023
304eb35
initial commit
dbogunowicz Nov 8, 2023
71515ac
fix error
dbogunowicz Nov 8, 2023
6f1b175
Merge remote-tracking branch 'origin/features/v2/run_multiple' into f…
dbogunowicz Nov 9, 2023
041174b
[Pipeline Refactor] Split/Join Functionality for multiple prompts (#1…
dsikka Nov 9, 2023
a508342
unit testing for text generation operators
dsikka Nov 6, 2023
cbb0e86
additional changes
dsikka Nov 7, 2023
2541581
unit testing completion
dsikka Nov 8, 2023
8c8989d
remove debug
dsikka Nov 8, 2023
f8d75e3
fix
dsikka Nov 8, 2023
fd1e466
add todo
dsikka Nov 8, 2023
64c0552
more clean-up
dsikka Nov 8, 2023
913665a
fix test
dsikka Nov 8, 2023
e15521f
add docstrings/comments
dsikka Nov 8, 2023
379481e
break out tests to individual unit test files; add conftest and make …
dsikka Nov 9, 2023
a90a20a
Merge remote-tracking branch 'origin/features/v2/unit_testing' into f…
dbogunowicz Nov 10, 2023
0a50d1d
[Pipeline Refactor] Unit Testing for Text Generation Operators (#1392)
dsikka Nov 10, 2023
c0c4240
Merge branch 'v2' into feature/damian/v2/factor_out_transformation_utils
dbogunowicz Nov 10, 2023
4f248dd
Delete tests/deepsparse/v2/unit/text_generation/test_msic.py
dbogunowicz Nov 13, 2023
20980a7
[Continuous Batching] Queue Implementation to support batching groupi…
bfineran Nov 13, 2023
d81012d
[Continuous Batching] Executor thread for running continuous batching…
bfineran Nov 13, 2023
5c48505
[ContinuousBatching] ContinuousBatchingScheduler Implementation (#1375)
bfineran Nov 13, 2023
e1b7f37
[continuous batching] singleton pattern for scheduler (#1391)
bfineran Nov 13, 2023
98f7a6d
Merge branch 'v2' into feature/damian/v2/factor_out_transformation_utils
dbogunowicz Nov 14, 2023
bbd534d
[Pipeline Refactor][Text-Generation] Create a helper function for cre…
dbogunowicz Nov 14, 2023
d1683b4
Merge branch 'v2' into feature/damian/v2/factor_out_transformation_utils
dbogunowicz Nov 14, 2023
51c4ee6
pipeline runs, but incorrectly
dbogunowicz Nov 17, 2023
fa96efb
it works for a single sequence
dbogunowicz Nov 20, 2023
e41ddf8
cleanup. now lets figure out how to run multiple sequences
dbogunowicz Nov 20, 2023
b80a417
[Pipeline Refactor][Text-Generation] Refactor `transformers` helpers …
dbogunowicz Nov 20, 2023
1b9238a
[Text Generation][V2] End-to-end tests (#1402)
dbogunowicz Nov 20, 2023
89f11e5
Merge remote-tracking branch 'origin/v2' into feature/damian/no_kv_cache
dbogunowicz Nov 21, 2023
9b441f5
integration tests pass
dbogunowicz Nov 21, 2023
c858b1f
[Pipeline Refactor][Text Generation][Continuous Batching] Integration…
dsikka Nov 21, 2023
bb3ff41
[Pipeline Refactor] Operator Registry (#1420)
dsikka Nov 21, 2023
19434e7
Merge remote-tracking branch 'origin/v2' into feature/damian/no_kv_cache
dbogunowicz Nov 22, 2023
90de2b3
fix tricky rebase
dbogunowicz Nov 22, 2023
66ca295
one more cleanup
dbogunowicz Nov 22, 2023
dcded1d
got tests to work after rebase. implementing SPLIT and JOIN in linear…
dbogunowicz Nov 22, 2023
127aa00
pipeline working, with GraphRouter. Needs some more testing
dbogunowicz Nov 22, 2023
af57698
ready for review
dbogunowicz Nov 27, 2023
4397c80
cleanup
dbogunowicz Nov 28, 2023
105b1d5
simplify after PR review round
dbogunowicz Dec 5, 2023
e15a24b
[Pipeline Refactor] Fix Operator scheduling to fix issue with slow ex…
dsikka Dec 5, 2023
36f742b
[Pipeline Refactor] Add `Pipeline.create` method to initialize pipeli…
dsikka Dec 5, 2023
c0267d9
[Pipeline Refactor] async (#1380)
dsikka Dec 5, 2023
cfa61b7
Merge branch 'main' into v2
dsikka Dec 5, 2023
2d9b0a1
rebase fixes (#1458)
dsikka Dec 5, 2023
a2aaa51
more fixes (#1459)
dsikka Dec 5, 2023
39be9a0
Merge remote-tracking branch 'origin/v2' into feature/damian/no_kv_cache
dbogunowicz Dec 6, 2023
dcab3f9
bring back functionalities that were lost in v2 during rebasing
dbogunowicz Dec 6, 2023
e0a9dee
Merge remote-tracking branch 'origin/main' into feature/damian/no_kv_…
dbogunowicz Dec 11, 2023
bc1b11e
Merge remote-tracking branch 'origin/main' into feature/damian/no_kv_…
dbogunowicz Dec 18, 2023
e5d2f39
Update src/deepsparse/transformers/helpers.py
dbogunowicz Dec 18, 2023
9ed5b06
ready for review
dbogunowicz Dec 18, 2023
1ac1f5c
bring tests back"
dbogunowicz Dec 18, 2023
a734459
quality
dbogunowicz Dec 18, 2023
60fa00f
original readme
dbogunowicz Dec 18, 2023
14b0dc0
Merge branch 'main' into feature/damian/no_kv_cache_retrieve
dbogunowicz Dec 20, 2023
9371990
addressing Dipika's comments
dbogunowicz Dec 20, 2023
4eed463
Update src/deepsparse/transformers/pipelines/text_generation/pipeline…
dbogunowicz Dec 20, 2023
0b17bd8
Merge branch 'main' into feature/damian/no_kv_cache_retrieve
dbogunowicz Dec 21, 2023
111d533
addressing PR review
dbogunowicz Dec 21, 2023
4370c52
Merge branch 'main' into feature/damian/no_kv_cache_retrieve
dbogunowicz Dec 21, 2023
8d352fc
Merge branch 'main' into feature/damian/no_kv_cache_retrieve
dbogunowicz Dec 29, 2023
40 changes: 27 additions & 13 deletions src/deepsparse/transformers/helpers.py
@@ -30,11 +30,7 @@
from onnx import ModelProto

from deepsparse.log import get_main_logger
from deepsparse.utils.onnx import (
_MODEL_DIR_ONNX_NAME,
model_to_path,
truncate_onnx_model,
)
from deepsparse.utils.onnx import MODEL_ONNX_NAME, model_to_path, truncate_onnx_model
from sparsezoo.utils import save_onnx


@@ -55,6 +51,7 @@ def setup_transformers_pipeline(
sequence_length: int,
tokenizer_padding_side: str = "left",
engine_kwargs: Optional[Dict] = None,
onnx_model_name: Optional[str] = None,
) -> Tuple[
str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer, Dict[str, Any]
]:
@@ -65,10 +62,14 @@
:param sequence_length: The sequence length to use for the model
:param tokenizer_padding_side: The side to pad on for the tokenizer,
either "left" or "right"
:param onnx_model_name: The name of the onnx model to be loaded.
If not specified, defaults are used (see fetch_onnx_file_path)
:param engine_kwargs: The kwargs to pass to the engine
:return: The model path, config, tokenizer, and engine kwargs
"""
model_path, config, tokenizer = fetch_onnx_file_path(model_path, sequence_length)
model_path, config, tokenizer = fetch_onnx_file_path(
model_path, sequence_length, onnx_model_name
)

tokenizer.padding_side = tokenizer_padding_side
if not tokenizer.pad_token:
@@ -89,6 +90,7 @@
def fetch_onnx_file_path(
model_path: str,
sequence_length: int,
onnx_model_name: Optional[str] = None,
task: Optional[str] = None,
) -> Tuple[str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer]:
"""
@@ -97,9 +99,13 @@ def fetch_onnx_file_path(
derived from the `model_path` provided.
:param model_path: path to the model to be parsed
:param sequence_length: maximum sequence length of the model
:param onnx_model_name: optionally, the precise name of the ONNX model
of interest may be specified. If not specified, the default ONNX model
name will be used (refer to `get_deployment_path` for details)
:param task: task to use for the config. Defaults to None
:return: file path to the processed ONNX file for the engine to compile
"""
deployment_path, onnx_path = get_deployment_path(model_path)
deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name)

hf_logger = logging.getLogger("transformers")
hf_logger_level = hf_logger.level
@@ -126,7 +132,9 @@ def fetch_onnx_file_path(
return onnx_path, config, tokenizer


def get_deployment_path(model_path: str) -> Tuple[str, str]:
def get_deployment_path(
model_path: str, onnx_model_name: Optional[str] = None
) -> Tuple[str, str]:
"""
Returns the path to the deployment directory
for the given model path and the path to the mandatory
@@ -135,27 +143,33 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]:
for running the transformers model in the deepsparse pipeline

:param model_path: path to model directory, sparsezoo stub, or ONNX file
:param onnx_model_name: name of the ONNX file to look for in the deployment
directory. Defaults to MODEL_ONNX_NAME
:return: path to the deployment directory and path to the ONNX file inside
the deployment directory
"""
onnx_model_name = onnx_model_name or MODEL_ONNX_NAME

if os.path.isfile(model_path):
# return the parent directory of the ONNX file
return os.path.dirname(model_path), model_path

if os.path.isdir(model_path):
model_files = os.listdir(model_path)

if _MODEL_DIR_ONNX_NAME not in model_files:
if onnx_model_name not in model_files:
raise ValueError(
f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory "
f"{onnx_model_name} not found in transformers model directory "
f"{model_path}. Be sure that an export of the model is written to "
f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}"
f"{os.path.join(model_path, onnx_model_name)}"
)
return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME)
return model_path, os.path.join(model_path, onnx_model_name)

elif model_path.startswith("zoo:") or model_path.startswith("hf:"):
onnx_model_path = model_to_path(model_path)
return os.path.dirname(onnx_model_path), onnx_model_path
return os.path.dirname(onnx_model_path), onnx_model_path.replace(
MODEL_ONNX_NAME, onnx_model_name
)
else:
raise ValueError(
f"model_path {model_path} is not a valid file, directory, or zoo stub"
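Taken together, these helper changes thread an optional onnx_model_name argument from setup_transformers_pipeline through fetch_onnx_file_path down to get_deployment_path, letting a pipeline select an alternate ONNX export inside the same deployment directory. A minimal sketch of the new resolution behavior, assuming the default MODEL_ONNX_NAME resolves to model.onnx and using a hypothetical alternate file name:

from deepsparse.transformers.helpers import get_deployment_path

# Assumed local layout (the no-cache file name is hypothetical):
#   ./deployment/model.onnx           <- default export (MODEL_ONNX_NAME)
#   ./deployment/model_nocache.onnx   <- alternate, KV-cache-free export

# Default lookup: MODEL_ONNX_NAME inside the directory
deployment_dir, onnx_path = get_deployment_path("./deployment")
# -> ("./deployment", "./deployment/model.onnx")

# New behavior: an explicit name overrides the default
deployment_dir, onnx_path = get_deployment_path(
    "./deployment", onnx_model_name="model_nocache.onnx"
)
# -> ("./deployment", "./deployment/model_nocache.onnx")

Note that for zoo: and hf: stubs the resolved path is rewritten with str.replace, so the alternate file is expected to sit next to the default export.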
src/deepsparse/transformers/pipelines/text_generation/__init__.py
@@ -21,6 +21,7 @@
from .kv_cache_operator import *
from .multi_engine_prefill_operator import *
from .nl_engine_operator import *
from .nl_engine_operator_no_kv_cache import *
from .parse_inputs import *
from .prep_for_prefill import *
from .process_inputs import *
@@ -31,3 +32,4 @@
from .prep_for_generation import * # isort:skip

from .pipeline import * # isort:skip
from .pipeline_no_kv_cache import * # isort:skip
src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py
@@ -19,7 +19,10 @@
from deepsparse.transformers.pipelines.text_generation.nl_engine_operator import (
NLEngineOutputs,
)
from deepsparse.transformers.schemas.text_generation_schemas import FinishReason
from deepsparse.transformers.schemas.text_generation_schemas import (
FinishReason,
PromptLogitsNoKVCacheInference,
)
from deepsparse.utils import InferenceState


@@ -33,14 +36,23 @@ def __init__(
self.force_max_tokens = force_max_tokens
self.tokenizer = tokenizer

def can_operate(self, inp: NLEngineOutputs):
def can_operate(self, inp: Union[PromptLogitsNoKVCacheInference, NLEngineOutputs]):
if inp.in_generation:
return True
return False

def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs):
logits = inp.engine_outputs
kv_cache = inp.kv_cache
def run(
self,
inp: Union[PromptLogitsNoKVCacheInference, NLEngineOutputs],
inference_state: InferenceState,
**kwargs,
):
logits = (
inp.engine_outputs
if isinstance(inp, NLEngineOutputs)
else inp.prompt_logits
)
kv_cache = inp.kv_cache if isinstance(inp, NLEngineOutputs) else None

token_generator = inference_state.current_state.get("token_generator")
token = token_generator.generate(logits=logits[0, -1, :])
@@ -49,7 +61,10 @@ def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs):
callback = inference_state.current_state.get("callback")
stop = inference_state.current_state.get("stop")

if kv_cache.total_num_processed_tokens >= kv_cache.capacity:
if (
kv_cache is not None
and kv_cache.total_num_processed_tokens >= kv_cache.capacity
):
finish_reason = FinishReason.CAPACITY

if token == self.tokenizer.eos_token_id and not self.force_max_tokens:
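With the widened Union, one token-generation operator now serves both execution paths: logits are read from engine_outputs when an NLEngineOutputs arrives and from prompt_logits otherwise, and the KV-cache capacity check is skipped when there is no cache. A minimal sketch of the dispatch pattern, using stand-in dataclasses rather than the real schemas:

from dataclasses import dataclass
from typing import Union

@dataclass
class CachedOutputs:            # stand-in for NLEngineOutputs
    engine_outputs: list
    kv_cache: object
    in_generation: bool = True

@dataclass
class NoCacheOutputs:           # stand-in for PromptLogitsNoKVCacheInference
    prompt_logits: list
    in_generation: bool = True

def extract_logits(inp: Union[CachedOutputs, NoCacheOutputs]):
    # Pick the logits field that matches the input type; the cache-free
    # path carries no kv_cache, so downstream checks must be None-safe.
    logits = inp.engine_outputs if isinstance(inp, CachedOutputs) else inp.prompt_logits
    kv_cache = inp.kv_cache if isinstance(inp, CachedOutputs) else None
    return logits, kv_cache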
src/deepsparse/transformers/pipelines/text_generation/join_output.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
from typing import Dict, List, Tuple

import numpy

@@ -34,7 +34,8 @@ class JoinOutput(Operator):
def __init__(self, tokenizer):
self.tokenizer = tokenizer

def run(self, inp: List[CompileGenerationsOutput], **kwargs):
def run(self, inp: Tuple[List[CompileGenerationsOutput], Dict], **kwargs):

batch_outputs = [x for x in inp[0]]
generated_tokens = [x.generated_tokens for x in batch_outputs]
generated_logits = [x.generated_logits for x in batch_outputs]
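The loosened signature matches what the join step actually receives after a split across prompts: a tuple whose first element is the list of per-prompt CompileGenerationsOutput objects (the dict in the second slot is not consumed here). A toy sketch of the unpacking, with a placeholder class:

from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

@dataclass
class GenerationResult:         # placeholder for CompileGenerationsOutput
    generated_tokens: List[int]
    generated_logits: Any

def join(inp: Tuple[List[GenerationResult], Dict]):
    batch_outputs = list(inp[0])    # only the first tuple element is used
    tokens = [x.generated_tokens for x in batch_outputs]
    logits = [x.generated_logits for x in batch_outputs]
    return tokens, logits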
src/deepsparse/transformers/pipelines/text_generation/nl_engine_operator.py
@@ -32,7 +32,7 @@
)


__all__ = ["NLEngineOperator", "NLEngineInputs"]
__all__ = ["NLEngineOperator", "NLEngineInputs", "NLEngineOutputs"]


class NLEngineInputs(BaseModel):
src/deepsparse/transformers/pipelines/text_generation/nl_engine_operator_no_kv_cache.py (new file)
@@ -0,0 +1,67 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any

import numpy
from pydantic import BaseModel

from deepsparse.operators.engine_operator import EngineOperator, EngineOperatorInputs
from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs


__all__ = [
"NLEngineOperatorNoCache",
"NLEngineInputsNoCache",
]


class NLEngineInputsNoCache(BaseModel):
input_ids: Any
attention_mask: Any


class NLEngineOperatorNoCache(EngineOperator):
"""
Operator for the Natural Language Engine that runs without a
KV cache, i.e. it simply maps input_ids and attention_mask
to logits.
"""

input_schema = NLEngineInputsNoCache
output_schema = None

def __init__(self, sequence_length: int, **kwargs):
overwrite_transformer_onnx_model_inputs(
path=kwargs.get("model_path"),
batch_size=kwargs.get("batch_size", 1),
max_length=sequence_length,
)
super().__init__(**kwargs)

def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any:
engine_inputs = [inp.input_ids, inp.attention_mask]
logits = (
super()
.run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs)
.get("engine_outputs")
)

# By default, the engine outputs logits for all tokens in the sequence.
# Let's filter out the logits for the padding tokens.
logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1)

return {"logits": [logits], "kv_cache": None, "tokens": None}, {
"prompt_logits": [logits]
}
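The one non-obvious step above is the numpy.compress call, which discards the logits produced for padding positions by using the flattened attention mask as a boolean filter along the sequence axis. A standalone illustration with toy shapes (not the real model dimensions):

import numpy

# Toy logits: batch of 1, sequence length 5, vocabulary of 3
logits = numpy.arange(15, dtype=numpy.float32).reshape(1, 5, 3)

# Left-padded prompt: the first two positions are padding
attention_mask = numpy.array([[0, 0, 1, 1, 1]])

# Keep only the sequence positions (axis=1) where the mask is nonzero
filtered = numpy.compress(attention_mask.flatten(), logits, axis=1)
print(filtered.shape)           # (1, 3, 3): padding logits removed

The operator then returns the filtered logits twice: once in its positional output and once as a prompt_logits state update, presumably the value the token-generation operator reads through PromptLogitsNoKVCacheInference.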