From 3e00175375ad101c419f5cf18dc5c2995ae8f309 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Thu, 26 Oct 2023 13:22:35 -0400 Subject: [PATCH 01/57] Pipelines Refactor - Initial Impl (#1287) --- src/deepsparse/v2/__init__.py | 21 ++++ src/deepsparse/v2/operators/__init__.py | 17 +++ src/deepsparse/v2/operators/operator.py | 90 ++++++++++++++++ src/deepsparse/v2/pipeline.py | 102 ++++++++++++++++++ src/deepsparse/v2/routers/__init__.py | 17 +++ src/deepsparse/v2/routers/router.py | 95 ++++++++++++++++ src/deepsparse/v2/schedulers/__init__.py | 18 ++++ src/deepsparse/v2/schedulers/scheduler.py | 63 +++++++++++ .../v2/schedulers/scheduler_group.py | 64 +++++++++++ src/deepsparse/v2/utils/__init__.py | 18 ++++ src/deepsparse/v2/utils/context.py | 42 ++++++++ src/deepsparse/v2/utils/types.py | 28 +++++ tests/deepsparse/v2/__init__.py | 0 tests/deepsparse/v2/test_basic_pipeline.py | 45 ++++++++ 14 files changed, 620 insertions(+) create mode 100644 src/deepsparse/v2/__init__.py create mode 100644 src/deepsparse/v2/operators/__init__.py create mode 100644 src/deepsparse/v2/operators/operator.py create mode 100644 src/deepsparse/v2/pipeline.py create mode 100644 src/deepsparse/v2/routers/__init__.py create mode 100644 src/deepsparse/v2/routers/router.py create mode 100644 src/deepsparse/v2/schedulers/__init__.py create mode 100644 src/deepsparse/v2/schedulers/scheduler.py create mode 100644 src/deepsparse/v2/schedulers/scheduler_group.py create mode 100644 src/deepsparse/v2/utils/__init__.py create mode 100644 src/deepsparse/v2/utils/context.py create mode 100644 src/deepsparse/v2/utils/types.py create mode 100644 tests/deepsparse/v2/__init__.py create mode 100644 tests/deepsparse/v2/test_basic_pipeline.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py new file mode 100644 index 0000000000..4a897be06f --- /dev/null +++ b/src/deepsparse/v2/__init__.py @@ -0,0 +1,21 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pipeline import * +from .operators import * +from .routers import * +from .schedulers import * +from .utils import * diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py new file mode 100644 index 0000000000..8f7e6a169d --- /dev/null +++ b/src/deepsparse/v2/operators/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .operator import * diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py new file mode 100644 index 0000000000..30e1a48379 --- /dev/null +++ b/src/deepsparse/v2/operators/operator.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Optional, Type + +from pydantic import BaseModel + +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["Operator"] + + +class Operator(ABC): + """ + Base operator class - can represent any part of an ML pipeline + """ + + # expected structured input and output types, to be defined by child classes + input_schema: Optional[Type[OperatorSchema]] = None + output_schema: Optional[Type[OperatorSchema]] = None + + @abstractmethod + def run(self, inp: OperatorSchema, context: Context) -> OperatorSchema: + """ + :param inp: operator input, as the defined input schema if applicable + :param context: pipeline context of already run operators + :return: result of this operator as the defined output schema if applicable + """ + raise NotImplementedError + + @classmethod + def has_input_schema(cls) -> bool: + """ + :return: True if this class has a defined pydantic input schema + """ + return issubclass(cls.input_schema, BaseModel) + + @classmethod + def has_output_schema(cls) -> bool: + """ + :return: True if this class has a defined pydantic input schema + """ + return issubclass(cls.output_schema, BaseModel) + + def __call__( + self, + *args, + context: Optional[Context] = None, + **kwargs, + ) -> OperatorSchema: + """ + Parses inputs to this Operator and runs the run() method of this operator + + :param args: an unnamed arg may only be provided + if it is of the type of the input_schema + :param context: pipeline context to pass to operator + :param kwargs: kwargs when not initializing from an instantiated schema + :return: operator output + """ + if len(args) > 1: + raise ValueError( + f"Only 1 unnamed arg may be supplied to an Operator, found {len(args)}" + ) + + if len(args) == 1: + if self.input_schema is not None and isinstance(args[0], self.input_schema): + inference_input = args[0] + else: + raise ValueError( + f"1 arg supplied to Operator {self.__class__.__name__} but was not " + f"of expected type {self.input_schema}, found {type(args[0])}" + ) + elif self.has_input_schema(): + inference_input = self.input_schema(**kwargs) + else: + inference_input = kwargs + return self.run(inference_input, context=context) diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py new file mode 100644 index 0000000000..0ec580687d --- /dev/null +++ b/src/deepsparse/v2/pipeline.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List + +from pydantic import BaseModel, Field, PrivateAttr + +from deepsparse.v2.operators import Operator +from deepsparse.v2.routers import Router +from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup + + +__all__ = ["Pipeline"] + + +class Pipeline(BaseModel): + """ + Pipeline accepts a series of operators, schedulers, and a router which define + an end to end ML transformation. + + Calling a pipeline runs these transformations + """ + + stages: List[Operator] = Field( + required=True, + description="In-order list of operators that make up this pipeline", + ) + router: Router = Field( + default_factor=Router, + description="Router object to determine order and run the stages. " + "Defaults to the base Router object", + ) + schedulers: List[OperatorScheduler] = Field( + default_factor=lambda: [OperatorScheduler()], + description="List of schedulers to run operators in order of priority", + ) + + _scheduler_group: SchedulerGroup = PrivateAttr() + + class Config: + arbitrary_types_allowed = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.validate() + + # SchedulerGroup handles running all schedulers in order of priority + self._scheduler_group = SchedulerGroup(self.schedulers) + + def __call__(self, *args, return_context: bool = False, **kwargs): + """ + :param return_context: if True, retrns tuple of the pipelien output + and entire context. Default False + :return: output of the pipeline stages ran with the router for the given input + """ + if len(args) > 1: + raise ValueError( + "Only 1 in-line argument may be supplied to Pipeline which " + f"must be a Schema, found: {len(args)}" + ) + if args and kwargs: + raise ValueError( + "Pipeline can only run either a single in-line argument schema or a " + f"series of kwargs, found {len(args)} args and {len(kwargs)} kwargs" + ) + + pipeline_input = args[0] or kwargs + pipeline_output, context = self.router.run( + inp=pipeline_input, + operators=self.stages, + scheduler=self._scheduler_group, + ) + + if return_context: + return pipeline_output, context + + return pipeline_output + + def validate(self): + router_validation = self.router.validate(self.stages) + + if router_validation is False: + # default error message + stage_types = [type(stage) for stage in self.stages] + raise ValueError( + f"Invalid Router: {type(self.router)} for stages: {stage_types}" + ) + elif isinstance(router_validation, str): + raise ValueError(f"Invalid Router for stages: {router_validation}") diff --git a/src/deepsparse/v2/routers/__init__.py b/src/deepsparse/v2/routers/__init__.py new file mode 100644 index 0000000000..8718bedeb4 --- /dev/null +++ b/src/deepsparse/v2/routers/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .router import * diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py new file mode 100644 index 0000000000..284c348c10 --- /dev/null +++ b/src/deepsparse/v2/routers/router.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Tuple, Union + +from deepsparse.v2.operators import Operator +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["Router"] + + +class Router: + """ + Routers must implement a run method which runs a series of operators + for a pipeline for a given input. Base Router runs operators linearly + in a series + """ + + @staticmethod + def run( + inp: OperatorSchema, + operators: List[Operator], + scheduler: OperatorScheduler, + ) -> Tuple[OperatorSchema, Context]: + """ + :param inp: input to the first operator of the series + :param operators: list of operators to run + :param scheduler: scheudler to submit operators to + :return: final output of the operators + """ + context = Context() + + # run operators linearly + operator_input = inp + for operator in operators: + output_future = scheduler.submit( + operator=operator, operator_input=operator_input, context=context + ) + + # wait for future to resolve + operator_output = output_future.result() + + # update context + context.update( + operator=operator, + input=operator_input, + output=operator_output, + ) + + # previous output becomes next input + operator_input = operator_output + + return operator_output, context + + @staticmethod + def validate(operators: List[Operator]) -> Union[bool, str]: + """ + :param operators: operators that this Router could potentially run over + :return: True if this Router can run this series of operators. Base Router + runs any series of operators that is non empty and whose input and output + schemas align. 
If not valid, either False or an error string will be + returned + """ + if len(operators) < 1: + return "No operators found" + + for idx in range(len(operators) - 1): + current_output_schema = operators[idx].output_schema + next_input_schema = operators[idx + 1].input_schema + + if current_output_schema is None or next_input_schema is None: + # if no input/output schema defined, assume operator can run + # without schema + continue + + if current_output_schema != next_input_schema: + return ( + f"Operator at idx {idx}: {type(operators[idx])} has invalid " + f"output schema {current_output_schema} for next operator " + f"{type(operators[idx + 1])} which requires {next_input_schema}" + ) diff --git a/src/deepsparse/v2/schedulers/__init__.py b/src/deepsparse/v2/schedulers/__init__.py new file mode 100644 index 0000000000..04c37077e1 --- /dev/null +++ b/src/deepsparse/v2/schedulers/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .scheduler import * +from .scheduler_group import * diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py new file mode 100644 index 0000000000..53f0c8f625 --- /dev/null +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future, ThreadPoolExecutor + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["OperatorScheduler"] + + +class OperatorScheduler: + """ + OperatorSchedulers should implement a `submit` function that asynchronously + runs an operator and its input and returns a Future. 
Priority of operators + to run and resources they are run on are deferred to specific OperatorScheduler + implementations + + Base OperatorScheduler behaves as a simple queue deferring to ThreadPoolExecutor + + :param max_workers: maximum number of threads to execute at once + """ + + def __init__(self, max_workers: int = 1): + self._threadpool = ThreadPoolExecutor(max_workers=max_workers) + + def submit( + self, + operator: Operator, + operator_input: OperatorSchema, + context: Context, + ) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :param context: context of already run operators + :return: future referencing the asynchronously run output of the operator + """ + if isinstance(operator_input, dict): + return self._threadpool.submit(operator, context=context, **operator_input) + return self._threadpool.submit(operator, operator_input, context=context) + + def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. + Base OperatorScheduler always returns True + """ + return True diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py new file mode 100644 index 0000000000..2f797b30c7 --- /dev/null +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future +from typing import List + +from deepsparse.v2.operators import Operator +from deepsparse.v2.schedulers.scheduler import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["SchedulerGroup"] + + +class SchedulerGroup(OperatorScheduler): + """ + Wrapper for a series of schedulers. Runs submitted operators on the first + scheduler that can process a given input + + :param schedulers: list of schedulers to pass operators to + """ + + def __init__(self, schedulers: List[OperatorScheduler]): + self.schedulers = schedulers + + def submit( + self, + operator: Operator, + operator_input: OperatorSchema, + context: Context, + ) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :param context: context of already run operators + :return: future referencing the asynchronously run output of the operator + """ + for scheduler in self.schedulers: + if scheduler.can_process(operator, operator_input): + return scheduler.submit(operator, operator_input, context) + + def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. 
+ SchedulerGroup always returns True + """ + return any( + scheduler.can_process(operator, operator_input) + for scheduler in self.schedulers + ) diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py new file mode 100644 index 0000000000..4f36eeb448 --- /dev/null +++ b/src/deepsparse/v2/utils/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .context import * +from .types import * diff --git a/src/deepsparse/v2/utils/context.py b/src/deepsparse/v2/utils/context.py new file mode 100644 index 0000000000..81fe26de61 --- /dev/null +++ b/src/deepsparse/v2/utils/context.py @@ -0,0 +1,42 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, List, NamedTuple + +from deepsparse.v2.utils.types import OperatorSchema + + +__all__ = ["Context"] + + +class StageInfo(NamedTuple): + operator: Callable + input: OperatorSchema + output: OperatorSchema + + +class Context: + """ + Context contains the full history of operators and their inputs and outputs + in a pipeline + """ + + def __init__(self): + self.stages_executed: List[StageInfo] = [] + + def update(self, operator: Callable, input: OperatorSchema, output: OperatorSchema): + self.stages_executed.append( + StageInfo(operator=operator, input=input, output=output) + ) diff --git a/src/deepsparse/v2/utils/types.py b/src/deepsparse/v2/utils/types.py new file mode 100644 index 0000000000..3e4b974453 --- /dev/null +++ b/src/deepsparse/v2/utils/types.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Types to support deepsparse pipelines +""" + +from typing import Any, Dict, Union + +from pydantic import BaseModel + + +__all__ = ["OperatorSchema"] + + +# Operator inputs and outputs may either be a pydantic base model or a dict of kwargs +OperatorSchema = Union[BaseModel, Dict[str, Any]] diff --git a/tests/deepsparse/v2/__init__.py b/tests/deepsparse/v2/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py new file mode 100644 index 0000000000..d39bc61c8c --- /dev/null +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -0,0 +1,45 @@ +""" +Simple example and test of a dummy pipeline +""" + +from pydantic import BaseModel + +from deepsparse.v2 import Pipeline +from deepsparse.v2.operators import Operator +from deepsparse.v2.routers import Router +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +class IntSchema(BaseModel): + value: int + + +class AddOneOperator(Operator): + input_schema = IntSchema + output_schema = IntSchema + + def run(self, inp: IntSchema, context: Context) -> OperatorSchema: + return IntSchema(value=inp.value + 1) + + +class AddTwoOperator(Operator): + input_schema = IntSchema + output_schema = IntSchema + + def run(self, inp: IntSchema, context: Context) -> OperatorSchema: + return IntSchema(value=inp.value + 2) + + +AddThreePipeline = Pipeline( + stages=[AddOneOperator(), AddTwoOperator()], + router=Router(), + schedulers=[OperatorScheduler()], +) + + +def test_run_simple_pipeline(): + pipeline_input = IntSchema(value=5) + pipeline_output = AddThreePipeline(pipeline_input) + + assert pipeline_output.value == 8 From 224e116dbd0dea213021ccf87b12577d8a408b55 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 31 Oct 2023 16:24:10 -0400 Subject: [PATCH 02/57] [Pipeline Refactor] Additional functionality, engine operator, linear router and image classification pipeline/operators/example (#1325) * initial functionality and working example with image classification * remove testing image * update args * initial functionality and working example with image classification * remove testing image * pr comments * defines schemas for operators and test * add image classification test, PR comments * fix input/output handling in pipeline and operator base classes to be more generic; remove context * add additional operator input message * typo fix --- src/deepsparse/v2/__init__.py | 2 +- .../v2/image_classification/__init__.py | 20 +++ .../v2/image_classification/pipeline.py | 62 ++++++++ .../postprocess_operator.py | 81 ++++++++++ .../preprocess_operator.py | 149 ++++++++++++++++++ .../v2/operators/engine_operator.py | 133 ++++++++++++++++ src/deepsparse/v2/operators/operator.py | 92 +++++++---- src/deepsparse/v2/pipeline.py | 130 ++++++++------- src/deepsparse/v2/routers/router.py | 88 ++++++----- src/deepsparse/v2/schedulers/scheduler.py | 14 +- .../v2/schedulers/scheduler_group.py | 16 +- src/deepsparse/v2/utils/__init__.py | 1 - src/deepsparse/v2/utils/context.py | 42 ----- tests/deepsparse/v2/__init__.py | 13 ++ tests/deepsparse/v2/test_basic_pipeline.py | 31 +++- .../v2/test_image_classification.py | 39 +++++ 16 files changed, 709 insertions(+), 204 deletions(-) create mode 100644 src/deepsparse/v2/image_classification/__init__.py create mode 100644 src/deepsparse/v2/image_classification/pipeline.py create mode 100644 src/deepsparse/v2/image_classification/postprocess_operator.py create 
mode 100644 src/deepsparse/v2/image_classification/preprocess_operator.py create mode 100644 src/deepsparse/v2/operators/engine_operator.py delete mode 100644 src/deepsparse/v2/utils/context.py create mode 100644 tests/deepsparse/v2/test_image_classification.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py index 4a897be06f..29fcd4126c 100644 --- a/src/deepsparse/v2/__init__.py +++ b/src/deepsparse/v2/__init__.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .pipeline import * from .operators import * +from .pipeline import * from .routers import * from .schedulers import * from .utils import * diff --git a/src/deepsparse/v2/image_classification/__init__.py b/src/deepsparse/v2/image_classification/__init__.py new file mode 100644 index 0000000000..8668227df7 --- /dev/null +++ b/src/deepsparse/v2/image_classification/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +from .postprocess_operator import * +from .preprocess_operator import * + + +from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/image_classification/pipeline.py b/src/deepsparse/v2/image_classification/pipeline.py new file mode 100644 index 0000000000..3d7887a701 --- /dev/null +++ b/src/deepsparse/v2/image_classification/pipeline.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import warnings +from typing import Dict, Optional, Tuple, Union + +from deepsparse.v2.image_classification.postprocess_operator import ( + ImageClassificationPostProcess, +) +from deepsparse.v2.image_classification.preprocess_operator import ( + ImageClassificationPreProcess, +) +from deepsparse.v2.operators.engine_operator import EngineOperator +from deepsparse.v2.pipeline import Pipeline +from deepsparse.v2.routers.router import LinearRouter +from deepsparse.v2.schedulers.scheduler import OperatorScheduler + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["ImageClassificationPipeline"] + + +class ImageClassificationPipeline(Pipeline): + def __init__( + self, + model_path: str, + engine_kwargs: Optional[Dict] = None, + class_names: Union[None, str, Dict[str, str]] = None, + image_size: Optional[Tuple[int]] = None, + top_k: int = 1, + ): + if not engine_kwargs: + engine_kwargs = {} + engine_kwargs["model_path"] = model_path + elif engine_kwargs.get("model_path") != model_path: + warnings.warn(f"Updating engine_kwargs to include {model_path}") + + engine = EngineOperator(**engine_kwargs) + preproces = ImageClassificationPreProcess( + model_path=engine.model_path, image_size=image_size + ) + postprocess = ImageClassificationPostProcess( + top_k=top_k, class_names=class_names + ) + + ops = [preproces, engine, postprocess] + router = LinearRouter(end_route=len(ops)) + scheduler = [OperatorScheduler()] + super().__init__(ops=ops, router=router, schedulers=scheduler) diff --git a/src/deepsparse/v2/image_classification/postprocess_operator.py b/src/deepsparse/v2/image_classification/postprocess_operator.py new file mode 100644 index 0000000000..9231113368 --- /dev/null +++ b/src/deepsparse/v2/image_classification/postprocess_operator.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import Dict, List, Union + +import numpy +from pydantic import BaseModel, Field + +from deepsparse.v2.operators import Operator + + +class ImageClassificationOutput(BaseModel): + """ + Output model for image classification + """ + + labels: List[Union[int, str, List[int], List[str]]] = Field( + description="List of labels, one for each prediction" + ) + scores: List[Union[float, List[float]]] = Field( + description="List of scores, one for each prediction" + ) + + +__all__ = ["ImageClassificationPostProcess"] + + +class ImageClassificationPostProcess(Operator): + """ + Image Classification post-processing Operator. This Operator is responsible for + processing outputs from the engine and returning the classification results to + the user, using the ImageClassifcationOutput structure. 
+ """ + + input_schema = None + output_schema = ImageClassificationOutput + + def __init__( + self, top_k: int = 1, class_names: Union[None, str, Dict[str, str]] = None + ): + self.top_k = top_k + if isinstance(class_names, str) and class_names.endswith(".json"): + self._class_names = json.load(open(class_names)) + elif isinstance(class_names, dict): + self._class_names = class_names + else: + self._class_names = None + + def run(self, inp: "EngineOperatorOutputs", **kwargs) -> Dict: # noqa: F821 + labels, scores = [], [] + inp = inp.engine_outputs + for prediction_batch in inp[0]: + label = (-prediction_batch).argsort()[: self.top_k] + score = prediction_batch[label] + labels.append(label) + scores.append(score.tolist()) + + if self._class_names is not None: + labels = numpy.vectorize(self._class_names.__getitem__)(labels) + labels = labels.tolist() + + if isinstance(labels[0], numpy.ndarray): + labels = [label.tolist() for label in labels] + + if len(labels) == 1: + labels = labels[0] + scores = scores[0] + + return {"scores": scores, "labels": labels} diff --git a/src/deepsparse/v2/image_classification/preprocess_operator.py b/src/deepsparse/v2/image_classification/preprocess_operator.py new file mode 100644 index 0000000000..9b4517a44c --- /dev/null +++ b/src/deepsparse/v2/image_classification/preprocess_operator.py @@ -0,0 +1,149 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Tuple + +import numpy +import onnx +from PIL import Image +from torchvision import transforms + +from deepsparse.image_classification.constants import ( + IMAGENET_RGB_MEANS, + IMAGENET_RGB_STDS, +) +from deepsparse.pipelines.computer_vision import ComputerVisionSchema +from deepsparse.v2.operators import Operator + + +class ImageClassificationInput(ComputerVisionSchema): + """ + Input model for image classification + """ + + +__all__ = ["ImageClassificationPreProcess"] + + +class ImageClassificationPreProcess(Operator): + """ + Image Classification pre-processing operator. This Operator is expected to process + the user inputs and prepare them for the engine. Inputs to this Operator are + expected to follow the ImageClassificationInput schema. 
+ """ + + input_schema = ImageClassificationInput + output_schema = None + + def __init__(self, model_path: str, image_size: Optional[Tuple[int]] = None): + self.model_path = model_path + self._image_size = image_size or self._infer_image_size() + non_rand_resize_scale = 256.0 / 224.0 # standard used + self._pre_normalization_transforms = transforms.Compose( + [ + transforms.Resize( + tuple( + [ + round(non_rand_resize_scale * size) + for size in self._image_size + ] + ) + ), + transforms.CenterCrop(self._image_size), + ] + ) + + def run(self, inp: ImageClassificationInput, **kwargs) -> Dict: + """ + Pre-Process the Inputs for DeepSparse Engine + + :param inputs: input model + :return: list of preprocessed numpy arrays + """ + + if isinstance(inp.images, numpy.ndarray): + image_batch = inp.images + else: + if isinstance(inp.images, str): + inp.images = [inp.images] + + image_batch = list(map(self._preprocess_image, inp.images)) + + # build batch + image_batch = numpy.stack(image_batch, axis=0) + + original_dtype = image_batch.dtype + image_batch = numpy.ascontiguousarray(image_batch, dtype=numpy.float32) + + if original_dtype == numpy.uint8: + image_batch /= 255 + # normalize entire batch + image_batch -= numpy.asarray(IMAGENET_RGB_MEANS).reshape((-1, 3, 1, 1)) + image_batch /= numpy.asarray(IMAGENET_RGB_STDS).reshape((-1, 3, 1, 1)) + + return {"engine_inputs": [image_batch]} + + def _preprocess_image(self, image) -> numpy.ndarray: + if isinstance(image, List): + # image given as raw list + image = numpy.asarray(image) + if image.dtype == numpy.float32: + # image is already processed, append and continue + return image + # assume raw image input + # put image in PIL format for torchvision processing + image = image.astype(numpy.uint8) + if image.shape[0] < image.shape[-1]: + # put channel last + image = numpy.einsum("cwh->whc", image) + image = Image.fromarray(image) + elif isinstance(image, str): + # load image from string filepath + image = Image.open(image).convert("RGB") + elif isinstance(image, numpy.ndarray): + image = image.astype(numpy.uint8) + if image.shape[0] < image.shape[-1]: + # put channel last + image = numpy.einsum("cwh->whc", image) + image = Image.fromarray(image) + + if not isinstance(image, Image.Image): + raise ValueError( + f"inputs to {self.__class__.__name__} must be a string image " + "file path(s), a list representing a raw image, " + "PIL.Image.Image object(s), or a numpy array representing" + f"the entire pre-processed batch. Found {type(image)}" + ) + + # apply resize and center crop + image = self._pre_normalization_transforms(image) + image_numpy = numpy.array(image) + image.close() + + # make channel first dimension + image_numpy = image_numpy.transpose(2, 0, 1) + return image_numpy + + def _infer_image_size(self) -> Tuple[int, ...]: + """ + Infer and return the expected shape of the input tensor + + :return: The expected shape of the input tensor from onnx graph + """ + onnx_model = onnx.load(self.model_path) + input_tensor = onnx_model.graph.input[0] + return ( + input_tensor.type.tensor_type.shape.dim[2].dim_value, + input_tensor.type.tensor_type.shape.dim[3].dim_value, + ) diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py new file mode 100644 index 0000000000..aac94a7697 --- /dev/null +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -0,0 +1,133 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Union + +from pydantic import BaseModel, Field + +from deepsparse import Context, Engine, MultiModelEngine, Scheduler +from deepsparse.benchmark import ORTEngine +from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs +from deepsparse.v2.operators import Operator + + +DEEPSPARSE_ENGINE = "deepsparse" +ORT_ENGINE = "onnxruntime" + +SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] + +__all__ = ["EngineOperator"] + + +class EngineOperatorInputs(BaseModel): + engine_inputs: List = Field(description="engine_inputs") + + +class EngineOperatorOutputs(BaseModel): + engine_outputs: List = Field(description="engine outputs") + + +class EngineOperator(Operator): + input_schema = EngineOperatorInputs + output_schema = EngineOperatorOutputs + + def __init__( + self, + model_path: str, + engine_type: str = DEEPSPARSE_ENGINE, + batch_size: Optional[int] = 1, + num_cores: int = None, + num_streams: int = None, + scheduler: Scheduler = None, + input_shapes: List[List[int]] = None, + engine_context: Optional[Context] = None, + ): + + self._batch_size = batch_size + self.model_path = model_to_path(model_path) + self.engine_context = engine_context + + if self.engine_context is not None: + num_cores = num_cores or self.engine_context.num_cores + if self.engine_context.num_cores != num_cores: + raise ValueError( + f"num_cores mismatch. Expected {self.engine_context.num_cores} " + f"from passed context, but got {num_cores} while " + f"instantiating Pipeline" + ) + + engine_args = dict( + batch_size=self._batch_size, + num_cores=num_cores, + input_shapes=input_shapes, + ) + if engine_type.lower() == DEEPSPARSE_ENGINE: + engine_args["scheduler"] = scheduler + engine_args["num_streams"] = num_streams + + self.engine = self._create_engine(self.model_path, engine_type, engine_args) + + def _create_engine( + self, onnx_file_path: str, engine_type: str, engine_args: Dict + ) -> Union[Engine, MultiModelEngine, ORTEngine]: + """ + Create an inference engine for a given ONNX model + + :param onnx_file_path: path to ONNX model file + :param engine_type: type of engine to create. + :param engine_args: arguments to pass to engine constructor + :param context: context to use for engine + :return: inference engine + """ + engine_type = engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + if self.engine_context is not None and isinstance( + self.engine_context, Context + ): + engine_args.pop("num_cores", None) + engine_args.pop("scheduler", None) + engine_args.pop("num_streams", None) + engine_args["context"] = self.engien_context + return MultiModelEngine( + model=onnx_file_path, + **engine_args, + ) + engine_args.pop("cache_output_bools", None) + return Engine(onnx_file_path, **engine_args) + + if engine_type == ORT_ENGINE: + return ORTEngine(onnx_file_path, **engine_args) + + raise ValueError( + f"Unknown engine_type {engine_type}. 
Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) + + def run(self, inp: EngineOperatorInputs) -> Dict: + inp = inp.engine_inputs + batches, orig_batch_size = self.expand_inputs(engine_inputs=inp) + batches_outputs = list(map(self.engine, batches)) + engine_outputs = self.condense_inputs( + batch_outputs=batches_outputs, orig_batch_size=orig_batch_size + ) + return {"engine_outputs": engine_outputs} + + def expand_inputs(self, **kwargs): + return split_engine_inputs(kwargs["engine_inputs"], self._batch_size) + + def condense_inputs(self, **kwargs): + batch_outputs = kwargs["batch_outputs"] + orig_batch_size = kwargs["orig_batch_size"] + return join_engine_outputs(batch_outputs, orig_batch_size) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 30e1a48379..c3a3e28b78 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -13,39 +13,32 @@ # limitations under the License. from abc import ABC, abstractmethod -from typing import Optional, Type +from typing import Any, Optional, Type from pydantic import BaseModel -from deepsparse.v2.utils import Context, OperatorSchema - __all__ = ["Operator"] class Operator(ABC): """ - Base operator class - can represent any part of an ML pipeline + Base operator class - an operator should be defined for each atomic, functional + part of the pipeline. """ # expected structured input and output types, to be defined by child classes - input_schema: Optional[Type[OperatorSchema]] = None - output_schema: Optional[Type[OperatorSchema]] = None - - @abstractmethod - def run(self, inp: OperatorSchema, context: Context) -> OperatorSchema: - """ - :param inp: operator input, as the defined input schema if applicable - :param context: pipeline context of already run operators - :return: result of this operator as the defined output schema if applicable - """ - raise NotImplementedError + input_schema: Optional[Type[BaseModel]] = None + output_schema: Optional[Type[BaseModel]] = None @classmethod def has_input_schema(cls) -> bool: """ :return: True if this class has a defined pydantic input schema """ + if not cls.input_schema: + return False + return issubclass(cls.input_schema, BaseModel) @classmethod @@ -53,38 +46,73 @@ def has_output_schema(cls) -> bool: """ :return: True if this class has a defined pydantic input schema """ + if not cls.output_schema: + return False + return issubclass(cls.output_schema, BaseModel) def __call__( self, *args, - context: Optional[Context] = None, **kwargs, - ) -> OperatorSchema: + ) -> Any: """ Parses inputs to this Operator and runs the run() method of this operator - :param args: an unnamed arg may only be provided - if it is of the type of the input_schema + :param args: an unnamed arg may only be provided if it is of the type of the + input_schema :param context: pipeline context to pass to operator :param kwargs: kwargs when not initializing from an instantiated schema :return: operator output """ - if len(args) > 1: - raise ValueError( - f"Only 1 unnamed arg may be supplied to an Operator, found {len(args)}" - ) - - if len(args) == 1: - if self.input_schema is not None and isinstance(args[0], self.input_schema): + if self.has_input_schema(): + if len(args) > 1: + raise ValueError( + f"The operator requires an {self.input_schema}. Too many arguments" + "provided." 
+ ) + elif args and isinstance(args[0], self.input_schema): inference_input = args[0] + elif kwargs: + inference_input = self.input_schema(**kwargs) else: raise ValueError( - f"1 arg supplied to Operator {self.__class__.__name__} but was not " - f"of expected type {self.input_schema}, found {type(args[0])}" + "Can't resolve inputs. The values for the schema must be provided" + "in the form of a dictionary or an instance of the input_schema" + "object" ) - elif self.has_input_schema(): - inference_input = self.input_schema(**kwargs) + + run_output = self.run(inference_input) else: - inference_input = kwargs - return self.run(inference_input, context=context) + run_output = self.run(*args, **kwargs) + + if self.has_output_schema(): + return self.output_schema(**run_output) + return run_output + + @abstractmethod + def run(self, *args, **kwargs) -> Any: + """ + :param inp: operator input, as the defined input schema if applicable + :param context: pipeline context of already run operators + :return: result of this operator as the defined output schema if applicable + """ + raise NotImplementedError + + def expand_inputs(self, **kwargs): + """ + Generic function to handle expanding values. + """ + raise NotImplementedError + + def condense_inputs(self, **kwargs): + """ + Generic function to handle condensing values. + """ + raise NotImplementedError + + def yaml(self): + pass + + def json(self): + pass diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 0ec580687d..e58f8a5191 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -13,9 +13,7 @@ # limitations under the License. -from typing import List - -from pydantic import BaseModel, Field, PrivateAttr +from typing import Dict, List, Union from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router @@ -25,78 +23,90 @@ __all__ = ["Pipeline"] -class Pipeline(BaseModel): +class Pipeline(Operator): """ - Pipeline accepts a series of operators, schedulers, and a router which define - an end to end ML transformation. + Pipeline accepts a series of operators, schedulers, and a router. Calling a pipeline + will use the router to run through all the defined operators. The operators should + be implemented using the Operator class and each implemented Operator should be + responsible for a functional component of the pipelines. The flow of inputs/outputs + between the operators and the steps in the pipeline should be defined by the router, + (based off of the Router class), which dicates the next operator in the pipeline. + Execution of the operators will be handled by the provided schedulers. + + :param ops: Operators to run within the pipeline. Can either be a list of operators + or dictionary of operators. + :param router: A Router which dictates the next operator to call. + :param schedulers: A list of schedulers to run operators. - Calling a pipeline runs these transformations """ - stages: List[Operator] = Field( - required=True, - description="In-order list of operators that make up this pipeline", - ) - router: Router = Field( - default_factor=Router, - description="Router object to determine order and run the stages. 
" - "Defaults to the base Router object", - ) - schedulers: List[OperatorScheduler] = Field( - default_factor=lambda: [OperatorScheduler()], - description="List of schedulers to run operators in order of priority", - ) - - _scheduler_group: SchedulerGroup = PrivateAttr() - - class Config: - arbitrary_types_allowed = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__( + self, + ops: Union[Dict[str, Operator], List[Operator]], + router: Router, + schedulers: List[OperatorScheduler], + ): + self.ops = ops + self.router = router + self.schedulers = schedulers self.validate() # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) - def __call__(self, *args, return_context: bool = False, **kwargs): + def run(self, *args, **kwargs): + """ + Run through the operators using the provided router and scheduler. Update the + context to reflect each step of the router. The input to a given operator is the + output of the previous operator. + + :param inp: input to the operator. expected to be of any type that is + expected by the operator. + :param context: context to store the current the inputs, outputs, and operator + for each step of the router. + + """ + next_step = self.router.START_ROUTE + operator_output = None + while next_step != self.router.END_ROUTE: + # Either a dictionary key or valid index + operator = self.ops[next_step] + if next_step == self.router.START_ROUTE: + output_future = self._scheduler_group.submit( + *args, operator=operator, **kwargs + ) + else: + if isinstance(operator_output, dict): + output_future = self._scheduler_group.submit( + operator=operator, **operator_output + ) + else: + output_future = self._scheduler_group.submit( + operator_output, operator=operator + ) + + # wait for future to resolve + operator_output = output_future.result() + next_step = self.router.next(next_step, self.ops) + return operator_output + + def __call__(self, *args, **kwargs): """ - :param return_context: if True, retrns tuple of the pipelien output - and entire context. Default False - :return: output of the pipeline stages ran with the router for the given input + :return: output of the pipeline operators ran with the router for the given + input """ - if len(args) > 1: - raise ValueError( - "Only 1 in-line argument may be supplied to Pipeline which " - f"must be a Schema, found: {len(args)}" - ) - if args and kwargs: - raise ValueError( - "Pipeline can only run either a single in-line argument schema or a " - f"series of kwargs, found {len(args)} args and {len(kwargs)} kwargs" - ) - - pipeline_input = args[0] or kwargs - pipeline_output, context = self.router.run( - inp=pipeline_input, - operators=self.stages, - scheduler=self._scheduler_group, - ) - - if return_context: - return pipeline_output, context - - return pipeline_output + return self.run(*args, **kwargs) def validate(self): - router_validation = self.router.validate(self.stages) + """ + Validate that compatability of the router and operators provided. 
+ """ + router_validation = self.router.validate(self.ops) if router_validation is False: # default error message - stage_types = [type(stage) for stage in self.stages] - raise ValueError( - f"Invalid Router: {type(self.router)} for stages: {stage_types}" - ) + op_types = [type(op) for op in self.ops] + raise ValueError(f"Invalid Router: {type(self.router)} for ops: {op_types}") elif isinstance(router_validation, str): - raise ValueError(f"Invalid Router for stages: {router_validation}") + raise ValueError(f"Invalid Router for operators: {router_validation}") diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 284c348c10..6050803b5e 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -13,61 +13,70 @@ # limitations under the License. -from typing import List, Tuple, Union +import logging +from abc import abstractmethod +from typing import Dict, List, Union from deepsparse.v2.operators import Operator -from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema -__all__ = ["Router"] +_LOGGER = logging.getLogger(__name__) + +__all__ = ["Router", "LinearRouter"] class Router: """ - Routers must implement a run method which runs a series of operators - for a pipeline for a given input. Base Router runs operators linearly - in a series + Routers dicate the next operator to run. Each Router must implement a next function, + which dictates the index or key of the next operator to run. + + :param start_route: the start index or key of the router + :param end_route: the end index or key of the router + """ - @staticmethod - def run( - inp: OperatorSchema, - operators: List[Operator], - scheduler: OperatorScheduler, - ) -> Tuple[OperatorSchema, Context]: + def __init__(self, end_route: Union[str, int], start_route: Union[str, int]): + self.START_ROUTE = start_route + self.END_ROUTE = end_route + + @abstractmethod + def next( + self, past: Union[str, int], ops: Union[List[Operator], Dict[str, Operator]] + ) -> Union[str, int]: """ - :param inp: input to the first operator of the series - :param operators: list of operators to run - :param scheduler: scheudler to submit operators to - :return: final output of the operators + Determines the index or dictionary key for the next operator which should run. + + :param past: the previous index or key. This should uniquely determine the next + operator to run + :param ops: list or dictionary of operators + :returns: the next index or dictionary key for the next operator to run """ - context = Context() + raise NotImplementedError + + def yaml(self): + pass - # run operators linearly - operator_input = inp - for operator in operators: - output_future = scheduler.submit( - operator=operator, operator_input=operator_input, context=context - ) + def json(self): + pass - # wait for future to resolve - operator_output = output_future.result() - # update context - context.update( - operator=operator, - input=operator_input, - output=operator_output, - ) +class LinearRouter(Router): + """ + LinearRouterruns a list of Operators in sequential order. end_route should + be the length of the list and the start_route should be the start index. 
+ """ - # previous output becomes next input - operator_input = operator_output + def __init__(self, end_route: int, start_route: int = 0): + super().__init__(end_route=end_route, start_route=start_route) - return operator_output, context + def next(self, past: int, ops: List[Operator]) -> int: + new_index = past + 1 + if new_index < self.END_ROUTE: + return new_index + return self.END_ROUTE @staticmethod - def validate(operators: List[Operator]) -> Union[bool, str]: + def validate(operators: List[Operator]) -> bool: """ :param operators: operators that this Router could potentially run over :return: True if this Router can run this series of operators. Base Router @@ -76,7 +85,8 @@ def validate(operators: List[Operator]) -> Union[bool, str]: returned """ if len(operators) < 1: - return "No operators found" + _LOGGER.info("No operators provided") + return False for idx in range(len(operators) - 1): current_output_schema = operators[idx].output_schema @@ -88,8 +98,10 @@ def validate(operators: List[Operator]) -> Union[bool, str]: continue if current_output_schema != next_input_schema: - return ( + _LOGGER.info( f"Operator at idx {idx}: {type(operators[idx])} has invalid " f"output schema {current_output_schema} for next operator " f"{type(operators[idx + 1])} which requires {next_input_schema}" ) + return False + return True diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 53f0c8f625..7d4f249444 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -16,7 +16,6 @@ from concurrent.futures import Future, ThreadPoolExecutor from deepsparse.v2.operators import Operator -from deepsparse.v2.utils import Context, OperatorSchema __all__ = ["OperatorScheduler"] @@ -37,23 +36,16 @@ class OperatorScheduler: def __init__(self, max_workers: int = 1): self._threadpool = ThreadPoolExecutor(max_workers=max_workers) - def submit( - self, - operator: Operator, - operator_input: OperatorSchema, - context: Context, - ) -> Future: + def submit(self, *args, operator: Operator, **kwargs) -> Future: """ :param operator: operator to run :param operator_input: input schema to the operator :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ - if isinstance(operator_input, dict): - return self._threadpool.submit(operator, context=context, **operator_input) - return self._threadpool.submit(operator, operator_input, context=context) + return self._threadpool.submit(operator, *args, **kwargs) - def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + def can_process(self, *args, operator: Operator, **kwargs) -> bool: """ :param operator: operator to check :param operator_input: operator_input to check diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 2f797b30c7..7f00a3c17c 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -18,7 +18,6 @@ from deepsparse.v2.operators import Operator from deepsparse.v2.schedulers.scheduler import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema __all__ = ["SchedulerGroup"] @@ -35,12 +34,7 @@ class SchedulerGroup(OperatorScheduler): def __init__(self, schedulers: List[OperatorScheduler]): self.schedulers = schedulers - def submit( - self, - operator: Operator, - operator_input: OperatorSchema, - context: Context, - ) -> Future: 
+ def submit(self, *args, operator: Operator, **kwargs) -> Future: """ :param operator: operator to run :param operator_input: input schema to the operator @@ -48,10 +42,10 @@ def submit( :return: future referencing the asynchronously run output of the operator """ for scheduler in self.schedulers: - if scheduler.can_process(operator, operator_input): - return scheduler.submit(operator, operator_input, context) + if scheduler.can_process(*args, operator=operator, **kwargs): + return scheduler.submit(*args, operator=operator, **kwargs) - def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + def can_process(self, *args, operator: Operator, **kwargs) -> bool: """ :param operator: operator to check :param operator_input: operator_input to check @@ -59,6 +53,6 @@ def can_process(self, operator: Operator, operator_input: OperatorSchema) -> boo SchedulerGroup always returns True """ return any( - scheduler.can_process(operator, operator_input) + scheduler.can_process(*args, operator=operator, **kwargs) for scheduler in self.schedulers ) diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py index 4f36eeb448..a36d8e92ec 100644 --- a/src/deepsparse/v2/utils/__init__.py +++ b/src/deepsparse/v2/utils/__init__.py @@ -14,5 +14,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .context import * from .types import * diff --git a/src/deepsparse/v2/utils/context.py b/src/deepsparse/v2/utils/context.py deleted file mode 100644 index 81fe26de61..0000000000 --- a/src/deepsparse/v2/utils/context.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Callable, List, NamedTuple - -from deepsparse.v2.utils.types import OperatorSchema - - -__all__ = ["Context"] - - -class StageInfo(NamedTuple): - operator: Callable - input: OperatorSchema - output: OperatorSchema - - -class Context: - """ - Context contains the full history of operators and their inputs and outputs - in a pipeline - """ - - def __init__(self): - self.stages_executed: List[StageInfo] = [] - - def update(self, operator: Callable, input: OperatorSchema, output: OperatorSchema): - self.stages_executed.append( - StageInfo(operator=operator, input=input, output=output) - ) diff --git a/tests/deepsparse/v2/__init__.py b/tests/deepsparse/v2/__init__.py index e69de29bb2..0c44f887a4 100644 --- a/tests/deepsparse/v2/__init__.py +++ b/tests/deepsparse/v2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py index d39bc61c8c..9f85e4976e 100644 --- a/tests/deepsparse/v2/test_basic_pipeline.py +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -1,14 +1,29 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Simple example and test of a dummy pipeline """ +from typing import Dict + from pydantic import BaseModel from deepsparse.v2 import Pipeline from deepsparse.v2.operators import Operator -from deepsparse.v2.routers import Router +from deepsparse.v2.routers import LinearRouter from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema class IntSchema(BaseModel): @@ -19,21 +34,21 @@ class AddOneOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema, context: Context) -> OperatorSchema: - return IntSchema(value=inp.value + 1) + def run(self, inp: IntSchema) -> Dict: + return {"value": inp.value + 1} class AddTwoOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema, context: Context) -> OperatorSchema: - return IntSchema(value=inp.value + 2) + def run(self, inp: IntSchema) -> Dict: + return {"value": inp.value + 2} AddThreePipeline = Pipeline( - stages=[AddOneOperator(), AddTwoOperator()], - router=Router(), + ops=[AddOneOperator(), AddTwoOperator()], + router=LinearRouter(end_route=2), schedulers=[OperatorScheduler()], ) diff --git a/tests/deepsparse/v2/test_image_classification.py b/tests/deepsparse/v2/test_image_classification.py new file mode 100644 index 0000000000..03e2807454 --- /dev/null +++ b/tests/deepsparse/v2/test_image_classification.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
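# A minimal, self-contained sketch of how the refactored v2 pieces exercised by
# test_basic_pipeline.py above fit together; the expected result (5 + 1 + 2 = 8) is
# inferred from the two operators rather than quoted from the test body, and the
# extra **kwargs on run() simply absorbs the state arguments added in later commits.

from typing import Dict

from pydantic import BaseModel

from deepsparse.v2 import Pipeline
from deepsparse.v2.operators import Operator
from deepsparse.v2.routers import LinearRouter
from deepsparse.v2.schedulers import OperatorScheduler


class IntSchema(BaseModel):
    value: int


class AddOneOperator(Operator):
    input_schema = IntSchema
    output_schema = IntSchema

    def run(self, inp: IntSchema, **kwargs) -> Dict:
        return {"value": inp.value + 1}


class AddTwoOperator(Operator):
    input_schema = IntSchema
    output_schema = IntSchema

    def run(self, inp: IntSchema, **kwargs) -> Dict:
        return {"value": inp.value + 2}


pipeline = Pipeline(
    ops=[AddOneOperator(), AddTwoOperator()],
    router=LinearRouter(end_route=2),  # visit ops[0], then ops[1], then stop
    schedulers=[OperatorScheduler()],
)

assert pipeline(IntSchema(value=5)).value == 8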
+ +import numpy + +import pytest +from deepsparse.v2.image_classification import ImageClassificationPipeline +from deepsparse.v2.image_classification.preprocess_operator import ( + ImageClassificationInput, +) +from tests.deepsparse.pipelines.data_helpers import computer_vision + + +@pytest.fixture +def get_images(): + batch_size = 2 + images = computer_vision(batch_size=batch_size) + return images.get("images") + + +def test_image_classification(get_images): + model_path = ( + "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/pruned95-none" + ) + pipeline = ImageClassificationPipeline(model_path=model_path) + output = pipeline(ImageClassificationInput(images=get_images)) + assert output.labels == [[207], [670]] + assert numpy.allclose(output.scores, [[21.85], [17.33]], atol=0.01) From 58b075888e756dd853d2b279d9a98858962ca31d Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Wed, 1 Nov 2023 10:53:10 -0400 Subject: [PATCH 03/57] [v2] EngineOperator updates to make continuous batching easier (#1371) * [v2] EngineOperator updates to make continuous batching easier * test fixes --- .../v2/operators/engine_operator.py | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index aac94a7697..2c61755df9 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy from typing import Dict, List, Optional, Union from pydantic import BaseModel, Field @@ -32,6 +33,13 @@ class EngineOperatorInputs(BaseModel): engine_inputs: List = Field(description="engine_inputs") + engine: Optional[Engine] = Field( + description="override the engine to run forward pass with", + default=None, + ) + + class Config: + arbitrary_types_allowed = True class EngineOperatorOutputs(BaseModel): @@ -76,21 +84,33 @@ def __init__( engine_args["scheduler"] = scheduler engine_args["num_streams"] = num_streams - self.engine = self._create_engine(self.model_path, engine_type, engine_args) + self._engine_args = engine_args + self._engine_type = engine_type + + self.engine = self.create_engine() + + @property + def batch_size(self) -> int: + """ + :return: the batch size this engine operator is compiled at + """ + return self._batch_size - def _create_engine( - self, onnx_file_path: str, engine_type: str, engine_args: Dict + def create_engine( + self, + **kwargs, ) -> Union[Engine, MultiModelEngine, ORTEngine]: """ Create an inference engine for a given ONNX model - :param onnx_file_path: path to ONNX model file - :param engine_type: type of engine to create. 
- :param engine_args: arguments to pass to engine constructor - :param context: context to use for engine + :param kwargs: overrides to engine_args used as kwargs for engine + constructor/compilation :return: inference engine """ - engine_type = engine_type.lower() + onnx_file_path = self.model_path + engine_args = deepcopy(self._engine_args) + engine_args.update(kwargs) + engine_type = self._engine_type.lower() if engine_type == DEEPSPARSE_ENGINE: if self.engine_context is not None and isinstance( @@ -116,6 +136,12 @@ def _create_engine( ) def run(self, inp: EngineOperatorInputs) -> Dict: + if inp.engine: + # run with custom engine, do not split/join since custom engine + # may run at any batch size, returning here as code below has a + # planned refactor + engine_outputs = inp.engine(inp.engine_inputs) + return {"engine_outputs": engine_outputs} inp = inp.engine_inputs batches, orig_batch_size = self.expand_inputs(engine_inputs=inp) batches_outputs = list(map(self.engine, batches)) From e1ff108f76b2765e71fc6ee236892ea26b6c7205 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 2 Nov 2023 20:47:25 -0400 Subject: [PATCH 04/57] [Pipeline Refactor] Update routes, text generation initial functionality (#1348) * initial functionality and working example with image classification * remove testing image * rebase fixes * initial functionality and working example with image classification * text gen * updates func * prompt inference, initial functionality * remove image; update state docstring * Fix typo * add todo for split/join * remove context, clean-up args, remove prefill_preprocess_operaator * fix docstrings --- src/deepsparse/v2/operators/__init__.py | 1 - .../v2/operators/engine_operator.py | 18 +- src/deepsparse/v2/operators/operator.py | 30 ++- src/deepsparse/v2/pipeline.py | 70 ++++-- src/deepsparse/v2/routers/router.py | 57 ++++- src/deepsparse/v2/schedulers/scheduler.py | 23 +- .../v2/schedulers/scheduler_group.py | 35 ++- src/deepsparse/v2/text_generation/__init__.py | 24 ++ .../autoregressive_preprocess_operator.py | 100 ++++++++ .../v2/text_generation/compile_logits.py | 43 ++++ .../v2/text_generation/kv_cache_operator.py | 70 ++++++ .../multi_engine_prefill_operator.py | 135 +++++++++++ .../v2/text_generation/nl_engine_operator.py | 191 ++++++++++++++++ src/deepsparse/v2/text_generation/pipeline.py | 213 ++++++++++++++++++ .../v2/text_generation/prep_for_prefill.py | 57 +++++ .../v2/text_generation/process_inputs.py | 121 ++++++++++ src/deepsparse/v2/utils/__init__.py | 2 +- src/deepsparse/v2/utils/state.py | 64 ++++++ tests/deepsparse/v2/test_basic_pipeline.py | 4 +- 19 files changed, 1203 insertions(+), 55 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/__init__.py create mode 100644 src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py create mode 100644 src/deepsparse/v2/text_generation/compile_logits.py create mode 100644 src/deepsparse/v2/text_generation/kv_cache_operator.py create mode 100644 src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py create mode 100644 src/deepsparse/v2/text_generation/nl_engine_operator.py create mode 100644 src/deepsparse/v2/text_generation/pipeline.py create mode 100644 src/deepsparse/v2/text_generation/prep_for_prefill.py create mode 100644 src/deepsparse/v2/text_generation/process_inputs.py create mode 100644 src/deepsparse/v2/utils/state.py diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py index 8f7e6a169d..9d1a9812ac 100644 --- 
a/src/deepsparse/v2/operators/__init__.py +++ b/src/deepsparse/v2/operators/__init__.py @@ -13,5 +13,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .operator import * diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index 2c61755df9..b7d920a686 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -17,7 +17,8 @@ from pydantic import BaseModel, Field -from deepsparse import Context, Engine, MultiModelEngine, Scheduler +from deepsparse import Context as EngineContext +from deepsparse import Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs from deepsparse.v2.operators import Operator @@ -54,16 +55,15 @@ def __init__( self, model_path: str, engine_type: str = DEEPSPARSE_ENGINE, - batch_size: Optional[int] = 1, num_cores: int = None, num_streams: int = None, scheduler: Scheduler = None, input_shapes: List[List[int]] = None, - engine_context: Optional[Context] = None, + engine_context: Optional[EngineContext] = None, + engine_kwargs: Dict = None, ): - - self._batch_size = batch_size self.model_path = model_to_path(model_path) + self._batch_size = 1 self.engine_context = engine_context if self.engine_context is not None: @@ -87,7 +87,7 @@ def __init__( self._engine_args = engine_args self._engine_type = engine_type - self.engine = self.create_engine() + self.engine = self.create_engine(**engine_kwargs) @property def batch_size(self) -> int: @@ -114,12 +114,12 @@ def create_engine( if engine_type == DEEPSPARSE_ENGINE: if self.engine_context is not None and isinstance( - self.engine_context, Context + self.engine_context, EngineContext ): engine_args.pop("num_cores", None) engine_args.pop("scheduler", None) engine_args.pop("num_streams", None) - engine_args["context"] = self.engien_context + engine_args["context"] = self.engine_context return MultiModelEngine( model=onnx_file_path, **engine_args, @@ -135,7 +135,7 @@ def create_engine( f"{SUPPORTED_PIPELINE_ENGINES}" ) - def run(self, inp: EngineOperatorInputs) -> Dict: + def run(self, inp: EngineOperatorInputs, **kwargs) -> Dict: if inp.engine: # run with custom engine, do not split/join since custom engine # may run at any batch size, returning here as code below has a diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index c3a3e28b78..b3963d8223 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,6 +17,8 @@ from pydantic import BaseModel +from deepsparse.v2.utils import InferenceState, PipelineState + __all__ = ["Operator"] @@ -54,6 +56,8 @@ def has_output_schema(cls) -> bool: def __call__( self, *args, + inference_state: InferenceState, + pipeline_state: PipelineState, **kwargs, ) -> Any: """ @@ -61,7 +65,9 @@ def __call__( :param args: an unnamed arg may only be provided if it is of the type of the input_schema - :param context: pipeline context to pass to operator + :param inference_state: inference_state for the pipeline. + :param pipeline_state: pipeline_state for the pipeline. The values in the state + are created during pipeline creation and are read-only during inference. 
:param kwargs: kwargs when not initializing from an instantiated schema :return: operator output """ @@ -81,10 +87,18 @@ def __call__( "in the form of a dictionary or an instance of the input_schema" "object" ) - - run_output = self.run(inference_input) + run_output = self.run( + inference_input, + inference_state=inference_state, + pipeline_state=pipeline_state, + ) else: - run_output = self.run(*args, **kwargs) + run_output = self.run( + *args, + inference_state=inference_state, + pipeline_state=pipeline_state, + **kwargs, + ) if self.has_output_schema(): return self.output_schema(**run_output) @@ -93,12 +107,16 @@ def __call__( @abstractmethod def run(self, *args, **kwargs) -> Any: """ - :param inp: operator input, as the defined input schema if applicable - :param context: pipeline context of already run operators :return: result of this operator as the defined output schema if applicable """ raise NotImplementedError + def can_operate(self, inp: Any) -> bool: + """ + Whether or not the given operator can run, based on input + """ + return True + def expand_inputs(self, **kwargs): """ Generic function to handle expanding values. diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index e58f8a5191..0a8c8b2f93 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -18,6 +18,7 @@ from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup +from deepsparse.v2.utils import InferenceState, PipelineState __all__ = ["Pipeline"] @@ -27,7 +28,7 @@ class Pipeline(Operator): """ Pipeline accepts a series of operators, schedulers, and a router. Calling a pipeline will use the router to run through all the defined operators. The operators should - be implemented using the Operator class and each implemented Operator should be + be implemented using the Operator class and each implemented operator should be responsible for a functional component of the pipelines. The flow of inputs/outputs between the operators and the steps in the pipeline should be defined by the router, (based off of the Router class), which dicates the next operator in the pipeline. @@ -37,6 +38,7 @@ class Pipeline(Operator): or dictionary of operators. :param router: A Router which dictates the next operator to call. :param schedulers: A list of schedulers to run operators. + :param pipeline_state: pipeline_state created during pipeline initialization """ @@ -45,57 +47,93 @@ def __init__( ops: Union[Dict[str, Operator], List[Operator]], router: Router, schedulers: List[OperatorScheduler], + pipeline_state: PipelineState = None, ): self.ops = ops self.router = router self.schedulers = schedulers + self.pipeline_state = pipeline_state self.validate() # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) - def run(self, *args, **kwargs): + def run( + self, + *args, + inference_state: InferenceState, + pipeline_state: PipelineState, + **kwargs, + ): """ - Run through the operators using the provided router and scheduler. Update the - context to reflect each step of the router. The input to a given operator is the - output of the previous operator. - - :param inp: input to the operator. expected to be of any type that is - expected by the operator. - :param context: context to store the current the inputs, outputs, and operator - for each step of the router. 
+ Run through the operators using the provided router and scheduler. + The input to a given operator is the output of the previous operator. + :param inference_state: inference_state for the pipeline. + :param pipeline_state: pipeline_state for the pipeline. The values in the state + are created during pipeline creation and are read-only during inference. """ next_step = self.router.START_ROUTE operator_output = None + while next_step != self.router.END_ROUTE: # Either a dictionary key or valid index operator = self.ops[next_step] if next_step == self.router.START_ROUTE: output_future = self._scheduler_group.submit( - *args, operator=operator, **kwargs + *args, + inference_state=inference_state, + operator=operator, + pipeline_state=pipeline_state, + **kwargs, ) else: if isinstance(operator_output, dict): output_future = self._scheduler_group.submit( - operator=operator, **operator_output + inference_state=inference_state, + operator=operator, + pipeline_state=pipeline_state, + **operator_output, ) else: output_future = self._scheduler_group.submit( - operator_output, operator=operator + operator_output, + inference_state=inference_state, + pipeline_state=pipeline_state, + operator=operator, ) - # wait for future to resolve operator_output = output_future.result() - next_step = self.router.next(next_step, self.ops) + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + inference_state.update_state(state_update) + + next_step = self.router.next(next_step, self.ops, operator_output) + return operator_output def __call__(self, *args, **kwargs): """ + Consolidate any provided inference_state or pipeline_state objects and pass + any other operator inputs to run(). + :return: output of the pipeline operators ran with the router for the given - input + input """ + if kwargs.get("inference_state"): + inference_state = kwargs.pop("inference_state") + else: + inference_state = InferenceState() + inference_state.create_state({}) + + if "pipeline_state" in kwargs: + self.pipeline_state = kwargs.get("pipeline_state") + + kwargs["inference_state"] = inference_state + kwargs["pipeline_state"] = self.pipeline_state + return self.run(*args, **kwargs) def validate(self): diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 6050803b5e..d1110d4ca7 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -15,14 +15,14 @@ import logging from abc import abstractmethod -from typing import Dict, List, Union +from typing import Any, Dict, List, Optional, Union from deepsparse.v2.operators import Operator _LOGGER = logging.getLogger(__name__) -__all__ = ["Router", "LinearRouter"] +__all__ = ["Router", "LinearRouter", "GraphRouter"] class Router: @@ -32,23 +32,34 @@ class Router: :param start_route: the start index or key of the router :param end_route: the end index or key of the router + :param route: the route that the router has to traverse through """ - def __init__(self, end_route: Union[str, int], start_route: Union[str, int]): + def __init__( + self, + end_route: Union[str, int], + start_route: Union[str, int], + route: Optional[Dict] = None, + ): self.START_ROUTE = start_route self.END_ROUTE = end_route + self.route = route @abstractmethod def next( - self, past: Union[str, int], ops: Union[List[Operator], Dict[str, Operator]] + self, + past: Union[str, int], + ops: Optional[Union[List[Operator], Dict[str, Operator]]], + inp: Optional[Any], ) -> Union[str, int]: """ 
Determines the index or dictionary key for the next operator which should run. :param past: the previous index or key. This should uniquely determine the next - operator to run + operator to run :param ops: list or dictionary of operators + :param inp: operator input :returns: the next index or dictionary key for the next operator to run """ raise NotImplementedError @@ -69,7 +80,9 @@ class LinearRouter(Router): def __init__(self, end_route: int, start_route: int = 0): super().__init__(end_route=end_route, start_route=start_route) - def next(self, past: int, ops: List[Operator]) -> int: + def next( + self, past: int, ops: Optional[List[Operator]] = None, inp: Optional[Any] = None + ) -> int: new_index = past + 1 if new_index < self.END_ROUTE: return new_index @@ -105,3 +118,35 @@ def validate(operators: List[Operator]) -> bool: ) return False return True + + +class GraphRouter(Router): + """ + Router for a DAG. Expects graphs be presented in the form of a dictionary, where + keys are the nodes of the graph and the values are the connected nodes. For + nodes with multiple ouput edges, all the nodes will be visited and the first node + where `can_operate` returns True will run. Paths should be deterministic. + """ + + def __init__(self, end_route: str, start_route: str, route: Dict): + super().__init__(end_route=end_route, start_route=start_route, route=route) + + def next( + self, + past: str, + ops: Dict[str, Operator], + inp: Any, + ) -> int: + node = past + if isinstance(self.route[node], str): + return self.route[node] + else: + for neighbour_node in self.route[node]: + neighbour_node_op = ops[neighbour_node] + if neighbour_node_op.can_operate(inp): + return neighbour_node + raise ValueError("Cannot operate on any of the nodes") + + @staticmethod + def validate(ops) -> bool: + pass diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 7d4f249444..78a58e3389 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -36,19 +36,30 @@ class OperatorScheduler: def __init__(self, max_workers: int = 1): self._threadpool = ThreadPoolExecutor(max_workers=max_workers) - def submit(self, *args, operator: Operator, **kwargs) -> Future: + def submit( + self, + *args, + operator: Operator, + **kwargs, + ) -> Future: """ :param operator: operator to run - :param operator_input: input schema to the operator - :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ - return self._threadpool.submit(operator, *args, **kwargs) + return self._threadpool.submit( + operator, + *args, + **kwargs, + ) - def can_process(self, *args, operator: Operator, **kwargs) -> bool: + def can_process( + self, + *args, + operator: Operator, + **kwargs, + ) -> bool: """ :param operator: operator to check - :param operator_input: operator_input to check :return: True if this Operator can process the given operator and input. 
Base OperatorScheduler always returns True """ diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 7f00a3c17c..40b5695f22 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -34,25 +34,44 @@ class SchedulerGroup(OperatorScheduler): def __init__(self, schedulers: List[OperatorScheduler]): self.schedulers = schedulers - def submit(self, *args, operator: Operator, **kwargs) -> Future: + def submit( + self, + *args, + operator: Operator, + **kwargs, + ) -> Future: """ :param operator: operator to run - :param operator_input: input schema to the operator - :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ for scheduler in self.schedulers: - if scheduler.can_process(*args, operator=operator, **kwargs): - return scheduler.submit(*args, operator=operator, **kwargs) + if scheduler.can_process( + *args, + operator=operator, + **kwargs, + ): + return scheduler.submit( + *args, + operator=operator, + **kwargs, + ) - def can_process(self, *args, operator: Operator, **kwargs) -> bool: + def can_process( + self, + *args, + operator: Operator, + **kwargs, + ) -> bool: """ :param operator: operator to check - :param operator_input: operator_input to check :return: True if this Operator can process the given operator and input. SchedulerGroup always returns True """ return any( - scheduler.can_process(*args, operator=operator, **kwargs) + scheduler.can_process( + *args, + operator=operator, + **kwargs, + ) for scheduler in self.schedulers ) diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py new file mode 100644 index 0000000000..37ac88d02f --- /dev/null +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# flake8: noqa +from .autoregressive_preprocess_operator import * +from .compile_logits import * +from .kv_cache_operator import * +from .multi_engine_prefill_operator import * +from .nl_engine_operator import * +from .prep_for_prefill import * +from .process_inputs import * + + +from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py new file mode 100644 index 0000000000..cfe7cb531b --- /dev/null +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Any + +import numpy + +from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import PipelineState + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["AutoRegressiveOperatorPreprocess"] + + +class AutoRegressiveOperatorPreprocess(Operator): + def __init__(self, sequence_length: int, prompt_sequence_length: int): + """ + Prepare the tokens for the single-token engine. This requires creating the + attention mask, positions, and causal mask. The output contains these three + arrays to be passed into the single-token engine. + """ + self.sequence_length = sequence_length + self.prompt_sequence_length = prompt_sequence_length + self.set_capacity = False + + _LOGGER.warn( + "This operator requires the PipelineState to be set-up with the " + "onnx_input_names_no_cache attribute set from the NLEngineOperator." + ) + + def can_operate(self, inp: Any) -> bool: + """ + Can run this Operator if the number of tokens left to process is greater than + 0 but less than the self.prompt_sequence_length. + """ + tokens = inp.get("tokens") + kv_cache = inp.get("kv_cache") + + remaining_tokens = len(tokens) - kv_cache.total_num_processed_tokens + if remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length: + return True + return False + + def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): + + if not self.set_capacity: + self.set_capacity = True + kv_cache.set_capacity(self.sequence_length - 1) + + num_total_processed_tokens = kv_cache.total_num_processed_tokens + new_token = tokens[num_total_processed_tokens] + engine_input_names = pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ) + + # padding is added to left, so attention mask is 1s from the + # right up to the number of total tokens (prompt + generated) + attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + num_attention_entries_to_unmask = min( + num_total_processed_tokens + 1, self.sequence_length + ) # cap by seq len + attention_mask[:, -num_attention_entries_to_unmask:] = 1 + positions = numpy.array([[num_total_processed_tokens]], dtype=numpy.int64) + input_ids = numpy.array([[new_token]]) + causal_mask = create_causal_mask(input_ids, attention_mask) + + engine_inputs_map = dict( + input_ids=input_ids, + attention_mask=attention_mask, + causal_mask=causal_mask, + positions=positions, + ) + + engine_inputs = [engine_inputs_map[name] for name in engine_input_names] + + onnx_input_names_no_cache = pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ) + engine_inputs = [engine_inputs_map[name] for name in onnx_input_names_no_cache] + + return { + "engine_inputs": engine_inputs, + "kv_cache": kv_cache, + "tokens": tokens, + } diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py new file mode 100644 index 0000000000..55c87d791d --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_logits.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 - present / 
Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompilePromptLogits"] + + +class CompilePromptLogits(Operator): + """ + Combine the prompt logits. Currently relying on the inference state to store the + prompt logits for each token or multi-token batch processed. This operator will + take prompt logits from each iteration run and update the inference state. + """ + + def run(self, logits, inference_state: InferenceState, **kwargs): + logit_type = "prompt_logits" + + if inference_state.current_state.get(logit_type) is not None: + current_logits = inference_state.current_state.get(logit_type).copy() + current_logits.append(logits) + else: + current_logits = [logits] + + state_update = {logit_type: current_logits} + return { + "kv_cache": kwargs.get("kv_cache"), + "tokens": kwargs.get("tokens"), + }, state_update diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py new file mode 100644 index 0000000000..0b232402b3 --- /dev/null +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
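# A small illustration, with assumptions, of how CompilePromptLogits above accumulates
# prompt logits across prefill iterations. It assumes the InferenceState helper
# introduced in this commit exposes create_state / update_state / current_state with
# dict-merge semantics, matching how Pipeline.run and this operator use it; the logits
# shapes below are illustrative only.

import numpy

from deepsparse.v2.text_generation import CompilePromptLogits
from deepsparse.v2.utils import InferenceState

inference_state = InferenceState()
inference_state.create_state({})

compile_logits = CompilePromptLogits()

# two prefill chunks worth of logits
for chunk_logits in [numpy.zeros((1, 4, 50)), numpy.zeros((1, 1, 50))]:
    out, state_update = compile_logits.run(
        logits=chunk_logits,
        inference_state=inference_state,
        kv_cache=None,
        tokens=[],
    )
    # Pipeline.run applies this whenever an operator returns an (output, update) tuple
    inference_state.update_state(state_update)

# both chunks should now be stored for the final concatenation step
assert len(inference_state.current_state.get("prompt_logits")) == 2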
+ +from typing import Any + +from pydantic import BaseModel, Field + +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import ( + initialize_kv_cache_state, + prepends_bos_token, +) +from deepsparse.v2.operators import Operator + + +__all__ = ["KVCacheCreator"] + + +class KVCacheCreatorOutput(BaseModel): + kv_cache: Any = Field(description="KV Cache Created") # DecoderKVCache + + +class KVCacheCreatorInput(BaseModel): + cache_shape: Any = Field(description="shape") + kv_cache_data_type: Any = Field(description="data type") + output_names: Any = Field(description="output names") + + +class KVCacheCreator(Operator): + input_schema = KVCacheCreatorInput + output_schema = KVCacheCreatorOutput + + def __init__( + self, + tokenizer, + sequence_length: int, + prompt_sequence_length: int, + internal_kv_cache: bool, + ): + self.tokenizer = tokenizer + self.prompt_sequence_length = prompt_sequence_length + self.internal_kv_cache = internal_kv_cache + self.sequence_length = sequence_length + + def run(self, cache_shape, kv_cache_data_type: str, output_names: list, **kwargs): + kv_cache_state = initialize_kv_cache_state( + cache_shape=cache_shape, + kv_cache_data_type=kv_cache_data_type, + output_names=output_names, + length=self.sequence_length - self.prompt_sequence_length, + empty=bool(self.internal_kv_cache), + ) + + kv_cache = DecoderKVCache(self.internal_kv_cache) + kv_cache.setup( + state=kv_cache_state, + freeze_first_position=prepends_bos_token(self.tokenizer), + ) + return {"kv_cache": kv_cache} diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py new file mode 100644 index 0000000000..41ee830a8a --- /dev/null +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from enum import Enum +from typing import Any + +import numpy + +from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import PipelineState + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["MultiEnginePrefill"] + + +class OnnxInputNames(Enum): + INPUT_IDS = "input_ids" + ATTN_MASK = "attention_mask" + CAUSAL_MASK = "causal_mask" + POSITIONS = "positions" + + +# NOTE: A possible clean-up could involve combining this Operator and the +# autoregressive_preprocess_operator + + +class MultiEnginePrefill(Operator): + def __init__(self, prompt_sequence_length, sequence_length): + """ + Prepare the tokens for the multi-token engine. This requires creating the + attention mask, positions, and causal mask. The output contains these three + arrays to be passed into the multi-token engine. 
+ """ + self.prompt_sequence_length = prompt_sequence_length + self.sequence_length = sequence_length + self.cases = { + OnnxInputNames.ATTN_MASK.value: self._case_attn_mask, + OnnxInputNames.POSITIONS.value: self._case_positions, + } + _LOGGER.warn( + "This operator requires the PipelineState to be set-up with the " + "onnx_input_names_no_cache attribute set from the NLEngineOperator." + ) + + def can_operate(self, inp: Any): + """ + Can only run if the number of prompt tokens left to process is greater than + or equal to the self.prompt_sequence_length. + """ + kv_cache = inp.get("kv_cache") + tokens = inp.get("tokens") + + if len(tokens) < self.prompt_sequence_length: + return False + + if ( + len(tokens) - kv_cache.total_num_processed_tokens + >= self.prompt_sequence_length + ): + return True + return False + + def _case_attn_mask(self, num_total_processed_tokens: int): + # create an empty attention mask + engine_input = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + # calculate the number of entries in attention mask that should be set to 1 + num_attention_entries_to_unmask = min( + num_total_processed_tokens + self.prompt_sequence_length, + self.sequence_length, + ) + engine_input[:, -num_attention_entries_to_unmask:] = 1 + return engine_input + + def _case_positions(self, num_total_processed_tokens: int): + return ( + numpy.arange( + num_total_processed_tokens, + num_total_processed_tokens + self.prompt_sequence_length, + ) + .reshape(1, -1) + .astype(numpy.int64) + ) + + def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): + + onnx_input_names_no_cache = pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ) + + num_total_processed_tokens = kv_cache.total_num_processed_tokens + start = num_total_processed_tokens + end = start + self.prompt_sequence_length + token_batch = tokens[start:end] + + engine_inputs = [] + for name in onnx_input_names_no_cache: + if name == OnnxInputNames.INPUT_IDS.value: + engine_input = numpy.array([token_batch]) + elif ( + name == OnnxInputNames.ATTN_MASK.value + or name == OnnxInputNames.POSITIONS.value + ): + engine_input = self.cases[name](num_total_processed_tokens) + elif name == OnnxInputNames.CAUSAL_MASK.value: + continue + + engine_inputs.append(engine_input) + + if OnnxInputNames.CAUSAL_MASK.value in onnx_input_names_no_cache: + causal_mask = create_causal_mask( + input_ids=engine_inputs[0], + attention_mask=engine_inputs[1], + ) + engine_inputs.append(causal_mask) + + return { + "engine_inputs": engine_inputs, + "kv_cache": kv_cache, + "tokens": tokens, + } diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py new file mode 100644 index 0000000000..6c1ad1966e --- /dev/null +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
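# A worked, standalone example (illustrative sizes only) of the attention-mask and
# positions construction performed by MultiEnginePrefill above, for the first prompt
# chunk of a sequence_length=8 model with prompt_sequence_length=4.

import numpy

sequence_length = 8
prompt_sequence_length = 4
num_total_processed_tokens = 0  # nothing in the kv cache yet

# padding sits on the left, so only the rightmost (processed + new chunk) slots are unmasked
attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
num_attention_entries_to_unmask = min(
    num_total_processed_tokens + prompt_sequence_length, sequence_length
)
attention_mask[:, -num_attention_entries_to_unmask:] = 1
# attention_mask -> [[0, 0, 0, 0, 1, 1, 1, 1]]

# positions are the absolute indices of the new chunk within the running sequence
positions = (
    numpy.arange(
        num_total_processed_tokens,
        num_total_processed_tokens + prompt_sequence_length,
    )
    .reshape(1, -1)
    .astype(numpy.int64)
)
# positions -> [[0, 1, 2, 3]]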
+ +import copy +import os +from typing import Any, List, Tuple + +from pydantic import BaseModel, Field + +from deepsparse.utils.onnx import ( + CACHE_INPUT_PREFIX, + overwrite_onnx_model_inputs_for_kv_cache_models, +) +from deepsparse.v2.operators.engine_operator import ( + DEEPSPARSE_ENGINE, + EngineOperator, + EngineOperatorInputs, +) + + +__all__ = ["NLEngineOperator"] + + +class NlEngineInput(BaseModel): + engine_inputs: List = Field(description="engine inputs") + kv_cache: Any = Field(description="kv_cache object") + tokens: List = Field(description="tokens") + + +class NLEngineOperator(EngineOperator): + + """ + Operator for the NL Decoder Engine. This Operator inherits from the EngineOperator. + Specific updates to engine attributes are made through this operator, as well + as updating the kv_cache. This Operator is used for both the single-token and + multi-token case. + """ + + input_schema = NlEngineInput + output_schema = None + + def __init__( + self, + sequence_length: int, + input_ids_length: int, + internal_kv_cache: bool = False, + **kwargs, + ): + + self.kv_cache_data_type = None + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=kwargs.get("model_path"), + batch_size=kwargs.get("batch_size", 1), + sequence_length=sequence_length, + input_ids_length=input_ids_length, + ) + + engine_kwargs = kwargs.get("engine_kwargs", {}) + if kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE: + if "WAND_OPT_FLAGS" not in os.environ: + os.environ["WAND_OPT_FLAGS"] = "default,~pyramids" + + if any(output_indices_to_be_cached): + self.kv_cache_data_type = kv_cache_data_type + if ( + internal_kv_cache + and kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE + ): + engine_kwargs["cached_outputs"] = output_indices_to_be_cached + + kwargs["engine_kwargs"] = engine_kwargs + kwargs["model_path"] = onnx_file_path + super().__init__(**kwargs) + + self.input_ids_length = input_ids_length + + def run(self, inp: NlEngineInput, **kwargs) -> Any: + engine_input = inp.engine_inputs + kv_cache = inp.kv_cache + + inputs = self._add_kv_cache_to_input(engine_input, kv_cache) + if bool(kv_cache.engine_internal_cache): + # conventionally, before dispatching + # inputs to the engine, we validate them + # if val_inp=True. However, in this case + # we want to pass the empty kv cache inputs + # (batch_size=0) to the engine. 
Therefore, + # we skip the validation + out = self.engine._eng_net.execute_list_out( + inputs, kv_cache.engine_internal_cache + ) + else: + # run the engine without the LIB.kv_cache object + out = ( + super() + .run(EngineOperatorInputs(engine_inputs=inputs), **kwargs) + .get("engine_outputs") + ) + + logits, *kv_cache_state = out + self._update_kv_cache( + kv_cache_state=kv_cache_state, + input_ids_len=self.input_ids_length, + kv_cache=kv_cache, + ) + + output = {"logits": logits, "kv_cache": kv_cache, "tokens": inp.tokens} + return output + + def _add_kv_cache_to_input(self, engine_input, kv_cache): + kv_cache_state = copy.copy(kv_cache.cached_inputs) + + for idx, input_name in enumerate(self.onnx_input_names_no_cache): + kv_cache_state[input_name] = engine_input[idx] + + new_inp = [kv_cache_state[name] for name in self.engine.input_names] + return new_inp + + def _update_kv_cache(self, kv_cache_state, input_ids_len, kv_cache): + if bool(kv_cache.engine_internal_cache): + kv_cache.total_num_processed_tokens += input_ids_len + return + + kv_cache_state = { + name: array + for name, array in zip(self.onnx_input_names_cached, kv_cache_state) + } + + kv_cache.update( + state=kv_cache_state, + input_ids_len=input_ids_len, + ) + + @property + def onnx_input_names_no_cache(self) -> List[str]: + """ + :return: The input names for the onnx model, excluding + the potential kv cache inputs + """ + return [ + name + for name in self.engine.input_names + if not name.startswith(CACHE_INPUT_PREFIX) + ] + + @property + def onnx_input_names_cached(self) -> List[str]: + """ + :return: The cached input names for the onnx model + """ + return [ + name + for name in self.engine.input_names + if name.startswith(CACHE_INPUT_PREFIX) + ] + + @property + def cache_shape(self) -> Tuple[int, int, int, int]: + """ + :return: The shape of the kv cache inputs + for the onnx model. The shape is + (batch_size, num_heads, sequence_length, hidden_size) + """ + cache_engine_input_index = next( + i + for i, name in enumerate(self.engine.input_names) + if CACHE_INPUT_PREFIX in name + ) + return self.engine.input_shapes[cache_engine_input_index] + + @property + def output_names(self) -> List[str]: + """ + :return: The output names for the onnx model + """ + return self.engine.output_names diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py new file mode 100644 index 0000000000..9878aa0061 --- /dev/null +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -0,0 +1,213 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
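# A simplified sketch of the input assembly done by _add_kv_cache_to_input above:
# per-step inputs produced by the preprocess operators are combined with the cached
# key/value arrays and reordered to the engine's input order. The "past_key_values"
# prefix, the input names, and the array shapes below are hypothetical placeholders;
# the real prefix comes from deepsparse.utils.onnx.CACHE_INPUT_PREFIX and the real
# ordering from engine.input_names.

import numpy

cache_prefix = "past_key_values"
engine_input_names = [
    "input_ids",
    "attention_mask",
    "causal_mask",
    "positions",
    f"{cache_prefix}.0.key",
    f"{cache_prefix}.0.value",
]
onnx_input_names_no_cache = [
    name for name in engine_input_names if not name.startswith(cache_prefix)
]

# what the autoregressive/multi-token preprocess operators hand over for one step
engine_input = [numpy.zeros((1, 1), dtype=numpy.int64) for _ in range(4)]
# what DecoderKVCache.cached_inputs would hold (illustrative head/hidden sizes)
cached_inputs = {
    f"{cache_prefix}.0.key": numpy.zeros((1, 2, 7, 8), dtype=numpy.float32),
    f"{cache_prefix}.0.value": numpy.zeros((1, 2, 7, 8), dtype=numpy.float32),
}

# merge the per-step inputs into a copy of the cache state, keyed by input name
inputs_by_name = dict(cached_inputs)
for idx, name in enumerate(onnx_input_names_no_cache):
    inputs_by_name[name] = engine_input[idx]

# final ordering must match the engine's declared input names exactly
ordered_engine_inputs = [inputs_by_name[name] for name in engine_input_names]
assert len(ordered_engine_inputs) == len(engine_input_names)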
+ +from typing import Dict + +from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.v2.operators import Operator +from deepsparse.v2.pipeline import Pipeline +from deepsparse.v2.routers import GraphRouter +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.text_generation import ( + AutoRegressiveOperatorPreprocess, + CompilePromptLogits, + KVCacheCreator, + MultiEnginePrefill, + NLEngineOperator, + PrepareforPrefill, + ProcessInputsTextGeneration, +) +from deepsparse.v2.utils import PipelineState + + +class TextGenerationPipeline(Pipeline): + def __init__( + self, + model_path: str, + prompt_sequence_length: int = 16, + sequence_length: int = 1024, + internal_kv_cache: bool = True, + force_max_tokens: bool = False, + generation_config=None, + engine_kwargs: Dict = None, + ): + + pipeline_state = PipelineState() + pipeline_state_vals = {} + + # TODO: The code below will be replaced with a transformers set-up Operator. + self.tokenizer = None + model_path = self.setup_onnx_file_path(model_path, sequence_length) + self.tokenizer.padding_side = "left" + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token + + if not engine_kwargs: + engine_kwargs = {} + engine_kwargs["model_path"] = model_path + + if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": + internal_kv_cache = False + + single_engine_operator = NLEngineOperator( + sequence_length=sequence_length, + internal_kv_cache=internal_kv_cache, + input_ids_length=1, + **engine_kwargs, + ) + + multi_engine_operator = NLEngineOperator( + sequence_length=sequence_length, + internal_kv_cache=internal_kv_cache, + input_ids_length=prompt_sequence_length, + **engine_kwargs, + ) + + # NOTE: Currently using pipeline state. Can swap to simply pass in the + # attributes to the specific Operator that neeed them, as class attributes. + pipeline_state_vals[ + "onnx_input_names_no_cache" + ] = single_engine_operator.onnx_input_names_no_cache + pipeline_state_vals["cache_shape"] = single_engine_operator.cache_shape + pipeline_state_vals["output_names"] = single_engine_operator.output_names + pipeline_state_vals[ + "kv_cache_data_type" + ] = single_engine_operator.kv_cache_data_type + pipeline_state.create_state(pipeline_state_vals) + + process_inputs = ProcessInputsTextGeneration( + generation_config=process_generation_config(generation_config), + sequence_length=sequence_length, + tokenizer=self.tokenizer, + ) + + kv_cache_creator = KVCacheCreator( + sequence_length=sequence_length, + tokenizer=self.tokenizer, + prompt_sequence_length=prompt_sequence_length, + internal_kv_cache=internal_kv_cache, + ) + + # NOTE: Can also have the KVCacheCreator be initialized inside this Operator. + # Relies on pipeline state variables set-up above (can be swapped to be class + # attributes instead of using the state. 
+ engine_inputs_for_prefill = PrepareforPrefill(kv_cache_creator=kv_cache_creator) + + multi_engine_prefill = MultiEnginePrefill( + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + ) + compile_prompt_logits = CompilePromptLogits() + """ + prep_for_single_engine = PrepareforSingleEngine( + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + ) + """ + autoregressive_preprocess = AutoRegressiveOperatorPreprocess( + sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, + ) + final_step = FinalStep() + + ops = { + "process_input": process_inputs, + "single_engine": single_engine_operator, + "multi_engine": multi_engine_operator, + "kv_cache_creator": kv_cache_creator, + "prepare_prefill": engine_inputs_for_prefill, + "multi_engine_prefill": multi_engine_prefill, + "compile_logits": compile_prompt_logits, + "autoregressive_preprocess": autoregressive_preprocess, + "final_step": final_step, + } + + routes = { + "process_input": "prepare_prefill", + "prepare_prefill": ["multi_engine_prefill", "autoregressive_preprocess"], + "multi_engine_prefill": "multi_engine", + "multi_engine": "compile_logits", + "compile_logits": [ + "multi_engine_prefill", + "autoregressive_preprocess", + "final_step", + ], + "autoregressive_preprocess": "single_engine", + "single_engine": "compile_logits", + "final_step": "STOP", + } + + router = GraphRouter( + end_route="STOP", start_route="process_input", route=routes + ) + scheduler = [OperatorScheduler()] + super().__init__( + ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state + ) + + # TODO: Move to be part of a generic transformers set-up Operator. + def setup_onnx_file_path(self, model_path, sequence_length) -> str: + import logging + + import transformers + from transformers import AutoTokenizer + + from deepsparse.transformers.helpers import get_deployment_path + + """ + Parses ONNX model from the `model_path` provided. It additionally + creates config and tokenizer objects from the `deployment path`, + derived from the `model_path` provided. + + :return: file path to the processed ONNX file for the engine to compile + """ + deployment_path, onnx_path = get_deployment_path(model_path) + + hf_logger = logging.getLogger("transformers") + hf_logger_level = hf_logger.level + hf_logger.setLevel(logging.ERROR) + self.config = transformers.PretrainedConfig.from_pretrained( + deployment_path, + finetuning_task=self.task if hasattr(self, "task") else None, + ) + hf_logger.setLevel(hf_logger_level) + + self._trust_remote_code = False + self.tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=self._trust_remote_code, + model_max_length=sequence_length, + ) + + if not self.config or not self.tokenizer: + raise RuntimeError( + "Invalid config or tokenizer provided. Please provide " + "paths to the files or ensure they exist in the `model_path` provided. " + "See `tokenizer` and `config` arguments for details." + ) + return onnx_path + + +# NOTE: This is a dummy last step which will be removed. Used as a final step +# for the current routes. 
+class FinalStep(Operator): + def can_operate(self, *args, **kwargs): + return True + + def run(self, *args, **kwargs): + import numpy + + inference_state = kwargs.get("inference_state") + prompt_logits = inference_state.current_state.get("prompt_logits") + return numpy.concatenate(prompt_logits, axis=1) diff --git a/src/deepsparse/v2/text_generation/prep_for_prefill.py b/src/deepsparse/v2/text_generation/prep_for_prefill.py new file mode 100644 index 0000000000..2f9eb15797 --- /dev/null +++ b/src/deepsparse/v2/text_generation/prep_for_prefill.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Any + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import PipelineState + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["PrepareforPrefill"] + + +class PrepareforPrefill(Operator): + def __init__(self, kv_cache_creator: Operator): + """ + Operator before prefill. Responsible for creating the kv_cache based on engine + variables. Currently, this operator expects that the kv_cache_creator is + provided during initization and then uses pipeline_state to run the + kv_cache_operator. + """ + # NOTE: Alternatively, we can initialize the kv_cache_creater operator here, + # instead of at the pipeline level. + self.kv_cache_creator = kv_cache_creator + + _LOGGER.warn( + "This operator requires the PipelineState to be set-up with the " + "cache_shape, output_names, kv_cache_data_type attributes to be set " + "from the NLEngineOperator" + ) + + def run(self, tokens: Any, pipeline_state: PipelineState, **kwargs): + # NOTE: Can potentially just be class attributes instead of relying on + # pipeline state. + cache_shape = pipeline_state.current_state.get("cache_shape") + data_type = pipeline_state.current_state.get("kv_cache_data_type") + output_names = pipeline_state.current_state.get("output_names") + + kv_cache = self.kv_cache_creator.run( + cache_shape=cache_shape, + kv_cache_data_type=data_type, + output_names=output_names, + ).get("kv_cache") + return {"tokens": tokens, "kv_cache": kv_cache} diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py new file mode 100644 index 0000000000..528dcee0b7 --- /dev/null +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -0,0 +1,121 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+from typing import Dict, Union
+
+import transformers
+
+from deepsparse.transformers.pipelines.text_generation import TextGenerationInput
+from deepsparse.transformers.utils.helpers import (
+    check_and_return_generation_config,
+    override_config,
+    repeat_inputs,
+)
+from deepsparse.v2.operators import Operator
+
+
+class GenerationDefaults:
+    num_return_sequences = 1
+    max_length = 1024
+    max_new_tokens = None
+    output_scores = False
+    top_k = 0
+    top_p = 0.0
+    repetition_penalty = 0.0
+    do_sample = False
+    temperature = 1.0
+
+
+__all__ = ["ProcessInputsTextGeneration"]
+
+
+class ProcessInputsTextGeneration(Operator):
+    """
+    Input processing operator. Responsible for tokenizing the input, handling the
+    generation_config (if provided), updating the inference_state for later use,
+    and returning the tokens for prompt inference. The expected input is defined by
+    the input_schema, which for this operator is TextGenerationInput.
+    """
+
+    input_schema = TextGenerationInput
+
+    def __init__(
+        self,
+        tokenizer: transformers.PreTrainedTokenizerBase,
+        generation_config: Union[
+            str, pathlib.Path, Dict, transformers.GenerationConfig
+        ],
+        sequence_length: int,
+    ):
+        self.generation_config = generation_config
+        self.tokenizer = tokenizer
+        self.sequence_length = sequence_length
+
+    def run(self, inp: TextGenerationInput, **kwargs):
+        generation_config = check_and_return_generation_config(
+            self.generation_config, inp.generation_config, GenerationDefaults()
+        )
+
+        generation_config = override_config(inp.generation_kwargs, generation_config)
+
+        original_inputs = inp.sequences
+        if generation_config.num_return_sequences > 1:
+            if isinstance(inp.sequences, str):
+                inp.sequences = [inp.sequences]
+            inp.sequences = repeat_inputs(
+                inp.sequences, generation_config.num_return_sequences
+            )
+
+        if inp.fixed_sequences_length:
+            # to enforce a fixed sequence length, we need to
+            # truncate the input to the maximum sequence length
+            # and/or pad it to the maximum sequence length
+            truncate, padding = True, "max_length"
+        else:
+            # otherwise, we do not need to truncate the input
+            # and we can pad it to the longest sequence
+            # in the batch (so that the engine can process multiple inputs
+            # at once)
+            truncate, padding = False, "longest"
+
+        input_tokens = self.tokenizer(
+            inp.sequences,
+            return_tensors="np",
+            max_length=self.sequence_length,
+            padding=padding,
+            truncation=truncate,
+        )
+
+        input_ids = input_tokens["input_ids"]
+        attention_mask = input_tokens["attention_mask"]
+
+        inference_state_update = dict(
+            prompts=original_inputs,
+            streaming=inp.streaming,
+            generation_config=generation_config,
+            include_prompt_logits=inp.include_prompt_logits,
+            callback=inp.callback,
+            stop=inp.stop,
+            top_p=generation_config.top_p,
+            top_k=generation_config.top_k,
+            presence_penalty=inp.presence_penalty,
+            frequency_penalty=generation_config.repetition_penalty,
+        )
+
+        # TODO: move this step to prep_for_prefill and add attention mask to the output
+        # this will allow us to split/join more easily when processing multiple prompts
+        # in parallel
+        tokens = input_ids[attention_mask.nonzero()].tolist()
+        return {"tokens": tokens}, inference_state_update
diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py
index a36d8e92ec..358405d7af 100644
--- a/src/deepsparse/v2/utils/__init__.py
+++ b/src/deepsparse/v2/utils/__init__.py
@@ -13,5 +13,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from .state import *
 from .types import *
diff --git a/src/deepsparse/v2/utils/state.py b/src/deepsparse/v2/utils/state.py
new file mode 100644
index 0000000000..b54b890acf
--- /dev/null
+++ b/src/deepsparse/v2/utils/state.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from abc import ABC
+from typing import Any, Union
+
+
+__all__ = ["State", "PipelineState", "InferenceState"]
+
+
+class State(ABC):
+    """
+    Abstract class to store pipeline-level and inference-level state variables which
+    are generated by some Operator, and required by some other Operator.
+    """
+
+    def __init__(self):
+        self._current_state = None
+
+    @property
+    def current_state(self):
+        return self._current_state
+
+
+class PipelineState(State):
+    """
+    Created during pipeline initialization. Pipeline state values are read-only
+    during inference.
+    """
+
+    def create_state(self, new_state: dict):
+        if self._current_state:
+            raise ValueError("State creation is only allowed during initialization.")
+        self._current_state = new_state
+
+
+class InferenceState(State):
+    """
+    Inference state, created during every inference run.
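+
+    Illustrative usage (the keys below are examples of values operators may
+    store, not a fixed schema):
+
+        inference_state = InferenceState()
+        inference_state.create_state({"prompt_logits": []})
+        inference_state.update_state({"generated_tokens": [42]})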
+ """ + + def create_state(self, new_state: dict): + if self._current_state: + warnings.warn("Current state already exists, overriding.") + self._current_state = new_state + + def update_value(self, attribute: str, value: Union[str, int, list]): + if not self._current_state.get(attribute): + raise ValueError(f"{attribute} is not a valid state attribute") + self._current_state[attribute] = value + + def update_state(self, value: Any): + self._current_state.update(value) diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py index 9f85e4976e..bedddd537a 100644 --- a/tests/deepsparse/v2/test_basic_pipeline.py +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -34,7 +34,7 @@ class AddOneOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema) -> Dict: + def run(self, inp: IntSchema, **kwargs) -> Dict: return {"value": inp.value + 1} @@ -42,7 +42,7 @@ class AddTwoOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema) -> Dict: + def run(self, inp: IntSchema, **kwargs) -> Dict: return {"value": inp.value + 2} From 59457b7ca7967c54aad1d33d7db1d6ef83924a87 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 3 Nov 2023 11:15:00 -0400 Subject: [PATCH 05/57] [Pipeline Refactor] Additional Operators, Route update and completed generation functionality (#1356) * initial functionality and working example with image classification * remove testing image * rebase fixes * initial functionality and working example with image classification * text gen * updates func * prompt inference, initial functionality * remove image; update state docstring * Fix typo * add todo for split/join * remove context, clean-up args, remove prefill_preprocess_operaator * fix docstrings * initial functionality and working example with image classification * updates func * prompt inference, initial functionality * finish generation operators and update routes * further breakdown operators * add operators * fix can_operate condition * update can_operate to not rely on the inference_state * rebase + update * fix condition * fix capacity settting again * typo fixes --- .../v2/operators/engine_operator.py | 3 + src/deepsparse/v2/text_generation/__init__.py | 7 + .../autoregressive_preprocess_operator.py | 20 ++- .../compile_generated_tokens.py | 56 +++++++ .../v2/text_generation/compile_generations.py | 55 +++++++ .../v2/text_generation/compile_logits.py | 6 + .../v2/text_generation/generate_new_token.py | 90 +++++++++++ .../multi_engine_prefill_operator.py | 1 + .../v2/text_generation/nl_engine_operator.py | 8 +- src/deepsparse/v2/text_generation/pipeline.py | 61 ++++---- .../v2/text_generation/prep_for_generation.py | 140 ++++++++++++++++++ .../v2/text_generation/process_inputs.py | 2 +- .../v2/text_generation/process_outputs.py | 88 +++++++++++ .../v2/text_generation/token_generator.py | 30 ++++ 14 files changed, 529 insertions(+), 38 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/compile_generated_tokens.py create mode 100644 src/deepsparse/v2/text_generation/compile_generations.py create mode 100644 src/deepsparse/v2/text_generation/generate_new_token.py create mode 100644 src/deepsparse/v2/text_generation/prep_for_generation.py create mode 100644 src/deepsparse/v2/text_generation/process_outputs.py create mode 100644 src/deepsparse/v2/text_generation/token_generator.py diff --git a/src/deepsparse/v2/operators/engine_operator.py 
b/src/deepsparse/v2/operators/engine_operator.py index b7d920a686..c2fc562c63 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -87,6 +87,9 @@ def __init__( self._engine_args = engine_args self._engine_type = engine_type + if not engine_kwargs: + engine_kwargs = {} + self.engine = self.create_engine(**engine_kwargs) @property diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 37ac88d02f..21cd7e2acd 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -13,12 +13,19 @@ # limitations under the License. # flake8: noqa from .autoregressive_preprocess_operator import * +from .compile_generated_tokens import * +from .compile_generations import * from .compile_logits import * +from .generate_new_token import * from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * from .prep_for_prefill import * from .process_inputs import * +from .process_outputs import * +from .token_generator import * # isort:skip +from .prep_for_generation import * # isort:skip + from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py index cfe7cb531b..6e97412e43 100644 --- a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -36,7 +36,6 @@ def __init__(self, sequence_length: int, prompt_sequence_length: int): """ self.sequence_length = sequence_length self.prompt_sequence_length = prompt_sequence_length - self.set_capacity = False _LOGGER.warn( "This operator requires the PipelineState to be set-up with the " @@ -51,16 +50,19 @@ def can_operate(self, inp: Any) -> bool: tokens = inp.get("tokens") kv_cache = inp.get("kv_cache") + if inp.get("in_generation"): + return True + remaining_tokens = len(tokens) - kv_cache.total_num_processed_tokens - if remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length: + can_process = ( + remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length + ) + if can_process and inp.get("in_generation") is None: return True return False def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): - - if not self.set_capacity: - self.set_capacity = True - kv_cache.set_capacity(self.sequence_length - 1) + kv_cache.set_capacity(self.sequence_length - 1) num_total_processed_tokens = kv_cache.total_num_processed_tokens new_token = tokens[num_total_processed_tokens] @@ -88,13 +90,9 @@ def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwarg engine_inputs = [engine_inputs_map[name] for name in engine_input_names] - onnx_input_names_no_cache = pipeline_state.current_state.get( - "onnx_input_names_no_cache" - ) - engine_inputs = [engine_inputs_map[name] for name in onnx_input_names_no_cache] - return { "engine_inputs": engine_inputs, "kv_cache": kv_cache, "tokens": tokens, + "in_generation": kwargs.get("in_generation"), } diff --git a/src/deepsparse/v2/text_generation/compile_generated_tokens.py b/src/deepsparse/v2/text_generation/compile_generated_tokens.py new file mode 100644 index 0000000000..c87436ab3a --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_generated_tokens.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompileGeneratedTokens"] + + +class CompileGeneratedTokens(Operator): + def run( + self, + new_token, + logits, + finish_reason, + kv_cache, + tokens, + inference_state: InferenceState, + **kwargs, + ): + in_generation = True + + generated_tokens = inference_state.current_state.get("generated_tokens") + generated_logits = inference_state.current_state.get("generated_logits") + finished_reason = inference_state.current_state.get("finished_reason") + + generated_tokens.append(new_token) + generated_logits.append(logits) + finished_reason.append(finish_reason) + + if finish_reason is not None: + in_generation = False + + state_update = { # TODO: check if necessary + "finished_reason": finished_reason, + "generated_tokens": generated_tokens, + "generated_logits": generated_logits, + } + + output = { + "tokens": tokens, + "kv_cache": kv_cache, + "in_generation": in_generation, + } + return output, state_update diff --git a/src/deepsparse/v2/text_generation/compile_generations.py b/src/deepsparse/v2/text_generation/compile_generations.py new file mode 100644 index 0000000000..ed8297ac01 --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_generations.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
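+
+# Note: CompileGenerations (below) only runs once generation has finished
+# (in_generation is False). It stacks the generated token ids, concatenates the
+# per-step logits along the sequence axis, and falls back to
+# FinishReason.LENGTH if no finish reason was recorded.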
+from typing import Any + +import numpy +from pydantic import BaseModel, Field + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompileGenerations", "CompileGenerationsOutput"] + + +class CompileGenerationsOutput(BaseModel): + generated_tokens: Any = Field(description="generated_tokens") + generated_logits: Any = Field(description="generated_logits") + finished_reason: Any = Field(description="finished_reason") + + +class CompileGenerations(Operator): + output_schema = CompileGenerationsOutput + + def can_operate(self, inp: Any): + if inp.get("in_generation") is False: + return True + return False + + def run(self, inference_state: InferenceState, **kwargs): + generated_tokens = inference_state.current_state.get("generated_tokens") + generated_logits = inference_state.current_state.get("generated_logits") + finished_reason = inference_state.current_state.get("finished_reason") + + if len(finished_reason) == 0: + finished_reason.append(FinishReason.LENGTH) + + generated_tokens = numpy.array([generated_tokens]) + generated_logits = numpy.concatenate(generated_logits, axis=1) + return { + "generated_tokens": generated_tokens, + "generated_logits": generated_logits, + "finished_reason": finished_reason, + } diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py index 55c87d791d..21bd50e03e 100644 --- a/src/deepsparse/v2/text_generation/compile_logits.py +++ b/src/deepsparse/v2/text_generation/compile_logits.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any from deepsparse.v2.operators import Operator from deepsparse.v2.utils import InferenceState @@ -27,6 +28,11 @@ class CompilePromptLogits(Operator): take prompt logits from each iteration run and update the inference state. """ + def can_operate(self, inp: Any): + if inp.get("in_generation") is None: + return True + return False + def run(self, logits, inference_state: InferenceState, **kwargs): logit_type = "prompt_logits" diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py new file mode 100644 index 0000000000..33ab546e39 --- /dev/null +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
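+
+# Note: GenerateNewTokenOperator (below) samples the next token from the last
+# logits row using the TokenGenerator held in the inference state, then picks a
+# FinishReason: STOP when an eos token (unless force_max_tokens is set) or a
+# stop sequence is generated, CALLBACK when a user callback returns False, or
+# the length-based reason once max_tokens is reached.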
+from typing import Any, Sequence, Union + +import transformers + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["GenerateNewTokenOperator"] + + +class GenerateNewTokenOperator(Operator): + def __init__( + self, tokenizer: transformers.PreTrainedTokenizerBase, force_max_tokens: bool + ): + self.force_max_tokens = force_max_tokens + self.tokenizer = tokenizer + + def can_operate(self, inp: Any): + if inp.get("in_generation"): + return True + return False + + def run(self, logits, kv_cache, inference_state: InferenceState, **kwargs): + token_generator = inference_state.current_state.get("token_generator") + token = token_generator.generate(logits=logits[0, -1, :]) + finish_reason = None + + callback = inference_state.current_state.get("callback") + stop = inference_state.current_state.get("stop") + + if token == self.tokenizer.eos_token_id and not self.force_max_tokens: + finish_reason = FinishReason.STOP + + if self._stop_token_generated(token, stop_tokens=stop): + print( + "Stop token %s generated. Stopping generation." + % self.tokenizer.decode(token) + ) + finish_reason = FinishReason.STOP + + if callback is not None and callback(token) is False: + print( + "callback %s returned False, stopping generation." + % callback.__qualname__ + ) + finish_reason = FinishReason.CALLBACK + + max_tokens = inference_state.current_state.get("max_tokens") + if len(inference_state.current_state.get("generated_tokens")) + 1 == max_tokens: + finish_reason = inference_state.current_state.get("length_finish_reason") + + state_update = { + "token_generator": token_generator, + } + + new_generation = { + "logits": logits, + "new_token": token, + "finish_reason": finish_reason, + } + output = {"tokens": token_generator.tokens, "kv_cache": kv_cache} + output.update(new_generation) + return output, state_update + + def _stop_token_generated( + self, token, stop_tokens: Union[None, str, Sequence[str]] + ) -> bool: + if stop_tokens is None: + return False + + decoded_token = self.tokenizer.decode(token) + decoded_token = ( + decoded_token if decoded_token.isspace() else decoded_token.strip() + ) + return decoded_token in stop_tokens diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py index 41ee830a8a..9a885c2355 100644 --- a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -97,6 +97,7 @@ def _case_positions(self, num_total_processed_tokens: int): ) def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): + kv_cache.set_capacity(self.sequence_length - self.prompt_sequence_length) onnx_input_names_no_cache = pipeline_state.current_state.get( "onnx_input_names_no_cache" diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 6c1ad1966e..0bd9098a40 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -36,6 +36,7 @@ class NlEngineInput(BaseModel): engine_inputs: List = Field(description="engine inputs") kv_cache: Any = Field(description="kv_cache object") tokens: List = Field(description="tokens") + in_generation: bool = Field(description="in_generation", default=None) class NLEngineOperator(EngineOperator): @@ 
-119,7 +120,12 @@ def run(self, inp: NlEngineInput, **kwargs) -> Any: kv_cache=kv_cache, ) - output = {"logits": logits, "kv_cache": kv_cache, "tokens": inp.tokens} + output = { + "logits": logits, + "kv_cache": kv_cache, + "tokens": inp.tokens, + "in_generation": inp.in_generation, + } return output def _add_kv_cache_to_input(self, engine_input, kv_cache): diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 9878aa0061..49826b8af7 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -15,18 +15,23 @@ from typing import Dict from deepsparse.transformers.utils.helpers import process_generation_config -from deepsparse.v2.operators import Operator from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, + CompileGeneratedTokens, + CompileGenerations, CompilePromptLogits, + GenerateNewTokenOperator, KVCacheCreator, MultiEnginePrefill, NLEngineOperator, PrepareforPrefill, + PrepareGeneration, ProcessInputsTextGeneration, + ProcessOutputs, + TokenGeneratorOperator, ) from deepsparse.v2.utils import PipelineState @@ -109,17 +114,23 @@ def __init__( sequence_length=sequence_length, ) compile_prompt_logits = CompilePromptLogits() - """ - prep_for_single_engine = PrepareforSingleEngine( - prompt_sequence_length=prompt_sequence_length, + + autoregressive_preprocess = AutoRegressiveOperatorPreprocess( sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, ) - """ - autoregressive_preprocess = AutoRegressiveOperatorPreprocess( + token_generator = TokenGeneratorOperator() + prep_for_generation = PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=prompt_sequence_length, + token_generator=token_generator, + ) + generate_new_token = GenerateNewTokenOperator( + tokenizer=self.tokenizer, force_max_tokens=force_max_tokens ) - final_step = FinalStep() + process_output = ProcessOutputs(tokenizer=self.tokenizer) + compile_generations = CompileGenerations() + compile_generated_tokens = CompileGeneratedTokens() ops = { "process_input": process_inputs, @@ -130,7 +141,11 @@ def __init__( "multi_engine_prefill": multi_engine_prefill, "compile_logits": compile_prompt_logits, "autoregressive_preprocess": autoregressive_preprocess, - "final_step": final_step, + "prep_for_generation": prep_for_generation, + "generate_new_token": generate_new_token, + "process_outputs": process_output, + "compile_generations": compile_generations, + "compile_generated_tokens": compile_generated_tokens, } routes = { @@ -140,12 +155,22 @@ def __init__( "multi_engine": "compile_logits", "compile_logits": [ "multi_engine_prefill", + "prep_for_generation", "autoregressive_preprocess", - "final_step", ], "autoregressive_preprocess": "single_engine", - "single_engine": "compile_logits", - "final_step": "STOP", + "single_engine": [ + "compile_logits", + "generate_new_token", + ], + "prep_for_generation": "autoregressive_preprocess", + "generate_new_token": "compile_generated_tokens", + "compile_generated_tokens": [ + "autoregressive_preprocess", + "compile_generations", + ], + "compile_generations": "process_outputs", + "process_outputs": "STOP", } router = GraphRouter( @@ -197,17 +222,3 @@ def setup_onnx_file_path(self, model_path, sequence_length) -> str: "See `tokenizer` and `config` arguments for details." 
) return onnx_path - - -# NOTE: This is a dummy last step which will be removed. Used as a final step -# for the current routes. -class FinalStep(Operator): - def can_operate(self, *args, **kwargs): - return True - - def run(self, *args, **kwargs): - import numpy - - inference_state = kwargs.get("inference_state") - prompt_logits = inference_state.current_state.get("prompt_logits") - return numpy.concatenate(prompt_logits, axis=1) diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py new file mode 100644 index 0000000000..544af43980 --- /dev/null +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any + +import numpy + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation import TokenGeneratorOperator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["PrepareGeneration"] + + +class PrepareGeneration(Operator): + def __init__( + self, + token_generator: TokenGeneratorOperator, + prompt_sequence_length: int, + sequence_length: int, + ): + self.prompt_sequence_length = prompt_sequence_length + self.sequence_length = sequence_length + self.token_generator_creator = token_generator + + def can_operate(self, inp: Any): + kv_cache = inp.get("kv_cache") + tokens = inp.get("tokens") + + # If the number of prompt tokens is greater than what we've processed, + # don't start generation. Should be equal when started as all prompt logits + # should be accounted for and we should have updated the kv_cache for the single + # token engine. + if len(tokens) == kv_cache.total_num_processed_tokens: + return True + return False + + @staticmethod + def set_generated_length( + max_length: int, + prompt_tokens_length: int, + sequence_length: int, + prompt_sequence_length: int, + max_new_tokens: int, + finish_reason_choices: "FinishReason", # noqa + ): + """ + Determine the length of the generated tokens. The hard cap on the total number + of tokens is based on the sequence length. If max_length is provided and is less + than the sequence length, it will be used to cap the total number of tokens + generated. If it is not provided, the max_new_tokens attribute will be used and + also capped by the sequence length. 
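+
+        For example (illustrative numbers): with sequence_length=2048,
+        max_length=None, max_new_tokens=100, prompt_sequence_length=16 and a
+        10-token prompt, max_tokens = min(100, 2048 - 16) + 10 = 110 with a
+        MAX_NEW_TOKENS finish reason; whenever max_tokens would exceed
+        sequence_length, (sequence_length, CAPACITY) is returned instead.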
+ + :param max_length: max_length attribute, provided as input during inference + :param prompt_tokens_length: the number of prompt tokens used as part of the + generated output + :param sequence_length: the sequence length used for the pipeline + :param prompt_sequence_length: the prompt sequence length used for the pipeline + :param max_new_tokens: the max_new_tokens attribute, which may be provided + as part of the input during inference + """ + if max_length: + # if max_length provided, use that to cap total tokens generated + max_tokens = max_length + finish_reason = finish_reason_choices.LENGTH + else: + # if not provided, max tokens is based on max_new_tokens + prompt tokens + max_tokens = ( + min(max_new_tokens, sequence_length - prompt_sequence_length) + + prompt_tokens_length + ) + finish_reason = finish_reason_choices.MAX_NEW_TOKENS + + # hard model/pipeline cap + return ( + (sequence_length, finish_reason_choices.CAPACITY) + if sequence_length < max_tokens + else (max_tokens, finish_reason) + ) + + def run( + self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs + ): + prompt_logits = inference_state.current_state.get("prompt_logits") + prompt_logits = numpy.concatenate(prompt_logits, axis=1) + # TODO: clean this up such that dont have to keep writing current_state + # everywhere + + generation_config = inference_state.current_state.get("generation_config") + include_prompt_logits = inference_state.current_state.get( + "include_prompt_logits" + ) + + token_generator_creator_output = self.token_generator_creator.run( + logits_shape=prompt_logits[0, -1, :].shape, + deterministic=not generation_config.do_sample, + sampling_temperature=generation_config.temperature, + tokens=tokens, + **inference_state.current_state, + ) + token_generator = token_generator_creator_output.get("token_generator") + token_generator.generate(prompt_logits[0, -1, :]) + + max_tokens, length_finish_reason = PrepareGeneration.set_generated_length( + max_length=generation_config.max_length, + prompt_tokens_length=1, + max_new_tokens=generation_config.max_new_tokens, + sequence_length=self.sequence_length, + prompt_sequence_length=self.prompt_sequence_length, + finish_reason_choices=FinishReason, + ) + state_update = { + "max_tokens": max_tokens, + "length_finish_reason": length_finish_reason, + "generated_tokens": [token_generator.tokens[-1]], + "generated_logits": [prompt_logits] + if include_prompt_logits + else [numpy.expand_dims(prompt_logits[:, -1, :], 0)], + "finished_reason": [], + "token_generator": token_generator, + } + + output = { + "tokens": token_generator.tokens, + "kv_cache": kv_cache, + "in_generation": True, + } + return output, state_update diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 528dcee0b7..e57e402983 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -28,7 +28,7 @@ class GenerationDefaults: num_return_sequences = 1 - max_length = 1024 + max_length = 100 max_new_tokens = None output_scores = False top_k = 0 diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py new file mode 100644 index 0000000000..ca1cf78521 --- /dev/null +++ b/src/deepsparse/v2/text_generation/process_outputs.py @@ -0,0 +1,88 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import datetime +from typing import Optional + +import numpy + +from deepsparse.transformers.pipelines.text_generation import ( + FinishReason, + GeneratedText, + TextGenerationOutput, +) +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput +from deepsparse.v2.utils import InferenceState + + +class ProcessOutputs(Operator): + output_schema = TextGenerationOutput + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def _create_generated_text_output( + self, + sequence: str, + finish_reason: Optional[FinishReason] = None, + logits: Optional[numpy.array] = None, + ): + if finish_reason: + return GeneratedText( + text=sequence, + score=logits, + finished=True, + finished_reason=finish_reason.value, + ) + return GeneratedText( + text=sequence, + score=logits, + finished=False, + ) + + def run( + self, inp: CompileGenerationsOutput, inference_state: InferenceState, **kwargs + ): + generation_config = inference_state.current_state.get("generation_config") + generated_tokens = inp.generated_tokens + generated_logits = ( + inp.generated_logits if generation_config.output_scores else None + ) + finished_reason = inp.finished_reason + sequences = self.tokenizer.batch_decode( + generated_tokens, skip_special_tokens=True + ) + + finished_reason = [f for f in finished_reason if f] + + if generated_logits is not None: + generations = list( + map( + self._create_generated_text_output, + sequences, + finished_reason, + generated_logits, + ) + ) + else: + generations = list( + map(self._create_generated_text_output, sequences, finished_reason) + ) + outputs = dict( + created=datetime.datetime.now(), + prompts=inference_state.current_state.get("prompts"), + generations=generations, + ) + + return outputs diff --git a/src/deepsparse/v2/text_generation/token_generator.py b/src/deepsparse/v2/text_generation/token_generator.py new file mode 100644 index 0000000000..9148d71cc8 --- /dev/null +++ b/src/deepsparse/v2/text_generation/token_generator.py @@ -0,0 +1,30 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
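+
+# Note: TokenGeneratorOperator (below) is a small factory that builds the
+# existing TokenGenerator helper from the prompt logits shape, sampling
+# settings and current token list, returning it under the "token_generator"
+# key for later operators to reuse.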
+from deepsparse.transformers.utils.token_generator import TokenGenerator +from deepsparse.v2.operators import Operator + + +__all__ = ["TokenGeneratorOperator"] + + +class TokenGeneratorOperator(Operator): + def run(self, logits_shape, deterministic, tokens, sampling_temperature, **kwargs): + token_generator = TokenGenerator( + logits_shape=logits_shape, + deterministic=deterministic, + tokens=tokens, + sampling_temperature=sampling_temperature, + **kwargs, + ) + return {"token_generator": token_generator} From f18d5f3c4a3a6f9431787ae36a3cdfcabaacdd91 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 3 Nov 2023 15:24:15 -0400 Subject: [PATCH 06/57] add split/join functionality --- .../v2/operators/engine_operator.py | 18 +---- src/deepsparse/v2/operators/operator.py | 13 ---- src/deepsparse/v2/pipeline.py | 71 ++++++++++++++++++- src/deepsparse/v2/routers/router.py | 4 +- src/deepsparse/v2/schedulers/scheduler.py | 17 +++++ .../v2/schedulers/scheduler_group.py | 27 +++---- src/deepsparse/v2/text_generation/__init__.py | 1 + .../v2/text_generation/join_output.py | 70 ++++++++++++++++++ src/deepsparse/v2/text_generation/pipeline.py | 28 ++++++-- .../v2/text_generation/prep_for_prefill.py | 9 ++- .../v2/text_generation/process_inputs.py | 9 ++- .../v2/text_generation/process_outputs.py | 16 ++--- 12 files changed, 216 insertions(+), 67 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/join_output.py diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index c2fc562c63..bd58aefafa 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -20,7 +20,7 @@ from deepsparse import Context as EngineContext from deepsparse import Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine -from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs +from deepsparse.utils import model_to_path from deepsparse.v2.operators import Operator @@ -145,18 +145,6 @@ def run(self, inp: EngineOperatorInputs, **kwargs) -> Dict: # planned refactor engine_outputs = inp.engine(inp.engine_inputs) return {"engine_outputs": engine_outputs} - inp = inp.engine_inputs - batches, orig_batch_size = self.expand_inputs(engine_inputs=inp) - batches_outputs = list(map(self.engine, batches)) - engine_outputs = self.condense_inputs( - batch_outputs=batches_outputs, orig_batch_size=orig_batch_size - ) - return {"engine_outputs": engine_outputs} - def expand_inputs(self, **kwargs): - return split_engine_inputs(kwargs["engine_inputs"], self._batch_size) - - def condense_inputs(self, **kwargs): - batch_outputs = kwargs["batch_outputs"] - orig_batch_size = kwargs["orig_batch_size"] - return join_engine_outputs(batch_outputs, orig_batch_size) + engine_outputs = self.engine(inp.engine_inputs) + return {"engine_outputs": engine_outputs} diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index b3963d8223..5bb0be841a 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -99,7 +99,6 @@ def __call__( pipeline_state=pipeline_state, **kwargs, ) - if self.has_output_schema(): return self.output_schema(**run_output) return run_output @@ -117,18 +116,6 @@ def can_operate(self, inp: Any) -> bool: """ return True - def expand_inputs(self, **kwargs): - """ - Generic function to handle expanding values. 
- """ - raise NotImplementedError - - def condense_inputs(self, **kwargs): - """ - Generic function to handle condensing values. - """ - raise NotImplementedError - def yaml(self): pass diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 0a8c8b2f93..79667fdc3a 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -13,7 +13,9 @@ # limitations under the License. -from typing import Dict, List, Union +import copy +from functools import partial +from typing import Any, Dict, List, Union from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router @@ -59,6 +61,55 @@ def __init__( # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) + def _run_sequential( + self, + inp: Any, + inference_state: InferenceState, + pipeline_state: PipelineState, + start: str, + end: str, + ): + # TODO: somehow refactor to prevent repeat code. + next_step = start + while next_step != end: + operator = self.ops[next_step] + if isinstance(inp, dict): + operator_output = operator( + pipeline_state=pipeline_state, + inference_state=inference_state, + **inp, + ) + else: + operator_output = operator( + inp, pipeline_state=pipeline_state, inference_state=inference_state + ) + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + inference_state.update_state(state_update) + + next_step = self.router.next(next_step, self.ops, operator_output) + inp = operator_output + return inp + + def _apply_split(self, inp: Any, inference_state: InferenceState): + + batches, orig_batch_size = self.expand_inputs(inp, 1) + run_with_state = partial( + self._run_sequential, + pipeline_state=self.pipeline_state, + start=self.router.route[self.router.SPLIT_ROUTE], + end=self.router.END_SPLIT, + ) + inference_state_list = [ + copy.deepcopy(inference_state) for x in range(len(batches)) + ] + outputs = self._scheduler_group.map( + batches, inference_state_list, func=run_with_state + ) + outputs = self.condense_inputs(outputs) + return outputs + def run( self, *args, @@ -78,7 +129,11 @@ def run( operator_output = None while next_step != self.router.END_ROUTE: - # Either a dictionary key or valid index + # Split_Route should be after Start_Route + if next_step == self.router.SPLIT_ROUTE: + operator_output = self._apply_split(operator_output, inference_state) + next_step = self.router.route[self.router.END_SPLIT] + operator = self.ops[next_step] if next_step == self.router.START_ROUTE: output_future = self._scheduler_group.submit( @@ -136,6 +191,18 @@ def __call__(self, *args, **kwargs): return self.run(*args, **kwargs) + def expand_inputs(self, *args, **kwargs): + """ + Generic function to handle expanding values. + """ + raise NotImplementedError + + def condense_inputs(self, *args, **kwargs): + """ + Generic function to handle condensing values. + """ + raise NotImplementedError + def validate(self): """ Validate that compatability of the router and operators provided. diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index d1110d4ca7..93bc059ddb 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -128,8 +128,10 @@ class GraphRouter(Router): where `can_operate` returns True will run. Paths should be deterministic. 
""" - def __init__(self, end_route: str, start_route: str, route: Dict): + def __init__(self, end_route: str, start_route: str, route: Dict, **kwargs): super().__init__(end_route=end_route, start_route=start_route, route=route) + self.SPLIT_ROUTE = kwargs.get("split_route") + self.END_SPLIT = kwargs.get("end_split") def next( self, diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 78a58e3389..617936d509 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -14,6 +14,7 @@ from concurrent.futures import Future, ThreadPoolExecutor +from typing import Callable from deepsparse.v2.operators import Operator @@ -52,6 +53,22 @@ def submit( **kwargs, ) + def can_map(self, *args): + """ + args containing list of inputs to be used for each worker. This function if we + have sufficient workes available + """ + if len(args[0]) <= self._threadpool._max_workers: + return True + return False + + def map(self, *args, func: Callable): + """ + :param func: Callable to run as part of the map function + args containing a list of function variables to map + """ + return list(self._threadpool.map(func, *args)) + def can_process( self, *args, diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 40b5695f22..8557325c9a 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -14,7 +14,7 @@ from concurrent.futures import Future -from typing import List +from typing import Callable, List from deepsparse.v2.operators import Operator from deepsparse.v2.schedulers.scheduler import OperatorScheduler @@ -56,22 +56,13 @@ def submit( **kwargs, ) - def can_process( - self, - *args, - operator: Operator, - **kwargs, - ) -> bool: + def map(self, *args, func: Callable): """ - :param operator: operator to check - :return: True if this Operator can process the given operator and input. - SchedulerGroup always returns True + :param operator: operator to run + :return: list of outputs from multiple workers """ - return any( - scheduler.can_process( - *args, - operator=operator, - **kwargs, - ) - for scheduler in self.schedulers - ) + for scheduler in self.schedulers: + if scheduler.can_map( + args[0], + ): + return scheduler.map(*args, func=func) diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 21cd7e2acd..08836b8bbe 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -17,6 +17,7 @@ from .compile_generations import * from .compile_logits import * from .generate_new_token import * +from .join_output import * from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py new file mode 100644 index 0000000000..8a6c77a2f1 --- /dev/null +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import numpy + +from deepsparse.transformers.utils.helpers import pad_to_fixed_length +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput + + +__all__ = ["JoinOutput"] + + +class JoinOutput(Operator): + """ + Run this operator to combine the results from multiple prompts. + """ + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def run(self, inp: List[CompileGenerationsOutput], **kwargs): + batch_outputs = [x for x in inp[0]] + generated_tokens = [x.generated_tokens for x in batch_outputs] + generated_logits = [x.generated_logits for x in batch_outputs] + finished_reason = [x.finished_reason for x in batch_outputs] + + max_len = max(token.shape[1] for token in generated_tokens) + + # pad all tokens to the same length + tokens = [ + pad_to_fixed_length( + array=prediction, + max_len=max_len, + value=self.tokenizer.pad_token_id, + axis=1, + ) + for prediction in generated_tokens + ] + + # find the longest sequence in the batch of logits + max_len = max(logits.shape[1] for logits in generated_logits) + + # pad all logits to the same length + logits = [ + pad_to_fixed_length(array=single_logits, max_len=max_len, axis=1) + for single_logits in generated_logits + ] + + tokens = numpy.concatenate(tokens) + logits = numpy.concatenate(logits) + + return { + "generated_tokens": tokens, + "generated_logits": logits, + "finished_reason": finished_reason, + } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 49826b8af7..a24c37ed90 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -15,6 +15,7 @@ from typing import Dict from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.utils import split_engine_inputs from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import OperatorScheduler @@ -24,6 +25,7 @@ CompileGenerations, CompilePromptLogits, GenerateNewTokenOperator, + JoinOutput, KVCacheCreator, MultiEnginePrefill, NLEngineOperator, @@ -131,6 +133,7 @@ def __init__( process_output = ProcessOutputs(tokenizer=self.tokenizer) compile_generations = CompileGenerations() compile_generated_tokens = CompileGeneratedTokens() + join_output = JoinOutput(tokenizer=self.tokenizer) ops = { "process_input": process_inputs, @@ -146,10 +149,12 @@ def __init__( "process_outputs": process_output, "compile_generations": compile_generations, "compile_generated_tokens": compile_generated_tokens, + "join_output": join_output, } routes = { - "process_input": "prepare_prefill", + "process_input": "SPLIT", + "SPLIT": "prepare_prefill", "prepare_prefill": ["multi_engine_prefill", "autoregressive_preprocess"], "multi_engine_prefill": "multi_engine", "multi_engine": "compile_logits", @@ -169,18 +174,33 @@ def __init__( "autoregressive_preprocess", "compile_generations", ], - "compile_generations": "process_outputs", + "compile_generations": "JOIN", + "JOIN": 
"join_output", + "join_output": "process_outputs", "process_outputs": "STOP", } router = GraphRouter( - end_route="STOP", start_route="process_input", route=routes + end_route="STOP", + start_route="process_input", + route=routes, + split_route="SPLIT", + end_split="JOIN", ) - scheduler = [OperatorScheduler()] + scheduler = [OperatorScheduler(), OperatorScheduler(max_workers=4)] super().__init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) + def expand_inputs(self, items, batch_size): + items = [items.get(key) for key in items.keys()] + out, orig_batch_size = split_engine_inputs(items, batch_size) + combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] + return combined_batches, orig_batch_size + + def condense_inputs(self, *args, **kwargs): + return args[0], kwargs + # TODO: Move to be part of a generic transformers set-up Operator. def setup_onnx_file_path(self, model_path, sequence_length) -> str: import logging diff --git a/src/deepsparse/v2/text_generation/prep_for_prefill.py b/src/deepsparse/v2/text_generation/prep_for_prefill.py index 2f9eb15797..2e5fecb3e8 100644 --- a/src/deepsparse/v2/text_generation/prep_for_prefill.py +++ b/src/deepsparse/v2/text_generation/prep_for_prefill.py @@ -42,13 +42,20 @@ def __init__(self, kv_cache_creator: Operator): "from the NLEngineOperator" ) - def run(self, tokens: Any, pipeline_state: PipelineState, **kwargs): + def run( + self, + input_ids: Any, + attention_mask: Any, + pipeline_state: PipelineState, + **kwargs, + ): # NOTE: Can potentially just be class attributes instead of relying on # pipeline state. cache_shape = pipeline_state.current_state.get("cache_shape") data_type = pipeline_state.current_state.get("kv_cache_data_type") output_names = pipeline_state.current_state.get("output_names") + tokens = input_ids[attention_mask.nonzero()].tolist() kv_cache = self.kv_cache_creator.run( cache_shape=cache_shape, kv_cache_data_type=data_type, diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index e57e402983..5d47c8ff39 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -114,8 +114,7 @@ def run(self, inp: TextGenerationInput, **kwargs): frequency_penalty=generation_config.repetition_penalty, ) - # TODO: move this step to prep_for_prefill and add attention mask to the output - # this will allow us to split/join more easily when processing multiple prompts - # in parallel - tokens = input_ids[attention_mask.nonzero()].tolist() - return {"tokens": tokens}, inference_state_update + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + }, inference_state_update diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py index ca1cf78521..ba301bdae6 100644 --- a/src/deepsparse/v2/text_generation/process_outputs.py +++ b/src/deepsparse/v2/text_generation/process_outputs.py @@ -22,7 +22,6 @@ TextGenerationOutput, ) from deepsparse.v2.operators import Operator -from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput from deepsparse.v2.utils import InferenceState @@ -52,19 +51,20 @@ def _create_generated_text_output( ) def run( - self, inp: CompileGenerationsOutput, inference_state: InferenceState, **kwargs + self, + generated_tokens: numpy.ndarray, + generated_logits: numpy.ndarray, + finished_reason: list, + inference_state: InferenceState, + 
**kwargs, ): generation_config = inference_state.current_state.get("generation_config") - generated_tokens = inp.generated_tokens - generated_logits = ( - inp.generated_logits if generation_config.output_scores else None - ) - finished_reason = inp.finished_reason + generated_logits = generated_logits if generation_config.output_scores else None sequences = self.tokenizer.batch_decode( generated_tokens, skip_special_tokens=True ) - finished_reason = [f for f in finished_reason if f] + finished_reason = [f[-1] for f in finished_reason] if generated_logits is not None: generations = list( From 2c4d23124427e5f99400cc5ce4c79508e6ae436f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 16:21:55 -0500 Subject: [PATCH 07/57] update router to include split/join in parent class, refactor pipeline code to remove repeat code, update map function --- src/deepsparse/v2/pipeline.py | 136 +++++++++++------- src/deepsparse/v2/routers/router.py | 13 +- src/deepsparse/v2/schedulers/scheduler.py | 17 --- .../v2/schedulers/scheduler_group.py | 13 +- src/deepsparse/v2/text_generation/pipeline.py | 8 +- 5 files changed, 101 insertions(+), 86 deletions(-) diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 79667fdc3a..f56680d2b9 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -14,8 +14,9 @@ import copy +from concurrent.futures import Future from functools import partial -from typing import Any, Dict, List, Union +from typing import Any, Callable, Dict, List, Union from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router @@ -58,7 +59,6 @@ def __init__( self.pipeline_state = pipeline_state self.validate() - # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) def _run_sequential( @@ -69,46 +69,77 @@ def _run_sequential( start: str, end: str, ): - # TODO: somehow refactor to prevent repeat code. next_step = start while next_step != end: - operator = self.ops[next_step] - if isinstance(inp, dict): - operator_output = operator( - pipeline_state=pipeline_state, - inference_state=inference_state, - **inp, - ) - else: - operator_output = operator( - inp, pipeline_state=pipeline_state, inference_state=inference_state - ) - if isinstance(operator_output, tuple): - state_update = operator_output[-1] - operator_output = operator_output[0] + outputs = self._run_next_step( + func=self.ops[next_step], + next_step=next_step, + input=inp, + pipeline_state=pipeline_state, + inference_state=inference_state, + ) + next_step, operator_output, state_update = outputs + if state_update: inference_state.update_state(state_update) - - next_step = self.router.next(next_step, self.ops, operator_output) inp = operator_output return inp def _apply_split(self, inp: Any, inference_state: InferenceState): + """ + Split inputs using the pipeline's expand_inputs function. Inputs are split + into a batch size of one when a SPLIT_ROUTE node is found in a given pipeline's + provided router. The split batches are run asynchronously and then joined when + a JOIN_ROUTE node is found, using the pipeline's condense_inputs function. 
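+
+        A rough sketch of the flow implemented below (expand_inputs and
+        condense_inputs are expected to be provided by the concrete pipeline):
+
+            batches, _ = self.expand_inputs(inp, 1)
+            # each batch runs SPLIT_ROUTE -> ... -> JOIN_ROUTE sequentially on
+            # its own copy of the inference state, scheduled asynchronously
+            outputs = self.condense_inputs([future.result() for future in futures])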
+ """ batches, orig_batch_size = self.expand_inputs(inp, 1) run_with_state = partial( self._run_sequential, pipeline_state=self.pipeline_state, start=self.router.route[self.router.SPLIT_ROUTE], - end=self.router.END_SPLIT, + end=self.router.JOIN_ROUTE, ) inference_state_list = [ copy.deepcopy(inference_state) for x in range(len(batches)) ] - outputs = self._scheduler_group.map( - batches, inference_state_list, func=run_with_state + futures = self._scheduler_group.map( + batches, + inference_state_list, + func=run_with_state, ) - outputs = self.condense_inputs(outputs) - return outputs + return self.condense_inputs([x.result() for x in futures]) + + def _run_next_step( + self, + *args, + func: Callable, + next_step: Union[str, int], + input: Any = None, + **kwargs, + ): + """ + Generic function to run a given func, process the output and determine the next + step. + """ + if input: + operator_output = ( + func(*args, **kwargs, **input) + if isinstance(input, dict) + else func(input, *args, **kwargs) + ) + else: + operator_output = func(*args, **kwargs) + + if isinstance(operator_output, Future): + operator_output = operator_output.result() + + state_update = None + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + + next_step = self.router.next(next_step, self.ops, operator_output) + return next_step, operator_output, state_update def run( self, @@ -129,44 +160,34 @@ def run( operator_output = None while next_step != self.router.END_ROUTE: - # Split_Route should be after Start_Route + # NOTE: split_route should only appear after the start route node if next_step == self.router.SPLIT_ROUTE: operator_output = self._apply_split(operator_output, inference_state) - next_step = self.router.route[self.router.END_SPLIT] + next_step = self.router.route[self.router.JOIN_ROUTE] - operator = self.ops[next_step] if next_step == self.router.START_ROUTE: - output_future = self._scheduler_group.submit( + outputs = self._run_next_step( *args, + next_step=next_step, + func=self._scheduler_group.submit, inference_state=inference_state, - operator=operator, + operator=self.ops[next_step], pipeline_state=pipeline_state, **kwargs, ) else: - if isinstance(operator_output, dict): - output_future = self._scheduler_group.submit( - inference_state=inference_state, - operator=operator, - pipeline_state=pipeline_state, - **operator_output, - ) - else: - output_future = self._scheduler_group.submit( - operator_output, - inference_state=inference_state, - pipeline_state=pipeline_state, - operator=operator, - ) - - operator_output = output_future.result() - if isinstance(operator_output, tuple): - state_update = operator_output[-1] - operator_output = operator_output[0] - inference_state.update_state(state_update) - - next_step = self.router.next(next_step, self.ops, operator_output) + outputs = self._run_next_step( + func=self._scheduler_group.submit, + input=operator_output, + next_step=next_step, + inference_state=inference_state, + operator=self.ops[next_step], + pipeline_state=pipeline_state, + ) + next_step, operator_output, state_update = outputs + if state_update: + inference_state.update_state(state_update) return operator_output def __call__(self, *args, **kwargs): @@ -195,13 +216,22 @@ def expand_inputs(self, *args, **kwargs): """ Generic function to handle expanding values. """ - raise NotImplementedError + raise NotImplementedError( + "This function should be implemented for any router with split or join" + "nodes. 
expand_inputs will be called prior to the split node (stored in " + "the router's SPLIT_ROUTE attribute), expanding outputs for each output " + "such that there is a batch size of one per thread." + ) def condense_inputs(self, *args, **kwargs): """ Generic function to handle condensing values. """ - raise NotImplementedError + raise NotImplementedError( + "This function should be implemented for any router with split or join " + "nodes. condense_inputs will be called after the join node (stored in the " + "router's JOIN_ROUTE attribute), condensing outputs from multiple threads." + ) def validate(self): """ diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 93bc059ddb..1b70164002 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -41,9 +41,13 @@ def __init__( end_route: Union[str, int], start_route: Union[str, int], route: Optional[Dict] = None, + split_route: str = "SPLIT", + join_route: str = "JOIN", ): self.START_ROUTE = start_route self.END_ROUTE = end_route + self.SPLIT_ROUTE = split_route + self.JOIN_ROUTE = join_route self.route = route @abstractmethod @@ -79,6 +83,9 @@ class LinearRouter(Router): def __init__(self, end_route: int, start_route: int = 0): super().__init__(end_route=end_route, start_route=start_route) + self.SPLIT_ROUTE = None + self.JOIN_ROUTE = None + _LOGGER.warn("SPLIT and JOIN are not yet supported for the LinearRouter.") def next( self, past: int, ops: Optional[List[Operator]] = None, inp: Optional[Any] = None @@ -129,9 +136,9 @@ class GraphRouter(Router): """ def __init__(self, end_route: str, start_route: str, route: Dict, **kwargs): - super().__init__(end_route=end_route, start_route=start_route, route=route) - self.SPLIT_ROUTE = kwargs.get("split_route") - self.END_SPLIT = kwargs.get("end_split") + super().__init__( + end_route=end_route, start_route=start_route, route=route, **kwargs + ) def next( self, diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 617936d509..78a58e3389 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -14,7 +14,6 @@ from concurrent.futures import Future, ThreadPoolExecutor -from typing import Callable from deepsparse.v2.operators import Operator @@ -53,22 +52,6 @@ def submit( **kwargs, ) - def can_map(self, *args): - """ - args containing list of inputs to be used for each worker. 
This function if we - have sufficient workes available - """ - if len(args[0]) <= self._threadpool._max_workers: - return True - return False - - def map(self, *args, func: Callable): - """ - :param func: Callable to run as part of the map function - args containing a list of function variables to map - """ - return list(self._threadpool.map(func, *args)) - def can_process( self, *args, diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 8557325c9a..d426f830b2 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -58,11 +58,10 @@ def submit( def map(self, *args, func: Callable): """ - :param operator: operator to run - :return: list of outputs from multiple workers + :param func: generic callable run for each arg + :return: list of futures for each submit """ - for scheduler in self.schedulers: - if scheduler.can_map( - args[0], - ): - return scheduler.map(*args, func=func) + futures = [] + for _, values in enumerate(zip(*args)): + futures.append(self.submit(*values, operator=func)) + return futures diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index a24c37ed90..240da04907 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -181,13 +181,9 @@ def __init__( } router = GraphRouter( - end_route="STOP", - start_route="process_input", - route=routes, - split_route="SPLIT", - end_split="JOIN", + end_route="STOP", start_route="process_input", route=routes ) - scheduler = [OperatorScheduler(), OperatorScheduler(max_workers=4)] + scheduler = [OperatorScheduler()] super().__init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) From 672ca2048145bacfa936627da3bb2a6f0f56666e Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 16:38:24 -0500 Subject: [PATCH 08/57] process multiple generations --- src/deepsparse/v2/text_generation/process_outputs.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py index ba301bdae6..7173b8e256 100644 --- a/src/deepsparse/v2/text_generation/process_outputs.py +++ b/src/deepsparse/v2/text_generation/process_outputs.py @@ -79,6 +79,15 @@ def run( generations = list( map(self._create_generated_text_output, sequences, finished_reason) ) + + num_preds = generation_config.num_return_sequences + if num_preds > 1: + grouped_generations = [ + generations[n : n + num_preds] + for n in range(0, len(generations), num_preds) + ] + generations = grouped_generations + outputs = dict( created=datetime.datetime.now(), prompts=inference_state.current_state.get("prompts"), From 304eb358a17923269ab2d1338a9113d32a268ce0 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 8 Nov 2023 13:36:40 +0000 Subject: [PATCH 09/57] initial commit --- src/deepsparse/transformers/helpers.py | 97 +++++++++++++++++-- src/deepsparse/utils/onnx.py | 8 +- src/deepsparse/v2/text_generation/pipeline.py | 68 +++---------- 3 files changed, 103 insertions(+), 70 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d7acc71a99..78543baf12 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -17,24 +17,26 @@ """ +import logging import os import re from pathlib import Path from tempfile import NamedTemporaryFile 
-from typing import List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy import onnx +import transformers from onnx import ModelProto from deepsparse.log import get_main_logger -from deepsparse.utils.onnx import _MODEL_DIR_ONNX_NAME, truncate_onnx_model +from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model from sparsezoo import Model from sparsezoo.utils import save_onnx __all__ = [ - "get_deployment_path", + "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", "fix_numpy_types", "get_transformer_layer_init_names", @@ -44,6 +46,81 @@ _LOGGER = get_main_logger() +def setup_transformers_pipeline( + model_path: str, + sequence_length: int, + tokenizer_padding_side: str = "left", + engine_kwargs: Optional[Dict] = None, + onnx_model_name: Optional[str] = None, +) -> Tuple[ + str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer, Dict[str, Any] +]: + """ + A helper function that sets up the model path, config, tokenizer, + and engine kwargs for a transformers model. + :param model_path: The path to the model to load + :param sequence_length: The sequence length to use for the model + :param tokenizer_padding_side: The side to pad on for the tokenizer, + either "left" or "right" + :param engine_kwargs: The kwargs to pass to the engine + :param onnx_model_name: The name of the onnx model to be loaded. + If not specified, defaults are used (see setup_onnx_file_path) + :return The model path, config, tokenizer, and engine kwargs + """ + model_path, config, tokenizer = setup_onnx_file_path( + model_path, sequence_length, onnx_model_name + ) + + tokenizer.padding_side = tokenizer_padding_side + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + engine_kwargs = engine_kwargs or {} + engine_kwargs["model_path"] = model_path + return model_path, config, tokenizer, engine_kwargs + + +def setup_onnx_file_path( + model_path: str, + sequence_length: int, + onnx_model_name: Optional[str] = None, +) -> Tuple[str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer]: + """ + Parses ONNX model from the `model_path` provided. It additionally + creates config and tokenizer objects from the `deployment path`, + derived from the `model_path` provided. + :param model_path: path to the model to be parsed + :param sequence_length: maximum sequence length of the model + :param onnx_model_name: optionally, the precise name of the ONNX model + of interest may be specified. If not specified, the default ONNX model + name will be used (refer to `get_deployment_path` for details) + :return: file path to the processed ONNX file for the engine to compile + """ + deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name) + + hf_logger = logging.getLogger("transformers") + hf_logger_level = hf_logger.level + hf_logger.setLevel(logging.ERROR) + + config = transformers.PretrainedConfig.from_pretrained(deployment_path) + hf_logger.setLevel(hf_logger_level) + + trust_remote_code = False + tokenizer = transformers.AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=trust_remote_code, + model_max_length=sequence_length, + ) + + if not config or not tokenizer: + raise RuntimeError( + "Invalid config or tokenizer provided. Please provide " + "paths to the files or ensure they exist in the `model_path` provided. " + "See `tokenizer` and `config` arguments for details." 
+ ) + return onnx_path, config, tokenizer + + def get_deployment_path(model_path: str) -> Tuple[str, str]: """ Returns the path to the deployment directory @@ -63,26 +140,26 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if _MODEL_DIR_ONNX_NAME not in model_files: + if MODEL_ONNX_NAME not in model_files: raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{MODEL_ONNX_NAME} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" + f"{os.path.join(model_path, MODEL_ONNX_NAME)}" ) - return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME) + return model_path, os.path.join(model_path, MODEL_ONNX_NAME) elif model_path.startswith("zoo:"): zoo_model = Model(model_path) deployment_path = zoo_model.deployment_directory_path - return deployment_path, os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + return deployment_path, os.path.join(deployment_path, MODEL_ONNX_NAME) elif model_path.startswith("hf:"): from huggingface_hub import snapshot_download deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + onnx_path = os.path.join(deployment_path, MODEL_ONNX_NAME) if not os.path.isfile(onnx_path): raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{MODEL_ONNX_NAME} not found in transformers model directory " f"{deployment_path}. Be sure that an export of the model is written to " f"{onnx_path}" ) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index e69bf67321..f518620c2f 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -56,12 +56,12 @@ "has_model_kv_cache", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", - "_MODEL_DIR_ONNX_NAME", + "MODEL_ONNX_NAME", ] _LOGGER = logging.getLogger(__name__) -_MODEL_DIR_ONNX_NAME = "model.onnx" +MODEL_ONNX_NAME = "model.onnx" CACHE_INPUT_PREFIX = "past_key_values" CACHE_OUTPUT_PREFIX = "present" @@ -132,7 +132,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model.deployment_directory_path # default to the main onnx file for the model - model = model.deployment.get_file(_MODEL_DIR_ONNX_NAME).path + model = model.deployment.get_file(MODEL_ONNX_NAME).path elif File is not object and isinstance(model, File): # get the downloaded_path -- will auto download if not on local system @@ -146,7 +146,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model_path = Path(model) if model_path.is_dir(): - return str(model_path / _MODEL_DIR_ONNX_NAME) + return str(model_path / MODEL_ONNX_NAME) return model diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 49826b8af7..fdb31f1c6c 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict +from typing import Dict, Optional +from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter @@ -45,23 +46,20 @@ def __init__( internal_kv_cache: bool = True, force_max_tokens: bool = False, generation_config=None, - engine_kwargs: Dict = None, + engine_kwargs: Optional[Dict] = None, ): + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, sequence_length, engine_kwargs=engine_kwargs + ) pipeline_state = PipelineState() pipeline_state_vals = {} - # TODO: The code below will be replaced with a transformers set-up Operator. - self.tokenizer = None - model_path = self.setup_onnx_file_path(model_path, sequence_length) - self.tokenizer.padding_side = "left" - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - if not engine_kwargs: - engine_kwargs = {} - engine_kwargs["model_path"] = model_path - if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False @@ -80,7 +78,7 @@ def __init__( ) # NOTE: Currently using pipeline state. Can swap to simply pass in the - # attributes to the specific Operator that neeed them, as class attributes. + # attributes to the specific Operator that need them, as class attributes. pipeline_state_vals[ "onnx_input_names_no_cache" ] = single_engine_operator.onnx_input_names_no_cache @@ -180,45 +178,3 @@ def __init__( super().__init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) - - # TODO: Move to be part of a generic transformers set-up Operator. - def setup_onnx_file_path(self, model_path, sequence_length) -> str: - import logging - - import transformers - from transformers import AutoTokenizer - - from deepsparse.transformers.helpers import get_deployment_path - - """ - Parses ONNX model from the `model_path` provided. It additionally - creates config and tokenizer objects from the `deployment path`, - derived from the `model_path` provided. - - :return: file path to the processed ONNX file for the engine to compile - """ - deployment_path, onnx_path = get_deployment_path(model_path) - - hf_logger = logging.getLogger("transformers") - hf_logger_level = hf_logger.level - hf_logger.setLevel(logging.ERROR) - self.config = transformers.PretrainedConfig.from_pretrained( - deployment_path, - finetuning_task=self.task if hasattr(self, "task") else None, - ) - hf_logger.setLevel(hf_logger_level) - - self._trust_remote_code = False - self.tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=self._trust_remote_code, - model_max_length=sequence_length, - ) - - if not self.config or not self.tokenizer: - raise RuntimeError( - "Invalid config or tokenizer provided. Please provide " - "paths to the files or ensure they exist in the `model_path` provided. " - "See `tokenizer` and `config` arguments for details." 
- ) - return onnx_path From 71515ac774eb5c70296798cbad4f460a84d7e0ce Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 8 Nov 2023 13:48:16 +0000 Subject: [PATCH 10/57] fix error --- src/deepsparse/transformers/helpers.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 78543baf12..70a1e9523d 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -121,7 +121,9 @@ def setup_onnx_file_path( return onnx_path, config, tokenizer -def get_deployment_path(model_path: str) -> Tuple[str, str]: +def get_deployment_path( + model_path: str, onnx_model_name: Optional[str] = None +) -> Tuple[str, str]: """ Returns the path to the deployment directory for the given model path and the path to the mandatory @@ -130,9 +132,12 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: for running the transformers model in the deepsparse pipeline :param model_path: path to model directory, sparsezoo stub, or ONNX file + :param onnx_model_name: name of the ONNX file to look for in the deployment + directory. Defaults to MODEL_ONNX_NAME :return: path to the deployment directory and path to the ONNX file inside the deployment directory """ + onnx_model_name = onnx_model_name or MODEL_ONNX_NAME if os.path.isfile(model_path): # return the parent directory of the ONNX file return os.path.dirname(model_path), model_path @@ -140,26 +145,26 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if MODEL_ONNX_NAME not in model_files: + if onnx_model_name not in model_files: raise ValueError( - f"{MODEL_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, MODEL_ONNX_NAME)}" + f"{os.path.join(model_path, onnx_model_name)}" ) - return model_path, os.path.join(model_path, MODEL_ONNX_NAME) + return model_path, os.path.join(model_path, onnx_model_name) elif model_path.startswith("zoo:"): zoo_model = Model(model_path) deployment_path = zoo_model.deployment_directory_path - return deployment_path, os.path.join(deployment_path, MODEL_ONNX_NAME) + return deployment_path, os.path.join(deployment_path, onnx_model_name) elif model_path.startswith("hf:"): from huggingface_hub import snapshot_download deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, MODEL_ONNX_NAME) + onnx_path = os.path.join(deployment_path, onnx_model_name) if not os.path.isfile(onnx_path): raise ValueError( - f"{MODEL_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{deployment_path}. 
Be sure that an export of the model is written to " f"{onnx_path}" ) From 041174b835231099771326ffa32c45742525b62c Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 9 Nov 2023 09:49:21 -0500 Subject: [PATCH 11/57] [Pipeline Refactor] Split/Join Functionality for multiple prompts (#1384) * add split/join functionality * update router to include split/join in parent class, refactor pipeline code to remove repeat code, update map function * process multiple generations * move map to base class --- .../v2/operators/engine_operator.py | 18 +-- src/deepsparse/v2/operators/operator.py | 13 -- src/deepsparse/v2/pipeline.py | 153 ++++++++++++++---- src/deepsparse/v2/routers/router.py | 13 +- src/deepsparse/v2/schedulers/scheduler.py | 11 ++ .../v2/schedulers/scheduler_group.py | 20 --- src/deepsparse/v2/text_generation/__init__.py | 1 + .../v2/text_generation/join_output.py | 70 ++++++++ src/deepsparse/v2/text_generation/pipeline.py | 20 ++- .../v2/text_generation/prep_for_prefill.py | 9 +- .../v2/text_generation/process_inputs.py | 9 +- .../v2/text_generation/process_outputs.py | 25 ++- 12 files changed, 268 insertions(+), 94 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/join_output.py diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index c2fc562c63..bd58aefafa 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -20,7 +20,7 @@ from deepsparse import Context as EngineContext from deepsparse import Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine -from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs +from deepsparse.utils import model_to_path from deepsparse.v2.operators import Operator @@ -145,18 +145,6 @@ def run(self, inp: EngineOperatorInputs, **kwargs) -> Dict: # planned refactor engine_outputs = inp.engine(inp.engine_inputs) return {"engine_outputs": engine_outputs} - inp = inp.engine_inputs - batches, orig_batch_size = self.expand_inputs(engine_inputs=inp) - batches_outputs = list(map(self.engine, batches)) - engine_outputs = self.condense_inputs( - batch_outputs=batches_outputs, orig_batch_size=orig_batch_size - ) - return {"engine_outputs": engine_outputs} - def expand_inputs(self, **kwargs): - return split_engine_inputs(kwargs["engine_inputs"], self._batch_size) - - def condense_inputs(self, **kwargs): - batch_outputs = kwargs["batch_outputs"] - orig_batch_size = kwargs["orig_batch_size"] - return join_engine_outputs(batch_outputs, orig_batch_size) + engine_outputs = self.engine(inp.engine_inputs) + return {"engine_outputs": engine_outputs} diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index b3963d8223..5bb0be841a 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -99,7 +99,6 @@ def __call__( pipeline_state=pipeline_state, **kwargs, ) - if self.has_output_schema(): return self.output_schema(**run_output) return run_output @@ -117,18 +116,6 @@ def can_operate(self, inp: Any) -> bool: """ return True - def expand_inputs(self, **kwargs): - """ - Generic function to handle expanding values. - """ - raise NotImplementedError - - def condense_inputs(self, **kwargs): - """ - Generic function to handle condensing values. 
- """ - raise NotImplementedError - def yaml(self): pass diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 0a8c8b2f93..f56680d2b9 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -13,7 +13,10 @@ # limitations under the License. -from typing import Dict, List, Union +import copy +from concurrent.futures import Future +from functools import partial +from typing import Any, Callable, Dict, List, Union from deepsparse.v2.operators import Operator from deepsparse.v2.routers import Router @@ -56,9 +59,88 @@ def __init__( self.pipeline_state = pipeline_state self.validate() - # SchedulerGroup handles running all schedulers in order of priority self._scheduler_group = SchedulerGroup(self.schedulers) + def _run_sequential( + self, + inp: Any, + inference_state: InferenceState, + pipeline_state: PipelineState, + start: str, + end: str, + ): + next_step = start + while next_step != end: + outputs = self._run_next_step( + func=self.ops[next_step], + next_step=next_step, + input=inp, + pipeline_state=pipeline_state, + inference_state=inference_state, + ) + next_step, operator_output, state_update = outputs + if state_update: + inference_state.update_state(state_update) + inp = operator_output + return inp + + def _apply_split(self, inp: Any, inference_state: InferenceState): + """ + Split inputs using the pipeline's expand_inputs function. Inputs are split + into a batch size of one when a SPLIT_ROUTE node is found in a given pipeline's + provided router. The split batches are run asynchronously and then joined when + a JOIN_ROUTE node is found, using the pipeline's condense_inputs function. + """ + + batches, orig_batch_size = self.expand_inputs(inp, 1) + run_with_state = partial( + self._run_sequential, + pipeline_state=self.pipeline_state, + start=self.router.route[self.router.SPLIT_ROUTE], + end=self.router.JOIN_ROUTE, + ) + inference_state_list = [ + copy.deepcopy(inference_state) for x in range(len(batches)) + ] + futures = self._scheduler_group.map( + batches, + inference_state_list, + func=run_with_state, + ) + return self.condense_inputs([x.result() for x in futures]) + + def _run_next_step( + self, + *args, + func: Callable, + next_step: Union[str, int], + input: Any = None, + **kwargs, + ): + """ + Generic function to run a given func, process the output and determine the next + step. 
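+        :param func: callable to run for this step, either the operator itself
+            or the scheduler group's submit
+        :param next_step: name or index of the operator being run
+        :param input: output of the previous operator, if any; unpacked as
+            kwargs when it is a dictionary
+        :return: tuple of (next_step, operator_output, state_update), where
+            state_update is None if the operator did not return one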
+ """ + if input: + operator_output = ( + func(*args, **kwargs, **input) + if isinstance(input, dict) + else func(input, *args, **kwargs) + ) + else: + operator_output = func(*args, **kwargs) + + if isinstance(operator_output, Future): + operator_output = operator_output.result() + + state_update = None + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + + next_step = self.router.next(next_step, self.ops, operator_output) + return next_step, operator_output, state_update + def run( self, *args, @@ -78,40 +160,34 @@ def run( operator_output = None while next_step != self.router.END_ROUTE: - # Either a dictionary key or valid index - operator = self.ops[next_step] + # NOTE: split_route should only appear after the start route node + if next_step == self.router.SPLIT_ROUTE: + operator_output = self._apply_split(operator_output, inference_state) + next_step = self.router.route[self.router.JOIN_ROUTE] + if next_step == self.router.START_ROUTE: - output_future = self._scheduler_group.submit( + outputs = self._run_next_step( *args, + next_step=next_step, + func=self._scheduler_group.submit, inference_state=inference_state, - operator=operator, + operator=self.ops[next_step], pipeline_state=pipeline_state, **kwargs, ) else: - if isinstance(operator_output, dict): - output_future = self._scheduler_group.submit( - inference_state=inference_state, - operator=operator, - pipeline_state=pipeline_state, - **operator_output, - ) - else: - output_future = self._scheduler_group.submit( - operator_output, - inference_state=inference_state, - pipeline_state=pipeline_state, - operator=operator, - ) - - operator_output = output_future.result() - if isinstance(operator_output, tuple): - state_update = operator_output[-1] - operator_output = operator_output[0] - inference_state.update_state(state_update) - - next_step = self.router.next(next_step, self.ops, operator_output) + outputs = self._run_next_step( + func=self._scheduler_group.submit, + input=operator_output, + next_step=next_step, + inference_state=inference_state, + operator=self.ops[next_step], + pipeline_state=pipeline_state, + ) + next_step, operator_output, state_update = outputs + if state_update: + inference_state.update_state(state_update) return operator_output def __call__(self, *args, **kwargs): @@ -136,6 +212,27 @@ def __call__(self, *args, **kwargs): return self.run(*args, **kwargs) + def expand_inputs(self, *args, **kwargs): + """ + Generic function to handle expanding values. + """ + raise NotImplementedError( + "This function should be implemented for any router with split or join" + "nodes. expand_inputs will be called prior to the split node (stored in " + "the router's SPLIT_ROUTE attribute), expanding outputs for each output " + "such that there is a batch size of one per thread." + ) + + def condense_inputs(self, *args, **kwargs): + """ + Generic function to handle condensing values. + """ + raise NotImplementedError( + "This function should be implemented for any router with split or join " + "nodes. condense_inputs will be called after the join node (stored in the " + "router's JOIN_ROUTE attribute), condensing outputs from multiple threads." + ) + def validate(self): """ Validate that compatability of the router and operators provided. 
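For illustration, a minimal sketch of what these hooks look like on a concrete pipeline whose router uses SPLIT/JOIN nodes; the class name and the list-based inputs below are hypothetical, while TextGenerationPipeline (updated later in this patch) shows the real implementation over engine inputs.

from deepsparse.v2.pipeline import Pipeline


class SplittingPipeline(Pipeline):
    """Hypothetical pipeline whose GraphRouter contains SPLIT and JOIN nodes."""

    def expand_inputs(self, items, batch_size):
        # chunk the incoming list into batch_size-sized pieces; each chunk is
        # run on its own thread between the SPLIT and JOIN nodes
        batches = [
            items[i : i + batch_size] for i in range(0, len(items), batch_size)
        ]
        return batches, len(items)

    def condense_inputs(self, outputs):
        # flatten the per-batch results collected at the JOIN node
        return [item for batch in outputs for item in batch]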
diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index d1110d4ca7..1b70164002 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -41,9 +41,13 @@ def __init__( end_route: Union[str, int], start_route: Union[str, int], route: Optional[Dict] = None, + split_route: str = "SPLIT", + join_route: str = "JOIN", ): self.START_ROUTE = start_route self.END_ROUTE = end_route + self.SPLIT_ROUTE = split_route + self.JOIN_ROUTE = join_route self.route = route @abstractmethod @@ -79,6 +83,9 @@ class LinearRouter(Router): def __init__(self, end_route: int, start_route: int = 0): super().__init__(end_route=end_route, start_route=start_route) + self.SPLIT_ROUTE = None + self.JOIN_ROUTE = None + _LOGGER.warn("SPLIT and JOIN are not yet supported for the LinearRouter.") def next( self, past: int, ops: Optional[List[Operator]] = None, inp: Optional[Any] = None @@ -128,8 +135,10 @@ class GraphRouter(Router): where `can_operate` returns True will run. Paths should be deterministic. """ - def __init__(self, end_route: str, start_route: str, route: Dict): - super().__init__(end_route=end_route, start_route=start_route, route=route) + def __init__(self, end_route: str, start_route: str, route: Dict, **kwargs): + super().__init__( + end_route=end_route, start_route=start_route, route=route, **kwargs + ) def next( self, diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 78a58e3389..5313683107 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -14,6 +14,7 @@ from concurrent.futures import Future, ThreadPoolExecutor +from typing import Callable from deepsparse.v2.operators import Operator @@ -64,3 +65,13 @@ def can_process( Base OperatorScheduler always returns True """ return True + + def map(self, *args, func: Callable): + """ + :param func: generic callable run for each arg + :return: list of futures for each submit + """ + futures = [] + for _, values in enumerate(zip(*args)): + futures.append(self.submit(*values, operator=func)) + return futures diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 40b5695f22..14d869a0f2 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -55,23 +55,3 @@ def submit( operator=operator, **kwargs, ) - - def can_process( - self, - *args, - operator: Operator, - **kwargs, - ) -> bool: - """ - :param operator: operator to check - :return: True if this Operator can process the given operator and input. 
- SchedulerGroup always returns True - """ - return any( - scheduler.can_process( - *args, - operator=operator, - **kwargs, - ) - for scheduler in self.schedulers - ) diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 21cd7e2acd..08836b8bbe 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -17,6 +17,7 @@ from .compile_generations import * from .compile_logits import * from .generate_new_token import * +from .join_output import * from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py new file mode 100644 index 0000000000..8a6c77a2f1 --- /dev/null +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import numpy + +from deepsparse.transformers.utils.helpers import pad_to_fixed_length +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput + + +__all__ = ["JoinOutput"] + + +class JoinOutput(Operator): + """ + Run this operator to combine the results from multiple prompts. 
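+    Generated tokens and logits from each split batch are padded to a shared
+    length (tokens with the tokenizer's pad token id) and concatenated along
+    the batch dimension; the per-batch finished reasons are returned alongside.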
+ """ + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def run(self, inp: List[CompileGenerationsOutput], **kwargs): + batch_outputs = [x for x in inp[0]] + generated_tokens = [x.generated_tokens for x in batch_outputs] + generated_logits = [x.generated_logits for x in batch_outputs] + finished_reason = [x.finished_reason for x in batch_outputs] + + max_len = max(token.shape[1] for token in generated_tokens) + + # pad all tokens to the same length + tokens = [ + pad_to_fixed_length( + array=prediction, + max_len=max_len, + value=self.tokenizer.pad_token_id, + axis=1, + ) + for prediction in generated_tokens + ] + + # find the longest sequence in the batch of logits + max_len = max(logits.shape[1] for logits in generated_logits) + + # pad all logits to the same length + logits = [ + pad_to_fixed_length(array=single_logits, max_len=max_len, axis=1) + for single_logits in generated_logits + ] + + tokens = numpy.concatenate(tokens) + logits = numpy.concatenate(logits) + + return { + "generated_tokens": tokens, + "generated_logits": logits, + "finished_reason": finished_reason, + } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 49826b8af7..240da04907 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -15,6 +15,7 @@ from typing import Dict from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.utils import split_engine_inputs from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import OperatorScheduler @@ -24,6 +25,7 @@ CompileGenerations, CompilePromptLogits, GenerateNewTokenOperator, + JoinOutput, KVCacheCreator, MultiEnginePrefill, NLEngineOperator, @@ -131,6 +133,7 @@ def __init__( process_output = ProcessOutputs(tokenizer=self.tokenizer) compile_generations = CompileGenerations() compile_generated_tokens = CompileGeneratedTokens() + join_output = JoinOutput(tokenizer=self.tokenizer) ops = { "process_input": process_inputs, @@ -146,10 +149,12 @@ def __init__( "process_outputs": process_output, "compile_generations": compile_generations, "compile_generated_tokens": compile_generated_tokens, + "join_output": join_output, } routes = { - "process_input": "prepare_prefill", + "process_input": "SPLIT", + "SPLIT": "prepare_prefill", "prepare_prefill": ["multi_engine_prefill", "autoregressive_preprocess"], "multi_engine_prefill": "multi_engine", "multi_engine": "compile_logits", @@ -169,7 +174,9 @@ def __init__( "autoregressive_preprocess", "compile_generations", ], - "compile_generations": "process_outputs", + "compile_generations": "JOIN", + "JOIN": "join_output", + "join_output": "process_outputs", "process_outputs": "STOP", } @@ -181,6 +188,15 @@ def __init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) + def expand_inputs(self, items, batch_size): + items = [items.get(key) for key in items.keys()] + out, orig_batch_size = split_engine_inputs(items, batch_size) + combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] + return combined_batches, orig_batch_size + + def condense_inputs(self, *args, **kwargs): + return args[0], kwargs + # TODO: Move to be part of a generic transformers set-up Operator. 
def setup_onnx_file_path(self, model_path, sequence_length) -> str: import logging diff --git a/src/deepsparse/v2/text_generation/prep_for_prefill.py b/src/deepsparse/v2/text_generation/prep_for_prefill.py index 2f9eb15797..2e5fecb3e8 100644 --- a/src/deepsparse/v2/text_generation/prep_for_prefill.py +++ b/src/deepsparse/v2/text_generation/prep_for_prefill.py @@ -42,13 +42,20 @@ def __init__(self, kv_cache_creator: Operator): "from the NLEngineOperator" ) - def run(self, tokens: Any, pipeline_state: PipelineState, **kwargs): + def run( + self, + input_ids: Any, + attention_mask: Any, + pipeline_state: PipelineState, + **kwargs, + ): # NOTE: Can potentially just be class attributes instead of relying on # pipeline state. cache_shape = pipeline_state.current_state.get("cache_shape") data_type = pipeline_state.current_state.get("kv_cache_data_type") output_names = pipeline_state.current_state.get("output_names") + tokens = input_ids[attention_mask.nonzero()].tolist() kv_cache = self.kv_cache_creator.run( cache_shape=cache_shape, kv_cache_data_type=data_type, diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index e57e402983..5d47c8ff39 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -114,8 +114,7 @@ def run(self, inp: TextGenerationInput, **kwargs): frequency_penalty=generation_config.repetition_penalty, ) - # TODO: move this step to prep_for_prefill and add attention mask to the output - # this will allow us to split/join more easily when processing multiple prompts - # in parallel - tokens = input_ids[attention_mask.nonzero()].tolist() - return {"tokens": tokens}, inference_state_update + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + }, inference_state_update diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py index ca1cf78521..7173b8e256 100644 --- a/src/deepsparse/v2/text_generation/process_outputs.py +++ b/src/deepsparse/v2/text_generation/process_outputs.py @@ -22,7 +22,6 @@ TextGenerationOutput, ) from deepsparse.v2.operators import Operator -from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput from deepsparse.v2.utils import InferenceState @@ -52,19 +51,20 @@ def _create_generated_text_output( ) def run( - self, inp: CompileGenerationsOutput, inference_state: InferenceState, **kwargs + self, + generated_tokens: numpy.ndarray, + generated_logits: numpy.ndarray, + finished_reason: list, + inference_state: InferenceState, + **kwargs, ): generation_config = inference_state.current_state.get("generation_config") - generated_tokens = inp.generated_tokens - generated_logits = ( - inp.generated_logits if generation_config.output_scores else None - ) - finished_reason = inp.finished_reason + generated_logits = generated_logits if generation_config.output_scores else None sequences = self.tokenizer.batch_decode( generated_tokens, skip_special_tokens=True ) - finished_reason = [f for f in finished_reason if f] + finished_reason = [f[-1] for f in finished_reason] if generated_logits is not None: generations = list( @@ -79,6 +79,15 @@ def run( generations = list( map(self._create_generated_text_output, sequences, finished_reason) ) + + num_preds = generation_config.num_return_sequences + if num_preds > 1: + grouped_generations = [ + generations[n : n + num_preds] + for n in range(0, len(generations), num_preds) + ] + 
generations = grouped_generations + outputs = dict( created=datetime.datetime.now(), prompts=inference_state.current_state.get("prompts"), From a508342daee2ffb715379bc4307a5d752dcc4055 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Mon, 6 Nov 2023 17:44:56 -0500 Subject: [PATCH 12/57] unit testing for text generation operators --- .../v2/unit/test_text_generation.py | 326 ++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 tests/deepsparse/v2/unit/test_text_generation.py diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py new file mode 100644 index 0000000000..d54331162e --- /dev/null +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -0,0 +1,326 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + +import numpy + +import pytest +from deepsparse.v2.text_generation import TextGenerationPipeline +from deepsparse.transformers.utils.helpers import prepends_bos_token +from deepsparse.transformers.helpers import get_deployment_path +from transformers import AutoTokenizer +from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.v2.text_generation.process_inputs import GenerationDefaults +from deepsparse.v2.utils import InferenceState +from deepsparse.v2.text_generation import PrepareGeneration, TokenGeneratorOperator, InferenceState +import copy + + +@pytest.fixture +def text_generation_attributes(): + sequence_length = 5 + prompt_sequence_length = 2 + model_path = "hf:mgoin/TinyStories-1M-deepsparse" + deployment_path, model_path = get_deployment_path(model_path) + + tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=False, + model_max_length=sequence_length, + ) + + tokenizer.padding_side = "left" + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + return sequence_length, prompt_sequence_length, model_path, tokenizer + + +@pytest.fixture +def single_token_engine_no_internal_cache(text_generation_attributes): + from deepsparse.v2.text_generation import NLEngineOperator + seq_length, _, model_path, _ = text_generation_attributes + nl_engine_operator = NLEngineOperator( + sequence_length=seq_length, + input_ids_length=1, + model_path=model_path + ) + return nl_engine_operator + +@pytest.fixture +def pipeline_state(single_token_engine_no_internal_cache): + from deepsparse.v2.utils import PipelineState + + pipeline_state = PipelineState() + pipeline_state_vals = {} + pipeline_state_vals[ + "onnx_input_names_no_cache" + ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache + pipeline_state_vals["cache_shape"] = single_token_engine_no_internal_cache.cache_shape + pipeline_state_vals["output_names"] = single_token_engine_no_internal_cache.output_names + print(pipeline_state_vals) + pipeline_state_vals[ + "kv_cache_data_type" + ] = single_token_engine_no_internal_cache.kv_cache_data_type + 
pipeline_state.create_state(pipeline_state_vals) + return pipeline_state + +@pytest.fixture +def large_prompt(): + prompt = "Hello, how are you doing today?" + generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} + return TextGenerationInput(prompt=prompt, generation_config=generation_config) + +@pytest.fixture +def small_prompt(): + prompt = "Hello" + return TextGenerationInput(prompt=prompt) + +@pytest.fixture +def mock_kv_cache(): + from deepsparse.transformers.utils import DecoderKVCache + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + ) + return kv_cache + +@pytest.fixture +def mock_kv_cache_full(): + from deepsparse.transformers.utils import DecoderKVCache + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + num_processed_tokens=3 + ) + return kv_cache + +""" +@pytest.fixture +def mock_kv_cache_engine(pipeline_state): + from deepsparse.transformers.utils import DecoderKVCache + kv_cache = DecoderKVCache() + kv_cache_state = initialize_kv_cache_state( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + length=self.sequence_length - self.prompt_sequence_length, + empty=bool(self.internal_kv_cache), + ) + print(state) + return kv_cache +""" + +@pytest.fixture +def mock_tokens(): + return [15496] + +@pytest.fixture +def mock_tokens_multiple(): + return [15496, 15496, 15496] + +@pytest.fixture +def mock_inference_state(): + generation_config = GenerationDefaults() + inference_state = InferenceState() + inference_state.create_state({}) + inference_state.update_state({ + "generation_config": generation_config}) + return inference_state + +@pytest.fixture +def mock_token_generator(text_generation_attributes, mock_tokens_multiple): + _, _, _, tokenizer = text_generation_attributes + token_generator_creator = TokenGeneratorOperator() + prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) + token_generator_creator_output = token_generator_creator.run( + logits_shape=prompt_logits[0, -1, :].shape, + deterministic=True, + sampling_temperature=1.0, + tokens=copy.copy(mock_tokens_multiple), + ) + return token_generator_creator_output.get("token_generator") + +@pytest.fixture +def mock_logits(text_generation_attributes): + _, _, _, tokenizer = text_generation_attributes + return numpy.random.rand(1, 1, len(tokenizer)) + + +def test_process_inputs(text_generation_attributes, small_prompt, large_prompt): + sequence_length, _, _, tokenizer = text_generation_attributes + from deepsparse.v2.text_generation.process_inputs import ProcessInputsTextGeneration + process_inputs = ProcessInputsTextGeneration( + sequence_length=sequence_length, + tokenizer=tokenizer + ) + + outputs, state_update = process_inputs.run(small_prompt) + assert len(outputs.get("tokens")) == 1 + assert isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get("prompts") == small_prompt.sequences + + outputs, state_update = process_inputs.run(large_prompt) + + assert not isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get("generation_config").max_length == large_prompt.generation_config.get("max_length") + assert outputs.get("tokens") + assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") + + +def 
test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): + assert single_token_engine_no_internal_cache.input_ids_length == 1 + +def test_kv_cache_creation(pipeline_state, text_generation_attributes): + from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput + seq_length, prompt_sequence_length, model_path, tokenizer = text_generation_attributes + kv_cache_creator = KVCacheCreator( + tokenizer=tokenizer, + prompt_sequence_length=prompt_sequence_length, + sequence_length=seq_length, + internal_kv_cache=False + ) + + assert kv_cache_creator.input_schema == KVCacheCreatorInput + kv_cache = kv_cache_creator.run( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names") + ) + assert kv_cache.get("kv_cache") + assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 + + +def test_autoreg_preproces_can_run(text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache): + seq_len, prompt_seq_len, _, _ = text_generation_attributes + from deepsparse.v2.text_generation.autoregressive_preprocess_operator import AutoRegressiveOperatorPreprocess + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, + prompt_sequence_length=prompt_seq_len + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + + assert autoreg_prep.can_operate(inputs) + outputs = autoreg_prep.run( + tokens=mock_tokens, + kv_cache=mock_kv_cache, + pipeline_state=pipeline_state + ) + + assert len(outputs.get("engine_inputs")) == 4 # tokens, attention mask, causal, positions + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + print(outputs.get("engine_inputs")) + assert tokens.shape[-1] == 1 + assert attention_mask.shape[-1] == seq_len + assert positions[0] == mock_kv_cache.total_num_processed_tokens + assert outputs.get("in_generation") is None + +def test_autoreg_preproces_cant_run(text_generation_attributes, mock_kv_cache, mock_tokens_multiple): + seq_len, _, _, _ = text_generation_attributes + from deepsparse.v2.text_generation.autoregressive_preprocess_operator import AutoRegressiveOperatorPreprocess + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, + prompt_sequence_length=2 + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + assert not autoreg_prep.can_operate(inputs) + +def test_mult_engine_preprocess(text_generation_attributes, mock_kv_cache, mock_tokens_multiple, pipeline_state): + seq_len, prompt_seq_len, _, _ = text_generation_attributes + from deepsparse.v2.text_generation.multi_engine_prefill_operator import MultiEnginePrefill + multi_prep = MultiEnginePrefill( + sequence_length=seq_len, + prompt_sequence_length=prompt_seq_len + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + assert multi_prep.can_operate(inputs) + outputs = multi_prep.run(tokens=mock_tokens_multiple, kv_cache=mock_kv_cache, pipeline_state=pipeline_state) + assert len(outputs.get("engine_inputs")) == 4 # tokens, attention mask, causal, positions + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + assert tokens.shape[-1] == prompt_seq_len + assert attention_mask.shape[-1] == seq_len + assert positions.shape[-1] == prompt_seq_len + +def test_multi_engine_preprocess_cant_operate(text_generation_attributes, mock_kv_cache, mock_tokens): + seq_len, prompt_seq_len, _, _ = 
text_generation_attributes + from deepsparse.v2.text_generation.multi_engine_prefill_operator import MultiEnginePrefill + multi_prep = MultiEnginePrefill( + sequence_length=seq_len, + prompt_sequence_length=prompt_seq_len + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + assert not multi_prep.can_operate(inputs) + +""" +def test_run_single_engine_once(single_token_engine_no_internal_cache, mock_kv_cache_engine): + from deepsparse.v2.text_generation.nl_engine_operator import NlEngineInput + + mock_engine_inputs = [numpy.array([[15496]]), numpy.array([[0, 0, 0, 0, 1]]), numpy.array([[0]]), numpy.array([[[[0, 0, 0, 0, 1]]]])] + inputs = NlEngineInput( + engine_inputs=mock_engine_inputs, + kv_cache=mock_kv_cache_engine, + tokens=mock_engine_inputs[0].tolist() + ) + print(single_token_engine_no_internal_cache.run(inputs)) +""" + +def test_prep_for_generation(mock_tokens_multiple, mock_kv_cache_full, text_generation_attributes, mock_inference_state): + seq_len, prompt_seq_len, _, tokenizer = text_generation_attributes + prep_for_generation = PrepareGeneration( + token_generator=TokenGeneratorOperator(), + sequence_length=seq_len, + prompt_sequence_length=prompt_seq_len + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache_full} + assert prep_for_generation.can_operate(inputs) + + prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] + mock_inference_state.update_state({"prompt_logits": prompt_logits}) + outputs, state = prep_for_generation.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache, + inference_state=mock_inference_state + ) + assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 + assert outputs.get("in_generation") + assert numpy.array_equal(state.get("generated_logits")[0], numpy.expand_dims(prompt_logits[0][:, -1, :], 0)) + +def test_generate_new_token(mock_token_generator, text_generation_attributes, mock_kv_cache, mock_inference_state, mock_logits, mock_tokens): + _, _, _, tokenizer = text_generation_attributes + from deepsparse.v2.text_generation import GenerateNewTokenOperator + generate_new_token = GenerateNewTokenOperator( + force_max_tokens=False, + tokenizer=tokenizer + ) + mock_inference_state.update_state({"token_generator": mock_token_generator, "generated_tokens": [mock_token_generator.tokens]}) + outputs, state = generate_new_token.run( + logits=mock_logits, + kv_cache=mock_kv_cache, + inference_state=mock_inference_state + ) + assert outputs.get("new_token") == state.get("token_generator").tokens[-1] + + +def test_compile_logits(mock_logits, mock_inference_state): + from deepsparse.v2.text_generation import CompilePromptLogits + mock_inference_state.update_state({"prompt_logits": [mock_logits]}) + compile_prompt_logits = CompilePromptLogits() + assert compile_prompt_logits.can_operate({}) + output, state = compile_prompt_logits.run( + logits=mock_logits, + inference_state=mock_inference_state + ) + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 + print(state.get("prompt_logits")) \ No newline at end of file From cbb0e86f5d4e588e8afa1f873c3411ef29d7ed2f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 11:05:13 -0500 Subject: [PATCH 13/57] additional changes --- src/deepsparse/v2/text_generation/kv_cache_operator.py | 3 ++- src/deepsparse/v2/text_generation/prep_for_generation.py | 4 ++-- src/deepsparse/v2/text_generation/process_inputs.py | 7 ++++--- tests/deepsparse/v2/unit/test_text_generation.py | 3 +-- 4 files changed, 9 insertions(+), 8 
deletions(-) diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py index 0b232402b3..5811f44b32 100644 --- a/src/deepsparse/v2/text_generation/kv_cache_operator.py +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -24,7 +24,7 @@ from deepsparse.v2.operators import Operator -__all__ = ["KVCacheCreator"] +__all__ = ["KVCacheCreator", "KVCacheCreatorInput"] class KVCacheCreatorOutput(BaseModel): @@ -61,6 +61,7 @@ def run(self, cache_shape, kv_cache_data_type: str, output_names: list, **kwargs length=self.sequence_length - self.prompt_sequence_length, empty=bool(self.internal_kv_cache), ) + print(kv_cache_state.get("past_key_values.0.key").shape) kv_cache = DecoderKVCache(self.internal_kv_cache) kv_cache.setup( diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 544af43980..887f81e173 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -14,6 +14,7 @@ from typing import Any import numpy +import copy from deepsparse.transformers.pipelines.text_generation import FinishReason from deepsparse.v2.operators import Operator @@ -107,7 +108,7 @@ def run( logits_shape=prompt_logits[0, -1, :].shape, deterministic=not generation_config.do_sample, sampling_temperature=generation_config.temperature, - tokens=tokens, + tokens=copy.copy(tokens), **inference_state.current_state, ) token_generator = token_generator_creator_output.get("token_generator") @@ -131,7 +132,6 @@ def run( "finished_reason": [], "token_generator": token_generator, } - output = { "tokens": token_generator.tokens, "kv_cache": kv_cache, diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 5d47c8ff39..31f5aa0504 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -28,7 +28,7 @@ class GenerationDefaults: num_return_sequences = 1 - max_length = 100 + max_length = 10 max_new_tokens = None output_scores = False top_k = 0 @@ -54,10 +54,11 @@ class ProcessInputsTextGeneration(Operator): def __init__( self, tokenizer: transformers.PreTrainedTokenizerBase, + sequence_length: int, generation_config: Union[ str, pathlib.Path, Dict, transformers.GenerationConfig - ], - sequence_length: int, + ] = None, + ): self.generation_config = generation_config self.tokenizer = tokenizer diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index d54331162e..f33776e010 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -322,5 +322,4 @@ def test_compile_logits(mock_logits, mock_inference_state): logits=mock_logits, inference_state=mock_inference_state ) - assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 - print(state.get("prompt_logits")) \ No newline at end of file + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 \ No newline at end of file From 254158162868709290aa944ed3fdb090a43a431c Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 21:27:00 -0500 Subject: [PATCH 14/57] unit testing completion --- .../v2/text_generation/nl_engine_operator.py | 2 +- src/deepsparse/v2/text_generation/pipeline.py | 1 - .../v2/text_generation/prep_for_generation.py | 48 +-- .../v2/text_generation/process_inputs.py | 7 
+- .../v2/unit/test_text_generation.py | 283 +++++++++++------- 5 files changed, 178 insertions(+), 163 deletions(-) diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 0bd9098a40..7549f986d9 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -29,7 +29,7 @@ ) -__all__ = ["NLEngineOperator"] +__all__ = ["NLEngineOperator", "NlEngineInput"] class NlEngineInput(BaseModel): diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 240da04907..1c2972859b 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -124,7 +124,6 @@ def __init__( token_generator = TokenGeneratorOperator() prep_for_generation = PrepareGeneration( sequence_length=sequence_length, - prompt_sequence_length=prompt_sequence_length, token_generator=token_generator, ) generate_new_token = GenerateNewTokenOperator( diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 887f81e173..75f4aa9db2 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy from typing import Any import numpy -import copy from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.transformers.utils.helpers import set_generated_length from deepsparse.v2.operators import Operator from deepsparse.v2.text_generation import TokenGeneratorOperator from deepsparse.v2.utils import InferenceState @@ -29,10 +30,8 @@ class PrepareGeneration(Operator): def __init__( self, token_generator: TokenGeneratorOperator, - prompt_sequence_length: int, sequence_length: int, ): - self.prompt_sequence_length = prompt_sequence_length self.sequence_length = sequence_length self.token_generator_creator = token_generator @@ -48,49 +47,6 @@ def can_operate(self, inp: Any): return True return False - @staticmethod - def set_generated_length( - max_length: int, - prompt_tokens_length: int, - sequence_length: int, - prompt_sequence_length: int, - max_new_tokens: int, - finish_reason_choices: "FinishReason", # noqa - ): - """ - Determine the length of the generated tokens. The hard cap on the total number - of tokens is based on the sequence length. If max_length is provided and is less - than the sequence length, it will be used to cap the total number of tokens - generated. If it is not provided, the max_new_tokens attribute will be used and - also capped by the sequence length. 
- - :param max_length: max_length attribute, provided as input during inference - :param prompt_tokens_length: the number of prompt tokens used as part of the - generated output - :param sequence_length: the sequence length used for the pipeline - :param prompt_sequence_length: the prompt sequence length used for the pipeline - :param max_new_tokens: the max_new_tokens attribute, which may be provided - as part of the input during inference - """ - if max_length: - # if max_length provided, use that to cap total tokens generated - max_tokens = max_length - finish_reason = finish_reason_choices.LENGTH - else: - # if not provided, max tokens is based on max_new_tokens + prompt tokens - max_tokens = ( - min(max_new_tokens, sequence_length - prompt_sequence_length) - + prompt_tokens_length - ) - finish_reason = finish_reason_choices.MAX_NEW_TOKENS - - # hard model/pipeline cap - return ( - (sequence_length, finish_reason_choices.CAPACITY) - if sequence_length < max_tokens - else (max_tokens, finish_reason) - ) - def run( self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs ): diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 31f5aa0504..059ed06f14 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -26,6 +26,9 @@ from deepsparse.v2.operators import Operator +__all__ = ["ProcessInputsTextGeneration", "GenerationDefaults"] + + class GenerationDefaults: num_return_sequences = 1 max_length = 10 @@ -38,9 +41,6 @@ class GenerationDefaults: temperature = 1.0 -__all__ = ["ProcessInputsTextGeneration"] - - class ProcessInputsTextGeneration(Operator): """ Input processing operator. Responsible for tokenizing the input, handling the @@ -58,7 +58,6 @@ def __init__( generation_config: Union[ str, pathlib.Path, Dict, transformers.GenerationConfig ] = None, - ): self.generation_config = generation_config self.tokenizer = tokenizer diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index f33776e010..410bcffdd1 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -12,27 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import inspect +import copy import numpy +from transformers import AutoTokenizer import pytest -from deepsparse.v2.text_generation import TextGenerationPipeline -from deepsparse.transformers.utils.helpers import prepends_bos_token from deepsparse.transformers.helpers import get_deployment_path -from transformers import AutoTokenizer from deepsparse.transformers.pipelines.text_generation import TextGenerationInput -from deepsparse.v2.text_generation.process_inputs import GenerationDefaults -from deepsparse.v2.utils import InferenceState -from deepsparse.v2.text_generation import PrepareGeneration, TokenGeneratorOperator, InferenceState -import copy +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import initialize_kv_cache_state +from deepsparse.v2 import InferenceState, PipelineState +from deepsparse.v2.text_generation import ( + AutoRegressiveOperatorPreprocess, + CompilePromptLogits, + GenerateNewTokenOperator, + GenerationDefaults, + KVCacheCreator, + KVCacheCreatorInput, + MultiEnginePrefill, + NlEngineInput, + NLEngineOperator, + PrepareGeneration, + ProcessInputsTextGeneration, + TokenGeneratorOperator, +) @pytest.fixture def text_generation_attributes(): sequence_length = 5 - prompt_sequence_length = 2 + prompt_sequence_length = 1 + return sequence_length, prompt_sequence_length + + +@pytest.fixture +def model_attributes(text_generation_attributes): model_path = "hf:mgoin/TinyStories-1M-deepsparse" + sequence_length, prompt_sequence_length = text_generation_attributes deployment_path, model_path = get_deployment_path(model_path) tokenizer = AutoTokenizer.from_pretrained( @@ -45,104 +62,109 @@ def text_generation_attributes(): if not tokenizer.pad_token: tokenizer.pad_token = tokenizer.eos_token - return sequence_length, prompt_sequence_length, model_path, tokenizer + return tokenizer, model_path @pytest.fixture -def single_token_engine_no_internal_cache(text_generation_attributes): - from deepsparse.v2.text_generation import NLEngineOperator - seq_length, _, model_path, _ = text_generation_attributes +def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes): + seq_length, _ = text_generation_attributes + _, model_path = model_attributes + nl_engine_operator = NLEngineOperator( - sequence_length=seq_length, - input_ids_length=1, - model_path=model_path + sequence_length=seq_length, input_ids_length=1, model_path=model_path ) return nl_engine_operator + @pytest.fixture def pipeline_state(single_token_engine_no_internal_cache): - from deepsparse.v2.utils import PipelineState - pipeline_state = PipelineState() pipeline_state_vals = {} pipeline_state_vals[ "onnx_input_names_no_cache" ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache - pipeline_state_vals["cache_shape"] = single_token_engine_no_internal_cache.cache_shape - pipeline_state_vals["output_names"] = single_token_engine_no_internal_cache.output_names - print(pipeline_state_vals) + pipeline_state_vals[ + "cache_shape" + ] = single_token_engine_no_internal_cache.cache_shape + pipeline_state_vals[ + "output_names" + ] = single_token_engine_no_internal_cache.output_names pipeline_state_vals[ "kv_cache_data_type" ] = single_token_engine_no_internal_cache.kv_cache_data_type pipeline_state.create_state(pipeline_state_vals) return pipeline_state + @pytest.fixture def large_prompt(): prompt = "Hello, how are you doing today?" 
generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} return TextGenerationInput(prompt=prompt, generation_config=generation_config) + @pytest.fixture def small_prompt(): prompt = "Hello" return TextGenerationInput(prompt=prompt) + @pytest.fixture def mock_kv_cache(): - from deepsparse.transformers.utils import DecoderKVCache kv_cache = DecoderKVCache() kv_cache.setup( state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, ) return kv_cache + @pytest.fixture def mock_kv_cache_full(): - from deepsparse.transformers.utils import DecoderKVCache kv_cache = DecoderKVCache() kv_cache.setup( state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, - num_processed_tokens=3 + num_processed_tokens=3, ) return kv_cache -""" + @pytest.fixture -def mock_kv_cache_engine(pipeline_state): - from deepsparse.transformers.utils import DecoderKVCache +def mock_kv_cache_engine(pipeline_state, text_generation_attributes): + seq_len, _ = text_generation_attributes kv_cache = DecoderKVCache() kv_cache_state = initialize_kv_cache_state( cache_shape=pipeline_state.current_state.get("cache_shape"), kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), output_names=pipeline_state.current_state.get("output_names"), - length=self.sequence_length - self.prompt_sequence_length, - empty=bool(self.internal_kv_cache), + length=seq_len - 1, + empty=False, ) - print(state) + kv_cache.setup(state=kv_cache_state) return kv_cache -""" + @pytest.fixture def mock_tokens(): return [15496] + @pytest.fixture def mock_tokens_multiple(): return [15496, 15496, 15496] + @pytest.fixture def mock_inference_state(): generation_config = GenerationDefaults() inference_state = InferenceState() inference_state.create_state({}) - inference_state.update_state({ - "generation_config": generation_config}) + inference_state.update_state({"generation_config": generation_config}) return inference_state + @pytest.fixture -def mock_token_generator(text_generation_attributes, mock_tokens_multiple): - _, _, _, tokenizer = text_generation_attributes +def mock_token_generator(model_attributes, mock_tokens_multiple): + tokenizer, _ = model_attributes token_generator_creator = TokenGeneratorOperator() prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) token_generator_creator_output = token_generator_creator.run( @@ -153,18 +175,20 @@ def mock_token_generator(text_generation_attributes, mock_tokens_multiple): ) return token_generator_creator_output.get("token_generator") + @pytest.fixture -def mock_logits(text_generation_attributes): - _, _, _, tokenizer = text_generation_attributes +def mock_logits(model_attributes): + tokenizer, _ = model_attributes return numpy.random.rand(1, 1, len(tokenizer)) -def test_process_inputs(text_generation_attributes, small_prompt, large_prompt): - sequence_length, _, _, tokenizer = text_generation_attributes - from deepsparse.v2.text_generation.process_inputs import ProcessInputsTextGeneration +def test_process_inputs( + text_generation_attributes, model_attributes, small_prompt, large_prompt +): + sequence_length, _ = text_generation_attributes + tokenizer, _ = model_attributes process_inputs = ProcessInputsTextGeneration( - sequence_length=sequence_length, - tokenizer=tokenizer + sequence_length=sequence_length, tokenizer=tokenizer ) outputs, state_update = process_inputs.run(small_prompt) @@ -173,115 +197,142 @@ def test_process_inputs(text_generation_attributes, small_prompt, large_prompt): assert state_update.get("prompts") == 
small_prompt.sequences outputs, state_update = process_inputs.run(large_prompt) - + assert not isinstance(state_update.get("generation_config"), GenerationDefaults) - assert state_update.get("generation_config").max_length == large_prompt.generation_config.get("max_length") + assert state_update.get( + "generation_config" + ).max_length == large_prompt.generation_config.get("max_length") assert outputs.get("tokens") assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): assert single_token_engine_no_internal_cache.input_ids_length == 1 - -def test_kv_cache_creation(pipeline_state, text_generation_attributes): - from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput - seq_length, prompt_sequence_length, model_path, tokenizer = text_generation_attributes + + +def test_kv_cache_creation( + pipeline_state, text_generation_attributes, model_attributes +): + seq_length, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes kv_cache_creator = KVCacheCreator( tokenizer=tokenizer, - prompt_sequence_length=prompt_sequence_length, + prompt_sequence_length=prompt_seq_len, sequence_length=seq_length, - internal_kv_cache=False + internal_kv_cache=False, ) - + assert kv_cache_creator.input_schema == KVCacheCreatorInput kv_cache = kv_cache_creator.run( cache_shape=pipeline_state.current_state.get("cache_shape"), kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), - output_names=pipeline_state.current_state.get("output_names") + output_names=pipeline_state.current_state.get("output_names"), ) assert kv_cache.get("kv_cache") assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 -def test_autoreg_preproces_can_run(text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache): - seq_len, prompt_seq_len, _, _ = text_generation_attributes - from deepsparse.v2.text_generation.autoregressive_preprocess_operator import AutoRegressiveOperatorPreprocess +def test_autoreg_preproces_can_run( + text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache +): + seq_len, _ = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( - sequence_length=seq_len, - prompt_sequence_length=prompt_seq_len + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} assert autoreg_prep.can_operate(inputs) outputs = autoreg_prep.run( - tokens=mock_tokens, - kv_cache=mock_kv_cache, - pipeline_state=pipeline_state + tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state ) - assert len(outputs.get("engine_inputs")) == 4 # tokens, attention mask, causal, positions + assert ( + len(outputs.get("engine_inputs")) == 4 + ) # tokens, attention mask, causal, positions tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - print(outputs.get("engine_inputs")) + assert tokens.shape[-1] == 1 assert attention_mask.shape[-1] == seq_len assert positions[0] == mock_kv_cache.total_num_processed_tokens assert outputs.get("in_generation") is None -def test_autoreg_preproces_cant_run(text_generation_attributes, mock_kv_cache, mock_tokens_multiple): - seq_len, _, _, _ = text_generation_attributes - from deepsparse.v2.text_generation.autoregressive_preprocess_operator import AutoRegressiveOperatorPreprocess + +def test_autoreg_preproces_cant_run( + text_generation_attributes, mock_kv_cache, mock_tokens_multiple +): + 
seq_len, _ = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( - sequence_length=seq_len, - prompt_sequence_length=2 + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} assert not autoreg_prep.can_operate(inputs) - -def test_mult_engine_preprocess(text_generation_attributes, mock_kv_cache, mock_tokens_multiple, pipeline_state): - seq_len, prompt_seq_len, _, _ = text_generation_attributes - from deepsparse.v2.text_generation.multi_engine_prefill_operator import MultiEnginePrefill + + +def test_mult_engine_preprocess( + text_generation_attributes, mock_kv_cache, mock_tokens_multiple, pipeline_state +): + seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( - sequence_length=seq_len, - prompt_sequence_length=prompt_seq_len + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} assert multi_prep.can_operate(inputs) - outputs = multi_prep.run(tokens=mock_tokens_multiple, kv_cache=mock_kv_cache, pipeline_state=pipeline_state) - assert len(outputs.get("engine_inputs")) == 4 # tokens, attention mask, causal, positions + outputs = multi_prep.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache, + pipeline_state=pipeline_state, + ) + assert ( + len(outputs.get("engine_inputs")) == 4 + ) # tokens, attention mask, causal, positions tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - assert tokens.shape[-1] == prompt_seq_len + assert tokens.shape[-1] == len(mock_tokens_multiple) assert attention_mask.shape[-1] == seq_len - assert positions.shape[-1] == prompt_seq_len + assert positions.shape[-1] == len(mock_tokens_multiple) -def test_multi_engine_preprocess_cant_operate(text_generation_attributes, mock_kv_cache, mock_tokens): - seq_len, prompt_seq_len, _, _ = text_generation_attributes - from deepsparse.v2.text_generation.multi_engine_prefill_operator import MultiEnginePrefill + +def test_multi_engine_preprocess_cant_operate( + text_generation_attributes, mock_kv_cache, mock_tokens +): + seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( - sequence_length=seq_len, - prompt_sequence_length=prompt_seq_len + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} assert not multi_prep.can_operate(inputs) -""" -def test_run_single_engine_once(single_token_engine_no_internal_cache, mock_kv_cache_engine): - from deepsparse.v2.text_generation.nl_engine_operator import NlEngineInput - mock_engine_inputs = [numpy.array([[15496]]), numpy.array([[0, 0, 0, 0, 1]]), numpy.array([[0]]), numpy.array([[[[0, 0, 0, 0, 1]]]])] +def test_run_single_engine_once( + single_token_engine_no_internal_cache, + mock_kv_cache_engine, +): + + mock_engine_inputs = [ + numpy.array([[15496]]), + numpy.array([[0, 0, 0, 0, 1]]), + numpy.array([[0]]), + numpy.array([[[[0, 0, 0, 0, 1]]]]), + ] inputs = NlEngineInput( engine_inputs=mock_engine_inputs, kv_cache=mock_kv_cache_engine, - tokens=mock_engine_inputs[0].tolist() + tokens=mock_engine_inputs[0].tolist(), ) - print(single_token_engine_no_internal_cache.run(inputs)) -""" - -def test_prep_for_generation(mock_tokens_multiple, mock_kv_cache_full, text_generation_attributes, mock_inference_state): - seq_len, prompt_seq_len, _, tokenizer = text_generation_attributes + output = single_token_engine_no_internal_cache.run(inputs) + 
assert output + + +def test_prep_for_generation( + mock_tokens_multiple, + mock_kv_cache_full, + text_generation_attributes, + mock_inference_state, + model_attributes, +): + seq_len, _ = text_generation_attributes + tokenizer, _ = model_attributes prep_for_generation = PrepareGeneration( - token_generator=TokenGeneratorOperator(), - sequence_length=seq_len, - prompt_sequence_length=prompt_seq_len + token_generator=TokenGeneratorOperator(), sequence_length=seq_len ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache_full} assert prep_for_generation.can_operate(inputs) @@ -291,35 +342,45 @@ def test_prep_for_generation(mock_tokens_multiple, mock_kv_cache_full, text_gene outputs, state = prep_for_generation.run( tokens=mock_tokens_multiple, kv_cache=mock_kv_cache, - inference_state=mock_inference_state - ) + inference_state=mock_inference_state, + ) assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 assert outputs.get("in_generation") - assert numpy.array_equal(state.get("generated_logits")[0], numpy.expand_dims(prompt_logits[0][:, -1, :], 0)) + assert numpy.array_equal( + state.get("generated_logits")[0], + numpy.expand_dims(prompt_logits[0][:, -1, :], 0), + ) + -def test_generate_new_token(mock_token_generator, text_generation_attributes, mock_kv_cache, mock_inference_state, mock_logits, mock_tokens): - _, _, _, tokenizer = text_generation_attributes - from deepsparse.v2.text_generation import GenerateNewTokenOperator +def test_generate_new_token( + mock_token_generator, + model_attributes, + mock_kv_cache, + mock_inference_state, + mock_logits, + mock_tokens, +): + tokenizer, _ = model_attributes generate_new_token = GenerateNewTokenOperator( - force_max_tokens=False, - tokenizer=tokenizer + force_max_tokens=False, tokenizer=tokenizer + ) + mock_inference_state.update_state( + { + "token_generator": mock_token_generator, + "generated_tokens": [mock_token_generator.tokens], + } ) - mock_inference_state.update_state({"token_generator": mock_token_generator, "generated_tokens": [mock_token_generator.tokens]}) outputs, state = generate_new_token.run( - logits=mock_logits, - kv_cache=mock_kv_cache, - inference_state=mock_inference_state + logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state ) assert outputs.get("new_token") == state.get("token_generator").tokens[-1] def test_compile_logits(mock_logits, mock_inference_state): - from deepsparse.v2.text_generation import CompilePromptLogits mock_inference_state.update_state({"prompt_logits": [mock_logits]}) compile_prompt_logits = CompilePromptLogits() assert compile_prompt_logits.can_operate({}) output, state = compile_prompt_logits.run( - logits=mock_logits, - inference_state=mock_inference_state + logits=mock_logits, inference_state=mock_inference_state ) - assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 \ No newline at end of file + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 From 8c8989d03655efe15023432db21dd21023a14f43 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 21:29:01 -0500 Subject: [PATCH 15/57] remove debug --- src/deepsparse/v2/text_generation/process_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 059ed06f14..214b8526e3 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -31,7 +31,7 @@ class 
GenerationDefaults: num_return_sequences = 1 - max_length = 10 + max_length = 100 max_new_tokens = None output_scores = False top_k = 0 From f8d75e3fdbd5eac3b8cd041beac82d0339058ed2 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 21:41:11 -0500 Subject: [PATCH 16/57] fix --- src/deepsparse/v2/text_generation/pipeline.py | 1 + .../v2/text_generation/prep_for_generation.py | 4 +++- .../deepsparse/v2/unit/test_text_generation.py | 17 +++++++++-------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 1c2972859b..240da04907 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -124,6 +124,7 @@ def __init__( token_generator = TokenGeneratorOperator() prep_for_generation = PrepareGeneration( sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, token_generator=token_generator, ) generate_new_token = GenerateNewTokenOperator( diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 75f4aa9db2..0ea4a06a02 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -30,10 +30,12 @@ class PrepareGeneration(Operator): def __init__( self, token_generator: TokenGeneratorOperator, + prompt_sequence_length: int, sequence_length: int, ): self.sequence_length = sequence_length self.token_generator_creator = token_generator + self.prompt_sequence_length = prompt_sequence_length def can_operate(self, inp: Any): kv_cache = inp.get("kv_cache") @@ -70,7 +72,7 @@ def run( token_generator = token_generator_creator_output.get("token_generator") token_generator.generate(prompt_logits[0, -1, :]) - max_tokens, length_finish_reason = PrepareGeneration.set_generated_length( + max_tokens, length_finish_reason = set_generated_length( max_length=generation_config.max_length, prompt_tokens_length=1, max_new_tokens=generation_config.max_new_tokens, diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index 410bcffdd1..97ed4fef95 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -211,7 +211,7 @@ def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cach def test_kv_cache_creation( - pipeline_state, text_generation_attributes, model_attributes + text_generation_attributes, model_attributes, pipeline_state ): seq_length, prompt_seq_len = text_generation_attributes tokenizer, _ = model_attributes @@ -269,7 +269,7 @@ def test_autoreg_preproces_cant_run( def test_mult_engine_preprocess( - text_generation_attributes, mock_kv_cache, mock_tokens_multiple, pipeline_state + text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple ): seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( @@ -323,16 +323,18 @@ def test_run_single_engine_once( def test_prep_for_generation( + text_generation_attributes, + model_attributes, mock_tokens_multiple, mock_kv_cache_full, - text_generation_attributes, mock_inference_state, - model_attributes, ): - seq_len, _ = text_generation_attributes + seq_len, prompt_seq_len = text_generation_attributes tokenizer, _ = model_attributes prep_for_generation = PrepareGeneration( - token_generator=TokenGeneratorOperator(), sequence_length=seq_len + 
prompt_sequence_length=prompt_seq_len, + token_generator=TokenGeneratorOperator(), + sequence_length=seq_len, ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache_full} assert prep_for_generation.can_operate(inputs) @@ -353,12 +355,11 @@ def test_prep_for_generation( def test_generate_new_token( - mock_token_generator, model_attributes, + mock_token_generator, mock_kv_cache, mock_inference_state, mock_logits, - mock_tokens, ): tokenizer, _ = model_attributes generate_new_token = GenerateNewTokenOperator( From fd1e466363f8fb0162640ae5f08aef964e58c084 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 7 Nov 2023 21:45:16 -0500 Subject: [PATCH 17/57] add todo --- src/deepsparse/v2/routers/router.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 1b70164002..6b0d851aef 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -158,4 +158,5 @@ def next( @staticmethod def validate(ops) -> bool: + # TODO: still needs to be implemented for the GraphRouter pass From 64c055266a50dc6ee65ef897783195f2006f943e Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 8 Nov 2023 09:58:53 -0500 Subject: [PATCH 18/57] more clean-up --- .../v2/text_generation/kv_cache_operator.py | 1 - .../deepsparse/v2/unit/test_text_generation.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py index 5811f44b32..3c15d0ff5a 100644 --- a/src/deepsparse/v2/text_generation/kv_cache_operator.py +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -61,7 +61,6 @@ def run(self, cache_shape, kv_cache_data_type: str, output_names: list, **kwargs length=self.sequence_length - self.prompt_sequence_length, empty=bool(self.internal_kv_cache), ) - print(kv_cache_state.get("past_key_values.0.key").shape) kv_cache = DecoderKVCache(self.internal_kv_cache) kv_cache.setup( diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index 97ed4fef95..0d0c4ef3be 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -49,7 +49,7 @@ def text_generation_attributes(): @pytest.fixture def model_attributes(text_generation_attributes): model_path = "hf:mgoin/TinyStories-1M-deepsparse" - sequence_length, prompt_sequence_length = text_generation_attributes + sequence_length, _ = text_generation_attributes deployment_path, model_path = get_deployment_path(model_path) tokenizer = AutoTokenizer.from_pretrained( @@ -129,14 +129,14 @@ def mock_kv_cache_full(): @pytest.fixture -def mock_kv_cache_engine(pipeline_state, text_generation_attributes): - seq_len, _ = text_generation_attributes +def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes): + seq_len, prompt_seq_len = text_generation_attributes kv_cache = DecoderKVCache() kv_cache_state = initialize_kv_cache_state( cache_shape=pipeline_state.current_state.get("cache_shape"), kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), output_names=pipeline_state.current_state.get("output_names"), - length=seq_len - 1, + length=seq_len - prompt_seq_len, empty=False, ) kv_cache.setup(state=kv_cache_state) @@ -235,7 +235,7 @@ def test_kv_cache_creation( def test_autoreg_preproces_can_run( text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache 
): - seq_len, _ = text_generation_attributes + seq_len, prompt_seq_len = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) @@ -251,7 +251,7 @@ def test_autoreg_preproces_can_run( ) # tokens, attention mask, causal, positions tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - assert tokens.shape[-1] == 1 + assert tokens.shape[-1] == prompt_seq_len assert attention_mask.shape[-1] == seq_len assert positions[0] == mock_kv_cache.total_num_processed_tokens assert outputs.get("in_generation") is None @@ -302,9 +302,9 @@ def test_multi_engine_preprocess_cant_operate( assert not multi_prep.can_operate(inputs) -def test_run_single_engine_once( +def test_run_single_token_engine_once( single_token_engine_no_internal_cache, - mock_kv_cache_engine, + mock_kv_cache_single_token_engine, ): mock_engine_inputs = [ @@ -315,7 +315,7 @@ def test_run_single_engine_once( ] inputs = NlEngineInput( engine_inputs=mock_engine_inputs, - kv_cache=mock_kv_cache_engine, + kv_cache=mock_kv_cache_single_token_engine, tokens=mock_engine_inputs[0].tolist(), ) output = single_token_engine_no_internal_cache.run(inputs) From 913665aea6af951a8bafdfa63ca568a14937f066 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 8 Nov 2023 10:07:44 -0500 Subject: [PATCH 19/57] fix test --- tests/deepsparse/v2/unit/test_text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index 0d0c4ef3be..2d2edda94e 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -319,7 +319,7 @@ def test_run_single_token_engine_once( tokens=mock_engine_inputs[0].tolist(), ) output = single_token_engine_no_internal_cache.run(inputs) - assert output + assert output.get("logits") is not None def test_prep_for_generation( From e15521fc53bbe073504cd5ec13fcbbe639702a9f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 8 Nov 2023 10:38:19 -0500 Subject: [PATCH 20/57] add docstrings/comments --- .../v2/unit/test_text_generation.py | 87 ++++++++++++++++--- 1 file changed, 75 insertions(+), 12 deletions(-) diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py index 2d2edda94e..59204cb2d0 100644 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ b/tests/deepsparse/v2/unit/test_text_generation.py @@ -119,7 +119,7 @@ def mock_kv_cache(): @pytest.fixture -def mock_kv_cache_full(): +def mock_kv_cache_three_tokens_processed(): kv_cache = DecoderKVCache() kv_cache.setup( state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, @@ -185,6 +185,10 @@ def mock_logits(model_attributes): def test_process_inputs( text_generation_attributes, model_attributes, small_prompt, large_prompt ): + """ + Check if the ProcessInputsTextGeneration Operator successfully processes the + inputs and generation config. + """ sequence_length, _ = text_generation_attributes tokenizer, _ = model_attributes process_inputs = ProcessInputsTextGeneration( @@ -213,6 +217,10 @@ def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cach def test_kv_cache_creation( text_generation_attributes, model_attributes, pipeline_state ): + """ + Check if the KVCacheCreator successfully creates a kv_cache object, given the + single_token_engine attributes stored in the pipeline_state. 
+ """ seq_length, prompt_seq_len = text_generation_attributes tokenizer, _ = model_attributes kv_cache_creator = KVCacheCreator( @@ -235,23 +243,29 @@ def test_kv_cache_creation( def test_autoreg_preproces_can_run( text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache ): - seq_len, prompt_seq_len = text_generation_attributes + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + # The prompt_sequence_length is greater than the number of tokens that are to be + # operated on. Therefore, use the single_token_engine and can_operate() should be + # True. assert autoreg_prep.can_operate(inputs) outputs = autoreg_prep.run( tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state ) - - assert ( - len(outputs.get("engine_inputs")) == 4 - ) # tokens, attention mask, causal, positions + # Assert 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - assert tokens.shape[-1] == prompt_seq_len + assert tokens.shape[-1] == 1 assert attention_mask.shape[-1] == seq_len assert positions[0] == mock_kv_cache.total_num_processed_tokens assert outputs.get("in_generation") is None @@ -260,32 +274,47 @@ def test_autoreg_preproces_can_run( def test_autoreg_preproces_cant_run( text_generation_attributes, mock_kv_cache, mock_tokens_multiple ): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + seq_len, _ = text_generation_attributes autoreg_prep = AutoRegressiveOperatorPreprocess( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # can_operate() should be False as the prompt_sequence_length is equal to the + # number of tokens we want to operate on. Therefore, the multi-token engine + # should run instead. assert not autoreg_prep.can_operate(inputs) def test_mult_engine_preprocess( text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple ): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) ) inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # The number of tokens is equal to the prompt_sequence_length. + # Therefore, the multi_token_engine can run and can_operate() should be True. 
assert multi_prep.can_operate(inputs) outputs = multi_prep.run( tokens=mock_tokens_multiple, kv_cache=mock_kv_cache, pipeline_state=pipeline_state, ) - assert ( - len(outputs.get("engine_inputs")) == 4 - ) # tokens, attention mask, causal, positions + # Expect 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + # Assert proper shapes for all engine_inputs assert tokens.shape[-1] == len(mock_tokens_multiple) assert attention_mask.shape[-1] == seq_len assert positions.shape[-1] == len(mock_tokens_multiple) @@ -294,11 +323,18 @@ def test_mult_engine_preprocess( def test_multi_engine_preprocess_cant_operate( text_generation_attributes, mock_kv_cache, mock_tokens ): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ seq_len, _ = text_generation_attributes multi_prep = MultiEnginePrefill( sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 ) inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + # The prompt_sequence_length is one greater than the total number of tokens we're + # processing. Therefore, this operator should not run and can_operate() should be + # False. assert not multi_prep.can_operate(inputs) @@ -306,6 +342,10 @@ def test_run_single_token_engine_once( single_token_engine_no_internal_cache, mock_kv_cache_single_token_engine, ): + """ + This operator runs through the single-token NLEngine once, given engine_inputs and + kv_cache. + """ mock_engine_inputs = [ numpy.array([[15496]]), @@ -326,9 +366,13 @@ def test_prep_for_generation( text_generation_attributes, model_attributes, mock_tokens_multiple, - mock_kv_cache_full, + mock_kv_cache_three_tokens_processed, mock_inference_state, ): + """ + This test will assess the PrepareGeneration, which runs after prompt_inference + and before generation. + """ seq_len, prompt_seq_len = text_generation_attributes tokenizer, _ = model_attributes prep_for_generation = PrepareGeneration( @@ -336,7 +380,13 @@ def test_prep_for_generation( token_generator=TokenGeneratorOperator(), sequence_length=seq_len, ) - inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache_full} + inputs = { + "tokens": mock_tokens_multiple, + "kv_cache": mock_kv_cache_three_tokens_processed, + } + # can_operate() if the total number of prompt tokens is equal to the + # number of processed tokens stored in the kv_cache, indicating prompt inference is + # complete and generation can begin. assert prep_for_generation.can_operate(inputs) prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] @@ -361,6 +411,11 @@ def test_generate_new_token( mock_inference_state, mock_logits, ): + """ + This test is responsible for testing the GenerateNewTokenOperator, which generates + one new token, given a token_generator (stored in the inference_state) and logits + from the engine. 
+ """ tokenizer, _ = model_attributes generate_new_token = GenerateNewTokenOperator( force_max_tokens=False, tokenizer=tokenizer @@ -374,14 +429,22 @@ def test_generate_new_token( outputs, state = generate_new_token.run( logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state ) + # The new_token generated/returned by ths operator should match the last token in + # token_generator assert outputs.get("new_token") == state.get("token_generator").tokens[-1] def test_compile_logits(mock_logits, mock_inference_state): mock_inference_state.update_state({"prompt_logits": [mock_logits]}) compile_prompt_logits = CompilePromptLogits() + # Can operate as long as we're not in generation but in prompt_inference. This + # can_operate() will check for the `in_generation` flag in the input. assert compile_prompt_logits.can_operate({}) output, state = compile_prompt_logits.run( logits=mock_logits, inference_state=mock_inference_state ) + # The CompilePromptLogits is responsible for updating a list of prompt logits + # calculated at each step during prompt inference. After one step of running this + # operator, the total number of prompt_logits in the inference state should be + # the current length of prompt logits + 1 assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 From 379481e159186434b482df82d17d5893a4a23071 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 9 Nov 2023 16:53:41 -0500 Subject: [PATCH 21/57] break out tests to individual unit test files; add conftest and make scope of fixtures module to help with speed --- .../v2/unit/test_text_generation.py | 450 ------------------ .../v2/unit/text_generation/conftest.py | 173 +++++++ .../v2/unit/text_generation/test_kv_cache.py | 41 ++ .../v2/unit/text_generation/test_msic.py | 31 ++ .../text_generation/test_process_inputs.py | 47 ++ .../test_single_token_engine.py | 98 ++++ .../text_generation/test_token_generation.py | 92 ++++ .../text_multi_token_engine.py | 63 +++ 8 files changed, 545 insertions(+), 450 deletions(-) delete mode 100644 tests/deepsparse/v2/unit/test_text_generation.py create mode 100644 tests/deepsparse/v2/unit/text_generation/conftest.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_kv_cache.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_msic.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_process_inputs.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_token_generation.py create mode 100644 tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py diff --git a/tests/deepsparse/v2/unit/test_text_generation.py b/tests/deepsparse/v2/unit/test_text_generation.py deleted file mode 100644 index 59204cb2d0..0000000000 --- a/tests/deepsparse/v2/unit/test_text_generation.py +++ /dev/null @@ -1,450 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy - -import numpy -from transformers import AutoTokenizer - -import pytest -from deepsparse.transformers.helpers import get_deployment_path -from deepsparse.transformers.pipelines.text_generation import TextGenerationInput -from deepsparse.transformers.utils import DecoderKVCache -from deepsparse.transformers.utils.helpers import initialize_kv_cache_state -from deepsparse.v2 import InferenceState, PipelineState -from deepsparse.v2.text_generation import ( - AutoRegressiveOperatorPreprocess, - CompilePromptLogits, - GenerateNewTokenOperator, - GenerationDefaults, - KVCacheCreator, - KVCacheCreatorInput, - MultiEnginePrefill, - NlEngineInput, - NLEngineOperator, - PrepareGeneration, - ProcessInputsTextGeneration, - TokenGeneratorOperator, -) - - -@pytest.fixture -def text_generation_attributes(): - sequence_length = 5 - prompt_sequence_length = 1 - return sequence_length, prompt_sequence_length - - -@pytest.fixture -def model_attributes(text_generation_attributes): - model_path = "hf:mgoin/TinyStories-1M-deepsparse" - sequence_length, _ = text_generation_attributes - deployment_path, model_path = get_deployment_path(model_path) - - tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=False, - model_max_length=sequence_length, - ) - - tokenizer.padding_side = "left" - if not tokenizer.pad_token: - tokenizer.pad_token = tokenizer.eos_token - - return tokenizer, model_path - - -@pytest.fixture -def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes): - seq_length, _ = text_generation_attributes - _, model_path = model_attributes - - nl_engine_operator = NLEngineOperator( - sequence_length=seq_length, input_ids_length=1, model_path=model_path - ) - return nl_engine_operator - - -@pytest.fixture -def pipeline_state(single_token_engine_no_internal_cache): - pipeline_state = PipelineState() - pipeline_state_vals = {} - pipeline_state_vals[ - "onnx_input_names_no_cache" - ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache - pipeline_state_vals[ - "cache_shape" - ] = single_token_engine_no_internal_cache.cache_shape - pipeline_state_vals[ - "output_names" - ] = single_token_engine_no_internal_cache.output_names - pipeline_state_vals[ - "kv_cache_data_type" - ] = single_token_engine_no_internal_cache.kv_cache_data_type - pipeline_state.create_state(pipeline_state_vals) - return pipeline_state - - -@pytest.fixture -def large_prompt(): - prompt = "Hello, how are you doing today?" 
- generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} - return TextGenerationInput(prompt=prompt, generation_config=generation_config) - - -@pytest.fixture -def small_prompt(): - prompt = "Hello" - return TextGenerationInput(prompt=prompt) - - -@pytest.fixture -def mock_kv_cache(): - kv_cache = DecoderKVCache() - kv_cache.setup( - state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, - ) - return kv_cache - - -@pytest.fixture -def mock_kv_cache_three_tokens_processed(): - kv_cache = DecoderKVCache() - kv_cache.setup( - state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, - num_processed_tokens=3, - ) - return kv_cache - - -@pytest.fixture -def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes): - seq_len, prompt_seq_len = text_generation_attributes - kv_cache = DecoderKVCache() - kv_cache_state = initialize_kv_cache_state( - cache_shape=pipeline_state.current_state.get("cache_shape"), - kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), - output_names=pipeline_state.current_state.get("output_names"), - length=seq_len - prompt_seq_len, - empty=False, - ) - kv_cache.setup(state=kv_cache_state) - return kv_cache - - -@pytest.fixture -def mock_tokens(): - return [15496] - - -@pytest.fixture -def mock_tokens_multiple(): - return [15496, 15496, 15496] - - -@pytest.fixture -def mock_inference_state(): - generation_config = GenerationDefaults() - inference_state = InferenceState() - inference_state.create_state({}) - inference_state.update_state({"generation_config": generation_config}) - return inference_state - - -@pytest.fixture -def mock_token_generator(model_attributes, mock_tokens_multiple): - tokenizer, _ = model_attributes - token_generator_creator = TokenGeneratorOperator() - prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) - token_generator_creator_output = token_generator_creator.run( - logits_shape=prompt_logits[0, -1, :].shape, - deterministic=True, - sampling_temperature=1.0, - tokens=copy.copy(mock_tokens_multiple), - ) - return token_generator_creator_output.get("token_generator") - - -@pytest.fixture -def mock_logits(model_attributes): - tokenizer, _ = model_attributes - return numpy.random.rand(1, 1, len(tokenizer)) - - -def test_process_inputs( - text_generation_attributes, model_attributes, small_prompt, large_prompt -): - """ - Check if the ProcessInputsTextGeneration Operator successfully processes the - inputs and generation config. 
- """ - sequence_length, _ = text_generation_attributes - tokenizer, _ = model_attributes - process_inputs = ProcessInputsTextGeneration( - sequence_length=sequence_length, tokenizer=tokenizer - ) - - outputs, state_update = process_inputs.run(small_prompt) - assert len(outputs.get("tokens")) == 1 - assert isinstance(state_update.get("generation_config"), GenerationDefaults) - assert state_update.get("prompts") == small_prompt.sequences - - outputs, state_update = process_inputs.run(large_prompt) - - assert not isinstance(state_update.get("generation_config"), GenerationDefaults) - assert state_update.get( - "generation_config" - ).max_length == large_prompt.generation_config.get("max_length") - assert outputs.get("tokens") - assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") - - -def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): - assert single_token_engine_no_internal_cache.input_ids_length == 1 - - -def test_kv_cache_creation( - text_generation_attributes, model_attributes, pipeline_state -): - """ - Check if the KVCacheCreator successfully creates a kv_cache object, given the - single_token_engine attributes stored in the pipeline_state. - """ - seq_length, prompt_seq_len = text_generation_attributes - tokenizer, _ = model_attributes - kv_cache_creator = KVCacheCreator( - tokenizer=tokenizer, - prompt_sequence_length=prompt_seq_len, - sequence_length=seq_length, - internal_kv_cache=False, - ) - - assert kv_cache_creator.input_schema == KVCacheCreatorInput - kv_cache = kv_cache_creator.run( - cache_shape=pipeline_state.current_state.get("cache_shape"), - kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), - output_names=pipeline_state.current_state.get("output_names"), - ) - assert kv_cache.get("kv_cache") - assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 - - -def test_autoreg_preproces_can_run( - text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache -): - """ - Check if the single-token engine preprocess operator can run based on the provided - tokens and prompt_sequence_length. - """ - - seq_len, _ = text_generation_attributes - autoreg_prep = AutoRegressiveOperatorPreprocess( - sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 - ) - inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} - - # The prompt_sequence_length is greater than the number of tokens that are to be - # operated on. Therefore, use the single_token_engine and can_operate() should be - # True. - assert autoreg_prep.can_operate(inputs) - outputs = autoreg_prep.run( - tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state - ) - # Assert 4 engine inputs: tokens, attention mask, causal, positions - assert len(outputs.get("engine_inputs")) == 4 - tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - - assert tokens.shape[-1] == 1 - assert attention_mask.shape[-1] == seq_len - assert positions[0] == mock_kv_cache.total_num_processed_tokens - assert outputs.get("in_generation") is None - - -def test_autoreg_preproces_cant_run( - text_generation_attributes, mock_kv_cache, mock_tokens_multiple -): - """ - Check if the single-token engine preprocess operator can run based on the provided - tokens and prompt_sequence_length. 
- """ - - seq_len, _ = text_generation_attributes - autoreg_prep = AutoRegressiveOperatorPreprocess( - sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) - ) - inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} - # can_operate() should be False as the prompt_sequence_length is equal to the - # number of tokens we want to operate on. Therefore, the multi-token engine - # should run instead. - assert not autoreg_prep.can_operate(inputs) - - -def test_mult_engine_preprocess( - text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple -): - """ - Check if the multi-token engine preprocess operator can run based on the provided - tokens and prompt_sequence_length. - """ - - seq_len, _ = text_generation_attributes - multi_prep = MultiEnginePrefill( - sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) - ) - inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} - # The number of tokens is equal to the prompt_sequence_length. - # Therefore, the multi_token_engine can run and can_operate() should be True. - assert multi_prep.can_operate(inputs) - outputs = multi_prep.run( - tokens=mock_tokens_multiple, - kv_cache=mock_kv_cache, - pipeline_state=pipeline_state, - ) - # Expect 4 engine inputs: tokens, attention mask, causal, positions - assert len(outputs.get("engine_inputs")) == 4 - tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") - # Assert proper shapes for all engine_inputs - assert tokens.shape[-1] == len(mock_tokens_multiple) - assert attention_mask.shape[-1] == seq_len - assert positions.shape[-1] == len(mock_tokens_multiple) - - -def test_multi_engine_preprocess_cant_operate( - text_generation_attributes, mock_kv_cache, mock_tokens -): - """ - Check if the multi-token engine preprocess operator can run based on the provided - tokens and prompt_sequence_length. - """ - seq_len, _ = text_generation_attributes - multi_prep = MultiEnginePrefill( - sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 - ) - inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} - # The prompt_sequence_length is one greater than the total number of tokens we're - # processing. Therefore, this operator should not run and can_operate() should be - # False. - assert not multi_prep.can_operate(inputs) - - -def test_run_single_token_engine_once( - single_token_engine_no_internal_cache, - mock_kv_cache_single_token_engine, -): - """ - This operator runs through the single-token NLEngine once, given engine_inputs and - kv_cache. - """ - - mock_engine_inputs = [ - numpy.array([[15496]]), - numpy.array([[0, 0, 0, 0, 1]]), - numpy.array([[0]]), - numpy.array([[[[0, 0, 0, 0, 1]]]]), - ] - inputs = NlEngineInput( - engine_inputs=mock_engine_inputs, - kv_cache=mock_kv_cache_single_token_engine, - tokens=mock_engine_inputs[0].tolist(), - ) - output = single_token_engine_no_internal_cache.run(inputs) - assert output.get("logits") is not None - - -def test_prep_for_generation( - text_generation_attributes, - model_attributes, - mock_tokens_multiple, - mock_kv_cache_three_tokens_processed, - mock_inference_state, -): - """ - This test will assess the PrepareGeneration, which runs after prompt_inference - and before generation. 
- """ - seq_len, prompt_seq_len = text_generation_attributes - tokenizer, _ = model_attributes - prep_for_generation = PrepareGeneration( - prompt_sequence_length=prompt_seq_len, - token_generator=TokenGeneratorOperator(), - sequence_length=seq_len, - ) - inputs = { - "tokens": mock_tokens_multiple, - "kv_cache": mock_kv_cache_three_tokens_processed, - } - # can_operate() if the total number of prompt tokens is equal to the - # number of processed tokens stored in the kv_cache, indicating prompt inference is - # complete and generation can begin. - assert prep_for_generation.can_operate(inputs) - - prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] - mock_inference_state.update_state({"prompt_logits": prompt_logits}) - outputs, state = prep_for_generation.run( - tokens=mock_tokens_multiple, - kv_cache=mock_kv_cache, - inference_state=mock_inference_state, - ) - assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 - assert outputs.get("in_generation") - assert numpy.array_equal( - state.get("generated_logits")[0], - numpy.expand_dims(prompt_logits[0][:, -1, :], 0), - ) - - -def test_generate_new_token( - model_attributes, - mock_token_generator, - mock_kv_cache, - mock_inference_state, - mock_logits, -): - """ - This test is responsible for testing the GenerateNewTokenOperator, which generates - one new token, given a token_generator (stored in the inference_state) and logits - from the engine. - """ - tokenizer, _ = model_attributes - generate_new_token = GenerateNewTokenOperator( - force_max_tokens=False, tokenizer=tokenizer - ) - mock_inference_state.update_state( - { - "token_generator": mock_token_generator, - "generated_tokens": [mock_token_generator.tokens], - } - ) - outputs, state = generate_new_token.run( - logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state - ) - # The new_token generated/returned by ths operator should match the last token in - # token_generator - assert outputs.get("new_token") == state.get("token_generator").tokens[-1] - - -def test_compile_logits(mock_logits, mock_inference_state): - mock_inference_state.update_state({"prompt_logits": [mock_logits]}) - compile_prompt_logits = CompilePromptLogits() - # Can operate as long as we're not in generation but in prompt_inference. This - # can_operate() will check for the `in_generation` flag in the input. - assert compile_prompt_logits.can_operate({}) - output, state = compile_prompt_logits.run( - logits=mock_logits, inference_state=mock_inference_state - ) - # The CompilePromptLogits is responsible for updating a list of prompt logits - # calculated at each step during prompt inference. After one step of running this - # operator, the total number of prompt_logits in the inference state should be - # the current length of prompt logits + 1 - assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py new file mode 100644 index 0000000000..5d8483e5f6 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +import numpy +from transformers import AutoTokenizer + +import pytest +from deepsparse.transformers.helpers import get_deployment_path +from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import initialize_kv_cache_state +from deepsparse.v2 import InferenceState, PipelineState +from deepsparse.v2.text_generation import ( + GenerationDefaults, + NLEngineOperator, + TokenGeneratorOperator, +) + + +@pytest.fixture(scope="module") +def text_generation_attributes(): + sequence_length = 5 + prompt_sequence_length = 1 + return sequence_length, prompt_sequence_length + + +@pytest.fixture(scope="module") +def model_attributes(text_generation_attributes): + model_path = "hf:mgoin/TinyStories-1M-deepsparse" + sequence_length, _ = text_generation_attributes + deployment_path, model_path = get_deployment_path(model_path) + + tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=False, + model_max_length=sequence_length, + ) + + tokenizer.padding_side = "left" + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer, model_path + + +@pytest.fixture(scope="module") +def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes): + seq_length, _ = text_generation_attributes + _, model_path = model_attributes + + nl_engine_operator = NLEngineOperator( + sequence_length=seq_length, input_ids_length=1, model_path=model_path + ) + return nl_engine_operator + + +@pytest.fixture(scope="module") +def pipeline_state(single_token_engine_no_internal_cache): + pipeline_state = PipelineState() + pipeline_state_vals = {} + pipeline_state_vals[ + "onnx_input_names_no_cache" + ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache + pipeline_state_vals[ + "cache_shape" + ] = single_token_engine_no_internal_cache.cache_shape + pipeline_state_vals[ + "output_names" + ] = single_token_engine_no_internal_cache.output_names + pipeline_state_vals[ + "kv_cache_data_type" + ] = single_token_engine_no_internal_cache.kv_cache_data_type + pipeline_state.create_state(pipeline_state_vals) + return pipeline_state + + +@pytest.fixture(scope="module") +def large_prompt(): + prompt = "Hello, how are you doing today?" 
+ generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} + return TextGenerationInput(prompt=prompt, generation_config=generation_config) + + +@pytest.fixture(scope="module") +def small_prompt(): + prompt = "Hello" + return TextGenerationInput(prompt=prompt) + + +@pytest.fixture(scope="module") +def mock_kv_cache(): + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + ) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_kv_cache_three_tokens_processed(): + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + num_processed_tokens=3, + ) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes): + seq_len, prompt_seq_len = text_generation_attributes + kv_cache = DecoderKVCache() + kv_cache_state = initialize_kv_cache_state( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + length=seq_len - prompt_seq_len, + empty=False, + ) + kv_cache.setup(state=kv_cache_state) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_tokens(): + return [15496] + + +@pytest.fixture(scope="module") +def mock_tokens_multiple(): + return [15496, 15496, 15496] + + +@pytest.fixture(scope="module") +def mock_inference_state(): + generation_config = GenerationDefaults() + inference_state = InferenceState() + inference_state.create_state({}) + inference_state.update_state({"generation_config": generation_config}) + return inference_state + + +@pytest.fixture(scope="module") +def mock_token_generator(model_attributes, mock_tokens_multiple): + tokenizer, _ = model_attributes + token_generator_creator = TokenGeneratorOperator() + prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) + token_generator_creator_output = token_generator_creator.run( + logits_shape=prompt_logits[0, -1, :].shape, + deterministic=True, + sampling_temperature=1.0, + tokens=copy.copy(mock_tokens_multiple), + ) + return token_generator_creator_output.get("token_generator") + + +@pytest.fixture(scope="module") +def mock_logits(model_attributes): + tokenizer, _ = model_attributes + return numpy.random.rand(1, 1, len(tokenizer)) diff --git a/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py new file mode 100644 index 0000000000..0c6e42503a --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput + + +def test_kv_cache_creation( + text_generation_attributes, model_attributes, pipeline_state +): + """ + Check if the KVCacheCreator successfully creates a kv_cache object, given the + single_token_engine attributes stored in the pipeline_state. + """ + seq_length, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes + kv_cache_creator = KVCacheCreator( + tokenizer=tokenizer, + prompt_sequence_length=prompt_seq_len, + sequence_length=seq_length, + internal_kv_cache=False, + ) + + assert kv_cache_creator.input_schema == KVCacheCreatorInput + kv_cache = kv_cache_creator.run( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + ) + assert kv_cache.get("kv_cache") + assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 diff --git a/tests/deepsparse/v2/unit/text_generation/test_msic.py b/tests/deepsparse/v2/unit/text_generation/test_msic.py new file mode 100644 index 0000000000..caa0cc2efd --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_msic.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import CompilePromptLogits + + +def test_compile_logits(mock_logits, mock_inference_state): + mock_inference_state.update_state({"prompt_logits": [mock_logits]}) + compile_prompt_logits = CompilePromptLogits() + # Can operate as long as we're not in generation but in prompt_inference. This + # can_operate() will check for the `in_generation` flag in the input. + assert compile_prompt_logits.can_operate({}) + output, state = compile_prompt_logits.run( + logits=mock_logits, inference_state=mock_inference_state + ) + # The CompilePromptLogits is responsible for updating a list of prompt logits + # calculated at each step during prompt inference. After one step of running this + # operator, the total number of prompt_logits in the inference state should be + # the current length of prompt logits + 1 + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 diff --git a/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py new file mode 100644 index 0000000000..be59db7475 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import ( + GenerationDefaults, + ProcessInputsTextGeneration, +) + + +def test_process_inputs( + text_generation_attributes, model_attributes, small_prompt, large_prompt +): + """ + Check if the ProcessInputsTextGeneration Operator successfully processes the + inputs and generation config. + """ + sequence_length, _ = text_generation_attributes + tokenizer, _ = model_attributes + process_inputs = ProcessInputsTextGeneration( + sequence_length=sequence_length, tokenizer=tokenizer + ) + + outputs, state_update = process_inputs.run(small_prompt) + assert len(outputs.get("input_ids")) == 1 + assert len(outputs.get("attention_mask")) == 1 + assert isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get("prompts") == small_prompt.sequences + + outputs, state_update = process_inputs.run(large_prompt) + + assert not isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get( + "generation_config" + ).max_length == large_prompt.generation_config.get("max_length") + assert outputs.get("input_ids") is not None + assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") diff --git a/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py new file mode 100644 index 0000000000..335a28fbe3 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py @@ -0,0 +1,98 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy + +from deepsparse.v2.text_generation import ( + AutoRegressiveOperatorPreprocess, + NlEngineInput, +) + + +def test_autoreg_preproces_can_run( + text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache +): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + + # The prompt_sequence_length is greater than the number of tokens that are to be + # operated on. Therefore, use the single_token_engine and can_operate() should be + # True. 
+ assert autoreg_prep.can_operate(inputs) + outputs = autoreg_prep.run( + tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state + ) + # Assert 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + + assert tokens.shape[-1] == 1 + assert attention_mask.shape[-1] == seq_len + assert positions[0] == mock_kv_cache.total_num_processed_tokens + assert outputs.get("in_generation") is None + + +def test_autoreg_preproces_cant_run( + text_generation_attributes, mock_kv_cache, mock_tokens_multiple +): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # can_operate() should be False as the prompt_sequence_length is equal to the + # number of tokens we want to operate on. Therefore, the multi-token engine + # should run instead. + assert not autoreg_prep.can_operate(inputs) + + +def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): + assert single_token_engine_no_internal_cache.input_ids_length == 1 + + +def test_run_single_token_engine_once( + single_token_engine_no_internal_cache, + mock_kv_cache_single_token_engine, +): + """ + This operator runs through the single-token NLEngine once, given engine_inputs and + kv_cache. + """ + + mock_engine_inputs = [ + numpy.array([[15496]]), + numpy.array([[0, 0, 0, 0, 1]]), + numpy.array([[0]]), + numpy.array([[[[0, 0, 0, 0, 1]]]]), + ] + inputs = NlEngineInput( + engine_inputs=mock_engine_inputs, + kv_cache=mock_kv_cache_single_token_engine, + tokens=mock_engine_inputs[0].tolist(), + ) + output = single_token_engine_no_internal_cache.run(inputs) + assert output.get("logits") is not None diff --git a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py new file mode 100644 index 0000000000..fbd9e06778 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy + +from deepsparse.v2.text_generation import ( + GenerateNewTokenOperator, + PrepareGeneration, + TokenGeneratorOperator, +) + + +def test_prep_for_generation( + text_generation_attributes, + model_attributes, + mock_tokens_multiple, + mock_kv_cache_three_tokens_processed, + mock_inference_state, +): + """ + This test will assess the PrepareGeneration, which runs after prompt_inference + and before generation. 
+ """ + seq_len, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes + prep_for_generation = PrepareGeneration( + prompt_sequence_length=prompt_seq_len, + token_generator=TokenGeneratorOperator(), + sequence_length=seq_len, + ) + inputs = { + "tokens": mock_tokens_multiple, + "kv_cache": mock_kv_cache_three_tokens_processed, + } + # can_operate() if the total number of prompt tokens is equal to the + # number of processed tokens stored in the kv_cache, indicating prompt inference is + # complete and generation can begin. + assert prep_for_generation.can_operate(inputs) + + prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] + mock_inference_state.update_state({"prompt_logits": prompt_logits}) + outputs, state = prep_for_generation.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache_three_tokens_processed, + inference_state=mock_inference_state, + ) + assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 + assert outputs.get("in_generation") + assert numpy.array_equal( + state.get("generated_logits")[0], + numpy.expand_dims(prompt_logits[0][:, -1, :], 0), + ) + + +def test_generate_new_token( + model_attributes, + mock_token_generator, + mock_kv_cache, + mock_inference_state, + mock_logits, +): + """ + This test is responsible for testing the GenerateNewTokenOperator, which generates + one new token, given a token_generator (stored in the inference_state) and logits + from the engine. + """ + tokenizer, _ = model_attributes + generate_new_token = GenerateNewTokenOperator( + force_max_tokens=False, tokenizer=tokenizer + ) + mock_inference_state.update_state( + { + "token_generator": mock_token_generator, + "generated_tokens": [mock_token_generator.tokens], + } + ) + outputs, state = generate_new_token.run( + logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state + ) + # The new_token generated/returned by ths operator should match the last token in + # token_generator + assert outputs.get("new_token") == state.get("token_generator").tokens[-1] diff --git a/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py b/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py new file mode 100644 index 0000000000..d2c822af4c --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import MultiEnginePrefill + + +def test_mult_engine_preprocess( + text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple +): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. 
+    """
+
+    seq_len, _ = text_generation_attributes
+    multi_prep = MultiEnginePrefill(
+        sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple)
+    )
+    inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache}
+    # The number of tokens is equal to the prompt_sequence_length.
+    # Therefore, the multi_token_engine can run and can_operate() should be True.
+    assert multi_prep.can_operate(inputs)
+    outputs = multi_prep.run(
+        tokens=mock_tokens_multiple,
+        kv_cache=mock_kv_cache,
+        pipeline_state=pipeline_state,
+    )
+    # Expect 4 engine inputs: tokens, attention mask, causal, positions
+    assert len(outputs.get("engine_inputs")) == 4
+    tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs")
+    # Assert proper shapes for all engine_inputs
+    assert tokens.shape[-1] == len(mock_tokens_multiple)
+    assert attention_mask.shape[-1] == seq_len
+    assert positions.shape[-1] == len(mock_tokens_multiple)
+
+
+def test_multi_engine_preprocess_cant_operate(
+    text_generation_attributes, mock_kv_cache, mock_tokens
+):
+    """
+    Check if the multi-token engine preprocess operator can run based on the provided
+    tokens and prompt_sequence_length.
+    """
+    seq_len, _ = text_generation_attributes
+    multi_prep = MultiEnginePrefill(
+        sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1
+    )
+    inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache}
+    # The prompt_sequence_length is one greater than the total number of tokens we're
+    # processing. Therefore, this operator should not run and can_operate() should be
+    # False.
+    assert not multi_prep.can_operate(inputs)

From 0a50d1dee8a1abe32c4f1c40e27ab16589d32bc2 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 10 Nov 2023 09:11:00 -0500
Subject: [PATCH 22/57] [Pipeline Refactor] Unit Testing for Text Generation Operators (#1392)

* unit testing for text generation operators
* additional changes
* unit testing completion
* remove debug
* fix
* add todo
* more clean-up
* fix test
* add docstrings/comments
* break out tests to individual unit test files; add conftest and make scope of fixtures module to help with speed
* fix name
---
 src/deepsparse/v2/routers/router.py           |   1 +
 .../v2/text_generation/kv_cache_operator.py   |   2 +-
 .../v2/text_generation/nl_engine_operator.py  |   2 +-
 .../v2/text_generation/prep_for_generation.py |  52 +-----
 .../v2/text_generation/process_inputs.py      |  10 +-
 .../v2/unit/text_generation/conftest.py       | 173 ++++++++++++++++++
 .../v2/unit/text_generation/test_kv_cache.py  |  41 +++++
 .../v2/unit/text_generation/test_misc.py      |  31 ++++
 .../text_generation/test_process_inputs.py    |  47 +++++
 .../test_single_token_engine.py               |  98 ++++++++++
 .../text_generation/test_token_generation.py  |  92 ++++++++++
 .../text_multi_token_engine.py                |  63 +++++++
 12 files changed, 558 insertions(+), 54 deletions(-)
 create mode 100644 tests/deepsparse/v2/unit/text_generation/conftest.py
 create mode 100644 tests/deepsparse/v2/unit/text_generation/test_kv_cache.py
 create mode 100644 tests/deepsparse/v2/unit/text_generation/test_misc.py
 create mode 100644 tests/deepsparse/v2/unit/text_generation/test_process_inputs.py
 create mode 100644 tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py
 create mode 100644 tests/deepsparse/v2/unit/text_generation/test_token_generation.py
 create mode 100644 tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py

diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py
index 1b70164002..6b0d851aef 100644
---
a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -158,4 +158,5 @@ def next( @staticmethod def validate(ops) -> bool: + # TODO: still needs to be implemented for the GraphRouter pass diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py index 0b232402b3..3c15d0ff5a 100644 --- a/src/deepsparse/v2/text_generation/kv_cache_operator.py +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -24,7 +24,7 @@ from deepsparse.v2.operators import Operator -__all__ = ["KVCacheCreator"] +__all__ = ["KVCacheCreator", "KVCacheCreatorInput"] class KVCacheCreatorOutput(BaseModel): diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 0bd9098a40..7549f986d9 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -29,7 +29,7 @@ ) -__all__ = ["NLEngineOperator"] +__all__ = ["NLEngineOperator", "NlEngineInput"] class NlEngineInput(BaseModel): diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 544af43980..0ea4a06a02 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -11,11 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy from typing import Any import numpy from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.transformers.utils.helpers import set_generated_length from deepsparse.v2.operators import Operator from deepsparse.v2.text_generation import TokenGeneratorOperator from deepsparse.v2.utils import InferenceState @@ -31,9 +33,9 @@ def __init__( prompt_sequence_length: int, sequence_length: int, ): - self.prompt_sequence_length = prompt_sequence_length self.sequence_length = sequence_length self.token_generator_creator = token_generator + self.prompt_sequence_length = prompt_sequence_length def can_operate(self, inp: Any): kv_cache = inp.get("kv_cache") @@ -47,49 +49,6 @@ def can_operate(self, inp: Any): return True return False - @staticmethod - def set_generated_length( - max_length: int, - prompt_tokens_length: int, - sequence_length: int, - prompt_sequence_length: int, - max_new_tokens: int, - finish_reason_choices: "FinishReason", # noqa - ): - """ - Determine the length of the generated tokens. The hard cap on the total number - of tokens is based on the sequence length. If max_length is provided and is less - than the sequence length, it will be used to cap the total number of tokens - generated. If it is not provided, the max_new_tokens attribute will be used and - also capped by the sequence length. 
- - :param max_length: max_length attribute, provided as input during inference - :param prompt_tokens_length: the number of prompt tokens used as part of the - generated output - :param sequence_length: the sequence length used for the pipeline - :param prompt_sequence_length: the prompt sequence length used for the pipeline - :param max_new_tokens: the max_new_tokens attribute, which may be provided - as part of the input during inference - """ - if max_length: - # if max_length provided, use that to cap total tokens generated - max_tokens = max_length - finish_reason = finish_reason_choices.LENGTH - else: - # if not provided, max tokens is based on max_new_tokens + prompt tokens - max_tokens = ( - min(max_new_tokens, sequence_length - prompt_sequence_length) - + prompt_tokens_length - ) - finish_reason = finish_reason_choices.MAX_NEW_TOKENS - - # hard model/pipeline cap - return ( - (sequence_length, finish_reason_choices.CAPACITY) - if sequence_length < max_tokens - else (max_tokens, finish_reason) - ) - def run( self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs ): @@ -107,13 +66,13 @@ def run( logits_shape=prompt_logits[0, -1, :].shape, deterministic=not generation_config.do_sample, sampling_temperature=generation_config.temperature, - tokens=tokens, + tokens=copy.copy(tokens), **inference_state.current_state, ) token_generator = token_generator_creator_output.get("token_generator") token_generator.generate(prompt_logits[0, -1, :]) - max_tokens, length_finish_reason = PrepareGeneration.set_generated_length( + max_tokens, length_finish_reason = set_generated_length( max_length=generation_config.max_length, prompt_tokens_length=1, max_new_tokens=generation_config.max_new_tokens, @@ -131,7 +90,6 @@ def run( "finished_reason": [], "token_generator": token_generator, } - output = { "tokens": token_generator.tokens, "kv_cache": kv_cache, diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 5d47c8ff39..214b8526e3 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -26,6 +26,9 @@ from deepsparse.v2.operators import Operator +__all__ = ["ProcessInputsTextGeneration", "GenerationDefaults"] + + class GenerationDefaults: num_return_sequences = 1 max_length = 100 @@ -38,9 +41,6 @@ class GenerationDefaults: temperature = 1.0 -__all__ = ["ProcessInputsTextGeneration"] - - class ProcessInputsTextGeneration(Operator): """ Input processing operator. Responsible for tokenizing the input, handling the @@ -54,10 +54,10 @@ class ProcessInputsTextGeneration(Operator): def __init__( self, tokenizer: transformers.PreTrainedTokenizerBase, + sequence_length: int, generation_config: Union[ str, pathlib.Path, Dict, transformers.GenerationConfig - ], - sequence_length: int, + ] = None, ): self.generation_config = generation_config self.tokenizer = tokenizer diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py new file mode 100644 index 0000000000..5d8483e5f6 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +import numpy +from transformers import AutoTokenizer + +import pytest +from deepsparse.transformers.helpers import get_deployment_path +from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import initialize_kv_cache_state +from deepsparse.v2 import InferenceState, PipelineState +from deepsparse.v2.text_generation import ( + GenerationDefaults, + NLEngineOperator, + TokenGeneratorOperator, +) + + +@pytest.fixture(scope="module") +def text_generation_attributes(): + sequence_length = 5 + prompt_sequence_length = 1 + return sequence_length, prompt_sequence_length + + +@pytest.fixture(scope="module") +def model_attributes(text_generation_attributes): + model_path = "hf:mgoin/TinyStories-1M-deepsparse" + sequence_length, _ = text_generation_attributes + deployment_path, model_path = get_deployment_path(model_path) + + tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=False, + model_max_length=sequence_length, + ) + + tokenizer.padding_side = "left" + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer, model_path + + +@pytest.fixture(scope="module") +def single_token_engine_no_internal_cache(text_generation_attributes, model_attributes): + seq_length, _ = text_generation_attributes + _, model_path = model_attributes + + nl_engine_operator = NLEngineOperator( + sequence_length=seq_length, input_ids_length=1, model_path=model_path + ) + return nl_engine_operator + + +@pytest.fixture(scope="module") +def pipeline_state(single_token_engine_no_internal_cache): + pipeline_state = PipelineState() + pipeline_state_vals = {} + pipeline_state_vals[ + "onnx_input_names_no_cache" + ] = single_token_engine_no_internal_cache.onnx_input_names_no_cache + pipeline_state_vals[ + "cache_shape" + ] = single_token_engine_no_internal_cache.cache_shape + pipeline_state_vals[ + "output_names" + ] = single_token_engine_no_internal_cache.output_names + pipeline_state_vals[ + "kv_cache_data_type" + ] = single_token_engine_no_internal_cache.kv_cache_data_type + pipeline_state.create_state(pipeline_state_vals) + return pipeline_state + + +@pytest.fixture(scope="module") +def large_prompt(): + prompt = "Hello, how are you doing today?" 
+ generation_config = {"top_p": 0, "top_k": 0, "max_length": 10} + return TextGenerationInput(prompt=prompt, generation_config=generation_config) + + +@pytest.fixture(scope="module") +def small_prompt(): + prompt = "Hello" + return TextGenerationInput(prompt=prompt) + + +@pytest.fixture(scope="module") +def mock_kv_cache(): + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + ) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_kv_cache_three_tokens_processed(): + kv_cache = DecoderKVCache() + kv_cache.setup( + state={"dummy_cache_name": numpy.array([[[[0], [0], [1], [2], [3]]]])}, + num_processed_tokens=3, + ) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_kv_cache_single_token_engine(pipeline_state, text_generation_attributes): + seq_len, prompt_seq_len = text_generation_attributes + kv_cache = DecoderKVCache() + kv_cache_state = initialize_kv_cache_state( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + length=seq_len - prompt_seq_len, + empty=False, + ) + kv_cache.setup(state=kv_cache_state) + return kv_cache + + +@pytest.fixture(scope="module") +def mock_tokens(): + return [15496] + + +@pytest.fixture(scope="module") +def mock_tokens_multiple(): + return [15496, 15496, 15496] + + +@pytest.fixture(scope="module") +def mock_inference_state(): + generation_config = GenerationDefaults() + inference_state = InferenceState() + inference_state.create_state({}) + inference_state.update_state({"generation_config": generation_config}) + return inference_state + + +@pytest.fixture(scope="module") +def mock_token_generator(model_attributes, mock_tokens_multiple): + tokenizer, _ = model_attributes + token_generator_creator = TokenGeneratorOperator() + prompt_logits = numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer)) + token_generator_creator_output = token_generator_creator.run( + logits_shape=prompt_logits[0, -1, :].shape, + deterministic=True, + sampling_temperature=1.0, + tokens=copy.copy(mock_tokens_multiple), + ) + return token_generator_creator_output.get("token_generator") + + +@pytest.fixture(scope="module") +def mock_logits(model_attributes): + tokenizer, _ = model_attributes + return numpy.random.rand(1, 1, len(tokenizer)) diff --git a/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py new file mode 100644 index 0000000000..0c6e42503a --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_kv_cache.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from deepsparse.v2.text_generation import KVCacheCreator, KVCacheCreatorInput + + +def test_kv_cache_creation( + text_generation_attributes, model_attributes, pipeline_state +): + """ + Check if the KVCacheCreator successfully creates a kv_cache object, given the + single_token_engine attributes stored in the pipeline_state. + """ + seq_length, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes + kv_cache_creator = KVCacheCreator( + tokenizer=tokenizer, + prompt_sequence_length=prompt_seq_len, + sequence_length=seq_length, + internal_kv_cache=False, + ) + + assert kv_cache_creator.input_schema == KVCacheCreatorInput + kv_cache = kv_cache_creator.run( + cache_shape=pipeline_state.current_state.get("cache_shape"), + kv_cache_data_type=pipeline_state.current_state.get("kv_cache_data_type"), + output_names=pipeline_state.current_state.get("output_names"), + ) + assert kv_cache.get("kv_cache") + assert kv_cache.get("kv_cache").total_num_processed_tokens == 0 diff --git a/tests/deepsparse/v2/unit/text_generation/test_misc.py b/tests/deepsparse/v2/unit/text_generation/test_misc.py new file mode 100644 index 0000000000..caa0cc2efd --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_misc.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import CompilePromptLogits + + +def test_compile_logits(mock_logits, mock_inference_state): + mock_inference_state.update_state({"prompt_logits": [mock_logits]}) + compile_prompt_logits = CompilePromptLogits() + # Can operate as long as we're not in generation but in prompt_inference. This + # can_operate() will check for the `in_generation` flag in the input. + assert compile_prompt_logits.can_operate({}) + output, state = compile_prompt_logits.run( + logits=mock_logits, inference_state=mock_inference_state + ) + # The CompilePromptLogits is responsible for updating a list of prompt logits + # calculated at each step during prompt inference. After one step of running this + # operator, the total number of prompt_logits in the inference state should be + # the current length of prompt logits + 1 + assert len(state.get("prompt_logits")) == len([mock_logits]) + 1 diff --git a/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py new file mode 100644 index 0000000000..be59db7475 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py @@ -0,0 +1,47 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import ( + GenerationDefaults, + ProcessInputsTextGeneration, +) + + +def test_process_inputs( + text_generation_attributes, model_attributes, small_prompt, large_prompt +): + """ + Check if the ProcessInputsTextGeneration Operator successfully processes the + inputs and generation config. + """ + sequence_length, _ = text_generation_attributes + tokenizer, _ = model_attributes + process_inputs = ProcessInputsTextGeneration( + sequence_length=sequence_length, tokenizer=tokenizer + ) + + outputs, state_update = process_inputs.run(small_prompt) + assert len(outputs.get("input_ids")) == 1 + assert len(outputs.get("attention_mask")) == 1 + assert isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get("prompts") == small_prompt.sequences + + outputs, state_update = process_inputs.run(large_prompt) + + assert not isinstance(state_update.get("generation_config"), GenerationDefaults) + assert state_update.get( + "generation_config" + ).max_length == large_prompt.generation_config.get("max_length") + assert outputs.get("input_ids") is not None + assert state_update.get("top_k") == large_prompt.generation_config.get("top_k") diff --git a/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py new file mode 100644 index 0000000000..335a28fbe3 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py @@ -0,0 +1,98 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy + +from deepsparse.v2.text_generation import ( + AutoRegressiveOperatorPreprocess, + NlEngineInput, +) + + +def test_autoreg_preproces_can_run( + text_generation_attributes, pipeline_state, mock_tokens, mock_kv_cache +): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1 + ) + inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache} + + # The prompt_sequence_length is greater than the number of tokens that are to be + # operated on. Therefore, use the single_token_engine and can_operate() should be + # True. 
+ assert autoreg_prep.can_operate(inputs) + outputs = autoreg_prep.run( + tokens=mock_tokens, kv_cache=mock_kv_cache, pipeline_state=pipeline_state + ) + # Assert 4 engine inputs: tokens, attention mask, causal, positions + assert len(outputs.get("engine_inputs")) == 4 + tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs") + + assert tokens.shape[-1] == 1 + assert attention_mask.shape[-1] == seq_len + assert positions[0] == mock_kv_cache.total_num_processed_tokens + assert outputs.get("in_generation") is None + + +def test_autoreg_preproces_cant_run( + text_generation_attributes, mock_kv_cache, mock_tokens_multiple +): + """ + Check if the single-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. + """ + + seq_len, _ = text_generation_attributes + autoreg_prep = AutoRegressiveOperatorPreprocess( + sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple) + ) + inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache} + # can_operate() should be False as the prompt_sequence_length is equal to the + # number of tokens we want to operate on. Therefore, the multi-token engine + # should run instead. + assert not autoreg_prep.can_operate(inputs) + + +def test_nl_single_token_engine_no_internal(single_token_engine_no_internal_cache): + assert single_token_engine_no_internal_cache.input_ids_length == 1 + + +def test_run_single_token_engine_once( + single_token_engine_no_internal_cache, + mock_kv_cache_single_token_engine, +): + """ + This operator runs through the single-token NLEngine once, given engine_inputs and + kv_cache. + """ + + mock_engine_inputs = [ + numpy.array([[15496]]), + numpy.array([[0, 0, 0, 0, 1]]), + numpy.array([[0]]), + numpy.array([[[[0, 0, 0, 0, 1]]]]), + ] + inputs = NlEngineInput( + engine_inputs=mock_engine_inputs, + kv_cache=mock_kv_cache_single_token_engine, + tokens=mock_engine_inputs[0].tolist(), + ) + output = single_token_engine_no_internal_cache.run(inputs) + assert output.get("logits") is not None diff --git a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py new file mode 100644 index 0000000000..fbd9e06778 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy + +from deepsparse.v2.text_generation import ( + GenerateNewTokenOperator, + PrepareGeneration, + TokenGeneratorOperator, +) + + +def test_prep_for_generation( + text_generation_attributes, + model_attributes, + mock_tokens_multiple, + mock_kv_cache_three_tokens_processed, + mock_inference_state, +): + """ + This test will assess the PrepareGeneration, which runs after prompt_inference + and before generation. 
+ """ + seq_len, prompt_seq_len = text_generation_attributes + tokenizer, _ = model_attributes + prep_for_generation = PrepareGeneration( + prompt_sequence_length=prompt_seq_len, + token_generator=TokenGeneratorOperator(), + sequence_length=seq_len, + ) + inputs = { + "tokens": mock_tokens_multiple, + "kv_cache": mock_kv_cache_three_tokens_processed, + } + # can_operate() if the total number of prompt tokens is equal to the + # number of processed tokens stored in the kv_cache, indicating prompt inference is + # complete and generation can begin. + assert prep_for_generation.can_operate(inputs) + + prompt_logits = [numpy.random.rand(1, len(mock_tokens_multiple), len(tokenizer))] + mock_inference_state.update_state({"prompt_logits": prompt_logits}) + outputs, state = prep_for_generation.run( + tokens=mock_tokens_multiple, + kv_cache=mock_kv_cache_three_tokens_processed, + inference_state=mock_inference_state, + ) + assert len(outputs.get("tokens")) == len(mock_tokens_multiple) + 1 + assert outputs.get("in_generation") + assert numpy.array_equal( + state.get("generated_logits")[0], + numpy.expand_dims(prompt_logits[0][:, -1, :], 0), + ) + + +def test_generate_new_token( + model_attributes, + mock_token_generator, + mock_kv_cache, + mock_inference_state, + mock_logits, +): + """ + This test is responsible for testing the GenerateNewTokenOperator, which generates + one new token, given a token_generator (stored in the inference_state) and logits + from the engine. + """ + tokenizer, _ = model_attributes + generate_new_token = GenerateNewTokenOperator( + force_max_tokens=False, tokenizer=tokenizer + ) + mock_inference_state.update_state( + { + "token_generator": mock_token_generator, + "generated_tokens": [mock_token_generator.tokens], + } + ) + outputs, state = generate_new_token.run( + logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state + ) + # The new_token generated/returned by ths operator should match the last token in + # token_generator + assert outputs.get("new_token") == state.get("token_generator").tokens[-1] diff --git a/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py b/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py new file mode 100644 index 0000000000..d2c822af4c --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/text_multi_token_engine.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse.v2.text_generation import MultiEnginePrefill + + +def test_mult_engine_preprocess( + text_generation_attributes, pipeline_state, mock_kv_cache, mock_tokens_multiple +): + """ + Check if the multi-token engine preprocess operator can run based on the provided + tokens and prompt_sequence_length. 
+    """
+
+    seq_len, _ = text_generation_attributes
+    multi_prep = MultiEnginePrefill(
+        sequence_length=seq_len, prompt_sequence_length=len(mock_tokens_multiple)
+    )
+    inputs = {"tokens": mock_tokens_multiple, "kv_cache": mock_kv_cache}
+    # The number of tokens is equal to the prompt_sequence_length.
+    # Therefore, the multi_token_engine can run and can_operate() should be True.
+    assert multi_prep.can_operate(inputs)
+    outputs = multi_prep.run(
+        tokens=mock_tokens_multiple,
+        kv_cache=mock_kv_cache,
+        pipeline_state=pipeline_state,
+    )
+    # Expect 4 engine inputs: tokens, attention mask, causal, positions
+    assert len(outputs.get("engine_inputs")) == 4
+    tokens, attention_mask, positions, causal_mask = outputs.get("engine_inputs")
+    # Assert proper shapes for all engine_inputs
+    assert tokens.shape[-1] == len(mock_tokens_multiple)
+    assert attention_mask.shape[-1] == seq_len
+    assert positions.shape[-1] == len(mock_tokens_multiple)
+
+
+def test_multi_engine_preprocess_cant_operate(
+    text_generation_attributes, mock_kv_cache, mock_tokens
+):
+    """
+    Check if the multi-token engine preprocess operator can run based on the provided
+    tokens and prompt_sequence_length.
+    """
+    seq_len, _ = text_generation_attributes
+    multi_prep = MultiEnginePrefill(
+        sequence_length=seq_len, prompt_sequence_length=len(mock_tokens) + 1
+    )
+    inputs = {"tokens": mock_tokens, "kv_cache": mock_kv_cache}
+    # The prompt_sequence_length is one greater than the total number of tokens we're
+    # processing. Therefore, this operator should not run and can_operate() should be
+    # False.
+    assert not multi_prep.can_operate(inputs)

From 4f248ddba0b6a2776ceaa5a7662251a8f8e59b4e Mon Sep 17 00:00:00 2001
From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
Date: Mon, 13 Nov 2023 18:24:10 +0100
Subject: [PATCH 23/57] Delete tests/deepsparse/v2/unit/text_generation/test_msic.py

---
 .../v2/unit/text_generation/test_msic.py | 31 -------------------
 1 file changed, 31 deletions(-)
 delete mode 100644 tests/deepsparse/v2/unit/text_generation/test_msic.py

diff --git a/tests/deepsparse/v2/unit/text_generation/test_msic.py b/tests/deepsparse/v2/unit/text_generation/test_msic.py
deleted file mode 100644
index caa0cc2efd..0000000000
--- a/tests/deepsparse/v2/unit/text_generation/test_msic.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from deepsparse.v2.text_generation import CompilePromptLogits
-
-
-def test_compile_logits(mock_logits, mock_inference_state):
-    mock_inference_state.update_state({"prompt_logits": [mock_logits]})
-    compile_prompt_logits = CompilePromptLogits()
-    # Can operate as long as we're not in generation but in prompt_inference. This
-    # can_operate() will check for the `in_generation` flag in the input.
-    assert compile_prompt_logits.can_operate({})
-    output, state = compile_prompt_logits.run(
-        logits=mock_logits, inference_state=mock_inference_state
-    )
-    # The CompilePromptLogits is responsible for updating a list of prompt logits
-    # calculated at each step during prompt inference. After one step of running this
-    # operator, the total number of prompt_logits in the inference state should be
-    # the current length of prompt logits + 1
-    assert len(state.get("prompt_logits")) == len([mock_logits]) + 1

From 20980a769f2abe89f0247c5f7e58193286af9217 Mon Sep 17 00:00:00 2001
From: Benjamin Fineran
Date: Mon, 13 Nov 2023 16:12:35 -0500
Subject: [PATCH 24/57] [Continuous Batching] Queue Implementation to support batching grouping and prioritization (#1373)

* [Continuous Batching] Queue Implementation to support batching grouping and prioritization
* has_key method
* thread safety
* add blocking option for pop_batch
* update docstring
* allow mutex to be shared across continuous batching objects
* revert last commit
---
 src/deepsparse/v2/operators/__init__.py       |   2 +
 .../v2/schedulers/utils/__init__.py           |  18 ++
 .../utils/continuous_batching_queues.py       | 220 ++++++++++++++++++
 tests/deepsparse/v2/schedulers/__init__.py    |  13 ++
 .../v2/schedulers/utils/__init__.py           |  13 ++
 .../utils/test_continuous_batching_queues.py  | 177 ++++++++++++++
 6 files changed, 443 insertions(+)
 create mode 100644 src/deepsparse/v2/schedulers/utils/__init__.py
 create mode 100644 src/deepsparse/v2/schedulers/utils/continuous_batching_queues.py
 create mode 100644 tests/deepsparse/v2/schedulers/__init__.py
 create mode 100644 tests/deepsparse/v2/schedulers/utils/__init__.py
 create mode 100644 tests/deepsparse/v2/schedulers/utils/test_continuous_batching_queues.py

diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py
index 9d1a9812ac..bf58018493 100644
--- a/src/deepsparse/v2/operators/__init__.py
+++ b/src/deepsparse/v2/operators/__init__.py
@@ -1,4 +1,5 @@
 # flake8: noqa
+# isort: skip_file
 
 # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
 #
@@ -14,3 +15,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .operator import *
+from .engine_operator import *
diff --git a/src/deepsparse/v2/schedulers/utils/__init__.py b/src/deepsparse/v2/schedulers/utils/__init__.py
new file mode 100644
index 0000000000..e2e25b1c90
--- /dev/null
+++ b/src/deepsparse/v2/schedulers/utils/__init__.py
@@ -0,0 +1,18 @@
+# flake8: noqa
+# isort: skip_file
+
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from .continuous_batching_queues import * diff --git a/src/deepsparse/v2/schedulers/utils/continuous_batching_queues.py b/src/deepsparse/v2/schedulers/utils/continuous_batching_queues.py new file mode 100644 index 0000000000..84d4f38e3d --- /dev/null +++ b/src/deepsparse/v2/schedulers/utils/continuous_batching_queues.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from concurrent.futures import Future +from queue import Queue +from threading import Condition, Lock +from time import time +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple + + +__all__ = [ + "ContinuousBatchingQueue", + "ContinuousBatchingQueues", + "QueueEntry", +] + + +# maximum wait time of longest item in queue before it is prioritized +_MAX_WAIT_MS = 100 + + +class QueueEntry(NamedTuple): + value: Any + future: Optional[Future] + entry_time_ms: float + + def time_elapsed(self) -> float: + return _current_time_ms() - self.entry_time_ms + + +class ContinuousBatchingQueue(Queue): + """ + Extension of queue.Queue with helper functions for dequeueing valid + batch sizes for continuous batching + + :param batch_sizes: valid batch sizes that can be grouped for continuous + batching + """ + + def __init__(self, batch_sizes: List[int], *args, **kwargs): + super().__init__(*args, **kwargs) + + self._batch_sizes = batch_sizes + self._min_batch_size = min(self.batch_sizes) + + @property + def batch_sizes(self) -> List[int]: + """ + :return: valid batch sizes that this queue can return + """ + return self._batch_sizes + + def pop_batch(self) -> List[Any]: + """ + :return: + """ + batch_size = self.max_queued_batch_size() + if batch_size == 0: + raise RuntimeError( + f"Cannot create a batch with {self.qsize()} entries and valid " + f"batch sizes: {self.batch_sizes}" + ) + + return [self.get() for _ in range(batch_size)] + + def has_batch(self) -> bool: + """ + :return: True if a batch of valid size can be filled with the current qsize + """ + return self.qsize() >= self._min_batch_size + + def max_queued_batch_size(self) -> int: + """ + :return: the maximum batch size that can be filled by members of this queue + """ + num_entries = self.qsize() + max_size = 0 + + for batch_size in self.batch_sizes: + if num_entries >= batch_size > max_size: + # current batch size can be satisfied and is the largest so far + max_size = batch_size + + return max_size + + def peek(self): + """ + :return: threadsafe peek of the first item in the queue + """ + with self.mutex: + return self.queue[0] + + +class ContinuousBatchingQueues: + """ + Threadsafe collection of Queues designed to support continuous batching. + Each Queue should be keyed by an operator where possible, however keys + are kept generic. + + On request for next - a job will be returned with an operator key and + a batch of inputs. 
The default heuristic for the next job will be + a combination of wait time and largest batch that can be run + """ + + def __init__(self): + self._queues = {} # Dict[Any, ContinuousBatchingQueue] + self._mutex = Lock() + + # add condition for wait/notify when an item is added to any queue + self._item_added = Condition(self._mutex) + + def __contains__(self, key: Any) -> bool: + """ + :param key: key to look up + :return: True if the given key has a queue in this group + """ + with self._mutex: + return key in self._queues + + def add_queue(self, key: Any, batch_sizes: List[int]): + """ + Adds a queue for a single operator that can be run at multiple batch sizes + + :param key: key to identify queue with, preferably the engine operator + :param batch_sizes: batch sizes that the operator can be run at + """ + with self._mutex: + self._queues[key] = ContinuousBatchingQueue(batch_sizes=batch_sizes) + + def add_queue_item(self, key: Any, item: Any, future: Optional[Future] = None): + """ + Adds an item to the given queue + + :param key: key for queue to add to + :param item: item to add in queue + :param future: optional future that should be used for resolution of value + """ + if key not in self: + raise KeyError(f"Cannot add item to queue for unregistered key {key}") + + entry = QueueEntry(value=item, future=future, entry_time_ms=_current_time_ms()) + + with self._mutex: + self._queues[key].put(entry) + self._item_added.notify() + + def has_next_batch(self) -> bool: + """ + :return: true if any Queue has enough entries to fill a valid batch size + """ + with self._mutex: + return any(queue.has_batch() for queue in self._queues.values()) + + def pop_batch( + self, + select_fn: Callable[[Dict[Any, ContinuousBatchingQueue]], Any] = None, + block: bool = True, + ) -> Tuple[Any, List[QueueEntry]]: + """ + :param select_fn: function that takes in a dictionary of queue key + (i.e. EngineOperator) to its ContinuousBatchingQueue of QueueItem + objects and returns the key of the queue that should be returned. + Only keys with queues large enough to fill a batch will be given. + If not provided, the default select_fn will return the queue that + can fill the largest batch size, or the queue that has the first item + with the longest wait time if that time is over 100ms. + :param block: if True, will wait for a valid batch to be in a queue before + popping and returning, if False, will raise an error if a full batch + cannot be popped. 
Default True + :return: Tuple of the queue key (EngineOperator) and + batch of QueueEntry objects as a list that have been popped and should + be run as a batch + """ + with self._mutex: + while not (valid_queues := self._filter_empty_queues()): + if block: + # wait to search for a valid queue again until a new item is added + self._item_added.wait() + else: + raise RuntimeError( + "Cannot pop_batch when no queues have enough items to fill " + "a valid batch size, check with has_next_batch before calling " + "pop_batch" + ) + + select_fn = select_fn or _default_select_fn + selected_key = select_fn(valid_queues) + + return selected_key, self._queues[selected_key].pop_batch() + + def _filter_empty_queues(self) -> Dict[Any, ContinuousBatchingQueue]: + return {key: queue for key, queue in self._queues.items() if queue.has_batch()} + + +def _default_select_fn(queues: Dict[Any, ContinuousBatchingQueue]) -> Any: + # find the maximum wait time of a queue + wait_times = [(key, queue.peek().time_elapsed()) for key, queue in queues.items()] + max_wait_key, max_wait = max(wait_times, key=lambda x: x[1]) # key on time + + if max_wait >= _MAX_WAIT_MS: + # if max time is greater than the threshold return that queue + return max_wait_key + + # default to the largest batch size that can be satisfied + return max(queues.keys(), key=lambda key: queues[key].max_queued_batch_size()) + + +def _current_time_ms(): + return time() * 1000 diff --git a/tests/deepsparse/v2/schedulers/__init__.py b/tests/deepsparse/v2/schedulers/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/tests/deepsparse/v2/schedulers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/v2/schedulers/utils/__init__.py b/tests/deepsparse/v2/schedulers/utils/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/tests/deepsparse/v2/schedulers/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_queues.py b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_queues.py new file mode 100644 index 0000000000..1713d54f82 --- /dev/null +++ b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_queues.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from threading import Thread + +import pytest +from deepsparse.v2.schedulers.utils import ( + ContinuousBatchingQueue, + ContinuousBatchingQueues, + QueueEntry, +) + + +@pytest.mark.parametrize( + "batch_sizes,num_entries,expected_batch_size", + [ + ([1, 4, 8], 20, 8), + ([1, 4, 8], 6, 4), + ([1, 4, 8], 4, 4), + ([1, 4, 8], 3, 1), + ([4], 5, 4), + ], +) +def test_queue_single_pop(batch_sizes, num_entries, expected_batch_size): + queue = ContinuousBatchingQueue(batch_sizes=batch_sizes) + assert not queue.has_batch() + for i in range(num_entries): + queue.put(i) + + assert queue.has_batch() + assert queue.max_queued_batch_size() == expected_batch_size + + batch = queue.pop_batch() + assert len(batch) == expected_batch_size + assert batch == list(range(expected_batch_size)) + + +def test_queue_multi_pop(): + queue = ContinuousBatchingQueue(batch_sizes=[2, 4, 8]) + + for i in range(23): + if i < 2: + assert not queue.has_batch() + else: + assert queue.has_batch() + queue.put(i) + + def pop_and_assert_queue_size_and_pop(expected_qsize, expected_batch_size): + assert queue.qsize() == expected_qsize + assert queue.has_batch() + assert queue.max_queued_batch_size() == expected_batch_size + assert len(queue.pop_batch()) == expected_batch_size + + # pop items from queue, checkign remaining qsize and correct batch size is popped + pop_and_assert_queue_size_and_pop(23, 8) + pop_and_assert_queue_size_and_pop(15, 8) + pop_and_assert_queue_size_and_pop(7, 4) + pop_and_assert_queue_size_and_pop(3, 2) + + assert not queue.has_batch() + queue.put(23) + pop_and_assert_queue_size_and_pop(2, 2) + + assert queue.empty() + + +def test_queue_invalid_pop(): + queue = ContinuousBatchingQueue(batch_sizes=[4, 8]) + for i in range(3): + queue.put(i) + + with pytest.raises(RuntimeError): + # queue size 3, min batch size 4 + queue.pop_batch() + + +def test_queues_pop_batch_max_valid_batch(): + queues = ContinuousBatchingQueues() + + queues.add_queue("key_1", [2, 4]) + queues.add_queue("key_2", [3]) + + assert not queues.has_next_batch() + + queues.add_queue_item("key_1", 1) + queues.add_queue_item("key_1", 2) + assert queues.has_next_batch() + + queues.add_queue_item("key_2", 1) + queues.add_queue_item("key_2", 2) + queues.add_queue_item("key_2", 3) + # NOTE - if this block takes more than 100ms, test may fail + # as timeout may lead key_1 to be popped first + + # key_2 should be popped first because it has larger loaded batch size + first_popped_key, first_popped_batch = queues.pop_batch() + assert first_popped_key == "key_2" + assert len(first_popped_batch) == 3 + assert all(isinstance(item, QueueEntry) for item in first_popped_batch) + + assert queues.has_next_batch() + + second_popped_key, second_popped_batch = queues.pop_batch() + assert second_popped_key == "key_1" + assert len(second_popped_batch) == 2 + assert all(isinstance(item, QueueEntry) for item in second_popped_batch) + + +def test_queues_pop_batch_time_elapsed_priority(): + 
queues = ContinuousBatchingQueues() + + queues.add_queue("key_1", [2, 4]) + queues.add_queue("key_2", [3]) + + assert not queues.has_next_batch() + + queues.add_queue_item("key_1", 1) + queues.add_queue_item("key_1", 2) + assert queues.has_next_batch() + + # sleep 150ms (time threshold is 100ms) + time.sleep(0.15) + + queues.add_queue_item("key_2", 1) + queues.add_queue_item("key_2", 2) + queues.add_queue_item("key_2", 3) + + # key 1 should be popped first because its first item has been waiting longer + # than the time threshold and key_2 was just added + + popped_key, popped_batch = queues.pop_batch() + assert popped_key == "key_1" + assert len(popped_batch) == 2 + + +def test_queues_pop_batch_blocking(): + queues = ContinuousBatchingQueues() + queues.add_queue("key_1", [2]) + + def test_fn(): + # pop batch and block until true + key, batch = queues.pop_batch(block=True) + # compare to expected results + assert key == "key_1" + assert batch == [1, 2] + + # start a thread to pop batch + # it should hang indefinitely because block=True and there are no items yet in queue + thread = Thread(target=queues.pop_batch) + thread.start() + + # confirm thread is still running + assert thread.is_alive() + time.sleep(0.15) + # sleep and confirm thread is still hanging + assert thread.is_alive() + + # confirm thread still runs after a single insertion (min batch size is 2) + queues.add_queue_item("key_1", 1) + assert thread.is_alive() + + # add a second item and assert thread finishes + queues.add_queue_item("key_1", 2) + time.sleep(0.1) + assert not thread.is_alive() From d81012d0e942d10ce5462027f14d87dd1cdf77bf Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 13 Nov 2023 16:22:30 -0500 Subject: [PATCH 25/57] [Continuous Batching] Executor thread for running continuous batching (#1374) * [Continuous Batching] Executor thread for running continuous batching * quality * ensure that executor stops when main thread does - clean up test hack --- .../v2/operators/engine_operator.py | 32 +++++++ .../v2/schedulers/utils/__init__.py | 1 + .../utils/continuous_batching_executor.py | 79 ++++++++++++++++++ .../test_continuous_batching_executor.py | 83 +++++++++++++++++++ 4 files changed, 195 insertions(+) create mode 100644 src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py create mode 100644 tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index bd58aefafa..9ee8d734c5 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -39,6 +39,28 @@ class EngineOperatorInputs(BaseModel): default=None, ) + @classmethod + def join(cls, inputs: List["EngineOperatorInputs"]) -> "EngineOperatorInputs": + """ + :param inputs: list of separate EngineOperatorInputs, batch size must be 1 + :return: list of inputs joined into a single input with a multi batch size + """ + all_engine_inputs = [engine_input.engine_inputs for engine_input in inputs] + + for engine_inputs in all_engine_inputs: + if engine_inputs[0].shape[0] != 1: + raise RuntimeError( + "join requires all inputs to have batch size 1, found input with " + f"batch size {engine_inputs[0].shape[0]}" + ) + + # use join_engine_outputs since dtype is the same + joined_engine_inputs = join_engine_outputs( + all_engine_inputs, len(all_engine_inputs) + ) + + return cls(engine_inputs=joined_engine_inputs) + class Config: arbitrary_types_allowed = True @@ -46,6 
+68,16 @@ class Config: class EngineOperatorOutputs(BaseModel): engine_outputs: List = Field(description="engine outputs") + def split(self) -> List["EngineOperatorOutputs"]: + """ + :return: list of the current outputs split to a batch size of 1 each + """ + # using split_engine_inputs since input/output dtypes + # are the same (List[ndarray]) + split_outputs, _ = split_engine_inputs(self.engine_outputs, batch_size=1) + + return [self.__class__(engine_outputs=outputs) for outputs in split_outputs] + class EngineOperator(Operator): input_schema = EngineOperatorInputs diff --git a/src/deepsparse/v2/schedulers/utils/__init__.py b/src/deepsparse/v2/schedulers/utils/__init__.py index e2e25b1c90..521341a7fc 100644 --- a/src/deepsparse/v2/schedulers/utils/__init__.py +++ b/src/deepsparse/v2/schedulers/utils/__init__.py @@ -16,3 +16,4 @@ # limitations under the License. from .continuous_batching_queues import * +from .continuous_batching_executor import * diff --git a/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py b/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py new file mode 100644 index 0000000000..86afdf309c --- /dev/null +++ b/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
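The `join` and `split` helpers added to the engine operator schemas above are what let the executor run several batch-size-1 requests as a single engine call. A minimal round-trip sketch, assuming the helpers concatenate and split along the leading batch dimension:

```python
import numpy

from deepsparse.v2.operators.engine_operator import (
    EngineOperatorInputs,
    EngineOperatorOutputs,
)

# two independent batch-size-1 inputs are joined into one batch-size-2 input
single_inputs = [
    EngineOperatorInputs(engine_inputs=[numpy.zeros((1, 3), dtype=numpy.float32)]),
    EngineOperatorInputs(engine_inputs=[numpy.ones((1, 3), dtype=numpy.float32)]),
]
joined = EngineOperatorInputs.join(single_inputs)
assert joined.engine_inputs[0].shape == (2, 3)

# a batch-size-2 output is split back into one output per original request
outputs = EngineOperatorOutputs(
    engine_outputs=[numpy.zeros((2, 3), dtype=numpy.float32)]
)
split = outputs.split()
assert len(split) == 2
assert split[0].engine_outputs[0].shape == (1, 3)
```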
+ +from threading import Thread +from typing import Dict + +from deepsparse import Engine +from deepsparse.v2.operators import EngineOperator +from deepsparse.v2.schedulers.utils.continuous_batching_queues import ( + ContinuousBatchingQueues, +) + + +__all__ = [ + "ContinuousBatchingExecutorThread", +] + + +class ContinuousBatchingExecutorThread(Thread): + """ + Thread that when started runs indefinitely, grabbing a valid batch from + the queues when possible and running them in the correct engine + + :param queues: ContinuousBatchingQueues object containing a queue for + each valid engine + :param operators_to_engines: dictionary mapping valid engine operators + to a dictionary of its valid batch sizes mapped to an engine compiled + for that batch size + """ + + def __init__( + self, + queues: ContinuousBatchingQueues, + operators_to_engines: Dict[EngineOperator, Dict[int, Engine]], + ): + self._queues = queues + self._operators_to_engines = operators_to_engines + self._should_stop = False + + super().__init__(target=self._working_loop) + self.daemon = True # worker thread should exit when main thread exits + + def _working_loop(self): + # indefinitely wait for batch, run batch, split and resolve futures + while True: + # wait for next batch to be available + engine_operator, batch = self._queues.pop_batch(block=True) + + # unpack batch of QueueEntry objects + engine_inputs, futures, _ = list(zip(*batch)) + batch_size = len(engine_inputs) + + # type is EngineOperatorInputs + joined_inputs = engine_operator.input_schema.join(engine_inputs) + + # get engine for this operator compiled to the popped batch size + # and set the inputs to execute with it + joined_inputs.engine = self._operators_to_engines[engine_operator][ + batch_size + ] + + # run the engine operator with the given engine at the joined batch size + joined_outputs = engine_operator(joined_inputs) + + # split outputs and return the results to their respective futures + split_outputs = joined_outputs.split() + for output, future in zip(split_outputs, futures): + future.set_result(output) diff --git a/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py new file mode 100644 index 0000000000..1d5ed9d92b --- /dev/null +++ b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
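Seen from a request thread, the worker above turns an enqueued item into a resolved `Future`. A minimal sketch of that hand-off, where `engine_operator` (a compiled `EngineOperator`) and `single_input` (a batch-size-1 `engine_operator.input_schema` instance) are placeholders, much like in the test that follows:

```python
from concurrent.futures import Future

from deepsparse.v2.schedulers.utils import (
    ContinuousBatchingExecutorThread,
    ContinuousBatchingQueues,
)

queues = ContinuousBatchingQueues()
queues.add_queue(engine_operator, batch_sizes=[1])

# map the operator to an engine per valid batch size (only batch size 1 here)
worker = ContinuousBatchingExecutorThread(
    queues, {engine_operator: {1: engine_operator.engine}}
)
worker.start()

# the request side: enqueue one item with a Future and block on the result;
# the worker pops the batch, joins inputs, runs the engine, splits outputs,
# and resolves each Future with its own batch-size-1 output
future = Future()
queues.add_queue_item(engine_operator, single_input, future=future)
result = future.result()
```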
+ +import time +from concurrent.futures import Future + +import numpy + +from deepsparse.v2.operators import EngineOperator +from deepsparse.v2.schedulers.utils import ( + ContinuousBatchingExecutorThread, + ContinuousBatchingQueues, +) + + +def test_continuous_batching_executor_thread(): + # mobilenet model with batch_size=2 + engine_operator = EngineOperator("zoo:mobilenet_v2-1.0-imagenet-base", batch_size=2) + + # create queues object and add operator + queues = ContinuousBatchingQueues() + queues.add_queue(engine_operator, batch_sizes=[2]) + + # create engine map + operators_to_engines = {engine_operator: {2: engine_operator.engine}} + + worker_thread = ContinuousBatchingExecutorThread(queues, operators_to_engines) + + # thread not started yet + assert not worker_thread.is_alive() + + # start and assert thread is alive + worker_thread.start() + assert worker_thread.is_alive() + + # create first input and add it to queue + input_1 = engine_operator.input_schema( + engine_inputs=[numpy.random.randn(1, 3, 224, 224).astype(numpy.float32)] + ) + future_1 = Future() + queues.add_queue_item(engine_operator, input_1, future=future_1) + + # assert that future is not yet resolved + assert not future_1.done() + + # create second input and add it to queue + input_2 = engine_operator.input_schema( + engine_inputs=[numpy.random.randn(1, 3, 224, 224).astype(numpy.float32)] + ) + future_2 = Future() + queues.add_queue_item(engine_operator, input_2, future=future_2) + + # wait 1 second to give engine time to complete + time.sleep(1) + + assert future_1.done() + assert future_2.done() + + result_1 = future_1.result() + result_2 = future_2.result() + + assert isinstance(result_1, engine_operator.output_schema) + assert isinstance(result_2, engine_operator.output_schema) + + def assert_batch_size_one(arrays): + for array in arrays: + assert array.shape[0] == 1 + + # make sure only a single batch item was returned to each future + # TODO: test that the correct bs1 item is returned (can test against bs1 engine) + assert_batch_size_one(result_1.engine_outputs) + assert_batch_size_one(result_2.engine_outputs) From 5c48505eacc1ef49635d1b0c865aa5c10f768381 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 13 Nov 2023 16:24:17 -0500 Subject: [PATCH 26/57] [ContinuousBatching] ContinuousBatchingScheduler Implementation (#1375) * [ContinuousBatching] ContinuousBatchingScheduler Implementation * cleanup unnecessary stop condition --- src/deepsparse/v2/schedulers/__init__.py | 2 + .../continuous_batching_scheduler.py | 141 ++++++++++++++++++ .../test_continuous_batching_scheduler.py | 48 ++++++ 3 files changed, 191 insertions(+) create mode 100644 src/deepsparse/v2/schedulers/continuous_batching_scheduler.py create mode 100644 tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py diff --git a/src/deepsparse/v2/schedulers/__init__.py b/src/deepsparse/v2/schedulers/__init__.py index 04c37077e1..b4d78521ab 100644 --- a/src/deepsparse/v2/schedulers/__init__.py +++ b/src/deepsparse/v2/schedulers/__init__.py @@ -1,4 +1,5 @@ # flake8: noqa +# isort: skip_file # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
# @@ -16,3 +17,4 @@ from .scheduler import * from .scheduler_group import * +from .continuous_batching_scheduler import * diff --git a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py new file mode 100644 index 0000000000..96e0a502b6 --- /dev/null +++ b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future +from threading import Lock +from typing import List + +from deepsparse.v2.operators import EngineOperator, Operator +from deepsparse.v2.schedulers.scheduler import OperatorScheduler +from deepsparse.v2.schedulers.utils import ( + ContinuousBatchingExecutorThread, + ContinuousBatchingQueues, +) + + +__all__ = ["ContinuousBatchingScheduler"] + + +class ContinuousBatchingScheduler(OperatorScheduler): + """ + Manages EngineOperator jobs that should be run with continuous batching. + Groups requests for the same engine into larger batches and returns + the result to the respeictive request threads after scheduled completion + + :param max_workers: maximum number of threads to execute at once, default 1 + """ + + def __init__(self, max_workers: int = 1): + self._max_workers = max_workers + + self._mutex = Lock() + + # Dict[EngineOperator, Dict[batch_size, Engine]] + self._operators_to_engines = {} # EngineOperator -> Dict[batch_size, Engine] + self._queues = ContinuousBatchingQueues() + + # create and start max number of worker threads + self._threads = [ + ContinuousBatchingExecutorThread(self._queues, self._operators_to_engines) + for _ in range(self.max_workers) + ] + for worker_thread in self._threads: + worker_thread.start() + + @property + def max_workers(self) -> int: + """ + :return: maximum number of threads to execute at once + """ + return self._max_workers + + def submit(self, *args, operator: Operator, **kwargs) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :return: future referencing the asynchronously run output of the operator + """ + inputs = args[0] + if not isinstance(inputs, operator.input_schema): + raise ValueError( + "Inputs to ContinuousBatchingScheduler must be the specific " + f"input schema to the given operator. Expected {operator.input_schema}" + f"found {type(inputs)}" + ) + + future = Future() + self._queues.add_queue_item(key=operator, item=inputs, future=future) + + return future + + def can_process(self, *args, operator: Operator, **kwargs) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. 
+ SchedulerGroup always returns True + """ + return operator in self._operators_to_engines and operator in self._queues + + def add_engine_operator( + self, engine_operator: EngineOperator, batch_sizes: List[int] + ): + """ + Adds tracking for an engine operator to this scheduler + with continuous batching for the given sizes + + :param engine_operator: an EngineOperator, must be compiled with + batch_size=1 + :param batch_sizes: batch sizes to use for continuous batching + """ + # lock updates to _operators_to_engines while updating + self._mutex.acquire() + + # validation + if engine_operator in self._operators_to_engines: + # operator already added + return + + if not isinstance(engine_operator, EngineOperator): + raise ValueError( + f"Expected an EngineOperator instance, found {type(engine_operator)}" + ) + if engine_operator.batch_size != 1: + raise ValueError( + "For continuous batching, EngineOperator must have batch_size=1. " + f"found batch_size={engine_operator.batch_size}" + ) + + # build EngineOperator -> List[batch_size] dict + operator_engines = {} + # base engine, expected batch size is 1 + operator_engines[engine_operator.batch_size] = engine_operator.engine + + # compile auxillary engines for continuous batching + for batch_size in batch_sizes: + if batch_size == 1: + continue # already added + operator_engines[batch_size] = operator_engines.create_engine( + batch_size=batch_size + ) + + self._operators_to_engines[engine_operator] = operator_engines + self._queues.add_queue( + key=engine_operator, + batch_sizes=list(operator_engines.keys()), + ) + + # release lock + self._mutex.release() diff --git a/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py b/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py new file mode 100644 index 0000000000..7ed49de004 --- /dev/null +++ b/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py @@ -0,0 +1,48 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from concurrent.futures import Future + +import numpy + +from deepsparse.v2.operators import EngineOperator +from deepsparse.v2.schedulers import ContinuousBatchingScheduler + + +def test_continuous_batching_executor_thread(): + # simple test that ContinuousBatchingScheduler can be instantiated and return + # a result from a request, for testing multi-batch execution, making enough + # concurrent requests guarantee batched execution is out of scope + scheduler = ContinuousBatchingScheduler() + + # mobilenet model with batch_size=2 + engine_operator = EngineOperator( + "zoo:mobilenet_v2-1.0-imagenet-base", + batch_size=1, + ) + + scheduler.add_engine_operator(engine_operator, [1]) + + # submit job to scheduler and expect future to be returned + engine_input = engine_operator.input_schema( + engine_inputs=[numpy.random.randn(1, 3, 224, 224).astype(numpy.float32)] + ) + future = scheduler.submit(engine_input, operator=engine_operator) + assert isinstance(future, Future) + assert not future.done() # assume this runs before engine has a chance to complete + + # assert that output resolves and contains a numpy array + engine_output = future.result() + assert isinstance(engine_output, engine_operator.output_schema) + assert isinstance(engine_output.engine_outputs[0], numpy.ndarray) From e1b7f3703fc91429f20fd7b79e06487898f4fa6e Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 13 Nov 2023 16:27:26 -0500 Subject: [PATCH 27/57] [continuous batching] singleton pattern for scheduler (#1391) * [continuous batching] singleton pattern for scheduler * catch from review --- .../continuous_batching_scheduler.py | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py index 96e0a502b6..669c5922a0 100644 --- a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py +++ b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py @@ -28,11 +28,32 @@ __all__ = ["ContinuousBatchingScheduler"] +_GLOBAL_SCHEDULER = None + + class ContinuousBatchingScheduler(OperatorScheduler): """ Manages EngineOperator jobs that should be run with continuous batching. Groups requests for the same engine into larger batches and returns - the result to the respeictive request threads after scheduled completion + the result to the respective request threads after scheduled completion + + Example code for getting or creating a shared instance for scheduling + between pipelines and adding an engine operator to the scheduler + within a pipeline + + ```python + + class MyPipeline(Pipeline): + + def __init__(self): + ... + engine_operator = EngineOperator(...) + ... + continuous_batching_scheduler = ContinuousBatchingScheduler.get_instance() + continuous_batching_scheduler.add_engine_operator(engine_operator) + + super.__init__(...) + ``` :param max_workers: maximum number of threads to execute at once, default 1 """ @@ -54,6 +75,19 @@ def __init__(self, max_workers: int = 1): for worker_thread in self._threads: worker_thread.start() + @classmethod + def get_instance(cls) -> "ContinuousBatchingScheduler": + """ + :return: global instance of the continuous batching scheduler. 
If one + does not exist yet, a scheduler with a single worker thread to + schedule all jobs is created and started + """ + if _GLOBAL_SCHEDULER is not None: + return _GLOBAL_SCHEDULER # noqa: F823 + + _GLOBAL_SCHEDULER = cls(max_workers=1) + return _GLOBAL_SCHEDULER + @property def max_workers(self) -> int: """ From bbd534da76610a8bc0d6d55352cf0fb65737985e Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 14 Nov 2023 10:47:40 +0100 Subject: [PATCH 28/57] [Pipeline Refactor][Text-Generation] Create a helper function for creating engine_inputs (#1364) * rebasing off my initial commit * cleanups * unit testing for text generation operators * additional changes * unit testing completion * remove debug * fix * add todo * more clean-up * fix test * add docstrings/comments * break out tests to individual unit test files; add conftest and make scope of fixtures module to help with speed * Delete tests/deepsparse/v2/unit/text_generation/test_msic.py --------- Co-authored-by: Dipika Sikka --- src/deepsparse/transformers/utils/helpers.py | 92 ++++++++++++++++++- .../autoregressive_preprocess_operator.py | 34 ++----- .../multi_engine_prefill_operator.py | 81 +++------------- .../transformers/utils/test_helpers.py | 74 +++++++++++++++ 4 files changed, 185 insertions(+), 96 deletions(-) diff --git a/src/deepsparse/transformers/utils/helpers.py b/src/deepsparse/transformers/utils/helpers.py index 38e3ec4a4c..648bdef9cf 100644 --- a/src/deepsparse/transformers/utils/helpers.py +++ b/src/deepsparse/transformers/utils/helpers.py @@ -14,7 +14,7 @@ import logging import pathlib import uuid -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy from transformers import AutoTokenizer, GenerationConfig @@ -33,6 +33,7 @@ "override_config", "process_generation_config", "validate_session_ids", + "compute_engine_inputs", "set_generated_length", ] @@ -82,6 +83,95 @@ def set_generated_length( ) +def compute_engine_inputs(onnx_input_names: str, **kwargs) -> List[numpy.ndarray]: + """ + Given the names of the onnx inputs, compute the inputs + to the engine. The inputs will be calculating from the + passed kwargs. The information about the required kwargs + can be found in the docstring of the individual compute + functions. 
+ + :param onnx_input_names: The names of the onnx inputs + :param kwargs: The kwargs to compute the inputs from + :return: The computed inputs to the engine + """ + engine_inputs = [] + for input_name in onnx_input_names: + if input_name == "causal_mask": + # delay the computation of the causal mask + continue + # fetch the compute function for the + # given input_name + compute_func = _get_compute_func(input_name) + # compute the engine input from the kwargs + # and append it to the engine_inputs + engine_inputs.append(compute_func(**kwargs)) + + if "causal_mask" in onnx_input_names: + # compute the causal mask and append it to the engine_inputs + input_ids, attention_mask, *_ = engine_inputs + engine_inputs.append(create_causal_mask(input_ids, attention_mask)) + + return engine_inputs + + +def _get_compute_func(input_name: str) -> Callable[..., numpy.ndarray]: + # given the input_name, return the appropriate compute function + compute_func = { + "input_ids": _compute_input_ids, + "attention_mask": _compute_attention_mask, + "positions": _compute_positions, + }.get(input_name) + if compute_func is None: + raise ValueError( + "Could not find compute function " f"for the input_name: {input_name}" + ) + return compute_func + + +def _compute_input_ids(token_batch: List[int], **kwargs) -> numpy.ndarray: + # convert the token_batch to a numpy array + return numpy.array([token_batch]) + + +def _compute_attention_mask( + sequence_length: int, + prompt_sequence_length: int, + num_total_processed_tokens: int, + **kwargs, +) -> numpy.ndarray: + # create a fully masked attention mask with the appropriate + # shape (equal to the sequence_length) + attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64) + # unmask the appropriate number of tokens, the sum of + # - the number of tokens already processed and cached (num_total_processed_tokens) + # - the number of tokens currently processed (prompt_sequence_length) + # the sum cannot exceed the maximum length of the attention_mask + num_attention_entries_to_unmask = min( + num_total_processed_tokens + prompt_sequence_length, sequence_length + ) + # unmask the bits from the right-hand side + attention_mask[:, -num_attention_entries_to_unmask:] = 1 + return attention_mask + + +def _compute_positions( + num_total_processed_tokens: int, prompt_sequence_length: int, **kwargs +): + # create the positions array with the appropriate shape + # positions count starts from the number of tokens already processed + # and ends at the number of tokens already processed + the number of tokens + # currently processed + return ( + numpy.arange( + num_total_processed_tokens, + num_total_processed_tokens + prompt_sequence_length, + ) + .reshape(1, -1) + .astype(numpy.int64) + ) + + def validate_session_ids( session_ids: Optional[str], other_attributes: Dict[str, Any] ) -> Optional[List[str]]: diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py index 6e97412e43..17d8dd662c 100644 --- a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -15,9 +15,7 @@ import logging from typing import Any -import numpy - -from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.transformers.utils.helpers import compute_engine_inputs from deepsparse.v2.operators import Operator from deepsparse.v2.utils import PipelineState @@ -66,30 +64,16 @@ def 
run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwarg num_total_processed_tokens = kv_cache.total_num_processed_tokens new_token = tokens[num_total_processed_tokens] - engine_input_names = pipeline_state.current_state.get( - "onnx_input_names_no_cache" - ) - - # padding is added to left, so attention mask is 1s from the - # right up to the number of total tokens (prompt + generated) - attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - num_attention_entries_to_unmask = min( - num_total_processed_tokens + 1, self.sequence_length - ) # cap by seq len - attention_mask[:, -num_attention_entries_to_unmask:] = 1 - positions = numpy.array([[num_total_processed_tokens]], dtype=numpy.int64) - input_ids = numpy.array([[new_token]]) - causal_mask = create_causal_mask(input_ids, attention_mask) - engine_inputs_map = dict( - input_ids=input_ids, - attention_mask=attention_mask, - causal_mask=causal_mask, - positions=positions, + engine_inputs = compute_engine_inputs( + onnx_input_names=pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ), + token_batch=[new_token], + prompt_sequence_length=1, + sequence_length=self.sequence_length, + num_total_processed_tokens=num_total_processed_tokens, ) - - engine_inputs = [engine_inputs_map[name] for name in engine_input_names] - return { "engine_inputs": engine_inputs, "kv_cache": kv_cache, diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py index 9a885c2355..513c34dfc2 100644 --- a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -13,12 +13,9 @@ # limitations under the License. import logging -from enum import Enum from typing import Any -import numpy - -from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.transformers.utils.helpers import compute_engine_inputs from deepsparse.v2.operators import Operator from deepsparse.v2.utils import PipelineState @@ -28,34 +25,14 @@ __all__ = ["MultiEnginePrefill"] -class OnnxInputNames(Enum): - INPUT_IDS = "input_ids" - ATTN_MASK = "attention_mask" - CAUSAL_MASK = "causal_mask" - POSITIONS = "positions" - - -# NOTE: A possible clean-up could involve combining this Operator and the -# autoregressive_preprocess_operator - - class MultiEnginePrefill(Operator): def __init__(self, prompt_sequence_length, sequence_length): """ Prepare the tokens for the multi-token engine. This requires creating the - attention mask, positions, and causal mask. The output contains these three - arrays to be passed into the multi-token engine. + appropriate engine_inputsto be passed into the multi-token engine. """ self.prompt_sequence_length = prompt_sequence_length self.sequence_length = sequence_length - self.cases = { - OnnxInputNames.ATTN_MASK.value: self._case_attn_mask, - OnnxInputNames.POSITIONS.value: self._case_positions, - } - _LOGGER.warn( - "This operator requires the PipelineState to be set-up with the " - "onnx_input_names_no_cache attribute set from the NLEngineOperator." 
- ) def can_operate(self, inp: Any): """ @@ -75,59 +52,23 @@ def can_operate(self, inp: Any): return True return False - def _case_attn_mask(self, num_total_processed_tokens: int): - # create an empty attention mask - engine_input = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - # calculate the number of entries in attention mask that should be set to 1 - num_attention_entries_to_unmask = min( - num_total_processed_tokens + self.prompt_sequence_length, - self.sequence_length, - ) - engine_input[:, -num_attention_entries_to_unmask:] = 1 - return engine_input - - def _case_positions(self, num_total_processed_tokens: int): - return ( - numpy.arange( - num_total_processed_tokens, - num_total_processed_tokens + self.prompt_sequence_length, - ) - .reshape(1, -1) - .astype(numpy.int64) - ) - def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): kv_cache.set_capacity(self.sequence_length - self.prompt_sequence_length) - onnx_input_names_no_cache = pipeline_state.current_state.get( - "onnx_input_names_no_cache" - ) - num_total_processed_tokens = kv_cache.total_num_processed_tokens start = num_total_processed_tokens end = start + self.prompt_sequence_length token_batch = tokens[start:end] - engine_inputs = [] - for name in onnx_input_names_no_cache: - if name == OnnxInputNames.INPUT_IDS.value: - engine_input = numpy.array([token_batch]) - elif ( - name == OnnxInputNames.ATTN_MASK.value - or name == OnnxInputNames.POSITIONS.value - ): - engine_input = self.cases[name](num_total_processed_tokens) - elif name == OnnxInputNames.CAUSAL_MASK.value: - continue - - engine_inputs.append(engine_input) - - if OnnxInputNames.CAUSAL_MASK.value in onnx_input_names_no_cache: - causal_mask = create_causal_mask( - input_ids=engine_inputs[0], - attention_mask=engine_inputs[1], - ) - engine_inputs.append(causal_mask) + engine_inputs = compute_engine_inputs( + onnx_input_names=pipeline_state.current_state.get( + "onnx_input_names_no_cache" + ), + token_batch=token_batch, + prompt_sequence_length=self.prompt_sequence_length, + sequence_length=self.sequence_length, + num_total_processed_tokens=num_total_processed_tokens, + ) return { "engine_inputs": engine_inputs, diff --git a/tests/deepsparse/transformers/utils/test_helpers.py b/tests/deepsparse/transformers/utils/test_helpers.py index 7fcadcbf9c..95e4ee7fa7 100644 --- a/tests/deepsparse/transformers/utils/test_helpers.py +++ b/tests/deepsparse/transformers/utils/test_helpers.py @@ -16,12 +16,86 @@ import pytest from deepsparse.transformers.utils.helpers import ( + compute_engine_inputs, create_causal_mask, initialize_kv_cache_state, validate_session_ids, ) +@pytest.mark.parametrize( + "onnx_input_names, " + "token_batch, " + "prompt_sequence_length, " + "sequence_length, " + "num_total_processed_tokens, " + "expected_engine_inputs", + [ + ( + ["input_ids", "attention_mask", "positions"], + [1, 2, 3], + 3, + 6, + 2, + [ + numpy.array([[1, 2, 3]]), + numpy.array([[0, 1, 1, 1, 1, 1]]), + numpy.array([[2, 3, 4]]), + ], + ), + ( + ["input_ids", "attention_mask", "positions", "causal_mask"], + [1, 2, 3], + 3, + 6, + 2, + [ + numpy.array([[1, 2, 3]]), + numpy.array([[0, 1, 1, 1, 1, 1]]), + numpy.array([[2, 3, 4]]), + create_causal_mask( + input_ids=numpy.array([[1, 2, 3]]), + attention_mask=numpy.array([[0, 1, 1, 1, 1, 1]]), + ), + ], + ), + ( + ["input_ids", "attention_mask", "positions", "causal_mask"], + [15], + 1, + 5, + 3, + [ + numpy.array([[15]]), + numpy.array([[0, 1, 1, 1, 1]]), + numpy.array([[3]]), + create_causal_mask( + 
input_ids=numpy.array([[15]]), + attention_mask=numpy.array([[0, 1, 1, 1, 1]]), + ), + ], + ), + ], +) +def test_compute_engine_inputs( + onnx_input_names, + token_batch, + prompt_sequence_length, + sequence_length, + num_total_processed_tokens, + expected_engine_inputs, +): + engine_inputs = compute_engine_inputs( + onnx_input_names=onnx_input_names, + token_batch=token_batch, + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + num_total_processed_tokens=num_total_processed_tokens, + ) + for x, y in zip(engine_inputs, expected_engine_inputs): + assert numpy.array_equal(x, y) + + @pytest.mark.parametrize( "input_ids, attention_mask, expected_causal_mask", [ From 51c4ee68523978aa84eb66f39925bd24bdf6a617 Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 17 Nov 2023 14:52:48 +0000 Subject: [PATCH 29/57] pipeline runs, but incorrectly --- .../transformers/utils/token_generator.py | 10 +-- .../v2/text_generation/join_output.py | 3 + .../v2/text_generation/nl_engine_operator.py | 45 +++++++++- src/deepsparse/v2/text_generation/pipeline.py | 85 +++++++++++++++++-- .../v2/text_generation/prep_for_generation.py | 1 + .../v2/unit/text_generation/conftest.py | 4 +- tests/testdata/gsm8k-v0-greedy_until | 1 + tests/testdata/gsm8k-v0-res.json | 1 + 8 files changed, 135 insertions(+), 15 deletions(-) create mode 100644 tests/testdata/gsm8k-v0-greedy_until create mode 100644 tests/testdata/gsm8k-v0-res.json diff --git a/src/deepsparse/transformers/utils/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py index 5fa82b7bc4..76f922de11 100644 --- a/src/deepsparse/transformers/utils/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -77,16 +77,16 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: :param logits: the logits from the model with shape (vocab_size,) :return: the sampled token """ - if self.top_k: - logits = self.apply_top_k(logits) - if self.top_p: - logits = self.apply_top_p(logits) - if self.deterministic: token = numpy.argmax(logits) self.tokens.append(token) return token + if self.top_k: + logits = self.apply_top_k(logits) + if self.top_p: + logits = self.apply_top_p(logits) + if self.sampling_temperature != 1.0: logits /= self.sampling_temperature diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py index 8a6c77a2f1..29c086d713 100644 --- a/src/deepsparse/v2/text_generation/join_output.py +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -33,6 +33,9 @@ def __init__(self, tokenizer): self.tokenizer = tokenizer def run(self, inp: List[CompileGenerationsOutput], **kwargs): + + if not isinstance(inp, list): + inp = [[inp]] batch_outputs = [x for x in inp[0]] generated_tokens = [x.generated_tokens for x in batch_outputs] generated_logits = [x.generated_logits for x in batch_outputs] diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 7549f986d9..9c33cb1f93 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -18,6 +18,7 @@ from pydantic import BaseModel, Field +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, overwrite_onnx_model_inputs_for_kv_cache_models, @@ -29,7 +30,12 @@ ) -__all__ = ["NLEngineOperator", "NlEngineInput"] +__all__ = [ + "NlEngineOperator", + 
"NlEngineOperatorNoCache", + "NlEngineInputNoCache", + "NlEngineInput", +] class NlEngineInput(BaseModel): @@ -39,7 +45,12 @@ class NlEngineInput(BaseModel): in_generation: bool = Field(description="in_generation", default=None) -class NLEngineOperator(EngineOperator): +class NlEngineInputNoCache(BaseModel): + input_ids: Any + attention_mask: Any + + +class NlEngineOperator(EngineOperator): """ Operator for the NL Decoder Engine. This Operator inherits from the EngineOperator. @@ -195,3 +206,33 @@ def output_names(self) -> List[str]: :return: The output names for the onnx model """ return self.engine.output_names + + +class NlEngineOperatorNoCache(EngineOperator): + + input_schema = NlEngineInputNoCache + output_schema = None + + def __init__(self, sequence_length, **kwargs): + model_path, *_ = overwrite_transformer_onnx_model_inputs( + path=kwargs.get("model_path"), + max_length=sequence_length, + batch_size=kwargs.get("batch_size", 1), + ) + super().__init__(**kwargs) + + def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: + engine_inputs = [inp.input_ids, inp.attention_mask] + logits = ( + super() + .run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs) + .get("engine_outputs") + ) + return { + "logits": logits, + "logits_shape": None, + "deterministic": None, + "kv_cache": None, + "tokens": None, + "sampling_temperature": None, + }, {"prompt_logits": logits} diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 4695220819..0f1c3cf559 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -17,8 +17,9 @@ from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs +from deepsparse.utils.onnx import default_cached_outputs from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.routers import GraphRouter +from deepsparse.v2.routers import GraphRouter, LinearRouter from deepsparse.v2.schedulers import OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, @@ -29,7 +30,8 @@ JoinOutput, KVCacheCreator, MultiEnginePrefill, - NLEngineOperator, + NlEngineOperator, + NlEngineOperatorNoCache, PrepareforPrefill, PrepareGeneration, ProcessInputsTextGeneration, @@ -39,6 +41,79 @@ from deepsparse.v2.utils import PipelineState +class TextGenerationPipelineNoCache(Pipeline): + def __init__( + self, + model_path: str, + sequence_length: int = 1024, + engine_kwargs: Optional[Dict] = None, + onnx_model_name: Optional[str] = None, + generation_config=None, # TODO: Typing here + **kwargs, + ): + + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, + sequence_length, + onnx_model_name=onnx_model_name, + engine_kwargs=engine_kwargs, + ) + self.verify_no_kv_cache_present() + + token_generator = TokenGeneratorOperator() + + ops = [ + ProcessInputsTextGeneration( + generation_config=process_generation_config(generation_config), + sequence_length=sequence_length, + tokenizer=self.tokenizer, + ), + NlEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), + PrepareGeneration( + sequence_length=sequence_length, + prompt_sequence_length=1, + token_generator=token_generator, + ), + GenerateNewTokenOperator(tokenizer=self.tokenizer, force_max_tokens=True), + CompileGeneratedTokens(), + CompileGenerations(), + 
JoinOutput(tokenizer=self.tokenizer), + ProcessOutputs(tokenizer=self.tokenizer), + ] + router = LinearRouter(end_route=len(ops)) + scheduler = [OperatorScheduler()] + super().__init__( + ops=ops, + router=router, + schedulers=scheduler, + ) + + def run(self, *args, **kwargs): + # we need to set the fixed_sequences_length flag to True + # for the non-kv cache pipeline + kwargs.update(dict(fixed_sequences_length=True)) + return super().run(*args, **kwargs) + + def verify_no_kv_cache_present(self) -> bool: + """ + Verifies that the ONNX model does not have + KV cache inputs/outputs present. + :return: True if compatible, False otherwise + """ + is_kv_cache_present = any(default_cached_outputs(self.model_path)) + if is_kv_cache_present: + raise ValueError( + f"The model: {self.model_path} has KV cache inputs/outputs present. " + "Please use the TextGenerationPipeline instead." + ) + return not is_kv_cache_present + + class TextGenerationPipeline(Pipeline): def __init__( self, @@ -65,14 +140,14 @@ def __init__( if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False - single_engine_operator = NLEngineOperator( + single_engine_operator = NlEngineOperator( sequence_length=sequence_length, internal_kv_cache=internal_kv_cache, input_ids_length=1, **engine_kwargs, ) - multi_engine_operator = NLEngineOperator( + multi_engine_operator = NlEngineOperator( sequence_length=sequence_length, internal_kv_cache=internal_kv_cache, input_ids_length=prompt_sequence_length, @@ -194,5 +269,3 @@ def expand_inputs(self, items, batch_size): def condense_inputs(self, *args, **kwargs): return args[0], kwargs - - \ No newline at end of file diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 0ea4a06a02..9b63946c16 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -91,6 +91,7 @@ def run( "token_generator": token_generator, } output = { + "logits": prompt_logits, "tokens": token_generator.tokens, "kv_cache": kv_cache, "in_generation": True, diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py index 5d8483e5f6..7524db454a 100644 --- a/tests/deepsparse/v2/unit/text_generation/conftest.py +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -25,7 +25,7 @@ from deepsparse.v2 import InferenceState, PipelineState from deepsparse.v2.text_generation import ( GenerationDefaults, - NLEngineOperator, + NlEngineOperator, TokenGeneratorOperator, ) @@ -61,7 +61,7 @@ def single_token_engine_no_internal_cache(text_generation_attributes, model_attr seq_length, _ = text_generation_attributes _, model_path = model_attributes - nl_engine_operator = NLEngineOperator( + nl_engine_operator = NlEngineOperator( sequence_length=seq_length, input_ids_length=1, model_path=model_path ) return nl_engine_operator diff --git a/tests/testdata/gsm8k-v0-greedy_until b/tests/testdata/gsm8k-v0-greedy_until new file mode 100644 index 0000000000..09a6a1eadb --- /dev/null +++ b/tests/testdata/gsm8k-v0-greedy_until @@ -0,0 +1 @@ +3b4bf5c7d1504339aa06bcb50212dba05ff761d30de6faf720fdc818b16316ad \ No newline at end of file diff --git a/tests/testdata/gsm8k-v0-res.json b/tests/testdata/gsm8k-v0-res.json new file mode 100644 index 0000000000..fb6514a0e7 --- /dev/null +++ b/tests/testdata/gsm8k-v0-res.json @@ -0,0 +1 @@ +{"results": {"gsm8k": {"acc": 0.0, "acc_stderr": 0.0}}, 
"versions": {"gsm8k": 0}} \ No newline at end of file From fa96efb7105962607c9b27dd0f24e2e89314a973 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 20 Nov 2023 13:26:32 +0000 Subject: [PATCH 30/57] it works for a single sequence --- .../v2/text_generation/nl_engine_operator.py | 15 ++++++--------- src/deepsparse/v2/text_generation/pipeline.py | 4 ++-- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 9c33cb1f93..cb27f69cc0 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -16,9 +16,9 @@ import os from typing import Any, List, Tuple +import numpy from pydantic import BaseModel, Field -from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, overwrite_onnx_model_inputs_for_kv_cache_models, @@ -213,12 +213,7 @@ class NlEngineOperatorNoCache(EngineOperator): input_schema = NlEngineInputNoCache output_schema = None - def __init__(self, sequence_length, **kwargs): - model_path, *_ = overwrite_transformer_onnx_model_inputs( - path=kwargs.get("model_path"), - max_length=sequence_length, - batch_size=kwargs.get("batch_size", 1), - ) + def __init__(self, **kwargs): super().__init__(**kwargs) def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: @@ -228,11 +223,13 @@ def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: .run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs) .get("engine_outputs") ) + + logits = numpy.compress(inp.attention_mask[0], logits[0], axis=1) return { - "logits": logits, + "logits": [logits], "logits_shape": None, "deterministic": None, "kv_cache": None, "tokens": None, "sampling_temperature": None, - }, {"prompt_logits": logits} + }, {"prompt_logits": [logits]} diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 0f1c3cf559..d36dabab5d 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -60,6 +60,7 @@ def __init__( ) = setup_transformers_pipeline( model_path, sequence_length, + tokenizer_padding_side="right", onnx_model_name=onnx_model_name, engine_kwargs=engine_kwargs, ) @@ -73,14 +74,13 @@ def __init__( sequence_length=sequence_length, tokenizer=self.tokenizer, ), - NlEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), + NlEngineOperatorNoCache(**engine_kwargs), PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=1, token_generator=token_generator, ), GenerateNewTokenOperator(tokenizer=self.tokenizer, force_max_tokens=True), - CompileGeneratedTokens(), CompileGenerations(), JoinOutput(tokenizer=self.tokenizer), ProcessOutputs(tokenizer=self.tokenizer), From e41ddf891662cea1ddfa1e6af08a90a4dfddf918 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 20 Nov 2023 14:06:07 +0000 Subject: [PATCH 31/57] cleanup. 
now lets figure out how to run multiple sequences --- .../v2/text_generation/join_output.py | 6 +++++- .../v2/text_generation/nl_engine_operator.py | 18 ++++++++++-------- tests/testdata/gsm8k-v0-greedy_until | 1 - tests/testdata/gsm8k-v0-res.json | 1 - 4 files changed, 15 insertions(+), 11 deletions(-) delete mode 100644 tests/testdata/gsm8k-v0-greedy_until delete mode 100644 tests/testdata/gsm8k-v0-res.json diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py index 29c086d713..5813702f46 100644 --- a/src/deepsparse/v2/text_generation/join_output.py +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -32,10 +32,14 @@ class JoinOutput(Operator): def __init__(self, tokenizer): self.tokenizer = tokenizer - def run(self, inp: List[CompileGenerationsOutput], **kwargs): + def run(self, inp: List[List[CompileGenerationsOutput]], **kwargs): if not isinstance(inp, list): + # when running without KV Cache + # this will be a single + # CompileGenerationsOutput for now inp = [[inp]] + batch_outputs = [x for x in inp[0]] generated_tokens = [x.generated_tokens for x in batch_outputs] generated_logits = [x.generated_logits for x in batch_outputs] diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index cb27f69cc0..fe28bdfe2c 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -209,6 +209,11 @@ def output_names(self) -> List[str]: class NlEngineOperatorNoCache(EngineOperator): + """ + Operator the Natural Language Engine, that operates without + KV Cache. This means that this operator merely maps input_ids + and attention_mask to logits + """ input_schema = NlEngineInputNoCache output_schema = None @@ -224,12 +229,9 @@ def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: .get("engine_outputs") ) + # By default, the engine outputs logits for all tokens in the sequence. + # Let's filter out the logits for the padding tokens. 
logits = numpy.compress(inp.attention_mask[0], logits[0], axis=1) - return { - "logits": [logits], - "logits_shape": None, - "deterministic": None, - "kv_cache": None, - "tokens": None, - "sampling_temperature": None, - }, {"prompt_logits": [logits]} + return {"logits": [logits], "kv_cache": None, "tokens": None}, { + "prompt_logits": [logits] + } diff --git a/tests/testdata/gsm8k-v0-greedy_until b/tests/testdata/gsm8k-v0-greedy_until deleted file mode 100644 index 09a6a1eadb..0000000000 --- a/tests/testdata/gsm8k-v0-greedy_until +++ /dev/null @@ -1 +0,0 @@ -3b4bf5c7d1504339aa06bcb50212dba05ff761d30de6faf720fdc818b16316ad \ No newline at end of file diff --git a/tests/testdata/gsm8k-v0-res.json b/tests/testdata/gsm8k-v0-res.json deleted file mode 100644 index fb6514a0e7..0000000000 --- a/tests/testdata/gsm8k-v0-res.json +++ /dev/null @@ -1 +0,0 @@ -{"results": {"gsm8k": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"gsm8k": 0}} \ No newline at end of file From b80a417a3fd5f46035ee3a73dfc4cffae074d3c6 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 20 Nov 2023 21:55:11 +0100 Subject: [PATCH 32/57] [Pipeline Refactor][Text-Generation] Refactor `transformers` helpers functions (#1394) * add split/join functionality * update router to include split/join in parent class, refactor pipeline code to remove repeat code, update map function * process multiple generations * initial commit * fix error * unit testing for text generation operators * additional changes * unit testing completion * remove debug * fix * add todo * more clean-up * fix test * add docstrings/comments * break out tests to individual unit test files; add conftest and make scope of fixtures module to help with speed * Delete tests/deepsparse/v2/unit/text_generation/test_msic.py * pipeline runs, but incorrectly * Revert "pipeline runs, but incorrectly" This reverts commit 51c4ee68523978aa84eb66f39925bd24bdf6a617. 
* PR review comments --------- Co-authored-by: Dipika Sikka --- src/deepsparse/transformers/helpers.py | 114 ++++++++++++++++-- .../transformers/pipelines/pipeline.py | 38 ++---- src/deepsparse/utils/onnx.py | 8 +- src/deepsparse/v2/text_generation/pipeline.py | 68 ++--------- 4 files changed, 130 insertions(+), 98 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d7acc71a99..7273b61406 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -17,24 +17,26 @@ """ +import logging import os import re from pathlib import Path from tempfile import NamedTemporaryFile -from typing import List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy import onnx +import transformers from onnx import ModelProto from deepsparse.log import get_main_logger -from deepsparse.utils.onnx import _MODEL_DIR_ONNX_NAME, truncate_onnx_model +from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model from sparsezoo import Model from sparsezoo.utils import save_onnx __all__ = [ - "get_deployment_path", + "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", "fix_numpy_types", "get_transformer_layer_init_names", @@ -44,7 +46,94 @@ _LOGGER = get_main_logger() -def get_deployment_path(model_path: str) -> Tuple[str, str]: +def setup_transformers_pipeline( + model_path: str, + sequence_length: int, + tokenizer_padding_side: str = "left", + engine_kwargs: Optional[Dict] = None, + onnx_model_name: Optional[str] = None, +) -> Tuple[ + str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer, Dict[str, Any] +]: + """ + A helper function that sets up the model path, config, tokenizer, + and engine kwargs for a transformers model. + :param model_path: The path to the model to load + :param sequence_length: The sequence length to use for the model + :param tokenizer_padding_side: The side to pad on for the tokenizer, + either "left" or "right" + :param engine_kwargs: The kwargs to pass to the engine + :param onnx_model_name: The name of the onnx model to be loaded. + If not specified, defaults are used (see setup_onnx_file_path) + :return The model path, config, tokenizer, and engine kwargs + """ + model_path, config, tokenizer = setup_onnx_file_path( + model_path, sequence_length, onnx_model_name + ) + + tokenizer.padding_side = tokenizer_padding_side + if not tokenizer.pad_token: + tokenizer.pad_token = tokenizer.eos_token + + engine_kwargs = engine_kwargs or {} + if engine_kwargs.get("model_path"): + raise ValueError( + "The engine kwargs already specify " + f"a model path: {engine_kwargs['model_path']}, " + f"but a model path was also provided: {model_path}. " + "Please only provide one." + ) + engine_kwargs["model_path"] = model_path + return model_path, config, tokenizer, engine_kwargs + + +def setup_onnx_file_path( + model_path: str, + sequence_length: int, + onnx_model_name: Optional[str] = None, + task: Optional[str] = None, +) -> Tuple[str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer]: + """ + Parses ONNX model from the `model_path` provided. It additionally + creates config and tokenizer objects from the `deployment path`, + derived from the `model_path` provided. + :param model_path: path to the model to be parsed + :param sequence_length: maximum sequence length of the model + :param onnx_model_name: optionally, the precise name of the ONNX model + of interest may be specified. 
If not specified, the default ONNX model + name will be used (refer to `get_deployment_path` for details) + :return: file path to the processed ONNX file for the engine to compile + """ + deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name) + + hf_logger = logging.getLogger("transformers") + hf_logger_level = hf_logger.level + hf_logger.setLevel(logging.ERROR) + + config = transformers.PretrainedConfig.from_pretrained( + deployment_path, finetuning_task=task + ) + hf_logger.setLevel(hf_logger_level) + + trust_remote_code = False + tokenizer = transformers.AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=trust_remote_code, + model_max_length=sequence_length, + ) + + if not config or not tokenizer: + raise RuntimeError( + "Invalid config or tokenizer provided. Please provide " + "paths to the files or ensure they exist in the `model_path` provided. " + "See `tokenizer` and `config` arguments for details." + ) + return onnx_path, config, tokenizer + + +def get_deployment_path( + model_path: str, onnx_model_name: Optional[str] = None +) -> Tuple[str, str]: """ Returns the path to the deployment directory for the given model path and the path to the mandatory @@ -53,9 +142,12 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: for running the transformers model in the deepsparse pipeline :param model_path: path to model directory, sparsezoo stub, or ONNX file + :param onnx_model_name: name of the ONNX file to look for in the deployment + directory. Defaults to MODEL_ONNX_NAME :return: path to the deployment directory and path to the ONNX file inside the deployment directory """ + onnx_model_name = onnx_model_name or MODEL_ONNX_NAME if os.path.isfile(model_path): # return the parent directory of the ONNX file return os.path.dirname(model_path), model_path @@ -63,26 +155,26 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if _MODEL_DIR_ONNX_NAME not in model_files: + if onnx_model_name not in model_files: raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" + f"{os.path.join(model_path, onnx_model_name)}" ) - return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME) + return model_path, os.path.join(model_path, onnx_model_name) elif model_path.startswith("zoo:"): zoo_model = Model(model_path) deployment_path = zoo_model.deployment_directory_path - return deployment_path, os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + return deployment_path, os.path.join(deployment_path, onnx_model_name) elif model_path.startswith("hf:"): from huggingface_hub import snapshot_download deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + onnx_path = os.path.join(deployment_path, onnx_model_name) if not os.path.isfile(onnx_path): raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{deployment_path}. 
Be sure that an export of the model is written to " f"{onnx_path}" ) diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py index 065a26ce71..ac54c4a3db 100644 --- a/src/deepsparse/transformers/pipelines/pipeline.py +++ b/src/deepsparse/transformers/pipelines/pipeline.py @@ -16,19 +16,18 @@ Base Pipeline class for transformers inference pipeline """ -import logging + import warnings from pathlib import Path from typing import Any, Dict, List, Mapping, Optional, Union import numpy import transformers -from transformers.models.auto import AutoTokenizer from deepsparse import Bucketable, Pipeline +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.transformers.helpers import ( - get_deployment_path, - overwrite_transformer_onnx_model_inputs, + setup_onnx_file_path as setup_onnx_file_path_v2, ) @@ -124,24 +123,15 @@ def setup_onnx_file_path(self) -> str: :return: file path to the processed ONNX file for the engine to compile """ - deployment_path, onnx_path = get_deployment_path(self.model_path) - - # temporarily set transformers logger to ERROR to avoid - # printing misleading warnings - hf_logger = logging.getLogger("transformers") - hf_logger_level = hf_logger.level - hf_logger.setLevel(logging.ERROR) - self.config = transformers.PretrainedConfig.from_pretrained( - deployment_path, - finetuning_task=self.task if hasattr(self, "task") else None, - ) - hf_logger.setLevel(hf_logger_level) - - self.tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=self._trust_remote_code, - model_max_length=self.sequence_length, + # we will be soon retiring V1 pipelines. This is why I am deciding + # to reuse the functions from V2 pipelines in the (soon) legacy pipelines + onnx_path, config, tokenizer = setup_onnx_file_path_v2( + model_path=self.model_path, + sequence_length=self.sequence_length, + task=self.task if hasattr(self, "task") else None, ) + self.config = config + self.tokenizer = tokenizer if not self._delay_overwriting_inputs: # overwrite onnx graph to given required input shape @@ -153,12 +143,6 @@ def setup_onnx_file_path(self) -> str: onnx_path, max_length=self.sequence_length ) - if not self.config or not self.tokenizer: - raise RuntimeError( - "Invalid config or tokenizer provided. Please provide " - "paths to the files or ensure they exist in the `model_path` provided. " - "See `tokenizer` and `config` arguments for details." 
- ) return onnx_path def tokens_to_engine_input( diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index e69bf67321..f518620c2f 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -56,12 +56,12 @@ "has_model_kv_cache", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", - "_MODEL_DIR_ONNX_NAME", + "MODEL_ONNX_NAME", ] _LOGGER = logging.getLogger(__name__) -_MODEL_DIR_ONNX_NAME = "model.onnx" +MODEL_ONNX_NAME = "model.onnx" CACHE_INPUT_PREFIX = "past_key_values" CACHE_OUTPUT_PREFIX = "present" @@ -132,7 +132,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model.deployment_directory_path # default to the main onnx file for the model - model = model.deployment.get_file(_MODEL_DIR_ONNX_NAME).path + model = model.deployment.get_file(MODEL_ONNX_NAME).path elif File is not object and isinstance(model, File): # get the downloaded_path -- will auto download if not on local system @@ -146,7 +146,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model_path = Path(model) if model_path.is_dir(): - return str(model_path / _MODEL_DIR_ONNX_NAME) + return str(model_path / MODEL_ONNX_NAME) return model diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 240da04907..5ab73f7a48 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict +from typing import Dict, Optional +from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs from deepsparse.v2.pipeline import Pipeline @@ -47,23 +48,20 @@ def __init__( internal_kv_cache: bool = True, force_max_tokens: bool = False, generation_config=None, - engine_kwargs: Dict = None, + engine_kwargs: Optional[Dict] = None, ): + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, sequence_length, engine_kwargs=engine_kwargs + ) pipeline_state = PipelineState() pipeline_state_vals = {} - # TODO: The code below will be replaced with a transformers set-up Operator. - self.tokenizer = None - model_path = self.setup_onnx_file_path(model_path, sequence_length) - self.tokenizer.padding_side = "left" - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - if not engine_kwargs: - engine_kwargs = {} - engine_kwargs["model_path"] = model_path - if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False @@ -82,7 +80,7 @@ def __init__( ) # NOTE: Currently using pipeline state. Can swap to simply pass in the - # attributes to the specific Operator that neeed them, as class attributes. + # attributes to the specific Operator that need them, as class attributes. pipeline_state_vals[ "onnx_input_names_no_cache" ] = single_engine_operator.onnx_input_names_no_cache @@ -196,45 +194,3 @@ def expand_inputs(self, items, batch_size): def condense_inputs(self, *args, **kwargs): return args[0], kwargs - - # TODO: Move to be part of a generic transformers set-up Operator. 
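# The removed setup_onnx_file_path method below is what the new setup_transformers_pipeline
# helper (added to transformers/helpers.py earlier in this patch) replaces. A minimal usage
# sketch of that helper follows; the deployment path and engine kwargs are hypothetical:
from deepsparse.transformers.helpers import setup_transformers_pipeline

model_path, config, tokenizer, engine_kwargs = setup_transformers_pipeline(
    model_path="./deployment",           # hypothetical local model directory
    sequence_length=128,
    tokenizer_padding_side="left",
    engine_kwargs={"batch_size": 1},
)
# engine_kwargs now also carries "model_path", so it can be passed straight to the
# engine operator without re-deriving the ONNX file location.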
- def setup_onnx_file_path(self, model_path, sequence_length) -> str: - import logging - - import transformers - from transformers import AutoTokenizer - - from deepsparse.transformers.helpers import get_deployment_path - - """ - Parses ONNX model from the `model_path` provided. It additionally - creates config and tokenizer objects from the `deployment path`, - derived from the `model_path` provided. - - :return: file path to the processed ONNX file for the engine to compile - """ - deployment_path, onnx_path = get_deployment_path(model_path) - - hf_logger = logging.getLogger("transformers") - hf_logger_level = hf_logger.level - hf_logger.setLevel(logging.ERROR) - self.config = transformers.PretrainedConfig.from_pretrained( - deployment_path, - finetuning_task=self.task if hasattr(self, "task") else None, - ) - hf_logger.setLevel(hf_logger_level) - - self._trust_remote_code = False - self.tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=self._trust_remote_code, - model_max_length=sequence_length, - ) - - if not self.config or not self.tokenizer: - raise RuntimeError( - "Invalid config or tokenizer provided. Please provide " - "paths to the files or ensure they exist in the `model_path` provided. " - "See `tokenizer` and `config` arguments for details." - ) - return onnx_path From 1b9238a28e664cef1bb6fc2a57c9193fb3d55ce8 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 20 Nov 2023 21:55:53 +0100 Subject: [PATCH 33/57] [Text Generation][V2] End-to-end tests (#1402) * initial commit * initial commit * its working now * beautification * thank you Dipika <3 * ready to review --- .../transformers/utils/token_generator.py | 11 +- .../v2/text_generation/process_inputs.py | 19 +- .../v2/integration_tests/__init__.py | 13 + .../v2/integration_tests/configs/codegen.yaml | 6 + .../v2/integration_tests/configs/gpt_neo.yaml | 6 + .../v2/integration_tests/configs/opt.yaml | 6 + .../v2/integration_tests/helpers.py | 137 +++++++ .../v2/integration_tests/test_llms.py | 368 ++++++++++++++++++ 8 files changed, 547 insertions(+), 19 deletions(-) create mode 100644 tests/deepsparse/v2/integration_tests/__init__.py create mode 100644 tests/deepsparse/v2/integration_tests/configs/codegen.yaml create mode 100644 tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml create mode 100644 tests/deepsparse/v2/integration_tests/configs/opt.yaml create mode 100644 tests/deepsparse/v2/integration_tests/helpers.py create mode 100644 tests/deepsparse/v2/integration_tests/test_llms.py diff --git a/src/deepsparse/transformers/utils/token_generator.py b/src/deepsparse/transformers/utils/token_generator.py index 5fa82b7bc4..0421da06e2 100644 --- a/src/deepsparse/transformers/utils/token_generator.py +++ b/src/deepsparse/transformers/utils/token_generator.py @@ -77,16 +77,17 @@ def generate(self, logits: numpy.ndarray) -> numpy.ndarray: :param logits: the logits from the model with shape (vocab_size,) :return: the sampled token """ - if self.top_k: - logits = self.apply_top_k(logits) - if self.top_p: - logits = self.apply_top_p(logits) - if self.deterministic: token = numpy.argmax(logits) self.tokens.append(token) return token + if self.top_k: + logits = self.apply_top_k(logits) + + if self.top_p: + logits = self.apply_top_p(logits) + if self.sampling_temperature != 1.0: logits /= self.sampling_temperature diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 
214b8526e3..0f9147f916 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -17,7 +17,10 @@ import transformers -from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.transformers.pipelines.text_generation import ( + GenerationDefaults, + TextGenerationInput, +) from deepsparse.transformers.utils.helpers import ( check_and_return_generation_config, override_config, @@ -26,19 +29,7 @@ from deepsparse.v2.operators import Operator -__all__ = ["ProcessInputsTextGeneration", "GenerationDefaults"] - - -class GenerationDefaults: - num_return_sequences = 1 - max_length = 100 - max_new_tokens = None - output_scores = False - top_k = 0 - top_p = 0.0 - repetition_penalty = 0.0 - do_sample = False - temperature = 1.0 +__all__ = ["ProcessInputsTextGeneration"] class ProcessInputsTextGeneration(Operator): diff --git a/tests/deepsparse/v2/integration_tests/__init__.py b/tests/deepsparse/v2/integration_tests/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
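For context on the token_generator.py change above: the deterministic (greedy) path now returns the argmax before any top-k/top-p filtering or temperature scaling runs. Below is a minimal standalone sketch of that ordering, not the library's actual TokenGenerator; top-p and repetition penalty are omitted and the function name is illustrative.

import numpy

def sample_token(logits, deterministic=True, top_k=0, temperature=1.0):
    # greedy path short-circuits: no filtering or temperature scaling is applied
    if deterministic:
        return int(numpy.argmax(logits))
    if top_k:
        # keep only the k largest logits, mask the rest out
        kth_largest = numpy.sort(logits)[-top_k]
        logits = numpy.where(logits >= kth_largest, logits, -numpy.inf)
    if temperature != 1.0:
        logits = logits / temperature
    probs = numpy.exp(logits - logits.max())
    probs /= probs.sum()
    return int(numpy.random.choice(len(logits), p=probs))

print(sample_token(numpy.array([0.1, 2.0, 0.3])))  # prints 1, the argmax index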
diff --git a/tests/deepsparse/v2/integration_tests/configs/codegen.yaml b/tests/deepsparse/v2/integration_tests/configs/codegen.yaml new file mode 100644 index 0000000000..904358b55f --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/configs/codegen.yaml @@ -0,0 +1,6 @@ +cadence: "nightly" +model_path: "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none" +torch_model_name: "salesforce/codegen-350m-mono" +prompt: "\ndef Fibonacci(n):\n # Check if input is 0 then it will\n # print incorrect input" +precision: 0.0001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml b/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml new file mode 100644 index 0000000000..b422efc831 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml @@ -0,0 +1,6 @@ +cadence: "commit" +model_path: "hf:mgoin/TinyStories-1M-ds" +torch_model_name: "roneneldan/TinyStories-1M" +prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" +precision: 0.001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/configs/opt.yaml b/tests/deepsparse/v2/integration_tests/configs/opt.yaml new file mode 100644 index 0000000000..ff2350dbe7 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/configs/opt.yaml @@ -0,0 +1,6 @@ +cadence: "nightly" +model_path: "zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/opt_pretrain/base-none" +torch_model_name: "facebook/opt-1.3b" +prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" +precision: 0.0001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/helpers.py b/tests/deepsparse/v2/integration_tests/helpers.py new file mode 100644 index 0000000000..8d7f3d58d2 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/helpers.py @@ -0,0 +1,137 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from typing import Any, Dict, List, Tuple, Union + +import numpy +import yaml +from transformers import AutoModelForCausalLM, AutoTokenizer + +import pytest + + +class TorchGroundTruthSource: + """ + An object that generates ground truth logits and + cache states from a prompt. 
This object can + generate tokens in an autoregressive manner, and thus + will output: + - prompt logits, + - generated logits, + - prompt cache state, + - generated sequence + """ + + def __init__(self, num_tokens_to_generate: int, model_name: str): + + self.model = AutoModelForCausalLM.from_pretrained(model_name) + self.tokenizer = self._create_tokenizer(model_name) + + self.num_tokens_to_generate = num_tokens_to_generate + + def tokenize(self, prompt: str): + return self.tokenizer(prompt, return_tensors="pt") + + def __call__( + self, prompt: str + ) -> Tuple[numpy.ndarray, numpy.ndarray, List[numpy.ndarray], str]: + # afaik it is not possible to get 'past_key_values' from + # the generate method, so we have to run the model twice + out = self.model.generate( + self.tokenize(prompt).input_ids, + max_new_tokens=self.num_tokens_to_generate, + output_scores=True, + return_dict_in_generate=True, + use_cache=True, + ) + generated_text = self.tokenizer.decode( + out.sequences[0], skip_special_tokens=True + ) + generated_logits = numpy.concatenate( + [[score.numpy() for score in out.scores]] + ).transpose( + 1, 0, 2 + ) # (1, num_tokens_to_generate, vocab_size) + + out = self.model(**self.tokenize(prompt)) + prompt_logits = out.logits.detach().numpy()[ + :, :-1, : + ] # (1, prompt_length, vocab_size) + prompt_cache = [ + entry.detach().numpy() + for key_value_tuple in out.past_key_values + for entry in key_value_tuple + ] # List[(1, num_heads, past_length, head_dim)] + + return generated_logits, prompt_logits, prompt_cache, generated_text + + @staticmethod + def _create_tokenizer(model_name): + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.padding_side = "left" + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer + + +def parse_params(configs_directory: str) -> List[Dict[str, Any]]: + # parses the config file provided + assert os.path.isdir( + configs_directory + ), f"Config_directory {configs_directory} is not a directory" + + config_dicts = [] + for file in os.listdir(configs_directory): + if file.endswith(".yaml"): + config_path = os.path.join(configs_directory, file) + # reads the yaml file + with open(config_path, "r") as f: + config = yaml.safe_load(f) + + cadence = os.environ.get("CADENCE", "commit") + expected_cadence = config["cadence"] + + if not isinstance(expected_cadence, list): + expected_cadence = [expected_cadence] + if cadence in expected_cadence: + config_dicts.append(config) + else: + logging.info( + f"Skipping testing model: {config['model_path']} " + f"for cadence: {config['cadence']}" + ) + else: + raise FileNotFoundError( + f"Could not find a yaml file in {configs_directory}" + ) + return config_dicts + + +def validate_internal_kv_cache( + internal_kv_cache, available_kv_cache_types: Union[str, List[str]] +) -> bool: + if internal_kv_cache and True not in available_kv_cache_types: + pytest.skip( + "The tests for running the pipeline with " + "internal kv cache management are disabled." + ) + if not internal_kv_cache and False not in available_kv_cache_types: + pytest.skip( + "The tests for running the pipeline with " + "external kv cache management are disabled." + ) + return internal_kv_cache diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py new file mode 100644 index 0000000000..34a8f7a258 --- /dev/null +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -0,0 +1,368 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This test suite consumes config files to test the text generation pipeline +for various scenarios. + +A sample config file is a yaml that r_equires the following fields: + cadence: The cadence of the tests. The available options are: + "nightly", "weekly" and "commit". By default, only + the tests that have cadence "commit" will be run + in GHA. This parameter can be both a string or a + list of strings. + model_path: The path to the model to be tested + (sparsezoo stub/hf model path/local_path) + torch_model_name: The name of the torch model + (to generate ground truth info) + prompt: The prompt to use for testing + precision: The precision for the logits/kv_cache entries + comparison + internal_kv_cache: The type of the internal KV cache + management. Is a list that can contain the following + values: [True], [False] or [True, False] (to test both + external and internal KV cache management) +""" +import os +from typing import List, Tuple + +import numpy + +import pytest +from deepsparse.transformers.pipelines.text_generation import TextGenerationOutput +from deepsparse.v2.pipeline import Pipeline +from deepsparse.v2.text_generation import TextGenerationPipeline +from sparsezoo import Model +from tests.deepsparse.transformers.pipelines.integration_tests.helpers import ( + TorchGroundTruthSource, + parse_params, + validate_internal_kv_cache, +) + + +CONFIGS_DIRECTORY = "tests/deepsparse/v2/integration_tests/configs" + + +@pytest.fixture() +def max_new_tokens() -> int: + return 64 + + +@pytest.mark.parametrize("params_dict", parse_params(CONFIGS_DIRECTORY)) +@pytest.mark.parametrize( + "internal_kv_cache", + [True, False], +) +class TestsIntegrationLLMsPipelines: + """ + This test suite is meant to test the main scenarios of + the text generation pipeline. + """ + + def get_pipeline(self, **kwargs) -> Pipeline: + """ + If no kwargs provided, returns the cached "default" + pipeline that is used for most of the tests. 
+ Otherwise, returns a pipeline with the given kwargs + (the default pipeline kwargs are updated with the + user-provided kwargs) + + :param kwargs: the optional kwargs to be used to + create the pipeline (if not provided, the cached + "default" pipeline is returned) + :return: the appropriate pipeline + """ + if not kwargs: + if self.default_pipeline is None: + self.default_pipeline = TextGenerationPipeline( + **self.default_pipeline_kwargs + ) + return self.default_pipeline + + # return a pipeline with the updated default kwargs + updated_kwargs = self.default_pipeline_kwargs.copy() + updated_kwargs.update(kwargs) + return TextGenerationPipeline(**updated_kwargs) + + @pytest.fixture + def setup(self, params_dict, max_new_tokens, internal_kv_cache): + # set the params_dict as the class attributes + for key, value in params_dict.items(): + setattr(self, key, value) + # check whether the specified cache management type + # is supported for testing (skip if not supported) + self.internal_kv_cache: bool = validate_internal_kv_cache( + internal_kv_cache, self.internal_kv_cache + ) + # create torch ground source + torch_source = TorchGroundTruthSource( + num_tokens_to_generate=max_new_tokens + 1, + model_name=self.torch_model_name, + ) + # create torch ground truth + self.torch_ground_truth = torch_source(self.prompt) + + # specify the default pipeline kwargs + self.default_pipeline_kwargs = dict( + model_path=self.model_path, + internal_kv_cache=self.internal_kv_cache, + force_max_tokens=True, + ) + self.default_pipeline = None + self.max_new_tokens = max_new_tokens + + def test_ort_single_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by single-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + + pipeline = self.get_pipeline( + prompt_sequence_length=1, + engine_type="onnxruntime", + ) + output = pipeline( + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, + output_scores=True, + ), + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + ) + + def test_ort_multi_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + pipeline = self.get_pipeline( + engine_type="onnxruntime", + ) + output = pipeline( + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, output_scores=True + ), + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + ) + + def test_deepsparse_single_token_prefill(self, setup): + # Test the pipeline that uses deepsparse engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by single-token engine + # 2. The KV Cache is never filled up + # 3. 
KV Cache managed externally or internally + + pipeline = self.get_pipeline( + prompt_sequence_length=1, + ) + + output = pipeline( + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, output_scores=True + ), + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + # disable kv cache validation if using internal kv cache + run_kv_cache_validation=not self.internal_kv_cache, + ) + + def test_deepsparse_multi_token_prefill(self, setup): + # Test the pipeline that uses deepsparse engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed internally or externally + + pipeline = self.get_pipeline() + output = pipeline( + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, output_scores=True + ), + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + # disable kv cache validation if using internal kv cache + run_kv_cache_validation=not self.internal_kv_cache, + ) + + @pytest.mark.skip( + "This test is skipped because we do " + "not have support for non-kv-cache models yet" + ) + def test_inference_no_kv_cache_deepsparse(self, setup): + self._test_inference_no_kv_cache(engine_type="deepsparse") + + @pytest.mark.skip( + "This test is skipped because we do " + "not have support for non-kv-cache models yet" + ) + def test_inference_no_kv_cache_ort(self, setup): + self._test_inference_no_kv_cache(engine_type="onnxruntime") + + def _test_inference_no_kv_cache(self, engine_type): + model_path_no_cache = self._get_model_path_no_cache() + pipeline = self.get_pipeline( + model_path=model_path_no_cache, engine_type=engine_type + ) + assert not pipeline.cache_support_enabled, ( + "This pipeline test inference using non-kv cache " + "model and thus should not support kv cache" + ) + + output = pipeline( + self.prompt, max_length=1, output_scores=True, include_prompt_logits=True + ) + prompt_length = self.torch_ground_truth[1].shape[1] + # prompt logits + one logit for the new generated token + logits = output.generations[0].score[-(prompt_length + 1) :, :] + # compute ground truth logits analogously + generated_logits, prompt_logits, *_ = self.torch_ground_truth + logits_gt = numpy.concatenate( + [prompt_logits[0], generated_logits[0, :1, :]], axis=0 + ) + assert numpy.allclose(logits, logits_gt, atol=self.precision) + + def _test_output( + self, + output: TextGenerationOutput, + torch_ground_truth: Tuple[numpy.ndarray, ...], + run_kv_cache_validation: bool = True, + ): + + ( + generated_logits, + prompt_logits, + prompt_kv_cache, + generated_text, + ) = torch_ground_truth + + # concatenate target prompt_logits and generated_logits + target_logits = numpy.concatenate([prompt_logits, generated_logits], axis=1) + # get the logits of the generated sequence + score = output.generations[0].score + + # we expect the logits to be exactly the same + # as the target logits; the generated sequence should + # also be the same as the target sequence + assert numpy.allclose(score, target_logits[0], atol=self.precision) + assert self.prompt + output.generations[0].text == generated_text + + if hasattr(output, "kv_cache_state") and run_kv_cache_validation: + # (if applicable) the kv cache should be the same as the + # target kv cache + expected_cache = list(output.kv_cache_state[0].values()) + 
total_num_processed_tokens = output.total_num_processed_tokens[0] + self._test_kv_cache_state( + expected_cache=expected_cache, + target_cache=prompt_kv_cache, + total_num_processed_tokens=total_num_processed_tokens, + ) + + def _test_kv_cache_state( + self, + expected_cache: List[numpy.ndarray], + target_cache: List[numpy.ndarray], + total_num_processed_tokens: int, + ): + for x, y in zip(expected_cache, target_cache): + start_index = total_num_processed_tokens + end_index = total_num_processed_tokens - y.shape[2] + # x is (in general) composed of three arrays: + # - padding cache entries (from 0 to -start_index) + # - prompt cache entries (from -start_index to -end_index) + # - generated cache entries (from -end_index to -1) + # as target_cache only pertains to prompt cache entries, we need to + # compare only the prompt cache entries in x with y + assert numpy.allclose( + x[:, :, -start_index:-end_index, :], y, atol=self.precision + ) + + def _get_model_path_no_cache(self): + if not self.model_path.startswith("zoo:"): + pytest.skip("For this test, for now only the zoo model is supported") + model = Model(self.model_path) + # fetch the necessary file names for pipeline creation + required_file_names = [ + os.path.basename(file.name) for file in model.deployment.files + ] + training_directory = model.training + onnx_model_name_no_cache = [ + os.path.basename(file.name) + for file in model.training.files + if file.name.endswith(".onnx") + ][0] + + # check if 'training' exists, + # if not, download the files + if "training" not in os.listdir(model._path): + for filename in required_file_names: + # download the files to a training directory + if filename.endswith(".data"): + # data files are typically stored in a deployment directory + # download them to training + file = model.deployment.get_file(filename) + assert ( + file is not None + ), f"Unable to find file {filename} in model {model}" + file.name = file.name.replace("deployment", "training") + file.download() + continue + + if filename.endswith(".onnx"): + # instead of `model.onnx` the onnx_model_name_no_cache + # should be downloaded + filename = filename.replace("model.onnx", onnx_model_name_no_cache) + + file = training_directory.get_file(filename) + assert ( + file is not None + ), f"Unable to find file {filename} in model {model}" + file.download() + # rename the model file to `model.onnx` + os.rename( + os.path.join(training_directory.path, onnx_model_name_no_cache), + os.path.join(training_directory.path, "model.onnx"), + ) + return training_directory._path From 9b441f5d314ce0fde92ec3dd9181bf2db3928b40 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 21 Nov 2023 13:08:58 +0000 Subject: [PATCH 34/57] integration tests pass --- .../v2/text_generation/join_output.py | 7 +- .../v2/text_generation/nl_engine_operator.py | 8 +- src/deepsparse/v2/text_generation/pipeline.py | 2 +- .../v2/integration_tests/configs/codegen.yaml | 1 + .../v2/integration_tests/configs/gpt_neo.yaml | 1 + .../v2/integration_tests/configs/opt.yaml | 1 + .../v2/integration_tests/test_llms.py | 100 +++++------------- 7 files changed, 39 insertions(+), 81 deletions(-) diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py index 5813702f46..56d9ac47b1 100644 --- a/src/deepsparse/v2/text_generation/join_output.py +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
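A worked example of the cache-slice comparison in `_test_kv_cache_state` above (all sizes are made up for illustration): the DeepSparse cache is laid out as padding | prompt | generated along the sequence axis, so the negative-index slice isolates just the prompt entries that the torch ground truth contains.

import numpy

heads, dim, capacity = 2, 4, 16
prompt_len, generated = 7, 3
total_num_processed_tokens = prompt_len + generated    # 10

x = numpy.zeros((1, heads, capacity, dim))    # deepsparse cache: padding | prompt | generated
y = numpy.zeros((1, heads, prompt_len, dim))  # torch ground truth: prompt entries only

start_index = total_num_processed_tokens               # 10
end_index = total_num_processed_tokens - y.shape[2]    # 3
assert x[:, :, -start_index:-end_index, :].shape == y.shape  # the 7 prompt slots line up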
-from typing import List +from typing import Dict, List, Tuple import numpy @@ -32,9 +32,8 @@ class JoinOutput(Operator): def __init__(self, tokenizer): self.tokenizer = tokenizer - def run(self, inp: List[List[CompileGenerationsOutput]], **kwargs): - - if not isinstance(inp, list): + def run(self, inp: Tuple[List[CompileGenerationsOutput], Dict], **kwargs): + if not isinstance(inp, Tuple): # when running without KV Cache # this will be a single # CompileGenerationsOutput for now diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index fe28bdfe2c..3fa8653ea6 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -19,6 +19,7 @@ import numpy from pydantic import BaseModel, Field +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, overwrite_onnx_model_inputs_for_kv_cache_models, @@ -218,7 +219,12 @@ class NlEngineOperatorNoCache(EngineOperator): input_schema = NlEngineInputNoCache output_schema = None - def __init__(self, **kwargs): + def __init__(self, sequence_length: int, **kwargs): + overwrite_transformer_onnx_model_inputs( + path=kwargs.get("model_path"), + batch_size=kwargs.get("batch_size", 1), + max_length=sequence_length, + ) super().__init__(**kwargs) def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index d36dabab5d..7c270873fa 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -74,7 +74,7 @@ def __init__( sequence_length=sequence_length, tokenizer=self.tokenizer, ), - NlEngineOperatorNoCache(**engine_kwargs), + NlEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=1, diff --git a/tests/deepsparse/v2/integration_tests/configs/codegen.yaml b/tests/deepsparse/v2/integration_tests/configs/codegen.yaml index 904358b55f..9ec212a6cc 100644 --- a/tests/deepsparse/v2/integration_tests/configs/codegen.yaml +++ b/tests/deepsparse/v2/integration_tests/configs/codegen.yaml @@ -1,6 +1,7 @@ cadence: "nightly" model_path: "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none" torch_model_name: "salesforce/codegen-350m-mono" +model_name_no_kv_cache: None prompt: "\ndef Fibonacci(n):\n # Check if input is 0 then it will\n # print incorrect input" precision: 0.0001 internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml b/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml index b422efc831..71c57e1f97 100644 --- a/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml +++ b/tests/deepsparse/v2/integration_tests/configs/gpt_neo.yaml @@ -1,6 +1,7 @@ cadence: "commit" model_path: "hf:mgoin/TinyStories-1M-ds" torch_model_name: "roneneldan/TinyStories-1M" +model_name_no_kv_cache: "model-orig.onnx" prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" precision: 0.001 internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/configs/opt.yaml b/tests/deepsparse/v2/integration_tests/configs/opt.yaml index ff2350dbe7..216d4c03ca 100644 --- 
a/tests/deepsparse/v2/integration_tests/configs/opt.yaml +++ b/tests/deepsparse/v2/integration_tests/configs/opt.yaml @@ -1,6 +1,7 @@ cadence: "nightly" model_path: "zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/opt_pretrain/base-none" torch_model_name: "facebook/opt-1.3b" +model_name_no_kv_cache: None prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" precision: 0.0001 internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py index 34a8f7a258..321070f276 100644 --- a/tests/deepsparse/v2/integration_tests/test_llms.py +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -23,6 +23,8 @@ list of strings. model_path: The path to the model to be tested (sparsezoo stub/hf model path/local_path) + model_name_no_kv_cache: The name of the onnx model without + the KV cache support torch_model_name: The name of the torch model (to generate ground truth info) prompt: The prompt to use for testing @@ -33,7 +35,6 @@ values: [True], [False] or [True, False] (to test both external and internal KV cache management) """ -import os from typing import List, Tuple import numpy @@ -41,8 +42,10 @@ import pytest from deepsparse.transformers.pipelines.text_generation import TextGenerationOutput from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.text_generation import TextGenerationPipeline -from sparsezoo import Model +from deepsparse.v2.text_generation import ( + TextGenerationPipeline, + TextGenerationPipelineNoCache, +) from tests.deepsparse.transformers.pipelines.integration_tests.helpers import ( TorchGroundTruthSource, parse_params, @@ -69,7 +72,7 @@ class TestsIntegrationLLMsPipelines: the text generation pipeline. """ - def get_pipeline(self, **kwargs) -> Pipeline: + def get_pipeline(self, kv_cache_support=True, **kwargs) -> Pipeline: """ If no kwargs provided, returns the cached "default" pipeline that is used for most of the tests. @@ -82,9 +85,14 @@ def get_pipeline(self, **kwargs) -> Pipeline: "default" pipeline is returned) :return: the appropriate pipeline """ + text_generation_pipeline_class = ( + TextGenerationPipeline + if kv_cache_support + else TextGenerationPipelineNoCache + ) if not kwargs: if self.default_pipeline is None: - self.default_pipeline = TextGenerationPipeline( + self.default_pipeline = text_generation_pipeline_class( **self.default_pipeline_kwargs ) return self.default_pipeline @@ -92,7 +100,7 @@ def get_pipeline(self, **kwargs) -> Pipeline: # return a pipeline with the updated default kwargs updated_kwargs = self.default_pipeline_kwargs.copy() updated_kwargs.update(kwargs) - return TextGenerationPipeline(**updated_kwargs) + return text_generation_pipeline_class(**updated_kwargs) @pytest.fixture def setup(self, params_dict, max_new_tokens, internal_kv_cache): @@ -135,7 +143,7 @@ def test_ort_single_token_prefill(self, setup): pipeline = self.get_pipeline( prompt_sequence_length=1, - engine_type="onnxruntime", + engine_kwargs=dict(engine_type="onnxruntime"), ) output = pipeline( prompt=self.prompt, @@ -163,7 +171,7 @@ def test_ort_multi_token_prefill(self, setup): "Cannot run ORT pipeline with the internal deepsparse cache enabled." 
) pipeline = self.get_pipeline( - engine_type="onnxruntime", + engine_kwargs=dict(engine_type="onnxruntime"), ) output = pipeline( prompt=self.prompt, @@ -227,37 +235,27 @@ def test_deepsparse_multi_token_prefill(self, setup): run_kv_cache_validation=not self.internal_kv_cache, ) - @pytest.mark.skip( - "This test is skipped because we do " - "not have support for non-kv-cache models yet" - ) def test_inference_no_kv_cache_deepsparse(self, setup): self._test_inference_no_kv_cache(engine_type="deepsparse") - @pytest.mark.skip( - "This test is skipped because we do " - "not have support for non-kv-cache models yet" - ) def test_inference_no_kv_cache_ort(self, setup): self._test_inference_no_kv_cache(engine_type="onnxruntime") def _test_inference_no_kv_cache(self, engine_type): - model_path_no_cache = self._get_model_path_no_cache() pipeline = self.get_pipeline( - model_path=model_path_no_cache, engine_type=engine_type - ) - assert not pipeline.cache_support_enabled, ( - "This pipeline test inference using non-kv cache " - "model and thus should not support kv cache" + onnx_model_name=self.model_name_no_kv_cache, + kv_cache_support=False, + engine_kwargs=dict(engine_type=engine_type), ) output = pipeline( - self.prompt, max_length=1, output_scores=True, include_prompt_logits=True + prompt=self.prompt, + include_prompt_logits=True, + generation_kwargs=dict(output_scores=True), ) - prompt_length = self.torch_ground_truth[1].shape[1] - # prompt logits + one logit for the new generated token - logits = output.generations[0].score[-(prompt_length + 1) :, :] - # compute ground truth logits analogously + + logits = output.generations[0].score + # logits -> prompt logits + one logit for the new generated token generated_logits, prompt_logits, *_ = self.torch_ground_truth logits_gt = numpy.concatenate( [prompt_logits[0], generated_logits[0, :1, :]], axis=0 @@ -318,51 +316,3 @@ def _test_kv_cache_state( assert numpy.allclose( x[:, :, -start_index:-end_index, :], y, atol=self.precision ) - - def _get_model_path_no_cache(self): - if not self.model_path.startswith("zoo:"): - pytest.skip("For this test, for now only the zoo model is supported") - model = Model(self.model_path) - # fetch the necessary file names for pipeline creation - required_file_names = [ - os.path.basename(file.name) for file in model.deployment.files - ] - training_directory = model.training - onnx_model_name_no_cache = [ - os.path.basename(file.name) - for file in model.training.files - if file.name.endswith(".onnx") - ][0] - - # check if 'training' exists, - # if not, download the files - if "training" not in os.listdir(model._path): - for filename in required_file_names: - # download the files to a training directory - if filename.endswith(".data"): - # data files are typically stored in a deployment directory - # download them to training - file = model.deployment.get_file(filename) - assert ( - file is not None - ), f"Unable to find file {filename} in model {model}" - file.name = file.name.replace("deployment", "training") - file.download() - continue - - if filename.endswith(".onnx"): - # instead of `model.onnx` the onnx_model_name_no_cache - # should be downloaded - filename = filename.replace("model.onnx", onnx_model_name_no_cache) - - file = training_directory.get_file(filename) - assert ( - file is not None - ), f"Unable to find file {filename} in model {model}" - file.download() - # rename the model file to `model.onnx` - os.rename( - os.path.join(training_directory.path, onnx_model_name_no_cache), - 
os.path.join(training_directory.path, "model.onnx"), - ) - return training_directory._path From c858b1f603622881b330ef942d9bdfaca5bcb846 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 21 Nov 2023 10:39:02 -0500 Subject: [PATCH 35/57] [Pipeline Refactor][Text Generation][Continuous Batching] Integration (#1409) * update split/join * use map * update * run end-to-end * clean-up * fix bug with batch size, introduce SplitRoute dataclass * update tests to use new inputs/outputs * use the normal scheduler for internal kv_cache * add pipeline inpuits * clean-up * change engine type, update docstrings, update override function to be more generic * move subgraph functionality to its own function; clean-up cont batching in text gen pipeline * update linear pathway to also use subgraph execution * rebase fix * fix tests --- .../v2/operators/engine_operator.py | 12 +- src/deepsparse/v2/operators/operator.py | 5 +- src/deepsparse/v2/pipeline.py | 224 +++++++++++------- src/deepsparse/v2/routers/router.py | 2 - .../continuous_batching_scheduler.py | 20 +- .../utils/continuous_batching_executor.py | 2 +- .../compile_generated_tokens.py | 2 +- .../v2/text_generation/compile_logits.py | 14 +- .../v2/text_generation/generate_new_token.py | 12 +- .../v2/text_generation/nl_engine_operator.py | 184 +++++++++++--- src/deepsparse/v2/text_generation/pipeline.py | 48 +++- src/deepsparse/v2/utils/__init__.py | 4 + src/deepsparse/v2/utils/data.py | 39 +++ src/deepsparse/v2/utils/helpers.py | 37 +++ .../v2/integration_tests/test_llms.py | 6 +- .../v2/unit/text_generation/conftest.py | 11 +- .../v2/unit/text_generation/test_misc.py | 13 +- .../text_generation/test_process_inputs.py | 6 +- .../test_single_token_engine.py | 6 +- .../text_generation/test_token_generation.py | 10 +- 20 files changed, 486 insertions(+), 171 deletions(-) create mode 100644 src/deepsparse/v2/utils/data.py create mode 100644 src/deepsparse/v2/utils/helpers.py diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index 9ee8d734c5..630de2d5bd 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -20,7 +20,7 @@ from deepsparse import Context as EngineContext from deepsparse import Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine -from deepsparse.utils import model_to_path +from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs from deepsparse.v2.operators import Operator @@ -29,12 +29,12 @@ SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE] -__all__ = ["EngineOperator"] +__all__ = ["EngineOperator", "EngineOperatorInputs", "EngineOperatorOutputs"] class EngineOperatorInputs(BaseModel): engine_inputs: List = Field(description="engine_inputs") - engine: Optional[Engine] = Field( + engine: Optional[Union[ORTEngine, Engine]] = Field( description="override the engine to run forward pass with", default=None, ) @@ -95,8 +95,8 @@ def __init__( engine_kwargs: Dict = None, ): self.model_path = model_to_path(model_path) - self._batch_size = 1 self.engine_context = engine_context + self._batch_size = 1 if self.engine_context is not None: num_cores = num_cores or self.engine_context.num_cores @@ -131,6 +131,7 @@ def batch_size(self) -> int: """ return self._batch_size + # TODO: maybe add a few args to make this less opaque? 
def create_engine( self, **kwargs, @@ -142,7 +143,8 @@ def create_engine( constructor/compilation :return: inference engine """ - onnx_file_path = self.model_path + + onnx_file_path = kwargs.pop("model_path", self.model_path) engine_args = deepcopy(self._engine_args) engine_args.update(kwargs) engine_type = self._engine_type.lower() diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 5bb0be841a..2923862b12 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,7 +17,7 @@ from pydantic import BaseModel -from deepsparse.v2.utils import InferenceState, PipelineState +from deepsparse.v2.utils import InferenceState __all__ = ["Operator"] @@ -57,7 +57,6 @@ def __call__( self, *args, inference_state: InferenceState, - pipeline_state: PipelineState, **kwargs, ) -> Any: """ @@ -90,13 +89,11 @@ def __call__( run_output = self.run( inference_input, inference_state=inference_state, - pipeline_state=pipeline_state, ) else: run_output = self.run( *args, inference_state=inference_state, - pipeline_state=pipeline_state, **kwargs, ) if self.has_output_schema(): diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index f56680d2b9..78d112a2b3 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -15,13 +15,18 @@ import copy from concurrent.futures import Future -from functools import partial -from typing import Any, Callable, Dict, List, Union +from typing import Any, Dict, List, Union -from deepsparse.v2.operators import Operator +from deepsparse.v2.operators import EngineOperator, Operator from deepsparse.v2.routers import Router -from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup +from deepsparse.v2.schedulers import ( + ContinuousBatchingScheduler, + OperatorScheduler, + SchedulerGroup, +) from deepsparse.v2.utils import InferenceState, PipelineState +from deepsparse.v2.utils.data import SubGraph +from deepsparse.v2.utils.helpers import run_func __all__ = ["Pipeline"] @@ -50,6 +55,7 @@ def __init__( ops: Union[Dict[str, Operator], List[Operator]], router: Router, schedulers: List[OperatorScheduler], + continuous_batching_scheduler: ContinuousBatchingScheduler, pipeline_state: PipelineState = None, ): @@ -57,32 +63,92 @@ def __init__( self.router = router self.schedulers = schedulers self.pipeline_state = pipeline_state + self._continuous_batching_scheduler = continuous_batching_scheduler self.validate() self._scheduler_group = SchedulerGroup(self.schedulers) - def _run_sequential( + def _run_next( self, inp: Any, inference_state: InferenceState, - pipeline_state: PipelineState, - start: str, - end: str, + next_step: str, ): - next_step = start - while next_step != end: - outputs = self._run_next_step( - func=self.ops[next_step], - next_step=next_step, - input=inp, - pipeline_state=pipeline_state, - inference_state=inference_state, + if ( + isinstance(self.ops[next_step], EngineOperator) + and self._continuous_batching_scheduler + ): + func = self._continuous_batching_scheduler.submit + inp = self.ops[next_step].input_schema(**inp) + else: + func = self._scheduler_group.submit + + return run_func( + func=func, + operator=self.ops[next_step], + inp=inp, + pipeline_state=self.pipeline_state, + inference_state=inference_state, + ) + + def _run_sub_graphs( + self, sub_graph_inputs: List[Any], sub_graphs: List[SubGraph] + ) -> List[Any]: + """ + Run a list of sub_graphs asynchronously. 
Polls to identify the sub graph that is + still running but has completed its current step. Schedules the next step + subgraph step. This is repeated until all subgraphs have finished running and + have reached their end step (stored in the Subgraph.end attribute). + + :param sub_graph_inputs: A list of inputs that should be passed to each + subgraph. Each subgraph is given an element of the list as input to its + first node. + :param sub_graphs: A list of Subgraph objects. Each stores the relevant + execution information for the particular subgraph, such as its current step + in the sub graph, inference state, output, and end step. + + :returns: a list of outputs for all the completed Subgraph objects. Returned + in the same order that the subgraphs were passed to the function. + """ + for i in range(len(sub_graphs)): + sub_graphs[i].output = self._run_next( + sub_graph_inputs[i], sub_graphs[i].inf, sub_graphs[i].step ) - next_step, operator_output, state_update = outputs - if state_update: - inference_state.update_state(state_update) - inp = operator_output - return inp + + # Execute all sub graphs until all graphs have been completed. + while True: + for sub_graph in sub_graphs: + if isinstance(sub_graph.output, Future) and sub_graph.output.done(): + # get the result for the completed operator; resolve its output + operator_output = sub_graph.output.result() + operator_output = sub_graph.parse_output(operator_output) + + # determine the next step for the particular operator, using + # its previous output and previously stored step + next_step = self.router.next( + sub_graph.step, self.ops, operator_output + ) + # update the step + sub_graph.step = next_step + + # store the output for the next step. If the next step is + # end step, this particular route has completed. Simply + # update the output value + if next_step in sub_graph.end: + sub_graph.output = operator_output + else: + sub_graph.output = self._run_next( + inp=operator_output, + inference_state=sub_graph.inf, + next_step=next_step, + ) + break + + # keep running until all sub graphs have completed. + if not any(isinstance(x.output, Future) for x in sub_graphs): + break + + return [x.output for x in sub_graphs] def _apply_split(self, inp: Any, inference_state: InferenceState): """ @@ -93,59 +159,29 @@ def _apply_split(self, inp: Any, inference_state: InferenceState): """ batches, orig_batch_size = self.expand_inputs(inp, 1) - run_with_state = partial( - self._run_sequential, - pipeline_state=self.pipeline_state, - start=self.router.route[self.router.SPLIT_ROUTE], - end=self.router.JOIN_ROUTE, - ) - inference_state_list = [ - copy.deepcopy(inference_state) for x in range(len(batches)) - ] - futures = self._scheduler_group.map( - batches, - inference_state_list, - func=run_with_state, - ) - return self.condense_inputs([x.result() for x in futures]) - def _run_next_step( - self, - *args, - func: Callable, - next_step: Union[str, int], - input: Any = None, - **kwargs, - ): - """ - Generic function to run a given func, process the output and determine the next - step. - """ - if input: - operator_output = ( - func(*args, **kwargs, **input) - if isinstance(input, dict) - else func(input, *args, **kwargs) + # Create a list of SplitRoutes, per batch size 1 + # Each SplitRoute object holds information about the particular path it + # follows. All start at the same step defined by SPLIT_ROUTE and start + # with the same inference_state. 
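# The SubGraph objects constructed just below come from src/deepsparse/v2/utils/data.py,
# a module added by this patch whose body is not shown here. The class sketched next is
# only inferred from how its fields are used in this file; the name SubGraphSketch and the
# parse_output behavior are assumptions, not the real definition.
from dataclasses import dataclass, field
from typing import Any, List

@dataclass
class SubGraphSketch:
    inf: Any                                       # per-branch copy of the InferenceState
    step: str                                      # name of the next operator to run
    end: List[str] = field(default_factory=list)   # step names that terminate this branch
    output: Any = None                             # latest operator output; a Future while pending

    def parse_output(self, operator_output):
        # presumably unpacks (output, state_update) tuples and folds the update into self.inf
        return operator_output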
+ split_graphs = [ + SubGraph( + inf=copy.deepcopy(inference_state), + step=self.router.route[self.router.SPLIT_ROUTE], + end=[self.router.JOIN_ROUTE], ) - else: - operator_output = func(*args, **kwargs) - - if isinstance(operator_output, Future): - operator_output = operator_output.result() - - state_update = None - if isinstance(operator_output, tuple): - state_update = operator_output[-1] - operator_output = operator_output[0] + for i in range(len(batches)) + ] - next_step = self.router.next(next_step, self.ops, operator_output) - return next_step, operator_output, state_update + outputs = self._run_sub_graphs( + sub_graph_inputs=batches, sub_graphs=split_graphs + ) + return self.condense_inputs(outputs) def run( self, *args, inference_state: InferenceState, - pipeline_state: PipelineState, **kwargs, ): """ @@ -158,36 +194,56 @@ def run( """ next_step = self.router.START_ROUTE operator_output = None - while next_step != self.router.END_ROUTE: + + # Split Grap Execution (i.e multiple subgraphs) # NOTE: split_route should only appear after the start route node if next_step == self.router.SPLIT_ROUTE: + if operator_output is None: + raise ValueError( + f"{self.router.SPLIT_ROUTE} should appear after " + f"{self.ROUTER.START_ROUTE}" + ) + operator_output = self._apply_split(operator_output, inference_state) next_step = self.router.route[self.router.JOIN_ROUTE] + if next_step == self.router.END_ROUTE: + return operator_output if next_step == self.router.START_ROUTE: - outputs = self._run_next_step( + operator_output = run_func( *args, - next_step=next_step, func=self._scheduler_group.submit, - inference_state=inference_state, operator=self.ops[next_step], - pipeline_state=pipeline_state, + inference_state=inference_state, + pipeline_state=self.pipeline_state, **kwargs, - ) + ).result() + + if isinstance(operator_output, tuple): + operator_output, state_update = ( + operator_output[0], + operator_output[-1], + ) + inference_state.update_state(state_update) + + next_step = self.router.next(next_step, self.ops, operator_output) + else: - outputs = self._run_next_step( - func=self._scheduler_group.submit, - input=operator_output, - next_step=next_step, - inference_state=inference_state, - operator=self.ops[next_step], - pipeline_state=pipeline_state, + # Single graph execution + graph = SubGraph( + inf=copy.deepcopy(inference_state), + step=next_step, + end=[self.router.SPLIT_ROUTE, self.router.END_ROUTE], ) - next_step, operator_output, state_update = outputs - if state_update: - inference_state.update_state(state_update) + operator_output = self._run_sub_graphs( + sub_graph_inputs=[operator_output], sub_graphs=[graph] + )[0] + + inference_state = graph.inf + next_step = graph.step + return operator_output def __call__(self, *args, **kwargs): @@ -204,11 +260,7 @@ def __call__(self, *args, **kwargs): inference_state = InferenceState() inference_state.create_state({}) - if "pipeline_state" in kwargs: - self.pipeline_state = kwargs.get("pipeline_state") - kwargs["inference_state"] = inference_state - kwargs["pipeline_state"] = self.pipeline_state return self.run(*args, **kwargs) diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 6b0d851aef..6740f706f1 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -83,8 +83,6 @@ class LinearRouter(Router): def __init__(self, end_route: int, start_route: int = 0): super().__init__(end_route=end_route, start_route=start_route) - self.SPLIT_ROUTE = None - self.JOIN_ROUTE = None 
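+        # NOTE (illustrative sketch): SPLIT/JOIN are expected to be used with a
+        # GraphRouter whose route dict brackets the per-item section of the graph,
+        # e.g. (operator names here are placeholders, not real operators):
+        #   {"preprocess": "SPLIT", "SPLIT": "engine", "engine": "JOIN",
+        #    "JOIN": "postprocess", "postprocess": "STOP"}
+        # Between SPLIT and JOIN, each batch-size-1 item runs as its own SubGraph.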
_LOGGER.warn("SPLIT and JOIN are not yet supported for the LinearRouter.") def next( diff --git a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py index 669c5922a0..cc74ac0996 100644 --- a/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py +++ b/src/deepsparse/v2/schedulers/continuous_batching_scheduler.py @@ -50,7 +50,7 @@ def __init__(self): engine_operator = EngineOperator(...) ... continuous_batching_scheduler = ContinuousBatchingScheduler.get_instance() - continuous_batching_scheduler.add_engine_operator(engine_operator) + continuous_batching_scheduler.add_engine_operator(engine_operator, [1]) super.__init__(...) ``` @@ -58,6 +58,8 @@ def __init__(self): :param max_workers: maximum number of threads to execute at once, default 1 """ + # TODO: If the singleton always returns max_workers 1, should we remove this arg/not + # give the user a choice? def __init__(self, max_workers: int = 1): self._max_workers = max_workers @@ -82,6 +84,8 @@ def get_instance(cls) -> "ContinuousBatchingScheduler": does not exist yet, a scheduler with a single worker thread to schedule all jobs is created and started """ + global _GLOBAL_SCHEDULER + if _GLOBAL_SCHEDULER is not None: return _GLOBAL_SCHEDULER # noqa: F823 @@ -161,8 +165,18 @@ def add_engine_operator( for batch_size in batch_sizes: if batch_size == 1: continue # already added - operator_engines[batch_size] = operator_engines.create_engine( - batch_size=batch_size + + override_model_path = None + # text generation/NLEngineOperator specific; could add generic method + # for all engine_operators, if desired + if hasattr(engine_operator, "override_model_inputs"): + override_model_path = engine_operator.override_model_inputs( + model_path=engine_operator.model_path, batch_size=batch_size + ) + + # will break for internal kv_cache; needs additional argument + operator_engines[batch_size] = engine_operator.create_engine( + batch_size=batch_size, model_path=override_model_path ) self._operators_to_engines[engine_operator] = operator_engines diff --git a/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py b/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py index 86afdf309c..40ff00ca4f 100644 --- a/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py +++ b/src/deepsparse/v2/schedulers/utils/continuous_batching_executor.py @@ -71,7 +71,7 @@ def _working_loop(self): ] # run the engine operator with the given engine at the joined batch size - joined_outputs = engine_operator(joined_inputs) + joined_outputs = engine_operator(joined_inputs, inference_state=None) # split outputs and return the results to their respective futures split_outputs = joined_outputs.split() diff --git a/src/deepsparse/v2/text_generation/compile_generated_tokens.py b/src/deepsparse/v2/text_generation/compile_generated_tokens.py index c87436ab3a..630067f8c3 100644 --- a/src/deepsparse/v2/text_generation/compile_generated_tokens.py +++ b/src/deepsparse/v2/text_generation/compile_generated_tokens.py @@ -42,7 +42,7 @@ def run( if finish_reason is not None: in_generation = False - state_update = { # TODO: check if necessary + state_update = { "finished_reason": finished_reason, "generated_tokens": generated_tokens, "generated_logits": generated_logits, diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py index 21bd50e03e..48a7158f66 100644 --- 
a/src/deepsparse/v2/text_generation/compile_logits.py +++ b/src/deepsparse/v2/text_generation/compile_logits.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any - from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs from deepsparse.v2.utils import InferenceState @@ -28,12 +27,13 @@ class CompilePromptLogits(Operator): take prompt logits from each iteration run and update the inference state. """ - def can_operate(self, inp: Any): - if inp.get("in_generation") is None: + def can_operate(self, inp: NLEngineOutputs): + if inp.in_generation is None: return True return False - def run(self, logits, inference_state: InferenceState, **kwargs): + def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): + logits = inp.engine_outputs logit_type = "prompt_logits" if inference_state.current_state.get(logit_type) is not None: @@ -44,6 +44,6 @@ def run(self, logits, inference_state: InferenceState, **kwargs): state_update = {logit_type: current_logits} return { - "kv_cache": kwargs.get("kv_cache"), - "tokens": kwargs.get("tokens"), + "kv_cache": inp.kv_cache, + "tokens": inp.tokens, }, state_update diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py index 33ab546e39..5bf48bbdbc 100644 --- a/src/deepsparse/v2/text_generation/generate_new_token.py +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Sequence, Union +from typing import Sequence, Union import transformers from deepsparse.transformers.pipelines.text_generation import FinishReason from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs from deepsparse.v2.utils import InferenceState @@ -30,12 +31,15 @@ def __init__( self.force_max_tokens = force_max_tokens self.tokenizer = tokenizer - def can_operate(self, inp: Any): - if inp.get("in_generation"): + def can_operate(self, inp: NLEngineOutputs): + if inp.in_generation: return True return False - def run(self, logits, kv_cache, inference_state: InferenceState, **kwargs): + def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): + logits = inp.engine_outputs + kv_cache = inp.kv_cache + token_generator = inference_state.current_state.get("token_generator") token = token_generator.generate(logits=logits[0, -1, :]) finish_reason = None diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 7549f986d9..d8c80bbaee 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -14,10 +14,13 @@ import copy import os -from typing import Any, List, Tuple +from pathlib import Path +from typing import Any, List, Optional, Tuple, Union +import numpy from pydantic import BaseModel, Field +from deepsparse.utils import join_engine_outputs, split_engine_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, overwrite_onnx_model_inputs_for_kv_cache_models, @@ -29,14 +32,76 @@ ) -__all__ = ["NLEngineOperator", "NlEngineInput"] +__all__ = ["NLEngineOperator", "NLEngineInputs"] -class 
NlEngineInput(BaseModel): - engine_inputs: List = Field(description="engine inputs") +class NLEngineInputs(BaseModel): + engine_inputs: List = Field(description="engine_inputs") kv_cache: Any = Field(description="kv_cache object") tokens: List = Field(description="tokens") - in_generation: bool = Field(description="in_generation", default=None) + in_generation: Any = Field(description="in_generation", default=None) + engine: Optional[Any] = Field( + description="override the engine to run forward pass with", + default=None, + ) + + @classmethod + def join(cls, inputs: List["NLEngineInputs"]) -> "NLEngineInputs": + """ + :param inputs: list of separate EngineOperatorInputs, batch size must be 1 + :return: list of inputs joined into a single input with a multi batch size + """ + all_engine_inputs = [] + all_kv_cache = [] + all_tokens = [] + all_generation = [] + + for engine_input in inputs: + all_engine_inputs.append(engine_input.engine_inputs) + all_kv_cache.append(engine_input.kv_cache) + all_tokens.append(engine_input.tokens) + all_generation.append(engine_input.in_generation) + + for engine_inputs in all_engine_inputs: + if engine_inputs[0].shape[0] != 1: + raise RuntimeError( + "join requires all inputs to have batch size 1, found input with " + f"batch size {engine_inputs[0].shape[0]}" + ) + return cls( + engine_inputs=all_engine_inputs, + tokens=all_tokens, + in_generation=all_generation, + kv_cache=all_kv_cache, + ) + + class Config: + arbitrary_types_allowed = True + + +class NLEngineOutputs(BaseModel): + engine_outputs: Any = Field(description="engine_outputs") + kv_cache: Any = Field(description="kv_cache object") + tokens: List = Field(description="tokens") + in_generation: Any = Field(description="in_generation", default=None) + + def split(self) -> List["NLEngineOutputs"]: + """ + :return: list of the current outputs split to a batch size of 1 each + """ + split_outputs = [ + numpy.expand_dims(self.engine_outputs[i], 0) + for i in range(len(self.engine_outputs)) + ] + return [ + self.__class__( + engine_outputs=split_outputs[i], + kv_cache=self.kv_cache[i], + tokens=self.tokens[i], + in_generation=self.in_generation[i], + ) + for i in range(len(split_outputs)) + ] class NLEngineOperator(EngineOperator): @@ -48,8 +113,8 @@ class NLEngineOperator(EngineOperator): multi-token case. 
""" - input_schema = NlEngineInput - output_schema = None + input_schema = NLEngineInputs + output_schema = NLEngineOutputs def __init__( self, @@ -59,17 +124,17 @@ def __init__( **kwargs, ): + self.sequence_length = sequence_length + self.input_ids_length = input_ids_length self.kv_cache_data_type = None - ( - onnx_file_path, - output_indices_to_be_cached, - kv_cache_data_type, - ) = overwrite_onnx_model_inputs_for_kv_cache_models( - onnx_file_path=kwargs.get("model_path"), - batch_size=kwargs.get("batch_size", 1), - sequence_length=sequence_length, - input_ids_length=input_ids_length, + self.internal_kv_cache = internal_kv_cache + self.model_path = kwargs.get("model_path") + (onnx_file_path, additional_outputs) = self.override_model_inputs( + self.model_path, batch_size=1, return_additional_outputs=True ) + output_indices_to_be_cached, kv_cache_data_type, = additional_outputs.get( + "output_indices_to_be_cached" + ), additional_outputs.get("kv_cache_data_type") engine_kwargs = kwargs.get("engine_kwargs", {}) if kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE: @@ -86,43 +151,95 @@ def __init__( kwargs["engine_kwargs"] = engine_kwargs kwargs["model_path"] = onnx_file_path + super().__init__(**kwargs) - self.input_ids_length = input_ids_length + def override_model_inputs( + self, + model_path: Union[str, Path], + batch_size: int, + return_additional_outputs=False, + ): + """ + Override the model based on the provided batch_size, sequence_length, + and input_ids_length. + + :param model_path: Path to the model + :param batch_size: The batch size to be used for the model + :return: new overwritten model file path. Optionally returns additional outputs + specific to the NLDecoder engine + """ + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=model_path, + batch_size=batch_size, + sequence_length=self.sequence_length, + input_ids_length=self.input_ids_length, + ) + if return_additional_outputs: + return onnx_file_path, { + "output_indices_to_be_cached": output_indices_to_be_cached, + "kv_cache_data_type": kv_cache_data_type, + } + return onnx_file_path - def run(self, inp: NlEngineInput, **kwargs) -> Any: + def run(self, inp: NLEngineInputs, **kwargs) -> NLEngineOutputs: engine_input = inp.engine_inputs kv_cache = inp.kv_cache - inputs = self._add_kv_cache_to_input(engine_input, kv_cache) - if bool(kv_cache.engine_internal_cache): + split = True + if not isinstance(kv_cache, list): + split = False + kv_cache = [kv_cache] + engine_input = [engine_input] + + inputs = list(map(self._add_kv_cache_to_input, engine_input, kv_cache)) + + if bool(kv_cache[0].engine_internal_cache): # conventionally, before dispatching # inputs to the engine, we validate them # if val_inp=True. However, in this case # we want to pass the empty kv cache inputs # (batch_size=0) to the engine. 
Therefore, # we skip the validation + + # Internal kv_cache works for batch_size of 1 atm out = self.engine._eng_net.execute_list_out( - inputs, kv_cache.engine_internal_cache + inputs[0], kv_cache[0].engine_internal_cache ) else: # run the engine without the LIB.kv_cache object + # stack multiple batch inputs along the batch dimension + inputs = join_engine_outputs(inputs, len(inputs)) out = ( super() - .run(EngineOperatorInputs(engine_inputs=inputs), **kwargs) + .run( + EngineOperatorInputs(engine_inputs=inputs, engine=inp.engine), + **kwargs, + ) .get("engine_outputs") ) + # logits should be stacked along batch dim + # kv_cache_state should be a list where each dim 0 is batch_size logits, *kv_cache_state = out - self._update_kv_cache( - kv_cache_state=kv_cache_state, - input_ids_len=self.input_ids_length, - kv_cache=kv_cache, - ) + kv_cache_state, _ = split_engine_inputs(kv_cache_state, 1) + + if len(kv_cache_state) > 0: + for i in range(len(kv_cache)): + self._update_kv_cache( + kv_cache_state=kv_cache_state[i], kv_cache=kv_cache[i] + ) + else: + # internal kv cache case + self._update_kv_cache(kv_cache=kv_cache[0]) output = { - "logits": logits, - "kv_cache": kv_cache, + "engine_outputs": logits, + "kv_cache": kv_cache if split else kv_cache[0], "tokens": inp.tokens, "in_generation": inp.in_generation, } @@ -137,9 +254,9 @@ def _add_kv_cache_to_input(self, engine_input, kv_cache): new_inp = [kv_cache_state[name] for name in self.engine.input_names] return new_inp - def _update_kv_cache(self, kv_cache_state, input_ids_len, kv_cache): + def _update_kv_cache(self, kv_cache, kv_cache_state=None): if bool(kv_cache.engine_internal_cache): - kv_cache.total_num_processed_tokens += input_ids_len + kv_cache.total_num_processed_tokens += self.input_ids_length return kv_cache_state = { @@ -147,10 +264,7 @@ def _update_kv_cache(self, kv_cache_state, input_ids_len, kv_cache): for name, array in zip(self.onnx_input_names_cached, kv_cache_state) } - kv_cache.update( - state=kv_cache_state, - input_ids_len=input_ids_len, - ) + kv_cache.update(state=kv_cache_state, input_ids_len=self.input_ids_length) @property def onnx_input_names_no_cache(self) -> List[str]: diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 5ab73f7a48..ae7334cffd 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -12,14 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict, Optional +import logging +from typing import Dict, List, Optional from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs +from deepsparse.v2.operators import EngineOperator from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter -from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.schedulers import ContinuousBatchingScheduler, OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, CompileGeneratedTokens, @@ -39,6 +41,9 @@ from deepsparse.v2.utils import PipelineState +_LOGGER = logging.getLogger(__name__) + + class TextGenerationPipeline(Pipeline): def __init__( self, @@ -48,6 +53,7 @@ def __init__( internal_kv_cache: bool = True, force_max_tokens: bool = False, generation_config=None, + continuous_batch_sizes: Optional[List[int]] = None, engine_kwargs: Optional[Dict] = None, ): ( @@ -133,6 +139,20 @@ def __init__( compile_generated_tokens = CompileGeneratedTokens() join_output = JoinOutput(tokenizer=self.tokenizer) + # TODO: do we want to support lists for different engines? + continuous_batching_scheduler = None + if continuous_batch_sizes: + if internal_kv_cache: + _LOGGER.warning( + "internal kv_cache is currently not supported with " + "continuous batching" + ) + else: + continuous_batching_scheduler = self._get_continuous_batching_scheduler( + batch_sizes=continuous_batch_sizes, + engines=[single_engine_operator, multi_engine_operator], + ) + ops = { "process_input": process_inputs, "single_engine": single_engine_operator, @@ -183,7 +203,11 @@ def __init__( ) scheduler = [OperatorScheduler()] super().__init__( - ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state + ops=ops, + router=router, + schedulers=scheduler, + pipeline_state=pipeline_state, + continuous_batching_scheduler=continuous_batching_scheduler, ) def expand_inputs(self, items, batch_size): @@ -194,3 +218,21 @@ def condense_inputs(self, *args, **kwargs): return args[0], kwargs + + def _get_continuous_batching_scheduler( + self, batch_sizes: List[int], engines: List[EngineOperator] + ) -> ContinuousBatchingScheduler: + """ + Fetch the continuous batching scheduler. Requires adding the EngineOperator + that will run through the scheduler. + + :param batch_sizes: List of batch sizes to be used by the models + :param engines: List of EngineOperators which should be scheduled using the + continuous batching scheduler + + :returns: ContinuousBatchingScheduler + """ + continuous_batching_scheduler = ContinuousBatchingScheduler.get_instance() + for op in engines: + continuous_batching_scheduler.add_engine_operator(op, batch_sizes) + return continuous_batching_scheduler diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py index 358405d7af..75935a9729 100644 --- a/src/deepsparse/v2/utils/__init__.py +++ b/src/deepsparse/v2/utils/__init__.py @@ -13,5 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
+from .helpers import * from .state import * from .types import * + + +from .data import * # isort:skip diff --git a/src/deepsparse/v2/utils/data.py b/src/deepsparse/v2/utils/data.py new file mode 100644 index 0000000000..40402734cf --- /dev/null +++ b/src/deepsparse/v2/utils/data.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, List + +from deepsparse.v2.utils import InferenceState + + +__all__ = ["SubGraph"] + + +@dataclass +class SubGraph: + """ + Helper dataclass to store information about each running sub graph. + """ + + step: int + inf: InferenceState + end: List[str] + output: Any = None + + def parse_output(self, operator_output: Any): + if isinstance(operator_output, tuple): + operator_output, state_update = operator_output[0], operator_output[-1] + self.inf.update_state(state_update) + return operator_output diff --git a/src/deepsparse/v2/utils/helpers.py b/src/deepsparse/v2/utils/helpers.py new file mode 100644 index 0000000000..1f4bedc6c9 --- /dev/null +++ b/src/deepsparse/v2/utils/helpers.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Callable + + +__all__ = ["run_func"] + + +def run_func( + *args, + func: Callable, + inp: Any = None, + **kwargs, +): + """ + Generic function to run a given Callable. + """ + if inp: + output = ( + func(*args, **kwargs, **inp) + if isinstance(inp, dict) + else func(inp, *args, **kwargs) + ) + else: + output = func(*args, **kwargs) + return output diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py index 34a8f7a258..c53899f30c 100644 --- a/tests/deepsparse/v2/integration_tests/test_llms.py +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -135,7 +135,7 @@ def test_ort_single_token_prefill(self, setup): pipeline = self.get_pipeline( prompt_sequence_length=1, - engine_type="onnxruntime", + engine_kwargs={"engine_type": "onnxruntime"}, ) output = pipeline( prompt=self.prompt, @@ -163,7 +163,7 @@ def test_ort_multi_token_prefill(self, setup): "Cannot run ORT pipeline with the internal deepsparse cache enabled." 
) pipeline = self.get_pipeline( - engine_type="onnxruntime", + engine_kwargs={"engine_type": "onnxruntime"}, ) output = pipeline( prompt=self.prompt, @@ -244,7 +244,7 @@ def test_inference_no_kv_cache_ort(self, setup): def _test_inference_no_kv_cache(self, engine_type): model_path_no_cache = self._get_model_path_no_cache() pipeline = self.get_pipeline( - model_path=model_path_no_cache, engine_type=engine_type + model_path=model_path_no_cache, engine_kwargs={"engine_type": engine_type} ) assert not pipeline.cache_support_enabled, ( "This pipeline test inference using non-kv cache " diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py index 5d8483e5f6..3840a9bb0a 100644 --- a/tests/deepsparse/v2/unit/text_generation/conftest.py +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -19,15 +19,14 @@ import pytest from deepsparse.transformers.helpers import get_deployment_path -from deepsparse.transformers.pipelines.text_generation import TextGenerationInput +from deepsparse.transformers.pipelines.text_generation import ( + GenerationDefaults, + TextGenerationInput, +) from deepsparse.transformers.utils import DecoderKVCache from deepsparse.transformers.utils.helpers import initialize_kv_cache_state from deepsparse.v2 import InferenceState, PipelineState -from deepsparse.v2.text_generation import ( - GenerationDefaults, - NLEngineOperator, - TokenGeneratorOperator, -) +from deepsparse.v2.text_generation import NLEngineOperator, TokenGeneratorOperator @pytest.fixture(scope="module") diff --git a/tests/deepsparse/v2/unit/text_generation/test_misc.py b/tests/deepsparse/v2/unit/text_generation/test_misc.py index caa0cc2efd..f215e2aedb 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_misc.py +++ b/tests/deepsparse/v2/unit/text_generation/test_misc.py @@ -13,16 +13,23 @@ # limitations under the License. from deepsparse.v2.text_generation import CompilePromptLogits +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs -def test_compile_logits(mock_logits, mock_inference_state): +def test_compile_logits(mock_logits, mock_inference_state, mock_tokens, mock_kv_cache): mock_inference_state.update_state({"prompt_logits": [mock_logits]}) compile_prompt_logits = CompilePromptLogits() # Can operate as long as we're not in generation but in prompt_inference. This # can_operate() will check for the `in_generation` flag in the input. - assert compile_prompt_logits.can_operate({}) + inp = NLEngineOutputs( + engine_outputs=mock_logits, + tokens=mock_tokens, + kv_cache=mock_kv_cache, + in_generation=None, + ) + assert compile_prompt_logits.can_operate(inp=inp) output, state = compile_prompt_logits.run( - logits=mock_logits, inference_state=mock_inference_state + inp=inp, inference_state=mock_inference_state ) # The CompilePromptLogits is responsible for updating a list of prompt logits # calculated at each step during prompt inference. After one step of running this diff --git a/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py index be59db7475..02f4540c44 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py +++ b/tests/deepsparse/v2/unit/text_generation/test_process_inputs.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from deepsparse.v2.text_generation import ( - GenerationDefaults, - ProcessInputsTextGeneration, -) +from deepsparse.transformers.pipelines.text_generation import GenerationDefaults +from deepsparse.v2.text_generation import ProcessInputsTextGeneration def test_process_inputs( diff --git a/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py index 335a28fbe3..19bb4d1c4a 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py +++ b/tests/deepsparse/v2/unit/text_generation/test_single_token_engine.py @@ -16,7 +16,7 @@ from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, - NlEngineInput, + NLEngineInputs, ) @@ -89,10 +89,10 @@ def test_run_single_token_engine_once( numpy.array([[0]]), numpy.array([[[[0, 0, 0, 0, 1]]]]), ] - inputs = NlEngineInput( + inputs = NLEngineInputs( engine_inputs=mock_engine_inputs, kv_cache=mock_kv_cache_single_token_engine, tokens=mock_engine_inputs[0].tolist(), ) output = single_token_engine_no_internal_cache.run(inputs) - assert output.get("logits") is not None + assert output.get("engine_outputs") is not None diff --git a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py index fbd9e06778..d04f863171 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -18,6 +18,7 @@ PrepareGeneration, TokenGeneratorOperator, ) +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs def test_prep_for_generation( @@ -68,6 +69,7 @@ def test_generate_new_token( mock_kv_cache, mock_inference_state, mock_logits, + mock_tokens, ): """ This test is responsible for testing the GenerateNewTokenOperator, which generates @@ -84,8 +86,14 @@ def test_generate_new_token( "generated_tokens": [mock_token_generator.tokens], } ) + inp = NLEngineOutputs( + engine_outputs=mock_logits, + tokens=mock_tokens, + kv_cache=mock_kv_cache, + in_generation=True, + ) outputs, state = generate_new_token.run( - logits=mock_logits, kv_cache=mock_kv_cache, inference_state=mock_inference_state + inp=inp, inference_state=mock_inference_state ) # The new_token generated/returned by ths operator should match the last token in # token_generator From bb3ff413f77927020016a8f16ae38606a5750218 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 21 Nov 2023 15:41:41 -0500 Subject: [PATCH 36/57] [Pipeline Refactor] Operator Registry (#1420) * initial registry functionality * use sparsezoo mixin --- src/deepsparse/v2/__init__.py | 1 + src/deepsparse/v2/operators/__init__.py | 1 + src/deepsparse/v2/operators/operator.py | 15 ++ src/deepsparse/v2/operators/registry.py | 76 +++++++ src/deepsparse/v2/task.py | 204 ++++++++++++++++++ src/deepsparse/v2/text_generation/pipeline.py | 2 + 6 files changed, 299 insertions(+) create mode 100644 src/deepsparse/v2/operators/registry.py create mode 100644 src/deepsparse/v2/task.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py index 29fcd4126c..5fd33a9503 100644 --- a/src/deepsparse/v2/__init__.py +++ b/src/deepsparse/v2/__init__.py @@ -18,4 +18,5 @@ from .pipeline import * from .routers import * from .schedulers import * +from .task import * from .utils import * diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py index bf58018493..ae14f2a373 100644 --- 
a/src/deepsparse/v2/operators/__init__.py +++ b/src/deepsparse/v2/operators/__init__.py @@ -16,3 +16,4 @@ # limitations under the License. from .operator import * from .engine_operator import * +from .registry import * diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 2923862b12..377088e09e 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -18,6 +18,7 @@ from pydantic import BaseModel from deepsparse.v2.utils import InferenceState +from deepsparse.v2.operators.registry import OperatorRegistry __all__ = ["Operator"] @@ -100,6 +101,20 @@ def __call__( return self.output_schema(**run_output) return run_output + @staticmethod + def create( + task: str, + **kwargs, + ) -> "Operator": + """ + :param task: Operator task + :param kwargs: extra task specific kwargs to be passed to task Operator + implementation + :return: operator object initialized for the given task + """ + operator_constructor = OperatorRegistry.get_task_constructor(task) + return operator_constructor(**kwargs) + @abstractmethod def run(self, *args, **kwargs) -> Any: """ diff --git a/src/deepsparse/v2/operators/registry.py b/src/deepsparse/v2/operators/registry.py new file mode 100644 index 0000000000..1b83b20728 --- /dev/null +++ b/src/deepsparse/v2/operators/registry.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Type + +from deepsparse.v2.task import SupportedTasks, dynamic_import_task +from sparsezoo.utils.registry import ( + RegistryMixin, + get_from_registry, + register, + registered_names, +) + + +__all__ = ["OperatorRegistry"] + + +class OperatorRegistry(RegistryMixin): + """ + Register operators with given task name(s). Leverages the RegistryMixin + functionality. + """ + + @classmethod + def register_value(cls, operator, name): + from deepsparse.v2.operators import Operator + + if not isinstance(name, list): + name = [name] + + for task_name in name: + register(Operator, operator, task_name, require_subclass=True) + + return operator + + @classmethod + def get_task_constructor(cls, task: str) -> Type["Operator"]: # noqa: F821 + """ + This function retrieves the class previously registered via + `OperatorRegistry.register` for `task`. + + If `task` starts with "import:", it is treated as a module to be imported, + and retrieves the task via the `TASK` attribute of the imported module. + + If `task` starts with "custom", then it is mapped to the "custom" task. 
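+
+        For example (illustrative only; `MyOperator` and `"my_task"` are
+        hypothetical names, not part of this module):
+
+        ```python
+        @OperatorRegistry.register(name="my_task")
+        class MyOperator(Operator):
+            def run(self, *args, **kwargs):
+                ...
+
+        op = Operator.create(task="my_task")
+        ```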
+ + :param task: The task name to get the constructor for + :return: The class registered to `task` + :raises ValueError: if `task` was not registered via `OperatorRegistry.register` + """ + from deepsparse.v2.operators import Operator + + if task.startswith("import:"): + # dynamically import the task from a file + task = dynamic_import_task(module_or_path=task.replace("import:", "")) + elif task.startswith("custom"): + # support any task that has "custom" at the beginning via the "custom" task + task = "custom" + else: + task = task.lower().replace("-", "_") + + tasks = registered_names(Operator) + # step needed to import relevant files required to load the operator + SupportedTasks.check_register_task(task, tasks) + return get_from_registry(Operator, task) diff --git a/src/deepsparse/v2/task.py b/src/deepsparse/v2/task.py new file mode 100644 index 0000000000..f1f4fc6d66 --- /dev/null +++ b/src/deepsparse/v2/task.py @@ -0,0 +1,204 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Classes and implementations for supported tasks in the DeepSparse pipeline and system +""" + +import importlib +import logging +import os +import sys +from collections import namedtuple +from typing import Iterable, List, Optional, Tuple + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["SupportedTasks", "AliasedTask"] + + +class AliasedTask: + """ + A task that can have multiple aliases to match to. + For example, question_answering which can alias to qa as well + + :param name: the name of the task such as question_answering or text_classification + :param aliases: the aliases the task can go by in addition to the name such as + qa, glue, sentiment_analysis, etc + """ + + def __init__(self, name: str, aliases: List[str]): + self._name = name + self._aliases = aliases + + @property + def name(self) -> str: + """ + :return: the name of the task such as question_answering + """ + return self._name + + @property + def aliases(self) -> List[str]: + """ + :return: the aliases the task can go by such as qa, glue, sentiment_analysis + """ + return self._aliases + + def matches(self, task: str) -> bool: + """ + :param task: the name of the task to check whether the given instance matches. + Checks the current name as well as any aliases. + Everything is compared at lower case and "-" and whitespace + are replaced with "_". 
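+
+        For example, mirroring the aliases from the class docstring above,
+        AliasedTask("question_answering", ["qa"]).matches("Question-Answering")
+        and .matches("qa") would both return True.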
+ :return: True if task does match the current instance, False otherwise + """ + task = task.lower().replace("-", "_") + + # replace whitespace with "_" + task = "_".join(task.split()) + + return task == self.name or task in self.aliases + + +class SupportedTasks: + """ + The supported tasks in the DeepSparse pipeline and system + """ + + text_generation = namedtuple( + "text_generation", ["text_generation", "opt", "bloom"] + )( + text_generation=AliasedTask("text_generation", []), + opt=AliasedTask("opt", []), + bloom=AliasedTask("bloom", []), + ) + + all_task_categories = [text_generation] + + @classmethod + def check_register_task( + cls, task: str, extra_tasks: Optional[Iterable[str]] = None + ): + """ + :param task: task name to validate and import dependencies for + :param extra_tasks: valid task names that are not included in supported tasks. + i.e. tasks registered to Pipeline at runtime + """ + if cls.is_text_generation(task): + import deepsparse.v2.text_generation.pipeline # noqa: F401 + + all_tasks = set(cls.task_names() + (list(extra_tasks or []))) + if task not in all_tasks: + raise ValueError( + f"Unknown Pipeline task {task}. Currently supported tasks are " + f"{list(all_tasks)}" + ) + + @classmethod + def is_text_generation(cls, task: str) -> bool: + """ + :param task: the name of the task to check whether it is a text generation task + such as codegen + :return: True if it is a text generation task, False otherwise + """ + return any( + text_generation_task.matches(task) + for text_generation_task in cls.text_generation + ) + + @classmethod + def task_names(cls): + task_names = ["custom"] + for task_category in cls.all_task_categories: + for task in task_category: + unique_aliases = ( + alias for alias in task._aliases if alias != task._name + ) + task_names += (task._name, *unique_aliases) + return task_names + + +def dynamic_import_task(module_or_path: str) -> str: + """ + Dynamically imports `module` with importlib, and returns the `TASK` + attribute on the module (something like `importlib.import_module(module).TASK`). + + Example contents of `module`: + ```python + from deepsparse.pipeline import Pipeline + from deepsparse.transformers.pipelines.question_answering import ( + QuestionAnsweringPipeline, + ) + + TASK = "my_qa_task" + Pipeline.register(TASK)(QuestionAnsweringPipeline) + ``` + + NOTE: this modifies `sys.path`. + + :raises FileNotFoundError: if path does not exist + :raises RuntimeError: if the imported module does not contain `TASK` + :raises RuntimeError: if the module doesn't register the task + :return: The task from the imported module. + """ + parent_dir, module_name = _split_dir_and_name(module_or_path) + if not os.path.exists(os.path.join(parent_dir, module_name + ".py")): + raise FileNotFoundError( + f"Unable to find file for {module_or_path}. " + f"Looked for {module_name}.py under {parent_dir if parent_dir else '.'}" + ) + + # add parent_dir to sys.path so we can import the file as a module + sys.path.append(os.curdir) + if parent_dir: + _LOGGER.info(f"Adding {parent_dir} to sys.path") + sys.path.append(parent_dir) + + # do the import + _LOGGER.info(f"Importing '{module_name}'") + module_or_path = importlib.import_module(module_name) + + if not hasattr(module_or_path, "TASK"): + raise RuntimeError( + "When using --task import:, " + "module must set the `TASK` attribute." 
+ ) + + task = getattr(module_or_path, "TASK") + _LOGGER.info(f"Using task={repr(task)}") + + return task + + +def _split_dir_and_name(module_or_path: str) -> Tuple[str, str]: + """ + Examples: + - `a` -> `("", "a")` + - `a.b` -> `("a", "b")` + - `a.b.c` -> `("a/b", "c")` + + :return: module split into directory & name + """ + if module_or_path.endswith(".py"): + # assume path + split_char = os.sep + module_or_path = module_or_path.replace(".py", "") + else: + # assume module + split_char = "." + *dirs, module_name = module_or_path.split(split_char) + parent_dir = os.sep if dirs == [""] else os.sep.join(dirs) + return parent_dir, module_name diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index ae7334cffd..344980dc3f 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -19,6 +19,7 @@ from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs from deepsparse.v2.operators import EngineOperator +from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import ContinuousBatchingScheduler, OperatorScheduler @@ -44,6 +45,7 @@ _LOGGER = logging.getLogger(__name__) +@OperatorRegistry.register(name="text_generation") class TextGenerationPipeline(Pipeline): def __init__( self, From 90de2b352c47fff541113f5529a52a715c079885 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 22 Nov 2023 11:23:57 +0000 Subject: [PATCH 37/57] fix tricky rebase --- src/deepsparse/v2/operators/operator.py | 2 +- src/deepsparse/v2/pipeline.py | 4 ++-- .../v2/text_generation/nl_engine_operator.py | 24 ++++++++++--------- src/deepsparse/v2/text_generation/pipeline.py | 14 +++++------ .../v2/unit/text_generation/conftest.py | 4 ++-- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 377088e09e..e775056f8f 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,8 +17,8 @@ from pydantic import BaseModel -from deepsparse.v2.utils import InferenceState from deepsparse.v2.operators.registry import OperatorRegistry +from deepsparse.v2.utils import InferenceState __all__ = ["Operator"] diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 78d112a2b3..59970b2820 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -15,7 +15,7 @@ import copy from concurrent.futures import Future -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union from deepsparse.v2.operators import EngineOperator, Operator from deepsparse.v2.routers import Router @@ -55,7 +55,7 @@ def __init__( ops: Union[Dict[str, Operator], List[Operator]], router: Router, schedulers: List[OperatorScheduler], - continuous_batching_scheduler: ContinuousBatchingScheduler, + continuous_batching_scheduler: Optional[ContinuousBatchingScheduler] = None, pipeline_state: PipelineState = None, ): diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 407415d00a..9bef8ceb87 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -33,10 +33,12 @@ ) -__all__ = ["NLEngineOperator", - 
"NlEngineOperatorNoCache", - "NlEngineInputNoCache", - "NLEngineInputs"] +__all__ = [ + "NLEngineOperator", + "NLEngineOperatorNoCache", + "NLEngineInputsNoCache", + "NLEngineInputs", +] class NLEngineInputs(BaseModel): @@ -108,12 +110,12 @@ def split(self) -> List["NLEngineOutputs"]: ] -class NlEngineInputsNoCache(BaseModel): +class NLEngineInputsNoCache(BaseModel): input_ids: Any attention_mask: Any -class NlEngineOperator(EngineOperator): +class NLEngineOperator(EngineOperator): """ Operator for the NL Decoder Engine. This Operator inherits from the EngineOperator. @@ -122,8 +124,8 @@ class NlEngineOperator(EngineOperator): multi-token case. """ - input_schema = NlEngineInputs - output_schema = NlEngineOutputs + input_schema = NLEngineInputs + output_schema = NLEngineOutputs def __init__( self, @@ -320,14 +322,14 @@ def output_names(self) -> List[str]: return self.engine.output_names -class NlEngineOperatorNoCache(EngineOperator): +class NLEngineOperatorNoCache(EngineOperator): """ Operator the Natural Language Engine, that operates without KV Cache. This means that this operator merely maps input_ids and attention_mask to logits """ - input_schema = NlEngineInputNoCache + input_schema = NLEngineInputsNoCache output_schema = None def __init__(self, sequence_length: int, **kwargs): @@ -338,7 +340,7 @@ def __init__(self, sequence_length: int, **kwargs): ) super().__init__(**kwargs) - def run(self, inp: NlEngineInputNoCache, **kwargs) -> Any: + def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: engine_inputs = [inp.input_ids, inp.attention_mask] logits = ( super() diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 44e38399a5..f21f671676 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -23,8 +23,6 @@ from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter, LinearRouter -from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import ContinuousBatchingScheduler, OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, @@ -35,8 +33,8 @@ JoinOutput, KVCacheCreator, MultiEnginePrefill, - NlEngineOperator, - NlEngineOperatorNoCache, + NLEngineOperator, + NLEngineOperatorNoCache, PrepareforPrefill, PrepareGeneration, ProcessInputsTextGeneration, @@ -48,6 +46,7 @@ _LOGGER = logging.getLogger(__name__) + class TextGenerationPipelineNoCache(Pipeline): def __init__( self, @@ -81,7 +80,7 @@ def __init__( sequence_length=sequence_length, tokenizer=self.tokenizer, ), - NlEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), + NLEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=1, @@ -120,6 +119,7 @@ def verify_no_kv_cache_present(self) -> bool: ) return not is_kv_cache_present + @OperatorRegistry.register(name="text_generation") class TextGenerationPipeline(Pipeline): def __init__( @@ -148,14 +148,14 @@ def __init__( if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False - single_engine_operator = NlEngineOperator( + single_engine_operator = NLEngineOperator( sequence_length=sequence_length, internal_kv_cache=internal_kv_cache, input_ids_length=1, **engine_kwargs, ) - multi_engine_operator = 
NlEngineOperator( + multi_engine_operator = NLEngineOperator( sequence_length=sequence_length, internal_kv_cache=internal_kv_cache, input_ids_length=prompt_sequence_length, diff --git a/tests/deepsparse/v2/unit/text_generation/conftest.py b/tests/deepsparse/v2/unit/text_generation/conftest.py index 96d36d57c2..3840a9bb0a 100644 --- a/tests/deepsparse/v2/unit/text_generation/conftest.py +++ b/tests/deepsparse/v2/unit/text_generation/conftest.py @@ -26,7 +26,7 @@ from deepsparse.transformers.utils import DecoderKVCache from deepsparse.transformers.utils.helpers import initialize_kv_cache_state from deepsparse.v2 import InferenceState, PipelineState -from deepsparse.v2.text_generation import NlEngineOperator, TokenGeneratorOperator +from deepsparse.v2.text_generation import NLEngineOperator, TokenGeneratorOperator @pytest.fixture(scope="module") @@ -60,7 +60,7 @@ def single_token_engine_no_internal_cache(text_generation_attributes, model_attr seq_length, _ = text_generation_attributes _, model_path = model_attributes - nl_engine_operator = NlEngineOperator( + nl_engine_operator = NLEngineOperator( sequence_length=seq_length, input_ids_length=1, model_path=model_path ) return nl_engine_operator From 66ca295b240aeb9e65926cb831c0040c125606a9 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 22 Nov 2023 11:26:42 +0000 Subject: [PATCH 38/57] one more cleanup --- src/deepsparse/v2/operators/operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index e775056f8f..377088e09e 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,8 +17,8 @@ from pydantic import BaseModel -from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.utils import InferenceState +from deepsparse.v2.operators.registry import OperatorRegistry __all__ = ["Operator"] From dcded1dc98f62ac42a76c6841d17796f6ee4c306 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 22 Nov 2023 11:56:13 +0000 Subject: [PATCH 39/57] got tests to work after rebase. implementing SPLIT and JOIN in linearouter now --- src/deepsparse/v2/operators/operator.py | 2 +- .../v2/text_generation/generate_new_token.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 377088e09e..e775056f8f 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,8 +17,8 @@ from pydantic import BaseModel -from deepsparse.v2.utils import InferenceState from deepsparse.v2.operators.registry import OperatorRegistry +from deepsparse.v2.utils import InferenceState __all__ = ["Operator"] diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py index 5bf48bbdbc..fd91b3412c 100644 --- a/src/deepsparse/v2/text_generation/generate_new_token.py +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Sequence, Union +from typing import Any, Dict, Sequence, Union import transformers @@ -36,9 +36,14 @@ def can_operate(self, inp: NLEngineOutputs): return True return False - def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): - logits = inp.engine_outputs - kv_cache = inp.kv_cache + def run(self, *args, inference_state: InferenceState, **kwargs): + if args: + inp = args[0] + logits = inp.engine_outputs + kv_cache = inp.kv_cache + else: + logits = kwargs.get("logits") # inp.engine_outputs + kv_cache = kwargs.get("kv_cache") # inp.kv_cache token_generator = inference_state.current_state.get("token_generator") token = token_generator.generate(logits=logits[0, -1, :]) From 127aa00d5be96371aa59cf9ee91bf31e24fd71a0 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 22 Nov 2023 13:52:21 +0000 Subject: [PATCH 40/57] pipeline working, with GraphRouter. Needs some more testing --- .../v2/text_generation/nl_engine_operator.py | 3 +- src/deepsparse/v2/text_generation/pipeline.py | 74 ++++++++++++++----- .../v2/text_generation/process_inputs.py | 4 +- .../v2/integration_tests/test_llms.py | 4 +- 4 files changed, 60 insertions(+), 25 deletions(-) diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 9bef8ceb87..2843d3dd17 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -350,7 +350,8 @@ def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: # By default, the engine outputs logits for all tokens in the sequence. # Let's filter out the logits for the padding tokens. - logits = numpy.compress(inp.attention_mask[0], logits[0], axis=1) + logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1) + print(logits.shape) return {"logits": [logits], "kv_cache": None, "tokens": None}, { "prompt_logits": [logits] } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index f21f671676..fb736e7771 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -74,24 +74,51 @@ def __init__( token_generator = TokenGeneratorOperator() - ops = [ - ProcessInputsTextGeneration( - generation_config=process_generation_config(generation_config), - sequence_length=sequence_length, - tokenizer=self.tokenizer, - ), - NLEngineOperatorNoCache(sequence_length=sequence_length, **engine_kwargs), - PrepareGeneration( - sequence_length=sequence_length, - prompt_sequence_length=1, - token_generator=token_generator, - ), - GenerateNewTokenOperator(tokenizer=self.tokenizer, force_max_tokens=True), - CompileGenerations(), - JoinOutput(tokenizer=self.tokenizer), - ProcessOutputs(tokenizer=self.tokenizer), - ] - router = LinearRouter(end_route=len(ops)) + process_inputs = ProcessInputsTextGeneration( + generation_config=process_generation_config(generation_config), + sequence_length=sequence_length, + tokenizer=self.tokenizer, + ) + engine_operator = NLEngineOperatorNoCache( + sequence_length=sequence_length, + **engine_kwargs, + ) + prepare_generation = PrepareGeneration( + sequence_length=sequence_length, + prompt_sequence_length=1, + token_generator=token_generator, + ) + generate_new_token = GenerateNewTokenOperator( + tokenizer=self.tokenizer, force_max_tokens=True + ) + compile_generations = CompileGenerations() + join_output = JoinOutput(tokenizer=self.tokenizer) + process_outputs = 
ProcessOutputs(tokenizer=self.tokenizer) + + ops = { + "process_input": process_inputs, + "engine_operator": engine_operator, + "prepare_generation": prepare_generation, + "generate_new_token": generate_new_token, + "compile_generations": compile_generations, + "join_output": join_output, + "process_outputs": process_outputs, + } + routes = { + "process_input": "SPLIT", + "SPLIT": "engine_operator", + "engine_operator": "prepare_generation", + "prepare_generation": "generate_new_token", + "generate_new_token": "compile_generations", + "compile_generations": "JOIN", + "JOIN": "join_output", + "join_output": "process_outputs", + "process_outputs": "STOP", + } + + router = GraphRouter( + end_route="STOP", start_route="process_input", route=routes + ) scheduler = [OperatorScheduler()] super().__init__( ops=ops, @@ -102,9 +129,18 @@ def __init__( def run(self, *args, **kwargs): # we need to set the fixed_sequences_length flag to True # for the non-kv cache pipeline - kwargs.update(dict(fixed_sequences_length=True)) + kwargs.update(dict(fixed_sequences_length=True, max_new_tokens=1)) return super().run(*args, **kwargs) + def condense_inputs(self, *args, **kwargs): + return args[0], kwargs + + def expand_inputs(self, items, batch_size): + items = [items.get(key) for key in items.keys()] + out, orig_batch_size = split_engine_inputs(items, batch_size) + combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] + return combined_batches, orig_batch_size + def verify_no_kv_cache_present(self) -> bool: """ Verifies that the ONNX model does not have diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py index 0f9147f916..85956416a1 100644 --- a/src/deepsparse/v2/text_generation/process_inputs.py +++ b/src/deepsparse/v2/text_generation/process_inputs.py @@ -36,8 +36,8 @@ class ProcessInputsTextGeneration(Operator): """ Input processing operator. Responsible for tokenizing the input, handling the generation_config (if provided), updating the inference_state for later use, - and returning the tokens for prompt inferece. The expected input is defined by - the input_schema, which for this operator is TextGeneratioInput. + and returning the tokens for prompt inference. The expected input is defined by + the input_schema, which for this operator is TextGenerationInput. 
""" input_schema = TextGenerationInput diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py index 321070f276..350c77b3f8 100644 --- a/tests/deepsparse/v2/integration_tests/test_llms.py +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -223,9 +223,7 @@ def test_deepsparse_multi_token_prefill(self, setup): output = pipeline( prompt=self.prompt, include_prompt_logits=True, - generation_kwargs=dict( - max_new_tokens=self.max_new_tokens, output_scores=True - ), + generation_kwargs=dict(output_scores=True), ) self._test_output( From af576981bdedc193128f9c588e724fd107ca30e3 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 27 Nov 2023 15:09:02 +0000 Subject: [PATCH 41/57] ready for review --- .../v2/text_generation/generate_new_token.py | 11 +++-------- src/deepsparse/v2/text_generation/join_output.py | 5 ----- .../v2/text_generation/nl_engine_operator.py | 12 ++++++------ src/deepsparse/v2/text_generation/pipeline.py | 8 +++++--- tests/deepsparse/v2/integration_tests/test_llms.py | 13 +++++++++---- .../unit/text_generation/test_token_generation.py | 4 +++- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py index fd91b3412c..ba3fb445aa 100644 --- a/src/deepsparse/v2/text_generation/generate_new_token.py +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Dict, Sequence, Union +from typing import Sequence, Union import transformers @@ -37,13 +37,8 @@ def can_operate(self, inp: NLEngineOutputs): return False def run(self, *args, inference_state: InferenceState, **kwargs): - if args: - inp = args[0] - logits = inp.engine_outputs - kv_cache = inp.kv_cache - else: - logits = kwargs.get("logits") # inp.engine_outputs - kv_cache = kwargs.get("kv_cache") # inp.kv_cache + logits = args[0].engine_outputs if args else kwargs.get("logits") + kv_cache = args[0].kv_cache if args else kwargs.get("kv_cache") token_generator = inference_state.current_state.get("token_generator") token = token_generator.generate(logits=logits[0, -1, :]) diff --git a/src/deepsparse/v2/text_generation/join_output.py b/src/deepsparse/v2/text_generation/join_output.py index 56d9ac47b1..7479ee7493 100644 --- a/src/deepsparse/v2/text_generation/join_output.py +++ b/src/deepsparse/v2/text_generation/join_output.py @@ -33,11 +33,6 @@ def __init__(self, tokenizer): self.tokenizer = tokenizer def run(self, inp: Tuple[List[CompileGenerationsOutput], Dict], **kwargs): - if not isinstance(inp, Tuple): - # when running without KV Cache - # this will be a single - # CompileGenerationsOutput for now - inp = [[inp]] batch_outputs = [x for x in inp[0]] generated_tokens = [x.generated_tokens for x in batch_outputs] diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 2843d3dd17..aaa1899fd5 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -110,11 +110,6 @@ def split(self) -> List["NLEngineOutputs"]: ] -class NLEngineInputsNoCache(BaseModel): - input_ids: Any - attention_mask: Any - - class NLEngineOperator(EngineOperator): """ @@ -322,6 +317,11 @@ def 
output_names(self) -> List[str]: return self.engine.output_names +class NLEngineInputsNoCache(BaseModel): + input_ids: Any + attention_mask: Any + + class NLEngineOperatorNoCache(EngineOperator): """ Operator the Natural Language Engine, that operates without @@ -351,7 +351,7 @@ def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: # By default, the engine outputs logits for all tokens in the sequence. # Let's filter out the logits for the padding tokens. logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1) - print(logits.shape) + return {"logits": [logits], "kv_cache": None, "tokens": None}, { "prompt_logits": [logits] } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index fb736e7771..7c76b613e5 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -22,7 +22,7 @@ from deepsparse.v2.operators import EngineOperator from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.routers import GraphRouter, LinearRouter +from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import ContinuousBatchingScheduler, OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, @@ -52,9 +52,9 @@ def __init__( self, model_path: str, sequence_length: int = 1024, - engine_kwargs: Optional[Dict] = None, onnx_model_name: Optional[str] = None, - generation_config=None, # TODO: Typing here + generation_config=None, + engine_kwargs: Optional[Dict] = None, **kwargs, ): @@ -116,6 +116,8 @@ def __init__( "process_outputs": "STOP", } + # TODO: Using the GraphRouter, but should use + # LinearRouter with appropriate split/join support router = GraphRouter( end_route="STOP", start_route="process_input", route=routes ) diff --git a/tests/deepsparse/v2/integration_tests/test_llms.py b/tests/deepsparse/v2/integration_tests/test_llms.py index 350c77b3f8..3485658dda 100644 --- a/tests/deepsparse/v2/integration_tests/test_llms.py +++ b/tests/deepsparse/v2/integration_tests/test_llms.py @@ -85,6 +85,9 @@ def get_pipeline(self, kv_cache_support=True, **kwargs) -> Pipeline: "default" pipeline is returned) :return: the appropriate pipeline """ + # TODO: This if statement should disappear once + # the TextGenerationPipeline contains the + # non-kv-cache version of the pipeline text_generation_pipeline_class = ( TextGenerationPipeline if kv_cache_support @@ -223,7 +226,9 @@ def test_deepsparse_multi_token_prefill(self, setup): output = pipeline( prompt=self.prompt, include_prompt_logits=True, - generation_kwargs=dict(output_scores=True), + generation_kwargs=dict( + max_new_tokens=self.max_new_tokens, output_scores=True + ), ) self._test_output( @@ -247,18 +252,18 @@ def _test_inference_no_kv_cache(self, engine_type): ) output = pipeline( - prompt=self.prompt, + prompt=[self.prompt, self.prompt], include_prompt_logits=True, generation_kwargs=dict(output_scores=True), ) - logits = output.generations[0].score # logits -> prompt logits + one logit for the new generated token generated_logits, prompt_logits, *_ = self.torch_ground_truth logits_gt = numpy.concatenate( [prompt_logits[0], generated_logits[0, :1, :]], axis=0 ) - assert numpy.allclose(logits, logits_gt, atol=self.precision) + for gen in output.generations: + assert numpy.allclose(gen.score, logits_gt, atol=self.precision) def _test_output( self, diff --git 
a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py index d04f863171..219b1048fd 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -93,7 +93,9 @@ def test_generate_new_token( in_generation=True, ) outputs, state = generate_new_token.run( - inp=inp, inference_state=mock_inference_state + logits=inp.engine_outputs, + kv_cache=inp.kv_cache, + inference_state=mock_inference_state, ) # The new_token generated/returned by ths operator should match the last token in # token_generator From 4397c80c4eb50cf9e40e1be21e36d3604d141ee3 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 28 Nov 2023 07:43:15 +0000 Subject: [PATCH 42/57] cleanup --- src/deepsparse/v2/text_generation/__init__.py | 2 + .../v2/text_generation/nl_engine_operator.py | 48 +----- .../nl_engine_operator_no_kv_cache.py | 67 ++++++++ src/deepsparse/v2/text_generation/pipeline.py | 113 ------------- .../text_generation/pipeline_no_kv_cache.py | 148 ++++++++++++++++++ .../test_pipeline_no_kv_cache.py | 43 +++++ 6 files changed, 261 insertions(+), 160 deletions(-) create mode 100644 src/deepsparse/v2/text_generation/nl_engine_operator_no_kv_cache.py create mode 100644 src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py create mode 100644 tests/deepsparse/v2/unit/text_generation/test_pipeline_no_kv_cache.py diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 08836b8bbe..6f1323de50 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -21,6 +21,7 @@ from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * +from .nl_engine_operator_no_kv_cache import * from .prep_for_prefill import * from .process_inputs import * from .process_outputs import * @@ -30,3 +31,4 @@ from .prep_for_generation import * # isort:skip from .pipeline import * # isort:skip +from .pipeline_no_kv_cache import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index aaa1899fd5..c6583e37cf 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -20,7 +20,6 @@ import numpy from pydantic import BaseModel, Field -from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.utils import join_engine_outputs, split_engine_inputs from deepsparse.utils.onnx import ( CACHE_INPUT_PREFIX, @@ -33,12 +32,7 @@ ) -__all__ = [ - "NLEngineOperator", - "NLEngineOperatorNoCache", - "NLEngineInputsNoCache", - "NLEngineInputs", -] +__all__ = ["NLEngineOperator", "NLEngineInputs", "NLEngineOutputs"] class NLEngineInputs(BaseModel): @@ -315,43 +309,3 @@ def output_names(self) -> List[str]: :return: The output names for the onnx model """ return self.engine.output_names - - -class NLEngineInputsNoCache(BaseModel): - input_ids: Any - attention_mask: Any - - -class NLEngineOperatorNoCache(EngineOperator): - """ - Operator the Natural Language Engine, that operates without - KV Cache. 
This means that this operator merely maps input_ids - and attention_mask to logits - """ - - input_schema = NLEngineInputsNoCache - output_schema = None - - def __init__(self, sequence_length: int, **kwargs): - overwrite_transformer_onnx_model_inputs( - path=kwargs.get("model_path"), - batch_size=kwargs.get("batch_size", 1), - max_length=sequence_length, - ) - super().__init__(**kwargs) - - def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: - engine_inputs = [inp.input_ids, inp.attention_mask] - logits = ( - super() - .run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs) - .get("engine_outputs") - ) - - # By default, the engine outputs logits for all tokens in the sequence. - # Let's filter out the logits for the padding tokens. - logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1) - - return {"logits": [logits], "kv_cache": None, "tokens": None}, { - "prompt_logits": [logits] - } diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator_no_kv_cache.py b/src/deepsparse/v2/text_generation/nl_engine_operator_no_kv_cache.py new file mode 100644 index 0000000000..746010560f --- /dev/null +++ b/src/deepsparse/v2/text_generation/nl_engine_operator_no_kv_cache.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any + +import numpy +from pydantic import BaseModel + +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs +from deepsparse.v2.operators.engine_operator import EngineOperator, EngineOperatorInputs + + +__all__ = [ + "NLEngineOperatorNoCache", + "NLEngineInputsNoCache", +] + + +class NLEngineInputsNoCache(BaseModel): + input_ids: Any + attention_mask: Any + + +class NLEngineOperatorNoCache(EngineOperator): + """ + Operator the Natural Language Engine, that operates without + KV Cache. This means that this operator merely maps input_ids + and attention_mask to logits + """ + + input_schema = NLEngineInputsNoCache + output_schema = None + + def __init__(self, sequence_length: int, **kwargs): + overwrite_transformer_onnx_model_inputs( + path=kwargs.get("model_path"), + batch_size=kwargs.get("batch_size", 1), + max_length=sequence_length, + ) + super().__init__(**kwargs) + + def run(self, inp: NLEngineInputsNoCache, **kwargs) -> Any: + engine_inputs = [inp.input_ids, inp.attention_mask] + logits = ( + super() + .run(EngineOperatorInputs(engine_inputs=engine_inputs), **kwargs) + .get("engine_outputs") + ) + + # By default, the engine outputs logits for all tokens in the sequence. + # Let's filter out the logits for the padding tokens. 
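        # (Editor's illustration, not part of this patch: numpy.compress keeps
        # only the entries along the chosen axis whose condition value is
        # non-zero. Assuming attention_mask has shape (1, sequence_length) and
        # logits[0] has shape (1, sequence_length, vocab_size), e.g.:
        #     mask = numpy.array([1, 1, 0, 0])
        #     vals = numpy.arange(8).reshape(1, 4, 2)
        #     numpy.compress(mask, vals, axis=1).shape  # -> (1, 2, 2)
        # so the padded positions are dropped before the logits are returned.)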
+ logits = numpy.compress(inp.attention_mask.flatten(), logits[0], axis=1) + + return {"logits": [logits], "kv_cache": None, "tokens": None}, { + "prompt_logits": [logits] + } diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 7c76b613e5..344980dc3f 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -18,7 +18,6 @@ from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.utils import split_engine_inputs -from deepsparse.utils.onnx import default_cached_outputs from deepsparse.v2.operators import EngineOperator from deepsparse.v2.operators.registry import OperatorRegistry from deepsparse.v2.pipeline import Pipeline @@ -34,7 +33,6 @@ KVCacheCreator, MultiEnginePrefill, NLEngineOperator, - NLEngineOperatorNoCache, PrepareforPrefill, PrepareGeneration, ProcessInputsTextGeneration, @@ -47,117 +45,6 @@ _LOGGER = logging.getLogger(__name__) -class TextGenerationPipelineNoCache(Pipeline): - def __init__( - self, - model_path: str, - sequence_length: int = 1024, - onnx_model_name: Optional[str] = None, - generation_config=None, - engine_kwargs: Optional[Dict] = None, - **kwargs, - ): - - ( - self.model_path, - self.config, - self.tokenizer, - engine_kwargs, - ) = setup_transformers_pipeline( - model_path, - sequence_length, - tokenizer_padding_side="right", - onnx_model_name=onnx_model_name, - engine_kwargs=engine_kwargs, - ) - self.verify_no_kv_cache_present() - - token_generator = TokenGeneratorOperator() - - process_inputs = ProcessInputsTextGeneration( - generation_config=process_generation_config(generation_config), - sequence_length=sequence_length, - tokenizer=self.tokenizer, - ) - engine_operator = NLEngineOperatorNoCache( - sequence_length=sequence_length, - **engine_kwargs, - ) - prepare_generation = PrepareGeneration( - sequence_length=sequence_length, - prompt_sequence_length=1, - token_generator=token_generator, - ) - generate_new_token = GenerateNewTokenOperator( - tokenizer=self.tokenizer, force_max_tokens=True - ) - compile_generations = CompileGenerations() - join_output = JoinOutput(tokenizer=self.tokenizer) - process_outputs = ProcessOutputs(tokenizer=self.tokenizer) - - ops = { - "process_input": process_inputs, - "engine_operator": engine_operator, - "prepare_generation": prepare_generation, - "generate_new_token": generate_new_token, - "compile_generations": compile_generations, - "join_output": join_output, - "process_outputs": process_outputs, - } - routes = { - "process_input": "SPLIT", - "SPLIT": "engine_operator", - "engine_operator": "prepare_generation", - "prepare_generation": "generate_new_token", - "generate_new_token": "compile_generations", - "compile_generations": "JOIN", - "JOIN": "join_output", - "join_output": "process_outputs", - "process_outputs": "STOP", - } - - # TODO: Using the GraphRouter, but should use - # LinearRouter with appropriate split/join support - router = GraphRouter( - end_route="STOP", start_route="process_input", route=routes - ) - scheduler = [OperatorScheduler()] - super().__init__( - ops=ops, - router=router, - schedulers=scheduler, - ) - - def run(self, *args, **kwargs): - # we need to set the fixed_sequences_length flag to True - # for the non-kv cache pipeline - kwargs.update(dict(fixed_sequences_length=True, max_new_tokens=1)) - return super().run(*args, **kwargs) - - def condense_inputs(self, *args, 
**kwargs): - return args[0], kwargs - - def expand_inputs(self, items, batch_size): - items = [items.get(key) for key in items.keys()] - out, orig_batch_size = split_engine_inputs(items, batch_size) - combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] - return combined_batches, orig_batch_size - - def verify_no_kv_cache_present(self) -> bool: - """ - Verifies that the ONNX model does not have - KV cache inputs/outputs present. - :return: True if compatible, False otherwise - """ - is_kv_cache_present = any(default_cached_outputs(self.model_path)) - if is_kv_cache_present: - raise ValueError( - f"The model: {self.model_path} has KV cache inputs/outputs present. " - "Please use the TextGenerationPipeline instead." - ) - return not is_kv_cache_present - - @OperatorRegistry.register(name="text_generation") class TextGenerationPipeline(Pipeline): def __init__( diff --git a/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py new file mode 100644 index 0000000000..a6ec2ae207 --- /dev/null +++ b/src/deepsparse/v2/text_generation/pipeline_no_kv_cache.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
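# (Editor's note: a hedged usage sketch for the pipeline defined below; the
# model path and prompt are illustrative assumptions, not values from this PR.
#
#     from deepsparse.v2.text_generation import TextGenerationPipelineNoCache
#
#     pipeline = TextGenerationPipelineNoCache(
#         model_path="/path/to/onnx/deployment",  # hypothetical local export
#         sequence_length=256,
#     )
#     output = pipeline(prompt="Who is Mark Twain?", include_prompt_logits=True)
#     print(output.generations[0])
#
# Because run() forces max_new_tokens=1, the pipeline returns the prompt logits
# plus a single generated token, which is what the integration test checks
# against the torch ground truth.)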
+ +import logging +from typing import Dict, Optional + +from deepsparse.transformers.helpers import setup_transformers_pipeline +from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.utils import split_engine_inputs +from deepsparse.utils.onnx import default_cached_outputs +from deepsparse.v2.pipeline import Pipeline +from deepsparse.v2.routers import GraphRouter +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.text_generation import ( + CompileGenerations, + GenerateNewTokenOperator, + JoinOutput, + NLEngineOperatorNoCache, + PrepareGeneration, + ProcessInputsTextGeneration, + ProcessOutputs, + TokenGeneratorOperator, +) + + +_LOGGER = logging.getLogger(__name__) + + +class TextGenerationPipelineNoCache(Pipeline): + def __init__( + self, + model_path: str, + sequence_length: int = 1024, + onnx_model_name: Optional[str] = None, + generation_config=None, + engine_kwargs: Optional[Dict] = None, + **kwargs, + ): + + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, + sequence_length, + tokenizer_padding_side="right", + onnx_model_name=onnx_model_name, + engine_kwargs=engine_kwargs, + ) + self.verify_no_kv_cache_present() + + token_generator = TokenGeneratorOperator() + + process_inputs = ProcessInputsTextGeneration( + generation_config=process_generation_config(generation_config), + sequence_length=sequence_length, + tokenizer=self.tokenizer, + ) + engine_operator = NLEngineOperatorNoCache( + sequence_length=sequence_length, + **engine_kwargs, + ) + prepare_generation = PrepareGeneration( + sequence_length=sequence_length, + prompt_sequence_length=1, + token_generator=token_generator, + ) + generate_new_token = GenerateNewTokenOperator( + tokenizer=self.tokenizer, force_max_tokens=True + ) + compile_generations = CompileGenerations() + join_output = JoinOutput(tokenizer=self.tokenizer) + process_outputs = ProcessOutputs(tokenizer=self.tokenizer) + + ops = { + "process_input": process_inputs, + "engine_operator": engine_operator, + "prepare_generation": prepare_generation, + "generate_new_token": generate_new_token, + "compile_generations": compile_generations, + "join_output": join_output, + "process_outputs": process_outputs, + } + routes = { + "process_input": "SPLIT", + "SPLIT": "engine_operator", + "engine_operator": "prepare_generation", + "prepare_generation": "generate_new_token", + "generate_new_token": "compile_generations", + "compile_generations": "JOIN", + "JOIN": "join_output", + "join_output": "process_outputs", + "process_outputs": "STOP", + } + + # TODO: Using the GraphRouter, but should use + # LinearRouter with appropriate split/join support + router = GraphRouter( + end_route="STOP", start_route="process_input", route=routes + ) + scheduler = [OperatorScheduler()] + super().__init__( + ops=ops, + router=router, + schedulers=scheduler, + ) + + def run(self, *args, **kwargs): + # we need to set the fixed_sequences_length flag to True + # for the non-kv cache pipeline + kwargs.update(dict(fixed_sequences_length=True, max_new_tokens=1)) + return super().run(*args, **kwargs) + + def condense_inputs(self, *args, **kwargs): + return args[0], kwargs + + def expand_inputs(self, items, batch_size): + items = [items.get(key) for key in items.keys()] + out, orig_batch_size = split_engine_inputs(items, batch_size) + combined_batches = [{"input_ids": b[0], "attention_mask": b[1]} for b in out] + return combined_batches, orig_batch_size + + def 
verify_no_kv_cache_present(self) -> bool: + """ + Verifies that the ONNX model does not have + KV cache inputs/outputs present. + :return: True if compatible, False otherwise + """ + is_kv_cache_present = any(default_cached_outputs(self.model_path)) + if is_kv_cache_present: + raise ValueError( + f"The model: {self.model_path} has KV cache inputs/outputs present. " + "Please use the TextGenerationPipeline instead." + ) + return not is_kv_cache_present diff --git a/tests/deepsparse/v2/unit/text_generation/test_pipeline_no_kv_cache.py b/tests/deepsparse/v2/unit/text_generation/test_pipeline_no_kv_cache.py new file mode 100644 index 0000000000..a6fbfc4d11 --- /dev/null +++ b/tests/deepsparse/v2/unit/text_generation/test_pipeline_no_kv_cache.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest +from deepsparse.v2.text_generation import TextGenerationPipelineNoCache + + +@pytest.mark.parametrize( + "onnx_model_name, raise_error", + [("model.onnx", True), (None, True), ("model-orig.onnx", False)], +) +def test_verify_no_kv_cache_present(model_attributes, onnx_model_name, raise_error): + _, model_path = model_attributes + # model_path points to .../directory/model.onnx + # we need to go up one level to .../directory + model_path = os.path.dirname(model_path) + + if raise_error: + with pytest.raises(ValueError): + if onnx_model_name is None: + TextGenerationPipelineNoCache(model_path=model_path) + else: + TextGenerationPipelineNoCache( + model_path=model_path, onnx_model_name=onnx_model_name + ) + return + else: + TextGenerationPipelineNoCache( + model_path=model_path, onnx_model_name=onnx_model_name + ) From 105b1d532a63b60bf9560ee6ee3464365ccfb7c8 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 5 Dec 2023 11:46:41 +0000 Subject: [PATCH 43/57] simplify after PR review round --- .../v2/text_generation/generate_new_token.py | 6 +++--- .../v2/text_generation/prep_for_generation.py | 15 ++++++++++----- .../unit/text_generation/test_token_generation.py | 4 +--- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py index ba3fb445aa..5bf48bbdbc 100644 --- a/src/deepsparse/v2/text_generation/generate_new_token.py +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -36,9 +36,9 @@ def can_operate(self, inp: NLEngineOutputs): return True return False - def run(self, *args, inference_state: InferenceState, **kwargs): - logits = args[0].engine_outputs if args else kwargs.get("logits") - kv_cache = args[0].kv_cache if args else kwargs.get("kv_cache") + def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): + logits = inp.engine_outputs + kv_cache = inp.kv_cache token_generator = inference_state.current_state.get("token_generator") token = token_generator.generate(logits=logits[0, -1, :]) diff --git 
a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py index 9b63946c16..c4f244d2e8 100644 --- a/src/deepsparse/v2/text_generation/prep_for_generation.py +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -20,6 +20,7 @@ from deepsparse.transformers.utils.helpers import set_generated_length from deepsparse.v2.operators import Operator from deepsparse.v2.text_generation import TokenGeneratorOperator +from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs from deepsparse.v2.utils import InferenceState @@ -41,10 +42,11 @@ def can_operate(self, inp: Any): kv_cache = inp.get("kv_cache") tokens = inp.get("tokens") - # If the number of prompt tokens is greater than what we've processed, - # don't start generation. Should be equal when started as all prompt logits - # should be accounted for and we should have updated the kv_cache for the single - # token engine. + # If the number of prompt tokens is greater + # than what we've processed, don't start generation. + # Should be equal when started as all prompt logits + # should be accounted for, and we should have updated + # the kv_cache for the single token engine. if len(tokens) == kv_cache.total_num_processed_tokens: return True return False @@ -90,10 +92,13 @@ def run( "finished_reason": [], "token_generator": token_generator, } + output = { - "logits": prompt_logits, "tokens": token_generator.tokens, "kv_cache": kv_cache, "in_generation": True, } + if kv_cache is None: + output = NLEngineOutputs(**output, engine_outputs=prompt_logits) + return output, state_update diff --git a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py index 219b1048fd..d04f863171 100644 --- a/tests/deepsparse/v2/unit/text_generation/test_token_generation.py +++ b/tests/deepsparse/v2/unit/text_generation/test_token_generation.py @@ -93,9 +93,7 @@ def test_generate_new_token( in_generation=True, ) outputs, state = generate_new_token.run( - logits=inp.engine_outputs, - kv_cache=inp.kv_cache, - inference_state=mock_inference_state, + inp=inp, inference_state=mock_inference_state ) # The new_token generated/returned by ths operator should match the last token in # token_generator From e15a24bfd642aee97ebf96a40ec630c3d1ac5ca9 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 5 Dec 2023 10:17:29 -0500 Subject: [PATCH 44/57] [Pipeline Refactor] Fix Operator scheduling to fix issue with slow execution (#1453) * fix scheduling to fix issue with engine running very slowly; introduce new completed attribute for Subgraph instead of checking instance type * fix warning message --- src/deepsparse/v2/operators/operator.py | 2 +- src/deepsparse/v2/pipeline.py | 11 +++-------- src/deepsparse/v2/text_generation/pipeline.py | 3 +-- src/deepsparse/v2/utils/data.py | 1 + 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py index 377088e09e..e775056f8f 100644 --- a/src/deepsparse/v2/operators/operator.py +++ b/src/deepsparse/v2/operators/operator.py @@ -17,8 +17,8 @@ from pydantic import BaseModel -from deepsparse.v2.utils import InferenceState from deepsparse.v2.operators.registry import OperatorRegistry +from deepsparse.v2.utils import InferenceState __all__ = ["Operator"] diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 78d112a2b3..450a6702c4 100644 --- 
a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -14,7 +14,6 @@ import copy -from concurrent.futures import Future from typing import Any, Dict, List, Union from deepsparse.v2.operators import EngineOperator, Operator @@ -116,9 +115,9 @@ def _run_sub_graphs( ) # Execute all sub graphs until all graphs have been completed. - while True: + while any(not x.completed for x in sub_graphs): for sub_graph in sub_graphs: - if isinstance(sub_graph.output, Future) and sub_graph.output.done(): + if not sub_graph.completed: # get the result for the completed operator; resolve its output operator_output = sub_graph.output.result() operator_output = sub_graph.parse_output(operator_output) @@ -136,17 +135,13 @@ def _run_sub_graphs( # update the output value if next_step in sub_graph.end: sub_graph.output = operator_output + sub_graph.completed = True else: sub_graph.output = self._run_next( inp=operator_output, inference_state=sub_graph.inf, next_step=next_step, ) - break - - # keep running until all sub graphs have completed. - if not any(isinstance(x.output, Future) for x in sub_graphs): - break return [x.output for x in sub_graphs] diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 344980dc3f..6e27942d19 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -146,8 +146,7 @@ def __init__( if continuous_batch_sizes: if internal_kv_cache: _LOGGER.warn( - "internal kv_cache is currently not supported with continuous ", - "batching", + "internal kv_cache is not supported with continuous_batching " ) else: continuous_batching_scheduler = self._get_continuous_batching_scheduler( diff --git a/src/deepsparse/v2/utils/data.py b/src/deepsparse/v2/utils/data.py index 40402734cf..9ed340cb7c 100644 --- a/src/deepsparse/v2/utils/data.py +++ b/src/deepsparse/v2/utils/data.py @@ -31,6 +31,7 @@ class SubGraph: inf: InferenceState end: List[str] output: Any = None + completed: bool = False def parse_output(self, operator_output: Any): if isinstance(operator_output, tuple): From 36f742bc25cc6d5c7354850c18f1c100b9e6365b Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 5 Dec 2023 12:26:57 -0500 Subject: [PATCH 45/57] [Pipeline Refactor] Add `Pipeline.create` method to initialize pipelines (#1457) * add pipeline create method for pipeline creation using the operator registry * add instance check --- src/deepsparse/v2/pipeline.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 450a6702c4..ff9014799b 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -173,6 +173,21 @@ def _apply_split(self, inp: Any, inference_state: InferenceState): ) return self.condense_inputs(outputs) + @staticmethod + def create(task: str, **kwargs) -> "Pipeline": + """ + :param task: Pipeline task + :param kwargs: extra task specific kwargs to be passed to the Pipeline + :return: pipeline object initialized for the given task + """ + pipeline = Operator.create(task=task, **kwargs) + if not isinstance(pipeline, Pipeline): + raise RuntimeError( + "Pipeline was not created for the given task. 
The " + "provided task should be registered using the OperatorRegistry" + ) + return pipeline + def run( self, *args, From c0267d91c544ea6f298554b4fdc6b6e0207d768e Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 5 Dec 2023 14:24:26 -0500 Subject: [PATCH 46/57] [Pipeline Refactor] async (#1380) * initial functionality and working example with image classification * remove testing image * rebase fixes * initial functionality and working example with image classification * text gen * updates func * prompt inference, initial functionality * remove image; update state docstring * Fix typo * add todo for split/join * remove context, clean-up args, remove prefill_preprocess_operaator * fix docstrings * initial functionality and working example with image classification * updates func * prompt inference, initial functionality * finish generation operators and update routes * further breakdown operators * add operators * fix can_operate condition * update can_operate to not rely on the inference_state * rebase + update * fix condition * async initial functionality * fix capacity settting again * add blocking * more testing * update to use split/join * fix * rebase fix * remove index * change event loop * rebase fix * update async run to use new operator scheduling properly --- src/deepsparse/v2/pipeline.py | 107 ++++++++++++++---- src/deepsparse/v2/schedulers/scheduler.py | 24 +++- .../v2/schedulers/scheduler_group.py | 7 +- 3 files changed, 112 insertions(+), 26 deletions(-) diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index ff9014799b..402b557a2a 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import asyncio import copy -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union from deepsparse.v2.operators import EngineOperator, Operator from deepsparse.v2.routers import Router @@ -68,10 +68,7 @@ def __init__( self._scheduler_group = SchedulerGroup(self.schedulers) def _run_next( - self, - inp: Any, - inference_state: InferenceState, - next_step: str, + self, inp: Any, inference_state: InferenceState, next_step: str, **kwargs ): if ( isinstance(self.ops[next_step], EngineOperator) @@ -88,10 +85,14 @@ def _run_next( inp=inp, pipeline_state=self.pipeline_state, inference_state=inference_state, + **kwargs, ) - def _run_sub_graphs( - self, sub_graph_inputs: List[Any], sub_graphs: List[SubGraph] + async def _run_sub_graphs( + self, + sub_graph_inputs: List[Any], + sub_graphs: List[SubGraph], + loop: Optional[asyncio.AbstractEventLoop] = None, ) -> List[Any]: """ Run a list of sub_graphs asynchronously. Polls to identify the sub graph that is @@ -111,7 +112,7 @@ def _run_sub_graphs( """ for i in range(len(sub_graphs)): sub_graphs[i].output = self._run_next( - sub_graph_inputs[i], sub_graphs[i].inf, sub_graphs[i].step + sub_graph_inputs[i], sub_graphs[i].inf, sub_graphs[i].step, loop=loop ) # Execute all sub graphs until all graphs have been completed. 
@@ -119,6 +120,8 @@ def _run_sub_graphs( for sub_graph in sub_graphs: if not sub_graph.completed: # get the result for the completed operator; resolve its output + if isinstance(sub_graph.output, asyncio.Future): + await sub_graph.output operator_output = sub_graph.output.result() operator_output = sub_graph.parse_output(operator_output) @@ -141,18 +144,80 @@ def _run_sub_graphs( inp=operator_output, inference_state=sub_graph.inf, next_step=next_step, + loop=loop, ) return [x.output for x in sub_graphs] - def _apply_split(self, inp: Any, inference_state: InferenceState): + async def run_async(self, *args, inference_state: InferenceState, **kwargs): """ - Split inputs using the pipeline's expand_inputs function. Inputs are split - into a batch size of one when a SPLIT_ROUTE node is found in a given pipeline's - provided router. The split batches are run asynchronously and then joined when - a JOIN_ROUTE node is found, using the pipeline's condense_inputs function. + Run through the operators using the provided router and scheduler. + The input to a given operator is the output of the previous operator. + + :param inference_state: inference_state for the pipeline. + :param pipeline_state: pipeline_state for the pipeline. The values in the state + are created during pipeline creation and are read-only during inference. """ + loop = asyncio.get_running_loop() + + next_step = self.router.START_ROUTE + operator_output = None + + while next_step != self.router.END_ROUTE: + # Either a dictionary key or valid index + + if next_step == self.router.SPLIT_ROUTE: + if operator_output is None: + raise ValueError( + f"{self.router.SPLIT_ROUTE} should appear after " + f"{self.ROUTER.START_ROUTE}" + ) + + operator_output = await self._apply_split( + operator_output, inference_state, loop=loop + ) + next_step = self.router.route[self.router.JOIN_ROUTE] + if next_step == self.router.END_ROUTE: + return operator_output + if next_step == self.router.START_ROUTE: + outputs = run_func( + *args, + func=self._scheduler_group.submit, + operator=self.ops[next_step], + inference_state=inference_state, + pipeline_state=self.pipeline_state, + loop=loop, + **kwargs, + ) + await outputs + operator_output = outputs.result() + + else: + outputs = self._run_next( + inp=operator_output, + next_step=next_step, + inference_state=inference_state, + loop=loop, + ) + await outputs + operator_output = outputs.result() + + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + + next_step = self.router.next(next_step, self.ops, operator_output) + if state_update: + inference_state.update_state(state_update) + return operator_output + + async def _apply_split( + self, + inp: Any, + inference_state: InferenceState, + loop: Optional[asyncio.AbstractEventLoop] = None, + ): batches, orig_batch_size = self.expand_inputs(inp, 1) # Create a list of SplitRoutes, per batch size 1 @@ -168,8 +233,8 @@ def _apply_split(self, inp: Any, inference_state: InferenceState): for i in range(len(batches)) ] - outputs = self._run_sub_graphs( - sub_graph_inputs=batches, sub_graphs=split_graphs + outputs = await self._run_sub_graphs( + sub_graph_inputs=batches, sub_graphs=split_graphs, loop=loop ) return self.condense_inputs(outputs) @@ -215,7 +280,9 @@ def run( f"{self.ROUTER.START_ROUTE}" ) - operator_output = self._apply_split(operator_output, inference_state) + operator_output = asyncio.run( + self._apply_split(operator_output, inference_state) + ) next_step = 
self.router.route[self.router.JOIN_ROUTE] if next_step == self.router.END_ROUTE: return operator_output @@ -247,8 +314,10 @@ def run( end=[self.router.SPLIT_ROUTE, self.router.END_ROUTE], ) - operator_output = self._run_sub_graphs( - sub_graph_inputs=[operator_output], sub_graphs=[graph] + operator_output = asyncio.run( + self._run_sub_graphs( + sub_graph_inputs=[operator_output], sub_graphs=[graph] + ) )[0] inference_state = graph.inf diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 5313683107..37f2cfce90 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -13,8 +13,9 @@ # limitations under the License. +import asyncio from concurrent.futures import Future, ThreadPoolExecutor -from typing import Callable +from typing import Callable, Optional from deepsparse.v2.operators import Operator @@ -37,6 +38,21 @@ class OperatorScheduler: def __init__(self, max_workers: int = 1): self._threadpool = ThreadPoolExecutor(max_workers=max_workers) + def async_run( + self, + *args, + operator: Operator, + loop: Optional[asyncio.AbstractEventLoop], + **kwargs, + ) -> asyncio.Future: + import functools + + """Use an asyncio event loop to run the operator""" + + return loop.run_in_executor( + self._threadpool, functools.partial(operator, *args, **kwargs) + ) + def submit( self, *args, @@ -47,11 +63,7 @@ def submit( :param operator: operator to run :return: future referencing the asynchronously run output of the operator """ - return self._threadpool.submit( - operator, - *args, - **kwargs, - ) + return self._threadpool.submit(operator, *args, **kwargs) def can_process( self, diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 14d869a0f2..201fcee150 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -14,7 +14,7 @@ from concurrent.futures import Future -from typing import List +from typing import Any, List from deepsparse.v2.operators import Operator from deepsparse.v2.schedulers.scheduler import OperatorScheduler @@ -38,6 +38,7 @@ def submit( self, *args, operator: Operator, + loop: Any = None, **kwargs, ) -> Future: """ @@ -50,6 +51,10 @@ def submit( operator=operator, **kwargs, ): + if loop: + return scheduler.async_run( + *args, operator=operator, loop=loop, **kwargs + ) return scheduler.submit( *args, operator=operator, From 2d9b0a13f5391d4852d1a85c0fad3d5a4a031285 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 5 Dec 2023 15:51:21 -0500 Subject: [PATCH 47/57] rebase fixes (#1458) --- src/deepsparse/transformers/helpers.py | 16 ++++------------ .../transformers/pipelines/pipeline.py | 7 ++++--- src/deepsparse/utils/onnx.py | 8 ++++---- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d6a02c4374..4d04879c35 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -39,6 +39,7 @@ __all__ = [ + "get_deployment_path", "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", "fix_numpy_types", @@ -54,7 +55,6 @@ def setup_transformers_pipeline( sequence_length: int, tokenizer_padding_side: str = "left", engine_kwargs: Optional[Dict] = None, - onnx_model_name: Optional[str] = None, ) -> Tuple[ str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer, Dict[str, Any] ]: @@ -66,13 +66,9 @@ def 
setup_transformers_pipeline( :param tokenizer_padding_side: The side to pad on for the tokenizer, either "left" or "right" :param engine_kwargs: The kwargs to pass to the engine - :param onnx_model_name: The name of the onnx model to be loaded. - If not specified, defaults are used (see setup_onnx_file_path) :return The model path, config, tokenizer, and engine kwargs """ - model_path, config, tokenizer = setup_onnx_file_path( - model_path, sequence_length, onnx_model_name - ) + model_path, config, tokenizer = setup_onnx_file_path(model_path, sequence_length) tokenizer.padding_side = tokenizer_padding_side if not tokenizer.pad_token: @@ -93,7 +89,6 @@ def setup_transformers_pipeline( def setup_onnx_file_path( model_path: str, sequence_length: int, - onnx_model_name: Optional[str] = None, task: Optional[str] = None, ) -> Tuple[str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer]: """ @@ -102,12 +97,9 @@ def setup_onnx_file_path( derived from the `model_path` provided. :param model_path: path to the model to be parsed :param sequence_length: maximum sequence length of the model - :param onnx_model_name: optionally, the precise name of the ONNX model - of interest may be specified. If not specified, the default ONNX model - name will be used (refer to `get_deployment_path` for details) :return: file path to the processed ONNX file for the engine to compile """ - deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name) + deployment_path, onnx_path = get_deployment_path(model_path) hf_logger = logging.getLogger("transformers") hf_logger_level = hf_logger.level @@ -162,7 +154,7 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" ) return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME) - + elif model_path.startswith("zoo:") or model_path.startswith("hf:"): onnx_model_path = model_to_path(model_path) return os.path.dirname(onnx_model_path), onnx_model_path diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py index 3f8c6ce543..0d54449e56 100644 --- a/src/deepsparse/transformers/pipelines/pipeline.py +++ b/src/deepsparse/transformers/pipelines/pipeline.py @@ -24,11 +24,12 @@ import numpy import transformers +from transformers.models.auto import AutoTokenizer from deepsparse import Bucketable, Pipeline -from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.transformers.helpers import ( - setup_onnx_file_path as setup_onnx_file_path_v2, + get_deployment_path, + overwrite_transformer_onnx_model_inputs, ) @@ -153,7 +154,7 @@ def setup_onnx_file_path(self) -> str: ) = overwrite_transformer_onnx_model_inputs( onnx_path, max_length=self.sequence_length ) - + if not self.config or not self.tokenizer: raise RuntimeError( "Invalid config or tokenizer provided. 
Please provide " diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 35d932c75d..ae0913ffd7 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -56,12 +56,12 @@ "has_model_kv_cache", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", - "MODEL_ONNX_NAME", + "_MODEL_DIR_ONNX_NAME", ] _LOGGER = logging.getLogger(__name__) -MODEL_ONNX_NAME = "model.onnx" +_MODEL_DIR_ONNX_NAME = "model.onnx" CACHE_INPUT_PREFIX = "past_key_values" CACHE_OUTPUT_PREFIX = "present" @@ -132,7 +132,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model.deployment.path # default to the main onnx file for the model - model = model.deployment.get_file(MODEL_ONNX_NAME).path + model = model.deployment.get_file(_MODEL_DIR_ONNX_NAME).path elif File is not object and isinstance(model, File): # get the downloaded_path -- will auto download if not on local system @@ -161,7 +161,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model_path = Path(model) if model_path.is_dir(): - return str(model_path / MODEL_ONNX_NAME) + return str(model_path / _MODEL_DIR_ONNX_NAME) return model From a2aaa518a382562f882c8611052dcbf823c19eee Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 5 Dec 2023 16:54:14 -0500 Subject: [PATCH 48/57] more fixes (#1459) --- src/deepsparse/transformers/helpers.py | 6 ++---- src/deepsparse/v2/pipeline.py | 6 +++--- .../schedulers/test_continuous_batching_scheduler.py | 1 - .../utils/test_continuous_batching_executor.py | 6 +++--- tests/deepsparse/v2/test_image_classification.py | 10 +++++++--- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 4d04879c35..d5fc5ed438 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -68,7 +68,7 @@ def setup_transformers_pipeline( :param engine_kwargs: The kwargs to pass to the engine :return The model path, config, tokenizer, and engine kwargs """ - model_path, config, tokenizer = setup_onnx_file_path(model_path, sequence_length) + model_path, config, tokenizer = fetch_onnx_file_path(model_path, sequence_length) tokenizer.padding_side = tokenizer_padding_side if not tokenizer.pad_token: @@ -86,7 +86,7 @@ def setup_transformers_pipeline( return model_path, config, tokenizer, engine_kwargs -def setup_onnx_file_path( +def fetch_onnx_file_path( model_path: str, sequence_length: int, task: Optional[str] = None, @@ -135,8 +135,6 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: for running the transformers model in the deepsparse pipeline :param model_path: path to model directory, sparsezoo stub, or ONNX file - :param onnx_model_name: name of the ONNX file to look for in the deployment - directory. 
Defaults to MODEL_ONNX_NAME :return: path to the deployment directory and path to the ONNX file inside the deployment directory """ diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py index 402b557a2a..40d41c586e 100644 --- a/src/deepsparse/v2/pipeline.py +++ b/src/deepsparse/v2/pipeline.py @@ -54,8 +54,8 @@ def __init__( ops: Union[Dict[str, Operator], List[Operator]], router: Router, schedulers: List[OperatorScheduler], - continuous_batching_scheduler: ContinuousBatchingScheduler, - pipeline_state: PipelineState = None, + continuous_batching_scheduler: Optional[ContinuousBatchingScheduler] = None, + pipeline_state: Optional[PipelineState] = None, ): self.ops = ops @@ -277,7 +277,7 @@ def run( if operator_output is None: raise ValueError( f"{self.router.SPLIT_ROUTE} should appear after " - f"{self.ROUTER.START_ROUTE}" + f"{self.router.START_ROUTE}" ) operator_output = asyncio.run( diff --git a/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py b/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py index 7ed49de004..85cac323e0 100644 --- a/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py +++ b/tests/deepsparse/v2/schedulers/test_continuous_batching_scheduler.py @@ -29,7 +29,6 @@ def test_continuous_batching_executor_thread(): # mobilenet model with batch_size=2 engine_operator = EngineOperator( "zoo:mobilenet_v2-1.0-imagenet-base", - batch_size=1, ) scheduler.add_engine_operator(engine_operator, [1]) diff --git a/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py index 1d5ed9d92b..2b7c5a5e68 100644 --- a/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py +++ b/tests/deepsparse/v2/schedulers/utils/test_continuous_batching_executor.py @@ -26,14 +26,14 @@ def test_continuous_batching_executor_thread(): # mobilenet model with batch_size=2 - engine_operator = EngineOperator("zoo:mobilenet_v2-1.0-imagenet-base", batch_size=2) + engine_operator = EngineOperator("zoo:mobilenet_v2-1.0-imagenet-base") # create queues object and add operator queues = ContinuousBatchingQueues() - queues.add_queue(engine_operator, batch_sizes=[2]) + queues.add_queue(engine_operator, batch_sizes=[1]) # create engine map - operators_to_engines = {engine_operator: {2: engine_operator.engine}} + operators_to_engines = {engine_operator: {1: engine_operator.engine}} worker_thread = ContinuousBatchingExecutorThread(queues, operators_to_engines) diff --git a/tests/deepsparse/v2/test_image_classification.py b/tests/deepsparse/v2/test_image_classification.py index 03e2807454..c6b04e6f2f 100644 --- a/tests/deepsparse/v2/test_image_classification.py +++ b/tests/deepsparse/v2/test_image_classification.py @@ -34,6 +34,10 @@ def test_image_classification(get_images): "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/pruned95-none" ) pipeline = ImageClassificationPipeline(model_path=model_path) - output = pipeline(ImageClassificationInput(images=get_images)) - assert output.labels == [[207], [670]] - assert numpy.allclose(output.scores, [[21.85], [17.33]], atol=0.01) + ground_truth = [[207], [670]] + scores = [[21.85], [17.33]] + + for i in range(len(get_images)): + output = pipeline(ImageClassificationInput(images=get_images[i])) + assert output.labels == ground_truth[i] + assert numpy.allclose(output.scores, scores[i], atol=0.01) From dcab3f9ac1472d6c79bcfd353f6edc5e63c367be Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: 
Wed, 6 Dec 2023 12:37:49 +0000 Subject: [PATCH 49/57] bring back functionalities that were lost in v2 during rebasing --- src/deepsparse/transformers/helpers.py | 31 +++++++++----------------- src/deepsparse/utils/onnx.py | 19 ++++++++++++++-- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 7273b61406..b1fdf72a67 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -30,12 +30,12 @@ from onnx import ModelProto from deepsparse.log import get_main_logger -from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model -from sparsezoo import Model +from deepsparse.utils.onnx import MODEL_ONNX_NAME, model_to_path, truncate_onnx_model from sparsezoo.utils import save_onnx __all__ = [ + "get_deployment_path", "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", "fix_numpy_types", @@ -62,12 +62,12 @@ def setup_transformers_pipeline( :param sequence_length: The sequence length to use for the model :param tokenizer_padding_side: The side to pad on for the tokenizer, either "left" or "right" - :param engine_kwargs: The kwargs to pass to the engine :param onnx_model_name: The name of the onnx model to be loaded. If not specified, defaults are used (see setup_onnx_file_path) + :param engine_kwargs: The kwargs to pass to the engine :return The model path, config, tokenizer, and engine kwargs """ - model_path, config, tokenizer = setup_onnx_file_path( + model_path, config, tokenizer = fetch_onnx_file_path( model_path, sequence_length, onnx_model_name ) @@ -87,7 +87,7 @@ def setup_transformers_pipeline( return model_path, config, tokenizer, engine_kwargs -def setup_onnx_file_path( +def fetch_onnx_file_path( model_path: str, sequence_length: int, onnx_model_name: Optional[str] = None, @@ -102,6 +102,7 @@ def setup_onnx_file_path( :param onnx_model_name: optionally, the precise name of the ONNX model of interest may be specified. If not specified, the default ONNX model name will be used (refer to `get_deployment_path` for details) + :param task: task to use for the config. Defaults to None :return: file path to the processed ONNX file for the engine to compile """ deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name) @@ -148,6 +149,7 @@ def get_deployment_path( the deployment directory """ onnx_model_name = onnx_model_name or MODEL_ONNX_NAME + if os.path.isfile(model_path): # return the parent directory of the ONNX file return os.path.dirname(model_path), model_path @@ -163,22 +165,9 @@ def get_deployment_path( ) return model_path, os.path.join(model_path, onnx_model_name) - elif model_path.startswith("zoo:"): - zoo_model = Model(model_path) - deployment_path = zoo_model.deployment_directory_path - return deployment_path, os.path.join(deployment_path, onnx_model_name) - elif model_path.startswith("hf:"): - from huggingface_hub import snapshot_download - - deployment_path = snapshot_download(repo_id=model_path.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, onnx_model_name) - if not os.path.isfile(onnx_path): - raise ValueError( - f"{onnx_model_name} not found in transformers model directory " - f"{deployment_path}. 
Be sure that an export of the model is written to " - f"{onnx_path}" - ) - return deployment_path, onnx_path + elif model_path.startswith("zoo:") or model_path.startswith("hf:"): + onnx_model_path = model_to_path(model_path) + return os.path.dirname(onnx_model_path), onnx_model_path else: raise ValueError( f"model_path {model_path} is not a valid file, directory, or zoo stub" diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index f518620c2f..e4b41f3286 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -129,7 +129,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: if Model is not object and isinstance(model, Model): # trigger download and unzipping of deployment directory if not cached - model.deployment_directory_path + model.deployment.path # default to the main onnx file for the model model = model.deployment.get_file(MODEL_ONNX_NAME).path @@ -138,6 +138,21 @@ def model_to_path(model: Union[str, Model, File]) -> str: # get the downloaded_path -- will auto download if not on local system model = model.path + if isinstance(model, str) and model.startswith("hf:"): + # load Hugging Face model from stub + from huggingface_hub import snapshot_download + + deployment_path = snapshot_download(repo_id=model.replace("hf:", "", 1)) + onnx_path = os.path.join(deployment_path, MODEL_ONNX_NAME) + if not os.path.isfile(onnx_path): + raise ValueError( + f"Could not find the ONNX model file '{MODEL_ONNX_NAME}' in the " + f"Hugging Face Hub repository located at {deployment_path}. Please " + f"ensure the model has been correctly exported to ONNX format and " + f"exists in the repository." + ) + return onnx_path + if not isinstance(model, str): raise ValueError("unsupported type for model: {}".format(type(model))) @@ -549,7 +564,7 @@ def overwrite_onnx_model_inputs_for_kv_cache_models( else: raise ValueError(f"Unexpected external input name: {external_input.name}") - _LOGGER.info( + _LOGGER.debug( "Overwriting in-place the input shapes " f"of the transformer model at {onnx_file_path}" ) From e5d2f39961fc6abf0d5cb24bc8fb4ac40bd34e7d Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 18 Dec 2023 17:56:23 +0100 Subject: [PATCH 50/57] Update src/deepsparse/transformers/helpers.py --- src/deepsparse/transformers/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index b1fdf72a67..4caebb58c6 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -63,7 +63,7 @@ def setup_transformers_pipeline( :param tokenizer_padding_side: The side to pad on for the tokenizer, either "left" or "right" :param onnx_model_name: The name of the onnx model to be loaded. 
- If not specified, defaults are used (see setup_onnx_file_path) + If not specified, defaults are used (see fetch_onnx_file_path) :param engine_kwargs: The kwargs to pass to the engine :return The model path, config, tokenizer, and engine kwargs """ From 9ed5b06f1cb01700971d31446412edbbe63c4069 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 18 Dec 2023 17:26:36 +0000 Subject: [PATCH 51/57] ready for review --- src/deepsparse/transformers/helpers.py | 4 +- .../text_generation/generate_new_token.py | 5 +- .../nl_engine_operator_no_kv_cache.py | 2 +- .../text_generation/pipeline_no_kv_cache.py | 14 +- .../text_generation/prep_for_generation.py | 7 +- .../legacy/integration_tests/__init__.py | 13 - .../integration_tests/configs/codegen.yaml | 8 - .../integration_tests/configs/gpt_neo.yaml | 8 - .../legacy/integration_tests/configs/opt.yaml | 8 - .../legacy/integration_tests/helpers.py | 146 ------- .../legacy/integration_tests/test_llms.py | 369 ------------------ .../integration_tests/test_llms.py | 20 +- .../test_pipeline_no_kv_cache.py | 4 +- 13 files changed, 30 insertions(+), 578 deletions(-) delete mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/__init__.py delete mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/codegen.yaml delete mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/gpt_neo.yaml delete mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/opt.yaml delete mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/helpers.py delete mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/test_llms.py diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 4caebb58c6..34591d8a64 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -167,7 +167,9 @@ def get_deployment_path( elif model_path.startswith("zoo:") or model_path.startswith("hf:"): onnx_model_path = model_to_path(model_path) - return os.path.dirname(onnx_model_path), onnx_model_path + return os.path.dirname(onnx_model_path), onnx_model_path.replace( + MODEL_ONNX_NAME, onnx_model_name + ) else: raise ValueError( f"model_path {model_path} is not a valid file, directory, or zoo stub" diff --git a/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py b/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py index 830a3e20bd..1eae6c138d 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py +++ b/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py @@ -49,8 +49,9 @@ def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): callback = inference_state.current_state.get("callback") stop = inference_state.current_state.get("stop") - if kv_cache.total_num_processed_tokens >= kv_cache.capacity: - finish_reason = FinishReason.CAPACITY + if kv_cache: + if kv_cache.total_num_processed_tokens >= kv_cache.capacity: + finish_reason = FinishReason.CAPACITY if token == self.tokenizer.eos_token_id and not self.force_max_tokens: finish_reason = FinishReason.STOP diff --git a/src/deepsparse/transformers/pipelines/text_generation/nl_engine_operator_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/nl_engine_operator_no_kv_cache.py index 746010560f..c6ae6c51f3 100644 --- 
a/src/deepsparse/transformers/pipelines/text_generation/nl_engine_operator_no_kv_cache.py +++ b/src/deepsparse/transformers/pipelines/text_generation/nl_engine_operator_no_kv_cache.py @@ -17,8 +17,8 @@ import numpy from pydantic import BaseModel +from deepsparse.operators.engine_operator import EngineOperator, EngineOperatorInputs from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs -from deepsparse.v2.operators.engine_operator import EngineOperator, EngineOperatorInputs __all__ = [ diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py index a6ec2ae207..7b5c7f67f0 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py @@ -15,14 +15,11 @@ import logging from typing import Dict, Optional +from deepsparse.pipeline import Pipeline +from deepsparse.routers import GraphRouter +from deepsparse.schedulers import OperatorScheduler from deepsparse.transformers.helpers import setup_transformers_pipeline -from deepsparse.transformers.utils.helpers import process_generation_config -from deepsparse.utils import split_engine_inputs -from deepsparse.utils.onnx import default_cached_outputs -from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.routers import GraphRouter -from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.text_generation import ( +from deepsparse.transformers.pipelines.text_generation import ( CompileGenerations, GenerateNewTokenOperator, JoinOutput, @@ -32,6 +29,9 @@ ProcessOutputs, TokenGeneratorOperator, ) +from deepsparse.transformers.utils.helpers import process_generation_config +from deepsparse.utils import split_engine_inputs +from deepsparse.utils.onnx import default_cached_outputs _LOGGER = logging.getLogger(__name__) diff --git a/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py b/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py index 975948fb57..df14398542 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py @@ -18,12 +18,11 @@ from deepsparse.operators import Operator from deepsparse.transformers.pipelines.text_generation import TokenGeneratorOperator +from deepsparse.transformers.pipelines.text_generation.nl_engine_operator import ( + NLEngineOutputs, +) from deepsparse.transformers.schemas.text_generation_schemas import FinishReason from deepsparse.transformers.utils.helpers import set_generated_length -from deepsparse.v2.operators import Operator -from deepsparse.v2.text_generation import TokenGeneratorOperator -from deepsparse.v2.text_generation.nl_engine_operator import NLEngineOutputs -from deepsparse.v2.utils import InferenceState from deepsparse.utils import InferenceState diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/__init__.py b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/__init__.py deleted file mode 100644 index 0c44f887a4..0000000000 --- a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/codegen.yaml b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/codegen.yaml deleted file mode 100644 index 62aac94a6b..0000000000 --- a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/codegen.yaml +++ /dev/null @@ -1,8 +0,0 @@ -cadence: "nightly" -model_path: "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none" -torch_model_name: "salesforce/codegen-350m-mono" -task: ["text-generation"]#, "chat"] -prompt: "\ndef Fibonacci(n):\n # Check if input is 0 then it will\n # print incorrect input" -has_bos_token: False -precision: 0.0001 -internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/gpt_neo.yaml b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/gpt_neo.yaml deleted file mode 100644 index 6dd3d59e33..0000000000 --- a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/gpt_neo.yaml +++ /dev/null @@ -1,8 +0,0 @@ -cadence: "commit" -model_path: "hf:mgoin/TinyStories-1M-ds" -torch_model_name: "roneneldan/TinyStories-1M" -task: ["text-generation"] -prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" -has_bos_token: True -precision: 0.001 -internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/opt.yaml b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/opt.yaml deleted file mode 100644 index 2dfed87fd6..0000000000 --- a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/opt.yaml +++ /dev/null @@ -1,8 +0,0 @@ -cadence: "nightly" -model_path: "zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/opt_pretrain/base-none" -torch_model_name: "facebook/opt-1.3b" -task: ["text-generation"] -prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" -has_bos_token: True -precision: 0.0001 -internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/helpers.py b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/helpers.py deleted file mode 100644 index e51ac7947a..0000000000 --- a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/helpers.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -from typing import Any, Dict, List, Tuple, Union - -import numpy -import yaml -from transformers import AutoModelForCausalLM, AutoTokenizer - -import pytest - - -class TorchGroundTruthSource: - """ - An object that generates ground truth logits and - cache states from a prompt. This object can - generate tokens in an autoregressive manner, and thus - will output: - - prompt logits, - - generated logits, - - prompt cache state, - - generated sequence - """ - - def __init__(self, num_tokens_to_generate: int, model_name: str): - - self.model = AutoModelForCausalLM.from_pretrained(model_name) - self.tokenizer = self._create_tokenizer(model_name) - - self.num_tokens_to_generate = num_tokens_to_generate - - def tokenize(self, prompt: str): - return self.tokenizer(prompt, return_tensors="pt") - - def __call__( - self, prompt: str - ) -> Tuple[numpy.ndarray, numpy.ndarray, List[numpy.ndarray], str]: - # afaik it is not possible to get 'past_key_values' from - # the generate method, so we have to run the model twice - out = self.model.generate( - self.tokenize(prompt).input_ids, - max_new_tokens=self.num_tokens_to_generate, - output_scores=True, - return_dict_in_generate=True, - use_cache=True, - ) - generated_text = self.tokenizer.decode( - out.sequences[0], skip_special_tokens=True - ) - generated_logits = numpy.concatenate( - [[score.numpy() for score in out.scores]] - ).transpose( - 1, 0, 2 - ) # (1, num_tokens_to_generate, vocab_size) - - out = self.model(**self.tokenize(prompt)) - prompt_logits = out.logits.detach().numpy()[ - :, :-1, : - ] # (1, prompt_length, vocab_size) - prompt_cache = [ - entry.detach().numpy() - for key_value_tuple in out.past_key_values - for entry in key_value_tuple - ] # List[(1, num_heads, past_length, head_dim)] - - return generated_logits, prompt_logits, prompt_cache, generated_text - - @staticmethod - def _create_tokenizer(model_name): - tokenizer = AutoTokenizer.from_pretrained(model_name) - tokenizer.padding_side = "left" - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - return tokenizer - - -def parse_params(configs_directory: str) -> List[Dict[str, Any]]: - # parses the config file provided - assert os.path.isdir( - configs_directory - ), f"Config_directory {configs_directory} is not a directory" - - config_dicts = [] - for file in os.listdir(configs_directory): - if file.endswith(".yaml"): - config_path = os.path.join(configs_directory, file) - # reads the yaml file - with open(config_path, "r") as f: - config = yaml.safe_load(f) - - cadence = os.environ.get("CADENCE", "commit") - expected_cadence = config["cadence"] - - if not isinstance(expected_cadence, list): - expected_cadence = [expected_cadence] - if cadence in expected_cadence: - config_dicts.append(config) - else: - logging.info( - f"Skipping testing model: {config['model_path']} " - f"for cadence: {config['cadence']}" - ) - else: - raise FileNotFoundError( - f"Could not find a yaml file in {configs_directory}" - ) - return config_dicts - - -def validate_internal_kv_cache( - internal_kv_cache, available_kv_cache_types: Union[str, 
List[str]] -) -> bool: - if internal_kv_cache and True not in available_kv_cache_types: - pytest.skip( - "The tests for running the pipeline with " - "internal kv cache management are disabled." - ) - if not internal_kv_cache and False not in available_kv_cache_types: - pytest.skip( - "The tests for running the pipeline with " - "external kv cache management are disabled." - ) - return internal_kv_cache - - -def validate_task(task: str, available_tasks: Union[str, List[str]]) -> bool: - if task not in available_tasks: - pytest.skip( - f"The tests for running the pipeline with task: {task} are disabled. " - f"The available tasks, as specified in the config are: {available_tasks}" - ) - return task diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/test_llms.py b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/test_llms.py deleted file mode 100644 index eb02b91ba9..0000000000 --- a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/test_llms.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This test suite consumes config files to test the text generation pipeline -for various scenerios. - -A sample config file is a yaml that requires the following fields: - cadence: The cadence of the tests. The available options are: - "nightly", "weekly" and "commit". By default, only - the tests that have cadence "commit" will be run - in GHA. This parameter can be both a string or a - list of strings. - model_path: The path to the model to be tested - (sparsezoo stub/hf model path/local_path) - torch_model_name: The name of the torch model - (to generate ground truth info) - task: The task to be tested - (e.g. text-generation) - prompt: The prompt to use for testing - has_bos_token: Whether the model has a bos token - precision: The precision for the logits/kv_cache entries - comparison - internal_kv_cache: The type of the internal KV cache - management. Is a list that can contain the following - values: [True], [False] or [True, False] (to test both - external and internal KV cache management) -""" -import os -from typing import List, Tuple - -import numpy - -import pytest - -# NOTE: this tests the legacy text generation pipeline. 
integration tests exist -# for the new pipeline under v2 -from deepsparse.legacy import Pipeline -from deepsparse.transformers.schemas.text_generation_schemas import TextGenerationOutput -from sparsezoo import Model -from tests.deepsparse.transformers.pipelines.legacy.integration_tests.helpers import ( - TorchGroundTruthSource, - parse_params, - validate_internal_kv_cache, - validate_task, -) - - -CONFIGS_DIRECTORY = ( - "tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs" -) - - -@pytest.fixture() -def max_new_tokens() -> int: - return 64 - - -@pytest.mark.parametrize("params_dict", parse_params(CONFIGS_DIRECTORY)) -@pytest.mark.parametrize( - "internal_kv_cache", - [True, False], -) -@pytest.mark.parametrize( - "task", - ["text-generation", "chat"], -) -class TestsIntegrationLLMsPipelines: - """ - This test suite is meant to test the main scenarios of - the text generation pipeline. - """ - - def get_pipeline(self, **kwargs) -> Pipeline: - """ - If no kwargs provided, returns the cached "default" - pipeline that is used for most of the tests. - Otherwise, returns a pipeline with the given kwargs - (the default pipeline kwargs are updated with the - user-provided kwargs) - - :param kwargs: the optional kwargs to be used to - create the pipeline (if not provided, the cached - "default" pipeline is returned) - :return: the appropriate pipeline - """ - if not kwargs: - if self.default_pipeline is None: - self.default_pipeline = Pipeline.create(**self.default_pipeline_kwargs) - return self.default_pipeline - - # return a pipeline with the updated default kwargs - updated_kwargs = self.default_pipeline_kwargs.copy() - updated_kwargs.update(kwargs) - return Pipeline.create(**updated_kwargs) - - @pytest.fixture - def setup(self, params_dict, max_new_tokens, internal_kv_cache, task): - # set the params_dict as the class attributes - for key, value in params_dict.items(): - setattr(self, key, value) - # check whether the specified cache management type - # is supported for testing (skip if not supported) - self.internal_kv_cache: bool = validate_internal_kv_cache( - internal_kv_cache, self.internal_kv_cache - ) - self.task: str = validate_task(task, self.task) - # create torch ground source - torch_source = TorchGroundTruthSource( - num_tokens_to_generate=max_new_tokens + 1, - model_name=self.torch_model_name, - ) - # create torch ground truth - self.torch_ground_truth = torch_source(self.prompt) - - # specify the default pipeline kwargs - self.default_pipeline_kwargs = dict( - task=self.task, - model_path=self.model_path, - internal_kv_cache=self.internal_kv_cache, - ) - self.default_pipeline = None - self.max_new_tokens = max_new_tokens - - def test_ort_single_token_prefill(self, setup): - # Test the pipeline that uses ORT engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by single-token engine - # 2. The KV Cache is never filled up - # 3. KV Cache managed externally - - if self.internal_kv_cache: - pytest.skip( - "Cannot run ORT pipeline with the internal deepsparse cache enabled." - ) - - pipeline = self.get_pipeline( - prompt_sequence_length=1, - engine_type="onnxruntime", - ) - pipeline._debug = True - output = pipeline( - self.prompt, - max_new_tokens=self.max_new_tokens, - output_scores=True, - include_prompt_logits=True, - ) - - self._test_output( - output=output, - torch_ground_truth=self.torch_ground_truth, - ) - - def test_ort_multi_token_prefill(self, setup): - # Test the pipeline that uses ORT engine. 
The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by multi-token engine - # 2. The KV Cache is never filled up - # 3. KV Cache managed externally - - if self.internal_kv_cache: - pytest.skip( - "Cannot run ORT pipeline with the internal deepsparse cache enabled." - ) - pipeline = self.get_pipeline( - engine_type="onnxruntime", - ) - pipeline._debug = True - output = pipeline( - self.prompt, - max_new_tokens=self.max_new_tokens, - output_scores=True, - include_prompt_logits=True, - ) - - self._test_output( - output=output, - torch_ground_truth=self.torch_ground_truth, - ) - - def test_deepsparse_single_token_prefill(self, setup): - # Test the pipeline that uses deepsparse engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by single-token engine - # 2. The KV Cache is never filled up - # 3. KV Cache managed externally or internally - - pipeline = self.get_pipeline( - prompt_sequence_length=1, - ) - pipeline._debug = True - output = pipeline( - self.prompt, - max_new_tokens=self.max_new_tokens, - output_scores=True, - include_prompt_logits=True, - ) - - self._test_output( - output=output, - torch_ground_truth=self.torch_ground_truth, - # disable kv cache validation if using internal kv cache - run_kv_cache_validation=not self.internal_kv_cache, - ) - - def test_deepsparse_multi_token_prefill(self, setup): - # Test the pipeline that uses deepsparse engine. The test covers the - # following scenario: - # 1. Prompt preprocessing is performed by multi-token engine - # 2. The KV Cache is never filled up - # 3. KV Cache managed internally or externally - - pipeline = self.get_pipeline() - pipeline._debug = True - output = pipeline( - self.prompt, - max_new_tokens=self.max_new_tokens, - output_scores=True, - include_prompt_logits=True, - ) - - self._test_output( - output=output, - torch_ground_truth=self.torch_ground_truth, - # disable kv cache validation if using internal kv cache - run_kv_cache_validation=not self.internal_kv_cache, - ) - - def test_inference_no_kv_cache_deepsparse(self, setup): - self._test_inference_no_kv_cache(engine_type="deepsparse") - - def test_inference_no_kv_cache_ort(self, setup): - self._test_inference_no_kv_cache(engine_type="onnxruntime") - - def _test_inference_no_kv_cache(self, engine_type): - model_path_no_cache = self._get_model_path_no_cache() - pipeline = self.get_pipeline( - model_path=model_path_no_cache, engine_type=engine_type - ) - assert not pipeline.cache_support_enabled, ( - "This pipeline test inference using non-kv cache " - "model and thus should not support kv cache" - ) - - output = pipeline( - self.prompt, max_length=1, output_scores=True, include_prompt_logits=True - ) - prompt_length = self.torch_ground_truth[1].shape[1] - # prompt logits + one logit for the new generated token - logits = output.generations[0].score[-(prompt_length + 1) :, :] - # compute ground truth logits analogously - generated_logits, prompt_logits, *_ = self.torch_ground_truth - logits_gt = numpy.concatenate( - [prompt_logits[0], generated_logits[0, :1, :]], axis=0 - ) - assert numpy.allclose(logits, logits_gt, atol=self.precision) - - def _test_output( - self, - output: TextGenerationOutput, - torch_ground_truth: Tuple[numpy.ndarray, ...], - run_kv_cache_validation: bool = True, - ): - - ( - generated_logits, - prompt_logits, - prompt_kv_cache, - generated_text, - ) = torch_ground_truth - - # concatenate target prompt_logits and generated_logits - target_logits = numpy.concatenate([prompt_logits, 
generated_logits], axis=1) - # get the logits of the generated sequence - score = output.generations[0].score - - # we expect the logits to be exactly the same - # as the target logits; the generated sequence should - # also be the same as the target sequence - assert numpy.allclose(score, target_logits[0], atol=self.precision) - assert self.prompt + output.generations[0].text == generated_text - - if hasattr(output, "kv_cache_state") and run_kv_cache_validation: - # (if applicable) the kv cache should be the same as the - # target kv cache - expected_cache = list(output.kv_cache_state[0].values()) - total_num_processed_tokens = output.total_num_processed_tokens[0] - self._test_kv_cache_state( - expected_cache=expected_cache, - target_cache=prompt_kv_cache, - total_num_processed_tokens=total_num_processed_tokens, - ) - - def _test_kv_cache_state( - self, - expected_cache: List[numpy.ndarray], - target_cache: List[numpy.ndarray], - total_num_processed_tokens: int, - ): - for x, y in zip(expected_cache, target_cache): - start_index = total_num_processed_tokens - end_index = total_num_processed_tokens - y.shape[2] - # x is (in general) composed of three arrays: - # - padding cache entries (from 0 to -start_index) - # - prompt cache entries (from -start_index to -end_index) - # - generated cache entries (from -end_index to -1) - # as target_cache only pertains to prompt cache entries, we need to - # compare only the prompt cache entries in x with y - assert numpy.allclose( - x[:, :, -start_index:-end_index, :], y, atol=self.precision - ) - - def _get_model_path_no_cache(self): - if not self.model_path.startswith("zoo:"): - pytest.skip("For this test, for now only the zoo model is supported") - model = Model(self.model_path) - # fetch the necessary file names for pipeline creation - required_file_names = [ - os.path.basename(file.name) for file in model.deployment.files - ] - training_directory = model.training - onnx_model_name_no_cache = [ - os.path.basename(file.name) - for file in model.training.files - if file.name.endswith(".onnx") - ][0] - - # check if 'training' exists, - # if not, download the files - if "training" not in os.listdir(model._path): - for filename in required_file_names: - # download the files to a training directory - if filename.endswith(".data"): - # data files are typically stored in a deployment directory - # download them to training - file = model.deployment.get_file(filename) - assert ( - file is not None - ), f"Unable to find file {filename} in model {model}" - file.name = file.name.replace("deployment", "training") - file.download() - continue - - if filename.endswith(".onnx"): - # instead of `model.onnx` the onnx_model_name_no_cache - # should be downloaded - filename = filename.replace("model.onnx", onnx_model_name_no_cache) - - file = training_directory.get_file(filename) - assert ( - file is not None - ), f"Unable to find file {filename} in model {model}" - file.download() - # rename the model file to `model.onnx` - os.rename( - os.path.join(training_directory.path, onnx_model_name_no_cache), - os.path.join(training_directory.path, "model.onnx"), - ) - return training_directory._path diff --git a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py index 3485658dda..43dbfc107b 100644 --- a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py +++ b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py @@ -40,20 
+40,22 @@ import numpy import pytest -from deepsparse.transformers.pipelines.text_generation import TextGenerationOutput -from deepsparse.v2.pipeline import Pipeline -from deepsparse.v2.text_generation import ( +from deepsparse.pipeline import Pipeline +from deepsparse.transformers.pipelines.text_generation import ( + TextGenerationOutput, TextGenerationPipeline, TextGenerationPipelineNoCache, ) -from tests.deepsparse.transformers.pipelines.integration_tests.helpers import ( +from tests.deepsparse.transformers.text_generation.integration_tests.helpers import ( TorchGroundTruthSource, parse_params, validate_internal_kv_cache, ) -CONFIGS_DIRECTORY = "tests/deepsparse/v2/integration_tests/configs" +CONFIGS_DIRECTORY = ( + "tests/deepsparse/transformers/text_generation/integration_tests/configs" +) @pytest.fixture() @@ -146,7 +148,7 @@ def test_ort_single_token_prefill(self, setup): pipeline = self.get_pipeline( prompt_sequence_length=1, - engine_kwargs=dict(engine_type="onnxruntime"), + engine_type="onnxruntime", ) output = pipeline( prompt=self.prompt, @@ -173,9 +175,7 @@ def test_ort_multi_token_prefill(self, setup): pytest.skip( "Cannot run ORT pipeline with the internal deepsparse cache enabled." ) - pipeline = self.get_pipeline( - engine_kwargs=dict(engine_type="onnxruntime"), - ) + pipeline = self.get_pipeline(engine_type="onnxruntime") output = pipeline( prompt=self.prompt, include_prompt_logits=True, @@ -248,7 +248,7 @@ def _test_inference_no_kv_cache(self, engine_type): pipeline = self.get_pipeline( onnx_model_name=self.model_name_no_kv_cache, kv_cache_support=False, - engine_kwargs=dict(engine_type=engine_type), + engine_type=engine_type, ) output = pipeline( diff --git a/tests/deepsparse/transformers/text_generation/unit/text_generation/test_pipeline_no_kv_cache.py b/tests/deepsparse/transformers/text_generation/unit/text_generation/test_pipeline_no_kv_cache.py index a6fbfc4d11..de12d0e709 100644 --- a/tests/deepsparse/transformers/text_generation/unit/text_generation/test_pipeline_no_kv_cache.py +++ b/tests/deepsparse/transformers/text_generation/unit/text_generation/test_pipeline_no_kv_cache.py @@ -15,7 +15,9 @@ import os import pytest -from deepsparse.v2.text_generation import TextGenerationPipelineNoCache +from deepsparse.transformers.pipelines.text_generation.pipeline_no_kv_cache import ( + TextGenerationPipelineNoCache, +) @pytest.mark.parametrize( From 1ac1f5ce882a66f03c81911bd1b3eda143126fd2 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 18 Dec 2023 17:27:31 +0000 Subject: [PATCH 52/57] bring tests back" --- .../legacy/integration_tests/__init__.py | 13 + .../integration_tests/configs/codegen.yaml | 8 + .../integration_tests/configs/gpt_neo.yaml | 8 + .../legacy/integration_tests/configs/opt.yaml | 8 + .../legacy/integration_tests/helpers.py | 146 +++++++ .../legacy/integration_tests/test_llms.py | 369 ++++++++++++++++++ 6 files changed, 552 insertions(+) create mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/__init__.py create mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/codegen.yaml create mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/gpt_neo.yaml create mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/opt.yaml create mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/helpers.py create mode 100644 tests/deepsparse/transformers/pipelines/legacy/integration_tests/test_llms.py diff --git 
a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/__init__.py b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/codegen.yaml b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/codegen.yaml new file mode 100644 index 0000000000..62aac94a6b --- /dev/null +++ b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/codegen.yaml @@ -0,0 +1,8 @@ +cadence: "nightly" +model_path: "zoo:nlg/text_generation/codegen_mono-350m/pytorch/huggingface/bigpython_bigquery_thepile/base-none" +torch_model_name: "salesforce/codegen-350m-mono" +task: ["text-generation"]#, "chat"] +prompt: "\ndef Fibonacci(n):\n # Check if input is 0 then it will\n # print incorrect input" +has_bos_token: False +precision: 0.0001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/gpt_neo.yaml b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/gpt_neo.yaml new file mode 100644 index 0000000000..6dd3d59e33 --- /dev/null +++ b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/gpt_neo.yaml @@ -0,0 +1,8 @@ +cadence: "commit" +model_path: "hf:mgoin/TinyStories-1M-ds" +torch_model_name: "roneneldan/TinyStories-1M" +task: ["text-generation"] +prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" +has_bos_token: True +precision: 0.001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/opt.yaml b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/opt.yaml new file mode 100644 index 0000000000..2dfed87fd6 --- /dev/null +++ b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs/opt.yaml @@ -0,0 +1,8 @@ +cadence: "nightly" +model_path: "zoo:nlg/text_generation/opt-1.3b/pytorch/huggingface/opt_pretrain/base-none" +torch_model_name: "facebook/opt-1.3b" +task: ["text-generation"] +prompt: "Didn't know what time it was, the lights were low\n I leaned back on my radio" +has_bos_token: True +precision: 0.0001 +internal_kv_cache: [True, False] \ No newline at end of file diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/helpers.py b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/helpers.py new file mode 100644 index 0000000000..e51ac7947a --- /dev/null +++ b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/helpers.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from typing import Any, Dict, List, Tuple, Union + +import numpy +import yaml +from transformers import AutoModelForCausalLM, AutoTokenizer + +import pytest + + +class TorchGroundTruthSource: + """ + An object that generates ground truth logits and + cache states from a prompt. This object can + generate tokens in an autoregressive manner, and thus + will output: + - prompt logits, + - generated logits, + - prompt cache state, + - generated sequence + """ + + def __init__(self, num_tokens_to_generate: int, model_name: str): + + self.model = AutoModelForCausalLM.from_pretrained(model_name) + self.tokenizer = self._create_tokenizer(model_name) + + self.num_tokens_to_generate = num_tokens_to_generate + + def tokenize(self, prompt: str): + return self.tokenizer(prompt, return_tensors="pt") + + def __call__( + self, prompt: str + ) -> Tuple[numpy.ndarray, numpy.ndarray, List[numpy.ndarray], str]: + # afaik it is not possible to get 'past_key_values' from + # the generate method, so we have to run the model twice + out = self.model.generate( + self.tokenize(prompt).input_ids, + max_new_tokens=self.num_tokens_to_generate, + output_scores=True, + return_dict_in_generate=True, + use_cache=True, + ) + generated_text = self.tokenizer.decode( + out.sequences[0], skip_special_tokens=True + ) + generated_logits = numpy.concatenate( + [[score.numpy() for score in out.scores]] + ).transpose( + 1, 0, 2 + ) # (1, num_tokens_to_generate, vocab_size) + + out = self.model(**self.tokenize(prompt)) + prompt_logits = out.logits.detach().numpy()[ + :, :-1, : + ] # (1, prompt_length, vocab_size) + prompt_cache = [ + entry.detach().numpy() + for key_value_tuple in out.past_key_values + for entry in key_value_tuple + ] # List[(1, num_heads, past_length, head_dim)] + + return generated_logits, prompt_logits, prompt_cache, generated_text + + @staticmethod + def _create_tokenizer(model_name): + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.padding_side = "left" + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer + + +def parse_params(configs_directory: str) -> List[Dict[str, Any]]: + # parses the config file provided + assert os.path.isdir( + configs_directory + ), f"Config_directory {configs_directory} is not a directory" + + config_dicts = [] + for file in os.listdir(configs_directory): + if file.endswith(".yaml"): + config_path = os.path.join(configs_directory, file) + # reads the yaml file + with open(config_path, "r") as f: + config = yaml.safe_load(f) + + cadence = os.environ.get("CADENCE", "commit") + expected_cadence = config["cadence"] + + if not isinstance(expected_cadence, list): + expected_cadence = [expected_cadence] + if cadence in expected_cadence: + config_dicts.append(config) + else: + logging.info( + f"Skipping testing model: {config['model_path']} " + f"for cadence: {config['cadence']}" + ) + else: + raise FileNotFoundError( + f"Could not find a yaml file in 
{configs_directory}" + ) + return config_dicts + + +def validate_internal_kv_cache( + internal_kv_cache, available_kv_cache_types: Union[str, List[str]] +) -> bool: + if internal_kv_cache and True not in available_kv_cache_types: + pytest.skip( + "The tests for running the pipeline with " + "internal kv cache management are disabled." + ) + if not internal_kv_cache and False not in available_kv_cache_types: + pytest.skip( + "The tests for running the pipeline with " + "external kv cache management are disabled." + ) + return internal_kv_cache + + +def validate_task(task: str, available_tasks: Union[str, List[str]]) -> bool: + if task not in available_tasks: + pytest.skip( + f"The tests for running the pipeline with task: {task} are disabled. " + f"The available tasks, as specified in the config are: {available_tasks}" + ) + return task diff --git a/tests/deepsparse/transformers/pipelines/legacy/integration_tests/test_llms.py b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/test_llms.py new file mode 100644 index 0000000000..eb02b91ba9 --- /dev/null +++ b/tests/deepsparse/transformers/pipelines/legacy/integration_tests/test_llms.py @@ -0,0 +1,369 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This test suite consumes config files to test the text generation pipeline +for various scenerios. + +A sample config file is a yaml that requires the following fields: + cadence: The cadence of the tests. The available options are: + "nightly", "weekly" and "commit". By default, only + the tests that have cadence "commit" will be run + in GHA. This parameter can be both a string or a + list of strings. + model_path: The path to the model to be tested + (sparsezoo stub/hf model path/local_path) + torch_model_name: The name of the torch model + (to generate ground truth info) + task: The task to be tested + (e.g. text-generation) + prompt: The prompt to use for testing + has_bos_token: Whether the model has a bos token + precision: The precision for the logits/kv_cache entries + comparison + internal_kv_cache: The type of the internal KV cache + management. Is a list that can contain the following + values: [True], [False] or [True, False] (to test both + external and internal KV cache management) +""" +import os +from typing import List, Tuple + +import numpy + +import pytest + +# NOTE: this tests the legacy text generation pipeline. 
integration tests exist +# for the new pipeline under v2 +from deepsparse.legacy import Pipeline +from deepsparse.transformers.schemas.text_generation_schemas import TextGenerationOutput +from sparsezoo import Model +from tests.deepsparse.transformers.pipelines.legacy.integration_tests.helpers import ( + TorchGroundTruthSource, + parse_params, + validate_internal_kv_cache, + validate_task, +) + + +CONFIGS_DIRECTORY = ( + "tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs" +) + + +@pytest.fixture() +def max_new_tokens() -> int: + return 64 + + +@pytest.mark.parametrize("params_dict", parse_params(CONFIGS_DIRECTORY)) +@pytest.mark.parametrize( + "internal_kv_cache", + [True, False], +) +@pytest.mark.parametrize( + "task", + ["text-generation", "chat"], +) +class TestsIntegrationLLMsPipelines: + """ + This test suite is meant to test the main scenarios of + the text generation pipeline. + """ + + def get_pipeline(self, **kwargs) -> Pipeline: + """ + If no kwargs provided, returns the cached "default" + pipeline that is used for most of the tests. + Otherwise, returns a pipeline with the given kwargs + (the default pipeline kwargs are updated with the + user-provided kwargs) + + :param kwargs: the optional kwargs to be used to + create the pipeline (if not provided, the cached + "default" pipeline is returned) + :return: the appropriate pipeline + """ + if not kwargs: + if self.default_pipeline is None: + self.default_pipeline = Pipeline.create(**self.default_pipeline_kwargs) + return self.default_pipeline + + # return a pipeline with the updated default kwargs + updated_kwargs = self.default_pipeline_kwargs.copy() + updated_kwargs.update(kwargs) + return Pipeline.create(**updated_kwargs) + + @pytest.fixture + def setup(self, params_dict, max_new_tokens, internal_kv_cache, task): + # set the params_dict as the class attributes + for key, value in params_dict.items(): + setattr(self, key, value) + # check whether the specified cache management type + # is supported for testing (skip if not supported) + self.internal_kv_cache: bool = validate_internal_kv_cache( + internal_kv_cache, self.internal_kv_cache + ) + self.task: str = validate_task(task, self.task) + # create torch ground source + torch_source = TorchGroundTruthSource( + num_tokens_to_generate=max_new_tokens + 1, + model_name=self.torch_model_name, + ) + # create torch ground truth + self.torch_ground_truth = torch_source(self.prompt) + + # specify the default pipeline kwargs + self.default_pipeline_kwargs = dict( + task=self.task, + model_path=self.model_path, + internal_kv_cache=self.internal_kv_cache, + ) + self.default_pipeline = None + self.max_new_tokens = max_new_tokens + + def test_ort_single_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by single-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + + pipeline = self.get_pipeline( + prompt_sequence_length=1, + engine_type="onnxruntime", + ) + pipeline._debug = True + output = pipeline( + self.prompt, + max_new_tokens=self.max_new_tokens, + output_scores=True, + include_prompt_logits=True, + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + ) + + def test_ort_multi_token_prefill(self, setup): + # Test the pipeline that uses ORT engine. 
The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally + + if self.internal_kv_cache: + pytest.skip( + "Cannot run ORT pipeline with the internal deepsparse cache enabled." + ) + pipeline = self.get_pipeline( + engine_type="onnxruntime", + ) + pipeline._debug = True + output = pipeline( + self.prompt, + max_new_tokens=self.max_new_tokens, + output_scores=True, + include_prompt_logits=True, + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + ) + + def test_deepsparse_single_token_prefill(self, setup): + # Test the pipeline that uses deepsparse engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by single-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed externally or internally + + pipeline = self.get_pipeline( + prompt_sequence_length=1, + ) + pipeline._debug = True + output = pipeline( + self.prompt, + max_new_tokens=self.max_new_tokens, + output_scores=True, + include_prompt_logits=True, + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + # disable kv cache validation if using internal kv cache + run_kv_cache_validation=not self.internal_kv_cache, + ) + + def test_deepsparse_multi_token_prefill(self, setup): + # Test the pipeline that uses deepsparse engine. The test covers the + # following scenario: + # 1. Prompt preprocessing is performed by multi-token engine + # 2. The KV Cache is never filled up + # 3. KV Cache managed internally or externally + + pipeline = self.get_pipeline() + pipeline._debug = True + output = pipeline( + self.prompt, + max_new_tokens=self.max_new_tokens, + output_scores=True, + include_prompt_logits=True, + ) + + self._test_output( + output=output, + torch_ground_truth=self.torch_ground_truth, + # disable kv cache validation if using internal kv cache + run_kv_cache_validation=not self.internal_kv_cache, + ) + + def test_inference_no_kv_cache_deepsparse(self, setup): + self._test_inference_no_kv_cache(engine_type="deepsparse") + + def test_inference_no_kv_cache_ort(self, setup): + self._test_inference_no_kv_cache(engine_type="onnxruntime") + + def _test_inference_no_kv_cache(self, engine_type): + model_path_no_cache = self._get_model_path_no_cache() + pipeline = self.get_pipeline( + model_path=model_path_no_cache, engine_type=engine_type + ) + assert not pipeline.cache_support_enabled, ( + "This pipeline test inference using non-kv cache " + "model and thus should not support kv cache" + ) + + output = pipeline( + self.prompt, max_length=1, output_scores=True, include_prompt_logits=True + ) + prompt_length = self.torch_ground_truth[1].shape[1] + # prompt logits + one logit for the new generated token + logits = output.generations[0].score[-(prompt_length + 1) :, :] + # compute ground truth logits analogously + generated_logits, prompt_logits, *_ = self.torch_ground_truth + logits_gt = numpy.concatenate( + [prompt_logits[0], generated_logits[0, :1, :]], axis=0 + ) + assert numpy.allclose(logits, logits_gt, atol=self.precision) + + def _test_output( + self, + output: TextGenerationOutput, + torch_ground_truth: Tuple[numpy.ndarray, ...], + run_kv_cache_validation: bool = True, + ): + + ( + generated_logits, + prompt_logits, + prompt_kv_cache, + generated_text, + ) = torch_ground_truth + + # concatenate target prompt_logits and generated_logits + target_logits = numpy.concatenate([prompt_logits, 
generated_logits], axis=1) + # get the logits of the generated sequence + score = output.generations[0].score + + # we expect the logits to be exactly the same + # as the target logits; the generated sequence should + # also be the same as the target sequence + assert numpy.allclose(score, target_logits[0], atol=self.precision) + assert self.prompt + output.generations[0].text == generated_text + + if hasattr(output, "kv_cache_state") and run_kv_cache_validation: + # (if applicable) the kv cache should be the same as the + # target kv cache + expected_cache = list(output.kv_cache_state[0].values()) + total_num_processed_tokens = output.total_num_processed_tokens[0] + self._test_kv_cache_state( + expected_cache=expected_cache, + target_cache=prompt_kv_cache, + total_num_processed_tokens=total_num_processed_tokens, + ) + + def _test_kv_cache_state( + self, + expected_cache: List[numpy.ndarray], + target_cache: List[numpy.ndarray], + total_num_processed_tokens: int, + ): + for x, y in zip(expected_cache, target_cache): + start_index = total_num_processed_tokens + end_index = total_num_processed_tokens - y.shape[2] + # x is (in general) composed of three arrays: + # - padding cache entries (from 0 to -start_index) + # - prompt cache entries (from -start_index to -end_index) + # - generated cache entries (from -end_index to -1) + # as target_cache only pertains to prompt cache entries, we need to + # compare only the prompt cache entries in x with y + assert numpy.allclose( + x[:, :, -start_index:-end_index, :], y, atol=self.precision + ) + + def _get_model_path_no_cache(self): + if not self.model_path.startswith("zoo:"): + pytest.skip("For this test, for now only the zoo model is supported") + model = Model(self.model_path) + # fetch the necessary file names for pipeline creation + required_file_names = [ + os.path.basename(file.name) for file in model.deployment.files + ] + training_directory = model.training + onnx_model_name_no_cache = [ + os.path.basename(file.name) + for file in model.training.files + if file.name.endswith(".onnx") + ][0] + + # check if 'training' exists, + # if not, download the files + if "training" not in os.listdir(model._path): + for filename in required_file_names: + # download the files to a training directory + if filename.endswith(".data"): + # data files are typically stored in a deployment directory + # download them to training + file = model.deployment.get_file(filename) + assert ( + file is not None + ), f"Unable to find file {filename} in model {model}" + file.name = file.name.replace("deployment", "training") + file.download() + continue + + if filename.endswith(".onnx"): + # instead of `model.onnx` the onnx_model_name_no_cache + # should be downloaded + filename = filename.replace("model.onnx", onnx_model_name_no_cache) + + file = training_directory.get_file(filename) + assert ( + file is not None + ), f"Unable to find file {filename} in model {model}" + file.download() + # rename the model file to `model.onnx` + os.rename( + os.path.join(training_directory.path, onnx_model_name_no_cache), + os.path.join(training_directory.path, "model.onnx"), + ) + return training_directory._path From a7344598e39809feb8c7ef0a2940baf197aa894b Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 18 Dec 2023 17:31:10 +0000 Subject: [PATCH 53/57] quality --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 4ffcada6e2..51788944c0 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,22 @@ See the License for the specific 
language governing permissions and limitations under the License. --> + +
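
For readers tracing how the restored legacy integration tests above pick their configs: parse_params loads every YAML file in the configs directory and keeps only those whose cadence field matches the CADENCE environment variable (which defaults to "commit"), so a plain commit-cadence run selects only gpt_neo.yaml. The following is a minimal sketch of that selection, not part of the patches themselves; it assumes the repository root is on the Python path, and the directory literal is the one hard-coded in the restored test_llms.py.

    import os

    from tests.deepsparse.transformers.pipelines.legacy.integration_tests.helpers import (
        parse_params,
    )

    # CADENCE defaults to "commit" inside parse_params; it is set explicitly
    # here only to make the selection visible. codegen.yaml and opt.yaml
    # declare cadence "nightly" and are skipped; gpt_neo.yaml declares
    # "commit" and is returned.
    os.environ["CADENCE"] = "commit"
    configs = parse_params(
        "tests/deepsparse/transformers/pipelines/legacy/integration_tests/configs"
    )
    assert [c["model_path"] for c in configs] == ["hf:mgoin/TinyStories-1M-ds"]
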

From 60fa00f0a86c97e47b402555f1adf2578b7e2e56 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 18 Dec 2023 17:31:58 +0000 Subject: [PATCH 54/57] original readme --- README.md | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/README.md b/README.md index 51788944c0..4ffcada6e2 100644 --- a/README.md +++ b/README.md @@ -14,22 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. --> - -
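
On the stub handling restored by the "bring back functionalities" patch at the start of this series: both zoo: and hf: model stubs now resolve through deepsparse.utils.onnx.model_to_path, and get_deployment_path returns the parent directory of the resolved ONNX file. The snippet below is an illustrative usage sketch rather than code from the patches; it assumes network access and reuses the hf: stub from the gpt_neo test config, mirroring the signatures shown in that patch.

    from deepsparse.transformers.helpers import get_deployment_path
    from deepsparse.utils.onnx import model_to_path

    # hf: stubs are fetched with huggingface_hub.snapshot_download and must
    # already contain the default ONNX file (MODEL_ONNX_NAME); zoo: stubs are
    # resolved through sparsezoo.
    onnx_path = model_to_path("hf:mgoin/TinyStories-1M-ds")

    # returns (deployment_directory, onnx_file_path) for the same stub
    deployment_dir, onnx_file = get_deployment_path("hf:mgoin/TinyStories-1M-ds")
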

From 9371990aaecd3815f49bebcc1aa0ac253550b50f Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Wed, 20 Dec 2023 10:58:19 +0000 Subject: [PATCH 55/57] addressing Dipikas comments --- .../pipelines/text_generation/generate_new_token.py | 8 +++++--- .../pipelines/text_generation/pipeline_no_kv_cache.py | 4 ++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py b/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py index 1eae6c138d..e45374b9de 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py +++ b/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py @@ -49,9 +49,11 @@ def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): callback = inference_state.current_state.get("callback") stop = inference_state.current_state.get("stop") - if kv_cache: - if kv_cache.total_num_processed_tokens >= kv_cache.capacity: - finish_reason = FinishReason.CAPACITY + if ( + kv_cache is not None + and kv_cache.total_num_processed_tokens >= kv_cache.capacity + ): + finish_reason = FinishReason.CAPACITY if token == self.tokenizer.eos_token_id and not self.force_max_tokens: finish_reason = FinishReason.STOP diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py index 7b5c7f67f0..de4e2a5838 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py @@ -24,6 +24,7 @@ GenerateNewTokenOperator, JoinOutput, NLEngineOperatorNoCache, + ParseTextGenerationInputs, PrepareGeneration, ProcessInputsTextGeneration, ProcessOutputs, @@ -64,6 +65,8 @@ def __init__( token_generator = TokenGeneratorOperator() + parse_inputs = ParseTextGenerationInputs() + process_inputs = ProcessInputsTextGeneration( generation_config=process_generation_config(generation_config), sequence_length=sequence_length, @@ -86,6 +89,7 @@ def __init__( process_outputs = ProcessOutputs(tokenizer=self.tokenizer) ops = { + "parse_inputs": parse_inputs, "process_input": process_inputs, "engine_operator": engine_operator, "prepare_generation": prepare_generation, From 4eed46399a309e8a4c17742bfc19c58f9c702984 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:00:36 +0100 Subject: [PATCH 56/57] Update src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py --- .../pipelines/text_generation/pipeline_no_kv_cache.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py index de4e2a5838..ffa8eeebac 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py +++ b/src/deepsparse/transformers/pipelines/text_generation/pipeline_no_kv_cache.py @@ -99,6 +99,7 @@ def __init__( "process_outputs": process_outputs, } routes = { + "parse_inputs": "process_input", "process_input": "SPLIT", "SPLIT": "engine_operator", "engine_operator": "prepare_generation", From 111d533248ed7b6d966e22e27eabe73c20a5b83f Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Thu, 21 Dec 2023 15:44:13 +0000 Subject: [PATCH 57/57] addressing PR review --- .../text_generation/generate_new_token.py | 22 ++++++++++++++----- 
.../text_generation/prep_for_generation.py | 21 +++++++++--------- .../schemas/text_generation_schemas.py | 8 +++++++ .../integration_tests/test_llms.py | 4 ++-- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py b/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py index e45374b9de..471a3d8dd2 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py +++ b/src/deepsparse/transformers/pipelines/text_generation/generate_new_token.py @@ -19,7 +19,10 @@ from deepsparse.transformers.pipelines.text_generation.nl_engine_operator import ( NLEngineOutputs, ) -from deepsparse.transformers.schemas.text_generation_schemas import FinishReason +from deepsparse.transformers.schemas.text_generation_schemas import ( + FinishReason, + PromptLogitsNoKVCacheInference, +) from deepsparse.utils import InferenceState @@ -33,14 +36,23 @@ def __init__( self.force_max_tokens = force_max_tokens self.tokenizer = tokenizer - def can_operate(self, inp: NLEngineOutputs): + def can_operate(self, inp: Union[PromptLogitsNoKVCacheInference, NLEngineOutputs]): if inp.in_generation: return True return False - def run(self, inp: NLEngineOutputs, inference_state: InferenceState, **kwargs): - logits = inp.engine_outputs - kv_cache = inp.kv_cache + def run( + self, + inp: Union[PromptLogitsNoKVCacheInference, NLEngineOutputs], + inference_state: InferenceState, + **kwargs, + ): + logits = ( + inp.engine_outputs + if isinstance(inp, NLEngineOutputs) + else inp.prompt_logits + ) + kv_cache = inp.kv_cache if isinstance(inp, NLEngineOutputs) else None token_generator = inference_state.current_state.get("token_generator") token = token_generator.generate(logits=logits[0, -1, :]) diff --git a/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py b/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py index df14398542..572840d13e 100644 --- a/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation/prep_for_generation.py @@ -18,10 +18,10 @@ from deepsparse.operators import Operator from deepsparse.transformers.pipelines.text_generation import TokenGeneratorOperator -from deepsparse.transformers.pipelines.text_generation.nl_engine_operator import ( - NLEngineOutputs, +from deepsparse.transformers.schemas.text_generation_schemas import ( + FinishReason, + PromptLogitsNoKVCacheInference, ) -from deepsparse.transformers.schemas.text_generation_schemas import FinishReason from deepsparse.transformers.utils.helpers import set_generated_length from deepsparse.utils import InferenceState @@ -94,13 +94,12 @@ def run( "finished_reason": [], "token_generator": token_generator, } - - output = { - "tokens": token_generator.tokens, - "kv_cache": kv_cache, - "in_generation": True, - } if kv_cache is None: - output = NLEngineOutputs(**output, engine_outputs=prompt_logits) - + output = PromptLogitsNoKVCacheInference(prompt_logits=prompt_logits) + else: + output = { + "tokens": token_generator.tokens, + "kv_cache": kv_cache, + "in_generation": True, + } return output, state_update diff --git a/src/deepsparse/transformers/schemas/text_generation_schemas.py b/src/deepsparse/transformers/schemas/text_generation_schemas.py index c3d9e28229..7c08aa8d80 100644 --- a/src/deepsparse/transformers/schemas/text_generation_schemas.py +++ 
b/src/deepsparse/transformers/schemas/text_generation_schemas.py @@ -165,3 +165,11 @@ class TextGenerationOutput(BaseModel): class Config: arbitrary_types_allowed = True extra = "allow" + + +class PromptLogitsNoKVCacheInference(BaseModel): + prompt_logits: Any = Field( + description="A set of prompt logits generated " + "during the inference pass with a " + "non-kv cache model" + ) diff --git a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py index 43dbfc107b..82a81d611c 100644 --- a/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py +++ b/tests/deepsparse/transformers/text_generation/integration_tests/test_llms.py @@ -15,7 +15,7 @@ This test suite consumes config files to test the text generation pipeline for various scenarios. -A sample config file is a yaml that r_equires the following fields: +A sample config file is a yaml that requires the following fields: cadence: The cadence of the tests. The available options are: "nightly", "weekly" and "commit". By default, only the tests that have cadence "commit" will be run @@ -42,10 +42,10 @@ import pytest from deepsparse.pipeline import Pipeline from deepsparse.transformers.pipelines.text_generation import ( - TextGenerationOutput, TextGenerationPipeline, TextGenerationPipelineNoCache, ) +from deepsparse.transformers.schemas.text_generation_schemas import TextGenerationOutput from tests.deepsparse.transformers.text_generation.integration_tests.helpers import ( TorchGroundTruthSource, parse_params,