Python/Local Hugging Face Inference for Completions and Embeddings (microsoft#658)
### Motivation and Context
This PR introduces native Python support for Hugging Face models that can
complete text, generate new text, summarize, and generate embeddings. It
currently only supports models downloaded locally from the HF model hub;
support for the HF inference API is planned as a follow-up.

### Description
- Added two services: `hf_text_completion` and `hf_text_embedding`
- `hf_text_completion` supports the following tasks: _text-generation_,
_text2text-generation_, and _summarization_
- `hf_text_embedding` supports any model supported by the
sentence-transformers pip package
- Added dependencies torch, transformers, and sentence-transformers to
`requirements.txt` and `poetry.lock`
- Fixed typo: `get_embedding_service_service_id` ->
`get_embedding_service_id`
- Added a number of integration tests for supported HF models (a minimal
usage sketch follows below)
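
For orientation, here is a minimal sketch of wiring the new completion service into a kernel; it mirrors the end-to-end tests added in this PR, and the model id is one used there:

```python
import semantic_kernel as sk
import semantic_kernel.connectors.ai.hugging_face as sk_hf

kernel = sk.Kernel()

# Register a local HF model as a text completion service; the model
# weights are downloaded from the Hugging Face model hub on first use.
kernel.config.add_text_service(
    "google/flan-t5-base",
    sk_hf.HuggingFaceTextCompletion("google/flan-t5-base", task="text2text-generation"),
)
```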
awharrison-28 authored Apr 27, 2023
1 parent 5b1ed2f commit f6059cd
Showing 26 changed files with 1,673 additions and 127 deletions.
7 changes: 4 additions & 3 deletions FEATURE_MATRIX.md
@@ -13,13 +13,14 @@
|---|---|---|---|
| OpenAI | ✅ | ✅ | |
| AzureOpenAI | ✅ | ✅ | |
| Hugging Face | ✅ | ❌ | Coming soon to Python - both native and web endpoint support |
| Custom | ✅ | ❌ | Requires the user to define the service schema in their application |
| Hugging Face Inference API | 🔄 | ❌ | Coming soon to Python, not all scenarios are covered for .NET |
| Hugging Face Local | ❌ | ✅ | |
| Custom | ✅ | 🔄 | Requires the user to define the service schema in their application |

## Tokenizers
| | C# | Python | Notes |
|---|---|---|---|
| GPT2 | ✅ | 🔄 | Can be manually added to Python via `pip install transformers` |
| GPT2 | ✅ | ✅ | |
| GPT3 | ✅ | ✅ | |
| tiktoken | 🔄 | ❌ | Coming soon to Python and C#. Can be manually added to Python via `pip install tiktoken` |

1,053 changes: 1,046 additions & 7 deletions python/poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions python/pyproject.toml
@@ -11,6 +11,9 @@ python = "^3.8"
numpy = "^1.24.2"
openai = "^0.27.0"
aiofiles = "^23.1.0"
transformers = "^4.28.1"
torch = "^2.0.0"
sentence-transformers = "^2.2.2"

[tool.poetry.group.dev.dependencies]
pre-commit = "^2.21.0"
5 changes: 4 additions & 1 deletion python/requirements.txt
@@ -1,3 +1,6 @@
openai==0.27.*
numpy==1.24.*
aiofiles>=23.1.0
aiofiles>=23.1.0
transformers>=4.28.0
sentence-transformers>=2.2.2
torch>=2.0.0
10 changes: 10 additions & 0 deletions python/semantic_kernel/connectors/ai/hugging_face/__init__.py
@@ -0,0 +1,10 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion import (
    HuggingFaceTextCompletion,
)
from semantic_kernel.connectors.ai.hugging_face.services.hf_text_embedding import (
    HuggingFaceTextEmbedding,
)

__all__ = ["HuggingFaceTextCompletion", "HuggingFaceTextEmbedding"]
95 changes: 95 additions & 0 deletions python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_completion.py
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft. All rights reserved.

from logging import Logger
from typing import Optional

import torch
from transformers import pipeline

from semantic_kernel.connectors.ai.ai_exception import AIException
from semantic_kernel.connectors.ai.complete_request_settings import (
    CompleteRequestSettings,
)
from semantic_kernel.connectors.ai.text_completion_client_base import (
    TextCompletionClientBase,
)
from semantic_kernel.utils.null_logger import NullLogger


class HuggingFaceTextCompletion(TextCompletionClientBase):
    _model_id: str
    _task: str
    _device: int
    _log: Logger

    def __init__(
        self,
        model_id: str,
        device: Optional[int] = -1,
        task: Optional[str] = None,
        log: Optional[Logger] = None,
    ) -> None:
        """
        Initializes a new instance of the HuggingFaceTextCompletion class.

        Arguments:
            model_id {str} -- Hugging Face model card string, see
                https://huggingface.co/models
            device {Optional[int]} -- Device to run the model on: -1 for CPU, 0 and above for GPU.
            task {Optional[str]} -- Model completion task type; options are:
                - summarization: takes a long text and returns a shorter summary.
                - text-generation: takes incomplete text and returns a set of completion candidates.
                - text2text-generation (default): takes an input prompt and returns a completion.
                  This is the default because its prompt-in/completion-out behavior is the
                  closest match to GPT-3-style completion.
            log {Optional[Logger]} -- Logger instance.

        Note that the model will be downloaded from the Hugging Face model hub.
        """
        self._model_id = model_id
        self._task = "text2text-generation" if task is None else task
        self._log = log if log is not None else NullLogger()
        # Build a torch device string: "cuda:<n>" when a GPU is requested and available.
        self.device = (
            f"cuda:{device}" if device >= 0 and torch.cuda.is_available() else "cpu"
        )
        self.generator = pipeline(
            task=self._task, model=self._model_id, device=self.device
        )

    async def complete_async(
        self, prompt: str, request_settings: CompleteRequestSettings
    ) -> str:
        """
        Completes a prompt using the Hugging Face model.

        Arguments:
            prompt {str} -- Prompt to complete.
            request_settings {CompleteRequestSettings} -- Request settings.

        Returns:
            str -- Completion result.
        """
        try:
            result = self.generator(
                prompt,
                num_return_sequences=1,
                temperature=request_settings.temperature,
                top_p=request_settings.top_p,
                max_length=request_settings.max_tokens,
                pad_token_id=50256,  # GPT-2 EOS token, set to silence padding warnings
            )

            if self._task in ("text-generation", "text2text-generation"):
                return result[0]["generated_text"]

            elif self._task == "summarization":
                return result[0]["summary_text"]

            else:
                raise AIException(
                    AIException.ErrorCodes.InvalidConfiguration,
                    "Unsupported hugging face pipeline task: only "
                    "text-generation, text2text-generation, and summarization are supported.",
                )

        except Exception as e:
            raise AIException(
                AIException.ErrorCodes.ServiceError, "Hugging Face completion failed", e
            )
63 changes: 63 additions & 0 deletions python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_embedding.py
@@ -0,0 +1,63 @@
# Copyright (c) Microsoft. All rights reserved.

from logging import Logger
from typing import List, Optional

import torch
from numpy import array, ndarray
from sentence_transformers import SentenceTransformer

from semantic_kernel.connectors.ai.ai_exception import AIException
from semantic_kernel.connectors.ai.embeddings.embedding_generator_base import (
    EmbeddingGeneratorBase,
)
from semantic_kernel.utils.null_logger import NullLogger


class HuggingFaceTextEmbedding(EmbeddingGeneratorBase):
    _model_id: str
    _device: int
    _log: Logger

    def __init__(
        self,
        model_id: str,
        device: Optional[int] = -1,
        log: Optional[Logger] = None,
    ) -> None:
        """
        Initializes a new instance of the HuggingFaceTextEmbedding class.

        Arguments:
            model_id {str} -- Hugging Face model card string, see
                https://huggingface.co/sentence-transformers
            device {Optional[int]} -- Device to run the model on: -1 for CPU, 0 and above for GPU.
            log {Optional[Logger]} -- Logger instance.

        Note that the model will be downloaded from the Hugging Face model hub.
        """
        self._model_id = model_id
        self._log = log if log is not None else NullLogger()
        # Build a torch device string: "cuda:<n>" when a GPU is requested and available.
        self.device = (
            f"cuda:{device}" if device >= 0 and torch.cuda.is_available() else "cpu"
        )
        self.generator = SentenceTransformer(
            model_name_or_path=self._model_id, device=self.device
        )

    async def generate_embeddings_async(self, texts: List[str]) -> ndarray:
        """
        Generates embeddings for a list of texts.

        Arguments:
            texts {List[str]} -- Texts to generate embeddings for.

        Returns:
            ndarray -- Embeddings for the texts.
        """
        try:
            self._log.info(f"Generating embeddings for {len(texts)} texts")
            embeddings = self.generator.encode(texts)
            return array(embeddings)
        except Exception as e:
            raise AIException(
                AIException.ErrorCodes.ServiceError, "Hugging Face embeddings failed", e
            )
@@ -125,7 +125,7 @@ async def complete_chat_async(

return response.choices[0].message.content

async def complete_simple_async(
async def complete_async(
self, prompt: str, request_settings: CompleteRequestSettings
) -> str:
"""
@@ -54,7 +54,7 @@ def _setup_open_ai(self) -> Any:

return openai

async def complete_simple_async(
async def complete_async(
self, prompt: str, request_settings: CompleteRequestSettings
) -> str:
"""
@@ -83,14 +83,14 @@ async def complete_simple_async(
if request_settings.number_of_responses != 1:
raise AIException(
AIException.ErrorCodes.InvalidRequest,
"complete_simple_async only supports a single completion, "
"complete_async only supports a single completion, "
f"but {request_settings.number_of_responses} were requested",
)

if request_settings.logprobs != 0:
raise AIException(
AIException.ErrorCodes.InvalidRequest,
"complete_simple_async does not support logprobs, "
"complete_async does not support logprobs, "
f"but logprobs={request_settings.logprobs} was requested",
)

2 changes: 1 addition & 1 deletion python/semantic_kernel/connectors/ai/text_completion_client_base.py
@@ -12,7 +12,7 @@

class TextCompletionClientBase(ABC):
@abstractmethod
async def complete_simple_async(
async def complete_async(
self,
prompt: str,
settings: "CompleteRequestSettings",
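
Since `complete_simple_async` is renamed to `complete_async` on the abstract base, custom connectors must rename their overrides as well. A minimal sketch of a conforming implementation (the `EchoCompletion` class and its behavior are purely illustrative):

```python
from semantic_kernel.connectors.ai.complete_request_settings import (
    CompleteRequestSettings,
)
from semantic_kernel.connectors.ai.text_completion_client_base import (
    TextCompletionClientBase,
)


class EchoCompletion(TextCompletionClientBase):
    async def complete_async(
        self, prompt: str, settings: "CompleteRequestSettings"
    ) -> str:
        # Toy behavior: echo the prompt truncated to the token budget
        # (characters stand in for tokens in this sketch).
        return prompt[: settings.max_tokens]
```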
2 changes: 1 addition & 1 deletion python/semantic_kernel/kernel_config.py
@@ -192,7 +192,7 @@ def get_chat_service_service_id(self, service_id: Optional[str] = None) -> str:

return service_id

def get_embedding_service_service_id(self, service_id: Optional[str] = None) -> str:
def get_embedding_service_id(self, service_id: Optional[str] = None) -> str:
if service_id is None or service_id not in self._embedding_services:
if self._default_embedding_service is None:
raise ValueError("No default embedding service is set")
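
The renamed lookup keeps its fallback behavior: with no id given and no default embedding service registered, it raises. A quick sketch, assuming `KernelConfig` constructs with no arguments:

```python
from semantic_kernel.kernel_config import KernelConfig

config = KernelConfig()
try:
    config.get_embedding_service_id()
except ValueError as err:
    print(err)  # "No default embedding service is set"
```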
4 changes: 1 addition & 3 deletions python/semantic_kernel/orchestration/sk_function.py
@@ -131,9 +131,7 @@ async def _local_func(client, request_settings, context):
context.variables.update(completion)
else:
prompt = await function_config.prompt_template.render_async(context)
completion = await client.complete_simple_async(
prompt, request_settings
)
completion = await client.complete_async(prompt, request_settings)
context.variables.update(completion)
except Exception as e:
# TODO: "critical exceptions"
4 changes: 1 addition & 3 deletions python/semantic_kernel/text/__init__.py
@@ -1,8 +1,6 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.text.function_extension import (
aggregate_chunked_results_async,
)
from semantic_kernel.text.function_extension import aggregate_chunked_results_async
from semantic_kernel.text.text_chunker import (
split_markdown_lines,
split_markdown_paragraph,
4 changes: 2 additions & 2 deletions python/tests/end-to-end/basics_with_azure_oai_chat_service.py
@@ -2,7 +2,7 @@

import asyncio

from utils import e2e_summarization
from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -16,4 +16,4 @@
"chat-gpt", sk_oai.AzureChatCompletion("gpt-35-turbo", endpoint, api_key)
)

asyncio.run(e2e_summarization.summarize_function_test(kernel))
asyncio.run(e2e_text_completion.summarize_function_test(kernel))
4 changes: 2 additions & 2 deletions python/tests/end-to-end/basics_with_azure_oai_text_service.py
@@ -2,7 +2,7 @@

import asyncio

from utils import e2e_summarization
from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -17,4 +17,4 @@
"davinci-003", sk_oai.AzureTextCompletion("text-davinci-003", endpoint, api_key)
)

asyncio.run(e2e_summarization.summarize_function_test(kernel))
asyncio.run(e2e_text_completion.summarize_function_test(kernel))
18 changes: 18 additions & 0 deletions python/tests/end-to-end/basics_with_hf_local_text2text_service.py
@@ -0,0 +1,18 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio

from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.hugging_face as sk_hf

kernel = sk.Kernel()

# Configure LLM service
kernel.config.add_text_service(
    "google/flan-t5-base",
    sk_hf.HuggingFaceTextCompletion("google/flan-t5-base", task="text2text-generation"),
)

asyncio.run(e2e_text_completion.simple_completion(kernel))
@@ -0,0 +1,17 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio

from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.hugging_face as sk_hf

kernel = sk.Kernel()

# Configure LLM service
kernel.config.add_text_service(
    "gpt2", sk_hf.HuggingFaceTextCompletion("gpt2", task="text-generation")
)

asyncio.run(e2e_text_completion.simple_completion(kernel))
18 changes: 18 additions & 0 deletions python/tests/end-to-end/basics_with_hf_local_text_summarization.py
@@ -0,0 +1,18 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio

from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.hugging_face as sk_hf

kernel = sk.Kernel()

# Configure LLM service
kernel.config.add_text_service(
    "facebook/bart-large-cnn",
    sk_hf.HuggingFaceTextCompletion("facebook/bart-large-cnn", task="summarization"),
)

asyncio.run(e2e_text_completion.simple_summarization(kernel))
4 changes: 2 additions & 2 deletions python/tests/end-to-end/basics_with_oai_chat_service.py
@@ -2,7 +2,7 @@

import asyncio

from utils import e2e_summarization
from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -16,4 +16,4 @@
"chat-gpt", sk_oai.OpenAIChatCompletion("gpt-3.5-turbo", api_key, org_id)
)

asyncio.run(e2e_summarization.summarize_function_test(kernel))
asyncio.run(e2e_text_completion.summarize_function_test(kernel))
4 changes: 2 additions & 2 deletions python/tests/end-to-end/basics_with_oai_text_service.py
@@ -2,7 +2,7 @@

import asyncio

from utils import e2e_summarization
from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -17,4 +17,4 @@
"davinci-003", sk_oai.OpenAITextCompletion("text-davinci-003", api_key, org_id)
)

asyncio.run(e2e_summarization.summarize_function_test(kernel))
asyncio.run(e2e_text_completion.summarize_function_test(kernel))