Python/Local Hugging Face Inference for Completions and Embeddings (microsoft#658)
### Motivation and Context
This PR introduces native Python support for Hugging Face models that can
complete text, generate new text, summarize, and generate embeddings. It
currently only supports models downloaded locally from the HF model hub;
support for the HF inference API is planned as a follow-up.

### Description
- Added two services: `hf_text_completion` and `hf_text_embedding`
- `hf_text_completion` supports the following tasks: _text-generation_,
_text2text-generation_, and _summarization_
- `hf_text_embedding` supports any model supported by the
sentence-transformers pip package
- Added dependencies torch, transformers, and sentence-transformers to
`requirements.txt` and `poetry.lock`
- Fixed typo: `get_embedding_service_service_id` ->
`get_embedding_service_id`
- Added a number of integration tests for supported HF models (a minimal
usage sketch follows below)
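
For orientation, here is a minimal sketch of wiring the new completion service into a kernel; it mirrors the end-to-end tests added in this PR, and the model id is one used there:

```python
import semantic_kernel as sk
import semantic_kernel.connectors.ai.hugging_face as sk_hf

kernel = sk.Kernel()

# Register a local HF model as a text completion service; the model
# weights are downloaded from the Hugging Face model hub on first use.
kernel.config.add_text_service(
    "google/flan-t5-base",
    sk_hf.HuggingFaceTextCompletion("google/flan-t5-base", task="text2text-generation"),
)
```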
awharrison-28 authored Apr 27, 2023
1 parent 5b1ed2f commit f6059cd
Showing 26 changed files with 1,673 additions and 127 deletions.
7 changes: 4 additions & 3 deletions FEATURE_MATRIX.md
@@ -13,13 +13,14 @@
|---|---|---|---|
| OpenAI | ✅ | ✅ | |
| AzureOpenAI | ✅ | ✅ | |
| Hugging Face | ✅ | ❌ | Coming soon to Python - both native and web endpoint support |
| Custom | ✅ | ❌ | Requires the user to define the service schema in their application |
| Hugging Face Inference API | 🔄 | ❌ | Coming soon to Python, not all scenarios are covered for .NET |
| Hugging Face Local | ❌ | ✅ | |
| Custom | ✅ | 🔄 | Requires the user to define the service schema in their application |

## Tokenizers
| | C# | Python | Notes |
|---|---|---|---|
| GPT2 | ✅ | 🔄 | Can be manually added to Python via `pip install transformers` |
| GPT2 | ✅ | ✅ | |
| GPT3 | ✅ | ✅ | |
| tiktoken | 🔄 | ❌ | Coming soon to Python and C#. Can be manually added to Python via `pip install tiktoken` |

1,053 changes: 1,046 additions & 7 deletions python/poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions python/pyproject.toml
@@ -11,6 +11,9 @@ python = "^3.8"
numpy = "^1.24.2"
openai = "^0.27.0"
aiofiles = "^23.1.0"
transformers = "^4.28.1"
torch = "^2.0.0"
sentence-transformers = "^2.2.2"

[tool.poetry.group.dev.dependencies]
pre-commit = "^2.21.0"
5 changes: 4 additions & 1 deletion python/requirements.txt
@@ -1,3 +1,6 @@
openai==0.27.*
numpy==1.24.*
aiofiles>=23.1.0
aiofiles>=23.1.0
transformers>=4.28.0
sentence-transformers>=2.2.2
torch>=2.0.0
10 changes: 10 additions & 0 deletions python/semantic_kernel/connectors/ai/hugging_face/__init__.py
@@ -0,0 +1,10 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.connectors.ai.hugging_face.services.hf_text_completion import (
    HuggingFaceTextCompletion,
)
from semantic_kernel.connectors.ai.hugging_face.services.hf_text_embedding import (
    HuggingFaceTextEmbedding,
)

__all__ = ["HuggingFaceTextCompletion", "HuggingFaceTextEmbedding"]
95 changes: 95 additions & 0 deletions python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_completion.py
@@ -0,0 +1,95 @@
# Copyright (c) Microsoft. All rights reserved.

from logging import Logger
from typing import Optional

import torch
from transformers import pipeline

from semantic_kernel.connectors.ai.ai_exception import AIException
from semantic_kernel.connectors.ai.complete_request_settings import (
    CompleteRequestSettings,
)
from semantic_kernel.connectors.ai.text_completion_client_base import (
    TextCompletionClientBase,
)
from semantic_kernel.utils.null_logger import NullLogger


class HuggingFaceTextCompletion(TextCompletionClientBase):
    _model_id: str
    _task: str
    _device: int
    _log: Logger

    def __init__(
        self,
        model_id: str,
        device: Optional[int] = -1,
        task: Optional[str] = None,
        log: Optional[Logger] = None,
    ) -> None:
        """
        Initializes a new instance of the HuggingFaceTextCompletion class.

        Arguments:
            model_id {str} -- Hugging Face model card string, see
                https://huggingface.co/models
            device {Optional[int]} -- Device to run the model on: -1 for CPU, 0 and above for GPU.
            task {Optional[str]} -- Model completion task type; options are:
                - summarization: takes a long text and returns a shorter summary.
                - text-generation: takes incomplete text and returns a set of completion candidates.
                - text2text-generation (default): takes an input prompt and returns a completion.
                  This is the default because its prompt-in/completion-out behavior is the
                  closest match to GPT-3-style completion.
            log {Optional[Logger]} -- Logger instance.

        Note that the model will be downloaded from the Hugging Face model hub.
        """
        self._model_id = model_id
        self._task = "text2text-generation" if task is None else task
        self._log = log if log is not None else NullLogger()
        # Build a torch device string: "cuda:<n>" when a GPU is requested and available.
        self.device = (
            f"cuda:{device}" if device >= 0 and torch.cuda.is_available() else "cpu"
        )
        self.generator = pipeline(
            task=self._task, model=self._model_id, device=self.device
        )

    async def complete_async(
        self, prompt: str, request_settings: CompleteRequestSettings
    ) -> str:
        """
        Completes a prompt using the Hugging Face model.

        Arguments:
            prompt {str} -- Prompt to complete.
            request_settings {CompleteRequestSettings} -- Request settings.

        Returns:
            str -- Completion result.
        """
        try:
            result = self.generator(
                prompt,
                num_return_sequences=1,
                temperature=request_settings.temperature,
                top_p=request_settings.top_p,
                max_length=request_settings.max_tokens,
                pad_token_id=50256,  # GPT-2 EOS token, set to silence padding warnings
            )

            if self._task in ("text-generation", "text2text-generation"):
                return result[0]["generated_text"]

            elif self._task == "summarization":
                return result[0]["summary_text"]

            else:
                raise AIException(
                    AIException.ErrorCodes.InvalidConfiguration,
                    "Unsupported hugging face pipeline task: only "
                    "text-generation, text2text-generation, and summarization are supported.",
                )

        except Exception as e:
            raise AIException(
                AIException.ErrorCodes.ServiceError, "Hugging Face completion failed", e
            )
63 changes: 63 additions & 0 deletions python/semantic_kernel/connectors/ai/hugging_face/services/hf_text_embedding.py
@@ -0,0 +1,63 @@
# Copyright (c) Microsoft. All rights reserved.

from logging import Logger
from typing import List, Optional

import torch
from numpy import array, ndarray
from sentence_transformers import SentenceTransformer

from semantic_kernel.connectors.ai.ai_exception import AIException
from semantic_kernel.connectors.ai.embeddings.embedding_generator_base import (
    EmbeddingGeneratorBase,
)
from semantic_kernel.utils.null_logger import NullLogger


class HuggingFaceTextEmbedding(EmbeddingGeneratorBase):
    _model_id: str
    _device: int
    _log: Logger

    def __init__(
        self,
        model_id: str,
        device: Optional[int] = -1,
        log: Optional[Logger] = None,
    ) -> None:
        """
        Initializes a new instance of the HuggingFaceTextEmbedding class.

        Arguments:
            model_id {str} -- Hugging Face model card string, see
                https://huggingface.co/sentence-transformers
            device {Optional[int]} -- Device to run the model on: -1 for CPU, 0 and above for GPU.
            log {Optional[Logger]} -- Logger instance.

        Note that the model will be downloaded from the Hugging Face model hub.
        """
        self._model_id = model_id
        self._log = log if log is not None else NullLogger()
        # Build a torch device string: "cuda:<n>" when a GPU is requested and available.
        self.device = (
            f"cuda:{device}" if device >= 0 and torch.cuda.is_available() else "cpu"
        )
        self.generator = SentenceTransformer(
            model_name_or_path=self._model_id, device=self.device
        )

    async def generate_embeddings_async(self, texts: List[str]) -> ndarray:
        """
        Generates embeddings for a list of texts.

        Arguments:
            texts {List[str]} -- Texts to generate embeddings for.

        Returns:
            ndarray -- Embeddings for the texts.
        """
        try:
            self._log.info(f"Generating embeddings for {len(texts)} texts")
            embeddings = self.generator.encode(texts)
            return array(embeddings)
        except Exception as e:
            raise AIException(
                AIException.ErrorCodes.ServiceError, "Hugging Face embeddings failed", e
            )
@@ -125,7 +125,7 @@ async def complete_chat_async(

return response.choices[0].message.content

async def complete_simple_async(
async def complete_async(
self, prompt: str, request_settings: CompleteRequestSettings
) -> str:
"""
@@ -54,7 +54,7 @@ def _setup_open_ai(self) -> Any:

return openai

async def complete_simple_async(
async def complete_async(
self, prompt: str, request_settings: CompleteRequestSettings
) -> str:
"""
@@ -83,14 +83,14 @@ async def complete_simple_async(
if request_settings.number_of_responses != 1:
raise AIException(
AIException.ErrorCodes.InvalidRequest,
"complete_simple_async only supports a single completion, "
"complete_async only supports a single completion, "
f"but {request_settings.number_of_responses} were requested",
)

if request_settings.logprobs != 0:
raise AIException(
AIException.ErrorCodes.InvalidRequest,
"complete_simple_async does not support logprobs, "
"complete_async does not support logprobs, "
f"but logprobs={request_settings.logprobs} was requested",
)

2 changes: 1 addition & 1 deletion python/semantic_kernel/connectors/ai/text_completion_client_base.py
@@ -12,7 +12,7 @@

class TextCompletionClientBase(ABC):
@abstractmethod
async def complete_simple_async(
async def complete_async(
self,
prompt: str,
settings: "CompleteRequestSettings",
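
Since `complete_simple_async` is renamed to `complete_async` on the abstract base, custom connectors must rename their overrides as well. A minimal sketch of a conforming implementation (the `EchoCompletion` class and its behavior are purely illustrative):

```python
from semantic_kernel.connectors.ai.complete_request_settings import (
    CompleteRequestSettings,
)
from semantic_kernel.connectors.ai.text_completion_client_base import (
    TextCompletionClientBase,
)


class EchoCompletion(TextCompletionClientBase):
    async def complete_async(
        self, prompt: str, settings: "CompleteRequestSettings"
    ) -> str:
        # Toy behavior: echo the prompt truncated to the token budget
        # (characters stand in for tokens in this sketch).
        return prompt[: settings.max_tokens]
```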
2 changes: 1 addition & 1 deletion python/semantic_kernel/kernel_config.py
@@ -192,7 +192,7 @@ def get_chat_service_service_id(self, service_id: Optional[str] = None) -> str:

return service_id

def get_embedding_service_service_id(self, service_id: Optional[str] = None) -> str:
def get_embedding_service_id(self, service_id: Optional[str] = None) -> str:
if service_id is None or service_id not in self._embedding_services:
if self._default_embedding_service is None:
raise ValueError("No default embedding service is set")
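
The renamed lookup keeps its fallback behavior: with no id given and no default embedding service registered, it raises. A quick sketch, assuming `KernelConfig` constructs with no arguments:

```python
from semantic_kernel.kernel_config import KernelConfig

config = KernelConfig()
try:
    config.get_embedding_service_id()
except ValueError as err:
    print(err)  # "No default embedding service is set"
```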
4 changes: 1 addition & 3 deletions python/semantic_kernel/orchestration/sk_function.py
@@ -131,9 +131,7 @@ async def _local_func(client, request_settings, context):
context.variables.update(completion)
else:
prompt = await function_config.prompt_template.render_async(context)
completion = await client.complete_simple_async(
prompt, request_settings
)
completion = await client.complete_async(prompt, request_settings)
context.variables.update(completion)
except Exception as e:
# TODO: "critical exceptions"
4 changes: 1 addition & 3 deletions python/semantic_kernel/text/__init__.py
@@ -1,8 +1,6 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.text.function_extension import (
aggregate_chunked_results_async,
)
from semantic_kernel.text.function_extension import aggregate_chunked_results_async
from semantic_kernel.text.text_chunker import (
split_markdown_lines,
split_markdown_paragraph,
4 changes: 2 additions & 2 deletions python/tests/end-to-end/basics_with_azure_oai_chat_service.py
@@ -2,7 +2,7 @@

import asyncio

from utils import e2e_summarization
from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -16,4 +16,4 @@
"chat-gpt", sk_oai.AzureChatCompletion("gpt-35-turbo", endpoint, api_key)
)

asyncio.run(e2e_summarization.summarize_function_test(kernel))
asyncio.run(e2e_text_completion.summarize_function_test(kernel))
4 changes: 2 additions & 2 deletions python/tests/end-to-end/basics_with_azure_oai_text_service.py
@@ -2,7 +2,7 @@

import asyncio

from utils import e2e_summarization
from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -17,4 +17,4 @@
"davinci-003", sk_oai.AzureTextCompletion("text-davinci-003", endpoint, api_key)
)

asyncio.run(e2e_summarization.summarize_function_test(kernel))
asyncio.run(e2e_text_completion.summarize_function_test(kernel))
18 changes: 18 additions & 0 deletions python/tests/end-to-end/basics_with_hf_local_text2text_service.py
@@ -0,0 +1,18 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio

from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.hugging_face as sk_hf

kernel = sk.Kernel()

# Configure LLM service
kernel.config.add_text_service(
    "google/flan-t5-base",
    sk_hf.HuggingFaceTextCompletion("google/flan-t5-base", task="text2text-generation"),
)

asyncio.run(e2e_text_completion.simple_completion(kernel))
@@ -0,0 +1,17 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio

from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.hugging_face as sk_hf

kernel = sk.Kernel()

# Configure LLM service
kernel.config.add_text_service(
    "gpt2", sk_hf.HuggingFaceTextCompletion("gpt2", task="text-generation")
)

asyncio.run(e2e_text_completion.simple_completion(kernel))
18 changes: 18 additions & 0 deletions python/tests/end-to-end/basics_with_hf_local_text_summarization.py
@@ -0,0 +1,18 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio

from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.hugging_face as sk_hf

kernel = sk.Kernel()

# Configure LLM service
kernel.config.add_text_service(
    "facebook/bart-large-cnn",
    sk_hf.HuggingFaceTextCompletion("facebook/bart-large-cnn", task="summarization"),
)

asyncio.run(e2e_text_completion.simple_summarization(kernel))
4 changes: 2 additions & 2 deletions python/tests/end-to-end/basics_with_oai_chat_service.py
@@ -2,7 +2,7 @@

import asyncio

from utils import e2e_summarization
from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -16,4 +16,4 @@
"chat-gpt", sk_oai.OpenAIChatCompletion("gpt-3.5-turbo", api_key, org_id)
)

asyncio.run(e2e_summarization.summarize_function_test(kernel))
asyncio.run(e2e_text_completion.summarize_function_test(kernel))
4 changes: 2 additions & 2 deletions python/tests/end-to-end/basics_with_oai_text_service.py
@@ -2,7 +2,7 @@

import asyncio

from utils import e2e_summarization
from utils import e2e_text_completion

import semantic_kernel as sk
import semantic_kernel.connectors.ai.open_ai as sk_oai
@@ -17,4 +17,4 @@
"davinci-003", sk_oai.OpenAITextCompletion("text-davinci-003", api_key, org_id)
)

asyncio.run(e2e_summarization.summarize_function_test(kernel))
asyncio.run(e2e_text_completion.summarize_function_test(kernel))