From 0f067b1fe90099006837b889c432433e8e686fa0 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 18:32:22 +0300 Subject: [PATCH 01/18] refactor templates storing --- .../evolution/chat_templates/abstract.py | 43 +++++++++++++++++++ .../evolution/chat_templates/abstract.yaml | 33 -------------- .../chat_templates/base_instruction.txt | 3 -- .../evolution/chat_templates/concrete.py | 40 +++++++++++++++++ .../chat_templates/concretizing.yaml | 33 -------------- .../evolution/chat_templates/reasoning.py | 43 +++++++++++++++++++ .../evolution/chat_templates/reasoning.yaml | 33 -------------- .../evolution/chat_templates/schemas.py | 16 +++++++ 8 files changed, 142 insertions(+), 102 deletions(-) create mode 100644 autointent/generation/utterances/evolution/chat_templates/abstract.py delete mode 100644 autointent/generation/utterances/evolution/chat_templates/abstract.yaml delete mode 100644 autointent/generation/utterances/evolution/chat_templates/base_instruction.txt create mode 100644 autointent/generation/utterances/evolution/chat_templates/concrete.py delete mode 100644 autointent/generation/utterances/evolution/chat_templates/concretizing.yaml create mode 100644 autointent/generation/utterances/evolution/chat_templates/reasoning.py delete mode 100644 autointent/generation/utterances/evolution/chat_templates/reasoning.yaml create mode 100644 autointent/generation/utterances/evolution/chat_templates/schemas.py diff --git a/autointent/generation/utterances/evolution/chat_templates/abstract.py b/autointent/generation/utterances/evolution/chat_templates/abstract.py new file mode 100644 index 00000000..0bdb2a88 --- /dev/null +++ b/autointent/generation/utterances/evolution/chat_templates/abstract.py @@ -0,0 +1,43 @@ +"""Chat template for evolution augmentation via abstractization.""" + +from typing import ClassVar + +from autointent.schemas import Intent + +from .schemas import Message, Role + + +class AbstractEvolution: + """Chat template for evolution 
augmentation via abstraction.""" + + _messages: ClassVar[list[Message]] = [ + Message( + role=Role.USER, + content=( + "I want you to act as a rewriter. " + "You will be provided with an utterance and the topic (name of intent class) of the utterance. " + "You need to complicate the utterance using the following method:\n" + "1. Rewrite the utterance by removing specific inquiries or replacing with more generic.\n" + "2. Rewritten utterance should be concise and understandable by humans.\n" + "3. Rewritten utterance must be fully answerable.\n" + "4. Rewritten utterance should not contain more than 10 words.\n\n" + "Intent name: Reserve Restaurant" + "Utterance: I want to reserve a table for 4 persons at 9 pm.", + ), + ), + Message(role=Role.ASSISTANT, content="Please, reserve a table for me."), + Message( + role=Role.ASSISTANT, + content=( + "Intent name: requesting technical support" + "Utterance: My Lenovo laptop is constantly rebooting and overheating." + ), + ), + Message(role=Role.ASSISTANT, content="I'm having trouble with my laptop."), + ] + + def __call__(self, utterance: str, intent_data: Intent) -> str: + """Make chat to complete.""" + return self._messages + Message( + role=Role.USER, content=f"Intent name: {intent_data.name}\nUtterance: {utterance}" + ) diff --git a/autointent/generation/utterances/evolution/chat_templates/abstract.yaml b/autointent/generation/utterances/evolution/chat_templates/abstract.yaml deleted file mode 100644 index 37ddc558..00000000 --- a/autointent/generation/utterances/evolution/chat_templates/abstract.yaml +++ /dev/null @@ -1,33 +0,0 @@ -- role: user - content: | - {base_instruction} - 1. Rewrite the utterance by removing specific inquiries or replacing with more generic. - 2. Rewritten utterance should be concise and understandable by humans. - 3. Rewritten utterance must be fully answerable. - 4. Rewritten utterance should not contain more than 10 words. 
- - Intent Class: - Reserve Restaurant - - Utterance: - I want to reserve a table for 4 persons at 9 pm. -- role: assistant - content: | - Please, reserve a table for be. -- role: user - content: | - Intent Class: - requesting technical support - - Utterance: - My Lenovo laptop is constantly rebooting and overheating. -- role: assistant - content: | - I'm having trouble with my laptop. -- role: user - content: | - Intent Class: - {intent_name} - - Utterance: - {utterance} diff --git a/autointent/generation/utterances/evolution/chat_templates/base_instruction.txt b/autointent/generation/utterances/evolution/chat_templates/base_instruction.txt deleted file mode 100644 index e2b498a3..00000000 --- a/autointent/generation/utterances/evolution/chat_templates/base_instruction.txt +++ /dev/null @@ -1,3 +0,0 @@ -I want you to act as a rewriter. - You will be provided with an utterance and the topic (name of intent class) of the utterance. - You need to complicate the utterance using the following method: diff --git a/autointent/generation/utterances/evolution/chat_templates/concrete.py b/autointent/generation/utterances/evolution/chat_templates/concrete.py new file mode 100644 index 00000000..fc0a873d --- /dev/null +++ b/autointent/generation/utterances/evolution/chat_templates/concrete.py @@ -0,0 +1,40 @@ +"""Chat template for evolution augmentation via concretizing.""" + +from typing import ClassVar + +from autointent.schemas import Intent + +from .schemas import Message, Role + + +class ConcreteEvolution: + """Chat template for evolution augmentation via concretizing.""" + + _messages: ClassVar[list[Message]] = [ + Message( + role=Role.USER, + content=( + "I want you to act as a rewriter. " + "You will be provided with an utterance and the topic (name of intent class) of the utterance. " + "You need to complicate the utterance using the following method:\n" + "1. Rewrite the utterance by removing specific inquiries or replacing with more generic.\n" + "2. 
Rewritten utterance should be concise and understandable by humans.\n" + "3. Rewritten utterance must be fully answerable.\n" + "4. Rewritten utterance should not contain more than 10 words.\n\n" + "Intent name: Reserve Restaurant\n" + "Utterance: I want to make a reservation for dinner tonight." + ), + ), + Message(role=Role.ASSISTANT, content="I want to reserve a table for 4 persons at 9 pm."), + Message( + role=Role.USER, + content=("Intent name: requesting technical support\n" "Utterance: I'm having trouble with my laptop."), + ), + Message(role=Role.ASSISTANT, content="My laptop is constantly rebooting and overheating."), + ] + + def __call__(self, utterance: str, intent_data: Intent) -> str: + """Make chat to complete.""" + return self._messages + Message( + role=Role.USER, content=f"Intent name: {intent_data.name}\nUtterance: {utterance}" + ) diff --git a/autointent/generation/utterances/evolution/chat_templates/concretizing.yaml b/autointent/generation/utterances/evolution/chat_templates/concretizing.yaml deleted file mode 100644 index 67037448..00000000 --- a/autointent/generation/utterances/evolution/chat_templates/concretizing.yaml +++ /dev/null @@ -1,33 +0,0 @@ -- role: user - content: | - {base_instruction} - 1. Rewrite the utterance by removing specific inquiries or replacing with more generic. - 2. Rewritten utterance should be concise and understandable by humans. - 3. Rewritten utterance must be fully answerable. - 4. Rewritten utterance should not contain more than 10 words. - - Intent Class: - Reserve Restaurant - - Utterance: - I want to make a reservation for dinner tonight. -- role: assistant - content: | - I want to reserve a table for 4 persons at 9 pm. -- role: user - content: | - Intent Class: - requesting technical support - - Utterance: - I'm having trouble with my laptop. -- role: assistant - content: | - My laptop is constantly rebooting and overheating. 
-- role: user - content: | - Intent Class: - {intent_name} - - Utterance: - {utterance} diff --git a/autointent/generation/utterances/evolution/chat_templates/reasoning.py b/autointent/generation/utterances/evolution/chat_templates/reasoning.py new file mode 100644 index 00000000..52dfbbb0 --- /dev/null +++ b/autointent/generation/utterances/evolution/chat_templates/reasoning.py @@ -0,0 +1,43 @@ +"""Chat template for evolution augmentation via reasoning.""" + +from typing import ClassVar + +from autointent.schemas import Intent + +from .schemas import Message, Role + + +class ReasoningEvolution: + """Chat template for evolution augmentation via reasoning.""" + + _messages: ClassVar[list[Message]] = [ + Message( + role=Role.USER, + content=( + "I want you to act as a rewriter. " + "You will be provided with an utterance and the topic (name of intent class) of the utterance. " + "You need to complicate the utterance using the following method:\n" + "1. Rewrite the utterance by removing specific inquiries or replacing with more generic.\n" + "2. Rewritten utterance should be concise and understandable by humans.\n" + "3. Rewritten utterance must be fully answerable.\n" + "4. Rewritten utterance should not contain more than 10 words.\n\n" + "Intent name: Asking for Directions\n" + "Utterance: How do I get to the nearest coffee shop?" + ), + ), + Message(role=Role.ASSISTANT, content="If there are some place where I can buy a coffee, how can I get there?"), + Message( + role=Role.USER, + content=( + "Intent name: requesting technical support\n" + "Utterance: I want to get help from technical support for my laptop." 
+ ), + ), + Message(role=Role.ASSISTANT, content="I don't know what's happening with my laptop."), + ] + + def __call__(self, utterance: str, intent_data: Intent) -> str: + """Make chat to complete.""" + return self._messages + Message( + role=Role.USER, content=f"Intent name: {intent_data.name}\nUtterance: {utterance}" + ) diff --git a/autointent/generation/utterances/evolution/chat_templates/reasoning.yaml b/autointent/generation/utterances/evolution/chat_templates/reasoning.yaml deleted file mode 100644 index 02c3eb88..00000000 --- a/autointent/generation/utterances/evolution/chat_templates/reasoning.yaml +++ /dev/null @@ -1,33 +0,0 @@ -- role: user - content: | - {base_instruction} - 1. Rewrite the utterance by removing specific inquiries or replacing with more generic. - 2. Rewritten utterance should be concise and understandable by humans. - 3. Rewritten utterance must be fully answerable. - 4. Rewritten utterance should not contain more than 10 words. - - Intent Class: - Asking for Directions - - Utterance: - How do I get to the nearest coffee shop? -- role: assistant - content: | - If there are some place where I can buy a coffee, how can I get there? -- role: user - content: | - Intent Class: - requesting technical support - - Utterance: - I want to get help from technical support for my laptop. -- role: assistant - content: | - I don't know what's happening with my laptop. 
-- role: user - content: | - Intent Class: - {intent_name} - - Utterance: - {utterance} diff --git a/autointent/generation/utterances/evolution/chat_templates/schemas.py b/autointent/generation/utterances/evolution/chat_templates/schemas.py new file mode 100644 index 00000000..92c7a287 --- /dev/null +++ b/autointent/generation/utterances/evolution/chat_templates/schemas.py @@ -0,0 +1,16 @@ +"""Schemas that are useful for working with prompts.""" +from typing import TypedDict + + +class Message(TypedDict): + """Schema for message to LLM.""" + + role: str + content: str + +class Role: + """Roles in a chat with LLM.""" + + SYSTEM = "system" + USER = "user" + ASSISTANT = "assistant" From fac77e3207bc8f5ff0d25e98d59badf77ab4e975 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 18:48:55 +0300 Subject: [PATCH 02/18] make base class --- .../evolution/chat_templates/abstract.py | 3 ++- .../utterances/evolution/chat_templates/base.py | 14 ++++++++++++++ .../evolution/chat_templates/concrete.py | 3 ++- .../evolution/chat_templates/reasoning.py | 3 ++- 4 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 autointent/generation/utterances/evolution/chat_templates/base.py diff --git a/autointent/generation/utterances/evolution/chat_templates/abstract.py b/autointent/generation/utterances/evolution/chat_templates/abstract.py index 0bdb2a88..3ba166c6 100644 --- a/autointent/generation/utterances/evolution/chat_templates/abstract.py +++ b/autointent/generation/utterances/evolution/chat_templates/abstract.py @@ -4,10 +4,11 @@ from autointent.schemas import Intent +from .base import EvolutionChatTemplate from .schemas import Message, Role -class AbstractEvolution: +class AbstractEvolution(EvolutionChatTemplate): """Chat template for evolution augmentation via abstraction.""" _messages: ClassVar[list[Message]] = [ diff --git a/autointent/generation/utterances/evolution/chat_templates/base.py b/autointent/generation/utterances/evolution/chat_templates/base.py 
new file mode 100644 index 00000000..e74a6059 --- /dev/null +++ b/autointent/generation/utterances/evolution/chat_templates/base.py @@ -0,0 +1,14 @@ +"""Base class for chat templates for evolution augmentation.""" +from abc import ABC, abstractmethod + +from autointent.schemas import Intent + +from .schemas import Message + + +class EvolutionChatTemplate(ABC): + """Base class for chat templates for evolution augmentation.""" + + @abstractmethod + def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: + """Make a chat to complete by LLM.""" diff --git a/autointent/generation/utterances/evolution/chat_templates/concrete.py b/autointent/generation/utterances/evolution/chat_templates/concrete.py index fc0a873d..38fb5954 100644 --- a/autointent/generation/utterances/evolution/chat_templates/concrete.py +++ b/autointent/generation/utterances/evolution/chat_templates/concrete.py @@ -4,10 +4,11 @@ from autointent.schemas import Intent +from .base import EvolutionChatTemplate from .schemas import Message, Role -class ConcreteEvolution: +class ConcreteEvolution(EvolutionChatTemplate): """Chat template for evolution augmentation via concretizing.""" _messages: ClassVar[list[Message]] = [ diff --git a/autointent/generation/utterances/evolution/chat_templates/reasoning.py b/autointent/generation/utterances/evolution/chat_templates/reasoning.py index 52dfbbb0..8731a781 100644 --- a/autointent/generation/utterances/evolution/chat_templates/reasoning.py +++ b/autointent/generation/utterances/evolution/chat_templates/reasoning.py @@ -4,10 +4,11 @@ from autointent.schemas import Intent +from .base import EvolutionChatTemplate from .schemas import Message, Role -class ReasoningEvolution: +class ReasoningEvolution(EvolutionChatTemplate): """Chat template for evolution augmentation via reasoning.""" _messages: ClassVar[list[Message]] = [ From f949881ebac9627232c9506418c014740ce69887 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 19:00:15 +0300 Subject: 
[PATCH 03/18] update evolver --- .../evolution/chat_templates/abstract.py | 4 +- .../evolution/chat_templates/concrete.py | 4 +- .../evolution/chat_templates/reasoning.py | 4 +- .../utterances/evolution/evolver.py | 49 +++++-------------- 4 files changed, 19 insertions(+), 42 deletions(-) diff --git a/autointent/generation/utterances/evolution/chat_templates/abstract.py b/autointent/generation/utterances/evolution/chat_templates/abstract.py index 3ba166c6..dd10bbc4 100644 --- a/autointent/generation/utterances/evolution/chat_templates/abstract.py +++ b/autointent/generation/utterances/evolution/chat_templates/abstract.py @@ -37,8 +37,8 @@ class AbstractEvolution(EvolutionChatTemplate): Message(role=Role.ASSISTANT, content="I'm having trouble with my laptop."), ] - def __call__(self, utterance: str, intent_data: Intent) -> str: + def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: """Make chat to complete.""" return self._messages + Message( - role=Role.USER, content=f"Intent name: {intent_data.name}\nUtterance: {utterance}" + role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}" ) diff --git a/autointent/generation/utterances/evolution/chat_templates/concrete.py b/autointent/generation/utterances/evolution/chat_templates/concrete.py index 38fb5954..77d15f75 100644 --- a/autointent/generation/utterances/evolution/chat_templates/concrete.py +++ b/autointent/generation/utterances/evolution/chat_templates/concrete.py @@ -34,8 +34,8 @@ class ConcreteEvolution(EvolutionChatTemplate): Message(role=Role.ASSISTANT, content="My laptop is constantly rebooting and overheating."), ] - def __call__(self, utterance: str, intent_data: Intent) -> str: + def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: """Make chat to complete.""" return self._messages + Message( - role=Role.USER, content=f"Intent name: {intent_data.name}\nUtterance: {utterance}" + role=Role.USER, content=f"Intent name: 
{intent_data.name or ''}\nUtterance: {utterance}" ) diff --git a/autointent/generation/utterances/evolution/chat_templates/reasoning.py b/autointent/generation/utterances/evolution/chat_templates/reasoning.py index 8731a781..ba8a7c90 100644 --- a/autointent/generation/utterances/evolution/chat_templates/reasoning.py +++ b/autointent/generation/utterances/evolution/chat_templates/reasoning.py @@ -37,8 +37,8 @@ class ReasoningEvolution(EvolutionChatTemplate): Message(role=Role.ASSISTANT, content="I don't know what's happening with my laptop."), ] - def __call__(self, utterance: str, intent_data: Intent) -> str: + def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: """Make chat to complete.""" return self._messages + Message( - role=Role.USER, content=f"Intent name: {intent_data.name}\nUtterance: {utterance}" + role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}" ) diff --git a/autointent/generation/utterances/evolution/evolver.py b/autointent/generation/utterances/evolution/evolver.py index 690afd23..e195273a 100644 --- a/autointent/generation/utterances/evolution/evolver.py +++ b/autointent/generation/utterances/evolution/evolver.py @@ -4,19 +4,17 @@ Deeply inspired by DeepEval evolutions. """ -import importlib.resources as ires import random +from collections.abc import Callable from typing import Literal -import yaml from datasets import Dataset as HFDataset from datasets import concatenate_datasets from autointent import Dataset from autointent.custom_types import Split from autointent.generation.utterances.generator import Generator -from autointent.generation.utterances.utils import safe_format # type: ignore[attr-defined] -from autointent.schemas import Sample +from autointent.schemas import Intent, Sample EvolutionType = Literal["reasoning", "concretizing", "abstract", "formal", "informal", "funny", "goofy"] @@ -29,31 +27,21 @@ class UtteranceEvolver: to change it in a specific way. 
""" - def __init__(self, generator: Generator, evolutions: list[EvolutionType], seed: int = 0) -> None: + def __init__(self, generator: Generator, prompt_makers: list[Callable], seed: int = 0) -> None: """Initialize.""" self.generator = generator - self.evolutions = evolutions - self.prompts = _load_prompts() + self.prompt_makers = prompt_makers random.seed(seed) - def _evolve(self, utterance: str, intent_name: str, evolution: EvolutionType) -> str: + def _evolve(self, utterance: str, intent_data: Intent) -> str: """Apply evolutions single time.""" - messages_yaml = safe_format( - self.prompts[evolution], - base_instruction=self.prompts["base_instruction"], - utterance=utterance, - intent_name=intent_name, - ) - messages = yaml.safe_load(messages_yaml) - return self.generator.get_chat_completion(messages) + maker = random.choice(self.prompt_makers) + chat = maker(utterance, intent_data) + return self.generator.get_chat_completion(chat) - def __call__(self, utterance: str, intent_name: str, n_evolutions: int = 1) -> list[str]: + def __call__(self, utterance: str, intent_data: Intent, n_evolutions: int = 1) -> list[str]: """Apply evolutions mupltiple times.""" - res = [] - for _ in range(n_evolutions): - evolution = random.choice(self.evolutions) - res.append(self._evolve(utterance, intent_name, evolution)) - return res + return [self._evolve(utterance, intent_data) for _ in range(n_evolutions)] def augment( self, dataset: Dataset, split_name: str = Split.TRAIN, n_evolutions: int = 1, update_split: bool = True @@ -68,25 +56,14 @@ def augment( for sample in original_split: utterance = sample[Dataset.utterance_feature] label = sample[Dataset.label_feature] - intent_info = next(intent for intent in dataset.intents if intent.id == label) + intent_data = next(intent for intent in dataset.intents if intent.id == label) generated_utterances = self( - utterance=utterance, intent_name=intent_info.name or "", n_evolutions=n_evolutions + utterance=utterance, 
intent_data=intent_data, n_evolutions=n_evolutions ) new_samples.extend( - [{Dataset.label_feature: intent_info.id, Dataset.utterance_feature: ut} for ut in generated_utterances] + [{Dataset.label_feature: intent_data.id, Dataset.utterance_feature: ut} for ut in generated_utterances] ) if update_split: generated_split = HFDataset.from_list(new_samples) dataset[split_name] = concatenate_datasets([original_split, generated_split]) return [Sample(**sample) for sample in new_samples] - - -def _load_prompts() -> dict[str, str]: - files = ires.files("autointent.generation.utterances.evolution.chat_templates") - - res = {} - for file_name in ["reasoning.yaml", "concretizing.yaml", "abstract.yaml", "base_instruction.txt"]: - with files.joinpath(file_name).open() as file: - res[file_name.split(".")[0]] = file.read() - - return res From de6f82079850255444e73d70749ba7d4dd1543f5 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 20:33:51 +0300 Subject: [PATCH 04/18] move `schemas.py` --- .../utterances/evolution/chat_templates/__init__.py | 5 +++++ .../utterances/evolution/chat_templates/abstract.py | 2 +- .../generation/utterances/evolution/chat_templates/base.py | 4 ++-- .../utterances/evolution/chat_templates/concrete.py | 2 +- .../utterances/evolution/chat_templates/reasoning.py | 2 +- .../utterances/{evolution/chat_templates => }/schemas.py | 2 ++ 6 files changed, 12 insertions(+), 5 deletions(-) rename autointent/generation/utterances/{evolution/chat_templates => }/schemas.py (99%) diff --git a/autointent/generation/utterances/evolution/chat_templates/__init__.py b/autointent/generation/utterances/evolution/chat_templates/__init__.py index e69de29b..e509ede2 100644 --- a/autointent/generation/utterances/evolution/chat_templates/__init__.py +++ b/autointent/generation/utterances/evolution/chat_templates/__init__.py @@ -0,0 +1,5 @@ +from .abstract import AbstractEvolution +from .concrete import ConcreteEvolution +from .reasoning import ReasoningEvolution + 
+__all__ = ["AbstractEvolution", "ConcreteEvolution", "ReasoningEvolution"] diff --git a/autointent/generation/utterances/evolution/chat_templates/abstract.py b/autointent/generation/utterances/evolution/chat_templates/abstract.py index dd10bbc4..f33c7ad3 100644 --- a/autointent/generation/utterances/evolution/chat_templates/abstract.py +++ b/autointent/generation/utterances/evolution/chat_templates/abstract.py @@ -2,10 +2,10 @@ from typing import ClassVar +from autointent.generation.utterances.schemas import Message, Role from autointent.schemas import Intent from .base import EvolutionChatTemplate -from .schemas import Message, Role class AbstractEvolution(EvolutionChatTemplate): diff --git a/autointent/generation/utterances/evolution/chat_templates/base.py b/autointent/generation/utterances/evolution/chat_templates/base.py index e74a6059..cb119d1b 100644 --- a/autointent/generation/utterances/evolution/chat_templates/base.py +++ b/autointent/generation/utterances/evolution/chat_templates/base.py @@ -1,10 +1,10 @@ """Base class for chat templates for evolution augmentation.""" + from abc import ABC, abstractmethod +from autointent.generation.utterances.schemas import Message from autointent.schemas import Intent -from .schemas import Message - class EvolutionChatTemplate(ABC): """Base class for chat templates for evolution augmentation.""" diff --git a/autointent/generation/utterances/evolution/chat_templates/concrete.py b/autointent/generation/utterances/evolution/chat_templates/concrete.py index 77d15f75..ef911c2b 100644 --- a/autointent/generation/utterances/evolution/chat_templates/concrete.py +++ b/autointent/generation/utterances/evolution/chat_templates/concrete.py @@ -2,10 +2,10 @@ from typing import ClassVar +from autointent.generation.utterances.schemas import Message, Role from autointent.schemas import Intent from .base import EvolutionChatTemplate -from .schemas import Message, Role class ConcreteEvolution(EvolutionChatTemplate): diff --git 
a/autointent/generation/utterances/evolution/chat_templates/reasoning.py b/autointent/generation/utterances/evolution/chat_templates/reasoning.py index ba8a7c90..6371a4b7 100644 --- a/autointent/generation/utterances/evolution/chat_templates/reasoning.py +++ b/autointent/generation/utterances/evolution/chat_templates/reasoning.py @@ -2,10 +2,10 @@ from typing import ClassVar +from autointent.generation.utterances.schemas import Message, Role from autointent.schemas import Intent from .base import EvolutionChatTemplate -from .schemas import Message, Role class ReasoningEvolution(EvolutionChatTemplate): diff --git a/autointent/generation/utterances/evolution/chat_templates/schemas.py b/autointent/generation/utterances/schemas.py similarity index 99% rename from autointent/generation/utterances/evolution/chat_templates/schemas.py rename to autointent/generation/utterances/schemas.py index 92c7a287..e296f6cd 100644 --- a/autointent/generation/utterances/evolution/chat_templates/schemas.py +++ b/autointent/generation/utterances/schemas.py @@ -1,4 +1,5 @@ """Schemas that are useful for working with prompts.""" + from typing import TypedDict @@ -8,6 +9,7 @@ class Message(TypedDict): role: str content: str + class Role: """Roles in a chat with LLM.""" From 4a15243817a1f1dea79b18827f3332a1546ff43d Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 20:34:05 +0300 Subject: [PATCH 05/18] update cli --- .../generation/utterances/evolution/cli.py | 20 +++++-------------- .../utterances/evolution/evolver.py | 4 +--- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/autointent/generation/utterances/evolution/cli.py b/autointent/generation/utterances/evolution/cli.py index be04e11e..5e18dc22 100644 --- a/autointent/generation/utterances/evolution/cli.py +++ b/autointent/generation/utterances/evolution/cli.py @@ -7,6 +7,8 @@ from autointent.generation.utterances.evolution.evolver import UtteranceEvolver from autointent.generation.utterances.generator import 
Generator +from .chat_templates import AbstractEvolution, ConcreteEvolution, ReasoningEvolution + if TYPE_CHECKING: from .evolver import EvolutionType @@ -37,28 +39,16 @@ def main() -> None: parser.add_argument("--reasoning", action="store_true", help="Whether to use `Reasoning` evolution") parser.add_argument("--concretizing", action="store_true", help="Whether to use `Concretizing` evolution") parser.add_argument("--abstract", action="store_true", help="Whether to use `Abstract` evolution") - parser.add_argument("--formal", action="store_true", help="Whether to use `Formal` evolution") - parser.add_argument("--informal", action="store_true", help="Whether to use `Informal` evolution") - parser.add_argument("--funny", action="store_true", help="Whether to use `Funny` evolution") - parser.add_argument("--goofy", action="store_true", help="Whether to use `Goofy` evolution") parser.add_argument("--seed", type=int, default=0) args = parser.parse_args() evolutions: list[EvolutionType] = [] if args.reasoning: - evolutions.append("reasoning") + evolutions.append(ReasoningEvolution) if args.concretizing: - evolutions.append("concretizing") + evolutions.append(ConcreteEvolution) if args.abstract: - evolutions.append("abstract") - if args.formal: - evolutions.append("formal") - if args.informal: - evolutions.append("informal") - if args.funny: - evolutions.append("funny") - if args.goofy: - evolutions.append("goofy") + evolutions.append(AbstractEvolution) dataset = load_dataset(args.input_path) diff --git a/autointent/generation/utterances/evolution/evolver.py b/autointent/generation/utterances/evolution/evolver.py index e195273a..c86242e7 100644 --- a/autointent/generation/utterances/evolution/evolver.py +++ b/autointent/generation/utterances/evolution/evolver.py @@ -57,9 +57,7 @@ def augment( utterance = sample[Dataset.utterance_feature] label = sample[Dataset.label_feature] intent_data = next(intent for intent in dataset.intents if intent.id == label) - 
generated_utterances = self( - utterance=utterance, intent_data=intent_data, n_evolutions=n_evolutions - ) + generated_utterances = self(utterance=utterance, intent_data=intent_data, n_evolutions=n_evolutions) new_samples.extend( [{Dataset.label_feature: intent_data.id, Dataset.utterance_feature: ut} for ut in generated_utterances] ) From 80e3644a53e43c36d0af0343c8b891c5353ae7ea Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 20:46:10 +0300 Subject: [PATCH 06/18] minor bug fix --- .../utterances/evolution/chat_templates/abstract.py | 9 +++++---- .../utterances/evolution/chat_templates/concrete.py | 7 ++++--- .../utterances/evolution/chat_templates/reasoning.py | 7 ++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/autointent/generation/utterances/evolution/chat_templates/abstract.py b/autointent/generation/utterances/evolution/chat_templates/abstract.py index f33c7ad3..4ab88c35 100644 --- a/autointent/generation/utterances/evolution/chat_templates/abstract.py +++ b/autointent/generation/utterances/evolution/chat_templates/abstract.py @@ -23,7 +23,7 @@ class AbstractEvolution(EvolutionChatTemplate): "3. Rewritten utterance must be fully answerable.\n" "4. Rewritten utterance should not contain more than 10 words.\n\n" "Intent name: Reserve Restaurant" - "Utterance: I want to reserve a table for 4 persons at 9 pm.", + "Utterance: I want to reserve a table for 4 persons at 9 pm." 
), ), Message(role=Role.ASSISTANT, content="Please, reserve a table for me."), @@ -39,6 +39,7 @@ class AbstractEvolution(EvolutionChatTemplate): def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: """Make chat to complete.""" - return self._messages + Message( - role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}" - ) + return [ + *self._messages, + Message(role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}"), + ] diff --git a/autointent/generation/utterances/evolution/chat_templates/concrete.py b/autointent/generation/utterances/evolution/chat_templates/concrete.py index ef911c2b..dcca78ba 100644 --- a/autointent/generation/utterances/evolution/chat_templates/concrete.py +++ b/autointent/generation/utterances/evolution/chat_templates/concrete.py @@ -36,6 +36,7 @@ class ConcreteEvolution(EvolutionChatTemplate): def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: """Make chat to complete.""" - return self._messages + Message( - role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}" - ) + return [ + *self._messages, + Message(role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}"), + ] diff --git a/autointent/generation/utterances/evolution/chat_templates/reasoning.py b/autointent/generation/utterances/evolution/chat_templates/reasoning.py index 6371a4b7..39791acc 100644 --- a/autointent/generation/utterances/evolution/chat_templates/reasoning.py +++ b/autointent/generation/utterances/evolution/chat_templates/reasoning.py @@ -39,6 +39,7 @@ class ReasoningEvolution(EvolutionChatTemplate): def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: """Make chat to complete.""" - return self._messages + Message( - role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}" - ) + return [ + *self._messages, + Message(role=Role.USER, 
content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}"), + ] From 5761719e0655e071b972a02ccdb32ae385307fc2 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 20:46:20 +0300 Subject: [PATCH 07/18] fix typing --- .../evolution/chat_templates/__init__.py | 3 ++- autointent/generation/utterances/evolution/cli.py | 14 +++++--------- .../generation/utterances/evolution/evolver.py | 10 +++++----- autointent/generation/utterances/generator.py | 4 +++- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/autointent/generation/utterances/evolution/chat_templates/__init__.py b/autointent/generation/utterances/evolution/chat_templates/__init__.py index e509ede2..8417ff1a 100644 --- a/autointent/generation/utterances/evolution/chat_templates/__init__.py +++ b/autointent/generation/utterances/evolution/chat_templates/__init__.py @@ -1,5 +1,6 @@ from .abstract import AbstractEvolution +from .base import EvolutionChatTemplate from .concrete import ConcreteEvolution from .reasoning import ReasoningEvolution -__all__ = ["AbstractEvolution", "ConcreteEvolution", "ReasoningEvolution"] +__all__ = ["AbstractEvolution", "ConcreteEvolution", "EvolutionChatTemplate", "ReasoningEvolution"] diff --git a/autointent/generation/utterances/evolution/cli.py b/autointent/generation/utterances/evolution/cli.py index 5e18dc22..1f5b9431 100644 --- a/autointent/generation/utterances/evolution/cli.py +++ b/autointent/generation/utterances/evolution/cli.py @@ -1,16 +1,12 @@ """CLI for evolutionary augmenter.""" from argparse import ArgumentParser -from typing import TYPE_CHECKING from autointent import load_dataset from autointent.generation.utterances.evolution.evolver import UtteranceEvolver from autointent.generation.utterances.generator import Generator -from .chat_templates import AbstractEvolution, ConcreteEvolution, ReasoningEvolution - -if TYPE_CHECKING: - from .evolver import EvolutionType +from .chat_templates import AbstractEvolution, ConcreteEvolution, 
EvolutionChatTemplate, ReasoningEvolution def main() -> None: @@ -42,13 +38,13 @@ def main() -> None: parser.add_argument("--seed", type=int, default=0) args = parser.parse_args() - evolutions: list[EvolutionType] = [] + evolutions: list[EvolutionChatTemplate] = [] if args.reasoning: - evolutions.append(ReasoningEvolution) + evolutions.append(ReasoningEvolution()) if args.concretizing: - evolutions.append(ConcreteEvolution) + evolutions.append(ConcreteEvolution()) if args.abstract: - evolutions.append(AbstractEvolution) + evolutions.append(AbstractEvolution()) dataset = load_dataset(args.input_path) diff --git a/autointent/generation/utterances/evolution/evolver.py b/autointent/generation/utterances/evolution/evolver.py index c86242e7..bdcf1069 100644 --- a/autointent/generation/utterances/evolution/evolver.py +++ b/autointent/generation/utterances/evolution/evolver.py @@ -5,8 +5,7 @@ """ import random -from collections.abc import Callable -from typing import Literal +from collections.abc import Callable, Sequence from datasets import Dataset as HFDataset from datasets import concatenate_datasets @@ -14,10 +13,9 @@ from autointent import Dataset from autointent.custom_types import Split from autointent.generation.utterances.generator import Generator +from autointent.generation.utterances.schemas import Message from autointent.schemas import Intent, Sample -EvolutionType = Literal["reasoning", "concretizing", "abstract", "formal", "informal", "funny", "goofy"] - class UtteranceEvolver: """ @@ -27,7 +25,9 @@ class UtteranceEvolver: to change it in a specific way. 
""" - def __init__(self, generator: Generator, prompt_makers: list[Callable], seed: int = 0) -> None: + def __init__( + self, generator: Generator, prompt_makers: Sequence[Callable[[str, Intent], list[Message]]], seed: int = 0 + ) -> None: """Initialize.""" self.generator = generator self.prompt_makers = prompt_makers diff --git a/autointent/generation/utterances/generator.py b/autointent/generation/utterances/generator.py index 69672278..dfc9ec86 100644 --- a/autointent/generation/utterances/generator.py +++ b/autointent/generation/utterances/generator.py @@ -5,6 +5,8 @@ import openai from dotenv import load_dotenv +from .schemas import Message + class Generator: """Wrapper class for accessing OpenAI API.""" @@ -15,7 +17,7 @@ def __init__(self) -> None: self.client = openai.OpenAI(base_url=os.environ["OPENAI_BASE_URL"], api_key=os.environ["OPENAI_API_KEY"]) self.model_name = os.environ["OPENAI_MODEL_NAME"] - def get_chat_completion(self, messages: list[dict[str, str]]) -> str: + def get_chat_completion(self, messages: list[Message]) -> str: """Prompt LLM and return its answer.""" response = self.client.chat.completions.create( messages=messages, # type: ignore[arg-type] From 21957b4cd8fb254caac9d9e31be2933f464962ab Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 21:09:49 +0300 Subject: [PATCH 08/18] refactor chat template for basic utterance generator --- .../utterances/basic/chat_template.py | 115 +++++++++++++++++ .../utterances/basic/chat_template.yaml | 119 ------------------ 2 files changed, 115 insertions(+), 119 deletions(-) create mode 100644 autointent/generation/utterances/basic/chat_template.py delete mode 100644 autointent/generation/utterances/basic/chat_template.yaml diff --git a/autointent/generation/utterances/basic/chat_template.py b/autointent/generation/utterances/basic/chat_template.py new file mode 100644 index 00000000..6e10d88e --- /dev/null +++ b/autointent/generation/utterances/basic/chat_template.py @@ -0,0 +1,115 @@ +"""Chat 
template for evolution augmentation via abstractization.""" + +import random +from typing import ClassVar + +from autointent import Dataset +from autointent.generation.utterances.schemas import Message, Role + + +class ExampleGenerator: + """Chat template for generating additional examples for a given intent class.""" + + _messages: ClassVar[list[Message]] = [ + Message( + role=Role.USER, + content=( + "You will be provided with a set of example utterances and the name " + "of the common topic (intent name) of these utterances. " + "Your task is to generate more examples that fit within the same intent name.\n\n" + "Note:\n" + "- You can generate similar utterances with only slot values changed\n" + "- You can generate completely different utterance from the same intent name\n" + "- Intent name can be missed, then you should infer from example utterances only\n" + "- Example utterances can be missed, then you should infer from intent name only\n" + "{extra_instructions}\n\n" + "Intent name: ordering_pizza\n\n" + "Example Utterances:\n" + "1. I want to order a large pepperoni pizza.\n" + "2. Can I get a medium cheese pizza with extra olives?\n" + "3. Please deliver a small veggie pizza to my address.\n\n" + "Please generate 3 more examples for the provided intent name." + ), + ), + Message( + role=Role.ASSISTANT, + content=( + "1. I'd like to order a large margherita pizza.\n" + "2. Can you deliver a medium Hawaiian pizza with extra pineapple?\n" + "3. Please send a small BBQ chicken pizza to my home." + ), + ), + Message( + role=Role.USER, + content=( + "Intent name: booking a hotel\n\n" + "Example Utterances:\n" + "1. I need to book a room for two nights in New York.\n\n" + "Please generate 2 more examples for the provided intent name." + ), + ), + Message( + role=Role.ASSISTANT, + content=( + "1. Can you reserve a deluxe room for my trip to Tokyo?\n" + "2. I need to book a hotel room with a mountain view in Denver." 
+ ), + ), + Message( + role=Role.USER, + content=( + "Intent name:\n\n" + "Example Utterances:\n" + "1. What is the weather like today?\n\n" + "Please generate 2 more examples for the provided intent class." + ), + ), + Message( + role=Role.ASSISTANT, + content=("1. Can you tell me the forecast for tomorrow?\n" "2. Is it going to rain this weekend?"), + ), + Message( + role=Role.USER, + content=( + "Intent name: Scheduling a Meeting\n\n" + "Example Utterances:\n\n" + "Please generate 3 more examples for the provided intent class." + ), + ), + Message( + role=Role.ASSISTANT, + content=( + "1. I need to schedule a meeting for next Tuesday.\n" + "2. Can you set up a conference call for tomorrow afternoon?\n" + "3. Please arrange a meeting with the marketing team next week." + ), + ), + ] + + def __init__(self, dataset: Dataset, split: str, extra_instructions: str | None = None) -> None: + """Initialize.""" + if extra_instructions is None: + extra_instructions = "" + + msg = self._messages[0] + msg["content"] = msg["content"].format(extra_instructions=extra_instructions) + + self.dataset = dataset + self.split = split + + def __call__(self, intent_id: int, n_examples: int, max_sample_utterances: int | None = None) -> list[Message]: + """Generate additional examples for the provided intent class.""" + filtered_split = self.dataset[self.split].filter(lambda sample: sample[Dataset.label_feature] == intent_id) + sample_utterances = filtered_split[Dataset.utterance_feature] + intent = next(i for i in self.dataset.intents if i.id == intent_id) + if max_sample_utterances is not None: + sample_utterances = random.sample(sample_utterances, k=max_sample_utterances) + return [ + *self._messages, + Message( + role=Role.USER, + content=f"Intent name: {intent.name}\n\n" + f"Example Utterances:\n{sample_utterances}\n\n" + f"Please generate {n_examples} more examples for the provided intent class.\n", + ), + ] diff --git a/autointent/generation/utterances/basic/chat_template.yaml 
b/autointent/generation/utterances/basic/chat_template.yaml deleted file mode 100644 index 9d512a11..00000000 --- a/autointent/generation/utterances/basic/chat_template.yaml +++ /dev/null @@ -1,119 +0,0 @@ -- role: system - content: | - You will be provided with a set of example utterances and the name of the common topic (intent class) of these utterances. Your task is to generate more examples that fit within the same intent class. - - Note: - - You can generate similar utterances with only slot values changed - - You can generate completely different utterance from the same intent class - - Intent name can be missed, then you should infer from example utterances only - - Example utterances can be missed, then you should infer from intent name only - {extra_instructions} -- role: user - content: | - Intent Class: ordering_pizza - - Example Utterances: - 1. I want to order a large pepperoni pizza. - 2. Can I get a medium cheese pizza with extra olives? - 3. Please deliver a small veggie pizza to my address. - - Please generate 3 more examples for the provided intent class. -- role: assistant - content: | - 1. I'd like to order a large margherita pizza. - 2. Can you deliver a medium Hawaiian pizza with extra pineapple? - 3. Please send a small BBQ chicken pizza to my home. -- role: user - content: | - Intent Class: booking a hotel - - Example Utterances: - 1. I need to book a room for two nights in New York. - - Please generate 2 more examples for the provided intent class. -- role: assistant - content: | - 1. Can you reserve a deluxe room for my trip to Tokyo? - 2. I need to book a hotel room with a mountain view in Denver. -- role: user - content: | - Intent Class: - - Example Utterances: - 1. What is the weather like today? - - Please generate 2 more examples for the provided intent class. -- role: assistant - content: | - 1. Can you tell me the forecast for tomorrow? - 2. Is it going to rain this weekend? 
-- role: user - content: | - Intent Class: Scheduling a Meeting - - Example Utterances: - - Please generate 3 more examples for the provided intent class. -- role: assistant - content: | - 1. I need to schedule a meeting for next Tuesday. - 2. Can you set up a conference call for tomorrow afternoon? - 3. Please arrange a meeting with the marketing team next week. -- role: user - content: | - Intent Class: {intent_name} - - Example Utterances: - {example_utterances} - - Please generate {n_examples} more examples for the provided intent class. - -# ### Intent Class: Asking for Directions -# **Example Utterances:** -# 1. "How do I get to the nearest coffee shop?" -# 2. "Can you give me directions to the airport?" -# 3. "What is the best route to the museum from here?" - -# ### Intent Class: Making a Restaurant Reservation -# **Example Utterances:** -# 1. "I want to make a reservation for dinner tonight." -# 2. "Can you book a table for two at the Italian restaurant?" -# 3. "Please reserve a table for four at the steakhouse for Saturday evening." - -# ### Intent Class: Requesting Technical Support -# **Example Utterances:** -# 1. "I'm having trouble with my laptop." -# 2. "Can you help me fix my Wi-Fi connection?" -# 3. "My software is not working properly, can you assist?" - -# ### Intent Class: Inquiring About Product Availability -# **Example Utterances:** -# 1. "Do you have the new iPhone in stock?" -# 2. "Is the blue shirt available in size medium?" -# 3. "Can you check if the latest book by John Doe is available?" - -# ### Intent Class: Requesting Account Information -# **Example Utterances:** -# 1. "What is my current account balance?" -# 2. "Can you tell me my recent transactions?" -# 3. "I need to check my account statement for last month." - -# ### Intent Class: Booking a Flight -# **Example Utterances:** -# 1. "I want to book a flight to Los Angeles." -# 2. "Can you find me a flight to Paris next week?" -# 3. 
"Please book a round-trip flight to New York for next month." - -# ### Intent Class: Requesting Movie Recommendations -# **Example Utterances:** -# 1. "Can you recommend a good action movie?" -# 2. "What are some highly rated comedies?" -# 3. "I'm in the mood for a romantic film, any suggestions?" - -# ### Intent Class: Ordering Groceries -# **Example Utterances:** -# 1. "I need to order some milk and bread." -# 2. "Can you add apples and bananas to my grocery list?" -# 3. "Please order a dozen eggs and a pack of butter." - -# You can use these intent names and example utterances to further train or test your language model for generating more examples within each intent class. From 1681e77d3fcebe1bcc278dc00fa574fc970a9cb2 Mon Sep 17 00:00:00 2001 From: voorhs Date: Tue, 28 Jan 2025 21:26:37 +0300 Subject: [PATCH 09/18] update utterance synthesizer --- .../utterances/basic/chat_template.py | 32 ++++-- autointent/generation/utterances/basic/cli.py | 35 ++----- .../utterances/basic/extra_instructions.json | 14 --- .../utterances/basic/utterance_generator.py | 97 ++----------------- autointent/generation/utterances/utils.py | 27 ------ 5 files changed, 38 insertions(+), 167 deletions(-) delete mode 100644 autointent/generation/utterances/basic/extra_instructions.json delete mode 100644 autointent/generation/utterances/utils.py diff --git a/autointent/generation/utterances/basic/chat_template.py b/autointent/generation/utterances/basic/chat_template.py index 6e10d88e..c866d282 100644 --- a/autointent/generation/utterances/basic/chat_template.py +++ b/autointent/generation/utterances/basic/chat_template.py @@ -1,13 +1,23 @@ """Chat template for evolution augmentation via abstractization.""" import random +from abc import ABC, abstractmethod from typing import ClassVar from autointent import Dataset from autointent.generation.utterances.schemas import Message, Role +from autointent.schemas import Intent -class ExampleGenerator: +class BaseSynthesizer(ABC): + """Base 
class.""" + + @abstractmethod + def __call__(self, intent_data: Intent, n_examples: int) -> list[Message]: + """Generate examples for this intent.""" + + +class SynthesizerChatTemplate(BaseSynthesizer): """Chat template for generating additional examples for a given intent class.""" _messages: ClassVar[list[Message]] = [ @@ -86,7 +96,13 @@ class ExampleGenerator: ), ] - def __init__(self, dataset: Dataset, split: str, extra_instructions: str | None = None) -> None: + def __init__( + self, + dataset: Dataset, + split: str, + extra_instructions: str | None = None, + max_sample_utterances: int | None = None, + ) -> None: """Initialize.""" if extra_instructions is None: extra_instructions = "" @@ -96,19 +112,19 @@ def __init__(self, dataset: Dataset, split: str, extra_instructions: str | None self.dataset = dataset self.split = split + self.max_sample_utterances = max_sample_utterances - def __call__(self, intent_id: int, n_examples: int, max_sample_utterances: int | None = None) -> list[Message]: + def __call__(self, intent_data: Intent, n_examples: int) -> list[Message]: """Generate additional examples for the provided intent class.""" - filtered_split = self.dataset[self.split].filter(lambda sample: sample[Dataset.label_feature] == intent_id) + filtered_split = self.dataset[self.split].filter(lambda sample: sample[Dataset.label_feature] == intent_data.id) sample_utterances = filtered_split[Dataset.utterance_feature] - intent = next(i for i in self.dataset.intents if i.id == intent_id) - if max_sample_utterances is not None: - sample_utterances = random.sample(sample_utterances, k=max_sample_utterances) + if self.max_sample_utterances is not None: + sample_utterances = random.sample(sample_utterances, k=self.max_sample_utterances) return [ *self._messages, Message( role=Role.USER, - content=f"Intent name: {intent.name}\n\n" + content=f"Intent name: {intent_data.name}\n\n" f"Example Utterances:\n{sample_utterances}\n\n" f"Please generate {n_examples} more examples for 
the provided intent class.\n", ), diff --git a/autointent/generation/utterances/basic/cli.py b/autointent/generation/utterances/basic/cli.py index 782776f8..f3b2dd30 100644 --- a/autointent/generation/utterances/basic/cli.py +++ b/autointent/generation/utterances/basic/cli.py @@ -3,9 +3,11 @@ from argparse import ArgumentParser from autointent import load_dataset -from autointent.generation.utterances.basic.utterance_generator import LengthType, StyleType, UtteranceGenerator +from autointent.generation.utterances.basic.utterance_generator import UtteranceGenerator from autointent.generation.utterances.generator import Generator +from .chat_template import SynthesizerChatTemplate + def main() -> None: """ClI endpoint.""" @@ -41,37 +43,12 @@ def main() -> None: default=5, help="Number of utterances to use as an example for augmentation", ) - parser.add_argument( - "--custom-instruction", - type=str, - action="append", - help="Add extra instructions to default prompt." - "You can use this argument multiple times to add multiple instructions", - ) - parser.add_argument( - "--length", - choices=LengthType.__args__, # type: ignore[attr-defined] - default="none", - help="How to extend the prompt with length instruction", - ) - parser.add_argument( - "--style", - choices=StyleType.__args__, # type: ignore[attr-defined] - default="none", - help="How to extend the prompt with style instruction", - ) - parser.add_argument( - "--same-punctuation", - action="store_true", - help="Whether to extend the prompt with punctuation instruction", - ) args = parser.parse_args() dataset = load_dataset(args.input_path) - generator = UtteranceGenerator( - Generator(), args.custom_instruction or [], args.length, args.style, args.same_punctuation - ) - generator.augment(dataset, n_generations=args.n_generations, max_sample_utterances=args.n_sample_utterances) + template = SynthesizerChatTemplate(dataset, "train", max_sample_utterances=args.n_sample_utterances) + generator = 
UtteranceGenerator(Generator(), template) + generator.augment(dataset, n_generations=args.n_generations) dataset.to_json(args.output_path) diff --git a/autointent/generation/utterances/basic/extra_instructions.json b/autointent/generation/utterances/basic/extra_instructions.json deleted file mode 100644 index a17c384b..00000000 --- a/autointent/generation/utterances/basic/extra_instructions.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "length": { - "same": "Generated utterances should have the similar length in a sense of number of words to example utterances", - "longer": "Generated utterances can be a little bit longer in a sense of number of words than example utterances", - "shorter": "Generated utterances can be a little bit shorter in a sense of number of words than example utterances" - }, - "style": { - "same": "Generated utterances should follow the same style of conversation as example utterances", - "formal": "Generated utterances should follow a formal style of conversation", - "informal": "Generated utterances doesn't have to follow a formal style of conversation", - "playful": "Generated utterances can be playful and funny" - }, - "punctuation": "Generated utterances should follow the same punctuation style" -} diff --git a/autointent/generation/utterances/basic/utterance_generator.py b/autointent/generation/utterances/basic/utterance_generator.py index ecd22de3..0d3c3f06 100644 --- a/autointent/generation/utterances/basic/utterance_generator.py +++ b/autointent/generation/utterances/basic/utterance_generator.py @@ -1,22 +1,15 @@ """Basic generation of new utterances from existing ones.""" -import importlib.resources as ires -import json -import random -from typing import Any, Literal +from collections.abc import Callable -import yaml from datasets import Dataset as HFDataset from datasets import concatenate_datasets from autointent import Dataset from autointent.custom_types import Split from autointent.generation.utterances.generator import Generator 
-from autointent.generation.utterances.utils import safe_format # type: ignore[attr-defined] -from autointent.schemas import Sample - -LengthType = Literal["none", "same", "longer", "shorter"] -StyleType = Literal["none", "formal", "informal", "playful"] +from autointent.generation.utterances.schemas import Message +from autointent.schemas import Intent, Sample class UtteranceGenerator: @@ -28,34 +21,14 @@ class UtteranceGenerator: punctuation and length of the desired generations. """ - def __init__( - self, - generator: Generator, - custom_instruction: list[str], - length: LengthType, - style: StyleType, - same_punctuation: bool, - ) -> None: + def __init__(self, generator: Generator, prompt_maker: Callable[[Intent, int], list[Message]]) -> None: """Initialize.""" self.generator = generator - prompt_template_yaml = _load_prompt() - self.prompt_template_yaml = _add_extra_instructions( - prompt_template_yaml, - custom_instruction, - length, - style, - same_punctuation, - ) + self.prompt_maker = prompt_maker - def __call__(self, intent_name: str, example_utterances: list[str], n_generations: int) -> list[str]: + def __call__(self, intent_data: Intent, n_generations: int) -> list[str]: """Generate new utterances.""" - messages_yaml = safe_format( - self.prompt_template_yaml, - intent_name=intent_name, - example_utterances=_format_utterances(example_utterances), - n_examples=n_generations, - ) - messages = yaml.safe_load(messages_yaml) + messages = self.prompt_maker(intent_data, n_generations) response_text = self.generator.get_chat_completion(messages) return _extract_utterances(response_text) @@ -64,7 +37,6 @@ def augment( dataset: Dataset, split_name: str = Split.TRAIN, n_generations: int = 5, - max_sample_utterances: int = 5, update_split: bool = True, ) -> list[Sample]: """ @@ -75,13 +47,8 @@ def augment( original_split = dataset[split_name] new_samples = [] for intent in dataset.intents: - filtered_split = original_split.filter(lambda sample, id=intent.id: 
sample[Dataset.label_feature] == id) - sample_utterances = filtered_split[Dataset.utterance_feature] - if max_sample_utterances is not None: - sample_utterances = random.sample(sample_utterances, k=max_sample_utterances) generated_utterances = self( - intent_name=intent.name or "", - example_utterances=sample_utterances, + intent_data=intent, n_generations=n_generations, ) new_samples.extend( @@ -93,54 +60,6 @@ def augment( return [Sample(**sample) for sample in new_samples] -def _load_prompt() -> str: - with ires.files("autointent.generation.utterances.basic").joinpath("chat_template.yaml").open() as file: - return file.read() - - -def _load_extra_instructions() -> dict[str, Any]: - with ires.files("autointent.generation.utterances.basic").joinpath("extra_instructions.json").open() as file: - return json.load(file) # type: ignore[no-any-return] - - -def _add_extra_instructions( - prompt_template_yaml: str, - custom_instruction: list[str], - length: LengthType, - style: StyleType, - same_punctuation: bool, -) -> str: - instructions = _load_extra_instructions() - - extra_instructions = [] - if length != "none": - extra_instructions.append(instructions["length"][length]) - if style != "none": - extra_instructions.append(instructions["style"][style]) - if same_punctuation: - extra_instructions.append(instructions["punctuation"]) - - extra_instructions.extend(custom_instruction) - - parsed_extra_instructions = "\n ".join([f"- {s}" for s in extra_instructions]) - return safe_format(prompt_template_yaml, extra_instructions=parsed_extra_instructions) # type: ignore[no-any-return] - - -def _format_utterances(utterances: list[str]) -> str: - """ - Convert given utterances into string that is ready to insert into prompt. - - Given list of utterances, the output string is returned in the following format: - .. code-block:: - 1. I want to order a large pepperoni pizza. - 2. Can I get a medium cheese pizza with extra olives? - 3. 
Please deliver a small veggie pizza to my address. - - Note that tab is inserted before each line because of how yaml processes multi-line fields. - """ - return "\n ".join(f"{i}. {ut}" for i, ut in enumerate(utterances)) - - def _extract_utterances(response_text: str) -> list[str]: """ Parse LLM output. diff --git a/autointent/generation/utterances/utils.py b/autointent/generation/utterances/utils.py deleted file mode 100644 index 7742ea1f..00000000 --- a/autointent/generation/utterances/utils.py +++ /dev/null @@ -1,27 +0,0 @@ -# type: ignore # noqa: PGH003 - -import string -from typing import Any - - -class SafeFormatter(string.Formatter): - """Utility class for loading prompt templates.""" - - def get_value(self, key, args, kwargs) -> Any: # noqa: ANN001, ANN401 - """Overloaded.""" - if isinstance(key, str): - return kwargs.get(key, "{" + key + "}") - return super().get_value(key, args, kwargs) - - def parse(self, format_string): # noqa: ANN001, ANN201 - """Overloaded.""" - try: - return super().parse(format_string) - except ValueError: - return [(format_string, None, None, None)] - - -def safe_format(format_string: str, *args: tuple[str], **kwargs: dict[str, str]) -> str: - """Format chat template.""" - formatter = SafeFormatter() - return formatter.format(format_string, *args, **kwargs) From de8d6025aff6e6b65b08cd19f14d24b356a07e9f Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 10:41:44 +0300 Subject: [PATCH 10/18] update `Dataset.from_hub` method --- autointent/_dataset/_dataset.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/autointent/_dataset/_dataset.py b/autointent/_dataset/_dataset.py index 774773b2..d15ca8ae 100644 --- a/autointent/_dataset/_dataset.py +++ b/autointent/_dataset/_dataset.py @@ -100,13 +100,14 @@ def from_hub(cls, repo_id: str) -> "Dataset": :param repo_id: ID of the Hugging Face repository. :return: Initialized Dataset object. 
""" - splits, intents = load_dataset(repo_id), [] + from ._reader import DictReader + + splits = load_dataset(repo_id) + mapping = dict(**splits) if Split.INTENTS in get_dataset_config_names(repo_id): - intents = load_dataset(repo_id, Split.INTENTS)[Split.INTENTS].to_list() - return cls( - splits.items(), - intents=[Intent.model_validate(intent) for intent in intents], - ) + mapping["intents"] = load_dataset(repo_id, Split.INTENTS)[Split.INTENTS].to_list() + + return DictReader().read(mapping) def to_multilabel(self) -> "Dataset": """ From a3a9c761270d57db4c916b050cba20c15c0cebe8 Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 10:42:08 +0300 Subject: [PATCH 11/18] add debug messages to cli endpoint --- autointent/generation/utterances/basic/cli.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/autointent/generation/utterances/basic/cli.py b/autointent/generation/utterances/basic/cli.py index f3b2dd30..baa6968c 100644 --- a/autointent/generation/utterances/basic/cli.py +++ b/autointent/generation/utterances/basic/cli.py @@ -1,5 +1,6 @@ """CLI for basic utterance generator.""" +import logging from argparse import ArgumentParser from autointent import load_dataset @@ -8,6 +9,9 @@ from .chat_template import SynthesizerChatTemplate +logging.basicConfig(level="INFO") +logger = logging.getLogger(__name__) + def main() -> None: """ClI endpoint.""" @@ -30,6 +34,7 @@ def main() -> None: default=None, help="Local path where to save result", ) + parser.add_argument("--split", type=str, default="train") parser.add_argument("--private", action="store_true", help="Publish privately if --output-repo option is used") parser.add_argument( "--n-generations", @@ -46,9 +51,16 @@ def main() -> None: args = parser.parse_args() dataset = load_dataset(args.input_path) - template = SynthesizerChatTemplate(dataset, "train", max_sample_utterances=args.n_sample_utterances) + template = SynthesizerChatTemplate(dataset, args.split, 
max_sample_utterances=args.n_sample_utterances) generator = UtteranceGenerator(Generator(), template) - generator.augment(dataset, n_generations=args.n_generations) + + n_before = len(dataset[args.split]) + new_samples = generator.augment(dataset, split_name=args.split, n_generations=args.n_generations) + n_after = len(dataset[args.split]) + + logger.info("# samples before %s", n_before) + logger.info("# samples generated %s", len(new_samples)) + logger.info("# samples after %s", n_after) dataset.to_json(args.output_path) From 218e707169ffacc53c958d1de74523d0fe158cd2 Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 11:17:44 +0300 Subject: [PATCH 12/18] update cli for evolver --- autointent/generation/utterances/evolution/cli.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/autointent/generation/utterances/evolution/cli.py b/autointent/generation/utterances/evolution/cli.py index 1f5b9431..f05991ed 100644 --- a/autointent/generation/utterances/evolution/cli.py +++ b/autointent/generation/utterances/evolution/cli.py @@ -1,5 +1,6 @@ """CLI for evolutionary augmenter.""" +import logging from argparse import ArgumentParser from autointent import load_dataset @@ -8,6 +9,9 @@ from .chat_templates import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution +logging.basicConfig(level="INFO") +logger = logging.getLogger(__name__) + def main() -> None: """CLI endpoint.""" @@ -18,6 +22,7 @@ def main() -> None: required=True, help="Path to json or hugging face repo with dataset", ) + parser.add_argument("--split", type=str, default="train") parser.add_argument( "--output-path", type=str, @@ -47,9 +52,15 @@ def main() -> None: evolutions.append(AbstractEvolution()) dataset = load_dataset(args.input_path) + n_before = len(dataset[args.split]) generator = UtteranceEvolver(Generator(), evolutions, args.seed) - generator.augment(dataset, n_evolutions=args.n_evolutions) + new_samples = generator.augment(dataset, 
split_name=args.split, n_evolutions=args.n_evolutions) + n_after = len(dataset[args.split]) + + logger.info("# samples before %s", n_before) + logger.info("# samples generated %s", len(new_samples)) + logger.info("# samples after %s", n_after) dataset.to_json(args.output_path) From 1e04f3e5bc14c0c67cd5f42fcbf62a1088422712 Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 11:41:13 +0300 Subject: [PATCH 13/18] fix shared classvar issue --- autointent/generation/utterances/basic/chat_template.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/autointent/generation/utterances/basic/chat_template.py b/autointent/generation/utterances/basic/chat_template.py index c866d282..98b52aae 100644 --- a/autointent/generation/utterances/basic/chat_template.py +++ b/autointent/generation/utterances/basic/chat_template.py @@ -2,6 +2,7 @@ import random from abc import ABC, abstractmethod +from copy import deepcopy from typing import ClassVar from autointent import Dataset @@ -20,7 +21,7 @@ def __call__(self, intent_data: Intent, n_examples: int) -> list[Message]: class SynthesizerChatTemplate(BaseSynthesizer): """Chat template for generating additional examples for a given intent class.""" - _messages: ClassVar[list[Message]] = [ + __messages: ClassVar[list[Message]] = [ Message( role=Role.USER, content=( @@ -107,6 +108,8 @@ def __init__( if extra_instructions is None: extra_instructions = "" + self._messages = deepcopy(self.__messages) + msg = self._messages[0] msg["content"] = msg["content"].format(extra_instructions=extra_instructions) From 9072524d2e177879850056e485180804c83c46aa Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 11:41:41 +0300 Subject: [PATCH 14/18] add tests for basic chat template --- tests/generation/test_basic_synthesizer.py | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tests/generation/test_basic_synthesizer.py diff --git a/tests/generation/test_basic_synthesizer.py 
b/tests/generation/test_basic_synthesizer.py new file mode 100644 index 00000000..47ed92c0 --- /dev/null +++ b/tests/generation/test_basic_synthesizer.py @@ -0,0 +1,29 @@ +from unittest.mock import Mock + +from autointent.generation.utterances.basic.chat_template import SynthesizerChatTemplate + + +def has_unfilled_fields(template): + try: + # Attempt to format the string with empty values + template.format(**{}) # noqa: PIE804 + return False # No unfilled fields # noqa: TRY300 + except (KeyError, IndexError): + return True # Unfilled named (KeyError) or positional (IndexError) fields detected + + +def test_default_chat_template(dataset): + template = SynthesizerChatTemplate(dataset, split="train_0") + prompt = template(dataset.intents[0], n_examples=1) + for msg in prompt: + assert not has_unfilled_fields(msg["content"]) + assert all("extra_instructions" not in msg["content"] for msg in prompt) # check contents, not list membership + + +def test_extra_instructions(dataset): + template = SynthesizerChatTemplate(dataset, split="train_0", extra_instructions="football") + prompt = template(dataset.intents[0], n_examples=1)[0]["content"] + assert "extra_instructions" not in prompt + assert "football" in prompt + + From 06d1d58bb0c89423e1c54aa31f96599706b37a1a Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 12:17:57 +0300 Subject: [PATCH 15/18] configure explicit import from `generation.utterances` submodule --- autointent/generation/utterances/__init__.py | 14 ++++++++++++++ autointent/generation/utterances/basic/__init__.py | 4 ++++ .../generation/utterances/evolution/__init__.py | 4 ++++ 3 files changed, 22 insertions(+) diff --git a/autointent/generation/utterances/__init__.py b/autointent/generation/utterances/__init__.py index e69de29b..db54310e 100644 --- a/autointent/generation/utterances/__init__.py +++ b/autointent/generation/utterances/__init__.py @@ -0,0 +1,14 @@ +from .basic import SynthesizerChatTemplate, UtteranceGenerator +from .evolution import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution, UtteranceEvolver +from .generator import
Generator + +__all__ = [ + "AbstractEvolution", + "ConcreteEvolution", + "EvolutionChatTemplate", + "Generator", + "ReasoningEvolution", + "SynthesizerChatTemplate", + "UtteranceEvolver", + "UtteranceGenerator", +] diff --git a/autointent/generation/utterances/basic/__init__.py b/autointent/generation/utterances/basic/__init__.py index e69de29b..5ae1c024 100644 --- a/autointent/generation/utterances/basic/__init__.py +++ b/autointent/generation/utterances/basic/__init__.py @@ -0,0 +1,4 @@ +from .chat_template import SynthesizerChatTemplate +from .utterance_generator import UtteranceGenerator + +__all__ = ["SynthesizerChatTemplate", "UtteranceGenerator"] diff --git a/autointent/generation/utterances/evolution/__init__.py b/autointent/generation/utterances/evolution/__init__.py index e69de29b..27007e7f 100644 --- a/autointent/generation/utterances/evolution/__init__.py +++ b/autointent/generation/utterances/evolution/__init__.py @@ -0,0 +1,4 @@ +from .chat_templates import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution +from .evolver import UtteranceEvolver + +__all__ = ["AbstractEvolution", "ConcreteEvolution", "EvolutionChatTemplate", "ReasoningEvolution", "UtteranceEvolver"] From ac2631326dcc85489319fa34483425609a37e56d Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 12:18:10 +0300 Subject: [PATCH 16/18] add tests for augmentation --- tests/generation/test_basic_synthesizer.py | 25 +++++++++++++++- tests/generation/test_evolver.py | 34 ++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 tests/generation/test_evolver.py diff --git a/tests/generation/test_basic_synthesizer.py b/tests/generation/test_basic_synthesizer.py index 47ed92c0..674b3535 100644 --- a/tests/generation/test_basic_synthesizer.py +++ b/tests/generation/test_basic_synthesizer.py @@ -1,6 +1,6 @@ from unittest.mock import Mock -from autointent.generation.utterances.basic.chat_template import SynthesizerChatTemplate 
+from autointent.generation.utterances import SynthesizerChatTemplate, UtteranceGenerator def has_unfilled_fields(template): @@ -27,3 +27,26 @@ def test_extra_instructions(dataset): assert "football" in prompt +def test_on_dataset(dataset): + mock_llm = Mock() + mock_llm.get_chat_completion.return_value = "1. LLM answer" + + split_name = "train_0" + + template = SynthesizerChatTemplate(dataset, split=split_name) + augmenter = UtteranceGenerator(mock_llm, template) + + n_before = len(dataset[split_name]) + new_samples = augmenter.augment(dataset, split_name=split_name, update_split=False) + n_after = len(dataset[split_name]) + + assert n_before == n_after + assert len(new_samples) == len(dataset.intents) + assert all(sample.utterance == "LLM answer" for sample in new_samples) + + n_before = len(dataset[split_name]) + new_samples = augmenter.augment(dataset, split_name=split_name, update_split=True) + n_after = len(dataset[split_name]) + + assert n_before + len(new_samples) == n_after + assert len(new_samples) == len(dataset.intents) diff --git a/tests/generation/test_evolver.py b/tests/generation/test_evolver.py new file mode 100644 index 00000000..c95c9def --- /dev/null +++ b/tests/generation/test_evolver.py @@ -0,0 +1,34 @@ +from unittest.mock import Mock + +from autointent.generation.utterances import AbstractEvolution, UtteranceEvolver + + +def test_default_chat_template(dataset): + template = AbstractEvolution() + prompt = template("some utterance", dataset.intents[0]) + assert "some utterance" in prompt[-1]["content"] + + +def test_on_dataset(dataset): + mock_llm = Mock() + mock_llm.get_chat_completion.return_value = "LLM answer" + + split_name = "train_0" + + template = AbstractEvolution() + augmenter = UtteranceEvolver(mock_llm, [template]) + + n_before = len(dataset[split_name]) + new_samples = augmenter.augment(dataset, split_name=split_name, n_evolutions=1, update_split=False) + n_after = len(dataset[split_name]) + + assert n_before == n_after + assert 
len(new_samples) == n_before + assert all(sample.utterance == "LLM answer" for sample in new_samples) + + n_before = len(dataset[split_name]) + new_samples = augmenter.augment(dataset, split_name=split_name, n_evolutions=1, update_split=True) + n_after = len(dataset[split_name]) + + assert n_before + len(new_samples) == n_after + assert len(new_samples) == n_before From 1adc68b76146a1a8bcce83234bac41ac1eeb860a Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 12:31:03 +0300 Subject: [PATCH 17/18] commit to trigger gh actions --- autointent/generation/utterances/basic/utterance_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autointent/generation/utterances/basic/utterance_generator.py b/autointent/generation/utterances/basic/utterance_generator.py index 0d3c3f06..1d962272 100644 --- a/autointent/generation/utterances/basic/utterance_generator.py +++ b/autointent/generation/utterances/basic/utterance_generator.py @@ -42,7 +42,7 @@ def augment( """ Augment some split of dataset. - Note that for now it supports only single-label datasets. + TODO: support multi-label datasets; for now only single-label datasets are supported. """ original_split = dataset[split_name] new_samples = [] From 24ef2589c413c80dc51b674a1e97455f9f741855 Mon Sep 17 00:00:00 2001 From: voorhs Date: Fri, 31 Jan 2025 13:28:39 +0300 Subject: [PATCH 18/18] fix one doctest --- autointent/modules/scoring/_knn/knn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autointent/modules/scoring/_knn/knn.py b/autointent/modules/scoring/_knn/knn.py index 6de4d162..cda70bd9 100644 --- a/autointent/modules/scoring/_knn/knn.py +++ b/autointent/modules/scoring/_knn/knn.py @@ -43,7 +43,7 @@ class KNNScorer(ScoringModule): .. testoutput:: [[0.67297815 0.32702185] - [0.44031678 0.55968322]] + [0.44031667 0.55968333]] """