diff --git a/autointent/_dataset/_dataset.py b/autointent/_dataset/_dataset.py
index 774773b2..d15ca8ae 100644
--- a/autointent/_dataset/_dataset.py
+++ b/autointent/_dataset/_dataset.py
@@ -100,13 +100,14 @@ def from_hub(cls, repo_id: str) -> "Dataset":
         :param repo_id: ID of the Hugging Face repository.
         :return: Initialized Dataset object.
         """
-        splits, intents = load_dataset(repo_id), []
+        from ._reader import DictReader
+
+        splits = load_dataset(repo_id)
+        mapping = dict(**splits)
         if Split.INTENTS in get_dataset_config_names(repo_id):
-            intents = load_dataset(repo_id, Split.INTENTS)[Split.INTENTS].to_list()
-        return cls(
-            splits.items(),
-            intents=[Intent.model_validate(intent) for intent in intents],
-        )
+            mapping["intents"] = load_dataset(repo_id, Split.INTENTS)[Split.INTENTS].to_list()
+
+        return DictReader().read(mapping)
 
     def to_multilabel(self) -> "Dataset":
         """
diff --git a/autointent/generation/utterances/__init__.py b/autointent/generation/utterances/__init__.py
index e69de29b..db54310e 100644
--- a/autointent/generation/utterances/__init__.py
+++ b/autointent/generation/utterances/__init__.py
@@ -0,0 +1,14 @@
+from .basic import SynthesizerChatTemplate, UtteranceGenerator
+from .evolution import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution, UtteranceEvolver
+from .generator import Generator
+
+__all__ = [
+    "AbstractEvolution",
+    "ConcreteEvolution",
+    "EvolutionChatTemplate",
+    "Generator",
+    "ReasoningEvolution",
+    "SynthesizerChatTemplate",
+    "UtteranceEvolver",
+    "UtteranceGenerator",
+]
diff --git a/autointent/generation/utterances/basic/__init__.py b/autointent/generation/utterances/basic/__init__.py
index e69de29b..5ae1c024 100644
--- a/autointent/generation/utterances/basic/__init__.py
+++ b/autointent/generation/utterances/basic/__init__.py
@@ -0,0 +1,4 @@
+from .chat_template import SynthesizerChatTemplate
+from .utterance_generator import UtteranceGenerator
+
+__all__ = ["SynthesizerChatTemplate", "UtteranceGenerator"]
diff --git a/autointent/generation/utterances/basic/chat_template.py b/autointent/generation/utterances/basic/chat_template.py
new file mode 100644
index 00000000..98b52aae
--- /dev/null
+++ b/autointent/generation/utterances/basic/chat_template.py
@@ -0,0 +1,134 @@
+"""Chat template for basic synthetic utterance generation."""
+
+import random
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from typing import ClassVar
+
+from autointent import Dataset
+from autointent.generation.utterances.schemas import Message, Role
+from autointent.schemas import Intent
+
+
+class BaseSynthesizer(ABC):
+    """Base class for prompt makers of the basic utterance synthesizer."""
+
+    @abstractmethod
+    def __call__(self, intent_data: Intent, n_examples: int) -> list[Message]:
+        """Generate examples for this intent."""
+
+
+class SynthesizerChatTemplate(BaseSynthesizer):
+    """Chat template for generating additional examples for a given intent class."""
+
+    __messages: ClassVar[list[Message]] = [
+        Message(
+            role=Role.USER,
+            content=(
+                "You will be provided with a set of example utterances and the name "
+                "of the common topic (intent name) of these utterances. "
" + "Your task is to generate more examples that fit within the same intent name.\n\n" + "Note:\n" + "- You can generate similar utterances with only slot values changed\n" + "- You can generate completely different utterance from the same intent name\n" + "- Intent name can be missed, then you should infer from example utterances only\n" + "- Example utterances can be missed, then you should infer from intent name only\n" + "{extra_instructions}\n\n" + "Intent name: ordering_pizza\n\n" + "Example Utterances:\n" + "1. I want to order a large pepperoni pizza.\n" + "2. Can I get a medium cheese pizza with extra olives?\n" + "3. Please deliver a small veggie pizza to my address.\n\n" + "Please generate 3 more examples for the provided intent name." + ), + ), + Message( + role=Role.ASSISTANT, + content=( + "1. I'd like to order a large margherita pizza.\n" + "2. Can you deliver a medium Hawaiian pizza with extra pineapple?\n" + "3. Please send a small BBQ chicken pizza to my home." + ), + ), + Message( + role=Role.USER, + content=( + "Intent name: booking a hotel\n\n" + "Example Utterances:\n" + "1. I need to book a room for two nights in New York.\n\n" + "Please generate 2 more examples for the provided intent name." + ), + ), + Message( + role=Role.ASSISTANT, + content=( + "1. Can you reserve a deluxe room for my trip to Tokyo?\n" + "2. I need to book a hotel room with a mountain view in Denver." + ), + ), + Message( + role=Role.USER, + content=( + "Intent name:\n\n" + "Example Utterances:\n" + "1. What is the weather like today?\n\n" + "Please generate 2 more examples for the provided intent class." + ), + ), + Message( + role=Role.ASSISTANT, + content=("1. Can you tell me the forecast for tomorrow?\n" "2. Is it going to rain this weekend?"), + ), + Message( + role=Role.USER, + content=( + "Intent name: Scheduling a Meeting\n\n" + "Example Utterances:\n\n" + "Please generate 3 more examples for the provided intent class." + ), + ), + Message( + role=Role.ASSISTANT, + content=( + "1. I need to schedule a meeting for next Tuesday.\n" + "2. Can you set up a conference call for tomorrow afternoon?\n" + "3. Please arrange a meeting with the marketing team next week." 
+ ), + ), + ] + + def __init__( + self, + dataset: Dataset, + split: str, + extra_instructions: str | None = None, + max_sample_utterances: int | None = None, + ) -> None: + """Initialize.""" + if extra_instructions is None: + extra_instructions = "" + + self._messages = deepcopy(self.__messages) + + msg = self._messages[0] + msg["content"] = msg["content"].format(extra_instructions=extra_instructions) + + self.dataset = dataset + self.split = split + self.max_sample_utterances = max_sample_utterances + + def __call__(self, intent_data: Intent, n_examples: int) -> list[Message]: + """Generate additional examples for the provided intent class.""" + filtered_split = self.dataset[self.split].filter(lambda sample: sample[Dataset.label_feature] == intent_data.id) + sample_utterances = filtered_split[Dataset.utterance_feature] + if self.max_sample_utterances is not None: + sample_utterances = random.sample(sample_utterances, k=self.max_sample_utterances) + return [ + *self._messages, + Message( + role=Role.USER, + content=f"Intent name: {intent_data.name}\n\n" + f"Example Utterances:\n{sample_utterances}\n\n" + f"Please generate {n_examples} more examples for the provided intent class.\n", + ), + ] diff --git a/autointent/generation/utterances/basic/chat_template.yaml b/autointent/generation/utterances/basic/chat_template.yaml deleted file mode 100644 index 9d512a11..00000000 --- a/autointent/generation/utterances/basic/chat_template.yaml +++ /dev/null @@ -1,119 +0,0 @@ -- role: system - content: | - You will be provided with a set of example utterances and the name of the common topic (intent class) of these utterances. Your task is to generate more examples that fit within the same intent class. - - Note: - - You can generate similar utterances with only slot values changed - - You can generate completely different utterance from the same intent class - - Intent name can be missed, then you should infer from example utterances only - - Example utterances can be missed, then you should infer from intent name only - {extra_instructions} -- role: user - content: | - Intent Class: ordering_pizza - - Example Utterances: - 1. I want to order a large pepperoni pizza. - 2. Can I get a medium cheese pizza with extra olives? - 3. Please deliver a small veggie pizza to my address. - - Please generate 3 more examples for the provided intent class. -- role: assistant - content: | - 1. I'd like to order a large margherita pizza. - 2. Can you deliver a medium Hawaiian pizza with extra pineapple? - 3. Please send a small BBQ chicken pizza to my home. -- role: user - content: | - Intent Class: booking a hotel - - Example Utterances: - 1. I need to book a room for two nights in New York. - - Please generate 2 more examples for the provided intent class. -- role: assistant - content: | - 1. Can you reserve a deluxe room for my trip to Tokyo? - 2. I need to book a hotel room with a mountain view in Denver. -- role: user - content: | - Intent Class: - - Example Utterances: - 1. What is the weather like today? - - Please generate 2 more examples for the provided intent class. -- role: assistant - content: | - 1. Can you tell me the forecast for tomorrow? - 2. Is it going to rain this weekend? -- role: user - content: | - Intent Class: Scheduling a Meeting - - Example Utterances: - - Please generate 3 more examples for the provided intent class. -- role: assistant - content: | - 1. I need to schedule a meeting for next Tuesday. - 2. Can you set up a conference call for tomorrow afternoon? - 3. 
Please arrange a meeting with the marketing team next week. -- role: user - content: | - Intent Class: {intent_name} - - Example Utterances: - {example_utterances} - - Please generate {n_examples} more examples for the provided intent class. - -# ### Intent Class: Asking for Directions -# **Example Utterances:** -# 1. "How do I get to the nearest coffee shop?" -# 2. "Can you give me directions to the airport?" -# 3. "What is the best route to the museum from here?" - -# ### Intent Class: Making a Restaurant Reservation -# **Example Utterances:** -# 1. "I want to make a reservation for dinner tonight." -# 2. "Can you book a table for two at the Italian restaurant?" -# 3. "Please reserve a table for four at the steakhouse for Saturday evening." - -# ### Intent Class: Requesting Technical Support -# **Example Utterances:** -# 1. "I'm having trouble with my laptop." -# 2. "Can you help me fix my Wi-Fi connection?" -# 3. "My software is not working properly, can you assist?" - -# ### Intent Class: Inquiring About Product Availability -# **Example Utterances:** -# 1. "Do you have the new iPhone in stock?" -# 2. "Is the blue shirt available in size medium?" -# 3. "Can you check if the latest book by John Doe is available?" - -# ### Intent Class: Requesting Account Information -# **Example Utterances:** -# 1. "What is my current account balance?" -# 2. "Can you tell me my recent transactions?" -# 3. "I need to check my account statement for last month." - -# ### Intent Class: Booking a Flight -# **Example Utterances:** -# 1. "I want to book a flight to Los Angeles." -# 2. "Can you find me a flight to Paris next week?" -# 3. "Please book a round-trip flight to New York for next month." - -# ### Intent Class: Requesting Movie Recommendations -# **Example Utterances:** -# 1. "Can you recommend a good action movie?" -# 2. "What are some highly rated comedies?" -# 3. "I'm in the mood for a romantic film, any suggestions?" - -# ### Intent Class: Ordering Groceries -# **Example Utterances:** -# 1. "I need to order some milk and bread." -# 2. "Can you add apples and bananas to my grocery list?" -# 3. "Please order a dozen eggs and a pack of butter." - -# You can use these intent names and example utterances to further train or test your language model for generating more examples within each intent class. 
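
A minimal sketch of how the new SynthesizerChatTemplate class is meant to replace the deleted YAML prompt; the dataset path and split name below are placeholders, and any Dataset with intents and the named split would work the same way:

from autointent import load_dataset
from autointent.generation.utterances import SynthesizerChatTemplate

dataset = load_dataset("path/to/dataset.json")  # placeholder: local json path or HF repo id
template = SynthesizerChatTemplate(dataset, split="train", max_sample_utterances=3)

# The template is callable: it returns the few-shot chat plus a final user turn for one intent
messages = template(dataset.intents[0], n_examples=5)
print(messages[-1]["content"])  # "Intent name: ...\n\nExample Utterances: ..."
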
diff --git a/autointent/generation/utterances/basic/cli.py b/autointent/generation/utterances/basic/cli.py index 782776f8..baa6968c 100644 --- a/autointent/generation/utterances/basic/cli.py +++ b/autointent/generation/utterances/basic/cli.py @@ -1,11 +1,17 @@ """CLI for basic utterance generator.""" +import logging from argparse import ArgumentParser from autointent import load_dataset -from autointent.generation.utterances.basic.utterance_generator import LengthType, StyleType, UtteranceGenerator +from autointent.generation.utterances.basic.utterance_generator import UtteranceGenerator from autointent.generation.utterances.generator import Generator +from .chat_template import SynthesizerChatTemplate + +logging.basicConfig(level="INFO") +logger = logging.getLogger(__name__) + def main() -> None: """ClI endpoint.""" @@ -28,6 +34,7 @@ def main() -> None: default=None, help="Local path where to save result", ) + parser.add_argument("--split", type=str, default="train") parser.add_argument("--private", action="store_true", help="Publish privately if --output-repo option is used") parser.add_argument( "--n-generations", @@ -41,37 +48,19 @@ def main() -> None: default=5, help="Number of utterances to use as an example for augmentation", ) - parser.add_argument( - "--custom-instruction", - type=str, - action="append", - help="Add extra instructions to default prompt." - "You can use this argument multiple times to add multiple instructions", - ) - parser.add_argument( - "--length", - choices=LengthType.__args__, # type: ignore[attr-defined] - default="none", - help="How to extend the prompt with length instruction", - ) - parser.add_argument( - "--style", - choices=StyleType.__args__, # type: ignore[attr-defined] - default="none", - help="How to extend the prompt with style instruction", - ) - parser.add_argument( - "--same-punctuation", - action="store_true", - help="Whether to extend the prompt with punctuation instruction", - ) args = parser.parse_args() dataset = load_dataset(args.input_path) - generator = UtteranceGenerator( - Generator(), args.custom_instruction or [], args.length, args.style, args.same_punctuation - ) - generator.augment(dataset, n_generations=args.n_generations, max_sample_utterances=args.n_sample_utterances) + template = SynthesizerChatTemplate(dataset, args.split, max_sample_utterances=args.n_sample_utterances) + generator = UtteranceGenerator(Generator(), template) + + n_before = len(dataset[args.split]) + new_samples = generator.augment(dataset, split_name=args.split, n_generations=args.n_generations) + n_after = len(dataset[args.split]) + + logger.info("# samples before %s", n_before) + logger.info("# samples generated %s", len(new_samples)) + logger.info("# samples after %s", n_after) dataset.to_json(args.output_path) diff --git a/autointent/generation/utterances/basic/extra_instructions.json b/autointent/generation/utterances/basic/extra_instructions.json deleted file mode 100644 index a17c384b..00000000 --- a/autointent/generation/utterances/basic/extra_instructions.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "length": { - "same": "Generated utterances should have the similar length in a sense of number of words to example utterances", - "longer": "Generated utterances can be a little bit longer in a sense of number of words than example utterances", - "shorter": "Generated utterances can be a little bit shorter in a sense of number of words than example utterances" - }, - "style": { - "same": "Generated utterances should follow the same style of conversation as 
example utterances", - "formal": "Generated utterances should follow a formal style of conversation", - "informal": "Generated utterances doesn't have to follow a formal style of conversation", - "playful": "Generated utterances can be playful and funny" - }, - "punctuation": "Generated utterances should follow the same punctuation style" -} diff --git a/autointent/generation/utterances/basic/utterance_generator.py b/autointent/generation/utterances/basic/utterance_generator.py index ecd22de3..1d962272 100644 --- a/autointent/generation/utterances/basic/utterance_generator.py +++ b/autointent/generation/utterances/basic/utterance_generator.py @@ -1,22 +1,15 @@ """Basic generation of new utterances from existing ones.""" -import importlib.resources as ires -import json -import random -from typing import Any, Literal +from collections.abc import Callable -import yaml from datasets import Dataset as HFDataset from datasets import concatenate_datasets from autointent import Dataset from autointent.custom_types import Split from autointent.generation.utterances.generator import Generator -from autointent.generation.utterances.utils import safe_format # type: ignore[attr-defined] -from autointent.schemas import Sample - -LengthType = Literal["none", "same", "longer", "shorter"] -StyleType = Literal["none", "formal", "informal", "playful"] +from autointent.generation.utterances.schemas import Message +from autointent.schemas import Intent, Sample class UtteranceGenerator: @@ -28,34 +21,14 @@ class UtteranceGenerator: punctuation and length of the desired generations. """ - def __init__( - self, - generator: Generator, - custom_instruction: list[str], - length: LengthType, - style: StyleType, - same_punctuation: bool, - ) -> None: + def __init__(self, generator: Generator, prompt_maker: Callable[[Intent, int], list[Message]]) -> None: """Initialize.""" self.generator = generator - prompt_template_yaml = _load_prompt() - self.prompt_template_yaml = _add_extra_instructions( - prompt_template_yaml, - custom_instruction, - length, - style, - same_punctuation, - ) + self.prompt_maker = prompt_maker - def __call__(self, intent_name: str, example_utterances: list[str], n_generations: int) -> list[str]: + def __call__(self, intent_data: Intent, n_generations: int) -> list[str]: """Generate new utterances.""" - messages_yaml = safe_format( - self.prompt_template_yaml, - intent_name=intent_name, - example_utterances=_format_utterances(example_utterances), - n_examples=n_generations, - ) - messages = yaml.safe_load(messages_yaml) + messages = self.prompt_maker(intent_data, n_generations) response_text = self.generator.get_chat_completion(messages) return _extract_utterances(response_text) @@ -64,24 +37,18 @@ def augment( dataset: Dataset, split_name: str = Split.TRAIN, n_generations: int = 5, - max_sample_utterances: int = 5, update_split: bool = True, ) -> list[Sample]: """ Augment some split of dataset. - Note that for now it supports only single-label datasets. + TODO Note that for now it supports only single-label datasets. 
""" original_split = dataset[split_name] new_samples = [] for intent in dataset.intents: - filtered_split = original_split.filter(lambda sample, id=intent.id: sample[Dataset.label_feature] == id) - sample_utterances = filtered_split[Dataset.utterance_feature] - if max_sample_utterances is not None: - sample_utterances = random.sample(sample_utterances, k=max_sample_utterances) generated_utterances = self( - intent_name=intent.name or "", - example_utterances=sample_utterances, + intent_data=intent, n_generations=n_generations, ) new_samples.extend( @@ -93,54 +60,6 @@ def augment( return [Sample(**sample) for sample in new_samples] -def _load_prompt() -> str: - with ires.files("autointent.generation.utterances.basic").joinpath("chat_template.yaml").open() as file: - return file.read() - - -def _load_extra_instructions() -> dict[str, Any]: - with ires.files("autointent.generation.utterances.basic").joinpath("extra_instructions.json").open() as file: - return json.load(file) # type: ignore[no-any-return] - - -def _add_extra_instructions( - prompt_template_yaml: str, - custom_instruction: list[str], - length: LengthType, - style: StyleType, - same_punctuation: bool, -) -> str: - instructions = _load_extra_instructions() - - extra_instructions = [] - if length != "none": - extra_instructions.append(instructions["length"][length]) - if style != "none": - extra_instructions.append(instructions["style"][style]) - if same_punctuation: - extra_instructions.append(instructions["punctuation"]) - - extra_instructions.extend(custom_instruction) - - parsed_extra_instructions = "\n ".join([f"- {s}" for s in extra_instructions]) - return safe_format(prompt_template_yaml, extra_instructions=parsed_extra_instructions) # type: ignore[no-any-return] - - -def _format_utterances(utterances: list[str]) -> str: - """ - Convert given utterances into string that is ready to insert into prompt. - - Given list of utterances, the output string is returned in the following format: - .. code-block:: - 1. I want to order a large pepperoni pizza. - 2. Can I get a medium cheese pizza with extra olives? - 3. Please deliver a small veggie pizza to my address. - - Note that tab is inserted before each line because of how yaml processes multi-line fields. - """ - return "\n ".join(f"{i}. {ut}" for i, ut in enumerate(utterances)) - - def _extract_utterances(response_text: str) -> list[str]: """ Parse LLM output. 
diff --git a/autointent/generation/utterances/evolution/__init__.py b/autointent/generation/utterances/evolution/__init__.py
index e69de29b..27007e7f 100644
--- a/autointent/generation/utterances/evolution/__init__.py
+++ b/autointent/generation/utterances/evolution/__init__.py
@@ -0,0 +1,4 @@
+from .chat_templates import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution
+from .evolver import UtteranceEvolver
+
+__all__ = ["AbstractEvolution", "ConcreteEvolution", "EvolutionChatTemplate", "ReasoningEvolution", "UtteranceEvolver"]
diff --git a/autointent/generation/utterances/evolution/chat_templates/__init__.py b/autointent/generation/utterances/evolution/chat_templates/__init__.py
index e69de29b..8417ff1a 100644
--- a/autointent/generation/utterances/evolution/chat_templates/__init__.py
+++ b/autointent/generation/utterances/evolution/chat_templates/__init__.py
@@ -0,0 +1,6 @@
+from .abstract import AbstractEvolution
+from .base import EvolutionChatTemplate
+from .concrete import ConcreteEvolution
+from .reasoning import ReasoningEvolution
+
+__all__ = ["AbstractEvolution", "ConcreteEvolution", "EvolutionChatTemplate", "ReasoningEvolution"]
diff --git a/autointent/generation/utterances/evolution/chat_templates/abstract.py b/autointent/generation/utterances/evolution/chat_templates/abstract.py
new file mode 100644
index 00000000..4ab88c35
--- /dev/null
+++ b/autointent/generation/utterances/evolution/chat_templates/abstract.py
@@ -0,0 +1,45 @@
+"""Chat template for evolution augmentation via abstraction."""
+
+from typing import ClassVar
+
+from autointent.generation.utterances.schemas import Message, Role
+from autointent.schemas import Intent
+
+from .base import EvolutionChatTemplate
+
+
+class AbstractEvolution(EvolutionChatTemplate):
+    """Chat template for evolution augmentation via abstraction."""
+
+    _messages: ClassVar[list[Message]] = [
+        Message(
+            role=Role.USER,
+            content=(
+                "I want you to act as a rewriter. "
+                "You will be provided with an utterance and the topic (name of intent class) of the utterance. "
+                "You need to complicate the utterance using the following method:\n"
+                "1. Rewrite the utterance by removing specific inquiries or replacing with more generic.\n"
+                "2. Rewritten utterance should be concise and understandable by humans.\n"
+                "3. Rewritten utterance must be fully answerable.\n"
+                "4. Rewritten utterance should not contain more than 10 words.\n\n"
+                "Intent name: Reserve Restaurant\n"
+                "Utterance: I want to reserve a table for 4 persons at 9 pm."
+            ),
+        ),
+        Message(role=Role.ASSISTANT, content="Please, reserve a table for me."),
+        Message(
+            role=Role.USER,
+            content=(
+                "Intent name: requesting technical support\n"
+                "Utterance: My Lenovo laptop is constantly rebooting and overheating."
+            ),
+        ),
+        Message(role=Role.ASSISTANT, content="I'm having trouble with my laptop."),
+    ]
+
+    def __call__(self, utterance: str, intent_data: Intent) -> list[Message]:
+        """Make chat to complete."""
+        return [
+            *self._messages,
+            Message(role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}"),
+        ]
diff --git a/autointent/generation/utterances/evolution/chat_templates/abstract.yaml b/autointent/generation/utterances/evolution/chat_templates/abstract.yaml
deleted file mode 100644
index 37ddc558..00000000
--- a/autointent/generation/utterances/evolution/chat_templates/abstract.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-- role: user
-  content: |
-    {base_instruction}
-    1.
Rewrite the utterance by removing specific inquiries or replacing with more generic. - 2. Rewritten utterance should be concise and understandable by humans. - 3. Rewritten utterance must be fully answerable. - 4. Rewritten utterance should not contain more than 10 words. - - Intent Class: - Reserve Restaurant - - Utterance: - I want to reserve a table for 4 persons at 9 pm. -- role: assistant - content: | - Please, reserve a table for be. -- role: user - content: | - Intent Class: - requesting technical support - - Utterance: - My Lenovo laptop is constantly rebooting and overheating. -- role: assistant - content: | - I'm having trouble with my laptop. -- role: user - content: | - Intent Class: - {intent_name} - - Utterance: - {utterance} diff --git a/autointent/generation/utterances/evolution/chat_templates/base.py b/autointent/generation/utterances/evolution/chat_templates/base.py new file mode 100644 index 00000000..cb119d1b --- /dev/null +++ b/autointent/generation/utterances/evolution/chat_templates/base.py @@ -0,0 +1,14 @@ +"""Base class for chat templates for evolution augmentation.""" + +from abc import ABC, abstractmethod + +from autointent.generation.utterances.schemas import Message +from autointent.schemas import Intent + + +class EvolutionChatTemplate(ABC): + """Base class for chat templates for evolution augmentation.""" + + @abstractmethod + def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: + """Make a chat to complete by LLM.""" diff --git a/autointent/generation/utterances/evolution/chat_templates/base_instruction.txt b/autointent/generation/utterances/evolution/chat_templates/base_instruction.txt deleted file mode 100644 index e2b498a3..00000000 --- a/autointent/generation/utterances/evolution/chat_templates/base_instruction.txt +++ /dev/null @@ -1,3 +0,0 @@ -I want you to act as a rewriter. - You will be provided with an utterance and the topic (name of intent class) of the utterance. - You need to complicate the utterance using the following method: diff --git a/autointent/generation/utterances/evolution/chat_templates/concrete.py b/autointent/generation/utterances/evolution/chat_templates/concrete.py new file mode 100644 index 00000000..dcca78ba --- /dev/null +++ b/autointent/generation/utterances/evolution/chat_templates/concrete.py @@ -0,0 +1,42 @@ +"""Chat template for evolution augmentation via concretizing.""" + +from typing import ClassVar + +from autointent.generation.utterances.schemas import Message, Role +from autointent.schemas import Intent + +from .base import EvolutionChatTemplate + + +class ConcreteEvolution(EvolutionChatTemplate): + """Chat template for evolution augmentation via concretizing.""" + + _messages: ClassVar[list[Message]] = [ + Message( + role=Role.USER, + content=( + "I want you to act as a rewriter. " + "You will be provided with an utterance and the topic (name of intent class) of the utterance. " + "You need to complicate the utterance using the following method:\n" + "1. Rewrite the utterance by removing specific inquiries or replacing with more generic.\n" + "2. Rewritten utterance should be concise and understandable by humans.\n" + "3. Rewritten utterance must be fully answerable.\n" + "4. Rewritten utterance should not contain more than 10 words.\n\n" + "Intent name: Reserve Restaurant\n" + "Utterance: I want to make a reservation for dinner tonight." 
+ ), + ), + Message(role=Role.ASSISTANT, content="I want to reserve a table for 4 persons at 9 pm."), + Message( + role=Role.USER, + content=("Intent name: requesting technical support\n" "Utterance: I'm having trouble with my laptop."), + ), + Message(role=Role.ASSISTANT, content="My laptop is constantly rebooting and overheating."), + ] + + def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: + """Make chat to complete.""" + return [ + *self._messages, + Message(role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}"), + ] diff --git a/autointent/generation/utterances/evolution/chat_templates/concretizing.yaml b/autointent/generation/utterances/evolution/chat_templates/concretizing.yaml deleted file mode 100644 index 67037448..00000000 --- a/autointent/generation/utterances/evolution/chat_templates/concretizing.yaml +++ /dev/null @@ -1,33 +0,0 @@ -- role: user - content: | - {base_instruction} - 1. Rewrite the utterance by removing specific inquiries or replacing with more generic. - 2. Rewritten utterance should be concise and understandable by humans. - 3. Rewritten utterance must be fully answerable. - 4. Rewritten utterance should not contain more than 10 words. - - Intent Class: - Reserve Restaurant - - Utterance: - I want to make a reservation for dinner tonight. -- role: assistant - content: | - I want to reserve a table for 4 persons at 9 pm. -- role: user - content: | - Intent Class: - requesting technical support - - Utterance: - I'm having trouble with my laptop. -- role: assistant - content: | - My laptop is constantly rebooting and overheating. -- role: user - content: | - Intent Class: - {intent_name} - - Utterance: - {utterance} diff --git a/autointent/generation/utterances/evolution/chat_templates/reasoning.py b/autointent/generation/utterances/evolution/chat_templates/reasoning.py new file mode 100644 index 00000000..39791acc --- /dev/null +++ b/autointent/generation/utterances/evolution/chat_templates/reasoning.py @@ -0,0 +1,45 @@ +"""Chat template for evolution augmentation via reasoning.""" + +from typing import ClassVar + +from autointent.generation.utterances.schemas import Message, Role +from autointent.schemas import Intent + +from .base import EvolutionChatTemplate + + +class ReasoningEvolution(EvolutionChatTemplate): + """Chat template for evolution augmentation via reasoning.""" + + _messages: ClassVar[list[Message]] = [ + Message( + role=Role.USER, + content=( + "I want you to act as a rewriter. " + "You will be provided with an utterance and the topic (name of intent class) of the utterance. " + "You need to complicate the utterance using the following method:\n" + "1. Rewrite the utterance by removing specific inquiries or replacing with more generic.\n" + "2. Rewritten utterance should be concise and understandable by humans.\n" + "3. Rewritten utterance must be fully answerable.\n" + "4. Rewritten utterance should not contain more than 10 words.\n\n" + "Intent name: Asking for Directions\n" + "Utterance: How do I get to the nearest coffee shop?" + ), + ), + Message(role=Role.ASSISTANT, content="If there are some place where I can buy a coffee, how can I get there?"), + Message( + role=Role.USER, + content=( + "Intent name: requesting technical support\n" + "Utterance: I want to get help from technical support for my laptop." 
+ ), + ), + Message(role=Role.ASSISTANT, content="I don't know what's happening with my laptop."), + ] + + def __call__(self, utterance: str, intent_data: Intent) -> list[Message]: + """Make chat to complete.""" + return [ + *self._messages, + Message(role=Role.USER, content=f"Intent name: {intent_data.name or ''}\nUtterance: {utterance}"), + ] diff --git a/autointent/generation/utterances/evolution/chat_templates/reasoning.yaml b/autointent/generation/utterances/evolution/chat_templates/reasoning.yaml deleted file mode 100644 index 02c3eb88..00000000 --- a/autointent/generation/utterances/evolution/chat_templates/reasoning.yaml +++ /dev/null @@ -1,33 +0,0 @@ -- role: user - content: | - {base_instruction} - 1. Rewrite the utterance by removing specific inquiries or replacing with more generic. - 2. Rewritten utterance should be concise and understandable by humans. - 3. Rewritten utterance must be fully answerable. - 4. Rewritten utterance should not contain more than 10 words. - - Intent Class: - Asking for Directions - - Utterance: - How do I get to the nearest coffee shop? -- role: assistant - content: | - If there are some place where I can buy a coffee, how can I get there? -- role: user - content: | - Intent Class: - requesting technical support - - Utterance: - I want to get help from technical support for my laptop. -- role: assistant - content: | - I don't know what's happening with my laptop. -- role: user - content: | - Intent Class: - {intent_name} - - Utterance: - {utterance} diff --git a/autointent/generation/utterances/evolution/cli.py b/autointent/generation/utterances/evolution/cli.py index be04e11e..f05991ed 100644 --- a/autointent/generation/utterances/evolution/cli.py +++ b/autointent/generation/utterances/evolution/cli.py @@ -1,14 +1,16 @@ """CLI for evolutionary augmenter.""" +import logging from argparse import ArgumentParser -from typing import TYPE_CHECKING from autointent import load_dataset from autointent.generation.utterances.evolution.evolver import UtteranceEvolver from autointent.generation.utterances.generator import Generator -if TYPE_CHECKING: - from .evolver import EvolutionType +from .chat_templates import AbstractEvolution, ConcreteEvolution, EvolutionChatTemplate, ReasoningEvolution + +logging.basicConfig(level="INFO") +logger = logging.getLogger(__name__) def main() -> None: @@ -20,6 +22,7 @@ def main() -> None: required=True, help="Path to json or hugging face repo with dataset", ) + parser.add_argument("--split", type=str, default="train") parser.add_argument( "--output-path", type=str, @@ -37,33 +40,27 @@ def main() -> None: parser.add_argument("--reasoning", action="store_true", help="Whether to use `Reasoning` evolution") parser.add_argument("--concretizing", action="store_true", help="Whether to use `Concretizing` evolution") parser.add_argument("--abstract", action="store_true", help="Whether to use `Abstract` evolution") - parser.add_argument("--formal", action="store_true", help="Whether to use `Formal` evolution") - parser.add_argument("--informal", action="store_true", help="Whether to use `Informal` evolution") - parser.add_argument("--funny", action="store_true", help="Whether to use `Funny` evolution") - parser.add_argument("--goofy", action="store_true", help="Whether to use `Goofy` evolution") parser.add_argument("--seed", type=int, default=0) args = parser.parse_args() - evolutions: list[EvolutionType] = [] + evolutions: list[EvolutionChatTemplate] = [] if args.reasoning: - evolutions.append("reasoning") + 
evolutions.append(ReasoningEvolution()) if args.concretizing: - evolutions.append("concretizing") + evolutions.append(ConcreteEvolution()) if args.abstract: - evolutions.append("abstract") - if args.formal: - evolutions.append("formal") - if args.informal: - evolutions.append("informal") - if args.funny: - evolutions.append("funny") - if args.goofy: - evolutions.append("goofy") + evolutions.append(AbstractEvolution()) dataset = load_dataset(args.input_path) + n_before = len(dataset[args.split]) generator = UtteranceEvolver(Generator(), evolutions, args.seed) - generator.augment(dataset, n_evolutions=args.n_evolutions) + new_samples = generator.augment(dataset, split_name=args.split, n_evolutions=args.n_evolutions) + n_after = len(dataset[args.split]) + + logger.info("# samples before %s", n_before) + logger.info("# samples generated %s", len(new_samples)) + logger.info("# samples after %s", n_after) dataset.to_json(args.output_path) diff --git a/autointent/generation/utterances/evolution/evolver.py b/autointent/generation/utterances/evolution/evolver.py index 690afd23..bdcf1069 100644 --- a/autointent/generation/utterances/evolution/evolver.py +++ b/autointent/generation/utterances/evolution/evolver.py @@ -4,21 +4,17 @@ Deeply inspired by DeepEval evolutions. """ -import importlib.resources as ires import random -from typing import Literal +from collections.abc import Callable, Sequence -import yaml from datasets import Dataset as HFDataset from datasets import concatenate_datasets from autointent import Dataset from autointent.custom_types import Split from autointent.generation.utterances.generator import Generator -from autointent.generation.utterances.utils import safe_format # type: ignore[attr-defined] -from autointent.schemas import Sample - -EvolutionType = Literal["reasoning", "concretizing", "abstract", "formal", "informal", "funny", "goofy"] +from autointent.generation.utterances.schemas import Message +from autointent.schemas import Intent, Sample class UtteranceEvolver: @@ -29,31 +25,23 @@ class UtteranceEvolver: to change it in a specific way. 
""" - def __init__(self, generator: Generator, evolutions: list[EvolutionType], seed: int = 0) -> None: + def __init__( + self, generator: Generator, prompt_makers: Sequence[Callable[[str, Intent], list[Message]]], seed: int = 0 + ) -> None: """Initialize.""" self.generator = generator - self.evolutions = evolutions - self.prompts = _load_prompts() + self.prompt_makers = prompt_makers random.seed(seed) - def _evolve(self, utterance: str, intent_name: str, evolution: EvolutionType) -> str: + def _evolve(self, utterance: str, intent_data: Intent) -> str: """Apply evolutions single time.""" - messages_yaml = safe_format( - self.prompts[evolution], - base_instruction=self.prompts["base_instruction"], - utterance=utterance, - intent_name=intent_name, - ) - messages = yaml.safe_load(messages_yaml) - return self.generator.get_chat_completion(messages) + maker = random.choice(self.prompt_makers) + chat = maker(utterance, intent_data) + return self.generator.get_chat_completion(chat) - def __call__(self, utterance: str, intent_name: str, n_evolutions: int = 1) -> list[str]: + def __call__(self, utterance: str, intent_data: Intent, n_evolutions: int = 1) -> list[str]: """Apply evolutions mupltiple times.""" - res = [] - for _ in range(n_evolutions): - evolution = random.choice(self.evolutions) - res.append(self._evolve(utterance, intent_name, evolution)) - return res + return [self._evolve(utterance, intent_data) for _ in range(n_evolutions)] def augment( self, dataset: Dataset, split_name: str = Split.TRAIN, n_evolutions: int = 1, update_split: bool = True @@ -68,25 +56,12 @@ def augment( for sample in original_split: utterance = sample[Dataset.utterance_feature] label = sample[Dataset.label_feature] - intent_info = next(intent for intent in dataset.intents if intent.id == label) - generated_utterances = self( - utterance=utterance, intent_name=intent_info.name or "", n_evolutions=n_evolutions - ) + intent_data = next(intent for intent in dataset.intents if intent.id == label) + generated_utterances = self(utterance=utterance, intent_data=intent_data, n_evolutions=n_evolutions) new_samples.extend( - [{Dataset.label_feature: intent_info.id, Dataset.utterance_feature: ut} for ut in generated_utterances] + [{Dataset.label_feature: intent_data.id, Dataset.utterance_feature: ut} for ut in generated_utterances] ) if update_split: generated_split = HFDataset.from_list(new_samples) dataset[split_name] = concatenate_datasets([original_split, generated_split]) return [Sample(**sample) for sample in new_samples] - - -def _load_prompts() -> dict[str, str]: - files = ires.files("autointent.generation.utterances.evolution.chat_templates") - - res = {} - for file_name in ["reasoning.yaml", "concretizing.yaml", "abstract.yaml", "base_instruction.txt"]: - with files.joinpath(file_name).open() as file: - res[file_name.split(".")[0]] = file.read() - - return res diff --git a/autointent/generation/utterances/generator.py b/autointent/generation/utterances/generator.py index 69672278..dfc9ec86 100644 --- a/autointent/generation/utterances/generator.py +++ b/autointent/generation/utterances/generator.py @@ -5,6 +5,8 @@ import openai from dotenv import load_dotenv +from .schemas import Message + class Generator: """Wrapper class for accessing OpenAI API.""" @@ -15,7 +17,7 @@ def __init__(self) -> None: self.client = openai.OpenAI(base_url=os.environ["OPENAI_BASE_URL"], api_key=os.environ["OPENAI_API_KEY"]) self.model_name = os.environ["OPENAI_MODEL_NAME"] - def get_chat_completion(self, messages: list[dict[str, str]]) -> 
str: + def get_chat_completion(self, messages: list[Message]) -> str: """Prompt LLM and return its answer.""" response = self.client.chat.completions.create( messages=messages, # type: ignore[arg-type] diff --git a/autointent/generation/utterances/schemas.py b/autointent/generation/utterances/schemas.py new file mode 100644 index 00000000..e296f6cd --- /dev/null +++ b/autointent/generation/utterances/schemas.py @@ -0,0 +1,18 @@ +"""Schemas that are useful for working with prompts.""" + +from typing import TypedDict + + +class Message(TypedDict): + """Schema for message to LLM.""" + + role: str + content: str + + +class Role: + """Roles in a chat with LLM.""" + + SYSTEM = "system" + USER = "user" + ASSISTANT = "assistant" diff --git a/autointent/generation/utterances/utils.py b/autointent/generation/utterances/utils.py deleted file mode 100644 index 7742ea1f..00000000 --- a/autointent/generation/utterances/utils.py +++ /dev/null @@ -1,27 +0,0 @@ -# type: ignore # noqa: PGH003 - -import string -from typing import Any - - -class SafeFormatter(string.Formatter): - """Utility class for loading prompt templates.""" - - def get_value(self, key, args, kwargs) -> Any: # noqa: ANN001, ANN401 - """Overloaded.""" - if isinstance(key, str): - return kwargs.get(key, "{" + key + "}") - return super().get_value(key, args, kwargs) - - def parse(self, format_string): # noqa: ANN001, ANN201 - """Overloaded.""" - try: - return super().parse(format_string) - except ValueError: - return [(format_string, None, None, None)] - - -def safe_format(format_string: str, *args: tuple[str], **kwargs: dict[str, str]) -> str: - """Format chat template.""" - formatter = SafeFormatter() - return formatter.format(format_string, *args, **kwargs) diff --git a/autointent/modules/scoring/_knn/knn.py b/autointent/modules/scoring/_knn/knn.py index 6de4d162..cda70bd9 100644 --- a/autointent/modules/scoring/_knn/knn.py +++ b/autointent/modules/scoring/_knn/knn.py @@ -43,7 +43,7 @@ class KNNScorer(ScoringModule): .. testoutput:: [[0.67297815 0.32702185] - [0.44031678 0.55968322]] + [0.44031667 0.55968333]] """ diff --git a/tests/generation/test_basic_synthesizer.py b/tests/generation/test_basic_synthesizer.py new file mode 100644 index 00000000..674b3535 --- /dev/null +++ b/tests/generation/test_basic_synthesizer.py @@ -0,0 +1,52 @@ +from unittest.mock import Mock + +from autointent.generation.utterances import SynthesizerChatTemplate, UtteranceGenerator + + +def has_unfilled_fields(template): + try: + # Attempt to format the string with empty values + template.format(**{}) # noqa: PIE804 + return False # No unfilled fields # noqa: TRY300 + except KeyError: + return True # Unfilled fields detected + + +def test_default_chat_template(dataset): + template = SynthesizerChatTemplate(dataset, split="train_0") + prompt = template(dataset.intents[0], n_examples=1) + for msg in prompt: + assert not has_unfilled_fields(msg["content"]) + assert "extra_instructions" not in prompt + + +def test_extra_instructions(dataset): + template = SynthesizerChatTemplate(dataset, split="train_0", extra_instructions="football") + prompt = template(dataset.intents[0], n_examples=1)[0]["content"] + assert "extra_instructions" not in prompt + assert "football" in prompt + + +def test_on_dataset(dataset): + mock_llm = Mock() + mock_llm.get_chat_completion.return_value = "1. 
LLM answer" + + split_name = "train_0" + + template = SynthesizerChatTemplate(dataset, split=split_name) + augmenter = UtteranceGenerator(mock_llm, template) + + n_before = len(dataset[split_name]) + new_samples = augmenter.augment(dataset, split_name=split_name, update_split=False) + n_after = len(dataset[split_name]) + + assert n_before == n_after + assert len(new_samples) == len(dataset.intents) + assert all(sample.utterance == "LLM answer" for sample in new_samples) + + n_before = len(dataset[split_name]) + new_samples = augmenter.augment(dataset, split_name=split_name, update_split=True) + n_after = len(dataset[split_name]) + + assert n_before + len(new_samples) == n_after + assert len(new_samples) == len(dataset.intents) diff --git a/tests/generation/test_evolver.py b/tests/generation/test_evolver.py new file mode 100644 index 00000000..c95c9def --- /dev/null +++ b/tests/generation/test_evolver.py @@ -0,0 +1,34 @@ +from unittest.mock import Mock + +from autointent.generation.utterances import AbstractEvolution, UtteranceEvolver + + +def test_default_chat_template(dataset): + template = AbstractEvolution() + prompt = template("some utterance", dataset.intents[0]) + assert "some utterance" in prompt[-1]["content"] + + +def test_on_dataset(dataset): + mock_llm = Mock() + mock_llm.get_chat_completion.return_value = "LLM answer" + + split_name = "train_0" + + template = AbstractEvolution() + augmenter = UtteranceEvolver(mock_llm, [template]) + + n_before = len(dataset[split_name]) + new_samples = augmenter.augment(dataset, split_name=split_name, n_evolutions=1, update_split=False) + n_after = len(dataset[split_name]) + + assert n_before == n_after + assert len(new_samples) == n_before + assert all(sample.utterance == "LLM answer" for sample in new_samples) + + n_before = len(dataset[split_name]) + new_samples = augmenter.augment(dataset, split_name=split_name, n_evolutions=1, update_split=True) + n_after = len(dataset[split_name]) + + assert n_before + len(new_samples) == n_after + assert len(new_samples) == n_before
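
End-to-end usage of the reworked evolution API, mirroring the CLI above. A sketch assuming the OPENAI_BASE_URL, OPENAI_API_KEY, and OPENAI_MODEL_NAME environment variables required by Generator are set; the repo id, split name, and output path are placeholders:

from autointent import load_dataset
from autointent.generation.utterances import (
    AbstractEvolution,
    ConcreteEvolution,
    Generator,
    UtteranceEvolver,
)

dataset = load_dataset("my-org/my-intents")  # placeholder: local json path or HF repo id

# Each utterance in the split is rewritten once with a randomly chosen evolution template;
# the new samples are appended to the split in place (update_split defaults to True)
evolver = UtteranceEvolver(Generator(), [AbstractEvolution(), ConcreteEvolution()], seed=0)
new_samples = evolver.augment(dataset, split_name="train", n_evolutions=1)

dataset.to_json("augmented.json")  # placeholder output path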