Skip to content

Commit

Permalink
Feat/augmentation utterances (#94)
Browse files Browse the repository at this point in the history
* feat: added generation utterances

* feat: update generation

* feat: change prompt templates

* Refactor/move to our dataset class (#100)

* refactor basic utterance generator

* make `load_dataset` utility public

* polish `load_dataset` utility

* move basic utterance generator to `Dataset`

* refactor cli for basic utterance generator

* refactor evolutions module

* some bug fix in basic utterance generation

* some bug fix in evolutionary augmentations

* refactor `Generator` and fix codestyle

* fix typing

* fix import issues

* try to fix

---------

Co-authored-by: Алексеев Илья <[email protected]>
Co-authored-by: voorhs <[email protected]>
  • Loading branch information
3 people authored Jan 27, 2025
1 parent d9807cc commit 1ff18cf
Show file tree
Hide file tree
Showing 27 changed files with 725 additions and 22 deletions.
15 changes: 13 additions & 2 deletions autointent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,18 @@
from ._vector_index import VectorIndex
from ._dataset import Dataset
from ._hash import Hasher
from .context import Context
from .context import Context, load_dataset
from ._pipeline import Pipeline

__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline", "Ranker", "VectorIndex", "setup_logging"]

__all__ = [
"Context",
"Dataset",
"Embedder",
"Hasher",
"Pipeline",
"Ranker",
"VectorIndex",
"load_dataset",
"setup_logging",
]
3 changes: 2 additions & 1 deletion autointent/context/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Core utilities for auto ML features."""

from ._context import Context
from ._utils import load_dataset

__all__ = ["Context"]
__all__ = ["Context", "load_dataset"]
4 changes: 2 additions & 2 deletions autointent/context/_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
VectorIndexConfig,
)

from ._utils import NumpyEncoder, load_data
from ._utils import NumpyEncoder, load_dataset
from .data_handler import DataHandler
from .optimization_info import OptimizationInfo

Expand Down Expand Up @@ -81,7 +81,7 @@ def configure_data(self, config: DataConfig) -> None:
:param config: Configuration for the data handling process.
"""
self.data_handler = DataHandler(
dataset=load_data(config.train_path),
dataset=load_dataset(config.train_path),
random_seed=self.seed,
)

Expand Down
14 changes: 7 additions & 7 deletions autointent/context/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def default(self, obj: Any) -> str | int | float | list[Any] | Any: # noqa: ANN
return super().default(obj)


def load_data(filepath: str | Path) -> Dataset:
def load_dataset(path: str | Path) -> Dataset:
"""
Load data from a specified path or use default sample data.
Load data from a specified path, use default sample data, or load from the Hugging Face Hub.
This function loads a dataset from a JSON file or retrieves sample data
included with the `autointent` package for default multiclass or multilabel
Expand All @@ -50,10 +50,10 @@ def load_data(filepath: str | Path) -> Dataset:
- "default-multilabel": Loads sample multilabel dataset.
:return: A `Dataset` object containing the loaded data.
"""
if filepath == "default-multiclass":
if path == "default-multiclass":
return Dataset.from_hub("AutoIntent/clinc150_subset")
if filepath == "default-multilabel":
if path == "default-multilabel":
return Dataset.from_hub("AutoIntent/clinc150_subset").to_multilabel()
if not Path(filepath).exists():
return Dataset.from_hub(str(filepath))
return Dataset.from_json(filepath)
if not Path(path).exists():
return Dataset.from_hub(str(path))
return Dataset.from_json(path)
1 change: 0 additions & 1 deletion autointent/generation/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
"""Experimental subpackage that someday will evolve into data augmentation tools."""
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from openai import AsyncOpenAI

from autointent import Dataset
from autointent.generation.prompt_scheme import PromptDescription
from autointent.generation.intents.prompt_scheme import PromptDescription
from autointent.schemas import Intent, Sample


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pydantic import BaseModel, field_validator

from autointent.generation.prompts import PROMPT_DESCRIPTION
from autointent.generation.utterances.prompts import PROMPT_DESCRIPTION


class PromptDescription(BaseModel):
Expand Down
Empty file.
Empty file.
119 changes: 119 additions & 0 deletions autointent/generation/utterances/basic/chat_template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
- role: system
content: |
You will be provided with a set of example utterances and the name of the common topic (intent class) of these utterances. Your task is to generate more examples that fit within the same intent class.
Note:
- You can generate similar utterances with only slot values changed
- You can generate completely different utterances from the same intent class
- The intent name may be missing; in that case, infer the intent from the example utterances only
- Example utterances may be missing; in that case, infer them from the intent name only
{extra_instructions}
- role: user
content: |
Intent Class: ordering_pizza
Example Utterances:
1. I want to order a large pepperoni pizza.
2. Can I get a medium cheese pizza with extra olives?
3. Please deliver a small veggie pizza to my address.
Please generate 3 more examples for the provided intent class.
- role: assistant
content: |
1. I'd like to order a large margherita pizza.
2. Can you deliver a medium Hawaiian pizza with extra pineapple?
3. Please send a small BBQ chicken pizza to my home.
- role: user
content: |
Intent Class: booking a hotel
Example Utterances:
1. I need to book a room for two nights in New York.
Please generate 2 more examples for the provided intent class.
- role: assistant
content: |
1. Can you reserve a deluxe room for my trip to Tokyo?
2. I need to book a hotel room with a mountain view in Denver.
- role: user
content: |
Intent Class:
Example Utterances:
1. What is the weather like today?
Please generate 2 more examples for the provided intent class.
- role: assistant
content: |
1. Can you tell me the forecast for tomorrow?
2. Is it going to rain this weekend?
- role: user
content: |
Intent Class: Scheduling a Meeting
Example Utterances:
Please generate 3 more examples for the provided intent class.
- role: assistant
content: |
1. I need to schedule a meeting for next Tuesday.
2. Can you set up a conference call for tomorrow afternoon?
3. Please arrange a meeting with the marketing team next week.
- role: user
content: |
Intent Class: {intent_name}
Example Utterances:
{example_utterances}
Please generate {n_examples} more examples for the provided intent class.
# ### Intent Class: Asking for Directions
# **Example Utterances:**
# 1. "How do I get to the nearest coffee shop?"
# 2. "Can you give me directions to the airport?"
# 3. "What is the best route to the museum from here?"

# ### Intent Class: Making a Restaurant Reservation
# **Example Utterances:**
# 1. "I want to make a reservation for dinner tonight."
# 2. "Can you book a table for two at the Italian restaurant?"
# 3. "Please reserve a table for four at the steakhouse for Saturday evening."

# ### Intent Class: Requesting Technical Support
# **Example Utterances:**
# 1. "I'm having trouble with my laptop."
# 2. "Can you help me fix my Wi-Fi connection?"
# 3. "My software is not working properly, can you assist?"

# ### Intent Class: Inquiring About Product Availability
# **Example Utterances:**
# 1. "Do you have the new iPhone in stock?"
# 2. "Is the blue shirt available in size medium?"
# 3. "Can you check if the latest book by John Doe is available?"

# ### Intent Class: Requesting Account Information
# **Example Utterances:**
# 1. "What is my current account balance?"
# 2. "Can you tell me my recent transactions?"
# 3. "I need to check my account statement for last month."

# ### Intent Class: Booking a Flight
# **Example Utterances:**
# 1. "I want to book a flight to Los Angeles."
# 2. "Can you find me a flight to Paris next week?"
# 3. "Please book a round-trip flight to New York for next month."

# ### Intent Class: Requesting Movie Recommendations
# **Example Utterances:**
# 1. "Can you recommend a good action movie?"
# 2. "What are some highly rated comedies?"
# 3. "I'm in the mood for a romantic film, any suggestions?"

# ### Intent Class: Ordering Groceries
# **Example Utterances:**
# 1. "I need to order some milk and bread."
# 2. "Can you add apples and bananas to my grocery list?"
# 3. "Please order a dozen eggs and a pack of butter."

# You can use these intent names and example utterances to further train or test your language model for generating more examples within each intent class.
83 changes: 83 additions & 0 deletions autointent/generation/utterances/basic/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""CLI for basic utterance generator."""

from argparse import ArgumentParser

from autointent import load_dataset
from autointent.generation.utterances.basic.utterance_generator import LengthType, StyleType, UtteranceGenerator
from autointent.generation.utterances.generator import Generator


def main() -> None:
    """
    CLI endpoint for the basic utterance generator.

    Parses command-line options, loads the dataset given by ``--input-path``
    with ``load_dataset``, runs ``UtteranceGenerator.augment`` on it, and writes
    the dataset to ``--output-path`` as JSON. When ``--output-repo`` is given,
    the dataset is additionally pushed to that repository.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--input-path",
        type=str,
        required=True,
        help="Path to json or hugging face repo with dataset",
    )
    parser.add_argument(
        "--output-path",
        type=str,
        required=True,
        help="Local path where to save result",
    )
    parser.add_argument(
        "--output-repo",
        type=str,
        default=None,
        # This value goes to `dataset.push_to_hub`, so it names a hub repo, not a local path.
        help="Hugging Face Hub repo where to push result (optional)",
    )
    parser.add_argument("--private", action="store_true", help="Publish privately if --output-repo option is used")
    parser.add_argument(
        "--n-generations",
        type=int,
        default=5,
        help="Number of utterances to generate for each intent",
    )
    parser.add_argument(
        "--n-sample-utterances",
        type=int,
        default=5,
        help="Number of utterances to use as an example for augmentation",
    )
    parser.add_argument(
        "--custom-instruction",
        type=str,
        action="append",
        # Adjacent literals are concatenated verbatim, so the separating space must be explicit.
        help="Add extra instructions to default prompt. "
        "You can use this argument multiple times to add multiple instructions",
    )
    parser.add_argument(
        "--length",
        choices=LengthType.__args__,  # type: ignore[attr-defined]
        default="none",
        help="How to extend the prompt with length instruction",
    )
    parser.add_argument(
        "--style",
        choices=StyleType.__args__,  # type: ignore[attr-defined]
        default="none",
        help="How to extend the prompt with style instruction",
    )
    parser.add_argument(
        "--same-punctuation",
        action="store_true",
        help="Whether to extend the prompt with punctuation instruction",
    )
    args = parser.parse_args()

    dataset = load_dataset(args.input_path)
    # `action="append"` leaves the attribute as None when the flag is never passed,
    # hence the fallback to an empty list.
    generator = UtteranceGenerator(
        Generator(), args.custom_instruction or [], args.length, args.style, args.same_punctuation
    )
    # NOTE(review): the return value is ignored and `dataset` itself is saved below,
    # so `augment` presumably updates the dataset in place — confirm.
    generator.augment(dataset, n_generations=args.n_generations, max_sample_utterances=args.n_sample_utterances)

    dataset.to_json(args.output_path)

    if args.output_repo is not None:
        dataset.push_to_hub(args.output_repo, private=args.private)


if __name__ == "__main__":
    main()
14 changes: 14 additions & 0 deletions autointent/generation/utterances/basic/extra_instructions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"length": {
"same": "Generated utterances should have a similar length, in terms of number of words, to the example utterances",
"longer": "Generated utterances can be a little bit longer, in terms of number of words, than the example utterances",
"shorter": "Generated utterances can be a little bit shorter, in terms of number of words, than the example utterances"
},
"style": {
"same": "Generated utterances should follow the same style of conversation as example utterances",
"formal": "Generated utterances should follow a formal style of conversation",
"informal": "Generated utterances don't have to follow a formal style of conversation",
"playful": "Generated utterances can be playful and funny"
},
"punctuation": "Generated utterances should follow the same punctuation style"
}
Loading

0 comments on commit 1ff18cf

Please sign in to comment.