Feat/augmentation utterances (#94)

* feat: added generation utterances * feat: update generation * feat: change prompt templates * Refactor/move to our dataset class (#100) * refactor basic utterance generator * make `load_dataset` utility public * polish `load_dataset` utility * move basic utterance generator to `Dataset` * refactor cli for basic utterance generator * refactor evolutions module * some bug fix in basic utterance generation * some bug fix in evolutionary augmentations * refactor `Generator` and fix codestyle * fix typing * fix import issues * try to fix --------- Co-authored-by: Алексеев Илья <[email protected]> Co-authored-by: voorhs <[email protected]>
deeppavlov · Jan 27, 2025 · 1ff18cf · 1ff18cf
1 parent d9807cc
commit 1ff18cf
Show file tree

Hide file tree

Showing 27 changed files with 725 additions and 22 deletions.
diff --git a/autointent/__init__.py b/autointent/__init__.py
@@ -6,7 +6,18 @@
 from ._vector_index import VectorIndex
 from ._dataset import Dataset
 from ._hash import Hasher
-from .context import Context
+from .context import Context, load_dataset
 from ._pipeline import Pipeline
 
-__all__ = ["Context", "Dataset", "Embedder", "Hasher", "Pipeline", "Ranker", "VectorIndex", "setup_logging"]
+
+__all__ = [
+    "Context",
+    "Dataset",
+    "Embedder",
+    "Hasher",
+    "Pipeline",
+    "Ranker",
+    "VectorIndex",
+    "load_dataset",
+    "setup_logging",
+]
diff --git a/autointent/context/__init__.py b/autointent/context/__init__.py
@@ -1,5 +1,6 @@
 """Core utilities for auto ML features."""
 
 from ._context import Context
+from ._utils import load_dataset
 
-__all__ = ["Context"]
+__all__ = ["Context", "load_dataset"]
diff --git a/autointent/context/_context.py b/autointent/context/_context.py
@@ -17,7 +17,7 @@
     VectorIndexConfig,
 )
 
-from ._utils import NumpyEncoder, load_data
+from ._utils import NumpyEncoder, load_dataset
 from .data_handler import DataHandler
 from .optimization_info import OptimizationInfo
 
@@ -81,7 +81,7 @@ def configure_data(self, config: DataConfig) -> None:
         :param config: Configuration for the data handling process.
         """
         self.data_handler = DataHandler(
-            dataset=load_data(config.train_path),
+            dataset=load_dataset(config.train_path),
             random_seed=self.seed,
         )
 

diff --git a/autointent/context/_utils.py b/autointent/context/_utils.py
@@ -37,9 +37,9 @@ def default(self, obj: Any) -> str | int | float | list[Any] | Any:  # noqa: ANN
         return super().default(obj)
 
 
-def load_data(filepath: str | Path) -> Dataset:
+def load_dataset(path: str | Path) -> Dataset:
     """
-    Load data from a specified path or use default sample data.
+    Load data from a specified path, use default sample data, or load from the Hugging Face Hub.
 
     This function loads a dataset from a JSON file or retrieves sample data
     included with the `autointent` package for default multiclass or multilabel
@@ -50,10 +50,10 @@ def load_data(filepath: str | Path) -> Dataset:
                       - "default-multilabel": Loads sample multilabel dataset.
     :return: A `Dataset` object containing the loaded data.
     """
-    if filepath == "default-multiclass":
+    if path == "default-multiclass":
         return Dataset.from_hub("AutoIntent/clinc150_subset")
-    if filepath == "default-multilabel":
+    if path == "default-multilabel":
         return Dataset.from_hub("AutoIntent/clinc150_subset").to_multilabel()
-    if not Path(filepath).exists():
-        return Dataset.from_hub(str(filepath))
-    return Dataset.from_json(filepath)
+    if not Path(path).exists():
+        return Dataset.from_hub(str(path))
+    return Dataset.from_json(path)
diff --git a/autointent/generation/__init__.py b/autointent/generation/__init__.py
@@ -1 +0,0 @@
-"""Experimental subpackage that someday will evolve into data augmentation tools."""

diff --git a/autointent/generation/intents/__init__.py b/autointent/generation/intents/__init__.py
diff --git a/...tent/generation/description_generation.py → ...eration/intents/description_generation.py b/...tent/generation/description_generation.py → ...eration/intents/description_generation.py
@@ -7,7 +7,7 @@
 from openai import AsyncOpenAI
 
 from autointent import Dataset
-from autointent.generation.prompt_scheme import PromptDescription
+from autointent.generation.intents.prompt_scheme import PromptDescription
 from autointent.schemas import Intent, Sample
 
 

diff --git a/autointent/generation/prompt_scheme.py → ...ntent/generation/intents/prompt_scheme.py b/autointent/generation/prompt_scheme.py → ...ntent/generation/intents/prompt_scheme.py
@@ -2,7 +2,7 @@
 
 from pydantic import BaseModel, field_validator
 
-from autointent.generation.prompts import PROMPT_DESCRIPTION
+from autointent.generation.utterances.prompts import PROMPT_DESCRIPTION
 
 
 class PromptDescription(BaseModel):

diff --git a/autointent/generation/utterances/__init__.py b/autointent/generation/utterances/__init__.py
diff --git a/autointent/generation/utterances/basic/__init__.py b/autointent/generation/utterances/basic/__init__.py
diff --git a/autointent/generation/utterances/basic/chat_template.yaml b/autointent/generation/utterances/basic/chat_template.yaml
@@ -0,0 +1,119 @@
+- role: system
+  content: |
+    You will be provided with a set of example utterances and the name of the common topic (intent class) of these utterances. Your task is to generate more examples that fit within the same intent class.
+
+    Note:
+    - You can generate similar utterances with only slot values changed
+    - You can generate completely different utterances from the same intent class
+    - The intent name may be missing; in that case, infer the intent from the example utterances only
+    - Example utterances may be missing; in that case, infer from the intent name only
+    {extra_instructions}
+- role: user
+  content: |
+    Intent Class: ordering_pizza
+
+    Example Utterances:
+    1. I want to order a large pepperoni pizza.
+    2. Can I get a medium cheese pizza with extra olives?
+    3. Please deliver a small veggie pizza to my address.
+
+    Please generate 3 more examples for the provided intent class.
+- role: assistant
+  content: |
+    1. I'd like to order a large margherita pizza.
+    2. Can you deliver a medium Hawaiian pizza with extra pineapple?
+    3. Please send a small BBQ chicken pizza to my home.
+- role: user
+  content: |
+    Intent Class: booking a hotel
+
+    Example Utterances:
+    1. I need to book a room for two nights in New York.
+
+    Please generate 2 more examples for the provided intent class.
+- role: assistant
+  content: |
+    1. Can you reserve a deluxe room for my trip to Tokyo?
+    2. I need to book a hotel room with a mountain view in Denver.
+- role: user
+  content: |
+    Intent Class:
+
+    Example Utterances:
+    1. What is the weather like today?
+
+    Please generate 2 more examples for the provided intent class.
+- role: assistant
+  content: |
+    1. Can you tell me the forecast for tomorrow?
+    2. Is it going to rain this weekend?
+- role: user
+  content: |
+    Intent Class: Scheduling a Meeting
+
+    Example Utterances:
+
+    Please generate 3 more examples for the provided intent class.
+- role: assistant
+  content: |
+    1. I need to schedule a meeting for next Tuesday.
+    2. Can you set up a conference call for tomorrow afternoon?
+    3. Please arrange a meeting with the marketing team next week.
+- role: user
+  content: |
+    Intent Class: {intent_name}
+
+    Example Utterances:
+    {example_utterances}
+
+    Please generate {n_examples} more examples for the provided intent class.
+
+# ### Intent Class: Asking for Directions
+# **Example Utterances:**
+# 1. "How do I get to the nearest coffee shop?"
+# 2. "Can you give me directions to the airport?"
+# 3. "What is the best route to the museum from here?"
+
+# ### Intent Class: Making a Restaurant Reservation
+# **Example Utterances:**
+# 1. "I want to make a reservation for dinner tonight."
+# 2. "Can you book a table for two at the Italian restaurant?"
+# 3. "Please reserve a table for four at the steakhouse for Saturday evening."
+
+# ### Intent Class: Requesting Technical Support
+# **Example Utterances:**
+# 1. "I'm having trouble with my laptop."
+# 2. "Can you help me fix my Wi-Fi connection?"
+# 3. "My software is not working properly, can you assist?"
+
+# ### Intent Class: Inquiring About Product Availability
+# **Example Utterances:**
+# 1. "Do you have the new iPhone in stock?"
+# 2. "Is the blue shirt available in size medium?"
+# 3. "Can you check if the latest book by John Doe is available?"
+
+# ### Intent Class: Requesting Account Information
+# **Example Utterances:**
+# 1. "What is my current account balance?"
+# 2. "Can you tell me my recent transactions?"
+# 3. "I need to check my account statement for last month."
+
+# ### Intent Class: Booking a Flight
+# **Example Utterances:**
+# 1. "I want to book a flight to Los Angeles."
+# 2. "Can you find me a flight to Paris next week?"
+# 3. "Please book a round-trip flight to New York for next month."
+
+# ### Intent Class: Requesting Movie Recommendations
+# **Example Utterances:**
+# 1. "Can you recommend a good action movie?"
+# 2. "What are some highly rated comedies?"
+# 3. "I'm in the mood for a romantic film, any suggestions?"
+
+# ### Intent Class: Ordering Groceries
+# **Example Utterances:**
+# 1. "I need to order some milk and bread."
+# 2. "Can you add apples and bananas to my grocery list?"
+# 3. "Please order a dozen eggs and a pack of butter."
+
+# You can use these intent names and example utterances to further train or test your language model for generating more examples within each intent class.
diff --git a/autointent/generation/utterances/basic/cli.py b/autointent/generation/utterances/basic/cli.py
@@ -0,0 +1,83 @@
+"""CLI for basic utterance generator."""
+
+from argparse import ArgumentParser
+
+from autointent import load_dataset
+from autointent.generation.utterances.basic.utterance_generator import LengthType, StyleType, UtteranceGenerator
+from autointent.generation.utterances.generator import Generator
+
+
+def main() -> None:
+    """ClI endpoint."""
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--input-path",
+        type=str,
+        required=True,
+        help="Path to json or hugging face repo with dataset",
+    )
+    parser.add_argument(
+        "--output-path",
+        type=str,
+        required=True,
+        help="Local path where to save result",
+    )
+    parser.add_argument(
+        "--output-repo",
+        type=str,
+        default=None,
+        help="Local path where to save result",
+    )
+    parser.add_argument("--private", action="store_true", help="Publish privately if --output-repo option is used")
+    parser.add_argument(
+        "--n-generations",
+        type=int,
+        default=5,
+        help="Number of utterances to generate for each intent",
+    )
+    parser.add_argument(
+        "--n-sample-utterances",
+        type=int,
+        default=5,
+        help="Number of utterances to use as an example for augmentation",
+    )
+    parser.add_argument(
+        "--custom-instruction",
+        type=str,
+        action="append",
+        help="Add extra instructions to default prompt."
+        "You can use this argument multiple times to add multiple instructions",
+    )
+    parser.add_argument(
+        "--length",
+        choices=LengthType.__args__,  # type: ignore[attr-defined]
+        default="none",
+        help="How to extend the prompt with length instruction",
+    )
+    parser.add_argument(
+        "--style",
+        choices=StyleType.__args__,  # type: ignore[attr-defined]
+        default="none",
+        help="How to extend the prompt with style instruction",
+    )
+    parser.add_argument(
+        "--same-punctuation",
+        action="store_true",
+        help="Whether to extend the prompt with punctuation instruction",
+    )
+    args = parser.parse_args()
+
+    dataset = load_dataset(args.input_path)
+    generator = UtteranceGenerator(
+        Generator(), args.custom_instruction or [], args.length, args.style, args.same_punctuation
+    )
+    generator.augment(dataset, n_generations=args.n_generations, max_sample_utterances=args.n_sample_utterances)
+
+    dataset.to_json(args.output_path)
+
+    if args.output_repo is not None:
+        dataset.push_to_hub(args.output_repo, private=args.private)
+
+
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
diff --git a/autointent/generation/utterances/basic/extra_instructions.json b/autointent/generation/utterances/basic/extra_instructions.json
@@ -0,0 +1,14 @@
+{
+    "length": {
+        "same": "Generated utterances should have a similar length, in terms of number of words, to the example utterances",
+        "longer": "Generated utterances can be a little bit longer, in terms of number of words, than the example utterances",
+        "shorter": "Generated utterances can be a little bit shorter, in terms of number of words, than the example utterances"
+    },
+    "style": {
+        "same": "Generated utterances should follow the same style of conversation as example utterances",
+        "formal": "Generated utterances should follow a formal style of conversation",
+        "informal": "Generated utterances don't have to follow a formal style of conversation",
+        "playful": "Generated utterances can be playful and funny"
+    },
+    "punctuation": "Generated utterances should follow the same punctuation style"
+}
Original file line number	Diff line number	Diff line change
		@@ -1 +0,0 @@
		"""Experimental subpackage that someday will evolve into data augmentation tools."""