Feature summarize to doc #12

Open · wants to merge 4 commits into base: next
3 changes: 3 additions & 0 deletions .hydra-conf/config.yaml
@@ -43,4 +43,7 @@ services_broker:
url: ${oc.env:SERVICES_BROKER,redis://localhost:6379}
password: ${oc.env:BROKER_PASS,EMPTY}

document:
min_chunk_size: ${oc.decode:${oc.env:MIN_CHUNK_SIZE,null}}

debug: false
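Review note: for anyone unfamiliar with the resolver chain used above, here is a minimal sketch of how `document.min_chunk_size` resolves, assuming standard OmegaConf `oc.env`/`oc.decode` behavior; the environment value is illustrative.

```python
# Sketch of the oc.decode/oc.env chain from the config above (illustrative value).
import os
from omegaconf import OmegaConf

os.environ["MIN_CHUNK_SIZE"] = "120"

cfg = OmegaConf.create(
    {"document": {"min_chunk_size": "${oc.decode:${oc.env:MIN_CHUNK_SIZE,null}}"}}
)

# oc.env reads the variable as a string ("120") and oc.decode converts it to an int;
# when MIN_CHUNK_SIZE is unset, the default "null" decodes to None.
print(cfg.document.min_chunk_size)  # -> 120
```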
9 changes: 7 additions & 2 deletions .hydra-conf/services/fr.yaml
@@ -1,5 +1,5 @@
fr:
type: summary
type: document
fields: 2
name: summarize-fr
route: summarize-fr
@@ -20,4 +20,9 @@ fr:
reduceSummary: false
consolidateSummary: false
reduce_prompt: null
type: abstractive
type: markdown
document:
template_path: ./templates/template.md
max_sentences_title_generation: ${oc.decode:${oc.env:MAX_SENTENCES_TITLE_GENERATION,6}}
doc_title_prompt: ./prompts/doc_title_prompt.txt
paragraph_title_prompt: ./prompts/paragraph_title_prompt.txt
20 changes: 11 additions & 9 deletions app/backends/backend.py
@@ -39,9 +39,12 @@ def __init__(self, task_data):
self.logger.info(f"Setting up backend with params: {task_data['backendParams']} for task: {task_data['task_id']}")
self.task_id = task_data['task_id']
self.content = task_data['content']
self.name = task_data['name']
self.task_type = task_data['type']
self.promptFields = task_data['fields']
try:
# Load prompt from txt file
self.loadPrompt(task_data["type"], task_data["fields"])
self.loadPrompt()

# Set default values for all attributes
for key, default_value in cfg.backend_defaults.items():
@@ -56,12 +59,12 @@ def __init__(self, task_data):
setattr(self, key, value)

# Set up tokenizer and chunker
self.tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
self.tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer", legacy=False)
self.prompt_token_count = len(self.tokenizer(self.prompt)['input_ids'])
self.chunker = Chunker(self.tokenizer, self.createNewTurnAfter)

if (task_data["backendParams"]['reduceSummary'] == True) and (task_data["backendParams"]["reduce_prompt"] is not None):
self.load_reduce_prompt(task_data["type"], task_data["backendParams"]["reduce_prompt"])
self.load_reduce_prompt(task_data["name"], task_data["backendParams"]["reduce_prompt"])
else :
self.reduce_prompt = None

@@ -71,20 +74,19 @@ def __init__(self, task_data):
self.logger.error(f"Error setting up backend: {e}")
raise e

def loadPrompt(self, service_name: str, fieldCount: int = 0):
def loadPrompt(self):
"""
Loads the prompt from a text file and sets the prompt fields.

Args:
service_name (str): The name of the service to load the prompt for.
fieldCount (int, optional): The number of prompt fields. Defaults to 0.
"""
self.logger.info(f"Loading prompt for service: {service_name}")
self.promptFields = fieldCount
self.logger.info(f"Loading prompt for service: {self.name}")
self.logger.info(f"Prompt fields: {self.promptFields}")

# Construct path to the prompt text file
txt_filepath = os.path.join(cfg.prompt_path,f'{service_name}.txt')
txt_filepath = os.path.join(cfg.prompt_path,f'{self.name}.txt')
try:
with open(txt_filepath, 'r') as f:
# Prevent file system caching
@@ -95,8 +97,8 @@ def loadPrompt(self, service_name: str, fieldCount: int = 0):
self.logger.error(f"Error loading prompt from {txt_filepath}: {e}")
raise e

def load_reduce_prompt(self, service_name: str, reduce_prompt: str):
self.logger.info(f"Loading reduce prompt for service: {service_name}")
def load_reduce_prompt(self, reduce_prompt: str):
self.logger.info(f"Loading reduce prompt for service: {self.name}")

# Construct path to the prompt text file
txt_filepath = os.path.join(cfg.prompt_path,f'{reduce_prompt}.txt')
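Review note: with `name` and `type` now separate, prompt files are resolved by the service name rather than its type. A hedged sketch of the lookup; the value of `cfg.prompt_path` is an assumption, only the `prompts/<name>.txt` convention comes from the diff.

```python
# Hedged sketch of the prompt lookup after this change: prompts are keyed by the
# service *name* (e.g. "summarize-fr"), not by its type.
import os

prompt_path = "./prompts"        # assumed value of cfg.prompt_path
service_name = "summarize-fr"    # task_data["name"], forwarded by ingress.py

txt_filepath = os.path.join(prompt_path, f"{service_name}.txt")
with open(txt_filepath, "r") as f:
    prompt = f.read()
print(f"Loaded {len(prompt)} characters for {service_name}")
```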
149 changes: 149 additions & 0 deletions app/backends/document.py
@@ -0,0 +1,149 @@
from pydantic import BaseModel
import asyncio
import jinja2
from typing import List, Sequence
from pathlib import Path
import os

import logging

from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
import re
from conf import cfg_instance

# Load configuration
cfg = cfg_instance(cfg_name="config")
semantic_chunker = SemanticChunker(HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"), min_chunk_size=cfg.document.min_chunk_size)

class Chapter(BaseModel):
title: str
content: str

class DocGenerator:
def __init__(self, task_data, llm_adapter) -> None:
self.document_config = task_data['backendParams']["document"]
self.llm_adapter = llm_adapter
self.chunker = semantic_chunker
self.template_path = self.document_config["template_path"]
self.template_file = Path(self.template_path).name
self.max_sentences_title_generation = self.document_config["max_sentences_title_generation"]
self.semaphore = asyncio.Semaphore(cfg.semaphore.max_concurrent_inferences)
self.logger = logging.getLogger("DocGenerator")
# System prompts
self.load_prompts(task_data)

def load_prompts(self, task_data):
"""
Loads the title generation prompts from text files.
"""

# Construct path to the prompt text files from task config
doc_title_prompt_path = self.document_config["doc_title_prompt"]
paragraph_title_prompt_path = self.document_config["paragraph_title_prompt"]
try:
with open(doc_title_prompt_path, 'r') as f:
# Prevent file system caching
os.fsync(f.fileno())
self.doc_title_prompt = f.read()
with open(paragraph_title_prompt_path, 'r') as f:
# Prevent file system caching
os.fsync(f.fileno())
self.paragraph_title_prompt = f.read()
except Exception as e:
self.logger.error(f"Error loading title generation prompts: {e}")
raise e

def split_summary_into_paragraphs(self, summary):
"""
Splits the provided summary text into paragraphs.
Args:
summary (str): The summary text to be split into paragraphs.
Returns:
list: A list of paragraphs obtained from the summary text.
"""

return self.chunker.split_text(summary)


async def create_chapters(self, paragraphs: List[str]) -> List:
"""
Asynchronously creates chapters from a list of paragraphs.
This method processes each paragraph, generates a title using the first max_sentences_title_generation sentences,
and creates a Chapter object with the generated title and the paragraph content.
Args:
paragraphs (List[str]): A list of paragraphs to be converted into chapters.
Returns:
Sequence[Chapter]: A sequence of Chapter objects created from the paragraphs.
"""

chapters: List[Chapter] = []

# Process each paragraph
async def process_paragraph(paragraph: str):
async with self.semaphore:
sentences = re.split(self.chunker.sentence_split_regex, paragraph)[:self.max_sentences_title_generation]
title = await self.llm_adapter.async_publish(' '.join(sentences), system_prompt=self.paragraph_title_prompt, temperature=0.5, max_tokens=20)
return Chapter(title=title, content=paragraph)

tasks = [process_paragraph(paragraph) for paragraph in paragraphs]

# Generate chapters
chapters = await asyncio.gather(*tasks)

# Generate document title based on chapter titles
chapter_titles = [chapter.title for chapter in chapters]
title = await self.llm_adapter.async_publish('\n'.join(chapter_titles), system_prompt=self.doc_title_prompt, temperature=0.5, max_tokens=20)
return title, chapters


def render_document(self, title: str, chapters: Sequence[Chapter]) -> str:
"""
Renders a document using a Jinja template.
Args:
title (str): The title of the document.
chapters (Sequence[Chapter]): A sequence of Chapter objects to be included in the document.
Returns:
str: The rendered document as a string.
"""

chapters_to_render = [
{
"title": chapter.title,
"paragraph": chapter.content,
}
for chapter in chapters if chapter.content
]

# Load Jinja template
current_dir = Path(self.template_path).parent
jinja_environment = jinja2.Environment(loader=jinja2.FileSystemLoader(current_dir))
template = jinja_environment.get_template(self.template_file)

# Render template
rendered = template.render(
title=title,
chapters=chapters_to_render,
)

return rendered

def run(self, summary: str) -> str:
"""
Pipeline that splits a summary into paragraphs, segments them into titled chapters, and renders a markdown document.
Args:
summary (str): The input summary to process.
Returns:
str: The rendered markdown document.
"""
self.logger.info("Running document generation")

# Split summary into paragraphs
paragraphs = self.split_summary_into_paragraphs(summary)

# Create chapters
title, chapters = asyncio.run(self.create_chapters(paragraphs))

# Render document
return self.render_document(title, chapters)

# TODO: add min_chunk_size to the config and use it in the semantic split
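Review note: `templates/template.md` itself is not part of this diff. Below is a stand-in sketch of a template that would satisfy `render_document`, which passes `title` plus a list of chapters carrying `title` and `paragraph` keys; the inline template and sample values are illustrative only.

```python
# Stand-in demonstration of what render_document expects from the Jinja template.
# The real templates/template.md is not in this PR; this inline template only
# mirrors the variables render_document passes (title, chapters[].title,
# chapters[].paragraph).
from pathlib import Path
import jinja2

template_text = (
    "# {{ title }}\n\n"
    "{% for chapter in chapters %}"
    "## {{ chapter.title }}\n\n"
    "{{ chapter.paragraph }}\n\n"
    "{% endfor %}"
)

tmp_dir = Path("./_template_demo")
tmp_dir.mkdir(exist_ok=True)
(tmp_dir / "template.md").write_text(template_text)

env = jinja2.Environment(loader=jinja2.FileSystemLoader(tmp_dir))
markdown = env.get_template("template.md").render(
    title="Weekly meeting",
    chapters=[{"title": "Budget", "paragraph": "Spending stayed within plan."}],
)
print(markdown)
```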
12 changes: 9 additions & 3 deletions app/backends/llm_inference.py
@@ -1,13 +1,14 @@
import asyncio
from .backend import LLMBackend
from .batch_manager import BatchManager

from .document import DocGenerator

class LLMInferenceEngine(LLMBackend):
def __init__(self, task_data: dict, celery_task):
super().__init__(task_data)
self.batch_manager = BatchManager(task_data,self.tokenizer, self.prompt, self.prompt_token_count, self.reduce_prompt, celery_task)

if self.task_type == "document":
self.doc_generator = DocGenerator(task_data, self.batch_manager.openai_adapter)

def run(self) -> str:
"""
@@ -44,5 +45,10 @@ def run(self) -> str:
# consolidate turns for progressive summary
if self.consolidateSummary:
self.summary = self.chunker.consolidate_turns(self.summary)

self.summary = self.batch_manager.format_summary(self.summary)

if self.task_type == "document":
self.summary = self.doc_generator.run(self.summary)

return self.batch_manager.format_summary(self.summary)
return self.summary
72 changes: 58 additions & 14 deletions app/backends/openai_adapter.py
@@ -3,6 +3,9 @@
from typing import List
from tenacity import retry, stop_after_attempt, wait_random_exponential
from conf import cfg_instance
import typing
from pydantic import BaseModel, AfterValidator
import json

cfg = cfg_instance(cfg_name="config")

@@ -21,7 +24,14 @@ def __init__(self, task_data: dict):


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(cfg.api_params.max_retries))
def publish(self, content: str) -> str:
def publish(
self,
content: str,
system_prompt: typing.Optional[str] = None,
temperature: typing.Optional[float] = None,
top_p: typing.Optional[float] = None,
max_tokens: typing.Optional[int] = None
) -> str:
"""
Sync publishes a message to the OpenAI chat model and returns the response.
Args:
@@ -30,21 +40,31 @@ def publish(self, content: str) -> str:
str: The response content from the chat model if successful.
None: If an error occurs during the publishing process.
"""

# Add system prompt if provided
messages = [{"role": "system", "content": system_prompt}] if system_prompt else []

# Add user message
messages.append({"role": "user", "content": content})
chat_response = self.client.chat.completions.create(
model=self.modelName,

messages=[
{"role": "user", "content": content}
],
temperature=self.temperature,
top_p=self.top_p,
max_tokens=self.maxGenerationLength
messages=messages,
temperature=temperature if temperature is not None else self.temperature,
top_p=top_p if top_p is not None else self.top_p,
max_tokens=max_tokens if max_tokens is not None else self.maxGenerationLength
)
return chat_response.choices[0].message.content


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(cfg.api_params.max_retries))
async def async_publish(self, content: str) -> str:
async def async_publish(
self,
content: str,
system_prompt: typing.Optional[str] = None,
temperature: typing.Optional[float] = None,
top_p: typing.Optional[float] = None,
max_tokens: typing.Optional[int] = None
) -> str:
"""
Async publishes a message to the OpenAI chat model and returns the response.
Args:
@@ -53,11 +73,35 @@ async def async_publish(self, content: str) -> str:
str: The response content from the chat model if successful.
None: If an error occurs during the publishing process.
"""

# Add system prompt if provided
messages = [{"role": "system", "content": system_prompt}] if system_prompt else []

# Add user message
messages.append({"role": "user", "content": content})

chat_response = await self.async_client.chat.completions.create(
model=self.modelName,
messages=[{"role": "user", "content": content}],
temperature=self.temperature,
top_p=self.top_p,
max_tokens=self.maxGenerationLength
messages=messages,
temperature=temperature if temperature is not None else self.temperature,
top_p=top_p if top_p is not None else self.top_p,
max_tokens=max_tokens if max_tokens is not None else self.maxGenerationLength
)
return chat_response.choices[0].message.content


@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(cfg.api_params.max_retries))
async def generate_title(self, text : str) -> str:
messages = [
{"role": "system", "content": "Please generate a short title for the following text.\n\nBe VERY SUCCINCT. No more than 6 words."},
{"role": "user", "content": text},
]


response = await self.async_client.chat.completions.create(
model=self.modelName,
messages=messages,
max_tokens=20,
temperature=0.5,
)
return chat_response.choices[0].message.content
return response.choices[0].message.content.strip()
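Review note: a short sketch of how the new per-call overrides are intended to be used; `adapter` and the input strings are placeholders, and arguments left out still fall back to the values configured from `task_data`.

```python
# Sketch of the new per-call overrides on async_publish; "adapter" stands in for
# an already-configured OpenAIAdapter instance.
import asyncio

async def demo(adapter):
    # Title generation as DocGenerator does it: a dedicated system prompt plus
    # tighter sampling and length limits for this single call.
    title = await adapter.async_publish(
        "First few sentences of a paragraph...",
        system_prompt="Generate a short title. No more than 6 words.",
        temperature=0.5,
        max_tokens=20,
    )
    # A plain call keeps the pre-existing behaviour: adapter-level defaults apply.
    summary = await adapter.async_publish("Summarize this meeting transcript...")
    return title, summary

# asyncio.run(demo(adapter))  # run with a configured adapter instance
```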
3 changes: 2 additions & 1 deletion app/http_server/ingress.py
@@ -76,7 +76,8 @@ async def generate(file: UploadFile = None, flavor: str = Form(...), temperature

task_data = {
"backend": service['backend'],
"type": service['name'],
"name": service['name'],
"type": service['type'],
"backendParams": backend_params,
"fields": service['fields'],
"content": content
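Review note: with `name` and `type` now passed as separate keys, a backend task payload looks roughly like the sketch below. All values are placeholders, only the keys visible in this PR are assumed, and the nested `document` block assumes the service-level config is forwarded into `backendParams` the way `DocGenerator` reads it.

```python
# Illustrative task_data payload under the new name/type split. Values are
# placeholders; only keys shown in this PR are assumed to exist.
task_data = {
    "task_id": "1234-abcd",        # hypothetical id
    "backend": "llm",              # service['backend'], value hypothetical
    "name": "summarize-fr",        # used to locate prompts/<name>.txt
    "type": "document",            # selects the DocGenerator post-processing
    "fields": 2,
    "content": "placeholder transcript text",
    "backendParams": {
        "reduceSummary": False,
        "consolidateSummary": False,
        "reduce_prompt": None,
        "document": {
            "template_path": "./templates/template.md",
            "max_sentences_title_generation": 6,
            "doc_title_prompt": "./prompts/doc_title_prompt.txt",
            "paragraph_title_prompt": "./prompts/paragraph_title_prompt.txt",
        },
    },
}
```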