
Commit

migrated to llamacpp.create_chat_completion
Leo-LiHao committed Dec 18, 2024
1 parent 8cf724f commit d3a3ca9
Showing 2 changed files with 52 additions and 31 deletions.
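
For orientation, here is a minimal, self-contained sketch of the two llama-cpp-python calls this commit switches between: the old raw-completion call (`Llama.__call__`) and the new `create_chat_completion` call. The model path, prompts, and `max_tokens` value below are illustrative placeholders, not values taken from this commit.

from llama_cpp import Llama

# "model.gguf" is a placeholder; any local GGUF model works here.
llm = Llama(model_path="model.gguf", n_gpu_layers=-1, chat_format="llama-2")

# Old approach: raw text completion. All instructions live in a single prompt
# string and the generated text comes back under choices[0]["text"].
out = llm("Q: Give a short label for a topic about shipping delays.\nA:", max_tokens=32, stop=["Q:"])
print(out["choices"][0]["text"].strip())

# New approach: chat completion. The system prompt travels separately, the
# model's chat template handles the formatting, and the answer comes back
# under choices[0]["message"]["content"].
chat = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are an assistant that extracts high-level topics from texts."},
        {"role": "user", "content": "Give a short label for a topic about shipping delays."},
    ],
    max_tokens=32,
)
print(chat["choices"][0]["message"]["content"].strip())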
79 changes: 50 additions & 29 deletions bertopic/representation/_llamacpp.py
@@ -8,13 +8,34 @@


DEFAULT_PROMPT = """
Q: I have a topic that contains the following documents:
This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title.
Example 1:
Sample texts from this topic:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
Keywords: meat beef eat eating emissions steak food health processed chicken
Topic name: Environmental impacts of eating meat
Example 2:
Sample texts from this topic:
- I have ordered the product weeks ago but it still has not arrived!
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
- I got a message stating that I received the monitor but that is not true!
- It took a month longer to deliver than was advised...
Keywords: deliver weeks product shipping long delivery received arrived arrive week
Topic name: Shipping and delivery issues
---
Extract the topic name from the following documents:
Sample texts from this topic:
[DOCUMENTS]
Keywords: [KEYWORDS]
Provide the extracted topic name directly without any explanation."""

The topic is described by the following keywords: '[KEYWORDS]'.
Based on the above information, can you give a short label of the topic?
A: """
DEFAULT_SYSTEM_PROMPT = "You are an assistant that identifies and extracts high-level topics from texts."


class LlamaCPP(BaseRepresentation):
@@ -93,14 +114,15 @@ def __init__(
self,
model: Union[str, Llama],
prompt: str = None,
system_prompt: str = None,
pipeline_kwargs: Mapping[str, Any] = {},
nr_docs: int = 4,
diversity: float = None,
doc_length: int = None,
doc_length: int = 100,
tokenizer: Union[str, Callable] = None,
):
if isinstance(model, str):
self.model = Llama(model_path=model, n_gpu_layers=-1, stop="Q:")
self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="llama-2")
elif isinstance(model, Llama):
self.model = model
else:
@@ -110,6 +132,7 @@ def __init__(
"local LLM or a ` llama_cpp.Llama` object."
)
self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
self.default_prompt_ = DEFAULT_PROMPT
self.pipeline_kwargs = pipeline_kwargs
self.nr_docs = nr_docs
@@ -150,33 +173,31 @@ def extract_topics(
self.prompts_.append(prompt)

# Extract result from generator and use that as label
topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description]

if len(topic_description) < 10:
topic_description += [("", 0) for _ in range(10 - len(topic_description))]

updated_topics[topic] = topic_description
# topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
topic_description = self.model.create_chat_completion(
messages=[
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": prompt},
], **self.pipeline_kwargs
)
label = topic_description["choices"][0]["message"]["content"].strip().replace("Topic name: ", "")
updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]

return updated_topics

def _create_prompt(self, docs, topic, topics):
keywords = ", ".join(list(zip(*topics[topic]))[0])

# Use the default prompt and replace keywords
if self.prompt == DEFAULT_PROMPT:
prompt = self.prompt.replace("[KEYWORDS]", keywords)

# Use a prompt that leverages either keywords or documents in
# a custom location
else:
prompt = self.prompt
if "[KEYWORDS]" in prompt:
prompt = prompt.replace("[KEYWORDS]", keywords)
if "[DOCUMENTS]" in prompt:
to_replace = ""
for doc in docs:
to_replace += f"- {doc}\n"
prompt = prompt.replace("[DOCUMENTS]", to_replace)
prompt = self.prompt
if "[KEYWORDS]" in prompt:
prompt = prompt.replace("[KEYWORDS]", keywords)
if "[DOCUMENTS]" in prompt:
to_replace = ""
for doc in docs:
to_replace += f"- {doc}\n"
prompt = prompt.replace("[DOCUMENTS]", to_replace)

return prompt
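
A hedged sketch of how the updated representation might be wired into BERTopic after this change; the GGUF path and the keyword arguments shown are illustrative assumptions rather than values prescribed by this commit.

from llama_cpp import Llama
from bertopic import BERTopic
from bertopic.representation import LlamaCPP

# Placeholder model path; passing a Llama object keeps full control over loading.
llm = Llama(model_path="model.gguf", n_gpu_layers=-1, chat_format="llama-2")

# After this change, a system prompt can be supplied alongside the user prompt template.
representation_model = LlamaCPP(
    llm,
    system_prompt="You are an assistant that extracts high-level topics from texts.",
    nr_docs=4,
    doc_length=100,
    tokenizer="char",
)

topic_model = BERTopic(representation_model=representation_model)
# `docs` is assumed to be a list of strings prepared elsewhere.
# topics, probs = topic_model.fit_transform(docs)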
4 changes: 2 additions & 2 deletions bertopic/representation/_utils.py
@@ -44,8 +44,8 @@ def decode(self, doc_chunks):
truncated_document: A truncated document
"""
if doc_length is not None:
if tokenizer == "char":
truncated_document = document[:doc_length]
if tokenizer == "char" or tokenizer is None:
truncated_document = document[:doc_length] + " (...)"
elif tokenizer == "whitespace":
truncated_document = " ".join(document.split()[:doc_length])
elif tokenizer == "vectorizer":
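
The `_utils.py` change means character truncation now also applies when no tokenizer is configured, and it appends an explicit truncation marker. A tiny standalone sketch of that behavior, using a hypothetical helper name:

from typing import Optional

def truncate_by_chars(document: str, doc_length: Optional[int] = 100) -> str:
    # Hypothetical helper mirroring the updated branch: with no tokenizer set,
    # the document is cut to doc_length characters and " (...)" marks the cut.
    if doc_length is not None:
        return document[:doc_length] + " (...)"
    return document

print(truncate_by_chars("I ordered the product weeks ago but it still has not arrived and support is not responding."))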
