
Commit

migrated to llamacpp.create_chat_completion
Leo-LiHao committed Dec 18, 2024
1 parent 8cf724f commit d3a3ca9
Showing 2 changed files with 52 additions and 31 deletions.
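
For orientation, here is a minimal, self-contained sketch of the two llama-cpp-python calls this commit switches between: the old raw-completion call (`Llama.__call__`) and the new `create_chat_completion` call. The model path, prompts, and `max_tokens` value below are illustrative placeholders, not values taken from this commit.

from llama_cpp import Llama

# "model.gguf" is a placeholder; any local GGUF model works here.
llm = Llama(model_path="model.gguf", n_gpu_layers=-1, chat_format="llama-2")

# Old approach: raw text completion. All instructions live in a single prompt
# string and the generated text comes back under choices[0]["text"].
out = llm("Q: Give a short label for a topic about shipping delays.\nA:", max_tokens=32, stop=["Q:"])
print(out["choices"][0]["text"].strip())

# New approach: chat completion. The system prompt travels separately, the
# model's chat template handles the formatting, and the answer comes back
# under choices[0]["message"]["content"].
chat = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are an assistant that extracts high-level topics from texts."},
        {"role": "user", "content": "Give a short label for a topic about shipping delays."},
    ],
    max_tokens=32,
)
print(chat["choices"][0]["message"]["content"].strip())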
79 changes: 50 additions & 29 deletions bertopic/representation/_llamacpp.py
@@ -8,13 +8,34 @@


DEFAULT_PROMPT = """
Q: I have a topic that contains the following documents:
This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title.
Example 1:
Sample texts from this topic:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
Keywords: meat beef eat eating emissions steak food health processed chicken
Topic name: Environmental impacts of eating meat
Example 2:
Sample texts from this topic:
- I have ordered the product weeks ago but it still has not arrived!
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
- I got a message stating that I received the monitor but that is not true!
- It took a month longer to deliver than was advised...
Keywords: deliver weeks product shipping long delivery received arrived arrive week
Topic name: Shipping and delivery issues
---
Extract the topic name from the following documents:
Sample texts from this topic:
[DOCUMENTS]
Keywords: [KEYWORDS]
Provide the extracted topic name directly without any explanation."""

The topic is described by the following keywords: '[KEYWORDS]'.
Based on the above information, can you give a short label of the topic?
A: """
DEFAULT_SYSTEM_PROMPT = "You are an assistant that identifies and extracts high-level topics from texts."


class LlamaCPP(BaseRepresentation):
@@ -93,14 +114,15 @@ def __init__(
self,
model: Union[str, Llama],
prompt: str = None,
system_prompt: str = None,
pipeline_kwargs: Mapping[str, Any] = {},
nr_docs: int = 4,
diversity: float = None,
doc_length: int = None,
doc_length: int = 100,
tokenizer: Union[str, Callable] = None,
):
if isinstance(model, str):
self.model = Llama(model_path=model, n_gpu_layers=-1, stop="Q:")
self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="llama-2")
elif isinstance(model, Llama):
self.model = model
else:
@@ -110,6 +132,7 @@ def __init__(
"local LLM or a ` llama_cpp.Llama` object."
)
self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
self.default_prompt_ = DEFAULT_PROMPT
self.pipeline_kwargs = pipeline_kwargs
self.nr_docs = nr_docs
@@ -150,33 +173,31 @@ def extract_topics(
self.prompts_.append(prompt)

# Extract result from generator and use that as label
topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description]

if len(topic_description) < 10:
topic_description += [("", 0) for _ in range(10 - len(topic_description))]

updated_topics[topic] = topic_description
# topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
topic_description = self.model.create_chat_completion(
messages=[
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": prompt},
], **self.pipeline_kwargs
)
label = topic_description["choices"][0]["message"]["content"].strip().replace("Topic name: ", "")
updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]

return updated_topics

def _create_prompt(self, docs, topic, topics):
keywords = ", ".join(list(zip(*topics[topic]))[0])

# Use the default prompt and replace keywords
if self.prompt == DEFAULT_PROMPT:
prompt = self.prompt.replace("[KEYWORDS]", keywords)

# Use a prompt that leverages either keywords or documents in
# a custom location
else:
prompt = self.prompt
if "[KEYWORDS]" in prompt:
prompt = prompt.replace("[KEYWORDS]", keywords)
if "[DOCUMENTS]" in prompt:
to_replace = ""
for doc in docs:
to_replace += f"- {doc}\n"
prompt = prompt.replace("[DOCUMENTS]", to_replace)
prompt = self.prompt
if "[KEYWORDS]" in prompt:
prompt = prompt.replace("[KEYWORDS]", keywords)
if "[DOCUMENTS]" in prompt:
to_replace = ""
for doc in docs:
to_replace += f"- {doc}\n"
prompt = prompt.replace("[DOCUMENTS]", to_replace)

return prompt
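
A hedged sketch of how the updated representation might be wired into BERTopic after this change; the GGUF path and the keyword arguments shown are illustrative assumptions rather than values prescribed by this commit.

from llama_cpp import Llama
from bertopic import BERTopic
from bertopic.representation import LlamaCPP

# Placeholder model path; passing a Llama object keeps full control over loading.
llm = Llama(model_path="model.gguf", n_gpu_layers=-1, chat_format="llama-2")

# After this change, a system prompt can be supplied alongside the user prompt template.
representation_model = LlamaCPP(
    llm,
    system_prompt="You are an assistant that extracts high-level topics from texts.",
    nr_docs=4,
    doc_length=100,
    tokenizer="char",
)

topic_model = BERTopic(representation_model=representation_model)
# `docs` is assumed to be a list of strings prepared elsewhere.
# topics, probs = topic_model.fit_transform(docs)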
4 changes: 2 additions & 2 deletions bertopic/representation/_utils.py
@@ -44,8 +44,8 @@ def decode(self, doc_chunks):
truncated_document: A truncated document
"""
if doc_length is not None:
if tokenizer == "char":
truncated_document = document[:doc_length]
if tokenizer == "char" or tokenizer is None:
truncated_document = document[:doc_length] + " (...)"
elif tokenizer == "whitespace":
truncated_document = " ".join(document.split()[:doc_length])
elif tokenizer == "vectorizer":
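
The `_utils.py` change means character truncation now also applies when no tokenizer is configured, and it appends an explicit truncation marker. A tiny standalone sketch of that behavior, using a hypothetical helper name:

from typing import Optional

def truncate_by_chars(document: str, doc_length: Optional[int] = 100) -> str:
    # Hypothetical helper mirroring the updated branch: with no tokenizer set,
    # the document is cut to doc_length characters and " (...)" marks the cut.
    if doc_length is not None:
        return document[:doc_length] + " (...)"
    return document

print(truncate_by_chars("I ordered the product weeks ago but it still has not arrived and support is not responding."))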
