From 2601ac304b8be38d1949811ffaba5d4ffa1875f8 Mon Sep 17 00:00:00 2001
From: Sivakumar Sriumapathy <95053700+SSivakumar12@users.noreply.github.com>
Date: Mon, 14 Oct 2024 17:41:40 +0000
Subject: [PATCH 1/5] Validate tokenizer when doc_length is set (fixes #1977)

---
 bertopic/representation/_cohere.py         | 7 +++++++
 bertopic/representation/_langchain.py      | 6 ++++++
 bertopic/representation/_llamacpp.py       | 6 ++++++
 bertopic/representation/_openai.py         | 7 +++++++
 bertopic/representation/_textgeneration.py | 6 ++++++
 bertopic/representation/_utils.py          | 3 ++-
 6 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index 8ca31c8f..f45024ae 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -126,6 +126,13 @@ def __init__(
         self.tokenizer = tokenizer
         self.prompts_ = []
 
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
+
     def extract_topics(
         self,
         topic_model,
diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py
index df5c4839..264d1b20 100644
--- a/bertopic/representation/_langchain.py
+++ b/bertopic/representation/_langchain.py
@@ -148,6 +148,12 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
index 83b18952..321b13fd 100644
--- a/bertopic/representation/_llamacpp.py
+++ b/bertopic/representation/_llamacpp.py
@@ -118,6 +118,12 @@ def __init__(
         self.tokenizer = tokenizer
 
         self.prompts_ = []
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index 8fd25a1b..95a7b991 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -180,6 +180,13 @@ def __init__(
         if not self.generator_kwargs.get("stop") and not chat:
             self.generator_kwargs["stop"] = "\n"
 
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
+
     def extract_topics(
         self,
         topic_model,
diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py
index b028e575..9205f9d6 100644
--- a/bertopic/representation/_textgeneration.py
+++ b/bertopic/representation/_textgeneration.py
@@ -114,6 +114,12 @@ def __init__(
         self.tokenizer = tokenizer
 
         self.prompts_ = []
+        if self.tokenizer is None and self.doc_length is not None:
+            raise ValueError(
+                "Please select from one of the valid options for the `tokenizer` parameter: \n"
+                "{'char', 'whitespace', 'vectorizer'} \n"
+                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+            )
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
index 2a99fd1f..4a524045 100644
--- a/bertopic/representation/_utils.py
+++ b/bertopic/representation/_utils.py
@@ -1,8 +1,9 @@
 import random
 import time
+from typing import Union
 
 
-def truncate_document(topic_model, doc_length, tokenizer, document: str):
+def truncate_document(topic_model, doc_length: Union[int, None], tokenizer: Union[str, callable], document: str) -> str:
     """Truncate a document to a certain length.
 
     If you want to add a custom tokenizer, then it will need to have a `decode` and

From ede4d99126c20e5824d7b37883dafacf271e2d14 Mon Sep 17 00:00:00 2001
From: D-Sivakumar Sriumapathy <DEV\d-sriumas@a-8zsdl8o4p0g2.dev.dacc.phz>
Date: Mon, 18 Nov 2024 19:41:59 +0000
Subject: [PATCH 2/5] move duplicated validation into a shared function for consistency

---
 bertopic/representation/_cohere.py         |  9 ++-------
 bertopic/representation/_langchain.py      | 10 +++-------
 bertopic/representation/_llamacpp.py       |  9 ++-------
 bertopic/representation/_openai.py         |  8 ++------
 bertopic/representation/_textgeneration.py |  9 ++-------
 bertopic/representation/_utils.py          |  9 +++++++++
 6 files changed, 20 insertions(+), 34 deletions(-)

diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index f45024ae..a0f84eb6 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -4,7 +4,7 @@
 from scipy.sparse import csr_matrix
 from typing import Mapping, List, Tuple, Union, Callable
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import truncate_document
+from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters
 
 
 DEFAULT_PROMPT = """
@@ -126,12 +126,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.prompts_ = []
 
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py
index 264d1b20..3e217f4e 100644
--- a/bertopic/representation/_langchain.py
+++ b/bertopic/representation/_langchain.py
@@ -4,7 +4,7 @@
 from typing import Callable, Mapping, List, Tuple, Union
 
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import truncate_document
+from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters
 
 DEFAULT_PROMPT = "What are these documents about? Please give a single label."
 
@@ -148,12 +148,8 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
+
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
index 321b13fd..52f53071 100644
--- a/bertopic/representation/_llamacpp.py
+++ b/bertopic/representation/_llamacpp.py
@@ -4,7 +4,7 @@
 from llama_cpp import Llama
 from typing import Mapping, List, Tuple, Any, Union, Callable
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import truncate_document
+from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters
 
 
 DEFAULT_PROMPT = """
@@ -118,12 +118,7 @@ def __init__(
         self.tokenizer = tokenizer
 
         self.prompts_ = []
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index 95a7b991..12880158 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -8,6 +8,7 @@
 from bertopic.representation._utils import (
     retry_with_exponential_backoff,
     truncate_document,
+    validate_truncate_document_parameters
 )
 
 
@@ -180,12 +181,7 @@ def __init__(
         if not self.generator_kwargs.get("stop") and not chat:
             self.generator_kwargs["stop"] = "\n"
 
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py
index 9205f9d6..7809dc9b 100644
--- a/bertopic/representation/_textgeneration.py
+++ b/bertopic/representation/_textgeneration.py
@@ -5,7 +5,7 @@
 from transformers.pipelines.base import Pipeline
 from typing import Mapping, List, Tuple, Any, Union, Callable
 from bertopic.representation._base import BaseRepresentation
-from bertopic.representation._utils import truncate_document
+from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters
 
 
 DEFAULT_PROMPT = """
@@ -114,12 +114,7 @@ def __init__(
         self.tokenizer = tokenizer
 
         self.prompts_ = []
-        if self.tokenizer is None and self.doc_length is not None:
-            raise ValueError(
-                "Please select from one of the valid options for the `tokenizer` parameter: \n"
-                "{'char', 'whitespace', 'vectorizer'} \n"
-                "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
-            )
+        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
index 4a524045..abc21018 100644
--- a/bertopic/representation/_utils.py
+++ b/bertopic/representation/_utils.py
@@ -58,6 +58,15 @@ def decode(self, doc_chunks):
         return truncated_document
     return document
 
+def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None, ValueError]:
+    """validates parameters that are used in the function `truncate_document`"""
+    if tokenizer is None and doc_length is not None:
+        raise ValueError(
+            "Please select from one of the valid options for the `tokenizer` parameter: \n"
+            "{'char', 'whitespace', 'vectorizer'} \n"
+            "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
+        )
+
 
 def retry_with_exponential_backoff(
     func,

From 7796fbc56fc3fb4bb30061f27e8f809bd6ab2519 Mon Sep 17 00:00:00 2001
From: D-Sivakumar Sriumapathy <DEV\d-sriumas@a-8zsdl8o4p0g2.dev.dacc.phz>
Date: Thu, 21 Nov 2024 14:18:03 +0000
Subject: [PATCH 3/5] add additional edge case and remove `_ =` assignment
 for consistency

---
 bertopic/representation/_cohere.py         | 3 ++-
 bertopic/representation/_langchain.py      | 2 +-
 bertopic/representation/_llamacpp.py       | 2 +-
 bertopic/representation/_openai.py         | 3 ++-
 bertopic/representation/_textgeneration.py | 2 +-
 bertopic/representation/_utils.py          | 6 ++++++
 6 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index a0f84eb6..b38c1dca 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -124,9 +124,10 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
+        validate_truncate_document_parameters(self.tokenizer, self.doc_length)
+
         self.prompts_ = []
 
-        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py
index 3e217f4e..93f8fbf7 100644
--- a/bertopic/representation/_langchain.py
+++ b/bertopic/representation/_langchain.py
@@ -148,7 +148,7 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
-        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
+        validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
 
     def extract_topics(
diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
index 52f53071..3fd3541b 100644
--- a/bertopic/representation/_llamacpp.py
+++ b/bertopic/representation/_llamacpp.py
@@ -116,9 +116,9 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
+        validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
         self.prompts_ = []
-        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index 12880158..2fa184b4 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -170,6 +170,8 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
+        validate_truncate_document_parameters(self.tokenizer, self.doc_length)
+
         self.prompts_ = []
 
         self.generator_kwargs = generator_kwargs
@@ -181,7 +183,6 @@ def __init__(
         if not self.generator_kwargs.get("stop") and not chat:
             self.generator_kwargs["stop"] = "\n"
 
-        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py
index 7809dc9b..ada27d38 100644
--- a/bertopic/representation/_textgeneration.py
+++ b/bertopic/representation/_textgeneration.py
@@ -112,9 +112,9 @@ def __init__(
         self.diversity = diversity
         self.doc_length = doc_length
         self.tokenizer = tokenizer
+        validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
         self.prompts_ = []
-        _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
     def extract_topics(
         self,
diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
index abc21018..e01b134b 100644
--- a/bertopic/representation/_utils.py
+++ b/bertopic/representation/_utils.py
@@ -66,6 +66,12 @@ def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None,
             "{'char', 'whitespace', 'vectorizer'} \n"
             "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
         )
+    elif tokenizer is not None and doc_length is None:
+        raise ValueError(
+            "If `tokenizer` is provided, `doc_length` of type int must be provided as well."
+        )
+    else:
+        pass
 
 
 def retry_with_exponential_backoff(

From 19bab3d22eac83cbaf7329dd828a7150d21686be Mon Sep 17 00:00:00 2001
From: D-Sivakumar Sriumapathy <DEV\d-sriumas@a-8zsdl8o4p0g2.dev.dacc.phz>
Date: Sun, 1 Dec 2024 20:40:52 +0000
Subject: [PATCH 4/5] remove redundant else clause

---
 bertopic/representation/_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
index e01b134b..c2a33107 100644
--- a/bertopic/representation/_utils.py
+++ b/bertopic/representation/_utils.py
@@ -70,8 +70,6 @@ def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None,
         raise ValueError(
             "If `tokenizer` is provided, `doc_length` of type int must be provided as well."
         )
-    else:
-        pass
 
 
 def retry_with_exponential_backoff(

From aa03a2504bb79d965646a3b080c940bb23454fff Mon Sep 17 00:00:00 2001
From: D-Sivakumar Sriumapathy <DEV\d-sriumas@a-8zsdl8o4p0g2.dev.dacc.phz>
Date: Tue, 3 Dec 2024 19:53:06 +0000
Subject: [PATCH 5/5] fixing linting errors

---
 bertopic/representation/_cohere.py    | 1 -
 bertopic/representation/_langchain.py | 1 -
 bertopic/representation/_openai.py    | 3 +--
 bertopic/representation/_utils.py     | 7 +++----
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index b38c1dca..a0c74434 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -128,7 +128,6 @@ def __init__(
 
         self.prompts_ = []
 
-
     def extract_topics(
         self,
         topic_model,
diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py
index 93f8fbf7..e7588df4 100644
--- a/bertopic/representation/_langchain.py
+++ b/bertopic/representation/_langchain.py
@@ -150,7 +150,6 @@ def __init__(
         self.tokenizer = tokenizer
         validate_truncate_document_parameters(self.tokenizer, self.doc_length)
 
-
     def extract_topics(
         self,
         topic_model,
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index 2fa184b4..e05a9c66 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -8,7 +8,7 @@
 from bertopic.representation._utils import (
     retry_with_exponential_backoff,
     truncate_document,
-    validate_truncate_document_parameters
+    validate_truncate_document_parameters,
 )
 
 
@@ -183,7 +183,6 @@ def __init__(
         if not self.generator_kwargs.get("stop") and not chat:
             self.generator_kwargs["stop"] = "\n"
 
-
     def extract_topics(
         self,
         topic_model,
diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
index c2a33107..255c8fbe 100644
--- a/bertopic/representation/_utils.py
+++ b/bertopic/representation/_utils.py
@@ -58,8 +58,9 @@ def decode(self, doc_chunks):
         return truncated_document
     return document
 
+
 def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None, ValueError]:
-    """validates parameters that are used in the function `truncate_document`"""
+    """Validates parameters that are used in the function `truncate_document`."""
     if tokenizer is None and doc_length is not None:
         raise ValueError(
             "Please select from one of the valid options for the `tokenizer` parameter: \n"
@@ -67,9 +68,7 @@ def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None,
             "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n"
         )
     elif tokenizer is not None and doc_length is None:
-        raise ValueError(
-            "If `tokenizer` is provided, `doc_length` of type int must be provided as well."
-        )
+        raise ValueError("If `tokenizer` is provided, `doc_length` of type int must be provided as well.")
 
 
 def retry_with_exponential_backoff(