From 2601ac304b8be38d1949811ffaba5d4ffa1875f8 Mon Sep 17 00:00:00 2001 From: Sivakumar Sriumapathy <95053700+SSivakumar12@users.noreply.github.com> Date: Mon, 14 Oct 2024 17:41:40 +0000 Subject: [PATCH 1/5] Fixed Issue: #1977 --- bertopic/representation/_cohere.py | 7 +++++++ bertopic/representation/_langchain.py | 6 ++++++ bertopic/representation/_llamacpp.py | 6 ++++++ bertopic/representation/_openai.py | 7 +++++++ bertopic/representation/_textgeneration.py | 6 ++++++ bertopic/representation/_utils.py | 3 ++- 6 files changed, 34 insertions(+), 1 deletion(-) diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py index 8ca31c8f..f45024ae 100644 --- a/bertopic/representation/_cohere.py +++ b/bertopic/representation/_cohere.py @@ -126,6 +126,13 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) + def extract_topics( self, topic_model, diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index df5c4839..264d1b20 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -148,6 +148,12 @@ def __init__( self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) def extract_topics( self, diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py index 83b18952..321b13fd 100644 --- a/bertopic/representation/_llamacpp.py +++ b/bertopic/representation/_llamacpp.py @@ -118,6 +118,12 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) def extract_topics( self, diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index 8fd25a1b..95a7b991 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -180,6 +180,13 @@ def __init__( if not self.generator_kwargs.get("stop") and not chat: self.generator_kwargs["stop"] = "\n" + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) + def extract_topics( self, topic_model, diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py index b028e575..9205f9d6 100644 --- a/bertopic/representation/_textgeneration.py +++ b/bertopic/representation/_textgeneration.py @@ -114,6 +114,12 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] + if self.tokenizer is None and self.doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) def extract_topics( self, diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index 2a99fd1f..4a524045 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -1,8 +1,9 @@ import random import time +from typing import Union -def truncate_document(topic_model, doc_length, tokenizer, document: str): +def truncate_document(topic_model, doc_length: Union[int, None], tokenizer: Union[str, callable], document: str) -> str: """Truncate a document to a certain length. If you want to add a custom tokenizer, then it will need to have a `decode` and From ede4d99126c20e5824d7b37883dafacf271e2d14 Mon Sep 17 00:00:00 2001 From: D-Sivakumar Sriumapathy Date: Mon, 18 Nov 2024 19:41:59 +0000 Subject: [PATCH 2/5] standardise errors into function for consistency --- bertopic/representation/_cohere.py | 9 ++------- bertopic/representation/_langchain.py | 10 +++------- bertopic/representation/_llamacpp.py | 9 ++------- bertopic/representation/_openai.py | 8 ++------ bertopic/representation/_textgeneration.py | 9 ++------- bertopic/representation/_utils.py | 9 +++++++++ 6 files changed, 20 insertions(+), 34 deletions(-) diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py index f45024ae..a0f84eb6 100644 --- a/bertopic/representation/_cohere.py +++ b/bertopic/representation/_cohere.py @@ -4,7 +4,7 @@ from scipy.sparse import csr_matrix from typing import Mapping, List, Tuple, Union, Callable from bertopic.representation._base import BaseRepresentation -from bertopic.representation._utils import truncate_document +from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters DEFAULT_PROMPT = """ @@ -126,12 +126,7 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] - if self.tokenizer is None and self.doc_length is not None: - raise ValueError( - "Please select from one of the valid options for the `tokenizer` parameter: \n" - "{'char', 'whitespace', 'vectorizer'} \n" - "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" - ) + _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( self, diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 264d1b20..3e217f4e 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -4,7 +4,7 @@ from typing import Callable, Mapping, List, Tuple, Union from bertopic.representation._base import BaseRepresentation -from bertopic.representation._utils import truncate_document +from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters DEFAULT_PROMPT = "What are these documents about? Please give a single label." @@ -148,12 +148,8 @@ def __init__( self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer - if self.tokenizer is None and self.doc_length is not None: - raise ValueError( - "Please select from one of the valid options for the `tokenizer` parameter: \n" - "{'char', 'whitespace', 'vectorizer'} \n" - "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" - ) + _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) + def extract_topics( self, diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py index 321b13fd..52f53071 100644 --- a/bertopic/representation/_llamacpp.py +++ b/bertopic/representation/_llamacpp.py @@ -4,7 +4,7 @@ from llama_cpp import Llama from typing import Mapping, List, Tuple, Any, Union, Callable from bertopic.representation._base import BaseRepresentation -from bertopic.representation._utils import truncate_document +from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters DEFAULT_PROMPT = """ @@ -118,12 +118,7 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] - if self.tokenizer is None and self.doc_length is not None: - raise ValueError( - "Please select from one of the valid options for the `tokenizer` parameter: \n" - "{'char', 'whitespace', 'vectorizer'} \n" - "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" - ) + _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( self, diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index 95a7b991..12880158 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -8,6 +8,7 @@ from bertopic.representation._utils import ( retry_with_exponential_backoff, truncate_document, + validate_truncate_document_parameters ) @@ -180,12 +181,7 @@ def __init__( if not self.generator_kwargs.get("stop") and not chat: self.generator_kwargs["stop"] = "\n" - if self.tokenizer is None and self.doc_length is not None: - raise ValueError( - "Please select from one of the valid options for the `tokenizer` parameter: \n" - "{'char', 'whitespace', 'vectorizer'} \n" - "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" - ) + _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( self, diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py index 9205f9d6..7809dc9b 100644 --- a/bertopic/representation/_textgeneration.py +++ b/bertopic/representation/_textgeneration.py @@ -5,7 +5,7 @@ from transformers.pipelines.base import Pipeline from typing import Mapping, List, Tuple, Any, Union, Callable from bertopic.representation._base import BaseRepresentation -from bertopic.representation._utils import truncate_document +from bertopic.representation._utils import truncate_document, validate_truncate_document_parameters DEFAULT_PROMPT = """ @@ -114,12 +114,7 @@ def __init__( self.tokenizer = tokenizer self.prompts_ = [] - if self.tokenizer is None and self.doc_length is not None: - raise ValueError( - "Please select from one of the valid options for the `tokenizer` parameter: \n" - "{'char', 'whitespace', 'vectorizer'} \n" - "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" - ) + _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( self, diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index 4a524045..abc21018 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -58,6 +58,15 @@ def decode(self, doc_chunks): return truncated_document return document +def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None, ValueError]: + """validates parameters that are used in the function `truncate_document`""" + if tokenizer is None and doc_length is not None: + raise ValueError( + "Please select from one of the valid options for the `tokenizer` parameter: \n" + "{'char', 'whitespace', 'vectorizer'} \n" + "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" + ) + def retry_with_exponential_backoff( func, From 7796fbc56fc3fb4bb30061f27e8f809bd6ab2519 Mon Sep 17 00:00:00 2001 From: D-Sivakumar Sriumapathy Date: Thu, 21 Nov 2024 14:18:03 +0000 Subject: [PATCH 3/5] add additional edgecase and removal of underscore for consistency --- bertopic/representation/_cohere.py | 3 ++- bertopic/representation/_langchain.py | 2 +- bertopic/representation/_llamacpp.py | 2 +- bertopic/representation/_openai.py | 3 ++- bertopic/representation/_textgeneration.py | 2 +- bertopic/representation/_utils.py | 6 ++++++ 6 files changed, 13 insertions(+), 5 deletions(-) diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py index a0f84eb6..b38c1dca 100644 --- a/bertopic/representation/_cohere.py +++ b/bertopic/representation/_cohere.py @@ -124,9 +124,10 @@ def __init__( self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer + validate_truncate_document_parameters(self.tokenizer, self.doc_length) + self.prompts_ = [] - _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( self, diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 3e217f4e..93f8fbf7 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -148,7 +148,7 @@ def __init__( self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer - _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) + validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py index 52f53071..3fd3541b 100644 --- a/bertopic/representation/_llamacpp.py +++ b/bertopic/representation/_llamacpp.py @@ -116,9 +116,9 @@ def __init__( self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer + validate_truncate_document_parameters(self.tokenizer, self.doc_length) self.prompts_ = [] - _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( self, diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index 12880158..2fa184b4 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -170,6 +170,8 @@ def __init__( self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer + validate_truncate_document_parameters(self.tokenizer, self.doc_length) + self.prompts_ = [] self.generator_kwargs = generator_kwargs @@ -181,7 +183,6 @@ def __init__( if not self.generator_kwargs.get("stop") and not chat: self.generator_kwargs["stop"] = "\n" - _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( self, diff --git a/bertopic/representation/_textgeneration.py b/bertopic/representation/_textgeneration.py index 7809dc9b..ada27d38 100644 --- a/bertopic/representation/_textgeneration.py +++ b/bertopic/representation/_textgeneration.py @@ -112,9 +112,9 @@ def __init__( self.diversity = diversity self.doc_length = doc_length self.tokenizer = tokenizer + validate_truncate_document_parameters(self.tokenizer, self.doc_length) self.prompts_ = [] - _ = validate_truncate_document_parameters(self.tokenizer, self.doc_length) def extract_topics( self, diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index abc21018..e01b134b 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -66,6 +66,12 @@ def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None, "{'char', 'whitespace', 'vectorizer'} \n" "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" ) + elif tokenizer is not None and doc_length is None: + raise ValueError( + "If `tokenizer` is provided, `doc_length` of type int must be provided as well." + ) + else: + pass def retry_with_exponential_backoff( From 19bab3d22eac83cbaf7329dd828a7150d21686be Mon Sep 17 00:00:00 2001 From: D-Sivakumar Sriumapathy Date: Sun, 1 Dec 2024 20:40:52 +0000 Subject: [PATCH 4/5] remove redundant else clause --- bertopic/representation/_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index e01b134b..c2a33107 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -70,8 +70,6 @@ def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None, raise ValueError( "If `tokenizer` is provided, `doc_length` of type int must be provided as well." ) - else: - pass def retry_with_exponential_backoff( From aa03a2504bb79d965646a3b080c940bb23454fff Mon Sep 17 00:00:00 2001 From: D-Sivakumar Sriumapathy Date: Tue, 3 Dec 2024 19:53:06 +0000 Subject: [PATCH 5/5] fixing linting errors --- bertopic/representation/_cohere.py | 1 - bertopic/representation/_langchain.py | 1 - bertopic/representation/_openai.py | 3 +-- bertopic/representation/_utils.py | 7 +++---- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py index b38c1dca..a0c74434 100644 --- a/bertopic/representation/_cohere.py +++ b/bertopic/representation/_cohere.py @@ -128,7 +128,6 @@ def __init__( self.prompts_ = [] - def extract_topics( self, topic_model, diff --git a/bertopic/representation/_langchain.py b/bertopic/representation/_langchain.py index 93f8fbf7..e7588df4 100644 --- a/bertopic/representation/_langchain.py +++ b/bertopic/representation/_langchain.py @@ -150,7 +150,6 @@ def __init__( self.tokenizer = tokenizer validate_truncate_document_parameters(self.tokenizer, self.doc_length) - def extract_topics( self, topic_model, diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index 2fa184b4..e05a9c66 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -8,7 +8,7 @@ from bertopic.representation._utils import ( retry_with_exponential_backoff, truncate_document, - validate_truncate_document_parameters + validate_truncate_document_parameters, ) @@ -183,7 +183,6 @@ def __init__( if not self.generator_kwargs.get("stop") and not chat: self.generator_kwargs["stop"] = "\n" - def extract_topics( self, topic_model, diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index c2a33107..255c8fbe 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -58,8 +58,9 @@ def decode(self, doc_chunks): return truncated_document return document + def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None, ValueError]: - """validates parameters that are used in the function `truncate_document`""" + """Validates parameters that are used in the function `truncate_document`.""" if tokenizer is None and doc_length is not None: raise ValueError( "Please select from one of the valid options for the `tokenizer` parameter: \n" @@ -67,9 +68,7 @@ def validate_truncate_document_parameters(tokenizer, doc_length) -> Union[None, "If `tokenizer` is of type callable ensure it has methods to encode and decode a document \n" ) elif tokenizer is not None and doc_length is None: - raise ValueError( - "If `tokenizer` is provided, `doc_length` of type int must be provided as well." - ) + raise ValueError("If `tokenizer` is provided, `doc_length` of type int must be provided as well.") def retry_with_exponential_backoff(