Commit
chore: remove obsolete flake8 config and update line length (#2066)
afuetterer authored Jul 1, 2024
1 parent d1ffb2f commit 39bbfdb
Showing 52 changed files with 425 additions and 1,297 deletions.
2 changes: 0 additions & 2 deletions .flake8

This file was deleted.
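The deleted .flake8 presumably carried the old maximum line length; with it gone, the hunks below mostly rewrap statements that no longer exceed the new, longer limit (the exact limit is configured elsewhere and is not shown in this view). A minimal, hypothetical Python sketch of that rewrap pattern:

```python
# Hypothetical before/after illustration of the rewrap applied throughout this commit.
# Under the old, shorter limit a long call had to be split across lines:
def check_hub_old(has_hf_hub: bool) -> None:
    if not has_hf_hub:
        raise ValueError(
            "Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`"
        )

# With the longer limit the same statement fits on a single line:
def check_hub_new(has_hf_hub: bool) -> None:
    if not has_hf_hub:
        raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`")
```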

751 changes: 184 additions & 567 deletions bertopic/_bertopic.py

Large diffs are not rendered by default.

44 changes: 10 additions & 34 deletions bertopic/_save_utils.py
@@ -135,9 +135,7 @@ def push_to_hf_hub(
save_ctfidf: Whether to save c-TF-IDF information
"""
if not _has_hf_hub:
- raise ValueError(
- "Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`"
- )
+ raise ValueError("Make sure you have the huggingface hub installed via `pip install --upgrade huggingface_hub`")

# Create repo if it doesn't exist yet and infer complete repo_id
repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True)
@@ -156,9 +154,7 @@ def push_to_hf_hub(

# Add README if it does not exist
try:
- get_hf_file_metadata(
- hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)
- )
+ get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision))
except: # noqa: E722
if model_card:
readme_text = generate_readme(model, repo_id)
@@ -241,13 +237,9 @@ def load_files_from_hf(path)

# c-TF-IDF
try:
- ctfidf_config = load_cfg_from_json(
- hf_hub_download(path, CTFIDF_CFG_NAME, revision=None)
- )
+ ctfidf_config = load_cfg_from_json(hf_hub_download(path, CTFIDF_CFG_NAME, revision=None))
try:
- ctfidf_tensors = hf_hub_download(
- path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None
- )
+ ctfidf_tensors = hf_hub_download(path, CTFIDF_SAFE_WEIGHTS_NAME, revision=None)
ctfidf_tensors = load_safetensors(ctfidf_tensors)
except: # noqa: E722
ctfidf_tensors = hf_hub_download(path, CTFIDF_WEIGHTS_NAME, revision=None)
@@ -268,9 +260,7 @@ def load_files_from_hf(path)
topic_list = list(topics["topic_representations"].keys())
images = {}
for topic in topic_list:
- image = Image.open(
- hf_hub_download(path, f"images/{topic}.jpg", revision=None)
- )
+ image = Image.open(hf_hub_download(path, f"images/{topic}.jpg", revision=None))
images[int(topic)] = image

return topics, params, tensors, ctfidf_tensors, ctfidf_config, images
@@ -283,11 +273,7 @@ def generate_readme(model, repo_id: str):

# Get Statistics
model_name = repo_id.split("/")[-1]
- params = {
- param: value
- for param, value in model.get_params().items()
- if "model" not in param
- }
+ params = {param: value for param, value in model.get_params().items() if "model" not in param}
params = "\n".join([f"* {param}: {value}" for param, value in params.items()])
topics = sorted(list(set(model.topics_)))
nr_topics = str(len(set(model.topics_)))
@@ -298,23 +284,15 @@ def generate_readme(model, repo_id: str):
nr_documents = ""

# Topic information
- topic_keywords = [
- " - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics
- ]
+ topic_keywords = [" - ".join(list(zip(*model.get_topic(topic)))[0][:5]) for topic in topics]
topic_freq = [model.get_topic_freq(topic) for topic in topics]
- topic_labels = (
- model.custom_labels_
- if model.custom_labels_
- else [model.topic_labels_[topic] for topic in topics]
- )
+ topic_labels = model.custom_labels_ if model.custom_labels_ else [model.topic_labels_[topic] for topic in topics]
topics = [
f"| {topic} | {topic_keywords[index]} | {topic_freq[topic]} | {topic_labels[index]} | \n"
for index, topic in enumerate(topics)
]
topics = topic_table_head + "".join(topics)
- frameworks = "\n".join(
- [f"* {param}: {value}" for param, value in get_package_versions().items()]
- )
+ frameworks = "\n".join([f"* {param}: {value}" for param, value in get_package_versions().items()])

# Fill Statistics into model card
model_card = model_card.replace("{MODEL_NAME}", model_name)
@@ -330,9 +308,7 @@ def generate_readme(model, repo_id: str):
if not has_visual_aspect:
model_card = model_card.replace("{PIPELINE_TAG}", "text-classification")
else:
- model_card = model_card.replace(
- "pipeline_tag: {PIPELINE_TAG}\n", ""
- ) # TODO add proper tag for this instance
+ model_card = model_card.replace("pipeline_tag: {PIPELINE_TAG}\n", "") # TODO add proper tag for this instance

return model_card

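The helpers above (push_to_hf_hub, load_files_from_hf, generate_readme) back BERTopic's Hugging Face Hub integration. A hedged usage sketch, assuming a configured Hub token; the repository name is made up for illustration:

```python
# Minimal sketch, not taken from this commit: pushing and reloading a model via the
# Hub helpers in _save_utils.py. The repo id "my-user/bertopic-20news" is a placeholder.
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

docs = fetch_20newsgroups(subset="train", remove=("headers", "footers", "quotes"))["data"][:1000]
topic_model = BERTopic(verbose=True).fit(docs)

# Internally calls push_to_hf_hub() from _save_utils.py, which creates the repo,
# uploads the weights, and writes the README via generate_readme().
topic_model.push_to_hf_hub(repo_id="my-user/bertopic-20news", save_ctfidf=True)

# Loading back from the Hub goes through load_files_from_hf().
loaded_model = BERTopic.load("my-user/bertopic-20news")
```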
29 changes: 7 additions & 22 deletions bertopic/_utils.py
@@ -45,20 +45,14 @@ def check_documents_type(documents):
if not any([isinstance(doc, str) for doc in documents]):
raise TypeError("Make sure that the iterable only contains strings.")
else:
- raise TypeError(
- "Make sure that the documents variable is an iterable containing strings only."
- )
+ raise TypeError("Make sure that the documents variable is an iterable containing strings only.")


def check_embeddings_shape(embeddings, docs):
"""Check if the embeddings have the correct shape."""
if embeddings is not None:
- if not any(
- [isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]
- ):
- raise ValueError(
- "Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. "
- )
+ if not any([isinstance(embeddings, np.ndarray), isinstance(embeddings, csr_matrix)]):
+ raise ValueError("Make sure to input embeddings as a numpy array or scipy.sparse.csr.csr_matrix. ")
else:
if embeddings.shape[0] != len(docs):
raise ValueError(
@@ -137,16 +131,11 @@ def validate_distance_matrix(X, n_samples):
# check it has correct size
n = s[0]
if n != (n_samples * (n_samples - 1) / 2):
- raise ValueError(
- "The condensed distance matrix must have " "shape (n*(n-1)/2,)."
- )
+ raise ValueError("The condensed distance matrix must have " "shape (n*(n-1)/2,).")
elif len(s) == 2:
# check it has correct size
if (s[0] != n_samples) or (s[1] != n_samples):
- raise ValueError(
- "The distance matrix must be of shape "
- "(n, n) where n is the number of samples."
- )
+ raise ValueError("The distance matrix must be of shape " "(n, n) where n is the number of samples.")
# force zero diagonal and convert to condensed
np.fill_diagonal(X, 0)
X = squareform(X)
@@ -182,15 +171,11 @@ def get_unique_distances(dists: np.array, noise_max=1e-7) -> np.array:
for i in range(dists.shape[0] - 1):
if dists[i] == dists[i + 1]:
# returns the next unique distance or the current distance with the added noise
- next_unique_dist = next(
- (d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max
- )
+ next_unique_dist = next((d for d in dists[i + 1 :] if d != dists[i]), dists[i] + noise_max)

# the noise can never be large then the difference between the next unique distance and the current one
curr_max_noise = min(noise_max, next_unique_dist - dists_cp[i])
- dists_cp[i + 1] = np.random.uniform(
- low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise
- )
+ dists_cp[i + 1] = np.random.uniform(low=dists_cp[i] + curr_max_noise / 2, high=dists_cp[i] + curr_max_noise)
return dists_cp


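get_unique_distances, touched above, breaks ties in a sorted distance array by adding a sliver of noise so later hierarchy code sees strictly distinct values. A small illustrative sketch (the input values are made up):

```python
# Hedged illustration of the tie-breaking shown in the hunk above.
import numpy as np

from bertopic._utils import get_unique_distances

dists = np.array([0.1, 0.3, 0.3, 0.3, 0.7])  # sorted, with duplicates
unique_dists = get_unique_distances(dists, noise_max=1e-7)

# Duplicates are nudged upward by at most `noise_max`, staying below the next distinct value.
assert len(np.unique(unique_dists)) == len(unique_dists)
print(unique_dists)
```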
4 changes: 1 addition & 3 deletions bertopic/backend/_flair.py
@@ -67,9 +67,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
embeddings = []
for document in tqdm(documents, disable=not verbose):
try:
- sentence = (
- Sentence(document) if document else Sentence("an empty document")
- )
+ sentence = Sentence(document) if document else Sentence("an empty document")
self.embedding_model.embed(sentence)
except RuntimeError:
sentence = Sentence("an empty document")
4 changes: 1 addition & 3 deletions bertopic/backend/_gensim.py
@@ -48,9 +48,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
Document/words embeddings with shape (n, m) with `n` documents/words
that each have an embeddings size of `m`
"""
- vector_shape = self.embedding_model.get_vector(
- list(self.embedding_model.index_to_key)[0]
- ).shape[0]
+ vector_shape = self.embedding_model.get_vector(list(self.embedding_model.index_to_key)[0]).shape[0]
empty_vector = np.zeros(vector_shape)

# Extract word embeddings and pool to document-level
14 changes: 5 additions & 9 deletions bertopic/backend/_hftransformers.py
@@ -58,9 +58,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:

embeddings = []
for document, features in tqdm(
- zip(
- documents, self.embedding_model(dataset, truncation=True, padding=True)
- ),
+ zip(documents, self.embedding_model(dataset, truncation=True, padding=True)),
total=len(dataset),
disable=not verbose,
):
@@ -79,12 +77,10 @@ def _embed(self, document: str, features: np.ndarray) -> np.ndarray:
https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2#usage-huggingface-transformers
"""
token_embeddings = np.array(features)
- attention_mask = self.embedding_model.tokenizer(
- document, truncation=True, padding=True, return_tensors="np"
- )["attention_mask"]
- input_mask_expanded = np.broadcast_to(
- np.expand_dims(attention_mask, -1), token_embeddings.shape
- )
+ attention_mask = self.embedding_model.tokenizer(document, truncation=True, padding=True, return_tensors="np")[
+ "attention_mask"
+ ]
+ input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
sum_mask = np.clip(
input_mask_expanded.sum(1),
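The _embed hunk above implements masked mean pooling over token embeddings. A self-contained numeric sketch (made-up numbers) of the same computation:

```python
# Toy illustration of the masked mean pooling used in _embed() above:
# padding positions (attention mask == 0) are excluded from both numerator and denominator.
import numpy as np

token_embeddings = np.array([[[1.0, 2.0], [3.0, 4.0], [0.0, 0.0]]])  # (batch=1, tokens=3, dim=2)
attention_mask = np.array([[1, 1, 0]])                               # third token is padding

input_mask_expanded = np.broadcast_to(np.expand_dims(attention_mask, -1), token_embeddings.shape)
sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
sum_mask = np.clip(input_mask_expanded.sum(1), a_min=1e-9, a_max=None)

print(sum_embeddings / sum_mask)  # [[2. 3.]] -- average of the two unmasked tokens
```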
23 changes: 6 additions & 17 deletions bertopic/backend/_multimodal.py
@@ -84,9 +84,7 @@ def __init__(
except: # noqa: E722
self.tokenizer = None

- def embed(
- self, documents: List[str], images: List[str] = None, verbose: bool = False
- ) -> np.ndarray:
+ def embed(self, documents: List[str], images: List[str] = None, verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words or images into an n-dimensional
matrix of embeddings.
@@ -124,9 +122,7 @@ def embed(
elif image_embeddings is not None:
return image_embeddings

- def embed_documents(
- self, documents: List[str], verbose: bool = False
- ) -> np.ndarray:
+ def embed_documents(self, documents: List[str], verbose: bool = False) -> np.ndarray:
"""Embed a list of n documents/words into an n-dimensional
matrix of embeddings.
@@ -139,9 +135,7 @@ def embed_documents(
that each have an embeddings size of `m`
"""
truncated_docs = [self._truncate_document(doc) for doc in documents]
- embeddings = self.embedding_model.encode(
- truncated_docs, show_progress_bar=verbose
- )
+ embeddings = self.embedding_model.encode(truncated_docs, show_progress_bar=verbose)
return embeddings

def embed_words(self, words: List[str], verbose: bool = False) -> np.ndarray:
@@ -170,15 +164,12 @@ def embed_images(self, images, verbose):
end_index = (i * self.batch_size) + self.batch_size

images_to_embed = [
- Image.open(image) if isinstance(image, str) else image
- for image in images[start_index:end_index]
+ Image.open(image) if isinstance(image, str) else image for image in images[start_index:end_index]
]
if self.image_model is not None:
img_emb = self.image_model.encode(images_to_embed)
else:
- img_emb = self.embedding_model.encode(
- images_to_embed, show_progress_bar=False
- )
+ img_emb = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
embeddings.extend(img_emb.tolist())

# Close images
@@ -191,9 +182,7 @@ def embed_images(self, images, verbose):
if self.image_model is not None:
embeddings = self.image_model.encode(images_to_embed)
else:
- embeddings = self.embedding_model.encode(
- images_to_embed, show_progress_bar=False
- )
+ embeddings = self.embedding_model.encode(images_to_embed, show_progress_bar=False)
return embeddings

def _truncate_document(self, document):
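MultiModalBackend, whose embed* methods are reformatted above, encodes documents and images with a CLIP-style sentence-transformers model. A hedged usage sketch (the model name and image paths are illustrative placeholders):

```python
# Minimal sketch, assuming the standard CLIP checkpoint from sentence-transformers
# and two local image files that exist on disk (paths are placeholders).
from bertopic.backend import MultiModalBackend

embedder = MultiModalBackend("clip-ViT-B-32", batch_size=32)

doc_embeddings = embedder.embed_documents(["a photo of a cat", "a photo of a dog"])
image_embeddings = embedder.embed_images(["images/cat.jpg", "images/dog.jpg"], verbose=False)
```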
8 changes: 2 additions & 6 deletions bertopic/backend/_openai.py
@@ -70,9 +70,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
if self.batch_size is not None:
embeddings = []
for batch in tqdm(self._chunks(prepared_documents), disable=not verbose):
- response = self.client.embeddings.create(
- input=batch, **self.generator_kwargs
- )
+ response = self.client.embeddings.create(input=batch, **self.generator_kwargs)
embeddings.extend([r.embedding for r in response.data])

# Delay subsequent calls
@@ -81,9 +79,7 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:

# Extract embeddings all at once
else:
- response = self.client.embeddings.create(
- input=prepared_documents, **self.generator_kwargs
- )
+ response = self.client.embeddings.create(input=prepared_documents, **self.generator_kwargs)
embeddings = [r.embedding for r in response.data]
return np.array(embeddings)

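OpenAIBackend, whose batching loop is shown above, sends documents to the embeddings endpoint in chunks and can pause between calls. A hedged usage sketch (the model name, batch size, and delay are illustrative; an OpenAI API key is required):

```python
# Minimal sketch, not part of this commit: batched embedding through the OpenAI backend.
import openai
from bertopic.backend import OpenAIBackend

client = openai.OpenAI()  # reads OPENAI_API_KEY from the environment
embedder = OpenAIBackend(client, "text-embedding-3-small", batch_size=32, delay_in_seconds=1)

embeddings = embedder.embed(["first document", "second document"])
print(embeddings.shape)
```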
5 changes: 1 addition & 4 deletions bertopic/backend/_use.py
@@ -50,9 +50,6 @@ def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
that each have an embeddings size of `m`
"""
embeddings = np.array(
- [
- self.embedding_model([doc]).cpu().numpy()[0]
- for doc in tqdm(documents, disable=not verbose)
- ]
+ [self.embedding_model([doc]).cpu().numpy()[0] for doc in tqdm(documents, disable=not verbose)]
)
return embeddings
16 changes: 4 additions & 12 deletions bertopic/backend/_utils.py
@@ -68,9 +68,7 @@
]


- def select_backend(
- embedding_model, language: str = None, verbose: bool = False
- ) -> BaseEmbedder:
+ def select_backend(embedding_model, language: str = None, verbose: bool = False) -> BaseEmbedder:
"""Select an embedding model based on language or a specific provided model.
When selecting a language, we choose all-MiniLM-L6-v2 for English and
paraphrase-multilingual-MiniLM-L12-v2 for all other languages as it support 100+ languages.
@@ -115,9 +113,7 @@ def select_backend(
return USEBackend(embedding_model)

# Sentence Transformer embeddings
if "sentence_transformers" in str(type(embedding_model)) or isinstance(
embedding_model, str
):
if "sentence_transformers" in str(type(embedding_model)) or isinstance(embedding_model, str):
from ._sentencetransformers import SentenceTransformerBackend

return SentenceTransformerBackend(embedding_model)
@@ -134,13 +130,9 @@ def select_backend(
from ._sentencetransformers import SentenceTransformerBackend

if language.lower() in ["English", "english", "en"]:
- return SentenceTransformerBackend(
- "sentence-transformers/all-MiniLM-L6-v2"
- )
+ return SentenceTransformerBackend("sentence-transformers/all-MiniLM-L6-v2")
elif language.lower() in languages or language == "multilingual":
- return SentenceTransformerBackend(
- "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
- )
+ return SentenceTransformerBackend("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
else:
raise ValueError(
f"{language} is currently not supported. However, you can "
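select_backend, reformatted above, maps a language choice to a default sentence-transformers model when no explicit embedding model is given. A hedged sketch of that mapping, calling the private helper directly for illustration only:

```python
# Illustrative only: the defaults implied by the hunk above (models are downloaded on first use).
from bertopic.backend._utils import select_backend

backend_en = select_backend(None, language="english")          # -> all-MiniLM-L6-v2
backend_multi = select_backend(None, language="multilingual")  # -> paraphrase-multilingual-MiniLM-L12-v2
```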
4 changes: 1 addition & 3 deletions bertopic/cluster/_utils.py
@@ -25,9 +25,7 @@ def hdbscan_delegator(model, func: str, embeddings: np.ndarray = None):
if "cuml" in str_type_model and "hdbscan" in str_type_model:
from cuml.cluster import hdbscan as cuml_hdbscan

- predictions, probabilities = cuml_hdbscan.approximate_predict(
- model, embeddings
- )
+ predictions, probabilities = cuml_hdbscan.approximate_predict(model, embeddings)
return predictions, probabilities

predictions = model.predict(embeddings)
4 changes: 1 addition & 3 deletions bertopic/plotting/_approximate_distribution.py
@@ -86,9 +86,7 @@ def text_color(val):

def highligh_color(data, color="white"):
attr = "background-color: {}".format(color)
- return pd.DataFrame(
- np.where(data == 0, attr, ""), index=data.index, columns=data.columns
- )
+ return pd.DataFrame(np.where(data == 0, attr, ""), index=data.index, columns=data.columns)

if len(df) == 0:
return df