Skip to content

Commit

Permalink
added top_n_frequency_analysis, and rotating ticks on barplot, countplot
Browse files Browse the repository at this point in the history
  • Loading branch information
john-james-ai committed Jun 7, 2024
1 parent 9b2a3ad commit 73d0c42
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "studioai"
version = "0.2.21"
version = "0.2.22"
description = "Atelier for Artificial Intelligence and Data Science"
authors = [
"John James <[email protected]>",
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ per-file-ignores =

# Enables mccabe complexity checks
# see https://github.com/PyCQA/mccabe#plugin-for-flake8
max-complexity = 10
max-complexity = 15

exclude = .git,__pycache__,old,build,dist,.venv,.eggs,.tox
64 changes: 62 additions & 2 deletions studioai/analysis/explore/eda.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# URL : https://github.com/john-james-ai/studioai #
# ------------------------------------------------------------------------------------------------ #
# Created : Thursday August 10th 2023 08:29:08 pm #
# Modified : Wednesday May 22nd 2024 02:54:40 am #
# Modified : Friday June 7th 2024 02:16:21 pm #
# ------------------------------------------------------------------------------------------------ #
# License : MIT License #
# Copyright : (c) 2023 John James #
Expand Down Expand Up @@ -65,7 +65,7 @@ def plot(self) -> Visualizer: # pragma: no cover
return self._visualizer

@property
def stats(self) -> Visualizer: # pragma: no cover
def stats(self) -> Inference: # pragma: no cover
self._inference.data = self.df
return self._inference

Expand Down Expand Up @@ -256,6 +256,66 @@ def unique(self, columns: list = None) -> pd.DataFrame:
df = self.df.drop_duplicates().reset_index(drop=True)
return self._format(df=df)

# ------------------------------------------------------------------------------------------- #
def top_n_frequency_analysis(
    self, x: str, n: int, data: pd.DataFrame = None
) -> pd.DataFrame:
    """Return a frequency table for the top n values of a categorical variable.

    The table has one row per value for the first ``n`` rows of
    ``data[x].value_counts()``, followed by a ``"Rest of <x>"`` row that
    aggregates every remaining value (only when there are more than ``n``
    distinct values), and a final ``"Total"`` row.

    Args:
        x (str): Name of the column to count. A single column name is
            required; the value is also used as the label column's header.
        n (int): Number of rows to include in the top n.
        data (pd.DataFrame): Data to analyze. Optional. When None, the
            instance's ``df`` is used.

    Returns:
        pd.DataFrame: Columns ``x``, "Count", "Cumulative Count",
        "Proportion" and "Cumulative Proportion". Proportions are relative
        to the sum of all counts.
    """
    # A DataFrame has no unambiguous truth value, so `data or self.df`
    # raises ValueError whenever data is supplied; test for None explicitly.
    data = self.df if data is None else data

    # Frequency distribution of x, one row per distinct value.
    freq = data[x].value_counts().reset_index()
    freq.columns = [x, "Count"]

    # Cumulative counts and proportions over the full distribution.
    total_count = freq["Count"].sum()
    freq["Cumulative Count"] = freq["Count"].cumsum()
    freq["Proportion"] = freq["Count"] / total_count
    freq["Cumulative Proportion"] = freq["Proportion"].cumsum()

    frames = [freq.head(n).copy()]

    # Single row aggregating everything beyond the top n.
    if len(freq) > n:
        rest_count = freq.iloc[n:]["Count"].sum()
        rest_row = pd.DataFrame(
            {
                x: [f"Rest of {x}"],
                "Count": [rest_count],
                # Top-n cumulative count plus the rest is, by construction,
                # the grand total. Using it directly also avoids indexing
                # into an empty top-n frame (e.g. when n == 0).
                "Cumulative Count": [total_count],
                "Proportion": [rest_count / total_count],
                # The rest row closes out the distribution.
                "Cumulative Proportion": [1.0],
            }
        )
        frames.append(rest_row)

    # Total row
    total_row = pd.DataFrame(
        {
            x: ["Total"],
            "Count": [total_count],
            "Cumulative Count": [total_count],
            "Proportion": [1.0],
            "Cumulative Proportion": [1.0],
        }
    )
    frames.append(total_row)

    # Skip empty frames so an empty top-n slice does not influence the
    # dtypes of the concatenated result.
    return pd.concat(
        [frame for frame in frames if not frame.empty], ignore_index=True
    )

# ------------------------------------------------------------------------------------------- #
def frequency(
self,
Expand Down
45 changes: 42 additions & 3 deletions studioai/analysis/visualize/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# URL : https://github.com/john-james-ai/studioai #
# ------------------------------------------------------------------------------------------------ #
# Created : Saturday August 26th 2023 06:25:27 am #
# Modified : Thursday May 9th 2024 09:26:13 am #
# Modified : Friday June 7th 2024 02:18:54 pm #
# ------------------------------------------------------------------------------------------------ #
# License : MIT License #
# Copyright : (c) 2023 John James #
Expand Down Expand Up @@ -381,6 +381,8 @@ def countplot(
plot_counts: bool = False,
title: str = None,
figsize: bool = (12, 4),
rotate_xticks: int = None,
rotate_yticks: int = None,
ax: plt.Axes = None,
**kwargs,
) -> plt.Axes:
Expand All @@ -402,6 +404,8 @@ def countplot(
plot_counts (bool): If True, the bars are annotated with absolute and relative counts. Default = False
title (str): Title for the plot. Optional
figsize (tuple): Size of figure in inches. Ignored if ax is provided.
rotate_xticks (int): The degrees to rotate the x_ticks. None means no rotation.
rotate_yticks (int): The degrees to rotate the y_ticks. None means no rotation.
ax: (plt.Axes): A matplotlib Axes object. Optional. If not provided, one will be obtained from the canvas.
"""
Expand Down Expand Up @@ -436,6 +440,22 @@ def countplot(
*args,
**kwargs,
)

if rotate_xticks is not None:
ax = ax.set_xticks(
ax.get_xticks(),
ax.get_xticklabels(),
rotation=rotate_xticks,
ha="right",
)
elif rotate_yticks is not None:
ax = ax.set_yticks(
ax.get_yticks(),
ax.get_yticklabels(),
rotation=rotate_yticks,
va="center",
)

if plot_counts:
if orient == "v":
for p in ax.patches:
Expand Down Expand Up @@ -472,6 +492,8 @@ def barplot(
orient: str = None,
title: str = None,
figsize: bool = (12, 4),
rotate_xticks: int = None,
rotate_yticks: int = None,
ax: plt.Axes = None,
**kwargs,
) -> plt.Axes:
Expand All @@ -493,6 +515,8 @@ def barplot(
when both x and y are numeric or when plotting wide-form data.
title (str): Title for the plot. Optional
figsize (tuple): Size of figure in inches. Ignored if ax is provided.
rotate_xticks (int): The degrees to rotate the x_ticks. None means no rotation.
rotate_yticks (int): The degrees to rotate the y_ticks. None means no rotation.
ax: (plt.Axes): A matplotlib Axes object. Optional. If not provided, one will be obtained from the canvas.
"""
Expand All @@ -515,6 +539,21 @@ def barplot(
**kwargs,
)

if rotate_xticks is not None:
ax = ax.set_xticks(
ax.get_xticks(),
ax.get_xticklabels(),
rotation=rotate_xticks,
ha="right",
)
elif rotate_yticks is not None:
ax = ax.set_yticks(
ax.get_yticks(),
ax.get_yticklabels(),
rotation=rotate_yticks,
va="center",
)

if title is not None:
_ = ax.set_title(title)

Expand Down Expand Up @@ -1286,8 +1325,8 @@ def heatmap(
cbar_kws: dict = None,
cbar_ax: plt.Axes = None,
square: bool = True,
xticklabels: Union["str, bool", list, int] = "auto",
yticklabels: Union["str, bool", list, int] = "auto",
xticklabels: Union["str", "bool", list, int] = "auto",
yticklabels: Union["str", "bool", list, int] = "auto",
mask: bool = None,
ax: plt.Axes = None,
title: str = None,
Expand Down

0 comments on commit 73d0c42

Please sign in to comment.