Skip to content

Commit

Permalink
Merge pull request #20 from DataShades/SXLLCAXAAD-8
Browse files Browse the repository at this point in the history
SXLLCAXAAD-8 / add logic for working with missing values
  • Loading branch information
alexmorev authored Nov 22, 2024
2 parents 22c4c70 + 6e26e7d commit 9f307dc
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 16 deletions.
16 changes: 14 additions & 2 deletions ckanext/charts/chart_builders/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,20 @@ def skip_null_values_field(self) -> dict[str, Any]:
"validators": [
self.get_validator("boolean_validator"),
],
"help_text": """Entries in the data with N/A or NULL will not be
graphed and will be skipped"""
"help_text": """Entries of the data with missing values will not be
graphed or will be skipped""",
}

def break_chart_field(self) -> dict[str, Any]:
return {
"field_name": "break_chart",
"label": "Break the chart",
"form_snippet": "chart_checkbox.html",
"group": "Data",
"validators": [
self.get_validator("boolean_validator"),
],
"help_text": "Break the graph at missing values",
}

def sort_x_field(self) -> dict[str, Any]:
Expand Down
95 changes: 81 additions & 14 deletions ckanext/charts/chart_builders/plotly.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from pandas.core.frame import DataFrame
from pandas.errors import ParserError
from plotly.subplots import make_subplots

from ckanext.charts import exception
Expand Down Expand Up @@ -46,37 +49,95 @@ class PlotlyLineBuilder(PlotlyBuilder):
def to_json(self) -> Any:
return self.build_line_chart()


def _split_data_by_year(self) -> dict[str, Any]:
"""
Prepare data for a line chart. It splits the data by year stated
in the date format column which is used for x-axis.
"""
self.df["date"] = pd.to_datetime(self.df[self.settings["x"]]).dt.date
self.df = self.df[["date", self.settings["y"][0]]].set_index(["date"])
self.df = self.df[[self.settings["y"][0]]].groupby(["date"]).mean()
if len(self.settings["years"]) > 1:
self.df.drop_duplicates(subset=[self.settings["x"]], inplace=True)
self.df = self.df[[self.settings["x"], self.settings["y"][0]]]
self.df["year"] = pd.to_datetime(self.df[self.settings["x"]]).dt.year

self.df = self.df.pivot(
index=self.settings["x"],
columns="year",
values=self.settings["y"][0],
)

self.settings["y"] = self.df.columns.to_list()
self.df[self.settings["x"]] = self.df.index
self.df[self.settings["x"]] = pd.to_datetime(
self.df[self.settings["x"]],
unit="ns",
).dt.strftime("%m-%d %H:%M")

self.df.index = [pd.to_datetime(self.df.index).strftime("%m/%d"), pd.to_datetime(self.df.index).strftime("%Y")]
self.df = self.df[self.settings["y"][0]].unstack()
return self

self.settings["y"] = self.df.columns.to_list()
self.df["date_time"] = self.df.index

return self
def _skip_null_values(self, column: str) -> tuple[Any]:
"""
Return values for x-axis and y-axis after removing missing values.
"""
if self.settings.get("split_data") and len(self.settings["years"]) > 1:
df = self.df.dropna(subset=column)
else:
df = self.df

if self.settings.get("skip_null_values"):
if self.settings.get("break_chart"):
x, y = self._break_chart_by_missing_data(df, column)
else:
x = df[self.settings["x"]]
y = df[column]
else:
x = df[self.settings["x"]]
y = df[column].fillna(0)

return x, y

def _skip_null_values(self, column) -> tuple[Any]:
if self.settings.get("skip_null_values", True):
x = self.df[self.df[column].notna()][self.settings["x"]]
y = self.df[self.df[column].notna()][column]

def _break_chart_by_missing_data(self, df: DataFrame, column: str) -> tuple[Any]:
"""
Find gaps in date column and fill them with missing dates.
"""
if len(self.settings["years"]) > 1:
df["xaxis"] = df[self.settings["x"]]

if self.settings.get("split_data"):
df[self.settings["x"]] = df.index

df["date"] = pd.to_datetime(df[self.settings["x"]]).dt.date

all_dates = pd.date_range(
start=df["date"].min(),
end=df["date"].max(),
unit="ns",
).date
date_range_df = pd.DataFrame({"date": all_dates})
df = pd.merge(date_range_df, df, on="date", how="left")

x = df["xaxis"]
y = df[column]
else:
x = self.df[self.settings["x"]]
y = self.df[column].fillna(0)
x = df[self.settings["x"]]
y = df[column]

return x, y


def build_line_chart(self) -> Any:
"""
Build a line chart. It supports multi columns for y-axis
to display on the line chart.
"""
try:
dates = pd.to_datetime(self.df[self.settings["x"]], unit="ns")
self.settings["years"] = dates.dt.year.unique()
except (ParserError, ValueError):
self.settings["years"] = []

if self.settings.get("split_data", False):
self._split_data_by_year()

Expand All @@ -89,6 +150,7 @@ def build_line_chart(self) -> Any:
x=x,
y=y,
name=self.settings["y"][0],
connectgaps=not self.settings.get("break_chart"),
),
secondary_y=False,
)
Expand All @@ -102,10 +164,14 @@ def build_line_chart(self) -> Any:
x=x,
y=y,
name=column,
connectgaps=not self.settings.get("break_chart"),
),
secondary_y=True,
)

if self.settings.get("split_data") and len(self.settings["years"]) > 1:
fig.update_layout(xaxis={"categoryorder": "category ascending"})

if chart_title := self.settings.get("chart_title"):
fig.update_layout(title_text=chart_title)

Expand Down Expand Up @@ -256,6 +322,7 @@ def get_form_fields(self):
self.sort_y_field(),
self.split_data_field(),
self.skip_null_values_field(),
self.break_chart_field(),
self.limit_field(maximum=1000000),
self.chart_title_field(),
self.chart_xlabel_field(),
Expand Down

0 comments on commit 9f307dc

Please sign in to comment.