Skip to content

Commit

Permalink
Merge pull request #16 from robertmcleod2/chain-assistant-final-version
Browse files Browse the repository at this point in the history
Chain assistant final version
  • Loading branch information
robertmcleod2 authored Oct 14, 2024
2 parents 640db68 + b6a7c6a commit b254d31
Show file tree
Hide file tree
Showing 7 changed files with 5,898 additions and 2,511 deletions.
206 changes: 192 additions & 14 deletions notebooks/smart_meter_data_generation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,16 @@
"# generate random daily usage data\n",
"df['usage'] = np.sin(np.pi * (df.index.hour)/24) * 10 + np.random.randint(0, 5, len(df))\n",
"\n",
"# add anomalies\n",
"# add single anomaly days\n",
"anomaly_days = ['2024-08-10', '2024-08-20']\n",
"for day in anomaly_days:\n",
" df.loc[day, 'usage'] = np.sin(np.pi * (df.loc[day, 'usage'].index.hour)/24) * 12 + np.random.randint(0, 5, len(df.loc[day, 'usage']))\n",
"\n",
"# add prolonged anomaly period\n",
"prolonged_anomaly_days = ['2024-08-27', '2024-08-28', '2024-08-29', '2024-08-30', '2024-08-31']\n",
"for day in prolonged_anomaly_days:\n",
" df.loc[day, 'usage'] = np.sin(np.pi * (df.loc[day, 'usage'].index.hour)/24) * 11.5 + np.random.randint(0, 5, len(df.loc[day, 'usage']))\n",
"\n",
"# plot using plotly\n",
"fig = px.line(df, x=df.index, y='usage')\n",
"fig.show()\n",
Expand All @@ -35,6 +40,67 @@
"df.to_csv('smart_meter_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# generate weather data\n",
"date_range = pd.date_range(start='2024-08-01', end='2024-09-01', freq='15min')\n",
"date_range = date_range[:-1]\n",
"\n",
"df_weather = pd.DataFrame(index=date_range)\n",
"\n",
"df_weather['temperature'] = np.sin(np.pi * (df_weather.index.hour)/24) * 10 + 15 + np.random.randint(0, 5, len(df_weather))\n",
"\n",
"# add single anomaly days\n",
"anomaly_days = ['2024-08-10', '2024-08-20']\n",
"for day in anomaly_days:\n",
" df_weather.loc[day, 'temperature'] = np.sin(np.pi * (df_weather.loc[day, 'temperature'].index.hour)/24) * 10 + 25 + np.random.randint(0, 5, len(df.loc[day, 'usage']))\n",
"\n",
"fig = px.line(df_weather, x=df_weather.index, y='temperature')\n",
"fig.show()\n",
"\n",
"# save to csv\n",
"df_weather.to_csv('weather_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"from scipy.stats import zscore\n",
"\n",
"\n",
"def detect_daily_anomalies(df):\n",
" \"\"\"\n",
" Detects anomalies in energy usage from smart meter data.\n",
"\n",
" Simple anomaly detection using z-score exceeding 2 for daily usage.\n",
" \"\"\"\n",
"\n",
" # Aggregate the data to daily usage\n",
" df_daily = df.resample(\"D\").sum()\n",
"\n",
" # Calculate the Z-score for daily usage\n",
" df_daily[\"zscore\"] = zscore(df_daily[\"usage\"])\n",
"\n",
" # Identify anomalies\n",
" df_daily[\"anomalies\"] = df_daily[\"zscore\"] > 2\n",
"\n",
" # if there are any anomalies, return the dates\n",
" if df_daily[\"anomalies\"].any():\n",
" anomalies = df_daily[df_daily[\"anomalies\"]].index\n",
" return anomalies\n",
" else:\n",
" return None\n",
" \n",
"anomalies = detect_daily_anomalies(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -43,23 +109,135 @@
"source": [
"from scipy.stats import zscore\n",
"\n",
"# Aggregate the data to daily usage\n",
"df_daily = df.resample('D').sum()\n",
"\n",
"# Calculate the Z-score for daily usage\n",
"df_daily['zscore'] = zscore(df_daily['usage'])\n",
"def detect_prolonged_anomalies(df, min_consecutive_days=3, zscore_threshold=1.5):\n",
" \"\"\"\n",
" Detects prolonged anomalies in energy usage from smart meter data.\n",
"\n",
"# Identify outliers (e.g., Z-score > 2 or < -2)\n",
"df_daily['outlier'] = df_daily['zscore'].abs() > 2\n",
" Anomaly detection using the average z-score exceeding a threshold for a prolonged period.\n",
"\n",
"# add a column to the original dataframe to indicate if the day is an outlier\n",
"df['outlier'] = df.index.floor('D').isin(df_daily[df_daily['outlier']].index)\n",
" Parameters:\n",
" df (pd.DataFrame): DataFrame containing the energy usage data with a datetime index.\n",
" min_consecutive_days (int): Minimum number of consecutive days to consider as a prolonged anomaly.\n",
" zscore_threshold (float): Z-score threshold for detecting anomalies.\n",
"\n",
"# plot original data with outliers highlighted with a red box covering the day\n",
"fig = px.line(df, x=df.index, y='usage', title='Daily Usage with Outliers Highlighted')\n",
"for outlier in df_daily[df_daily['outlier']].index:\n",
" fig.add_vrect(x0=outlier, x1=outlier + pd.Timedelta(days=1), fillcolor='red', opacity=0.25, line_width=0, annotation_text='Outlier')\n",
"fig.show()"
" Returns:\n",
" pd.DataFrame: DataFrame with the anomaly_window column indicating prolonged anomalies.\n",
" \"\"\"\n",
"\n",
" # Aggregate the data to daily usage\n",
" df_daily = df.resample(\"D\").sum()\n",
"\n",
" # Calculate the Z-score for daily usage\n",
" df_daily[\"zscore\"] = zscore(df_daily[\"usage\"])\n",
"\n",
" # Initialize a column to mark prolonged anomalies\n",
" df_daily[\"prolonged_anomaly_length\"] = 0\n",
"\n",
" # Sliding window to calculate the average Z-score over periods\n",
" for window_size in range(min_consecutive_days, len(df_daily) + 1):\n",
" avg_zscore = df_daily[\"zscore\"].rolling(window=window_size).mean()\n",
" prolonged_anomaly = (avg_zscore > zscore_threshold).astype(int) * window_size\n",
" df_daily[\"prolonged_anomaly_length\"] = np.maximum(\n",
" df_daily[\"prolonged_anomaly_length\"], prolonged_anomaly\n",
" )\n",
"\n",
" # Create prolonged_anomaly_window column\n",
" df_daily[\"anomaly_window\"] = False\n",
" for i in range(len(df_daily)):\n",
" if df_daily[\"prolonged_anomaly_length\"].iloc[i] > 0:\n",
" start_idx = i - df_daily[\"prolonged_anomaly_length\"].iloc[i] + 1\n",
" df_daily.iloc[start_idx : i + 1, df_daily.columns.get_loc(\"anomaly_window\")] = True\n",
"\n",
" # if there are any outliers, return the dates\n",
" if df_daily[\"anomaly_window\"].any():\n",
" anomalies = df_daily[df_daily[\"anomaly_window\"]].index\n",
" # get start and end dates of each of the prolonged anomaly windows\n",
" prolonged_anomalies = []\n",
" for i in range(len(anomalies)):\n",
" if i == 0:\n",
" start_date = anomalies[i]\n",
" elif anomalies[i] != anomalies[i - 1] + pd.Timedelta(days=1):\n",
" prolonged_anomalies.append((start_date, anomalies[i - 1]))\n",
" start_date = anomalies[i]\n",
" elif i == len(anomalies) - 1:\n",
" prolonged_anomalies.append((start_date, anomalies[i]))\n",
" return prolonged_anomalies\n",
" else:\n",
" return None\n",
" \n",
"# Detect prolonged anomalies in the smart meter data\n",
"prolonged_anomalies = detect_prolonged_anomalies(df)\n",
"print(prolonged_anomalies)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"weather_data = pd.read_csv(\"weather_data.csv\", index_col=0, parse_dates=True)\n",
"\n",
"def analyse_weather_data(df, anomalies, prolonged_anomalies):\n",
" \"\"\"\n",
" Analyse weather data to identify correlations with energy usage anomalies.\n",
"\n",
" Parameters:\n",
" df (pd.DataFrame): DataFrame containing weather data with a datetime index.\n",
" anomalies (list): List of dates with energy usage anomalies.\n",
" prolonged_anomalies (list): List of tuples with start and end dates of prolonged anomalies.\n",
"\n",
" Returns:\n",
" pd.DataFrame: DataFrame with weather data during anomalies.\n",
" \"\"\"\n",
"\n",
" # resample to daily data\n",
" df_daily = df.resample(\"D\").mean()\n",
"\n",
" # calculate average temperature\n",
" average_temperature = df_daily[\"temperature\"].mean()\n",
"\n",
" # calculate average temperature for each anomaly\n",
" anomaly_temperatures = {}\n",
" for anomaly in anomalies:\n",
" anomaly_temperatures[anomaly] = df_daily.loc[anomaly, \"temperature\"]\n",
"\n",
" # calculate average temperature for each prolonged anomaly\n",
" prolonged_anomaly_temperatures = {}\n",
" for start_date, end_date in prolonged_anomalies:\n",
" prolonged_anomaly_temperatures[(start_date, end_date)] = df_daily.loc[start_date:end_date, \"temperature\"].mean()\n",
"\n",
" average_temperature_str = \"Average temperature: {:.2f}\".format(average_temperature)\n",
" anomaly_temperatures_str = \"Temperature during anomalies: \" + \", \".join(\n",
" [f\"[{date.strftime('%Y-%m-%d')}: {temperature:.2f}]\" for date, temperature in anomaly_temperatures.items()]\n",
" )\n",
" prolonged_anomaly_temperatures_str = \"Temperature during prolonged anomalies: \" + \", \".join(\n",
" [\n",
" f\"[{start_date.strftime('%Y-%m-%d')} - {end_date.strftime('%Y-%m-%d')}: {temperature:.2f}]\"\n",
" for (start_date, end_date), temperature in prolonged_anomaly_temperatures.items()\n",
" ]\n",
" )\n",
" return average_temperature_str, anomaly_temperatures_str, prolonged_anomaly_temperatures_str\n",
"\n",
"\n",
"# Analyse weather data during energy usage anomalies\n",
"average_temperature_str, anomaly_temperatures_str, prolonged_anomaly_temperatures_str = analyse_weather_data(weather_data, anomalies, prolonged_anomalies)\n",
"\n",
"print(average_temperature_str)\n",
"print(anomaly_temperatures_str)\n",
"print(prolonged_anomaly_temperatures_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_daily"
]
}
],
Expand Down
16 changes: 13 additions & 3 deletions src/app.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
import streamlit as st
from chatbot_rag_anomaly_detection import ChatbotRAG as Chatbot
from dotenv import load_dotenv
from utils import check_password

load_dotenv()

if not check_password():
st.stop() # Do not continue if check_password is not True.
# if not check_password():
# st.stop() # Do not continue if check_password is not True.

st.title("Energy Usage Anomaly Detection Assistant")
st.logo(image="src/logo.png", size="large")
st.markdown(
"""
<style>
img[data-testid="stLogo"] {
height: 3.5rem;
}
</style>
""",
unsafe_allow_html=True,
)

if "messages" not in st.session_state:
st.session_state.messages = []
Expand Down
46 changes: 33 additions & 13 deletions src/chatbot_rag_anomaly_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,16 @@
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveJsonSplitter
from utils import detect_anomalies, load_smart_meter_data, plot_anomalies
from utils import (
analyse_weather_data,
detect_daily_anomalies,
detect_prolonged_anomalies,
generate_anomaly_text,
load_smart_meter_data,
load_weather_data,
plot_anomalies,
plot_weather,
)

set_debug(True)

Expand All @@ -21,20 +30,26 @@
class ChatbotRAG:

def __init__(self):

### Load smart meter data and detect anomalies ###
df = load_smart_meter_data()
anomalies = detect_anomalies(df)
if anomalies is not None:
fig = plot_anomalies(df, anomalies)
st.session_state.messages.append({"role": "assistant", "content": fig})
anomalies_str = ", ".join(anomalies.strftime("%Y-%m-%d"))
self.anomaly_text = f"""Anomalies have been detected in the customer's energy usage. \
The anomalies are on the following dates: {anomalies_str}. \
Help the user to identify the causes of the anomalies and suggest ways to fix them."""
else:
self.anomaly_text = """No anomalies have been detected in the customer's energy usage. \
You may still help the user to address any concerns they have about their energy usage."""
weather_df = load_weather_data()
anomalies = detect_daily_anomalies(df)
prolonged_anomalies = detect_prolonged_anomalies(df)
average_temperature_str, anomaly_temperatures_str, prolonged_anomaly_temperatures_str = (
analyse_weather_data(weather_df, anomalies, prolonged_anomalies)
)
fig = plot_anomalies(df, anomalies, prolonged_anomalies)
st.session_state.messages.append({"role": "assistant", "content": fig})
fig_weather = plot_weather(weather_df)
st.session_state.messages.append({"role": "assistant", "content": fig_weather})
self.anomaly_text = generate_anomaly_text(
anomalies,
prolonged_anomalies,
average_temperature_str,
anomaly_temperatures_str,
prolonged_anomaly_temperatures_str,
)

### initialize chain ###
self.initialize_chain()
Expand All @@ -44,6 +59,7 @@ def __init__(self):
from the smart meter data and any other relevant context."""
response = self.stream(initial_prompt)
st.plotly_chart(fig)
st.plotly_chart(fig_weather)
st.session_state.messages.append({"role": "assistant", "content": response})

def initialize_chain(self):
Expand Down Expand Up @@ -81,6 +97,10 @@ def initialize_chain(self):
You are helping a user to detect anomalies in their energy usage. \
An anomaly detection has already been performed. \
{self.anomaly_text} \
If there are anomalies, you can compare when they have occured against the additional context \
provided to determine if there are any patterns. You have been provided data up to 2024-08-31, and \
so if any anomalies continue up to the end of the data you have been provided, \
they may still be ongoing. This must be taken into account when providing advice.
"""
+ """ The user will describe their energy usage and you will help them to resolve \
or determine additional issues. \
Expand Down
Binary file added src/logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit b254d31

Please sign in to comment.