Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: adding streamlit interface to wise pizza #24

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ scipy>=1.8.0
tqdm
cloudpickle
pivottablejs
streamlit==1.28.0
289 changes: 289 additions & 0 deletions streamlit/streamlit_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
"""An example of Streamlit leveraging Wise pizza."""

import streamlit as st
import os, sys
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# Resolve every path from the location of this script rather than from the
# current working directory, so the app can be launched from any folder.
app_dir = os.path.dirname(os.path.realpath(__file__))
# this file lives in <repo>/streamlit/, so the repository root is one level up
repo_dir = os.path.dirname(app_dir)
# kept for backward compatibility: the directory that contains the repository
root_path = os.path.dirname(repo_dir)

# make the local wise_pizza package importable, whatever the clone is named
sys.path.append(repo_dir)

# create data-related directories (no-op if they already exist)
data_dir = os.path.join(repo_dir, 'data')
os.makedirs(data_dir, exist_ok=True)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be nice to be able to launch the app from any folder (didn't work for me initially and I had to change the root_path)
You could add the streamlit folder to the wise-pizza folder and make the relevant changes to achieve this

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Must have been solved, please try this again!


from wise_pizza import explain_levels, explain_changes_in_totals, explain_changes_in_average

# Rendering mode passed to the waterfall plots further down:
#   False -> nice interactive plots
#   True  -> static plots (doesn't work on all platforms yet)
plot_is_static = False


# Page setup: wide layout, browser-tab title and favicon
st.set_page_config(
    page_title="Wise Pizza",
    page_icon=":pizza:",
    layout="wide",
)

st.title("Wise Pizza powered by Streamlit")
st.text("Only categorical columns are accepted, bucket the numeric ones if you wanna use those")

# upload the file from the computer
def load_data_upload():
    """Ask the user to upload a CSV file and return it as a DataFrame.

    The current script run is halted (``st.stop()``) while no file has been
    uploaded, and also when the uploaded file cannot be parsed as CSV, so
    callers only ever receive a successfully parsed dataset.

    Returns:
        pandas.DataFrame: the parsed contents of the uploaded file.
    """
    uploaded_file = st.file_uploader("Choose a file")
    if not uploaded_file:
        st.warning('Please input a dataset.')
        st.stop()
    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as err:
        # show a readable message instead of a raw traceback
        st.error(f'Could not read the uploaded file as CSV: {err}')
        st.stop()
    # report success only once the file has actually been parsed
    st.success('Dataset inputted.')
    return data

# location of the bundled sample dataset
url_data = r'https://raw.githubusercontent.com/transferwise/wise-pizza/main/data/synth_data.csv'
use_sample_data = st.toggle('Use sample data from Github')

# pick the data source: a local upload by default, the Github sample on request
if not use_sample_data:
    df = load_data_upload()
else:
    st.write(f'Downloading data from {url_data}!')
    df = pd.read_csv(url_data)

# preview the first ten rows of whichever dataset was loaded
st.text('Table preview')
preview_rows = df.head(10)
st.table(preview_rows)

def _ask_for_column(question, hint):
    """Render a column picker over ``df.columns`` and echo the current choice.

    Nothing is pre-selected (``index=None``); the value returned by the
    selectbox is passed back to the caller unchanged.
    """
    picked = st.selectbox(question, df.columns, index=None, placeholder=hint)
    st.write('You selected:', picked)
    return picked


# target column: the quantity the analysis will explain
totals = _ask_for_column(
    "What is the target column that you want to analyse? e.g. GMV/revenue",
    "Select target column",
)

# size column: the volume that goes with the target
size = _ask_for_column(
    "What is the volume column of your dataset? e.g. number of users/transactions",
    "Select volume column",
)

# ask the user via streamlit for the flag column, i.e. the column whose
# values split the dataset into the two groups that get compared
flag_column = st.selectbox(
    "What is the flag column of your dataset you want to split it by? Ensure this column is binary",
    df.columns,
    index=None,
    # the placeholder used to read "Select time column", which does not match this widget
    placeholder="Select flag column",
)
st.write('You selected:', flag_column)

# wait until all three columns have been chosen: the analysis below needs the
# target and size columns just as much as the flag column
if totals is None or size is None or flag_column is None:
    st.stop()

# calculate unique flags in the dataset
flags = sorted(df[flag_column].unique())

# a two-group comparison needs exactly two flag values; report the problem
# and halt instead of running the analysis on an ill-defined split
if len(flags) != 2:
    st.error('Your flag is not binary', icon="🚨")
    st.stop()

# allow users to define what's "old" and "new" in the comparison
flags_option = st.selectbox(
    'Which one in your data belongs to group A?',
    flags,
)

# columns with a fixed role (target, volume, flag) can never be dimensions
role_columns = [totals, size, flag_column]

# every remaining column is a potential dimension, so the user may exclude it
excludable_columns = []
for column in df.columns:
    if column not in role_columns:
        excludable_columns.append(column)

# columns the user explicitly removes from the analysis
excluded_columns = st.multiselect(
    'Please select all columns that you want to exclude from the analysis',
    excludable_columns,
)

# non-dimensional columns: the user's exclusions (if any) plus the role columns
non_dimensional_columns = excluded_columns + role_columns

# dimensions are whatever is left, in the dataframe's own column order
dims = [column for column in df.columns if column not in non_dimensional_columns]

# split the rows into the two groups used by the comparison calculations
flag_values = df[flag_column]
pre_data = df[flag_values == flags_option]  # group A: the group to be compared
data = df[flag_values != flags_option]  # everything else: the group to compare to

st.subheader('Finding the juiciest slices', divider='rainbow')
st.text('This section does not compare groups, but rather checks which features have the most impact in the target column you selected.')

# fit on the full dataset; return_fig=True makes plot() hand back the figure
levels_settings = dict(
    df=df,
    dims=dims,
    total_name=totals,
    size_name=size,
    max_depth=2,
    min_segments=20,
    solver="lasso",
    return_fig=True,
)
sf = explain_levels(**levels_settings)

# build the figure and expose it via streamlit
st.plotly_chart(sf.plot(width=500, height=500), use_container_width=True)

st.subheader('Analysing differences', divider='rainbow')
st.text('This section does compare the two groups defined by the flag. Old total is the group A you selected in the dropdown')

## running explain calculations
sf1 = explain_changes_in_totals(
    df1=pre_data,
    df2=data,
    dims=dims,
    total_name=totals,
    size_name=size,
    max_depth=2,
    min_segments=20,
    how="totals",
    solver="lasso",
    return_fig=True,
)

# build the figures once; calling plot() separately for each column would
# construct both figures twice
totals_figs = sf1.plot(width=500, height=500, plot_is_static=plot_is_static)
plot_sf1, plot_sf2 = totals_figs[0], totals_figs[1]

# expose the two plots side by side in a two column layout
col1, col2 = st.columns(2)
with col1:
    st.plotly_chart(plot_sf1, use_container_width=True)

with col2:
    st.plotly_chart(plot_sf2, use_container_width=True)

st.subheader('Decomposing differences', divider='rainbow')
st.text('`split_fits` to separately decompose contribution of size changes and average changes')

# same two groups as before, fitted with how="split_fits"
split_fits_settings = dict(
    df1=pre_data,
    df2=data,
    dims=dims,
    total_name=totals,
    size_name=size,
    max_depth=1,
    min_segments=10,
    how="split_fits",
    solver="lasso",
    return_fig=True,
)
sf2 = explain_changes_in_totals(**split_fits_settings)

# build the figure and expose it via streamlit
st.plotly_chart(sf2.plot(width=500, height=500), use_container_width=True)

st.text('`extra_dim` to treat size vs average change contribution as an additional dimension')

## running explain calculations
sf3 = explain_changes_in_totals(
    df1=pre_data,
    df2=data,
    dims=dims,
    total_name=totals,
    size_name=size,
    max_depth=2,
    min_segments=20,
    how="extra_dim",
    solver="lasso",
    return_fig=True,
)

# build the figures once; calling plot() separately for each column would
# construct both figures twice
extra_dim_figs = sf3.plot(width=500, height=500, plot_is_static=plot_is_static)
plot_sf1, plot_sf2 = extra_dim_figs[0], extra_dim_figs[1]

# expose the two plots side by side in a two column layout
col1, col2 = st.columns(2)
with col1:
    st.plotly_chart(plot_sf1, use_container_width=True)

with col2:
    st.plotly_chart(plot_sf2, use_container_width=True)

st.text('`force_dim` like extra_dim, but each segment must contain a Change_from constraint')

## running explain calculations
sf3 = explain_changes_in_totals(
    df1=pre_data,
    df2=data,
    dims=dims,
    total_name=totals,
    size_name=size,
    max_depth=2,
    min_segments=15,
    how="force_dim",
    solver="lasso",
    return_fig=True,
)

# build the figures once; calling plot() separately for each column would
# construct both figures twice
force_dim_figs = sf3.plot(width=500, height=500, plot_is_static=plot_is_static)
plot_sf1, plot_sf2 = force_dim_figs[0], force_dim_figs[1]

# expose the two plots side by side in a two column layout
col1, col2 = st.columns(2)
with col1:
    st.plotly_chart(plot_sf1, use_container_width=True)

with col2:
    st.plotly_chart(plot_sf2, use_container_width=True)

st.subheader('Explaining changes in average', divider='rainbow')

## running explain calculations
sf4 = explain_changes_in_average(
    df1=pre_data,
    df2=data,
    dims=dims,
    total_name=totals,
    size_name=size,
    max_depth=2,
    min_segments=20,
    how="totals",
    solver="lasso",
    return_fig=True,
)

# build the figures once; calling plot() separately for each column would
# construct both figures twice
average_figs = sf4.plot(width=500, height=500, plot_is_static=plot_is_static)
plot_sf1, plot_sf2 = average_figs[0], average_figs[1]

# expose the two plots side by side in a two column layout
col1, col2 = st.columns(2)
with col1:
    st.plotly_chart(plot_sf1, use_container_width=True)

with col2:
    st.plotly_chart(plot_sf2, use_container_width=True)

# changes in average again, this time fitted with how="split_fits"
average_split_settings = dict(
    df1=pre_data,
    df2=data,
    dims=dims,
    total_name=totals,
    size_name=size,
    max_depth=2,
    min_segments=20,
    how="split_fits",
    solver="lasso",
    return_fig=True,
)
sf6 = explain_changes_in_average(**average_split_settings)

# build the figure and expose it via streamlit
st.plotly_chart(sf6.plot(width=500, height=500), use_container_width=True)
14 changes: 12 additions & 2 deletions wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def explain_changes_in_average(
force_add_up: bool = False,
constrain_signs: bool = True,
verbose: int = 0,
return_fig: bool = False
):
"""
Find segments most useful in explaining the difference between the averages of the two datasets
Expand All @@ -48,6 +49,7 @@ def explain_changes_in_average(
@param constrain_signs: Whether to constrain weights of segments to have the same
sign as naive segment averages
@param verbose: If set to a truish value, lots of debug info is printed to console
@param return_fig: If set to true, plot returns the figure object, otherwise shows the figures
@return: A fitted object
"""
df1 = df1.copy()
Expand Down Expand Up @@ -86,6 +88,7 @@ def explain_changes_in_average(
force_add_up=force_add_up,
constrain_signs=constrain_signs,
verbose=verbose,
return_fig=return_fig
)

if hasattr(sf, "pre_total"):
Expand Down Expand Up @@ -119,6 +122,7 @@ def explain_changes_in_totals(
force_add_up: bool = False,
constrain_signs: bool = True,
verbose: int = 0,
return_fig: bool = False
):
"""
Find segments most useful in explaining the difference between the totals of the two datasets
Expand All @@ -141,6 +145,7 @@ def explain_changes_in_totals(
@param constrain_signs: Whether to constrain weights of segments to have the same
sign as naive segment averages
@param verbose: If set to a truish value, lots of debug info is printed to console
@param return_fig: If set to true, plot returns the figure object, otherwise shows the figures
@return: A fitted object
"""

Expand Down Expand Up @@ -181,6 +186,7 @@ def explain_changes_in_totals(
force_add_up=force_add_up,
constrain_signs=constrain_signs,
verbose=verbose,
return_fig=return_fig
)

sf_avg = explain_levels(
Expand All @@ -195,6 +201,7 @@ def explain_changes_in_totals(
force_add_up=force_add_up,
constrain_signs=constrain_signs,
verbose=verbose,
return_fig=return_fig
)

sf_size.final_size = final_size
Expand All @@ -207,6 +214,7 @@ def explain_changes_in_totals(
plot_is_static=plot_is_static,
width=width,
height=height,
return_fig=return_fig
)
)
return sp
Expand All @@ -233,7 +241,7 @@ def explain_changes_in_totals(
sf.post_total = df2[total_name].sum()

sf.plot = lambda plot_is_static=False, width=1000, height=1000: plot_waterfall(
sf, plot_is_static=plot_is_static, width=width, height=height
sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig
)
sf.task = "changes in totals"
return sf
Expand All @@ -252,6 +260,7 @@ def explain_levels(
verbose=0,
force_add_up: bool = False,
constrain_signs: bool = True,
return_fig: bool = False
):
"""
Find segments whose average is most different from the global one
Expand All @@ -267,6 +276,7 @@ def explain_levels(
@param verbose: If set to a truish value, lots of debug info is printed to console
@param force_add_up: Force the contributions of chosen segments to add up to zero
@param constrain_signs: Whether to constrain weights of segments to have the same
sign as naive segment averages
@param return_fig: If set to true, plot returns the figure object, otherwise shows the figures
@return: A fitted object
"""
Expand Down Expand Up @@ -304,7 +314,7 @@ def explain_levels(
s["total"] += average * s["seg_size"]
# print(average)
sf.reg.intercept_ = average
sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False: plot_segments(
sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=return_fig: plot_segments(
sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig
)
sf.task = "levels"
Expand Down
Loading
Loading