diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fa0a6a9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +# app/Dockerfile + +FROM python:3.9-slim + +# Copy Code +WORKDIR /wise_pizza + +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + software-properties-common \ + git \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip3 install -r docker_requirements.txt + +EXPOSE 8501 + +HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health + +ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"] \ No newline at end of file diff --git a/README.md b/README.md index 4623ecd..a4d829c 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ Sometimes, rather than explaining the change in totals from one period to the ne - [Comparison between two datasets](#understanding-differences-in-two-time-periods-or-two-dataframes) - [Installation](#installation) - [Quick Start](#quick-start) +- [Streamlit app](#streamlit-app) + - [Docker container](#docker-container) - [For Developers](#for-developers) - [Tests](#testing) @@ -134,10 +136,31 @@ sf.segments ``` Please see the full example [here](https://github.com/transferwise/wise-pizza/blob/main/notebooks/Finding%20interesting%20segments.ipynb) -## For Developers +## Streamlit app +In the root directory of this repository, there is a Streamlit app. This is an interface that allows you to upload your own files and run analyses, as you saw in the Jupyter Notebook provided as an example. + +To run this, you need to: +1. Create a virtual environment (e.g. using pyenv) +2. Activate the virtual environment +3. Run `pip install -r requirements.txt` to install the necessary dependencies +4. 
Run `streamlit run streamlit_app.py` to run Streamlit + +### Docker container +We created a Docker container that makes it easier to deploy this solution elsewhere. + +First, build the Docker image: +```bash +docker build -t streamlit . +``` +Then run the image: + +```bash +docker run -p 8501:8501 streamlit +``` +## For Developers ### Testing We use [PyTest](https://docs.pytest.org/) for testing. If you want to contribute code, make sure that the tests in tests/ run without errors. diff --git a/docker_requirements.txt b/docker_requirements.txt new file mode 100644 index 0000000..0f668d9 --- /dev/null +++ b/docker_requirements.txt @@ -0,0 +1,12 @@ +ipython==8.14.0 +kaleido==0.2.1 +numpy==1.24.0 +pandas==2.0.2 +pytest==7.4.3 +plotly==5.15.0 +scikit_learn==1.3.2 +scipy==1.8.0 +tqdm==4.66.1 +cloudpickle==3.0.0 +pivottablejs==0.9.0 +streamlit==1.28.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 528bed9..18b5aba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ scipy>=1.8.0 tqdm cloudpickle pivottablejs +streamlit==1.28.0 \ No newline at end of file diff --git a/streamlit_app.py b/streamlit_app.py new file mode 100644 index 0000000..a2e2f13 --- /dev/null +++ b/streamlit_app.py @@ -0,0 +1,290 @@ +"""An example of Streamlit leveraging Wise pizza.""" + +import streamlit as st +import os, sys +import pandas as pd +from wise_pizza.explain import explain_levels, explain_changes_in_totals, explain_changes_in_average + + +# False if you want nice interactive plots +# True if you want static plots (Doesn't work on all platforms yet) +plot_is_static = False + + +# SETTING PAGE CONFIG TO WIDE MODE AND ADDING A TITLE AND FAVICON +st.set_page_config(layout="wide", page_title="Wise Pizza", page_icon=":pizza:") + +st.title('Wise Pizza powered by Streamlit') +st.text('Only categorical columns are accepted, bucket the numeric ones if you wanna use those') + +# upload the file from the 
computer +def load_data_upload(): + uploaded_file = st.file_uploader("Choose a file") + if not uploaded_file: + st.warning('Please input a dataset.') + st.stop() + st.success('Dataset inputted.') + data = pd.read_csv(uploaded_file) + return data + +on = st.toggle('Use sample data from Github') +url_data = (r'https://raw.githubusercontent.com/transferwise/wise-pizza/main/data/synth_data.csv') + +# select the datasource, either local or from github +if on: + st.write(f'Downloading data from {url_data}!') + df = pd.read_csv(url_data) +else: + df=load_data_upload() + +# show dataset preview +st.text('Table preview') +st.table(df.head(10)) + +# ask the user for relevant dimensions +dims = st.multiselect( + "Select the dimensions you want to include in the analysis", + df.select_dtypes(exclude=['number']).columns.tolist() + ) + +# ask the user via streamlit to select if they want to run a comparison between subgroups or not +flag_comparison = st.toggle('I want to run a comparison between two subgroups in my data') + +# return columns that are candidate for comparison +def flag_columns(df): + binary_columns = df.columns[df.nunique() == 2].tolist() + if not binary_columns: + st.warning('No column in the dataset is binary, no comparison can be carried out') + st.stop() + return binary_columns + +if flag_comparison: + #calculate binary columns + binary_columns=flag_columns(df) + + # ask the user via streamlit for the flag column + flag_column = st.selectbox( + "What is the flag column of your dataset that defines the two subgroups?", + binary_columns, + index=None, + placeholder="Select flag column column", + ) + st.write('You selected:', flag_column) + + # wait until flag column is added + if not flag_column: + st.stop() + + # calculate unique flags in the dataset + flags = sorted(df[flag_column].unique()) + + # allow users to define what's "old" and "new" in the comparison + flags_option = st.selectbox( + 'Which value in your flag column belongs to group A?', + (flags)) + 
+ +# ask the user via streamlit for the target column +totals = st.selectbox( + "Name of column that contains totals per segment (e.g. GMV/revenue)", + # display only numerical columns + df.select_dtypes(include=['number']).columns.tolist(), + index=None, + placeholder="Select target column", +) +st.write('You selected:', totals) + +# ask the user via streamlit for the size column +size = st.selectbox( + "Name of column containing segment size (e.g. number of users/number of transactions)", + # display only numerical columns + df.select_dtypes(include=['number']).columns.tolist(), + index=None, + placeholder="Select volume column", +) +st.write('You selected:', size) + +st.subheader('Finding the juiciest slices', divider='rainbow') +st.text('This section does not compare groups, but rather checks which features have the most impact in the target column you selected.') + +if not totals or not size or not dims: + st.warning('Please input all fields above.') + st.stop() + +## finding juiciest slices +sf = explain_levels( + df=df, + dims=dims, + total_name=totals, + size_name=size, + max_depth=2, + min_segments=20, + solver="lasso", + return_fig=True +) + +# storing the plot in a variable +plot_sf=sf.plot(width=500, height=500) +# exposing the plot via streamlit +st.plotly_chart(plot_sf, use_container_width=True) + +if flag_comparison: + st.subheader('Analysing differences', divider='rainbow') + st.text('This section does compare the two groups defined by the flag. 
Old total is the group A you selected in the dropdown') + + # creating the dataframes for the comparison calculations + data = df[df[flag_column] != flags_option] # take the group to compare to + pre_data = df[df[flag_column] == flags_option] # take the group to be compared + + # define the relevant dimensions for the comparison feature + # for this, the flag column is to be excluded + comparison_dimensions = list(filter(lambda x: x != flag_column, dims)) + + ## running explain calculations + sf1 = explain_changes_in_totals( + df1=pre_data, + df2=data, + dims=comparison_dimensions, + total_name=totals, + size_name=size, + max_depth=2, + min_segments=20, + how="totals", + solver="lasso", + return_fig=True + ) + # specifying a two column layout on streamlit + col1, col2 = st.columns(2) + # storing the plots in variables + # exposing the plots via streamlit + with col1: + plot_sf1=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[0] + st.plotly_chart(plot_sf1, use_container_width=True) + + with col2: + plot_sf2=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[1] + st.plotly_chart(plot_sf2, use_container_width=True) + + st.subheader('Decomposing differences', divider='rainbow') + st.text('`split_fits` to separately decompose contribution of size changes and average changes') + + ## running explain calculations + sf2 = explain_changes_in_totals( + df1=pre_data, + df2=data, + dims=comparison_dimensions, + total_name=totals, + size_name=size, + max_depth=1, + min_segments=10, + how="split_fits", + solver="lasso", + return_fig=True + ) + # storing the plot in a variable + # exposing the plot via streamlit + plot_sf=sf2.plot(width=500, height=500) + st.plotly_chart(plot_sf, use_container_width=True) + + st.text('`extra_dim` to treat size vs average change contribution as an additional dimension') + + ## running explain calculations + sf3 = explain_changes_in_totals( + df1=pre_data, + df2=data, + dims=comparison_dimensions, + total_name=totals, + 
size_name=size, + max_depth=2, + min_segments=20, + how="extra_dim", + solver="lasso", + return_fig=True + ) + # specifying a two column layout on streamlit + col1, col2 = st.columns(2) + + # storing the plots in variables + # exposing the plots via streamlit + with col1: + plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0] + st.plotly_chart(plot_sf1, use_container_width=True) + + with col2: + plot_sf2=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[1] + st.plotly_chart(plot_sf2, use_container_width=True) + + st.text('`force_dim` like extra_dim, but each segment must contain a Change_from constraint') + + ## running explain calculations + sf3 = explain_changes_in_totals( + df1=pre_data, + df2=data, + dims=comparison_dimensions, + total_name=totals, + size_name=size, + max_depth=2, + min_segments=15, + how="force_dim", + solver="lasso", + return_fig=True + ) + # specifying a two column layout on streamlit + col1, col2 = st.columns(2) + + # storing the plots in variables + # exposing the plots via streamlit + with col1: + plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0] + st.plotly_chart(plot_sf1, use_container_width=True) + + with col2: + plot_sf2=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[1] + st.plotly_chart(plot_sf2, use_container_width=True) + + st.subheader('Explaining changes in average', divider='rainbow') + + ## running explain calculations + sf4 = explain_changes_in_average( + df1=pre_data, + df2=data, + dims=comparison_dimensions, + total_name=totals, + size_name=size, + max_depth=2, + min_segments=20, + how="totals", + solver="lasso", + return_fig=True + ) + # specifying a two column layout on streamlit + col1, col2 = st.columns(2) + + # storing the plots in variables + # exposing the plots via streamlit + with col1: + plot_sf1=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[0] + st.plotly_chart(plot_sf1, use_container_width=True) + + with col2: + 
plot_sf2=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[1] + st.plotly_chart(plot_sf2, use_container_width=True) + + ## running explain calculations + sf6 = explain_changes_in_average( + df1=pre_data, + df2=data, + dims=comparison_dimensions, + total_name=totals, + size_name=size, + max_depth=2, + min_segments=20, + how="split_fits", + solver="lasso", + return_fig=True + ) + + # storing the plot in a variable + plot_sf=sf6.plot(width=500, height=500) + # exposing the plot via streamlit + st.plotly_chart(plot_sf, use_container_width=True) \ No newline at end of file diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py index 4c2c95b..5834b2b 100644 --- a/wise_pizza/explain.py +++ b/wise_pizza/explain.py @@ -26,6 +26,7 @@ def explain_changes_in_average( force_add_up: bool = False, constrain_signs: bool = True, verbose: int = 0, + return_fig: bool = False ): """ Find segments most useful in explaining the difference between the averages of the two datasets @@ -48,6 +49,7 @@ def explain_changes_in_average( @param constrain_signs: Whether to constrain weights of segments to have the same sign as naive segment averages @param verbose: If set to a truish value, lots of debug info is printed to console + @param return_fig: If set to true, plot returns the figure object, otherwise shows the figures @return: A fitted object """ df1 = df1.copy() @@ -86,6 +88,7 @@ def explain_changes_in_average( force_add_up=force_add_up, constrain_signs=constrain_signs, verbose=verbose, + return_fig=return_fig ) if hasattr(sf, "pre_total"): @@ -119,6 +122,7 @@ def explain_changes_in_totals( force_add_up: bool = False, constrain_signs: bool = True, verbose: int = 0, + return_fig: bool = False ): """ Find segments most useful in explaining the difference between the totals of the two datasets @@ -141,6 +145,7 @@ def explain_changes_in_totals( @param constrain_signs: Whether to constrain weights of segments to have the same sign as naive segment averages @param verbose: If 
set to a truish value, lots of debug info is printed to console + @param return_fig: If set to true, plot returns the figure object, otherwise shows the figures @return: A fitted object """ @@ -181,6 +186,7 @@ def explain_changes_in_totals( force_add_up=force_add_up, constrain_signs=constrain_signs, verbose=verbose, + return_fig=return_fig ) sf_avg = explain_levels( @@ -195,6 +201,7 @@ def explain_changes_in_totals( force_add_up=force_add_up, constrain_signs=constrain_signs, verbose=verbose, + return_fig=return_fig ) sf_size.final_size = final_size @@ -207,6 +214,7 @@ def explain_changes_in_totals( plot_is_static=plot_is_static, width=width, height=height, + return_fig=return_fig ) ) return sp @@ -233,7 +241,7 @@ def explain_changes_in_totals( sf.post_total = df2[total_name].sum() sf.plot = lambda plot_is_static=False, width=1000, height=1000: plot_waterfall( - sf, plot_is_static=plot_is_static, width=width, height=height + sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig ) sf.task = "changes in totals" return sf @@ -252,6 +260,7 @@ def explain_levels( verbose=0, force_add_up: bool = False, constrain_signs: bool = True, + return_fig: bool = False ): """ Find segments whose average is most different from the global one @@ -267,6 +276,7 @@ def explain_levels( @param verbose: If set to a truish value, lots of debug info is printed to console @param force_add_up: Force the contributions of chosen segments to add up to zero @param constrain_signs: Whether to constrain weights of segments to have the same sign as naive segment averages + @param return_fig: If set to true, plot returns the figure object, otherwise shows the figures @return: A fitted object """ @@ -304,7 +314,7 @@ def explain_levels( s["total"] += average * s["seg_size"] # print(average) sf.reg.intercept_ = average - sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False: plot_segments( + sf.plot = lambda plot_is_static=False, width=2000, 
height=500, return_fig=return_fig: plot_segments( sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig ) sf.task = "levels" diff --git a/wise_pizza/plotting.py b/wise_pizza/plotting.py index 7b4ce04..86db39c 100644 --- a/wise_pizza/plotting.py +++ b/wise_pizza/plotting.py @@ -20,6 +20,7 @@ def plot_split_segments( plot_is_static: bool = False, width: int = 2000, height: int = 500, + return_fig: bool = False ): """ Plot split segments for explain_changes: split_fits @@ -123,7 +124,10 @@ def plot_split_segments( width=width + len(size_data.index) * 30, ) else: - fig.show() + if return_fig: + return fig + else: + fig.show() def plot_segments( @@ -274,7 +278,11 @@ def waterfall_layout_args(sf: SliceFinder, width: int = 1000, height: int = 1000 def plot_waterfall( - sf: SliceFinder, plot_is_static: bool = False, width: int = 1000, height: int = 1000 + sf: SliceFinder, + plot_is_static: bool = False, + width: int = 1000, + height: int = 1000, + return_fig: bool = False ): """ Plot waterfall and Bar for explain_changes @@ -312,14 +320,17 @@ def plot_waterfall( **waterfall_layout_args(sf, width, height) ) - if plot_is_static: - # Convert the figure to a static image - image_bytes = to_image(fig, format="png", scale=2) - image_bytes2 = to_image(fig2, format="png", scale=2) - - # Display the static image in the Jupyter notebook - display(Image(image_bytes, width=width, height=height)) - display(Image(image_bytes2, width=width, height=height)) + if return_fig: + return [fig, fig2] else: - fig.show() - fig2.show() + if plot_is_static: + # Convert the figure to a static image + image_bytes = to_image(fig, format="png", scale=2) + image_bytes2 = to_image(fig2, format="png", scale=2) + + # Display the static image in the Jupyter notebook + display(Image(image_bytes, width=width, height=height)) + display(Image(image_bytes2, width=width, height=height)) + else: + fig.show() + fig2.show() \ No newline at end of file