From b6af65a3b2bbaf436e83fa066e910be6310a5ce7 Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <agustin.figueroa@hometogo.de>
Date: Tue, 14 Nov 2023 15:27:18 +0100
Subject: [PATCH 1/9] feature: adding streamlit interface

---
 requirements.txt           |   1 +
 streamlit/streamlit_app.py | 268 +++++++++++++++++++++++++++++++++++++
 wise_pizza/explain.py      |  14 +-
 wise_pizza/plotting.py     |  35 +++--
 4 files changed, 304 insertions(+), 14 deletions(-)
 create mode 100644 streamlit/streamlit_app.py

diff --git a/requirements.txt b/requirements.txt
index 528bed9..18b5aba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,3 +9,4 @@ scipy>=1.8.0
 tqdm
 cloudpickle
 pivottablejs
+streamlit==1.28.0
\ No newline at end of file
diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
new file mode 100644
index 0000000..5fea2ec
--- /dev/null
+++ b/streamlit/streamlit_app.py
@@ -0,0 +1,268 @@
+"""An example of Streamlit leveraging Wise pizza."""
+
+import altair as alt
+import pydeck as pdk
+import streamlit as st
+
+import os, sys
+import datetime
+import random
+from typing import List
+import copy
+import gzip
+
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score, accuracy_score
+import numpy as np
+import pandas as pd
+
+from io import StringIO
+
+import warnings
+warnings.filterwarnings("ignore")
+
+root_path = os.path.realpath('../..')
+print(root_path)
+
+# this assumes that all of the following files are checked in the same directory
+sys.path.append(os.path.join(root_path,"wise-pizza"))
+
+# create data-related directories
+data_dir = os.path.realpath(os.path.join(root_path, 'wise-pizza/data'))
+if not os.path.isdir(data_dir):
+    os.mkdir(data_dir)
+print(data_dir)
+
+from wise_pizza import explain_levels, explain_changes_in_totals, explain_changes_in_average
+# False if you want nice interactive plots
+# True if you want static plots (Doesn't work on all platforms yet)
+plot_is_static = False
+
+
+# SETTING PAGE CONFIG TO WIDE MODE AND ADDING A TITLE AND FAVICON
+st.set_page_config(layout="wide", page_title="Wise Pizza", page_icon=":pizza:")
+
+st.title('Wise Pizza powered by Streamlit')
+st.text('Only categorical columns are accepted, bucket the numeric ones if you wanna use those')
+
+def load_data_upload():
+    uploaded_file = st.file_uploader("Choose a file")
+    data = pd.read_csv(uploaded_file)
+    return data
+
+def load_data_snowflake(input_query, conn):
+    cur = conn.cursor()
+    cur.execute(input_query)
+    sql_df = cur.fetch_pandas_all()
+    return sql_df
+
+on = st.toggle('Use sample data from Github')
+url_data = (r'https://raw.githubusercontent.com/transferwise/wise-pizza/main/data/synth_data.csv')
+
+if on:
+    st.write(f'Downloading data from {url_data}!')
+    df = pd.read_csv(url_data)
+else:
+    df=load_data_upload()
+
+st.text('Table preview')
+st.table(df.head(10))
+
+totals = st.selectbox(
+   "What is the target column that you want to analyse? e.g. GMV/revenue",
+   df.columns,
+   index=None,
+   placeholder="Select target column",
+)
+st.write('You selected:', totals)
+
+
+size = st.selectbox(
+   "What is the volume column of your dataset? e.g. number of users/transactions",
+   df.columns,
+   index=None,
+   placeholder="Select volume column",
+)
+st.write('You selected:', size)
+
+
+flag_column = st.selectbox(
+   "What is the flag column of your dataset you wanan split it by? Ensure this column is binary",
+   df.columns,
+   index=None,
+   placeholder="Select time column",
+)
+st.write('You selected:', flag_column)
+
+flags = sorted(df[flag_column].unique())  # unique flags in the dataset
+
+if len(flags)>2:
+    st.error('Your flag is not binary', icon="🚨")
+
+flags_option = st.selectbox(
+    'Which one in your data belongs to group A?',
+    (flags))
+
+candidates_excluded_columns = [element for element in df.columns if element not in [totals,size,flag_column]]
+
+excluded_columns = st.multiselect(
+    'Please select all columns that you want to exclude from the analysis',
+    candidates_excluded_columns)
+
+non_dimensional_columns = excluded_columns + [totals,size,flag_column]
+
+dims = [element for element in df.columns if element not in non_dimensional_columns]
+
+data = df[df[flag_column] != flags_option]  # take the group to compare to
+pre_data = df[df[flag_column] == flags_option]  # take the group to be compared
+
+st.table(df[dims].head(10))
+
+st.subheader('Finding the juiciest slices', divider='rainbow')
+st.text('This section does not compare groups, but rather checks which features have the most impact in the target column you selected.')
+
+##Finding juiciest slices
+sf = explain_levels(
+    df=df,
+    dims=dims,
+    total_name=totals,
+    size_name=size,
+    max_depth=2,
+    min_segments=20,
+    solver="lasso",
+    return_fig=True
+)
+
+plot_sf=sf.plot(width=500, height=500)
+st.plotly_chart(plot_sf, use_container_width=True)
+
+st.subheader('Analysing differences', divider='rainbow')
+st.text('This section does compare the two groups defined by the flag. Old total is the group A you selected in the dropdown')
+
+##explaining changes overall
+sf1 = explain_changes_in_totals(
+    df1=pre_data,
+    df2=data,
+    dims=dims,
+    total_name=totals,
+    size_name=size,
+    max_depth=2,
+    min_segments=20,
+    how="totals",
+    solver="lasso",
+    return_fig=True
+)
+col1, col2 = st.columns(2)
+
+with col1:
+    plot_sf1=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
+    st.plotly_chart(plot_sf1, use_container_width=True)
+
+with col2:
+    plot_sf2=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
+    st.plotly_chart(plot_sf2, use_container_width=True)
+
+st.subheader('Decomposing differences', divider='rainbow')
+st.text('`split_fits` to separately decompose contribution of size changes and average changes')
+
+sf2 = explain_changes_in_totals(
+    df1=pre_data,
+    df2=data,
+    dims=dims,
+    total_name=totals,
+    size_name=size,
+    max_depth=1,
+    min_segments=10,
+    how="split_fits",
+    solver="lasso",
+    return_fig=True
+)
+plot_sf=sf2.plot(width=500, height=500)
+st.plotly_chart(plot_sf, use_container_width=True)
+
+st.text('`extra_dim` to treat size vs average change contribution as an additional dimension')
+sf3 = explain_changes_in_totals(
+    df1=pre_data,
+    df2=data,
+    dims=dims,
+    total_name=totals,
+    size_name=size,
+    max_depth=2,
+    min_segments=20,
+    how="extra_dim",
+    solver="lasso",
+    return_fig=True
+)
+
+col1, col2 = st.columns(2)
+
+with col1:
+    plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
+    st.plotly_chart(plot_sf1, use_container_width=True)
+
+with col2:
+    plot_sf2=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
+    st.plotly_chart(plot_sf2, use_container_width=True)
+
+st.text('`force_dim` like extra_dim, but each segment must contain a Change_from constraint')
+sf3 = explain_changes_in_totals(
+    df1=pre_data,
+    df2=data,
+    dims=dims,
+    total_name=totals,
+    size_name=size,
+    max_depth=2,
+    min_segments=15,
+    how="force_dim",
+    solver="lasso",
+    return_fig=True
+)
+col1, col2 = st.columns(2)
+
+with col1:
+    plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
+    st.plotly_chart(plot_sf1, use_container_width=True)
+
+with col2:
+    plot_sf2=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
+    st.plotly_chart(plot_sf2, use_container_width=True)
+
+st.subheader('Explaining changes in average', divider='rainbow')
+
+sf4 = explain_changes_in_average(
+    df1=pre_data,
+    df2=data,
+    dims=dims,
+    total_name=totals,
+    size_name=size,
+    max_depth=2,
+    min_segments=20,
+    how="totals",
+    solver="lasso",
+    return_fig=True
+)
+
+col1, col2 = st.columns(2)
+
+with col1:
+    plot_sf1=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
+    st.plotly_chart(plot_sf1, use_container_width=True)
+
+with col2:
+    plot_sf2=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
+    st.plotly_chart(plot_sf2, use_container_width=True)
+
+sf6 = explain_changes_in_average(
+    df1=pre_data,
+    df2=data,
+    dims=dims,
+    total_name=totals,
+    size_name=size,
+    max_depth=2,
+    min_segments=20,
+    how="split_fits",
+    solver="lasso",
+    return_fig=True
+)
+plot_sf=sf6.plot(width=500, height=500)
+st.plotly_chart(plot_sf, use_container_width=True)
\ No newline at end of file
diff --git a/wise_pizza/explain.py b/wise_pizza/explain.py
index 4c2c95b..5834b2b 100644
--- a/wise_pizza/explain.py
+++ b/wise_pizza/explain.py
@@ -26,6 +26,7 @@ def explain_changes_in_average(
     force_add_up: bool = False,
     constrain_signs: bool = True,
     verbose: int = 0,
+    return_fig: bool = False
 ):
     """
     Find segments most useful in explaining the difference between the averages of the two datasets
@@ -48,6 +49,7 @@ def explain_changes_in_average(
     @param constrain_signs: Whether to constrain weights of segments to have the same
     sign as naive segment averages
     @param verbose: If set to a truish value, lots of debug info is printed to console
+    @param return_fig: If set to true, plot returns the figure object, otherwise shows the figures
     @return: A fitted object
     """
     df1 = df1.copy()
@@ -86,6 +88,7 @@ def explain_changes_in_average(
         force_add_up=force_add_up,
         constrain_signs=constrain_signs,
         verbose=verbose,
+        return_fig=return_fig
     )
 
     if hasattr(sf, "pre_total"):
@@ -119,6 +122,7 @@ def explain_changes_in_totals(
     force_add_up: bool = False,
     constrain_signs: bool = True,
     verbose: int = 0,
+    return_fig: bool = False
 ):
     """
     Find segments most useful in explaining the difference between the totals of the two datasets
@@ -141,6 +145,7 @@ def explain_changes_in_totals(
     @param constrain_signs: Whether to constrain weights of segments to have the same
     sign as naive segment averages
     @param verbose: If set to a truish value, lots of debug info is printed to console
+    @param return_fig: If set to true, plot returns the figure object, otherwise shows the figures
     @return: A fitted object
     """
 
@@ -181,6 +186,7 @@ def explain_changes_in_totals(
             force_add_up=force_add_up,
             constrain_signs=constrain_signs,
             verbose=verbose,
+            return_fig=return_fig
         )
 
         sf_avg = explain_levels(
@@ -195,6 +201,7 @@ def explain_changes_in_totals(
             force_add_up=force_add_up,
             constrain_signs=constrain_signs,
             verbose=verbose,
+            return_fig=return_fig
         )
 
         sf_size.final_size = final_size
@@ -207,6 +214,7 @@ def explain_changes_in_totals(
                 plot_is_static=plot_is_static,
                 width=width,
                 height=height,
+                return_fig=return_fig
             )
         )
         return sp
@@ -233,7 +241,7 @@ def explain_changes_in_totals(
         sf.post_total = df2[total_name].sum()
 
         sf.plot = lambda plot_is_static=False, width=1000, height=1000: plot_waterfall(
-            sf, plot_is_static=plot_is_static, width=width, height=height
+            sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig
         )
         sf.task = "changes in totals"
         return sf
@@ -252,6 +260,7 @@ def explain_levels(
     verbose=0,
     force_add_up: bool = False,
     constrain_signs: bool = True,
+    return_fig: bool = False
 ):
     """
     Find segments whose average is most different from the global one
@@ -267,6 +276,7 @@ def explain_levels(
     @param verbose: If set to a truish value, lots of debug info is printed to console
     @param force_add_up: Force the contributions of chosen segments to add up to zero
     @param constrain_signs: Whether to constrain weights of segments to have the same
     sign as naive segment averages
+    @param return_fig: If set to true, plot returns the figure object, otherwise shows the figures
     @return: A fitted object
     """
@@ -304,7 +314,7 @@ def explain_levels(
         s["total"] += average * s["seg_size"]
     # print(average)
     sf.reg.intercept_ = average
-    sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=False: plot_segments(
+    sf.plot = lambda plot_is_static=False, width=2000, height=500, return_fig=return_fig: plot_segments(
         sf, plot_is_static=plot_is_static, width=width, height=height, return_fig=return_fig
     )
     sf.task = "levels"
diff --git a/wise_pizza/plotting.py b/wise_pizza/plotting.py
index 7b4ce04..86db39c 100644
--- a/wise_pizza/plotting.py
+++ b/wise_pizza/plotting.py
@@ -20,6 +20,7 @@ def plot_split_segments(
     plot_is_static: bool = False,
     width: int = 2000,
     height: int = 500,
+    return_fig: bool = False
 ):
     """
     Plot split segments for explain_changes: split_fits
@@ -123,7 +124,10 @@ def plot_split_segments(
             width=width + len(size_data.index) * 30,
         )
     else:
-        fig.show()
+        if return_fig:
+            return fig
+        else:
+            fig.show()
 
 
 def plot_segments(
@@ -274,7 +278,11 @@ def waterfall_layout_args(sf: SliceFinder, width: int = 1000, height: int = 1000
 
 
 def plot_waterfall(
-    sf: SliceFinder, plot_is_static: bool = False, width: int = 1000, height: int = 1000
+    sf: SliceFinder, 
+    plot_is_static: bool = False, 
+    width: int = 1000, 
+    height: int = 1000,
+    return_fig: bool = False
 ):
     """
     Plot waterfall and Bar for explain_changes
@@ -312,14 +320,17 @@ def plot_waterfall(
         **waterfall_layout_args(sf, width, height)
     )
 
-    if plot_is_static:
-        # Convert the figure to a static image
-        image_bytes = to_image(fig, format="png", scale=2)
-        image_bytes2 = to_image(fig2, format="png", scale=2)
-
-        # Display the static image in the Jupyter notebook
-        display(Image(image_bytes, width=width, height=height))
-        display(Image(image_bytes2, width=width, height=height))
+    if return_fig:
+            return [fig, fig2]
     else:
-        fig.show()
-        fig2.show()
+        if plot_is_static:
+            # Convert the figure to a static image
+            image_bytes = to_image(fig, format="png", scale=2)
+            image_bytes2 = to_image(fig2, format="png", scale=2)
+
+            # Display the static image in the Jupyter notebook
+            display(Image(image_bytes, width=width, height=height))
+            display(Image(image_bytes2, width=width, height=height))
+        else:
+            fig.show()
+            fig2.show()
\ No newline at end of file

From 057000d38193905d9f73d3db60e135856281cafe Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <agustin.figueroa@hometogo.de>
Date: Tue, 14 Nov 2023 15:28:45 +0100
Subject: [PATCH 2/9] Update streamlit_app.py

---
 streamlit/streamlit_app.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
index 5fea2ec..b16d53b 100644
--- a/streamlit/streamlit_app.py
+++ b/streamlit/streamlit_app.py
@@ -50,12 +50,6 @@ def load_data_upload():
     data = pd.read_csv(uploaded_file)
     return data
 
-def load_data_snowflake(input_query, conn):
-    cur = conn.cursor()
-    cur.execute(input_query)
-    sql_df = cur.fetch_pandas_all()
-    return sql_df
-
 on = st.toggle('Use sample data from Github')
 url_data = (r'https://raw.githubusercontent.com/transferwise/wise-pizza/main/data/synth_data.csv')
 

From 8e2ee7681598e9a42b21626b6be801e71c21658a Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <agustin.figueroa@hometogo.de>
Date: Tue, 14 Nov 2023 15:31:31 +0100
Subject: [PATCH 3/9] fix: remove unnecessary prints

---
 streamlit/streamlit_app.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
index b16d53b..ebc1d8c 100644
--- a/streamlit/streamlit_app.py
+++ b/streamlit/streamlit_app.py
@@ -22,7 +22,6 @@
 warnings.filterwarnings("ignore")
 
 root_path = os.path.realpath('../..')
-print(root_path)
 
 # this assumes that all of the following files are checked in the same directory
 sys.path.append(os.path.join(root_path,"wise-pizza"))
@@ -31,7 +30,6 @@
 data_dir = os.path.realpath(os.path.join(root_path, 'wise-pizza/data'))
 if not os.path.isdir(data_dir):
     os.mkdir(data_dir)
-print(data_dir)
 
 from wise_pizza import explain_levels, explain_changes_in_totals, explain_changes_in_average
 # False if you want nice interactive plots

From 7362a4da120921cb266ef15f58e9baf770c31350 Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <agustin.figueroa@hometogo.de>
Date: Tue, 14 Nov 2023 15:35:06 +0100
Subject: [PATCH 4/9] fix: remove unnecessary libraries

---
 streamlit/streamlit_app.py | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
index ebc1d8c..2e1b5a4 100644
--- a/streamlit/streamlit_app.py
+++ b/streamlit/streamlit_app.py
@@ -1,25 +1,10 @@
 """An example of Streamlit leveraging Wise pizza."""
 
-import altair as alt
-import pydeck as pdk
 import streamlit as st
-
 import os, sys
-import datetime
-import random
-from typing import List
-import copy
-import gzip
-
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import f1_score, accuracy_score
-import numpy as np
 import pandas as pd
+from wise_pizza import explain_levels, explain_changes_in_totals, explain_changes_in_average
 
-from io import StringIO
-
-import warnings
-warnings.filterwarnings("ignore")
 
 root_path = os.path.realpath('../..')
 
@@ -31,7 +16,6 @@
 if not os.path.isdir(data_dir):
     os.mkdir(data_dir)
 
-from wise_pizza import explain_levels, explain_changes_in_totals, explain_changes_in_average
 # False if you want nice interactive plots
 # True if you want static plots (Doesn't work on all platforms yet)
 plot_is_static = False
@@ -108,8 +92,6 @@ def load_data_upload():
 data = df[df[flag_column] != flags_option]  # take the group to compare to
 pre_data = df[df[flag_column] == flags_option]  # take the group to be compared
 
-st.table(df[dims].head(10))
-
 st.subheader('Finding the juiciest slices', divider='rainbow')
 st.text('This section does not compare groups, but rather checks which features have the most impact in the target column you selected.')
 

From 4319f93e772cbe3ebc1216a4c4767d6dc918642a Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <agustin.figueroa@hometogo.de>
Date: Tue, 14 Nov 2023 15:49:25 +0100
Subject: [PATCH 5/9] chore: add comments

---
 streamlit/streamlit_app.py | 52 ++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 8 deletions(-)

diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
index 2e1b5a4..bb0aeb8 100644
--- a/streamlit/streamlit_app.py
+++ b/streamlit/streamlit_app.py
@@ -27,6 +27,7 @@
 st.title('Wise Pizza powered by Streamlit')
 st.text('Only categorical columns are accepted, bucket the numeric ones if you wanna use those')
 
+# upload the file from the computer
 def load_data_upload():
     uploaded_file = st.file_uploader("Choose a file")
     data = pd.read_csv(uploaded_file)
@@ -35,15 +36,18 @@ def load_data_upload():
 on = st.toggle('Use sample data from Github')
 url_data = (r'https://raw.githubusercontent.com/transferwise/wise-pizza/main/data/synth_data.csv')
 
+# select the datasource, either local or from github
 if on:
     st.write(f'Downloading data from {url_data}!')
     df = pd.read_csv(url_data)
 else:
     df=load_data_upload()
 
+# show dataset preview
 st.text('Table preview')
 st.table(df.head(10))
 
+# ask the user via streamlit for the target column
 totals = st.selectbox(
    "What is the target column that you want to analyse? e.g. GMV/revenue",
    df.columns,
@@ -52,7 +56,7 @@ def load_data_upload():
 )
 st.write('You selected:', totals)
 
-
+# ask the user via streamlit for the size column
 size = st.selectbox(
    "What is the volume column of your dataset? e.g. number of users/transactions",
    df.columns,
@@ -61,7 +65,7 @@ def load_data_upload():
 )
 st.write('You selected:', size)
 
-
+# ask the user via streamlit for the flag column
 flag_column = st.selectbox(
    "What is the flag column of your dataset you wanan split it by? Ensure this column is binary",
    df.columns,
@@ -70,32 +74,41 @@ def load_data_upload():
 )
 st.write('You selected:', flag_column)
 
-flags = sorted(df[flag_column].unique())  # unique flags in the dataset
+# calculate unique flags in the dataset
+flags = sorted(df[flag_column].unique())  
 
+# show an error message if the specified flag column is not binary
 if len(flags)>2:
     st.error('Your flag is not binary', icon="🚨")
 
+# allow users to define what's "old" and "new" in the comparison
 flags_option = st.selectbox(
     'Which one in your data belongs to group A?',
     (flags))
 
+# listing all potential dimensions in the dataframe. 
+# all of them are potential columns to exclude
 candidates_excluded_columns = [element for element in df.columns if element not in [totals,size,flag_column]]
 
+# list with specified columns to exclude.
 excluded_columns = st.multiselect(
     'Please select all columns that you want to exclude from the analysis',
     candidates_excluded_columns)
 
+# all non dimensional columns are the ones picked by the user (if any), plus the ones that indicate totals, size and flag
 non_dimensional_columns = excluded_columns + [totals,size,flag_column]
 
+# calculating dimensions
 dims = [element for element in df.columns if element not in non_dimensional_columns]
 
+# creating the dataframes for the comparison calculations
 data = df[df[flag_column] != flags_option]  # take the group to compare to
 pre_data = df[df[flag_column] == flags_option]  # take the group to be compared
 
 st.subheader('Finding the juiciest slices', divider='rainbow')
 st.text('This section does not compare groups, but rather checks which features have the most impact in the target column you selected.')
 
-##Finding juiciest slices
+## finding juiciest slices
 sf = explain_levels(
     df=df,
     dims=dims,
@@ -107,13 +120,15 @@ def load_data_upload():
     return_fig=True
 )
 
+# storing the plot in a variable
 plot_sf=sf.plot(width=500, height=500)
+# exposing the plot via streamlit
 st.plotly_chart(plot_sf, use_container_width=True)
 
 st.subheader('Analysing differences', divider='rainbow')
 st.text('This section does compare the two groups defined by the flag. Old total is the group A you selected in the dropdown')
 
-##explaining changes overall
+## running explain calculations
 sf1 = explain_changes_in_totals(
     df1=pre_data,
     df2=data,
@@ -126,8 +141,10 @@ def load_data_upload():
     solver="lasso",
     return_fig=True
 )
+# specifying a two column layout on streamlit
 col1, col2 = st.columns(2)
-
+# storing the plots in variables
+# exposing the plots via streamlit
 with col1:
     plot_sf1=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
     st.plotly_chart(plot_sf1, use_container_width=True)
@@ -139,6 +156,7 @@ def load_data_upload():
 st.subheader('Decomposing differences', divider='rainbow')
 st.text('`split_fits` to separately decompose contribution of size changes and average changes')
 
+## running explain calculations
 sf2 = explain_changes_in_totals(
     df1=pre_data,
     df2=data,
@@ -151,10 +169,14 @@ def load_data_upload():
     solver="lasso",
     return_fig=True
 )
+# storing the plot in a variable
+# exposing the plot via streamlit
 plot_sf=sf2.plot(width=500, height=500)
 st.plotly_chart(plot_sf, use_container_width=True)
 
 st.text('`extra_dim` to treat size vs average change contribution as an additional dimension')
+
+## running explain calculations
 sf3 = explain_changes_in_totals(
     df1=pre_data,
     df2=data,
@@ -167,9 +189,11 @@ def load_data_upload():
     solver="lasso",
     return_fig=True
 )
-
+# specifying a two column layout on streamlit
 col1, col2 = st.columns(2)
 
+# storing the plots in variables
+# exposing the plots via streamlit
 with col1:
     plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
     st.plotly_chart(plot_sf1, use_container_width=True)
@@ -179,6 +203,8 @@ def load_data_upload():
     st.plotly_chart(plot_sf2, use_container_width=True)
 
 st.text('`force_dim` like extra_dim, but each segment must contain a Change_from constraint')
+
+## running explain calculations
 sf3 = explain_changes_in_totals(
     df1=pre_data,
     df2=data,
@@ -191,8 +217,11 @@ def load_data_upload():
     solver="lasso",
     return_fig=True
 )
+# specifying a two column layout on streamlit
 col1, col2 = st.columns(2)
 
+# storing the plots in variables
+# exposing the plots via streamlit
 with col1:
     plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
     st.plotly_chart(plot_sf1, use_container_width=True)
@@ -203,6 +232,7 @@ def load_data_upload():
 
 st.subheader('Explaining changes in average', divider='rainbow')
 
+## running explain calculations
 sf4 = explain_changes_in_average(
     df1=pre_data,
     df2=data,
@@ -215,9 +245,11 @@ def load_data_upload():
     solver="lasso",
     return_fig=True
 )
-
+# specifying a two column layout on streamlit
 col1, col2 = st.columns(2)
 
+# storing the plots in variables
+# exposing the plots via streamlit
 with col1:
     plot_sf1=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
     st.plotly_chart(plot_sf1, use_container_width=True)
@@ -226,6 +258,7 @@ def load_data_upload():
     plot_sf2=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
     st.plotly_chart(plot_sf2, use_container_width=True)
 
+## running explain calculations
 sf6 = explain_changes_in_average(
     df1=pre_data,
     df2=data,
@@ -238,5 +271,8 @@ def load_data_upload():
     solver="lasso",
     return_fig=True
 )
+
+# storing the plot in a variable
 plot_sf=sf6.plot(width=500, height=500)
+# exposing the plot via streamlit
 st.plotly_chart(plot_sf, use_container_width=True)
\ No newline at end of file

From 34f0b5bbda51b2459039d9cd9decac472a092ef0 Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <agustin.figueroa@hometogo.de>
Date: Tue, 14 Nov 2023 17:54:32 +0100
Subject: [PATCH 6/9] fix: working directory and error handling

adding back the working directory lines and using the stop function to avoid error messages
---
 streamlit/streamlit_app.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
index bb0aeb8..cf5a08f 100644
--- a/streamlit/streamlit_app.py
+++ b/streamlit/streamlit_app.py
@@ -3,6 +3,25 @@
 import streamlit as st
 import os, sys
 import pandas as pd
+
+import warnings
+warnings.filterwarnings("ignore")
+
+#Data importer 
+import snowflake.connector
+
+root_path = os.path.realpath('../..')
+print(root_path)
+
+# this assumes that all of the following files are checked in the same directory
+sys.path.append(os.path.join(root_path,"wise-pizza"))
+
+# create data-related directories
+data_dir = os.path.realpath(os.path.join(root_path, 'wise-pizza/data'))
+if not os.path.isdir(data_dir):
+    os.mkdir(data_dir)
+print(data_dir)
+
 from wise_pizza import explain_levels, explain_changes_in_totals, explain_changes_in_average
 
 
@@ -30,6 +49,10 @@
 # upload the file from the computer
 def load_data_upload():
     uploaded_file = st.file_uploader("Choose a file")
+    if not uploaded_file:
+        st.warning('Please input a dataset.')
+        st.stop()
+    st.success('Dataset inputted.')
     data = pd.read_csv(uploaded_file)
     return data
 
@@ -74,6 +97,10 @@ def load_data_upload():
 )
 st.write('You selected:', flag_column)
 
+# wait until flag column is added
+if not flag_column:
+        st.stop()
+
 # calculate unique flags in the dataset
 flags = sorted(df[flag_column].unique())  
 

From db04dfa190fe015fde946d1b481cb45fcdc9b804 Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <agustin.figueroa@hometogo.de>
Date: Tue, 14 Nov 2023 17:57:31 +0100
Subject: [PATCH 7/9] fix: formatting

---
 streamlit/streamlit_app.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
index cf5a08f..bc7af6d 100644
--- a/streamlit/streamlit_app.py
+++ b/streamlit/streamlit_app.py
@@ -7,11 +7,7 @@
 import warnings
 warnings.filterwarnings("ignore")
 
-#Data importer 
-import snowflake.connector
-
 root_path = os.path.realpath('../..')
-print(root_path)
 
 # this assumes that all of the following files are checked in the same directory
 sys.path.append(os.path.join(root_path,"wise-pizza"))
@@ -20,21 +16,9 @@
 data_dir = os.path.realpath(os.path.join(root_path, 'wise-pizza/data'))
 if not os.path.isdir(data_dir):
     os.mkdir(data_dir)
-print(data_dir)
 
 from wise_pizza import explain_levels, explain_changes_in_totals, explain_changes_in_average
 
-
-root_path = os.path.realpath('../..')
-
-# this assumes that all of the following files are checked in the same directory
-sys.path.append(os.path.join(root_path,"wise-pizza"))
-
-# create data-related directories
-data_dir = os.path.realpath(os.path.join(root_path, 'wise-pizza/data'))
-if not os.path.isdir(data_dir):
-    os.mkdir(data_dir)
-
 # False if you want nice interactive plots
 # True if you want static plots (Doesn't work on all platforms yet)
 plot_is_static = False

From ba3cff0b80e310dfa3fc3659f40161000fc6a5ae Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <77272542+agusfigueroa-htg@users.noreply.github.com>
Date: Wed, 22 Nov 2023 14:49:39 +0100
Subject: [PATCH 8/9] feature: add docker container

---
 Dockerfile                 |  23 +++
 README.md                  |  25 +++-
 docker_requirements.txt    |  12 ++
 streamlit/streamlit_app.py | 289 ------------------------------------
 streamlit_app.py           | 290 +++++++++++++++++++++++++++++++++++++
 5 files changed, 349 insertions(+), 290 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 docker_requirements.txt
 delete mode 100644 streamlit/streamlit_app.py
 create mode 100644 streamlit_app.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..fa0a6a9
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,23 @@
+# app/Dockerfile
+
+FROM python:3.9-slim
+
+# Set the working directory inside the image
+WORKDIR /wise_pizza
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY . .
+
+RUN pip3 install -r docker_requirements.txt
+
+EXPOSE 8501
+
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 4623ecd..ea14406 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,8 @@ Sometimes, rather than explaining the change in totals from one period to the ne
     - [Comparison between two datasets](#understanding-differences-in-two-time-periods-or-two-dataframes)
 - [Installation](#installation)
 - [Quick Start](#quick-start)
+- [Streamlit app](#streamlit-app)
+    - [Docker container](#docker-container)
 - [For Developers](#for-developers)
  - [Tests](#testing)
 
@@ -134,10 +136,31 @@ sf.segments
 ```
 Please see the full example [here](https://github.com/transferwise/wise-pizza/blob/main/notebooks/Finding%20interesting%20segments.ipynb)
 
-## For Developers
+## Streamlit app
+In the root directory of this repository there is a Streamlit app. This is an interface that allows you to upload your own files and run analyses as you saw in the Jupyter Notebook provided as an example.
+
+To run this, you need to:
+1. Create a virtual environment (e.g. using pyenv)
+2. Activate the virtual environment.
+3. Run `pip -r requirements.txt` before running, to install necessary dependencies.
+4. Run `streamlit run streamlit_app.py` to execute the webapp.
+
+### Docker container
 
+We created a Docker container that makes it easier to deploy this solutoin elsewhere.
 
+You need to first:
+Create the Docker image
 
+```Python
+docker build -t streamlit .      
+```
+And then simply run the image
+
+```Python
+docker run -p 8501:8501 streamlit    
+```
+## For Developers
 
 ### Testing
 We use [PyTest](https://docs.pytest.org/) for testing. If you want to contribute code, make sure that the tests in tests/ run without errors.
diff --git a/docker_requirements.txt b/docker_requirements.txt
new file mode 100644
index 0000000..0f668d9
--- /dev/null
+++ b/docker_requirements.txt
@@ -0,0 +1,12 @@
+ipython==8.14.0
+kaleido==0.2.1
+numpy==1.24.0
+pandas==2.0.2
+pytest==7.4.3
+plotly==5.15.0
+scikit_learn==1.3.2
+scipy==1.8.0
+tqdm==4.66.1
+cloudpickle==3.0.0
+pivottablejs==0.9.0
+streamlit==1.28.0
\ No newline at end of file
diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
deleted file mode 100644
index bc7af6d..0000000
--- a/streamlit/streamlit_app.py
+++ /dev/null
@@ -1,289 +0,0 @@
-"""An example of Streamlit leveraging Wise pizza."""
-
-import streamlit as st
-import os, sys
-import pandas as pd
-
-import warnings
-warnings.filterwarnings("ignore")
-
-root_path = os.path.realpath('../..')
-
-# this assumes that all of the following files are checked in the same directory
-sys.path.append(os.path.join(root_path,"wise-pizza"))
-
-# create data-related directories
-data_dir = os.path.realpath(os.path.join(root_path, 'wise-pizza/data'))
-if not os.path.isdir(data_dir):
-    os.mkdir(data_dir)
-
-from wise_pizza import explain_levels, explain_changes_in_totals, explain_changes_in_average
-
-# False if you want nice interactive plots
-# True if you want static plots (Doesn't work on all platforms yet)
-plot_is_static = False
-
-
-# SETTING PAGE CONFIG TO WIDE MODE AND ADDING A TITLE AND FAVICON
-st.set_page_config(layout="wide", page_title="Wise Pizza", page_icon=":pizza:")
-
-st.title('Wise Pizza powered by Streamlit')
-st.text('Only categorical columns are accepted, bucket the numeric ones if you wanna use those')
-
-# upload the file from the computer
-def load_data_upload():
-    uploaded_file = st.file_uploader("Choose a file")
-    if not uploaded_file:
-        st.warning('Please input a dataset.')
-        st.stop()
-    st.success('Dataset inputted.')
-    data = pd.read_csv(uploaded_file)
-    return data
-
-on = st.toggle('Use sample data from Github')
-url_data = (r'https://raw.githubusercontent.com/transferwise/wise-pizza/main/data/synth_data.csv')
-
-# select the datasource, either local or from github
-if on:
-    st.write(f'Downloading data from {url_data}!')
-    df = pd.read_csv(url_data)
-else:
-    df=load_data_upload()
-
-# show dataset preview
-st.text('Table preview')
-st.table(df.head(10))
-
-# ask the user via streamlit for the target column
-totals = st.selectbox(
-   "What is the target column that you want to analyse? e.g. GMV/revenue",
-   df.columns,
-   index=None,
-   placeholder="Select target column",
-)
-st.write('You selected:', totals)
-
-# ask the user via streamlit for the size column
-size = st.selectbox(
-   "What is the volume column of your dataset? e.g. number of users/transactions",
-   df.columns,
-   index=None,
-   placeholder="Select volume column",
-)
-st.write('You selected:', size)
-
-# ask the user via streamlit for the flag column
-flag_column = st.selectbox(
-   "What is the flag column of your dataset you wanan split it by? Ensure this column is binary",
-   df.columns,
-   index=None,
-   placeholder="Select time column",
-)
-st.write('You selected:', flag_column)
-
-# wait until flag column is added
-if not flag_column:
-        st.stop()
-
-# calculate unique flags in the dataset
-flags = sorted(df[flag_column].unique())  
-
-# show an error message if the specified flag column is not binary
-if len(flags)>2:
-    st.error('Your flag is not binary', icon="🚨")
-
-# allow users to define what's "old" and "new" in the comparison
-flags_option = st.selectbox(
-    'Which one in your data belongs to group A?',
-    (flags))
-
-# listing all potential dimensions in the dataframe. 
-# all of them are potential columns to exclude
-candidates_excluded_columns = [element for element in df.columns if element not in [totals,size,flag_column]]
-
-# list with specified columns to exclude.
-excluded_columns = st.multiselect(
-    'Please select all columns that you want to exclude from the analysis',
-    candidates_excluded_columns)
-
-# all non dimensional columns are the ones picked by the user (if any), plus the ones that indicate totals, size and flag
-non_dimensional_columns = excluded_columns + [totals,size,flag_column]
-
-# calculating dimensions
-dims = [element for element in df.columns if element not in non_dimensional_columns]
-
-# creating the dataframes for the comparison calculations
-data = df[df[flag_column] != flags_option]  # take the group to compare to
-pre_data = df[df[flag_column] == flags_option]  # take the group to be compared
-
-st.subheader('Finding the juiciest slices', divider='rainbow')
-st.text('This section does not compare groups, but rather checks which features have the most impact in the target column you selected.')
-
-## finding juiciest slices
-sf = explain_levels(
-    df=df,
-    dims=dims,
-    total_name=totals,
-    size_name=size,
-    max_depth=2,
-    min_segments=20,
-    solver="lasso",
-    return_fig=True
-)
-
-# storing the plot in a variable
-plot_sf=sf.plot(width=500, height=500)
-# exposing the plot via streamlit
-st.plotly_chart(plot_sf, use_container_width=True)
-
-st.subheader('Analysing differences', divider='rainbow')
-st.text('This section does compare the two groups defined by the flag. Old total is the group A you selected in the dropdown')
-
-## running explain calculations
-sf1 = explain_changes_in_totals(
-    df1=pre_data,
-    df2=data,
-    dims=dims,
-    total_name=totals,
-    size_name=size,
-    max_depth=2,
-    min_segments=20,
-    how="totals",
-    solver="lasso",
-    return_fig=True
-)
-# specifying a two column layout on streamlit
-col1, col2 = st.columns(2)
-# storing the plots in variables
-# exposing the plots via streamlit
-with col1:
-    plot_sf1=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
-    st.plotly_chart(plot_sf1, use_container_width=True)
-
-with col2:
-    plot_sf2=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
-    st.plotly_chart(plot_sf2, use_container_width=True)
-
-st.subheader('Decomposing differences', divider='rainbow')
-st.text('`split_fits` to separately decompose contribution of size changes and average changes')
-
-## running explain calculations
-sf2 = explain_changes_in_totals(
-    df1=pre_data,
-    df2=data,
-    dims=dims,
-    total_name=totals,
-    size_name=size,
-    max_depth=1,
-    min_segments=10,
-    how="split_fits",
-    solver="lasso",
-    return_fig=True
-)
-# storing the plot in a variable
-# exposing the plot via streamlit
-plot_sf=sf2.plot(width=500, height=500)
-st.plotly_chart(plot_sf, use_container_width=True)
-
-st.text('`extra_dim` to treat size vs average change contribution as an additional dimension')
-
-## running explain calculations
-sf3 = explain_changes_in_totals(
-    df1=pre_data,
-    df2=data,
-    dims=dims,
-    total_name=totals,
-    size_name=size,
-    max_depth=2,
-    min_segments=20,
-    how="extra_dim",
-    solver="lasso",
-    return_fig=True
-)
-# specifying a two column layout on streamlit
-col1, col2 = st.columns(2)
-
-# storing the plots in variables
-# exposing the plots via streamlit
-with col1:
-    plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
-    st.plotly_chart(plot_sf1, use_container_width=True)
-
-with col2:
-    plot_sf2=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
-    st.plotly_chart(plot_sf2, use_container_width=True)
-
-st.text('`force_dim` like extra_dim, but each segment must contain a Change_from constraint')
-
-## running explain calculations
-sf3 = explain_changes_in_totals(
-    df1=pre_data,
-    df2=data,
-    dims=dims,
-    total_name=totals,
-    size_name=size,
-    max_depth=2,
-    min_segments=15,
-    how="force_dim",
-    solver="lasso",
-    return_fig=True
-)
-# specifying a two column layout on streamlit
-col1, col2 = st.columns(2)
-
-# storing the plots in variables
-# exposing the plots via streamlit
-with col1:
-    plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
-    st.plotly_chart(plot_sf1, use_container_width=True)
-
-with col2:
-    plot_sf2=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
-    st.plotly_chart(plot_sf2, use_container_width=True)
-
-st.subheader('Explaining changes in average', divider='rainbow')
-
-## running explain calculations
-sf4 = explain_changes_in_average(
-    df1=pre_data,
-    df2=data,
-    dims=dims,
-    total_name=totals,
-    size_name=size,
-    max_depth=2,
-    min_segments=20,
-    how="totals",
-    solver="lasso",
-    return_fig=True
-)
-# specifying a two column layout on streamlit
-col1, col2 = st.columns(2)
-
-# storing the plots in variables
-# exposing the plots via streamlit
-with col1:
-    plot_sf1=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
-    st.plotly_chart(plot_sf1, use_container_width=True)
-
-with col2:
-    plot_sf2=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
-    st.plotly_chart(plot_sf2, use_container_width=True)
-
-## running explain calculations
-sf6 = explain_changes_in_average(
-    df1=pre_data,
-    df2=data,
-    dims=dims,
-    total_name=totals,
-    size_name=size,
-    max_depth=2,
-    min_segments=20,
-    how="split_fits",
-    solver="lasso",
-    return_fig=True
-)
-
-# storing the plot in a variable
-plot_sf=sf6.plot(width=500, height=500)
-# exposing the plot via streamlit
-st.plotly_chart(plot_sf, use_container_width=True)
\ No newline at end of file
diff --git a/streamlit_app.py b/streamlit_app.py
new file mode 100644
index 0000000..a2e2f13
--- /dev/null
+++ b/streamlit_app.py
@@ -0,0 +1,290 @@
+"""An example of Streamlit leveraging Wise pizza."""
+
+import streamlit as st
+import os, sys
+import pandas as pd
+from wise_pizza.explain import explain_levels, explain_changes_in_totals, explain_changes_in_average
+
+
+# False if you want nice interactive plots
+# True if you want static plots (Doesn't work on all platforms yet)
+plot_is_static = False
+
+
+# SETTING PAGE CONFIG TO WIDE MODE AND ADDING A TITLE AND FAVICON
+st.set_page_config(layout="wide", page_title="Wise Pizza", page_icon=":pizza:")
+
+st.title('Wise Pizza powered by Streamlit')
+st.text('Only categorical columns are accepted, bucket the numeric ones if you wanna use those')
+
+# upload the file from the computer
+def load_data_upload():
+    uploaded_file = st.file_uploader("Choose a file")
+    if not uploaded_file:
+        st.warning('Please input a dataset.')
+        st.stop()
+    st.success('Dataset inputted.')
+    data = pd.read_csv(uploaded_file)
+    return data
+
+on = st.toggle('Use sample data from Github')
+url_data = (r'https://raw.githubusercontent.com/transferwise/wise-pizza/main/data/synth_data.csv')
+
+# select the datasource, either local or from github
+if on:
+    st.write(f'Downloading data from {url_data}!')
+    df = pd.read_csv(url_data)
+else:
+    df=load_data_upload()
+
+# show dataset preview
+st.text('Table preview')
+st.table(df.head(10))
+
+# ask the user for relevant dimensions
+dims = st.multiselect(
+    "Select the dimensions you want to include in the analysis",
+    df.select_dtypes(exclude=['number']).columns.tolist()
+    )
+
+# ask the user via streamlit to select if they want to run a comparison between subgroups or not
+flag_comparison = st.toggle('I want to run a comparison between two subgroups in my data')
+
+# return the columns with exactly two distinct values, i.e. the candidates for the comparison flag
+def flag_columns(df):
+    binary_columns = df.columns[df.nunique() == 2].tolist()
+    if not binary_columns:
+        st.warning('No column in the dataset is binary, no comparison can be carried out')
+        st.stop()
+    return binary_columns
+
+if flag_comparison:
+    #calculate binary columns
+    binary_columns=flag_columns(df)
+
+    # ask the user via streamlit for the flag column
+    flag_column = st.selectbox(
+    "What is the flag column of your dataset that defines the two subgroups?",
+    binary_columns,
+    index=None,
+    placeholder="Select flag column column",
+    )
+    st.write('You selected:', flag_column)
+
+    # wait until flag column is added
+    if not flag_column:
+            st.stop()
+
+    # calculate unique flags in the dataset
+    flags = sorted(df[flag_column].unique())  
+
+    # allow users to define what's "old" and "new" in the comparison
+    flags_option = st.selectbox(
+        'Which value in your flag column belongs to group A?',
+        (flags))
+
+
+# ask the user via streamlit for the target column
+totals = st.selectbox(
+   "Name of column that contains totals per segment (e.g. GMV/revenue)",
+   # display only numerical columns
+   df.select_dtypes(include=['number']).columns.tolist(),
+   index=None,
+   placeholder="Select target column",
+)
+st.write('You selected:', totals)
+
+# ask the user via streamlit for the size column
+size = st.selectbox(
+   "Name of column containing segment size (e.g. number of users/number of transactions)",
+   # display only numerical columns
+   df.select_dtypes(include=['number']).columns.tolist(),
+   index=None,
+   placeholder="Select volume column",
+)
+st.write('You selected:', size)
+
+st.subheader('Finding the juiciest slices', divider='rainbow')
+st.text('This section does not compare groups, but rather checks which features have the most impact in the target column you selected.')
+
+if not totals or not size or not dims:
+        st.warning('Please input all fields above.')
+        st.stop()
+
+## finding juiciest slices
+sf = explain_levels(
+    df=df,
+    dims=dims,
+    total_name=totals,
+    size_name=size,
+    max_depth=2,
+    min_segments=20,
+    solver="lasso",
+    return_fig=True
+)
+
+# storing the plot in a variable
+plot_sf=sf.plot(width=500, height=500)
+# exposing the plot via streamlit
+st.plotly_chart(plot_sf, use_container_width=True)
+
+if flag_comparison:
+    st.subheader('Analysing differences', divider='rainbow')
+    st.text('This section does compare the two groups defined by the flag. Old total is the group A you selected in the dropdown')
+
+    # creating the dataframes for the comparison calculations
+    data = df[df[flag_column] != flags_option]  # take the group to compare to
+    pre_data = df[df[flag_column] == flags_option]  # take the group to be compared
+
+    # define the relevant dimensions for the comparison feature
+    # for this, the flag column is to be excluded
+    comparison_dimensions = list(filter(lambda x: x != flag_column, dims))
+
+    ## running explain calculations
+    sf1 = explain_changes_in_totals(
+        df1=pre_data,
+        df2=data,
+        dims=comparison_dimensions,
+        total_name=totals,
+        size_name=size,
+        max_depth=2,
+        min_segments=20,
+        how="totals",
+        solver="lasso",
+        return_fig=True
+    )
+    # specifying a two column layout on streamlit
+    col1, col2 = st.columns(2)
+    # storing the plots in variables
+    # exposing the plots via streamlit
+    with col1:
+        plot_sf1=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
+        st.plotly_chart(plot_sf1, use_container_width=True)
+
+    with col2:
+        plot_sf2=sf1.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
+        st.plotly_chart(plot_sf2, use_container_width=True)
+
+    st.subheader('Decomposing differences', divider='rainbow')
+    st.text('`split_fits` to separately decompose contribution of size changes and average changes')
+
+    ## running explain calculations
+    sf2 = explain_changes_in_totals(
+        df1=pre_data,
+        df2=data,
+        dims=comparison_dimensions,
+        total_name=totals,
+        size_name=size,
+        max_depth=1,
+        min_segments=10,
+        how="split_fits",
+        solver="lasso",
+        return_fig=True
+    )
+    # storing the plot in a variable
+    # exposing the plot via streamlit
+    plot_sf=sf2.plot(width=500, height=500)
+    st.plotly_chart(plot_sf, use_container_width=True)
+
+    st.text('`extra_dim` to treat size vs average change contribution as an additional dimension')
+
+    ## running explain calculations
+    sf3 = explain_changes_in_totals(
+        df1=pre_data,
+        df2=data,
+        dims=comparison_dimensions,
+        total_name=totals,
+        size_name=size,
+        max_depth=2,
+        min_segments=20,
+        how="extra_dim",
+        solver="lasso",
+        return_fig=True
+    )
+    # specifying a two column layout on streamlit
+    col1, col2 = st.columns(2)
+
+    # storing the plots in variables
+    # exposing the plots via streamlit
+    with col1:
+        plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
+        st.plotly_chart(plot_sf1, use_container_width=True)
+
+    with col2:
+        plot_sf2=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
+        st.plotly_chart(plot_sf2, use_container_width=True)
+
+    st.text('`force_dim` like extra_dim, but each segment must contain a Change_from constraint')
+
+    ## running explain calculations
+    sf3 = explain_changes_in_totals(
+        df1=pre_data,
+        df2=data,
+        dims=comparison_dimensions,
+        total_name=totals,
+        size_name=size,
+        max_depth=2,
+        min_segments=15,
+        how="force_dim",
+        solver="lasso",
+        return_fig=True
+    )
+    # specifying a two column layout on streamlit
+    col1, col2 = st.columns(2)
+
+    # storing the plots in variables
+    # exposing the plots via streamlit
+    with col1:
+        plot_sf1=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
+        st.plotly_chart(plot_sf1, use_container_width=True)
+
+    with col2:
+        plot_sf2=sf3.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
+        st.plotly_chart(plot_sf2, use_container_width=True)
+
+    st.subheader('Explaining changes in average', divider='rainbow')
+
+    ## running explain calculations
+    sf4 = explain_changes_in_average(
+        df1=pre_data,
+        df2=data,
+        dims=comparison_dimensions,
+        total_name=totals,
+        size_name=size,
+        max_depth=2,
+        min_segments=20,
+        how="totals",
+        solver="lasso",
+        return_fig=True
+    )
+    # specifying a two column layout on streamlit
+    col1, col2 = st.columns(2)
+
+    # storing the plots in variables
+    # exposing the plots via streamlit
+    with col1:
+        plot_sf1=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[0]
+        st.plotly_chart(plot_sf1, use_container_width=True)
+
+    with col2:
+        plot_sf2=sf4.plot(width=500, height=500, plot_is_static=plot_is_static)[1]
+        st.plotly_chart(plot_sf2, use_container_width=True)
+
+    ## running explain calculations
+    sf6 = explain_changes_in_average(
+        df1=pre_data,
+        df2=data,
+        dims=comparison_dimensions,
+        total_name=totals,
+        size_name=size,
+        max_depth=2,
+        min_segments=20,
+        how="split_fits",
+        solver="lasso",
+        return_fig=True
+    )
+
+    # storing the plot in a variable
+    plot_sf=sf6.plot(width=500, height=500)
+    # exposing the plot via streamlit
+    st.plotly_chart(plot_sf, use_container_width=True)
\ No newline at end of file

From 8d42ce954569fed42b78efd3c2adb8ebbeaf8572 Mon Sep 17 00:00:00 2001
From: agusfigueroa-htg <77272542+agusfigueroa-htg@users.noreply.github.com>
Date: Wed, 22 Nov 2023 14:52:37 +0100
Subject: [PATCH 9/9] Update README.md

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index ea14406..a4d829c 100644
--- a/README.md
+++ b/README.md
@@ -137,19 +137,19 @@ sf.segments
 Please see the full example [here](https://github.com/transferwise/wise-pizza/blob/main/notebooks/Finding%20interesting%20segments.ipynb)
 
 ## Streamlit app
-In the root directory of this repository there is a Streamlit app. This is an interface that allows you to upload your own files and run analyses as you saw in the Jupyter Notebook provided as an example.
+In the root directory of this repository, there is a Streamlit app. This is an interface that allows you to upload your own files and run analyses, as you saw in the Jupyter Notebook provided as an example.
 
 To run this, you need to:
 1. Create a virtual environment (e.g. using pyenv)
-2. Activate the virtual environment.
-3. Run `pip -r requirements.txt` before running, to install necessary dependencies.
-4. Run `streamlit run streamlit_app.py` to execute the webapp.
+2. Activate the virtual environment
+3. Run `pip install -r requirements.txt` before running, to install necessary dependencies
+4. Run `streamlit run streamlit_app.py` to run Streamlit
 
 ### Docker container
 
-We created a Docker container that makes it easier to deploy this solutoin elsewhere.
+We created a Docker container that makes it easier to deploy this solution elsewhere.
 
-You need to first:
+To use it, you first need to:
 Create the Docker image
 
 ```Python