diff --git a/pyproject.toml b/pyproject.toml index 6a50407..5e84bfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "tw-experimentation" -version = "0.1.1.13" +version = "0.1.2" description = "Wise AB platform" authors = ["Wise"] readme = "README.md" @@ -56,3 +56,26 @@ optional = true causaltune = "^0.1.3" +[tool.black] +line-length = 88 +target_version = ["py39"] +include = '\.pyi?$' +exclude = ''' + +( + /( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.mypy_cache + | \.tox + | \.venv + | _build + | build + | dist + )/ +) +''' + +[tool.isort] +profile = "black" +line_length = 88 \ No newline at end of file diff --git a/tw_experimentation/bayes/bayes_model.py b/tw_experimentation/bayes/bayes_model.py index 2e125b9..f761903 100644 --- a/tw_experimentation/bayes/bayes_model.py +++ b/tw_experimentation/bayes/bayes_model.py @@ -72,7 +72,7 @@ def set_model( assert self._model_is_well_defined() def set_prior_model(self, variable: str, model, model_params: dict): - """Set prior model for a likelihood model parameter + """Set prior model for a likelihood model parameter. Args: variable (str): name of variab model variable to be fed into likelihood @@ -87,7 +87,7 @@ def set_prior_model(self, variable: str, model, model_params: dict): self.prior_model_params[variable] = model_params def set_prior_model_param(self, variable: str, model_params: dict): - """Set parameters for prior + """Set parameters for prior. Args: variable (str): name of variab model variable to be fed into likelihood diff --git a/tw_experimentation/bayes/bayes_test.py b/tw_experimentation/bayes/bayes_test.py index b045768..8d707c9 100644 --- a/tw_experimentation/bayes/bayes_test.py +++ b/tw_experimentation/bayes/bayes_test.py @@ -37,9 +37,7 @@ @dataclass class BayesResult: - """ - Class to store the results of a Bayesian test - """ + """Class to store the results of a Bayesian test.""" targets: List[str] metric_types: List[str] @@ -90,8 +88,8 @@ def bayes_factor_decision( return "reject null" def prob_greater_than_zero(self, target: str): - """ - Compute the probability that the average treatment effect is greater than zero + """Compute the probability that the average treatment effect is greater than + zero. Args: target (str): target metric @@ -103,8 +101,7 @@ def prob_greater_than_zero(self, target: str): } def prob_greater_than_z(self, z: float, target: str): - """ - Compute the probability that the average treatment effect is greater than z + """Compute the probability that the average treatment effect is greater than z. Args: z (float): threshold @@ -117,8 +114,7 @@ def prob_greater_than_z(self, z: float, target: str): } def prob_smaller_than_z(self, z: float, target: str): - """ - Compute the probability that the average treatment effect is smaller than z + """Compute the probability that the average treatment effect is smaller than z. Args: z (float): threshold @@ -131,9 +127,8 @@ def prob_smaller_than_z(self, z: float, target: str): } def prob_greater_than_z_absolute(self, z: float, target: str): - """ - Compute the probability that the absolute value of - the average treatment effect is greater than z + """Compute the probability that the absolute value of the average treatment + effect is greater than z. 
Args: z (float): threshold @@ -147,8 +142,8 @@ def prob_greater_than_z_absolute(self, z: float, target: str): } def prob_within_interval(self, z_lower: float, z_upper: float, target: str): - """ - Compute the probability that the average treatment effect is within the interval [z_lower, z_upper] + """Compute the probability that the average treatment effect is within the + interval [z_lower, z_upper] Args: z_lower (float): lower bound of interval @@ -163,8 +158,8 @@ def prob_within_interval(self, z_lower: float, z_upper: float, target: str): } def prob_outside_interval(self, z_lower: float, z_upper: float, target: str): - """ - Compute the probability that the average treatment effect is outside the interval [z_lower, z_upper] + """Compute the probability that the average treatment effect is outside the + interval [z_lower, z_upper] Args: z_lower (float): lower bound of interval @@ -184,9 +179,8 @@ def rope( rope_upper: Optional[float] = None, rope_lower: Optional[float] = None, ): - """ - Compute the probability that the average treatment effect - is in the region of practical equivalence (ROPE) + """Compute the probability that the average treatment effect is in the region of + practical equivalence (ROPE) https://easystats.github.io/bayestestR/articles/region_of_practical_equivalence.html @@ -215,7 +209,8 @@ def rope( def _rope_interval_autodetect_intervals( self, target: str, scale_param: Optional[float] = 0.1 ): - """Compute the ROPE interval based on the standard deviation of the target metric + """Compute the ROPE interval based on the standard deviation of the target + metric. Args: target (str): target metric @@ -232,8 +227,7 @@ def _rope_interval_autodetect_intervals( def _posterior_and_hdi_plot( self, sample_per_variant, posterior_hdi_per_variant, distribution_opacity=0.3 ): - """ - Plot the posterior distribution and the high density interval (HDI) + """Plot the posterior distribution and the high density interval (HDI) Args: sample_per_variant (dict): dictionary of posterior samples @@ -306,8 +300,8 @@ def _posterior_and_hdi_plot( return fig def fig_posterior_by_target(self, target: str, distribution_opacity: float = 0.3): - """ - Plot the posterior distribution and the high density interval (HDI) of the expected value + """Plot the posterior distribution and the high density interval (HDI) of the + expected value. Args: target (str): target metric @@ -331,8 +325,8 @@ def fig_posterior_by_target(self, target: str, distribution_opacity: float = 0.3 def fig_posterior_cdf_by_target( self, target: str, distribution_opacity: float = 0.3, facet_rows_variant=False ): - """ - Generates a plot of the empirical cumulative distribution (ECDF) function of treatment effect for a given target. + """Generates a plot of the empirical cumulative distribution (ECDF) function of + treatment effect for a given target. Args: target (str): The target for which to generate the plot. @@ -360,8 +354,8 @@ def fig_posterior_cdf_by_target( def fig_posterior_difference_by_target( self, target: str, distribution_opacity: float = 0.3 ): - """ - Plot the posterior distribution and the high density interval (HDI) of the expected treatment effect + """Plot the posterior distribution and the high density interval (HDI) of the + expected treatment effect. 
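The probability helpers in BayesResult touched above (prob_greater_than_zero, prob_greater_than_z, prob_within_interval, and friends) presumably all reduce, for a given target, to counting posterior draws of the treatment effect against a threshold or interval. A minimal sketch of that logic, assuming the posterior samples are available as a plain NumPy array (the function names and array are illustrative, not the class's actual internals):

import numpy as np

def prob_greater_than_z(samples: np.ndarray, z: float) -> float:
    # Share of posterior draws of the treatment effect above the threshold z.
    return float(np.mean(samples > z))

def prob_within_interval(samples: np.ndarray, z_lower: float, z_upper: float) -> float:
    # Share of posterior draws falling inside [z_lower, z_upper].
    return float(np.mean((samples >= z_lower) & (samples <= z_upper)))

# Illustrative usage with synthetic draws of an average treatment effect.
rng = np.random.default_rng(0)
ate_draws = rng.normal(loc=0.3, scale=0.5, size=10_000)
print(prob_greater_than_z(ate_draws, 0.0))           # ~P(ATE > 0)
print(prob_within_interval(ate_draws, -0.1, 0.1))    # ~P(ATE in [-0.1, 0.1])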
Args: target (str): target metric @@ -390,10 +384,9 @@ def fig_posterior_difference_cdf( shade_areas: bool = True, shade_limits: Tuple[Union[float, None], Union[float, None]] = (None, None), ) -> make_subplots: - """ - Generates a plotly figure showing the cumulative density function of the treatment effect - for each variant, based on the posterior distribution of the difference in means between - the variant and the control group. + """Generates a plotly figure showing the cumulative density function of the + treatment effect for each variant, based on the posterior distribution of the + difference in means between the variant and the control group. Args: sample_per_variant (dict): A dictionary mapping variant names to lists of samples. @@ -498,7 +491,7 @@ def set_model( # self.set_prior_model(*model_and_params) def set_prior_model(self, target, variable: str, model, model_params: dict): - """Set prior model for a likelihood model parameter + """Set prior model for a likelihood model parameter. Args: variable (str): name of variab model variable to be fed into likelihood @@ -516,8 +509,7 @@ def set_prior_model(self, target, variable: str, model, model_params: dict): self.update_prior_model_param(target, variable, model_params) def update_prior_model_param(self, target, variable: str, model_params: dict): - """Update parameters for prior. - The prior model must have been defined before. + """Update parameters for prior. The prior model must have been defined before. Args: variable (str): name of variab model variable to be fed into likelihood @@ -533,7 +525,7 @@ def update_prior_model_param(self, target, variable: str, model_params: dict): self.params_models_per_target[target][variable] = model_params def set_model_to_default(self): - """Reset the bayesian model to default settings""" + """Reset the bayesian model to default settings.""" self.likelihood_model_per_target = {} self.variables_per_target = {} self.prior_models_per_target = {} @@ -566,9 +558,7 @@ def _setup_bayesmodel(self, target, fit_model=True): return bm def compute_posterior(self, store_prior=True, compute_bayes_factor=True, verbose=0): - """Run the Bayesian model via numpyro to obtain the - posterior distribution - """ + """Run the Bayesian model via numpyro to obtain the posterior distribution.""" # TODO: save priors on this level @@ -645,10 +635,8 @@ def compute_bayes_factor(self): self._store_prior_means(target, mcmc) def _compute_posterior_predictive(self): - """ - compute posterior predictive distribution from posterior samples - only possible after model fit - """ + """Compute posterior predictive distribution from posterior samples only + possible after model fit.""" N_SAMPLES_POST_PRED = 100000 for target in self.ed.targets: self.post_pred[target] = {} diff --git a/tw_experimentation/bayes/numpyro_monkeypatch.py b/tw_experimentation/bayes/numpyro_monkeypatch.py index 9e0a4dc..1870fe8 100644 --- a/tw_experimentation/bayes/numpyro_monkeypatch.py +++ b/tw_experimentation/bayes/numpyro_monkeypatch.py @@ -25,8 +25,7 @@ class ZeroInflatedProbsPatch(Distribution): - """ - ZeroInflatedProbs distribution from Numpyro + """ZeroInflatedProbs distribution from Numpyro. 
https://num.pyro.ai/en/stable/_modules/numpyro/distributions/discrete.html#ZeroInflatedDistribution @@ -42,8 +41,9 @@ def __init__(self, base_dist, gate, *, validate_args=None): # assert base_dist.support.is_discrete if base_dist.event_shape: raise ValueError( - "ZeroInflatedProbs expected empty base_dist.event_shape but got {}" - .format(base_dist.event_shape) + "ZeroInflatedProbs expected empty base_dist.event_shape but got {}".format( + base_dist.event_shape + ) ) # XXX: we might need to promote parameters of base_dist but let's keep # this simplified for now diff --git a/tw_experimentation/checker.py b/tw_experimentation/checker.py index 030e9ac..b6a03fd 100644 --- a/tw_experimentation/checker.py +++ b/tw_experimentation/checker.py @@ -64,8 +64,8 @@ def _dynamic_sample_size_descriptives(self): @property def sample_size_table(self): - """ - Returns a pandas DataFrame containing the sample sizes for each variant and the total sample size. + """Returns a pandas DataFrame containing the sample sizes for each variant and + the total sample size. Returns: pandas.DataFrame: A DataFrame with the sample sizes for each variant and the total sample size. @@ -107,9 +107,7 @@ def qqplot_variant_variant(self, target): return plot_qq_variants(qq_variants, self.ed, target) def create_tables_and_plots(self): - """ - Creates the tables and plots for the monitoring. - """ + """Creates the tables and plots for the monitoring.""" fig_sample_size_pie = plot_sample_size_pie(self.ed) if self.ed.is_dynamic_observation: @@ -147,9 +145,8 @@ def create_tables_and_plots(self): @dataclass class NormalityChecks: ed: ExperimentDataset - """ - A class for checking whether metrics regression residuals are normally distributed. - Relevant for decision to run t-test or not. + """A class for checking whether metrics regression residuals are normally + distributed. Relevant for decision to run t-test or not. Attributes: ----------- @@ -167,9 +164,7 @@ class NormalityChecks: """ def __post_init__(self): - """ - Calculates standardized residuals for each relevant target and variant. 
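The NormalityChecks flow edited above (standardized residuals per variant, then a Shapiro-Wilk test per variant) follows a standard recipe. A self-contained sketch under the assumption that the residuals are per-variant deviations from the variant mean, standardized by their own standard deviation (column names and the helper are placeholders, not the class's actual implementation):

import numpy as np
import pandas as pd
from scipy import stats

def shapiro_wilk_by_variant(df: pd.DataFrame, target: str, variant_col: str = "T",
                            alpha: float = 0.05) -> pd.DataFrame:
    # Standardize each variant's outcomes and test the residuals for normality.
    rows = []
    for variant, group in df.groupby(variant_col):
        residuals = group[target] - group[target].mean()
        standardized = residuals / residuals.std(ddof=1)
        stat, p_value = stats.shapiro(standardized)
        rows.append({"variant": variant, "statistic": stat, "p-value": p_value,
                     "normality rejected": p_value < alpha})
    return pd.DataFrame(rows)

# Illustrative usage on synthetic data.
rng = np.random.default_rng(1)
df = pd.DataFrame({"T": rng.integers(0, 2, 500), "revenue": rng.lognormal(1.0, 0.5, 500)})
print(shapiro_wilk_by_variant(df, "revenue"))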
- """ + """Calculates standardized residuals for each relevant target and variant.""" self.relevant_targets = [ target for target in self.ed.targets @@ -189,7 +184,7 @@ def __post_init__(self): self.standardized_residuals[target][variant] = standardized_residuals def qqplot(self, target): - """Get a quantile-quantile plot for a given target metric""" + """Get a quantile-quantile plot for a given target metric.""" assert target in self.ed.targets and self.ed.metric_types[target] in [ "continuous", "discrete", @@ -198,11 +193,11 @@ def qqplot(self, target): return fig def all_qqplots(self): - """Get Q-Q plots for all relevant target metrics""" + """Get Q-Q plots for all relevant target metrics.""" return {target: self.qqplot(target) for target in self.relevant_targets} def shapiro_wilk_test(self, target, alpha=0.05): - """Perform Shapiro-Wilk test for normality on a given target metric""" + """Perform Shapiro-Wilk test for normality on a given target metric.""" results = {"variant": [], "statistic": [], "p-value": []} variant_names = variant_name_map(self.ed.n_variants) for variant in range(1, self.ed.n_variants): @@ -223,15 +218,15 @@ def shapiro_wilk_test(self, target, alpha=0.05): return results_df def all_shapiro_wilk_tests(self, alpha=0.05): - """Perform Shapiro-Wilk test for normality on all relevant target metrics""" + """Perform Shapiro-Wilk test for normality on all relevant target metrics.""" return { target: self.shapiro_wilk_test(target, alpha) for target in self.relevant_targets } def create_results(self, alpha=0.05): - """ - Creates an object containing the relevant targets, QQ plots, and Shapiro-Wilk test results. + """Creates an object containing the relevant targets, QQ plots, and Shapiro-Wilk + test results. Args: alpha (float): The significance level for the Shapiro-Wilk test. Defaults to 0.05. @@ -290,8 +285,7 @@ def __post_init__(self): } def dynamic_sample_size_descriptives(self, segment, most_rlvnt_segments_only=True): - """ - Computes the dynamic sample size descriptives for a given segment. + """Computes the dynamic sample size descriptives for a given segment. Args: segment (str): The name of the segment to compute the descriptives for. @@ -318,8 +312,8 @@ def dynamic_sample_size_descriptives(self, segment, most_rlvnt_segments_only=Tru return df_dyn_avg def chi_squared_test_table(self, alpha=0.05): - """ - chi-squared test for independence between the variant and each segment in the experiment. + """Chi-squared test for independence between the variant and each segment in the + experiment. Args: alpha (float): The significance level for the test. Default is 0.05. @@ -367,9 +361,8 @@ def _chi_squared_table(self, dimension: str): return chi_squared_table def chi_squared_heatmaps(self): - """ - Returns a dictionary of plotly figures, one for each segment, illustrating the chi-squared statistic - """ + """Returns a dictionary of plotly figures, one for each segment, illustrating + the chi-squared statistic.""" chi_squared_heatmaps = {} for j, s in enumerate(self.segments): chi_squared_selection = self._chi_squared_table(s).loc[ @@ -428,8 +421,7 @@ def sequential_tests( sds: List[float], alpha: float = 0.05, ) -> pd.DataFrame: - """ - Runs sequential statistical tests on the data. + """Runs sequential statistical tests on the data. Args: metrics (List[str]): A list of metric names to test. 
@@ -565,8 +557,8 @@ def sequential_test_results( sds: dict = None, alpha: float = 0.05, ): - """ - Computes sequential testing results for a list of metrics, using the effect size means and standard deviations provided, or default values if not specified. + """Computes sequential testing results for a list of metrics, using the effect + size means and standard deviations provided, or default values if not specified. Args: metrics (List): A list of metric names to compute sequential testing results for. @@ -627,10 +619,10 @@ def conf_interval_before_recursion(self, delta, n, tau, sigma, alpha): return delta - np.sqrt(shift), delta + np.sqrt(shift) def fig_sequential_test(self): - """ - Generates a plotly figure with three columns for each target metric, showing the average value over time, - the treatment effect compared to the control group, and the p-value of the sequential test for each variant. - The figure has one row per target metric, and each row shows the data for all variants. + """Generates a plotly figure with three columns for each target metric, showing + the average value over time, the treatment effect compared to the control group, + and the p-value of the sequential test for each variant. The figure has one row + per target metric, and each row shows the data for all variants. Returns: fig (plotly.graph_objs.Figure): the plotly figure object. diff --git a/tw_experimentation/data_generation.py b/tw_experimentation/data_generation.py index 9b8f108..973e179 100644 --- a/tw_experimentation/data_generation.py +++ b/tw_experimentation/data_generation.py @@ -216,7 +216,7 @@ def generate_data_abn_test( class SimpleClickThroughRate(DGP): - """To test ratio metrics""" + """To test ratio metrics.""" def generate_data( self, @@ -282,7 +282,7 @@ def generate_data( *ctrl_dist_args, distribution=ctrl_distribution, n_models=n_models, - **ctrl_dist_kwargs + **ctrl_dist_kwargs, ) # mu = .6 # beta = 1 @@ -292,7 +292,7 @@ def generate_data( *treat_dist_args, distribution=treat_distribution, n_models=n_models, - **treat_dist_kwargs + **treat_dist_kwargs, ) samples = {} @@ -301,7 +301,7 @@ def generate_data( samples[str(v)] = sampler( **final_dist_params_ctrl, **{mix_param: v}, - size=(n_per_model, v.shape[0]) + size=(n_per_model, v.shape[0]), ) df_control = pd.DataFrame( @@ -376,16 +376,13 @@ def generate_data( class GenerativeFromRealData(DGP): - """Generative DGP from real data using residuals - Requires fitted autocausality model / predictions as input - """ + """Generative DGP from real data using residuals Requires fitted autocausality model + / predictions as input.""" pass class SimpleExperimentOneConfounder(DGP): - """ - Synthetic experiment with one observed confounder ('web' or 'mobile' user) - """ + """Synthetic experiment with one observed confounder ('web' or 'mobile' user)""" pass diff --git a/tw_experimentation/plotting/monitoring_plots.py b/tw_experimentation/plotting/monitoring_plots.py index d1cbd60..27bbcc5 100644 --- a/tw_experimentation/plotting/monitoring_plots.py +++ b/tw_experimentation/plotting/monitoring_plots.py @@ -26,8 +26,7 @@ def plotly_reduce_n_points_per_trace( fig: go.Figure, max_n_points: int = 5000, min_n_points_per_trace: int = 200 ): - """ - Reduces the number of data points in a Plotly figure to improve memory usage. + """Reduces the number of data points in a Plotly figure to improve memory usage. Args: fig (plotly.graph_objs.Figure): The Plotly figure to be modified. 
@@ -63,8 +62,8 @@ def plotly_reduce_n_points_per_trace( def plotly_light_memory(max_n_points=5000, min_n_points_per_trace=200): - """ - Decorator that reduces the number of points in a Plotly figure to optimize memory usage. + """Decorator that reduces the number of points in a Plotly figure to optimize memory + usage. Args: max_n_points (int, optional): The maximum number of points allowed in the figure. Defaults to 5000. @@ -94,11 +93,11 @@ def fig_variant_segment_dependence( ed: ExperimentDataset, text_auto=False, ): - """Heatmap for chi-squared test of dependence between variant and segment - Normalises the color by setting the midpoint of the colorscale to - the 95% quantile of the chi-squared distribution divided by degrees of freedom. - Hence, a cell achieves above a heat at the colorscale midpoint if all - cells had the same value, then the test has a p-value of .05. + """Heatmap for chi-squared test of dependence between variant and segment Normalises + the color by setting the midpoint of the colorscale to the 95% quantile of the chi- + squared distribution divided by degrees of freedom. Hence, a cell achieves above a + heat at the colorscale midpoint if all cells had the same value, then the test has a + p-value of .05. Args: chi_squared_table (pd.DataFrame): table of N(0,1) distributed statistics @@ -166,8 +165,7 @@ def plot_sample_size_pie(ed: ExperimentDataset): @plotly_light_memory(max_n_points=MAX_N_POINTS) def plot_dynamic_sample_size(df_dynamic_sample: pd.DataFrame, ed: ExperimentDataset): - """ - Plots the sample size per variant over time. + """Plots the sample size per variant over time. Args: df_dynamic_sample (pd.DataFrame): A DataFrame containing the sample size data. @@ -231,9 +229,7 @@ def target_metric_distribution( @plotly_light_memory(max_n_points=MAX_N_POINTS) def plot_target_metric_cdf(ed: ExperimentDataset, target: str, use_log: bool = False): - """ - Plots the cumulative density function (CDF) for a given target variable. - """ + """Plots the cumulative density function (CDF) for a given target variable.""" fig = px.ecdf( ed.data, x=target, @@ -256,8 +252,7 @@ def plot_target_metric_cdf(ed: ExperimentDataset, target: str, use_log: bool = F def plot_qq_variants( qq_variants: Dict[str, np.ndarray], ed: ExperimentDataset, target: str ) -> go.Figure: - """ - Plots a Q-Q plot for a given target between different variants. + """Plots a Q-Q plot for a given target between different variants. Args: qq_variants (Dict[str, np.ndarray]): A dictionary containing the quantiles for each variant. @@ -441,8 +436,8 @@ def plot_qq_normal( target: str, standardized_residuals: Dict[str, Dict[int, np.ndarray]], ) -> go.Figure: - """ - Plots a quantile-quantile plot for a given target variable and its standardized residuals for each variant in the experiment. + """Plots a quantile-quantile plot for a given target variable and its standardized + residuals for each variant in the experiment. Args: ed (ExperimentDataset): The experiment dataset containing the data for each variant. diff --git a/tw_experimentation/result_generator.py b/tw_experimentation/result_generator.py index d445c75..9929738 100644 --- a/tw_experimentation/result_generator.py +++ b/tw_experimentation/result_generator.py @@ -29,7 +29,7 @@ def generate_results( segments: Optional[List[str]] = None, remove_outliers=True, ): - """Compute all results for an experiment + """Compute all results for an experiment. 
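The chi-squared checks reworded earlier in this hunk (chi_squared_test_table, chi_squared_heatmaps, and the colorscale normalisation in fig_variant_segment_dependence) can be reproduced directly with SciPy: the heatmap midpoint is the 95% quantile of the chi-squared distribution divided by its degrees of freedom. A small sketch; the contingency-table construction and column names are assumptions, not the library's actual signatures:

import numpy as np
import pandas as pd
from scipy import stats

def chi_squared_midpoint_and_test(df: pd.DataFrame, variant_col: str, segment_col: str):
    # Chi-squared test of independence between variant assignment and a segment.
    contingency = pd.crosstab(df[variant_col], df[segment_col])
    chi2, p_value, dof, _ = stats.chi2_contingency(contingency)
    # Colorscale midpoint: 95% quantile of chi-squared per degree of freedom.
    midpoint = stats.chi2.ppf(0.95, dof) / dof
    return chi2, p_value, midpoint

rng = np.random.default_rng(2)
df = pd.DataFrame({"T": rng.integers(0, 2, 1000),
                   "country": rng.choice(["GB", "DE", "US"], 1000)})
print(chi_squared_midpoint_and_test(df, "T", "country"))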
Args: df (pd.DataFrame): dataframe with experiment data @@ -98,7 +98,7 @@ def generate_results( def save_output(path: str, name: str, output: dict): - """Save output to pickle file + """Save output to pickle file. Args: path (str): path name diff --git a/tw_experimentation/segmentation_frequentist.py b/tw_experimentation/segmentation_frequentist.py index f824774..c3a2804 100644 --- a/tw_experimentation/segmentation_frequentist.py +++ b/tw_experimentation/segmentation_frequentist.py @@ -21,8 +21,7 @@ @dataclass class SegmentationOutput: - """ - Represents the output of a segmentation analysis. + """Represents the output of a segmentation analysis. Attributes: segments (List[str]): List of segment names. @@ -44,8 +43,7 @@ class SegmentationOutput: def segment_output( self, category: str, multitest_correction="bonferroni", alpha=0.05 ) -> pd.DataFrame: - """ - Returns the results table for a specific segment. + """Returns the results table for a specific segment. Args: segment (str): The name of the segment. @@ -169,7 +167,8 @@ def wise_pizza_frequentist( max_depth: int = 3, min_segments: int = 2, ): - """Fit method for wise pizza to get wise pizza results and frequentist analysis of segments + """Fit method for wise pizza to get wise pizza results and frequentist analysis + of segments. Args: treatment (int, optional): name of variant at consideration. Defaults to 1. @@ -315,7 +314,8 @@ def wise_pizza_frequentist( min_segments: int = 3, auto_display_df=True, ): - """Fit method for wise pizza to get wise pizza results and frequentist analysis of segments + """Fit method for wise pizza to get wise pizza results and frequentist analysis + of segments. Args: treatment (int, optional): name of variant at consideration. Defaults to 1. diff --git a/tw_experimentation/setuper.py b/tw_experimentation/setuper.py index 0dfa929..c1d002d 100644 --- a/tw_experimentation/setuper.py +++ b/tw_experimentation/setuper.py @@ -21,9 +21,8 @@ @dataclass class ExpDesignAutoCalculate: - """ - A class for automatically calculating standard deviation and mean values - for pre-experiment columns in an ExperimentDataset. + """A class for automatically calculating standard deviation and mean values for pre- + experiment columns in an ExperimentDataset. Attributes: ed (ExperimentDataset): The ExperimentDataset object containing @@ -77,7 +76,8 @@ def mean(self, target_pre_exp): class Setuper: - """Tool for designing AB tests + """Tool for designing AB tests. + Result includes: - Minimal Detectable Effect Size - Sample Size calculation @@ -121,7 +121,7 @@ def from_uplift( treatment_share=0.5, relation="absolute", ): - """Design test size from uplift instead of standardized effect size + """Design test size from uplift instead of standardized effect size. Args: uplift (float): uplift to detect @@ -143,7 +143,7 @@ def from_uplift( ) def sample_size_t_test(self): - """t test sample size calculation for continuous outcomes + """T test sample size calculation for continuous outcomes. Returns: dict: sample sizes per group @@ -164,7 +164,7 @@ def sample_size_t_test(self): } def sample_size_two_sample_proportion_z_test(self): - """two sample proportion z test sample size calculation + """Two sample proportion z test sample size calculation. 
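The Setuper sample-size helpers documented above (t-test and two-sample proportion z-test) map onto statsmodels' power solvers. A hedged sketch of the equivalent calculations; the parameter values are illustrative and the Setuper internals may differ:

from statsmodels.stats.power import TTestIndPower, NormalIndPower
from statsmodels.stats.proportion import proportion_effectsize

alpha, power = 0.05, 0.8

# Continuous outcome: per-group sample size for a standardized effect size of 0.1.
n_t_test = TTestIndPower().solve_power(effect_size=0.1, alpha=alpha, power=power, ratio=1.0)

# Binary outcome: effect size from baseline vs. uplifted conversion, then a z-test solver.
es = proportion_effectsize(0.50, 0.52)
n_z_test = NormalIndPower().solve_power(effect_size=es, alpha=alpha, power=power, ratio=1.0)

print(round(n_t_test), round(n_z_test))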
Returns: dict: sample sizes per group @@ -185,17 +185,15 @@ def sample_size_two_sample_proportion_z_test(self): } def sample_size_chi_squared_test(self): - """ - Sample Size calculation for chi squared test - statsmodels.stats.proportion.proportions_chisquare - """ + """Sample Size calculation for chi squared test + statsmodels.stats.proportion.proportions_chisquare.""" pass def sample_size_proportion_test( self, uplift, baseline_conversion=0.5, relation="absolute" ): - """Sample sie for proportion test, requires uplift and - automatically calculates standard deviation + """Sample sie for proportion test, requires uplift and automatically calculates + standard deviation. Args: uplift (float): uplift to detect @@ -223,11 +221,11 @@ def sample_size_proportion_test( ) def effect_size_proportion_test(self): - """effect size for statsmodels.stats.proportion.test_proportions_2indep""" + """Effect size for statsmodels.stats.proportion.test_proportions_2indep.""" pass def effect_size_t_test(self, nobs): - """effect size for t test given power and number of observations + """Effect size for t test given power and number of observations. Args: nobs (int): number of observations @@ -244,7 +242,7 @@ def effect_size_t_test(self, nobs): return esresult def effect_size_two_sample_z_test(self, nobs): - """effect size of two sample independent proportion z test + """Effect size of two sample independent proportion z test. Args: nobs (int): number of observations @@ -269,7 +267,7 @@ def effect_size_chi_squared_test(self): pass def power_t_test(self, nobs): - """power of t test for continuous outcomes + """Power of t test for continuous outcomes. Args: nobs (int): number of observations @@ -289,7 +287,7 @@ def power_t_test(self, nobs): return pwresult def power_two_sample_z_test(self, nobs): - """power two sample independent proportion z test + """Power two sample independent proportion z test. Args: nobs (int): number of observations @@ -311,7 +309,7 @@ def power_two_sample_z_test(self, nobs): def power_proportion_test( self, uplift, nobs, baseline_conversion=0.5, relation="absolute" ): - """power proportion test (as in evan miller) + """Power proportion test (as in evan miller) Args: uplift (float): uplift to detect @@ -344,7 +342,7 @@ def plot_sample_size_to_mde( effect_size_function, max_sample_size=1000, ): - """plot sample size to MDE + """Plot sample size to MDE. Args: effect_size_function (function): function that maps sample size to MDE @@ -365,7 +363,7 @@ def plot_sample_size_to_mde( def uplift_to_effect_size(uplift, mean, sd, relation="absolute"): - """Convert uplift to effect size + """Convert uplift to effect size. Args: uplift (float): expected uplift @@ -386,7 +384,7 @@ def uplift_to_effect_size(uplift, mean, sd, relation="absolute"): def effect_size_to_uplift(es, mean, sd, relation="absolute"): - """Convert effect size to uplift + """Convert effect size to uplift. Args: es (float): expected effect size diff --git a/tw_experimentation/statistical_tests.py b/tw_experimentation/statistical_tests.py index ec5b552..e4ac686 100644 --- a/tw_experimentation/statistical_tests.py +++ b/tw_experimentation/statistical_tests.py @@ -295,8 +295,7 @@ def compute_stats_per_target( type_i_error: float = 0.05, multitest_correction: Optional[str] = None, ): - """ - Compute statistical tests and confidence intervals for each target variable. + """Compute statistical tests and confidence intervals for each target variable. Args: direction (str, optional): The direction of the test. Defaults to "two-sided". 
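The uplift_to_effect_size / effect_size_to_uplift pair touched earlier in this hunk is presumably a Cohen's-d style rescaling of the uplift by the outcome's standard deviation, with relative uplifts first converted to absolute differences via the baseline mean. A sketch under that assumption (not the module's actual implementation):

def uplift_to_effect_size(uplift: float, mean: float, sd: float, relation: str = "absolute") -> float:
    # Relative uplift is converted to an absolute difference via the baseline mean,
    # then divided by the standard deviation (Cohen's d).
    if relation == "relative":
        uplift = uplift * mean
    return uplift / sd

def effect_size_to_uplift(es: float, mean: float, sd: float, relation: str = "absolute") -> float:
    # Inverse of the conversion above.
    uplift = es * sd
    return uplift / mean if relation == "relative" else uplift

print(uplift_to_effect_size(0.05, mean=5.0, sd=2.0, relation="relative"))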
@@ -530,8 +529,7 @@ def correct_p_values(self, p_values: Dict[str, float]): return p_values def correct_alphas(self, p_values: Dict[str, float] = None): - """ - Corrects the alpha values for multiple hypothesis testing. + """Corrects the alpha values for multiple hypothesis testing. Args: p_values (list, optional): List of p-values for each hypothesis. Defaults to None. @@ -684,11 +682,8 @@ def proportion_test(sample_1, sample_2, alpha, alternative="two-sided"): class FrequentistTest(BaseTest): - """ - Frequentist static testing - Should include - test stat, confidence interval, p values, power, - """ + """Frequentist static testing Should include test stat, confidence interval, p + values, power,""" def __init__( self, @@ -885,9 +880,8 @@ def get_results_table(self): def cuped(ed: ExperimentDataset, has_correction: bool, alpha: float): - """ - Applies the CUPED method to estimate treatment effects in experiment. - Serves as a wrapper to variance reduction methods + """Applies the CUPED method to estimate treatment effects in experiment. Serves as a + wrapper to variance reduction methods. Args: ExperimentDataset (ExperimentDataset): An ExperimentDataset object containing the data to be analyzed. @@ -978,9 +972,8 @@ def cuped(ed: ExperimentDataset, has_correction: bool, alpha: float): def run_cuped(ed: ExperimentDataset): - """ - Applies the CUPED method to estimate treatment effects in experiment. - Serves as a wrapper to variance reduction methods + """Applies the CUPED method to estimate treatment effects in experiment. Serves as a + wrapper to variance reduction methods. Args: ExperimentDataset (ExperimentDataset): An ExperimentDataset object containing the data to be analyzed. diff --git a/tw_experimentation/streamlit/pages_wrap/page2_Experiment_Design.py b/tw_experimentation/streamlit/pages_wrap/page2_Experiment_Design.py index 66602a9..aa1436e 100644 --- a/tw_experimentation/streamlit/pages_wrap/page2_Experiment_Design.py +++ b/tw_experimentation/streamlit/pages_wrap/page2_Experiment_Design.py @@ -87,7 +87,7 @@ def page_2_experiment_design(): step=0.01, key="exp_design_alpha", format="%.2f", - ) + ) with col12: st.number_input( @@ -97,7 +97,7 @@ def page_2_experiment_design(): step=0.01, key="exp_design_beta", format="%.2f", - ) + ) st.number_input( "Treatment share (e.g., 50.0 = 50%)", diff --git a/tw_experimentation/streamlit/pages_wrap/page4_Experiment_Evaluation.py b/tw_experimentation/streamlit/pages_wrap/page4_Experiment_Evaluation.py index 451a833..b138364 100644 --- a/tw_experimentation/streamlit/pages_wrap/page4_Experiment_Evaluation.py +++ b/tw_experimentation/streamlit/pages_wrap/page4_Experiment_Evaluation.py @@ -68,8 +68,8 @@ def page_4_experiment_evaluation(): st.session_state["evaluation_alpha"] = 5.0 alpha = st.session_state.evaluation_alpha st.markdown( - """ - In case you have multiple treatment variants, you will have the option to perform a multi-test correction on the p-values + """ + In case you have multiple treatment variants, you will have the option to perform a multi-test correction on the p-values (current implementation: Bonferroni method. This is subject to change in the future). """ ) @@ -125,7 +125,7 @@ def page_4_experiment_evaluation(): on_change=swap_checkbox_state, args=("evaluate_CUPED",), help=""" - CUPED is a variance reduction method leveraging pre-experiment data in order to increase the sensitivity of an A/B test. + CUPED is a variance reduction method leveraging pre-experiment data in order to increase the sensitivity of an A/B test. 
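The cuped() wrapper documented above applies the usual control-variate adjustment with pre-experiment data. A minimal, self-contained sketch of that adjustment followed by a plain t-test; column names are placeholders and this is not the wrapper's actual implementation:

import numpy as np
import pandas as pd
from scipy import stats

def cuped_adjust(df: pd.DataFrame, target: str, pre_exp: str) -> pd.Series:
    # theta is the regression coefficient of the target on its pre-experiment counterpart.
    theta = np.cov(df[target], df[pre_exp])[0, 1] / np.var(df[pre_exp], ddof=1)
    return df[target] - theta * (df[pre_exp] - df[pre_exp].mean())

# Illustrative usage: adjust the metric, then run the standard two-sample t-test on it.
rng = np.random.default_rng(3)
pre = rng.normal(5, 2, 2000)
df = pd.DataFrame({"T": rng.integers(0, 2, 2000), "pre_exp_revenue": pre})
df["revenue"] = pre + rng.normal(0, 1, 2000) + 0.2 * df["T"]
df["revenue_cuped"] = cuped_adjust(df, "revenue", "pre_exp_revenue")
print(stats.ttest_ind(df.loc[df["T"] == 1, "revenue_cuped"],
                      df.loc[df["T"] == 0, "revenue_cuped"]))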
The basic idea is to use pre-experiment data as a control variate in the test; the pre-experiment data is used to transform the target variable so that its variability is lowered after which we apply the standard/vanilla T-test to the transformed target. """, ) @@ -171,27 +171,27 @@ def page_4_experiment_evaluation(): st.divider() st.subheader("Segmentation with Wise Pizza") st.markdown( - """ + """ - In this section you can find unusual segments in terms of the difference between the control and test groups - Please provide segments which you want to analyse, metric to analyse and number of observations """ ) st.markdown( - """ + """ Find segments whose average is most different from the global one - `segments`: List of discrete dimensions to find slices - `target`: Metric to analyse - - `treatment`: If you have different test groups, specify group here, for example treatment=1 means + - `treatment`: If you have different test groups, specify group here, for example treatment=1 means we compare with first treatment group - `min_segments`: Minimum number of segments to find - `max_depth`: Maximum number of dimension to constrain in segment definition - - - *Warning*: The p-values are currently not corrected for multiple comparisons. - However, Wise-Pizza identifies segments as interesting only if the treatment effect is sufficiently high - compared to the segment sample size so this selection is a first approximation of avoiding + + + *Warning*: The p-values are currently not corrected for multiple comparisons. + However, Wise-Pizza identifies segments as interesting only if the treatment effect is sufficiently high + compared to the segment sample size so this selection is a first approximation of avoiding p-value inflation in segmentation analysis. """ ) @@ -235,7 +235,7 @@ def page_4_experiment_evaluation(): st.write("Expand figure below to fullscreen for best view.") st.markdown( - """ + """ #### How can I interpret the results? - We are trying to find unusual segments in terms of the averages (***to highlight the segments contributing the most to the difference between test and control***) @@ -262,7 +262,7 @@ def page_4_experiment_evaluation(): st.write("No segments available for frequentist segmentation analysis") else: st.markdown( - """ + """ #### How can I interpret the results? - For each variant we compare it with the control group @@ -293,8 +293,10 @@ def page_4_experiment_evaluation(): st.dataframe(s) elif not st.session_state["is_experiment"]: st.write( - "You have only provided pre-experiment data. " - 'Please define the experiment in "Experiment Design" first to use Experiment Evaluation.' + """You have only provided pre-experiment data. + Please define the experiment in "Experiment Design" + first to use Experiment Evaluation. 
+ """ ) else: diff --git a/tw_experimentation/streamlit/pages_wrap/page5_Experiment_Evaluation_Bayesian.py b/tw_experimentation/streamlit/pages_wrap/page5_Experiment_Evaluation_Bayesian.py index d36014c..b8493d6 100644 --- a/tw_experimentation/streamlit/pages_wrap/page5_Experiment_Evaluation_Bayesian.py +++ b/tw_experimentation/streamlit/pages_wrap/page5_Experiment_Evaluation_Bayesian.py @@ -5,8 +5,6 @@ ) import plotly.graph_objects as go -import numpy as np - def page_5_experiment_evaluation_bayesian(): st.session_state.update(st.session_state) @@ -22,9 +20,10 @@ def page_5_experiment_evaluation_bayesian(): else: emd = st.session_state.ed.experiment_meta_data() st.write( - "In this section, we fit a Bayesian model on the data.We then use the posterior" - " distribution to compute the probability that the variant is better than the" - " control." + "In this section, we fit a Bayesian model on the data." + "We then use the posterior" + " distribution to compute the probability that the variant " + "is better than the control." ) br = bayes_cache_wrapper( @@ -45,16 +44,24 @@ def page_5_experiment_evaluation_bayesian(): # TODO: Enable bayes factor when tested properly # st.subheader("Bayes Factor") - description = ( - "The Bayes Factor is the p-value analogue in Bayesian hypothesis testing. " - "It allows us to compare the hypothesis of no effect (null hypothesis) to the" - " hypothesis of that there is an effect (alternative hypothesis)." - "The decision is made based on the Risk, which is the probability that the" - "the null hypothesis is true given the data (probability of a false discovery)." - ) + description = """ + The Bayes Factor is the p-value analogue in Bayesian hypothesis testing. + It allows us to compare the hypothesis of no effect (null hypothesis) + to the hypothesis of that there is an effect (alternative hypothesis). 
+ The decision is made based on the Risk, which is the probability that + the null hypothesis is true given the data + (probability of a false discovery).""" # TODO: Enable bayes factor when tested properly # st.write(description) - + fdrs = [ + br.false_discovery_rate(st.session_state["bayes_target_plot"], var) + for var in range(1, st.session_state.ed.n_variants) + ] + fdrs = [f"{fdr:.2f}" for fdr in fdrs] + bayes_factors = [ + f'{br.bayes_factor(st.session_state["bayes_target_plot"], var):.2f}' + for var in range(1, st.session_state.ed.n_variants) + ] fig = go.Figure( data=[ go.Table( @@ -68,14 +75,8 @@ def page_5_experiment_evaluation_bayesian(): ), cells=dict( values=[ - [ - f'{br.bayes_factor(st.session_state["bayes_target_plot"], var):.2f}' - for var in range(1, st.session_state.ed.n_variants) - ], - [ - f'{br.false_discovery_rate(st.session_state["bayes_target_plot"], var)*100:.2f}' - for var in range(1, st.session_state.ed.n_variants) - ], + bayes_factors, + fdrs, [ br.bayes_factor_decision( st.session_state["bayes_target_plot"], var @@ -97,7 +98,8 @@ def page_5_experiment_evaluation_bayesian(): row_order = n_variants <= n_cols_max + 1 description = ( - "The probability that the average treatment effect is **greater than 0** for" + "The probability that the average treatment effect is" + "**greater than 0** for" f" outcome metric **{st.session_state['bayes_target_plot']}** is\n" ) st.write(description) @@ -108,22 +110,31 @@ def page_5_experiment_evaluation_bayesian(): for variant in range(1, n_variants): if row_order: with cols[variant - 1]: + prob_greater_zero = ( + br.prob_greater_than_zero( + st.session_state["bayes_target_plot"] + )[variant] + * 100 + ) st.metric( f"{emd.variant_names[variant]}", - value=( - f"{br.prob_greater_than_zero(st.session_state['bayes_target_plot'])[variant]*100:.2f} %" - ), + value=(f"{prob_greater_zero:.2f} %"), ) else: + prob_greater_zero = ( + br.prob_greater_than_zero(st.session_state["bayes_target_plot"])[ + variant + ] + * 100 + ) st.metric( f"{emd.variant_names[variant]}", - value=( - f"{br.prob_greater_than_zero(st.session_state['bayes_target_plot'])[variant]*100:.2f} %" - ), + value=(f"{prob_greater_zero:.2f} %"), ) description = ( - "The probability that the average treatment effect is **smaller than 0** for" + "The probability that the average treatment effect" + "is **smaller than 0** for" f" outcome metric **{st.session_state['bayes_target_plot']}** is\n" ) st.write(description) @@ -134,18 +145,26 @@ def page_5_experiment_evaluation_bayesian(): for variant in range(1, n_variants): if row_order: with cols[variant - 1]: + prob_smaller_zero = ( + 1 + - br.prob_greater_than_zero( + st.session_state["bayes_target_plot"] + )[variant] + ) * 100 st.metric( f"{emd.variant_names[variant]}", - value=( - f"{(1-br.prob_greater_than_zero(st.session_state['bayes_target_plot'])[variant])*100:.2f} %" - ), + value=(f"{prob_smaller_zero:.2f} %"), ) else: + prob_smaller_zero = ( + 1 + - br.prob_greater_than_zero(st.session_state["bayes_target_plot"])[ + variant + ] + ) * 100 st.metric( f"{emd.variant_names[variant]}", - value=( - f"{(1-br.prob_greater_than_zero(st.session_state['bayes_target_plot'])[variant])*100:.2f} %" - ), + value=(f"{prob_smaller_zero:.2f} %"), ) st.subheader( @@ -154,7 +173,8 @@ def page_5_experiment_evaluation_bayesian(): st.write( """ - The probability that the average treatment effect is greater or smaller than a custom `threshold`: + The probability that the average treatment effect is greater + or smaller than a custom 
`threshold`: """ ) @@ -176,18 +196,28 @@ def page_5_experiment_evaluation_bayesian(): for variant in range(1, n_variants): if row_order: with cols2[variant - 1]: + prob_greater_z = ( + br.prob_greater_than_z( + st.session_state["bayes_threshold"], + st.session_state["bayes_target_plot"], + )[variant] + * 100 + ) st.metric( f"{emd.variant_names[variant]}", - value=( - f"{br.prob_greater_than_z(st.session_state['bayes_threshold'], st.session_state['bayes_target_plot'])[variant]*100:.2f} %" - ), + value=(f"{prob_greater_z:.2f} %"), ) else: + prob_greater_z = ( + br.prob_greater_than_z( + st.session_state["bayes_threshold"], + st.session_state["bayes_target_plot"], + )[variant] + * 100 + ) st.metric( f"{emd.variant_names[variant]}", - value=( - f"{br.prob_greater_than_z(st.session_state['bayes_threshold'], st.session_state['bayes_target_plot'])[variant]*100:.2f} %" - ), + value=(f"{prob_greater_z:.2f} %"), ) description = ( @@ -203,37 +233,58 @@ def page_5_experiment_evaluation_bayesian(): for variant in range(1, n_variants): if row_order: with cols2[variant - 1]: + prob_smaller_z = ( + br.prob_smaller_than_z( + st.session_state["bayes_threshold"], + st.session_state["bayes_target_plot"], + )[variant] + * 100 + ) st.metric( f"{emd.variant_names[variant]}", - value=( - f"{br.prob_smaller_than_z(st.session_state['bayes_threshold'], st.session_state['bayes_target_plot'])[variant]*100:.2f} %" - ), + value=(f"{prob_smaller_z:.2f} %"), ) else: + prob_smaller_z = ( + br.prob_smaller_than_z( + st.session_state["bayes_threshold"], + st.session_state["bayes_target_plot"], + )[variant] + * 100 + ) st.metric( f"{emd.variant_names[variant]}", - value=( - f"{br.prob_smaller_than_z(st.session_state['bayes_threshold'], st.session_state['bayes_target_plot'])[variant]*100:.2f} %" - ), + value=(f"{prob_smaller_z:.2f} %"), ) st.subheader( - "Probability that treatment effect is in ROPE (Region of Practical Equivalence)" + "Probability that treatment effect is in ROPE" + "(Region of Practical Equivalence)" ) st.markdown( """ - - As an analogue to frequentist testing, you can define a region of practical equivalence (ROPE). This is an interval where we assume that when the effect is within the interval, the effect is negligible (e.g. not big enough to make an impact, cover cost of rolling out the change). - - Another reason for defining a ROPE interval is that when we simply look at the probability that the effect is greater than 0, that probability will be 50% even without using any data if we assume that the effect is centered around 0. - - The ROPE interval sizes are autodetected based on the variance of the outcome metric. + As an analogue to frequentist testing, + you can define a region of practical equivalence (ROPE). + This is an interval where we assume that when the effect is + within the interval, + the effect is negligible + (e.g. not big enough to make an impact, + cover cost of rolling out the change). + + Another reason for defining a ROPE interval is that when we simply + look at the + probability that the effect is greater than 0, that probability + will be 50% even without using any data if we assume that the effect + is centered around 0. + The ROPE interval sizes are autodetected based on the variance + of the outcome metric. 
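The auto-detected ROPE mentioned here is, per _rope_interval_autodetect_intervals earlier in the diff, derived from the target metric's standard deviation with a default scale of 0.1. A sketch of that rule plus the resulting probability, assuming plain arrays of outcome data and posterior draws (names are illustrative):

import numpy as np

def rope_autodetect(outcome: np.ndarray, scale_param: float = 0.1):
    # ROPE bounds proportional to the outcome's standard deviation.
    half_width = scale_param * outcome.std(ddof=1)
    return -half_width, half_width

def prob_in_rope(ate_draws: np.ndarray, rope_lower: float, rope_upper: float) -> float:
    return float(np.mean((ate_draws >= rope_lower) & (ate_draws <= rope_upper)))

rng = np.random.default_rng(4)
outcome = rng.lognormal(1.0, 0.5, 5000)
ate_draws = rng.normal(0.05, 0.1, 10_000)
lower, upper = rope_autodetect(outcome)
print(lower, upper, prob_in_rope(ate_draws, lower, upper))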
""" ) probs, rope_lower, rope_upper = br.rope(st.session_state["bayes_target_plot"]) description = ( - "The probability that the average treatment effect is outside the **region of" + "The probability that the average treatment effect is" + "outside the **region of" f" practical equivalence ({rope_lower:.2f}, {rope_upper:.2f})** for outcome" f" metric **{st.session_state['bayes_target_plot']}** is\n" ) @@ -270,7 +321,8 @@ def page_5_experiment_evaluation_bayesian(): ) description = ( - "The probability that the average treatment effect is **outside the interval" + "The probability that the average treatment effect is" + "**outside the interval" f" ({st.session_state['bayes_threshold_lower']:.2f}," f" {st.session_state['bayes_threshold_upper']:.2f})** for outcome metric" f" **{st.session_state['bayes_target_plot']}** is\n" @@ -301,7 +353,8 @@ def page_5_experiment_evaluation_bayesian(): elif not st.session_state["is_experiment"]: st.write( "You have only provided pre-experiment data. " - 'Please define the experiment in "Experiment Design" first to use Experiment Evaluation Bayesian.' + 'Please define the experiment in "Experiment Design" first to ' + "use Experiment Evaluation Bayesian." ) else: diff --git a/tw_experimentation/streamlit/streamlit_utils.py b/tw_experimentation/streamlit/streamlit_utils.py index e336c37..b0b42b2 100644 --- a/tw_experimentation/streamlit/streamlit_utils.py +++ b/tw_experimentation/streamlit/streamlit_utils.py @@ -1,18 +1,10 @@ -import sys, os, pathlib +# import sys, os, pathlib +# root_path = os.path.realpath("../..") +# sys.path.insert(0, root_path) -root_path = os.path.realpath("../..") -sys.path.insert(0, root_path) +# sys.path.append(str(pathlib.Path().absolute()).split("/tw_experimentation")[0]) -sys.path.append(str(pathlib.Path().absolute()).split("/tw_experimentation")[0]) -import streamlit as st - -st.write(sys.path) -import pandas as pd -from scipy.stats import chi2_contingency - - -### For PullAndMatchData from tw_experimentation.utils import ExperimentDataset from tw_experimentation.statistical_tests import ( FrequentistTest, @@ -31,7 +23,18 @@ fig_variant_segment_dependence, ) +from tw_experimentation.checker import ( + Monitoring, + SegmentMonitoring, + SequentialTest, + NormalityChecks, +) +from tw_experimentation.bayes.bayes_test import BayesTest +import streamlit as st + +import pandas as pd +from scipy.stats import chi2_contingency from snowflake.sqlalchemy import URL from sqlalchemy import create_engine import json @@ -52,16 +55,7 @@ RESULT_DATABASE, RESULT_SCHEMA, RESULT_TABLE, - ID_COLUMN, - TIMESTAMP_COLUMN, -) -from tw_experimentation.checker import ( - Monitoring, - SegmentMonitoring, - SequentialTest, - NormalityChecks, ) -from tw_experimentation.bayes.bayes_test import BayesTest def fetch_data_from_table_name(warehouse: str, schema: str, table: str): @@ -69,7 +63,7 @@ def fetch_data_from_table_name(warehouse: str, schema: str, table: str): def exp_config_to_json(): - """Converts the current session state to a json file""" + """Converts the current session state to a json file.""" config = { "exp_name": st.session_state.exp_name, "is_experiment": st.session_state.is_experiment, @@ -158,7 +152,7 @@ def reset_exp_variables(vars: Optional[List] = None): def cols_to_select(data_loader_cols: List, cols_to_exclude: List[Union[List, None]]): - """Helper function to select columns from data loader""" + """Helper function to select columns from data loader.""" cols_to_select = set(data_loader_cols) for cols in cols_to_exclude: if cols is not 
None: @@ -170,9 +164,9 @@ def cols_to_select(data_loader_cols: List, cols_to_exclude: List[Union[List, Non class PullAndMatchData: - """Class for - - pulling data from snowflake - - put data into ExperimentDataset instance + """Class for. + + - pulling data from snowflake - put data into ExperimentDataset instance """ def __init__( @@ -272,20 +266,25 @@ def define_data_model( outcomes: Optional[List[str]] = None, is_dynamic: Optional[bool] = True, ): - """Create data model / ExperimentDataset with all relevant specification - If an ExperimentDataset instance is supplied: Still updates based on other inputs such as - targets, variant etc. if those are supplied + """Create data model / ExperimentDataset with all relevant specification If an + ExperimentDataset instance is supplied: Still updates based on other inputs such + as targets, variant etc. if those are supplied. Args: data (Optional[Union[pd.DataFrame, ExperimentDataset]], optional): - dataframe or ready ExperimentDataset. - If None, relies on data pulled from Snowflake. Defaults to None. - variant (Optional[str], optional): Name of variant columnn. Defaults to None. - targets (Optional[List[str]], optional): List of primary metrics. Defaults to None. - event_timestamp (Optional[str], optional): Timestamp if available. Defaults to None. - outcomes (Optional[List[str]], optional): List of outcome metrics. - Outcomes that are not targets will be excluded from analysis. Defaults to None. - is_dynamic(Optional[bool], optional): Whether experiment is dynamic or static. Defaults to True + dataframe or ready ExperimentDataset. If None, relies on data pulled + from Snowflake. Defaults to None. + variant (Optional[str], optional): Name of + variant columnn. Defaults to None. + targets (Optional[List[str]], optional): + List of primary metrics. Defaults to None. + event_timestamp (Optional[str], + optional): Timestamp if available. Defaults to None. + outcomes(Optional[List[str]], optional): List of outcome metrics. + Outcomes that are not targets will be excluded from analysis. + Defaults to None. + is_dynamic(Optional[bool], optional): Whether experiment is dynamic or static. + Defaults to True """ if variant is not None: @@ -319,7 +318,8 @@ def define_data_model( def _datamodel_is_defined(self): if not isinstance(self._ed, ExperimentDataset): - 'An ExperimentDataset needs to be defined through the method "define_data_model()"' + "An ExperimentDataset needs to be defined through the method" + '"define_data_model()"' return isinstance(self._ed, ExperimentDataset) @property @@ -391,7 +391,7 @@ def generate_experiment_output(*args, **kwargs): def ingest_loaded_output(output, temp_only=True): - """Ingests output from a loaded json file""" + """Ingests output from a loaded json file.""" assert isinstance(output, dict) st.session_state["output_loaded"] = output if not temp_only: @@ -598,14 +598,12 @@ def frequentist_segmentation(ed, segments, alpha=0.05): def _coming_from_other_page(current_page, last_page): - """Check whether coming from other streamlit page + """Check whether coming from other streamlit page. 
- Args: - current_page (str): current page name - last_page (str): page name on last streamlit run + Args: current_page (str): current page name last_page (str): page name on + last streamlit run - Returns: - bool: arriving from other page or not + Returns: bool: arriving from other page or not """ return not (current_page == last_page) diff --git a/tw_experimentation/synthetic_data.py b/tw_experimentation/synthetic_data.py index cd487f2..9109843 100644 --- a/tw_experimentation/synthetic_data.py +++ b/tw_experimentation/synthetic_data.py @@ -19,24 +19,29 @@ def revenue_conversion_data( has_date_column: bool = True, seed: int = 1, ) -> pd.DataFrame: - """ - Generate synthetic revenue and conversion metrics data for experimentation. + """Generate synthetic revenue and conversion metrics data for experimentation. Args: n (int): Number of observations to generate (default: 1000). - treatment_share (float): Share of observations assigned to treatment (default: 0.5). + treatment_share (float): Share of observations assigned to treatment + (default: 0.5). baseline_conversion (float): Baseline conversion rate (default: 0.5). - treatment_effect_conversion (float): Treatment effect on conversion rate (default: 0.1). + treatment_effect_conversion (float): Treatment effect on conversion rate + (default: 0.1). baseline_mean_revenue (float): Baseline mean revenue (default: 5). sigma_revenue (float): Standard deviation of revenue (default: 2). treatment_effect_revenue (float): Treatment effect on revenue (default: 0.1). - is_dynamic_assignment (bool): Whether to assign treatment dynamically based on trigger dates (default: True). - has_date_column (bool): Whether to include a date column in the output dataframe (default: True). + is_dynamic_assignment (bool): Whether to assign treatment dynamically based + on trigger dates (default: True). + has_date_column (bool): Whether to include a date column in + the output dataframe (default: True). seed (int): Random seed for reproducibility (default: 1). Returns: - pd.DataFrame: Generated synthetic data with columns: 'T', 'conversion', 'revenue', 'pre_exp_revenue', 'num_actions'. - If 'has_date_column' is True, the dataframe will also include a 'trigger_dates' column. + pd.DataFrame: Generated synthetic data with columns: 'T', 'conversion', + 'revenue', 'pre_exp_revenue', 'num_actions'. + If 'has_date_column' is True, the dataframe will also + include a 'trigger_dates' column. """ treatment_expectation = baseline_conversion + treatment_effect_conversion @@ -94,21 +99,22 @@ def revenue_conversion_abn_test( treatment_effect_revenue: float = 0.05, seed: int = 1, ) -> pd.DataFrame: - """ - Generate synthetic data for revenue and conversion metrics in an ABN test. + """Generate synthetic data for revenue and conversion metrics in an ABN test. Args: n_treatments (int): Number of treatments (default: 2). n (int): Number of samples (default: None). baseline_conversion (float): Baseline conversion rate (default: 0.5). - treatment_effect_conversion (float): Treatment effect on conversion rate (default: 0.01). + treatment_effect_conversion (float): Treatment effect on conversion rate + (default: 0.01). baseline_mean_revenue (float): Baseline mean revenue (default: 1). sigma_revenue (float): Standard deviation of revenue (default: 2). treatment_effect_revenue (float): Treatment effect on revenue (default: 0.05). seed (int): Random seed (default: 1). 
Returns: - pd.DataFrame: Synthetic data with columns: 'T', 'conversion', 'revenue', 'pre_exp_revenue', 'num_actions', 'trigger_dates'. + pd.DataFrame: Synthetic data with columns: 'T', 'conversion', 'revenue', + 'pre_exp_revenue', 'num_actions', 'trigger_dates'. """ np.random.seed(seed) @@ -160,8 +166,7 @@ def click_through_data( treatment_share: float = 0.5, seed: int = 0, ) -> pd.DataFrame: - """ - Generate synthetic click-through data for an experiment. + """Generate synthetic click-through data for an experiment. Args: n_users (int): Number of users. @@ -174,7 +179,6 @@ def click_through_data( Returns: pd.DataFrame: DataFrame containing synthetic click-through data. - """ random.seed(seed) diff --git a/tw_experimentation/tester.py b/tw_experimentation/tester.py index a6d1352..a677dfb 100644 --- a/tw_experimentation/tester.py +++ b/tw_experimentation/tester.py @@ -1,15 +1,13 @@ -""" -Tool for testing ab experiments using different approaches for several metrics. -Code contains Tester class with methods for specific tests. -Support only pandas DataFrames now. +"""Tool for testing ab experiments using different approaches for several metrics. + +Code contains Tester class with methods for specific tests. Support only pandas +DataFrames now. """ import pandas as pd import numpy as np from statsmodels import stats as sms from scipy import stats as scipystats -import statsmodels.stats.weightstats as smws -import matplotlib.pyplot as plt from sklearn.utils import resample @@ -48,10 +46,8 @@ def __init__( } def contingency_table(self): - """ - Output contingency table of outcome for treatment and control - :return: 2x2 DataFrame - """ + """Output contingency table of outcome for treatment and control :return: 2x2 + DataFrame.""" ct = pd.crosstab(self.df[self.treatment], self.df[self.metric]) idx = pd.Index(["Control", "Treatment"], name=None) ct.index = idx @@ -89,18 +85,13 @@ def static_proportion_test(self, direction="two-sided"): } def fishers_exact_test(self, direction="two-sided"): - """ - Fisher's exact test - :param df: DataFrame - :param treatment: str - treatment name - :param metric: str - outcome name - :param direction: - Options: + """Fisher's exact test :param df: DataFrame :param treatment: str treatment name + :param metric: str outcome name :param direction: + + Options: ‘two-sided’: the odds ratio of the underlying population is not one ‘less’: the odds ratio of the underlying population is less than one - ‘greater’: the odds ratio of the underlying population is greater than one + ‘greater’: the odds ratio of the underlying population is greater than 1 :return: SignificanceResult statistic: float pvalue: float @@ -176,8 +167,8 @@ def run(self): return eval("self." + self.config_tests[self.method] + "()") def mann_whitney_u_test(self): - """ - Mann-Whitney-U test. + """Mann-Whitney-U test. + To be used when sample sized is expected to be skewed / not normally distributed :return: statistic: float @@ -339,8 +330,8 @@ def bootstrap_treatment_control_samples(self, n_samples=10, n_elements=2000): class Tester: - """ - Tool for analysing AB tests + """Tool for analysing AB tests. + Result includes: - Statistical result """ @@ -355,9 +346,7 @@ def __init__( continuous_metrics: Union[List[str], None] = None, customer_features: Union[List[str], None] = None, ): - """ - Tester constructor. 
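The contingency_table and fishers_exact_test helpers touched earlier in this hunk pair a 2x2 crosstab of treatment vs. binary outcome with SciPy's exact test. A small, self-contained sketch; the column names are placeholders:

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(5)
df = pd.DataFrame({
    "T": rng.integers(0, 2, 1000),
    "conversion": rng.integers(0, 2, 1000),
})

# 2x2 contingency table of treatment vs. binary outcome, as in contingency_table().
ct = pd.crosstab(df["T"], df["conversion"])
ct.index = pd.Index(["Control", "Treatment"])

# Fisher's exact test on the table; 'two-sided' asks whether the odds ratio differs from one.
odds_ratio, p_value = stats.fisher_exact(ct.values, alternative="two-sided")
print(ct)
print(odds_ratio, p_value)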
- """ + """Tester constructor.""" self.df = df self.user_id_column = user_id_column self.treatment = treatment @@ -396,7 +385,7 @@ def ab_test( for metric in self.binary_metrics ] ) - if not self.binary_metrics is None + if self.binary_metrics is not None else {} ) @@ -418,7 +407,7 @@ def ab_test( for metric in self.continuous_metrics ] ) - if not self.continuous_metrics is None + if self.continuous_metrics is not None else {} ) @@ -426,9 +415,9 @@ def ab_test( # bonferroni method if mult_hyp_correction: n_hypotheses = 0 - if not self.binary_metrics is None: + if self.binary_metrics is not None: n_hypotheses += len(self.binary_metrics) - if not self.continuous_metrics is None: + if self.continuous_metrics is not None: n_hypotheses += +len(self.continuous_metrics) alpha = alpha / n_hypotheses @@ -439,26 +428,28 @@ def decision_and_bootstrap( metric_type: Union[str, None] = None, method: Union[str, None] = None, ): - if not metrics is None: + if metrics is not None: for metric in metrics: if ( "effect_estimate" in results[metric] - and not results[metric]["effect_estimate"] is None + and results[metric]["effect_estimate"] is not None ): - results[metric]["confidence_interval_bootstrapped"] = ( - ConfidenceIntervals( - df=self.df, - user_id_column=self.user_id_column, - treatment=self.treatment, - metric=metric, - metric_type=metric_type, - method=method, - action_date=self.action_date, - ).bootstrap_confidence_interval(alpha=alpha) + results[metric][ + "confidence_interval_bootstrapped" + ] = ConfidenceIntervals( + df=self.df, + user_id_column=self.user_id_column, + treatment=self.treatment, + metric=metric, + metric_type=metric_type, + method=method, + action_date=self.action_date, + ).bootstrap_confidence_interval( + alpha=alpha ) if ( "pvalue" in results[metric] - and not results[metric]["pvalue"] is None + and results[metric]["pvalue"] is not None ): if results[metric]["pvalue"] < alpha: results[metric]["Decision"] = "Reject H0" @@ -475,37 +466,39 @@ def decision_and_bootstrap( method_continuous, ) - if not self.binary_metrics is None: + if self.binary_metrics is not None: for metric in self.binary_metrics: if ( "effect_estimate" in binary_test_results[metric] - and not binary_test_results[metric]["effect_estimate"] is None + and binary_test_results[metric]["effect_estimate"] is not None ): - binary_test_results[metric]["confidence_interval_bootstrapped"] = ( - ConfidenceIntervals( - df=self.df, - user_id_column=self.user_id_column, - treatment=self.treatment, - metric=metric, - metric_type="Binary", - method=method_binary, - action_date=self.action_date, - ).bootstrap_confidence_interval(alpha=alpha) + binary_test_results[metric][ + "confidence_interval_bootstrapped" + ] = ConfidenceIntervals( + df=self.df, + user_id_column=self.user_id_column, + treatment=self.treatment, + metric=metric, + metric_type="Binary", + method=method_binary, + action_date=self.action_date, + ).bootstrap_confidence_interval( + alpha=alpha ) if ( "pvalue" in binary_test_results[metric] - and not binary_test_results[metric]["pvalue"] is None + and binary_test_results[metric]["pvalue"] is not None ): if binary_test_results[metric]["pvalue"] < alpha: binary_test_results[metric]["Decision"] = "Reject H0" else: binary_test_results[metric]["Decision"] = "Accept H0" - if not self.continuous_metrics is None: + if self.continuous_metrics is not None: for metric in self.continuous_metrics: if ( "effect_estimate" in continuous_test_results[metric] - and not continuous_test_results[metric]["effect_estimate"] is None + and 
continuous_test_results[metric]["effect_estimate"] is not None ): continuous_test_results[metric][ "confidence_interval_bootstrapped" @@ -522,7 +515,7 @@ def decision_and_bootstrap( ) if ( "pvalue" in continuous_test_results[metric] - and not continuous_test_results[metric]["pvalue"] is None + and continuous_test_results[metric]["pvalue"] is not None ): if continuous_test_results[metric]["pvalue"] < alpha: continuous_test_results[metric]["Decision"] = "Reject H0" diff --git a/tw_experimentation/utils.py b/tw_experimentation/utils.py index 7787481..3541233 100644 --- a/tw_experimentation/utils.py +++ b/tw_experimentation/utils.py @@ -5,13 +5,13 @@ import numpy as np from numpy.distutils.misc_util import is_sequence from itertools import repeat -from sklearn.utils import resample from tw_experimentation.constants import PLOTLY_COLOR_PALETTE, MetricType def highlight(df): - """Highlight significant results in green, non-significant in red for frequentist stat table""" + """Highlight significant results in green, non-significant in red for frequentist + stat table.""" if df["is_significant"]: if df["Estimated_Effect_relative"] > 0: return ["background-color: lightgreen"] * len(df) @@ -38,7 +38,7 @@ def hex_to_rgb(hex_color: str) -> tuple: def variant_name_map(n_variants: int): - """Name variants to be used in plots + """Name variants to be used in plots. Args: n_variants (int): Number of variants, includes control @@ -122,24 +122,30 @@ def __init__( is_dynamic_observation: Union[Optional[bool], None] = None, is_only_pre_experiment: Optional[bool] = False, ) -> None: - """ - Implements data logic for A/B testing. Assumes that observations are already on experiment analysis level. + """Implements data logic for A/B testing. Assumes that observations are already + on experiment analysis level. Args: data (pd.DataFrame): data with columns variant, target, date variant (str): variant column name targets (Union[str, List[str]]): target column name(s) - pre_experiment_cols (Optional[List[str]], optional): pre-experimental data columns. Defaults to None. - metric_types (Optional[dict[str, str]], optional): metric types ('binary', 'discrete', or 'continuous'). - Defaults to None. + pre_experiment_cols (Optional[List[str]], optional): + pre-experimental data columns. Defaults to None. + metric_types (Optional[dict[str, str]], optional): + metric types ('binary', 'discrete', or 'continuous'). Defaults to None. date (Optional[str], optional): timestamp column. Defaults to None. - ratio_targets (Optional[dict[str, tuple]], optional): ratio targets with numerator and denominator in tuple. + ratio_targets (Optional[dict[str, tuple]], optional): + ratio targets with numerator and denominator in tuple. Not implemented yet. Defaults to None. - n_variants (Optional[int], optional): Number of variants TODO: autodetect this. Defaults to 2. - control_label (Optional[str], optional): Label of the control group variant. Defaults to 0. - is_dynamic_observation (Optional[bool], optional): Whether the assignment are dynamic + n_variants (Optional[int], optional): Number of variants + TODO: autodetect this. Defaults to 2. + control_label (Optional[str], optional): Label of the control group variant. + Defaults to 0. + is_dynamic_observation (Optional[bool], optional): + Whether the assignment are dynamic (for monitoring and sequential analysis). Defaults to True. - is_only_pre_experiment (Optional[bool], optional): Whether it only uses pre-experimental data. 
+ is_only_pre_experiment (Optional[bool], optional): + Whether it only uses pre-experimental data. """ # e.g. ratio_targets {'volume_per_transaction': # (total_volumes_per_customer, n_transactions_per_customer)} @@ -189,7 +195,8 @@ def preprocess_dataset(self, remove_outliers=True): assert ( self.is_only_pre_experiment is False ), "Can only preprocess data from experiment" - # TODO: check if variant = 0,1[,2,...] or string. If the latter, ask for name of control group + # TODO: check if variant = 0,1[,2,...] or string. + # If the latter, ask for name of control group # For now, assume that control = 0, treatment = 1,2,3,... # TODO: Convert date column to datetime @@ -198,7 +205,7 @@ def preprocess_dataset(self, remove_outliers=True): assert 0 in list(self.data[self.variant].unique()) and 1 in list( self.data[self.variant].unique() ) - except: + except Exception as e: # noqaF841 raise Exception("Variants must be 0 (=control), 1,[2,...] (=treatment)") experiment_cols = [ @@ -256,7 +263,8 @@ def _binary_metric_value_check(self): ), "Binary metric types must consist of values 0 or 1" def _detect_is_dynamic_observation(self): - """Detect whether the assignment of variants is dynamic (for monitoring and sequential analysis)""" + """Detect whether the assignment of variants is dynamic (for monitoring and + sequential analysis)""" if self.date is not None and self.data[self.date].nunique() > 1: self.is_dynamic_observation = True else: @@ -308,7 +316,7 @@ def _detect_metric_types(self, metrics): self.metric_types[metric] = MetricType.DISCRETE.value else: self.metric_types[metric] = MetricType.CONTINUOUS.value - except Exception as TypeError: + except Exception as TypeError: # noqaF841 print(error_msg) else: raise TypeError(error_msg) @@ -359,7 +367,7 @@ def experiment_meta_data(self): @property def target_standard_deviations(self): - """Compute standard deviation of each target for control group""" + """Compute standard deviation of each target for control group.""" return { target: self.data.loc[self.data[self.variant] == 0, target].std() for target in self.targets diff --git a/tw_experimentation/variance_reduction/cupac.py b/tw_experimentation/variance_reduction/cupac.py index c9a7e50..82ed4cc 100644 --- a/tw_experimentation/variance_reduction/cupac.py +++ b/tw_experimentation/variance_reduction/cupac.py @@ -1,6 +1,3 @@ -from tw_experimentation.variance_reduction.variance_reduction_method import ( - VarianceReductionMethod, -) from tw_experimentation.variance_reduction.cuped import ( CUPED, ) @@ -23,7 +20,7 @@ def fit( model=HistGradientBoostingRegressor, # TODO: typing model_init_config: dict = {}, model_fit_config: dict = {}, - **kwargs + **kwargs, ) -> Self: """Applies CUPAC to data. diff --git a/tw_experimentation/variance_reduction/cuped.py b/tw_experimentation/variance_reduction/cuped.py index d7f9a25..65a11be 100644 --- a/tw_experimentation/variance_reduction/cuped.py +++ b/tw_experimentation/variance_reduction/cuped.py @@ -16,7 +16,7 @@ def fit( treatment_column: str, target_column: str, covariate_column: str, - **kwargs + **kwargs, ): """Apply CUPED to data. 
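Aside for reviewers: the adjustment that CUPED.fit applies can be written down in a few lines. The sketch below is illustrative only (plain pandas, not the package API); theta is the OLS slope of the target on the pre-experiment covariate, which is what the hunk below describes as "regressing the target on the covariate". The column names in the usage comment are only an assumption borrowed from the synthetic-data helpers elsewhere in this diff.

import pandas as pd


def cuped_adjust(df: pd.DataFrame, target: str, covariate: str) -> pd.Series:
    # theta is the OLS slope of the target on the covariate: cov(y, x) / var(x).
    theta = (
        df[[target, covariate]].cov().loc[target, covariate] / df[covariate].var()
    )
    # CUPED-adjusted outcome: y - theta * (x - mean(x)).
    return df[target] - theta * (df[covariate] - df[covariate].mean())


# Hypothetical usage with the synthetic columns used elsewhere in this diff:
# y_adj = cuped_adjust(df, target="revenue", covariate="pre_exp_revenue")
# ate_hat = y_adj[df["T"] == 1].mean() - y_adj[df["T"] == 0].mean()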
@@ -24,7 +24,8 @@ def fit( data (pd.DataFrame): experiment data containing pre-experiment data column treatment_column (str): name of column containing treatment flags target_column (str): name of column containing target metric - covariate_column (str): name of column containing the covariate (pre-experiment data) + covariate_column (str): name of column containing the covariate + (pre-experiment data) Returns: CUPED: self @@ -36,8 +37,6 @@ def fit( assert set(data[treatment_column].unique()) == {0, 1} treatment = data[treatment_column] - # treatment = pd.get_dummies(data[treatment_column], drop_first=True) # TODO: investigate behaviour when treatment is already binary, also investigate what happens when treatment has more than 2 distinct values - # compute theta by regressing the target on the covariate t = ( sm.OLS( @@ -173,7 +172,7 @@ def fit( treatment_column: str, target_column: str, covariate_columns: List[str], - **kwargs + **kwargs, ): """Fit the multivariate CUPED model.""" @@ -183,8 +182,6 @@ def fit( assert set(data[treatment_column].unique()) == {0, 1} treatment = data[treatment_column] - # treatment = pd.get_dummies(data[treatment_column], drop_first=True) # TODO: investigate behaviour when treatment is already binary, also investigate what happens when treatment has more than 2 distinct values - # compute theta by regressing the target on the covariate covariance = data[[target_column] + covariate_columns].cov() matrix = covariance.loc[covariate_columns, covariate_columns] diff --git a/tw_experimentation/variance_reduction/doubly_robust.py b/tw_experimentation/variance_reduction/doubly_robust.py index 50102bd..18941bc 100644 --- a/tw_experimentation/variance_reduction/doubly_robust.py +++ b/tw_experimentation/variance_reduction/doubly_robust.py @@ -4,7 +4,6 @@ ) import pandas as pd from typing import List -import statsmodels.api as sm from sklearn.ensemble import HistGradientBoostingRegressor # import warnings filter @@ -29,9 +28,9 @@ def fit( model_propensity_init_config={}, model_regression=HistGradientBoostingRegressor, model_regression_init_config={}, - **kwargs + **kwargs, ): - """Fit the Doubly Robust Estimator to the data.s + """Fit the Doubly Robust Estimator to the data.s. Args: data (pd.DataFrame): experiment data @@ -97,7 +96,8 @@ def ci_width(self, data, covariate_columns, alpha: float = 0.05): def calculate_ci_width_reduction( self, data, covariate_columns, alpha: float = 0.05 ): - """Calculate the width reduction rate in 1-`alpha` * 100% confidence interval.""" + """Calculate the width reduction rate in 1-`alpha` * 100% confidence + interval.""" baseline_conf_int = self.baseline_results.conf_int(alpha=alpha, cols=None) baseline_ci_width = baseline_conf_int[1][1] - baseline_conf_int[0][1] diff --git a/tw_experimentation/variance_reduction/evaluation_pipeline.py b/tw_experimentation/variance_reduction/evaluation_pipeline.py index f1be3ee..262deac 100644 --- a/tw_experimentation/variance_reduction/evaluation_pipeline.py +++ b/tw_experimentation/variance_reduction/evaluation_pipeline.py @@ -11,7 +11,6 @@ import seaborn as sns from tqdm import tqdm import matplotlib.pyplot as plt -from joblib import Parallel, delayed import copy @@ -62,8 +61,10 @@ def run( Args: data (pd.DataFrame, optional): experiment data. Defaults to None. - treatment_column (str, optional): name of the column containing treatment assignment. Defaults to None. - target_column (str, optional): name of the column containing target metric. Defaults to None. 
+ treatment_column (str, optional): name of the column containing + treatment assignment. Defaults to None. + target_column (str, optional): name of the column containing target metric. + Defaults to None. true_ate (float, optional): true average treatment effect. Defaults to None. method_params_map (dict, optional): map from a method to a . Defaults to {}. verbose (bool, optional): _description_. Defaults to False. @@ -107,7 +108,6 @@ def run( ) # calculate the statistics of interest - # self.estimators.append(estimator) # NOTE: commented to improve memory efficiency self.estimates.append(estimator.estimate) self.cis.append(estimator.conf_int_95) self.p_values.append(estimator.p_value) @@ -224,22 +224,6 @@ def plot(self, plot_what: List[str]): # plot the distribution of the estimates overlaid with the baseline estimate elif plot_what == "estimates": # TODO: convert to a sns distplot - # fig, ax = plt.subplots() - - # # Plot histograms with kde=True using seaborn's histplot - # sns.histplot(self.estimates, bins=20, color='blue', alpha=0.2, label=f'{self.method.__name__}', kde=True, ax=ax) - # sns.histplot(self.baseline_estimates, bins=20, color='red', alpha=0.2, label='DiM', kde=True, ax=ax) - - # ax.set_xlabel('Estimate') - # ax.set_ylabel('Density') - - # if self.true_ate is not None: - # ax.axvline(x=self.true_ate, color='black', label='True ATE', linestyle='--') - - # ax.legend() - # ax.set_title(f'Estimates for {self.method.__name__}') - # plt.show() - fig, ax = plt.subplots() # plot a kde plot of estimates @@ -279,9 +263,9 @@ def plot(self, plot_what: List[str]): return fig, ax -# ==================================================================================================== -# ==================================================================================================== -# ==================================================================================================== +# ===================================================================================== +# ===================================================================================== +# ===================================================================================== class VREvaluationAll: @@ -319,9 +303,12 @@ def run_all( target_column (str): name of the column containing the target metric method_params_map (dict): map from a method to a dict of parameters true_ate (float, optional): true average treatment effect. Defaults to None. - verbose (bool, optional): flag specifying whether to print progress. Defaults to False. - bootstrap_samples (np.array, optional): an array of bootstrap indices to be used in evaluation. Defaults to None. - n_bootstrap (int, optional): number of samples to be bootstrapped. Defaults to 1000. + verbose (bool, optional): flag specifying whether to print progress. + Defaults to False. + bootstrap_samples (np.array, optional): an array of bootstrap indices + to be used in evaluation. Defaults to None. + n_bootstrap (int, optional): number of samples to be bootstrapped. + Defaults to 1000. 
Returns: Self: self @@ -413,35 +400,15 @@ def plot( shadow=True, ncol=1, ) - ax.set_title(f"p-values for all methods") + ax.set_title("p-values for all methods") plt.show() # plot the distribution of estimates for all methods elif plot_what == "estimates": - # fig, ax = plt.subplots() - - # for method_name in self.evaluations.keys(): - # # if method_name =='DoublyRobustEstimator': - # # continue - # estimates = self.evaluations[method_name].estimates - # sns.histplot(estimates, bins=20, alpha=0.1, label=f'{method_name}', kde=True, ax=ax) - - # ax.set_xlabel('Estimate') - # ax.set_ylabel('Density') - - # if self.true_ate is not None: - # ax.axvline(x=self.true_ate, color='black', label='True ATE', linestyle='--') - - # ax.legend() - # ax.set_title('Estimates for all methods') - # plt.show() - if ax is None: fig, ax = plt.subplots() for method_name in self.evaluations.keys(): - # if method_name == 'MultivariateRegressionAdjusted' or method_name == 'MultivariateRegression': - # continue estimates = self.evaluations[method_name].estimates sns.kdeplot( estimates, fill=False, label=f"{method_name}", ax=ax, linewidth=2 @@ -477,13 +444,14 @@ def plot( return ax -# ==================================================================================================== -# ==================================================================================================== -# ==================================================================================================== +# =============================================================================== +# ============================================================================== +# =============================================================================== class VREvaluationGrid: - """Implments a class for running evaluations of all methods while changing their parameters.""" + """Implments a class for running evaluations of all methods while changing their + parameters.""" def __init__(self, methods: List[VarianceReductionMethod]): self.methods = methods @@ -553,7 +521,8 @@ def generate_parameters_grid( model_init_configs, model_fit_configs, ): - """Generate a grid of method-parameters map from the base method-parameters map and a list of covariate columns as well as models.""" + """Generate a grid of method-parameters map from the base method-parameters map + and a list of covariate columns as well as models.""" # initialize empty grid params_maps_grid = np.ndarray( @@ -617,8 +586,6 @@ def plot_grid(self): show_title=False, ) - # self.evaluation_grid[0, 1].plot(plot_what='estimates', ax=axes[0, 1], show_plot=False) - # self.evaluation_grid[1, 1].plot(plot_what='estimates', ax=axes[1, 1], show_plot=False) axes[0, -1].legend( loc="center left", bbox_to_anchor=(1, 0.5), diff --git a/tw_experimentation/variance_reduction/mlrate.py b/tw_experimentation/variance_reduction/mlrate.py index b9ba079..7884f53 100644 --- a/tw_experimentation/variance_reduction/mlrate.py +++ b/tw_experimentation/variance_reduction/mlrate.py @@ -5,8 +5,7 @@ split_dataframe, ) from typing import List -from sklearn.ensemble import HistGradientBoostingRegressor -from sklearn.linear_model import ElasticNet + import statsmodels.api as sm import statsmodels.formula.api as smf import pandas as pd @@ -17,7 +16,8 @@ class MLRATE(VarianceReductionMethod): - """Implements Machine Learning Regression-Adjusted Treatment Effect Estimator (MLRATE).""" + """Implements Machine Learning Regression-Adjusted Treatment Effect Estimator + (MLRATE).""" def fit( self, @@ -29,9 +29,9 @@ def fit( 
model=AutoML, model_init_config: dict = {}, model_fit_config: dict = {}, - **kwargs + **kwargs, ): - """_summary_ + """ Args: data (pd.DataFrame): experiment data @@ -40,8 +40,10 @@ def fit( target_column (str): name of column containing the target metric covariate_columns (List[str]): list of names of covariate columns model (_type_, optional): regression model. Defaults to AutoML. - model_init_config (dict, optional): configuration parameters passed at the initialization of the model. Defaults to {}. - model_fit_config (dict, optional): configuration parameters passed at the fitting of the model. Defaults to {}. + model_init_config (dict, optional): configuration parameters + passed at the initialization of the model. Defaults to {}. + model_fit_config (dict, optional): configuration parameters passed at + the fitting of the model. Defaults to {}. Returns: MLRATE: self @@ -54,7 +56,6 @@ def fit( # split data into `K_splits`-folds for cross-fitting splits, index_to_split_map = split_dataframe(df=data, K=K_splits) - # splits, index_to_split_map = split_dataframe(df=data.loc[data[treatment_column=0]], K=K_splits) N = len(data) target = data[target_column] @@ -120,12 +121,6 @@ def fit( ml_rate_df[[treatment_column, "g_pred", "g_pred_difference"]] ), ).fit() - - # self.regression_results = sm.OLS(ml_rate_df['target'], - # sm.add_constant(ml_rate_df[[treatment_column, 'g_pred']]))\ - # .fit() - - # fit difference-in-means estimator self.fit_baseline( data=data, treatment_column=treatment_column, target_column=target_column ) @@ -151,7 +146,7 @@ def fit( # TODO: Fix! This does not seem to be working properly @staticmethod def robust_variance_estimator(Y: np.array, T: np.array, g_pred: np.array): - """Compute the robust variance estimator + """Compute the robust variance estimator. Args: Y (np.array): target metric vector @@ -190,7 +185,7 @@ def robust_variance_estimator(Y: np.array, T: np.array, g_pred: np.array): return sigma_hat_sqrd def calculate_variance_reduction(self): - """Calculate variance reduction for method + """Calculate variance reduction for method. Returns: float: variance reduction rate @@ -236,7 +231,7 @@ def fit( treatment_column: str, target_column: str, covariate_columns: List[str], - **kwargs + **kwargs, ): dfml = data.reset_index(drop=True) # TODO: check if overwrites data @@ -310,13 +305,10 @@ def fit( ], ) self.variance_reduction_rate = self.calculate_variance_reduction() - # self.robust_variance_estimate = self.robust_variance_estimator(Y=target.to_numpy(), - # T=treatment.to_numpy(), - # g_pred=g_pred) return self def calculate_variance_reduction(self): - """Calculate variance reduction for method + """Calculate variance reduction for method. Returns: float: variance reduction rate diff --git a/tw_experimentation/variance_reduction/multivariate_regression.py b/tw_experimentation/variance_reduction/multivariate_regression.py index ffca8c9..4086bd4 100644 --- a/tw_experimentation/variance_reduction/multivariate_regression.py +++ b/tw_experimentation/variance_reduction/multivariate_regression.py @@ -13,7 +13,7 @@ def fit( treatment_column: str, target_column: str, covariate_columns: List[str], - **kwargs + **kwargs, ): """Apply Multivariate Regression to data. 
@@ -85,7 +85,8 @@ def ci_width(self, alpha: float = 0.05): ) def calculate_ci_width_reduction(self, alpha=0.05): - """Calculate the width reduction rate in 1-`alpha` * 100% confidence interval.""" + """Calculate the width reduction rate in 1-`alpha` * 100% confidence + interval.""" frac = self.ci_width(self.regression_results, alpha) / self.ci_width( self.baseline_results, alpha @@ -106,7 +107,7 @@ def fit( treatment_column: str, target_column: str, covariate_columns: List[str], - **kwargs + **kwargs, ): """Apply Multivariate Regression to data. @@ -192,7 +193,8 @@ def ci_width(self, alpha: float = 0.05): ) def calculate_ci_width_reduction(self, alpha=0.05): - """Calculate the width reduction rate in 1-`alpha` * 100% confidence interval.""" + """Calculate the width reduction rate in 1-`alpha` * 100% confidence + interval.""" frac = self.ci_width(self.regression_results, alpha) / self.ci_width( self.baseline_results, alpha diff --git a/tw_experimentation/variance_reduction/utils.py b/tw_experimentation/variance_reduction/utils.py index f75f6ed..554836f 100644 --- a/tw_experimentation/variance_reduction/utils.py +++ b/tw_experimentation/variance_reduction/utils.py @@ -77,7 +77,9 @@ def bootstrap_generator( yield data.sample(n=sample_size, replace=True) -# NOTE: Say the sample split is 0.3, i think that for some data the control will be 0.3 and for some it will be the treatment, depending on which will end up first when unique() is taken +# NOTE: Say the sample split is 0.3, i think that for some data the control will be +# 0.3 and for some it will be the treatment, +# depending on which will end up first when unique() is taken def subsample_generator( data: pd.DataFrame, sample_size: int, @@ -94,7 +96,8 @@ def subsample_generator( sample_size (int): sample size of each subsample n_bootstrap (int): number of subsamples treatment_col (str): name of treatment column - sample_split (bool, optional): custom sample split proportion. Defaults to False. + sample_split (bool, optional): custom sample split proportion. + Defaults to False. Yields: pd.DataFrame: A subsample of experiment data. @@ -126,7 +129,7 @@ def subsample_generator( def split_dataframe(df: pd.DataFrame, K: int) -> List[np.array]: - """Splits a dataframe into K splits uniformly at random + """Splits a dataframe into K splits uniformly at random. Args: df (pd.DataFrame): data @@ -220,15 +223,18 @@ def subsample_data( def aaify(data: pd.DataFrame, treatment_column: str, frac_control=False): - """Turn an A/B experiment data into A/A experiment data by randomly assigning treatment to control. + """Turn an A/B experiment data into A/A experiment data by randomly assigning + treatment to control. Args: data (pd.DataFrame): experiment data treatment_column (str): name of column containing treatment flags - frac_control (bool, optional): fraction of data to be assigned to control. Defaults to False. # TODO: typing + frac_control (bool, optional): fraction of data to be assigned to control. + Defaults to False. 
# TODO: typing Returns: - pd.DataFrame: experiment data with treatment filtered out and new treatment assignment + pd.DataFrame: experiment data with treatment filtered out + and new treatment assignment """ aa_df = data.loc[data[treatment_column] == 0] @@ -245,7 +251,10 @@ def aaify(data: pd.DataFrame, treatment_column: str, frac_control=False): def add_synthetic_effect( - data: pd.DataFrame, treatment_column: str, target_column: str, effect_size: float + data: pd.DataFrame, + treatment_column: str, + target_column: str, + effect_size: float, ): """Add a synthetic effect to the experiment data. @@ -259,14 +268,12 @@ def add_synthetic_effect( pd.DataFrame: experiment data with synthetic effect """ - N = len(data) - synthetic_effect_data = data.copy() synthetic_effect_data[target_column] = ( synthetic_effect_data[target_column] + synthetic_effect_data[treatment_column] * effect_size - ) # + np.random.normal(0, 1, size=N) + ) return synthetic_effect_data diff --git a/tw_experimentation/widgetizer.py b/tw_experimentation/widgetizer.py index dd9bc45..810c33b 100644 --- a/tw_experimentation/widgetizer.py +++ b/tw_experimentation/widgetizer.py @@ -1,4 +1,3 @@ -import pandas as pd import numpy as np @@ -6,32 +5,13 @@ from tw_experimentation.statistical_tests import FrequentistTest from tw_experimentation.setuper import Setuper, effect_size_to_uplift -from tw_experimentation.checker import Monitoring from tw_experimentation.segmentation_frequentist import Segmentation -from tw_experimentation.plotting.monitoring_plots import ( - fig_variant_segment_dependence, -) -from scipy.stats import chi2_contingency - -import matplotlib.pyplot as plt -import plotly.express as px import plotly.graph_objects as go -from plotly.subplots import make_subplots import ipywidgets as widgets -from ipywidgets import ( - VBox, - HBox, - interact, - interactive, - interactive_output, - Label, - FloatSlider, - FloatText, - Select, - SelectMultiple, -) +from ipywidgets import interact + from IPython.display import display METRIC_TYPE_OPTIONS = [ @@ -48,14 +28,6 @@ def __init__(self, ed) -> None: # self.monitor = Monitoring(self.ed) # self.monitor._plot_sample_ratio_mismatch() # self.monitor.target_monitoring() - - extra_cols = ( - set(self.ed.data.columns) - - set(self.ed.targets) - - set([self.ed.date]) - - set([self.ed.variant]) - - set([DAY_COL]) - ) """ self.monitor_metric_wt = widgets.SelectMultiple( options=extra_cols, description="Outcome" @@ -75,195 +47,6 @@ def plot_segment(target, segment): self.monitor.segment_monitoring(target, segment) """ - # TODO: Delete commented code, if unused - # def segments_checks(self, segments): - # title_wt = widgets.HTML(value="

Experiment Monitoring")
-    #     display(title_wt)
-    #     m = Monitoring(ed=self.ed)
-    #     df_dyn = m.dynamic_sample_size_descriptives()
-    #     sample_size_today = m.total_sample_size_now()
-    #     msg_global = f"Sample Size"
-    #     for k in range(len(sample_size_today)):
-    #         msg_global += f"Variant {k}"
-    #         if k == 0:
-    #             msg_global += " (Control)"
-    #         msg_global += f": {sample_size_today.iloc[k,0]}"
-    #     msg_global += f"Total Sample Size: {sample_size_today.sum().iloc[0]}"
-    #     chi_squared_global_wt = widgets.HTML(value=msg_global)
-    #     display(chi_squared_global_wt)
-    #     g = sns.lineplot(
-    #         data=df_dyn,
-    #         x=self.ed.date,
-    #         y="variant_cnt",
-    #         hue=self.ed.variant,
-    #         palette="dark",
-    #     )
-    #     plt.xticks(rotation=45)
-    #     plt.show()
-    #     p_values = {"segment": [], "p-value": []}
-    #     chi_squared_plots = []
-    #     for segment in segments:
-    #         # Chi-squared-test
-    #         contingency_table = pd.crosstab(
-    #             self.ed.data[self.ed.variant], self.ed.data[segment]
-    #         )
-    #         chi2, p_value, _, expected = chi2_contingency(contingency_table)
-    #         p_values["segment"].append(segment)
-    #         p_values["p-value"].append(p_value)
[... the remainder of the commented-out segments_checks() and segment_check() drafts (per-segment chi-squared tables, p-value styling, plotly/seaborn segment plots, and sample-size widgets) is deleted in the same way ...]
-    # exp_monitor = widgets.VBox([title_wt, freq_box, global_wt])
-    # display(exp_monitor)
-
 
 class SegmentationInterface:
     def __init__(self, ed) -> None:
@@ -334,7 +117,7 @@ def classical_test(self):
         )
         self.type_widget = widgets.ToggleButtons(
             options=METRIC_TYPE_OPTIONS,
-            description=f"Key outcome metric type:",
+            description="Key outcome metric type:",
             disabled=False,
             style=dict(description_width="initial"),
         )
@@ -368,7 +151,7 @@ def classical_test(self):
 
         self.et_widget = widgets.ToggleButtons(
             options=["absolute", "relative"],
-            description=f"Effect type:",
+            description="Effect type:",
             disabled=False,
             style=dict(description_width="initial"),
         )
@@ -493,15 +276,18 @@ def tester(
             )
             fig.update_xaxes(title_text="Sample Size")
             fig.update_yaxes(title_text="Minimum Detectable Uplift")
-
-            result_wt = widgets.HTML(
-                value=(
-                    "Total Sample Size:"
-                    f" {sample_size['Treatment Sample Size'] + sample_size['Control Sample Size']}"
-                    "Treatment"
-                    f" Sample Size: {sample_size['Treatment Sample Size']}"
-                    "Control"
-                    f" Sample Size: {sample_size['Control Sample Size']}"
-                )
-            )
+            sample_size_total = (
+                sample_size["Treatment Sample Size"]
+                + sample_size["Control Sample Size"]
+            )
+            widget_val = (
+                "Total Sample Size:"
+                f" {sample_size_total}"
+                "Treatment"
+                f" Sample Size: {sample_size['Treatment Sample Size']}"
+                "Control"
+                f" Sample Size: {sample_size['Control Sample Size']}"
+            )
+            result_wt = widgets.HTML(value=widget_val)
             display(result_wt)
             display(fig)
@@ -629,14 +415,6 @@ def start(self):
         display(title_wt_freq)
 
         if self.ed.n_variants > 2:
-            # mt_widget = widgets.Dropdown(
-            #     options=["No", "Yes"],
-            #     description="Multitest correction:",
-            #     disabled=False,
-            #     value="No",
-            # )
-            # display(mt_widget)
             interact(
                 self._frequentist_process,
                 has_correction=widgets.Dropdown(
                     options=[False, True],
                     description="Multitest correction:",
                     disabled=False,
                     value=False,
                     style=dict(description_width="initial"),
                 ),
             )
-            # self.correction = mt_widget.value
-            # run_frequentist_bn = widgets.Button(description="Run hypothesis test!")
-            # run_frequentist_bn.on_click(self._frequentist_process)
-            # self._frequentist_process()
+
         else:
             self._frequentist_process(False)
 
@@ -668,32 +443,3 @@ def _frequentist_process(self, has_correction):
             .bar(subset=["Estimated_Effect_relative"], color="grey")
             .format(precision=3)
         )
-
-
-# class BayesianEvaluation:
-#     def __init__(self, ed) -> None:
-#         self.ed = ed
-#         self.bt = BayesTest(ed=ed)
-#
-#     def start(self):
-#         DEFAULT_KEY_VARIABLES = {
-#             "binary": ["probs"],
-#             "continuous": ["loc", "gate"],
-#             "discrete": ["rate", "gate"],
-#         }
-#
-#         self.bt.compute_posterior()
-#         for target in self.ed.targets:
-#             self.bt.plot_posterior(
-#                 target=target,
-#                 likelihood_variables=DEFAULT_KEY_VARIABLES[
-#                     self.ed.metric_types[target]
-#                 ],
-#             )
-#
-#             figures = self.bt.plot_posterior_difference(
-#                 target=target,
-#                 likelihood_variables=DEFAULT_KEY_VARIABLES[
-#                     self.ed.metric_types[target]
-#                 ],
-#             )