From bea74708ce2734000ddcb17e6f90f9ec7e7c62d5 Mon Sep 17 00:00:00 2001
From: Gaurav Sheni
Date: Fri, 27 Oct 2023 22:08:27 -0400
Subject: [PATCH] Updates to LLM function and ReadMe.md (#170)

* WIP

* WIP
---
 README.md                             | 69 +++++++++++++++++++--------
 tests/ops/test_threshold_functions.py |  4 +-
 trane/core/problem.py                 | 17 ++++++-
 trane/core/utils.py                   |  2 +
 trane/llm/__init__.py                 |  1 -
 trane/llm/helpers.py                  | 20 +++++++-
 trane/ops/threshold_functions.py      | 11 +++--
 7 files changed, 94 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 37f5458..52a9803 100755
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Install Trane using pip:
 python -m pip install trane
 ```
 
-## Usage
+## Usage 
 
 Here's a quick demonstration of Trane in action:
 
@@ -38,32 +38,63 @@ Here's a quick demonstration of Trane in action:
 import trane
 
 data, metadata = trane.load_airbnb()
-entity_columns = ["location"]
-window_size = "2d"
-
 problem_generator = trane.ProblemGenerator(
-    metadata=metadata,
-    window_size=window_size,
-    entity_columns=entity_columns
+    metadata=metadata,
+    entity_columns=["location"]
 )
 problems = problem_generator.generate()
-print(f'Generated {len(problems)} problems.')
-print(problems[108])
-print(problems[108].create_target_values(data).head(5))
+for problem in problems[:5]:
+    print(problem)
 ```
 
-Output:
+A few of the generated problems:
+```
+==================================================
+Generated 40 total problems
+--------------------------------------------------
+Classification problems: 5
+Regression problems: 35
+==================================================
+For each <location> predict if there exists a record
+For each <location> predict if there exists a record with equal to
+For each <location> predict if there exists a record with not equal to
+For each <location> predict if there exists a record with equal to
+For each <location> predict if there exists a record with not equal to
+```
+With Trane's LLM add-on (`pip install trane[llm]`), we can determine the relevant problems with OpenAI:
+```python
+from trane.llm import analyze
+
+instructions = "determine 5 most relevant problems about user's booking preferences. Do not include 'predict the first/last X' problems"
+context = "Airbnb data listings in major cities, including information about hosts, pricing, location, and room type, along with over 5 million historical reviews."
+relevant_problems = analyze(
+    problems=problems,
+    instructions=instructions,
+    context=context,
+    model="gpt-3.5-turbo-16k"
+)
+for problem in relevant_problems:
+    print(problem)
+    print(f'Reasoning: {problem.get_reasoning()}\n')
 ```
-Generated 168 problems.
-For each <location> predict the majority in all related records in the next 2 days.
-  location       time  target
-0   London 2021-01-01       5
-1   London 2021-01-03       4
-2   London 2021-01-05       5
-3   London 2021-01-07       4
-4   London 2021-01-09       5
+Output
+```text
+For each <location> predict if there exists a record
+Reasoning: This problem can help identify locations with missing data or locations that have not been booked at all.
+
+For each <location> predict the first <location> in all related records
+Reasoning: Predicting the first location in all related records can provide insights into the most frequently booked locations for each city.
+
+For each <location> predict the first <rating> in all related records
+Reasoning: Predicting the first rating in all related records can provide insights into the average satisfaction level of guests for each location.
+
+For each <location> predict the last <location> in all related records
+Reasoning: Predicting the last location in all related records can provide insights into the most recent bookings for each city.
+
+For each <location> predict the last <rating> in all related records
+Reasoning: Predicting the last rating in all related records can provide insights into the recent satisfaction level of guests for each location.
 ```
 
 ## Community
diff --git a/tests/ops/test_threshold_functions.py b/tests/ops/test_threshold_functions.py
index 2633ae7..62d434d 100644
--- a/tests/ops/test_threshold_functions.py
+++ b/tests/ops/test_threshold_functions.py
@@ -27,8 +27,8 @@ def test_get_k_most_frequent(dtype):
 @pytest.mark.parametrize(
     "dtype",
     [
-        ("int64"),
-        ("int64[pyarrow]"),
+        ("float64"),
+        ("float64[pyarrow]"),
     ],
 )
 def test_get_k_most_frequent_raises(dtype):
diff --git a/trane/core/problem.py b/trane/core/problem.py
index d283ece..35d4850 100644
--- a/trane/core/problem.py
+++ b/trane/core/problem.py
@@ -21,11 +21,13 @@ def __init__(
         operations,
         entity_column=None,
         window_size=None,
+        reasoning=None,
     ):
         self.operations = operations
         self.metadata = metadata
         self.entity_column = entity_column
         self.window_size = window_size
+        self.reasoning = reasoning
 
     def __lt__(self, other):
         return self.__str__() < (other.__str__())
@@ -57,7 +59,16 @@ def get_required_parameters(self):
         return self.operations[0].required_parameters
 
     def set_parameters(self, threshold):
-        return self.operations[0].set_parameters(threshold)
+        self.operations[0].set_parameters(threshold)
+
+    def set_reasoning(self, reasoning):
+        self.reasoning = reasoning
+
+    def get_reasoning(self):
+        return self.reasoning
+
+    def reset_reasoning(self):
+        self.reasoning = None
 
     def is_classification(self):
         return isinstance(self.operations[2], ExistsAggregationOp)
@@ -118,7 +129,9 @@ def create_target_values(self, dataframes):
         # Won't this always be normalized?
         normalized_dataframe = self.get_normalized_dataframe(dataframes)
         if self.has_parameters_set() is False:
-            raise ValueError("Filter operation's parameters are not set")
+            print("Filter operation's parameters are not set, setting them now")
+            thresholds = self.get_recommended_thresholds(dataframes)
+            self.set_parameters(thresholds[-1])
 
         target_dataframe_index = self.entity_column
         if self.entity_column is None:
diff --git a/trane/core/utils.py b/trane/core/utils.py
index 37037d0..8821ab5 100644
--- a/trane/core/utils.py
+++ b/trane/core/utils.py
@@ -14,6 +14,8 @@ def determine_gap_size(gap):
         return pd.Timedelta(gap)
     elif isinstance(gap, int) or isinstance(gap, pd.Timedelta):
         return gap
+    elif not gap:
+        return 1
     return int(gap)
diff --git a/trane/llm/__init__.py b/trane/llm/__init__.py
index 90477ef..2edaded 100644
--- a/trane/llm/__init__.py
+++ b/trane/llm/__init__.py
@@ -1,2 +1 @@
-from trane.llm.chat import chat
 from trane.llm.helpers import *
diff --git a/trane/llm/helpers.py b/trane/llm/helpers.py
index 6c8c07f..8b24e36 100644
--- a/trane/llm/helpers.py
+++ b/trane/llm/helpers.py
@@ -1,4 +1,5 @@
 import json
+import re
 
 from IPython.display import Markdown, display
 
@@ -45,6 +46,7 @@ def analyze(
     instructions,
     context,
     model="gpt-3.5-turbo-16k",
+    jupyter=False,
 ):
     prompt_context = f" The context is: {context}"
     constraints = (
@@ -81,14 +83,20 @@ def analyze(
         problems_formatted,
     )
     response = openai_gpt(prompt, model)
-    display(Markdown(response))
+    if jupyter:
+        display(Markdown(response))
+    else:
+        print(response)
     relevant_ids = extract_problems_from_response(response, model)
-    print(relevant_ids)
     relevant_ids = list(set(relevant_ids))
     relevant_problems = []
     for id_ in relevant_ids:
         relevant_problems.append(problems[int(id_) - 1])
+
+    reasonsings = extract_reasonings_from_response(response)
+    for idx, reason in enumerate(reasonsings):
+        relevant_problems[idx].set_reasoning(reason)
     relevant_problems = sorted(relevant_problems, key=lambda p: str(p))
     return relevant_problems
 
@@ -108,6 +116,14 @@ def extract_problems_from_response(response, model):
     return response
 
 
+def extract_reasonings_from_response(text):
+    reasonsings = []
+    matches = re.findall(r"Reasoning: (.+?)(?:\n\n|$)", text)
+    for match in matches:
+        reasonsings.append(match)
+    return reasonsings
+
+
 def format_problems(problems: list) -> str:
     formatted = ""
     for idx, problem in enumerate(problems):
diff --git a/trane/ops/threshold_functions.py b/trane/ops/threshold_functions.py
index 75c4fe1..f445cfb 100644
--- a/trane/ops/threshold_functions.py
+++ b/trane/ops/threshold_functions.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 from pandas.api.types import (
+    is_integer_dtype,
     is_object_dtype,
     is_string_dtype,
 )
@@ -106,13 +107,15 @@ def find_threshold_to_maximize_uncertainty(
 
 def get_k_most_frequent(series, k=3):
     # get the top k most frequent values
+    dtype = series.dtype
     if (
-        is_object_dtype(series.dtype)
-        or isinstance(series.dtype, pd.CategoricalDtype)
-        or is_string_dtype(series.dtype)
+        is_object_dtype(dtype)
+        or isinstance(dtype, pd.CategoricalDtype)
+        or is_string_dtype(dtype)
+        or is_integer_dtype(dtype)
     ):
         return series.value_counts()[:k].index.tolist()
-    raise ValueError("Series must be categorical, string or object dtype")
+    raise ValueError("Series must be categorical, string, object or int dtype")
 
 
 def sample_unique_values(series, max_num_unique_values=10, random_state=None):
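The reasoning text surfaced through `problem.get_reasoning()` comes from the new `extract_reasonings_from_response`, which scans the raw LLM reply for lines beginning with `Reasoning:`. A minimal standalone sketch of that logic using the same regex as the patch; the sample response text is invented for illustration and runs without an OpenAI key:

```python
import re

# Same pattern as extract_reasonings_from_response in trane/llm/helpers.py:
# capture everything after "Reasoning: " up to a blank line or the end of the text.
REASONING_PATTERN = r"Reasoning: (.+?)(?:\n\n|$)"


def extract_reasonings(text):
    return re.findall(REASONING_PATTERN, text)


# Invented example of a response in the format the analyze() prompt requests.
sample_response = (
    "1. For each <location> predict if there exists a record\n"
    "Reasoning: Identifies locations that have never been booked.\n"
    "\n"
    "2. For each <location> predict the last <rating> in all related records\n"
    "Reasoning: Tracks the most recent guest satisfaction per location.\n"
)

print(extract_reasonings(sample_response))
# ['Identifies locations that have never been booked.',
#  'Tracks the most recent guest satisfaction per location.']
```

Note that `analyze` attaches these strings to `relevant_problems` by position, so the pairing assumes the extracted reasonings appear in the same order as the selected problem ids.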
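`get_k_most_frequent` now accepts integer series in addition to categorical, string, and object series, which is why the failure-path test switches its parametrized dtypes from `int64` to `float64`. A small usage sketch mirroring the updated check; the example series is made up:

```python
import pandas as pd
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype


def get_k_most_frequent(series, k=3):
    # Mirrors the updated dtype check in trane/ops/threshold_functions.py:
    # integer series are now accepted alongside categorical/string/object.
    dtype = series.dtype
    if (
        is_object_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
        or is_string_dtype(dtype)
        or is_integer_dtype(dtype)
    ):
        return series.value_counts()[:k].index.tolist()
    raise ValueError("Series must be categorical, string, object or int dtype")


ratings = pd.Series([5, 4, 5, 3, 5, 4], dtype="int64")
print(get_k_most_frequent(ratings))  # [5, 4, 3]

# A float series still raises, which is what the updated test asserts:
# get_k_most_frequent(pd.Series([1.5, 2.5], dtype="float64"))  -> ValueError
```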