Sliders working to predict new score

leoncav · Dec 3, 2020 · a99787b · a99787b
1 parent abbdef6
commit a99787b
Show file tree

Hide file tree

Showing 9 changed files with 861 additions and 555 deletions.
diff --git a/fivestar/first_app.py b/fivestar/first_app.py
@@ -1,9 +1,10 @@
 import streamlit as st
 import numpy as np
 import pandas as pd
+from wordcloud import WordCloud
 
 from fivestar.clusters import get_cluster_coords
-from fivestar.lib import get_listing
+# from fivestar.lib import get_listing
 from fivestar.lib import FiveStar
 
 # lists for select boxes (to be replaced by imported lists/params)
@@ -181,9 +182,9 @@
 with slide_col_mid:
     st.subheader('Change your offering')
 
-    guests_accom = st.slider('Guests to accommodate', 0, 16, avg_guests_accom)
-    st.write(guests_accom, 'guests')
-    st.write('')
+    # guests_accom = st.slider('Guests to accommodate', 0, 16, avg_guests_accom)
+    # st.write(guests_accom, 'guests')
+    # st.write('')
 
     can_strict = st.select_slider(
         'Strict cancellation policy (ie xx)',options=['No', 'Yes'])
@@ -195,17 +196,17 @@
     st.write('')
     #st.write('Instantly bookable:', inst_book)
 
-    host_listings_count = st.slider('No of other listings', 0, 16, 1)
-    st.write(host_listings_count + 1, 'listings in total')
-    st.write('')
+    # host_listings_count = st.slider('No of other listings', 0, 16, 1)
+    # st.write(host_listings_count + 1, 'listings in total')
+    # st.write('')
 
-    type_entire = st.select_slider(
-        'Entire flat (vs private room)',options=['No', 'Yes'])
-    st.write('')
+    # type_entire = st.select_slider(
+    #     'Entire flat (vs private room)',options=['No', 'Yes'])
+    # st.write('')
 
-    parking = st.select_slider(
-        'Free parking on premises',options=['No', 'Yes'])
-    st.write('')
+    # parking = st.select_slider(
+    #     'Free parking on premises',options=['No', 'Yes'])
+    # st.write('')
 
     wifi = st.select_slider(
         'Wifi available',options=['No', 'Yes'])
@@ -215,13 +216,13 @@
         'Breakfast included',options=['No', 'Yes'])
     st.write('')
 
-    host_resp_rate = st.select_slider(
-        'Response to questions',options=['Never', 'When I can', 'As much as possible'])
-    st.write('')
+    # host_resp_rate = st.select_slider(
+    #     'Response to questions',options=['Never', 'When I can', 'As much as possible'])
+    # st.write('')
 
-    host_identity = st.select_slider(
-        'Host identity verified',options=['No', 'Yes'])
-    st.write('')
+    # host_identity = st.select_slider(
+    #     'Host identity verified',options=['No', 'Yes'])
+    # st.write('')
 
     price_ratio = st.slider('Price adjustor, £', 0, 250, price)
     st.write('£', price_ratio, )
@@ -231,16 +232,27 @@
     st.write('Expected standard of cleanliness:', cleanliness_delta )
     st.write('')
 
-    amenity_options = st.multiselect('Amenities offered',
-        amenities_example)
-    st.write(len(amenity_options), 'amenities offered')
-    st.write('')
-
+    # amenity_options = st.multiselect('Amenities offered',
+    #     amenities_example)
+    # st.write(len(amenity_options), 'amenities offered')
+    # st.write('')
+values = {
+    'price': price_ratio,
+    'cancellation_policy': can_strict,
+    'Wifi': wifi,
+    'Breakfast': breakfast,
+    'review_scores_cleanliness': cleanliness_delta,
+    'instant_bookable': inst_book,
+    }
+print('These values are coming directly from streamlit:', values)
+new_score = fs.predict_on_new_values(listing_id, values)
 
 with slide_col_right:
     st.subheader('Review score impact')
     st.write('')
-    st.write('review score:', '+0.4')
+    st.write('review score:', new_score)
+
+
 
 # checkbox functionality
 

diff --git a/fivestar/lib.py b/fivestar/lib.py
@@ -5,8 +5,11 @@
 
 from os.path import split
 import pandas as pd
+import numpy as np
 import datetime
 from fivestar.data import get_data
+from fivestar.params import COLUMNS
+from fivestar.model import Model
 
 pd.set_option('display.width', 200)
 
@@ -15,22 +18,95 @@ class FiveStar():
     def __init__(self):
         self.listings = get_data()
         self.clusters = get_data('clusters')
+        self.model = Model().load_model()
+
+
 
     def get_listing(self, listing_id):
         """Look up full info for an id and return it as a dict???"""
         # print(self.listings.shape)
         listings = self.listings
+        columns_to_keep = ['review_scores_accuracy',
+             'review_scores_cleanliness',
+             'review_scores_checkin',
+             'review_scores_communication',
+             'review_scores_location',
+             'review_scores_value',
+             'instant_bookable', 'host_identity_verified',
+             'amenities', 'price', 'neighbourhood_cleansed',
+             'host_listings_count', 'cancellation_policy',
+             'host_response_rate', 'accommodates', 'bedrooms', 'room_type',
+             ]
         # print(listings)
         if listing_id:
+            self.current_listing = listing_id
             data = listings[listings['id'] == int(listing_id)].to_dict('records')
-            if type(data) == 'list' and len(data) > 0:
-                data = data[0]
+            return data[0]
+        #     if type(data) == 'list' and len(data) > 0:
+        #         data = data[0]
+        #     else:
+        #         data = {}
+        # else:
+        #     data = {}
+        # # print(data)
+        # return data
+
+
+    def get_coef_dict(self):
+        """Pair each name in COLUMNS with the matching coefficient of the 'rgs' step."""
+        regressor = self.model.pipeline.named_steps['rgs']
+        return dict(zip(COLUMNS, regressor.coef_))
+
+    def predict_on_new_values(self, listing_id, values):
+        """Return the model's prediction for listing_id with values applied.
+
+        The features are built by build_X; the result is what self.model.predict returns."""
+        return self.model.predict(self.build_X(listing_id, values))
+
+    # def build_X(self, listing_id, values):
+    #     listing_int = int(listing_id)
+    #     listing = self.listings[self.listings['id'] == listing_int].copy()
+    #     print(listing)
+    #     if values:
+    #         for key, value in values.items():
+    #             if key == 'cancellation_policy':
+    #                 listing['cancellation_policy'] = 'strict' if values['cancellation_policy'] == 'Yes' else 'Other'
+    #             if key == 'Wifi' or key == 'Breakfast':
+    #                 if value == 'No':
+    #                     listing['amenities'] = listing['amenities'].replace(f'key,','')
+    #                 elif key not in listing['amenities']:
+    #                     listing['amenities'] = '{' + key + listing.iloc[0,'amenities'][1:]
+    #             if key == 'instant_bookable':
+    #                 listing[key] = 't' if value == 'Yes' else 'f'
+    #             else:
+    #                 listing[key] = value
+
+
+    #     return listing
+
+    def build_X(self, listing_id, values):
+        """Return a one-row DataFrame for listing_id with the overrides in values applied."""
+        listing_attributes = self.get_listing(listing_id)
+        for key, value in values.items():
+            if key == 'cancellation_policy':
+                listing_attributes[key] = 'strict' if value == 'Yes' else 'Other'
+            elif key == 'instant_bookable':
+                listing_attributes[key] = 't' if value == 'Yes' else 'f'
+            elif key in ('Wifi', 'Breakfast'):
+                # Item-wise edit: also removes a leading entry, never yields '{,Wifi}'.
+                items = [a for a in listing_attributes['amenities'][1:-1].split(',') if a]
+                if value == 'Yes' and key not in items:
+                    items.append(key)
+                elif value == 'No':
+                    items = [a for a in items if a != key]
+                listing_attributes['amenities'] = '{' + ','.join(items) + '}'
             else:
-                data = {}
-        else:
-            data = {}
-        # print(data)
-        return data
+                listing_attributes[key] = value
+
+        # Wrap every value in a list so each attribute becomes a column of a single row.
+        listing_for_df = {k: [v] for k, v in listing_attributes.items()}
+
+        return pd.DataFrame.from_dict(listing_for_df)
 
 
 def clean_data(data):
@@ -40,15 +116,15 @@ def clean_data(data):
     return data
 
 
-def get_listing(listing_id):
-    """Look up full info for an id and return it as a dict???"""
-    listings = get_data()
-    data = listings.loc[listing_id].to_dict('records')[0]
-    return data
+# def get_listing(listing_id):
+#     """Look up full info for an id and return it as a dict???"""
+#     listings = get_data()
+#     data = listings.loc[listing_id].to_dict('records')[0]
+#     return data
 
 
 if __name__ == '__main__':
     # For introspections purpose to quickly get this functions on ipython
     import fivestar
 
-    print(' dataframe cleaned')
+    # print(' dataframe cleaned')
diff --git a/fivestar/model.py b/fivestar/model.py
@@ -0,0 +1,14 @@
+import joblib
+
+
+class Model():
+    """Holds the fitted pipeline stored in 'model.joblib' and predicts with it."""
+
+    def predict(self, X_new):
+        """Return self.pipeline.predict(X_new); self.pipeline is set by load_model()."""
+        return self.pipeline.predict(X_new)
+
+    def load_model(self):
+        """Load 'model.joblib' (relative path) into self.pipeline and return self."""
+        self.pipeline = joblib.load('model.joblib')
+        return self
diff --git a/fivestar/params.py b/fivestar/params.py
@@ -74,10 +74,11 @@
 
 KEY_AMENITIES = [['Free parking on premises', 'free street parking',
                   'paid parking on premises' ,'paid parking off premises'],
-                'Dryer','Wifi','Breakfast',
-                'Indoor fireplace',
-                ['TV', 'cable tv'],
-                ['Smoke alarm','Smoke detector']]
+                'Wifi', 'Breakfast'
+                # 'Indoor fireplace',
+                # ['TV', 'cable tv'],
+                # ['Smoke alarm','Smoke detector']
+                ]
 
 PRICES = [950760, 301518,667593,357779,578705,502623,1099876,399645,
          578110,463806,462820,614955,972231,683987,527206,387535,
@@ -125,3 +126,7 @@
              'Sutton': [24.9, 28.0, 31.0, 35.0, 44.5, 50.0, 56.19999999999999, 65.4, 85.0]
              }
 
+# Names zipped, in this order, with the 'rgs' step's coef_ in FiveStar.get_coef_dict.
+COLUMNS = ['parking', 'wifi', 'breakfast', 'amenity_count', 'instant_bookable',
+           'host_identity_verified', 'price_ratio', 'listing_count', 'cancellation',
+           'response_rate', 'room_ratio', 'cleanliness_delta', 'room_type']
diff --git a/fivestar/trainer.py b/fivestar/trainer.py
@@ -29,10 +29,10 @@
 
 class Trainer(object):
     # Mlflow parameters identifying the experiment, you can add all the parameters you wish
-    ESTIMATOR = "Linear"
+    ESTIMATOR = "Ridge"
     EXPERIMENT_NAME = "FiveStar"
 
-    def __init__(self, X, y, **kwargs):
+    def __init__(self, X=None, y=None, **kwargs):
         """
         FYI:
         __init__ is called every time you instatiate Trainer
@@ -56,7 +56,7 @@ def __init__(self, X, y, **kwargs):
         self.X_train = X
         self.y_train = y
         del X, y
-        self.split = self.kwargs.get("split", True)  # cf doc above
+        self.split = self.kwargs.get("split", False)  # cf doc above
         if self.split:
             self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(self.X_train, self.y_train,
                                                                                   test_size=0.25,
@@ -70,7 +70,7 @@ def get_estimator(self):
         if estimator == "Linear":
             model = LinearRegression()
         else:
-            model = Ridge()
+            model = Ridge(alpha=50)
         estimator_params = self.kwargs.get("estimator_params", {})
         self.mlflow_log_param("estimator", estimator)
         model.set_params(**estimator_params)
@@ -139,6 +139,12 @@ def set_pipeline(self):
             ('rgs', self.get_estimator())], memory=memory)
 
 
+    def predict(self, X):
+        return self.pipeline.predict(X)  # needs self.pipeline to be set, e.g. via load_model()
+
+    def load_model(self):
+        self.pipeline = joblib.load('model.joblib')  # relative path; same file name save_model dumps to
+
     def add_grid_search(self):
         """"
         Apply Gridsearch on self.params defined in get_estimator
@@ -185,14 +191,14 @@ def save_model(self, upload=True, auto_remove=True):
         """Save the model into a .joblib and upload it on Google Storage /models folder
         HINTS : use sklearn.joblib (or jbolib) libraries and google-cloud-storage"""
         joblib.dump(self.pipeline, 'model.joblib')
-        print(colored("model.joblib saved locally", "green"))
+        # print(colored("model.joblib saved locally", "green"))
 
         # Add upload of model.joblib to storage here
-        version = self.kwargs.get('version', None)
-        if version:
-            storage_upload(model_version=version)
-        else:
-            storage_upload()
+        # version = self.kwargs.get('version', None)
+        # if version:
+        #     storage_upload(model_version=version)
+        # else:
+        #     storage_upload()
 
     ### MLFlow methods
     @memoized_property

diff --git a/fivestar/utils.py b/fivestar/utils.py
@@ -3,7 +3,7 @@
 from fivestar.params import BOROUGHS, PRICES
 
 def decode_amenities(df):
-    data = df
+    data = df.copy()
     def str_to_list(strn):
         row_items = strn[1:-1].split(',')
         for key,item in enumerate(row_items):

diff --git a/model.joblib b/model.joblib