A. FULL guideline for Machine Learning
PERSONAL TIPS COLLECTED DURING DS PROJECTS
1) IMPORTATION
FILE MANAGEMENT
import os
os.getcwd() #to get current working directory
os.chdir(path) #change directory to another path
# Go 3 levels up to get a data file stored in another directory named data
dataset = pd.read_csv('../../../data/dataset.csv')
# Import csv from gcp bucket into jupyter notebook
import pandas as pd
from google.cloud import storage
from io import BytesIO
PROJECT_ID = "your_project_id"
BUCKET_NAME = "your_bucket_name"
FILE_NAME = "your_file_name.csv"
client = storage.Client(project=PROJECT_ID)
bucket = client.get_bucket(BUCKET_NAME)
blob = bucket.get_blob(FILE_NAME)
content = blob.download_as_string()
df = pd.read_csv(BytesIO(content))
# Export csv from notebook to gcp bucket
from google.cloud import storage
def upload_to_bucket(bucket_name, blob_path, local_path):
bucket = storage.Client(project=PROJECT_ID).bucket(bucket_name)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_path)
return "Done!"
# method call
bucket_name = 'your_bucket_name' # do not give gs:// ,just bucket name
blob_path = 'data_intermediate/df_merge.csv' # path where you want to store the file
local_path = '../data/df_merge.csv' #local file path
upload_to_bucket(bucket_name, blob_path, local_path)
# Import and perform operations on multiple dataframes
month_number = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
df_dict = {m: pd.DataFrame() for m in month_number}
for k in df_dict:
# Import multiple dataframes in one line of code
df_dict[k] = pd.read_csv('../../Datasets/inside_airbnb/calendar/2019/calendar_' + k + '.csv')
print(f'shape before filter for month {k} is {df_dict[k].shape}')
df_dict[k]['date'] = pd.to_datetime(df_dict[k]['date'])
df_dict[k] = df_dict[k][df_dict[k].date.dt.month == int(k)]
df_dict[k] = df_dict[k][(df_dict[k].minimum_nights <= 3) & (df_dict[k].maximum_nights >= 2)]
print(f'shape after filter for month {k} is {df_dict[k].shape}')
# Load a json file and convert it to a pandas dataframe, preserving tuples as values
import json
with open('path_to_your_json_file.json') as json_file:
    dict_input = json.load(json_file)
data = {k:[v] for k,v in zip(dict_input.keys(), dict_input.values())}
df_input = pd.DataFrame(data)
# Read compressed csv and filter on rows
iter_csv = pd.read_csv('test.csv.gz', usecols=cols_to_keep_listings, compression='gzip', na_values='\\N', iterator=True, chunksize=1000)
listings = pd.concat([chunk[(chunk['data_type'] == 'ENTIRE_HOME') &
(chunk['data_address_city'] == 'Paris') &
(chunk['data_subtype'].isin(['APARTMENT', 'HOTEL', 'OTHER', 'APARTHOTEL', 'BED_&_BREAKFAST'])) &
(chunk['unifiedId'].isin(listings_ids))
] for chunk in iter_csv])
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
pd.set_option('display.max_columns', None) # To show all columns
## Connect to a database-backed API and make HTTP requests with an authentication token, using the requests library
import requests
# Get a token (valid for 1 day)
url_token = 'API_token'
credentials = {'mail': 'user_mail',
               'password': 'user_password'}
res = requests.post(url_token, data=credentials)
token = res.json()['token']
# Get housing with the token and transform to DataFrame to be able to manipulate data in Jupyter Notebook. Get specific columns, not all columns in database
url_housing = 'API_housing?columns=host_id;housing_id;housing_name'
headers = {'Authorization': f'Bearer {token}'} # Need to use authorization header in every requests
housing_req = requests.get(url_housing, headers=headers)
housing = housing_req.json()
housing = pd.DataFrame(housing)
# Put new housing neighborhood (modify the value directly in the database)
for id in update_housing.housing_id:
url_housing_put = f'API_housing/{id}'
param = {'neighborhood': f'{update_housing.loc[update_housing.housing_id == id].neighborhood.values[0]}'}
housing_put = requests.put(url_housing_put, data = param, headers = headers)
# Date
from datetime import datetime, date, timedelta
import calendar
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
Import Datasets
# from Kaggle
import os
os.environ['KAGGLE_USERNAME'] = "your_kaggle_username" # username from the kaggle.json file
os.environ['KAGGLE_KEY'] = "your_kaggle_key" # key from the kaggle.json file
!kaggle competitions download -c hecmontrealdeeplearningcourse # API command copied from Kaggle (get it from the competition page)
train = pd.read_csv("train.csv", index_col = "chosen column)
# on Colab
from google.colab import files
uploaded = files.upload()
import io
train=pd.read_csv(io.BytesIO(uploaded['train.csv']))
# on Jupyter
train = pd.read_csv('../Datasets/train.csv')
train.describe() # train.describe(include=['O']) to get only the string columns; include='all' is also possible
# Check that the target variable looks sensible; sometimes there are very bad mistakes that would ruin the model (like a negative price)
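A minimal sanity check, assuming the target is a hypothetical 'price' column:
(train['price'] <= 0).sum() # count of impossible values, e.g. negative or zero prices
train['price'].describe() # min / max / quartiles help spot absurd values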
train["col"].nunique() # To see how many unique values in a column
train["col"].value_counts() # To see the value distribution of a column
2) UNDERSTAND DATA
# Some useful definitions
List is a collection which is ordered and changeable (can add/remove items). Allows duplicate members.
Tuple is a collection which is ordered and unchangeable (cannot add/remove items). Allows duplicate members.
Set is a collection which is unordered and unindexed. No duplicate members.
Dictionary is a collection which is ordered (since Python 3.7) and changeable. No duplicate keys.
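A quick illustration of the four collection types (values are arbitrary):
my_list = [1, 2, 2, 3]       # ordered, mutable, duplicates allowed
my_tuple = (1, 2, 2, 3)      # ordered, immutable, duplicates allowed
my_set = {1, 2, 2, 3}        # becomes {1, 2, 3}: duplicates dropped, no order guaranteed
my_dict = {'a': 1, 'a': 2}   # becomes {'a': 2}: keys are unique, the last value wins
my_list.append(4)            # OK
# my_tuple.append(4)         # AttributeError: tuples cannot be modified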
Select the most important features to check the outliers and to check multicollinearity (reduce the number of features)
# Heatmap to see how features are correlated
corrmat = train.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
# Plot the Spearman correlation (better than Pearson when the relationship is not linear or the data is not Gaussian, but less powerful when the relationship is truly linear)
def display_correlation(df):
    r = df.corr(method="spearman")
    plt.figure(figsize=(10,6))
    heatmap = sns.heatmap(r, vmin=-1, vmax=1, annot=True) # plot the Spearman matrix r, not the default (Pearson) df.corr()
    plt.title("Spearman Correlation")
    return r
# Heatmap to check the features that contribute the most to the target variable
import seaborn as sns
k = 25 #number of variables for heatmap
cols = df.corr().nlargest(k, 'price')['price'].index
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.25)
fig, ax = plt.subplots(figsize=(12,12)) # Sample figsize in inches
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
# Check feature distribution
df = df_preprocessed_final
target = 'surface'
plt.figure(figsize=(10,5)) # Set figure size before calling plt.plot()
df[f'{target}'].plot.hist(bins=60)
plt.xlabel('Target')
plt.title('Target distribution')
# Spot outliers. Strategy: delete only the very bad outliers, not all of them, so the model stays robust to outliers
fig, ax = plt.subplots()
feature = "TotalBsmtSF"
target = 'SalePrice'
ax.scatter(x = train[feature], y = train[target])
plt.ylabel(target, fontsize=13)
plt.xlabel(feature, fontsize=13)
plt.show()
# Drop chosen outliers
train = train.drop(train[(train[feature]>4000) & (train[target]<300000)].index) # Example for the house pricing
# Multiple plots TO COMPLETE
g = sns.FacetGrid(train, col='y') # Initialize the FacetGrid with the data (train) and the target var
g.map(plt.hist, 'X', bins=20) # Call plotting function .map with interval of 20 (for age for ex)
# Groupby using different aggregation methods: mean and sum and custom function (here keep the mode of the cluster feature)
df_quartier = df.groupby(['region','quartier']).agg(revenue_brut_sum = ('revenue_brut_year', 'sum'),
revenue_brut_night_mean = ('revenue_brut_moyen_night', 'mean'),
cluster_mode = ('cluster', lambda x: x.value_counts().index[0])).reset_index()
# Groupby and show unique values within subgroup
df.groupby("region")["departement"].apply(set).to_frame()
# Move specific col to specific place. col = df.pop(col) => keep the col and remove it from df
y_name = 'arnd'
y = df.pop(y_name)
df.insert(1, y_name, y) # Put it as the 2nd column
# Find elements not in a list
set(update_df.housing_id) - set(start_df.housing_id) # gives IDs from update_df that are not in start_df
# Groupby and compute percentage within group / subgroup
df_tmp = df.groupby(['Country', 'City']).agg({'revenue':'sum'})
df_tmp.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
# Groupby weighted average
import functools
# Custom function for weighted average
def my_w_avg(s, df, wcol):
    return np.average(s, weights=df.loc[s.index, wcol])
df_agg = df.groupby(groupby_cols).agg(
VOLUME = ('VOLUME', 'sum'),
weights = ('weights', 'mean'),
score = ('score', functools.partial(my_w_avg, df=df, wcol='weights')),
).reset_index()
# Create dataframe using dictionary
col = ['col1', 'col2', 'col3']
#val = [[0 for k in range(10)], [1 for k in range(10)], [2 for k in range(10)]]
val = [0, 1, 2]
d = {k: [v] for k,v in zip(col,val)}
df = pd.DataFrame(d)
# Create dataframe using three lists
df = pd.DataFrame({'lst1Tite' : lst1,
                   'lst2Tite' : lst2,
                   'lst3Tite' : lst3},
                  columns=['lst1Tite', 'lst2Tite', 'lst3Tite'])
# Split a column of tuples
df['a'], df['b'] = zip(*df['col']) # unpack each tuple into two new columns (df.col.str no longer works in recent pandas)
3) DATA PREPROCESSING
Data pre-processing must be applied to both the train and test sets, but WORK ON THE TRAIN FIRST, THEN APPLY THE SAME TRANSFORMATIONS TO THE TEST SET
# CONCATENATE train and test set to apply some data preprocessing
combine=pd.concat([train,test])
# MERGE 2 dataframes using the same column
new_train = pd.merge(train, text1, on='id', how='outer')
# MAP one column but need unique index
housing['quartier'] = housing.housing_id.map(df_quartier.set_index('housing_id')['quartier'])
# CREATE NEW COLUMN ON CONDITION
conditions = [(housing.postal_code.str.startswith('75')),(housing.postal_code.str.startswith(('78','91', '92', '93', '94', '95')))]
values = ['paris', 'banlieue']
housing['region'] = np.select(conditions, values)
# Extract first n characters from left of columns in pandas
df['zipcode'].str[:5]
# Keep only rows whose strings start with '75'
df_paris = df.loc[df.zipcode.str.startswith('75', na=False)]
# Set max value in a column to 30
maxVal = 30
df['col'] = df['col'].where(df['col'] <= maxVal, maxVal)
# Use function map() to apply a function for every items in a list
list_with_int = list(map(int, list_with_strings))
# VARIANT to compute the ratio per group (subtotal)
sub_total = df.groupby('cluster')['count'].sum().reset_index().rename(columns={'count':'sub_total'})
df = pd.merge(df, sub_total, on='cluster', how='left')
df['ratio_per_group'] = round(df['count']/df['sub_total'], 2)*100 # use bracket access: df.count is the DataFrame method, not the column
# Compute subtotal (sub total) to have the ratio per group
total_resa_year_region_sleeps = df_booking_moments.groupby(['year','region', 'sleeps_max_cat'])['resa_count'].transform('sum')
df_booking_moments['sub_total'] = total_resa_year_region_sleeps
df_booking_moments['ratio'] = round(df_booking_moments.resa_count / df_booking_moments.sub_total,2)*100
# Select rows that contain a value in a specific column
df[df['col'].str.contains("hello")]
# Replace the value for one specific id (replace condition) by another chosen value
df.loc[df['First Season'] > 1990, 'First Season'] = 1 # Need to select the column you want to change in the condition
# Convert a column and handle error. Here convert postal_code to int and return 999 if impossible to convert to int
def convert_postal_code(x):
    try:
        return int(x)
    except (ValueError, TypeError):
        return 999
df.col = df.col.map(convert_postal_code)
# Find out how many different values for a specific column after a group by
df_duplicates = df[['destination', 'month', 'client']]\
.reset_index()\
.groupby(['destination', 'month'])['index']\
.agg(list)\
.reset_index(name='list_client')
df_duplicates['len'] = df_duplicates['list_client'].str.len()
df_duplicates.sort_values(by='len', ascending=False)
# Delete only true duplicates
train = train.drop_duplicates("col", keep = "first")
or PERSONALIZED
duplicates = train[train.duplicated('text')]
duplicates = duplicates[['id', 'text', 'keyword', 'location', 'target']].groupby(['text'], as_index=False).count().sort_values(by='id', ascending=False)
true_duplicate = duplicates["id"] > 1 # Can adapt
duplicates[true_duplicate]
index_false_duplicate = duplicates[duplicates["id"] == 1].index
duplicates.drop(index_false_duplicate, inplace=True)
# Duplicates that are differently labeled
df_train['target_relabeled'] = df_train['target'].copy()
df_train.loc[df_train['text'] == 'original text', 'target_relabeled'] = 0 # or 1: choose the correct label
# Create a column of tuples. Useful for GPS coordinates using lat and lng
housing['GPS'] = tuple(zip(housing.lat, housing.lng))
# Fix issues with too many spaces between words in a column
df.date.map(lambda x: ' '.join(filter(None, x.split(' ')))) # split on spaces, filter out the empty strings, then join the remaining words with a single space
# Use enumerate to create dictionary for creating dataframe
map_admin = {k:v for k,v in enumerate(df_admin.admin_id.unique())} # enumerate returns the index (0,1,2,3, etc.) and pairs it with a value (admin_id)
# Use dict.items() to have a list of a dictionary's tuple pairs (key:value)
--> Useful for changing/mapping keys and values of a dict
output = {map_admin[k]:list(map(lambda x: map_housings[x], v)) for k,v in dict_labelers.items()}
# Use sub_df in a for loop to create a dataframe. Needed when creating a df from a dict whose keys hold multiple values of different sizes
for i, x in enumerate(my_dict.items()): # reminder: enumerate gives (index, value) pairs and dict.items() gives (key, value) pairs
    sub_dict = dict([x]) # create a one-key sub_dict per item of the original dict (named my_dict to avoid shadowing the dict builtin)
    sub_df = pd.DataFrame(sub_dict)
    if i == 0:
        df_final = sub_df.copy()
    else:
        df_final = pd.concat((df_final, sub_df))
# Select all columns starting the 4th column
df.iloc[:,4:]
# Check if intersection of two lists is null
L = [1,2,3,4,5]
M = [5,6,7]
bool(set(L) & set(M))
# CALCULATE DISTANCE (haversine distance)
pip install haversine # install module
from haversine import haversine
name = quartiers_paris.L_QU.unique() # Distance of each house from every quarters in Paris
for q,n in zip(quartiers_paris.GPS, name):
distances = []
for h in housing_paris.GPS:
distances.append(haversine(h,q))
housing_paris[n] = distances
# Create function assign_quartier
def assign_quartier(df):
    min_val_col_name = df.idxmin(axis=1) # Take the quartier with the minimum distance
    df['quartier'] = min_val_col_name
# Keep rows with specific values in a specific column
new_amenities = new_amenities.loc[new_amenities.title.isin(selected_amenities)] # Set selected_amenities list before
# Check amenities distribution (multiple items per row that are separated by ",")
pd.Series(np.concatenate(listings['amenities'].map(lambda x: x.split(",")))).value_counts()
# If multiple items are separated by other characters, we can create a cleaning function and use it with apply
def split_amenities(amenities):
amenity_list = amenities \
.replace("\"", "") \
.replace("{", "") \
.replace("}", "") \
.lower().split(",")
return amenity_list
listings['amenities'] = listings['amenities'].apply(split_amenities)
# Convert date to day of week name and add isWE
df['date'] = pd.to_datetime(df['date'])
df['weekday'] = df['date'].dt.weekday
df['weekday_name'] = df['date'].dt.day_name()
df['isWE'] = np.where((df.weekday == 4)|(df.weekday == 5), 1, 0) # weekday: Monday=0 ... Sunday=6; here Friday (4) and Saturday (5) nights count as the weekend
# Separate weekdays and weekend days
df_week = df[df.isWE == 0]
df_weekend = df[df.isWE == 1]
# Sanity check: no rows were lost or duplicated in the split
assert df_week.shape[0] + df_weekend.shape[0] == df.shape[0] # Use assert to catch any error early
## USING TRY EXCEPTION TO HANDLE EXCEPTION
# Function to convert the date format; if parsing fails (ParserError from the pendulum lib), return a default date
import maya
import pendulum
def convert_date_format(x):
    try:
        return maya.parse(x).datetime(to_timezone='Europe/Paris', naive=False).date() # Do this if no error
    except pendulum.parsing.exceptions.ParserError:
        return '2000-01-01' # Do something else if there is an error
housing['first_booking'] = housing['first_booking'].apply(convert_date_format)
## Keep rows from a df according to a percentage. Here: keep the first 80% (already sorted by OccRate) per year and per month
df_80 = df.groupby(['year', 'month'], group_keys=False).apply(lambda g: g.head(int(g['nb_rows_keep_per_month_80'].iloc[0]))) # head() needs an int; assumes nb_rows_keep_per_month_80 is constant within each (year, month) group
## Create additional rows to have the same length for date
prices = prices.set_index(['date', 'listing_id'])\
.unstack(fill_value=0)\
.asfreq('D', fill_value=0)\
.stack().sort_index(level=1).reset_index()
## Fillna by mean within group
df['price_adjusted'] = df['price'].fillna(df.groupby(['listing_id', 'isWE'])['price'].transform('mean'))
## Fillna by the most frequent value within group / subgroup
df["DEBUT_ECHEANCE"] = df.groupby(['NUMERO_CONTRAT', 'NUM_SOUS_CONTRAT', 'MOIS_ANNEE'])['DEBUT_ECHEANCE'].transform(lambda x: x.fillna(x.mode()[0]))
## Round up a column: use ceil function from math
import math
df['bathrooms_adjusted'] = df.bathrooms.apply(lambda x: math.ceil(x))
## Create a dict from two lists
output = dict(zip(keys_list, values_list))
### MISSING VALUES
# Print only the columns with missing values and their corresponding %
df_mv = reservation2
total_miss = df_mv.isnull().sum().sort_values(ascending=False)
percent_miss = (df_mv.isnull().sum()/df_mv.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_miss, percent_miss], axis=1, keys=['Total_miss', 'Percent_miss'])
missing_data[missing_data.Percent_miss > 0]
THEN
mv_features = ['tax', 'fees']
for f in mv_features:
reservation2[f] = reservation2[f].fillna(0)
# Replace MV for one col depending on another column. Here replace MV "beds" by the most frequent value for beds in observations with the same value of bedrooms and bathrooms
df_na = listings[(listings.beds.isna()) & (~listings.bedrooms.isna()) & (~listings.bathrooms.isna())]
def fill_beds(row):
df_filtered = listings[(listings.bedrooms == row.bedrooms) & (listings.bathrooms == row.bathrooms)]
argmax = df_filtered['beds'].value_counts().idxmax() # argmax is the most frequent value for beds per subgroup
row['beds'] = argmax
return row
listings_imputed = df_na.apply(fill_beds, axis = 1)
listings.beds = listings.beds.fillna(listings_imputed.beds)
# (Variant) Replace MV for one col depending on another column. Here replace MV "beds" by "accommodates"/2 if accommodates > 1, else 1
c1 = (listings.accommodates > 1)
c2 = (listings.accommodates <= 1)
listings.loc[c1,'beds'] = listings.loc[c1,'beds'].fillna(listings.loc[c1,'accommodates']/2)
listings.loc[c2,'beds'] = listings.loc[c2,'beds'].fillna(1)
# (Variant) Replace MV by a specific value in one col for rows that have a specific value
id_list = [x1, x2]
df.loc[df.c1.isin(id_list), 'occ_rate_year'] = 100
# Get names of columns with missing values
cols_with_missing = [col for col in train.columns
if train[col].isnull().any()]
# Get columns for categorical features (replace by the most frequent occurence) and numerical features (replace by the mean)
categorical_cols = [cname for cname in all_data.columns if
all_data[cname].nunique() < 10 and #to limit the size of the encoding
all_data[cname].dtype == "object" and
all_data[cname].isnull().sum() > 0
]
numerical_cols = [cname for cname in all_data.columns if
all_data[cname].dtype in ['int64', 'float64'] and
all_data[cname].isnull().sum() > 0
]
# Replace by the mean (num) and the most frequent values (cat). For num, not always by the mean (median or 0)
for feature in categorical_cols:
all_data[feature] = all_data[feature].fillna(all_data[feature].dropna().mode()[0])
for feature in numerical_cols:
all_data[feature] = all_data[feature].fillna(all_data[feature].mean())
# (Variant) Complete missing values with the median (numerical) or with a constant like "Unknown" (categorical)
train['Age'] = train['Age'].fillna(train['Age'].median()) # or .fillna("Unknown") for a categorical column
# (Variant) Complete missing values with the most frequent occurrence (letter or numerical category). Ex: most frequent port of embarkation
argmax = train.Embarked.value_counts().idxmax()
train["Embarked"] = train["Embarked"].fillna(argmax)
## Create a function that fills MV for beds with the most frequent value for beds in observations with the same value of bedrooms and bathrooms
# First, keep the rows that have NA for beds and not NA for bedrooms and bathrooms
df_beds_na = df[(df.beds.isna()) & (~df.bedrooms.isna()) & (~df.bathrooms.isna())]
# Create the function
def fill_beds(row):
df_filtered = df[(df.bedrooms == row.bedrooms) & (df.bathrooms == row.bathrooms)]
argmax = df_filtered.beds.value_counts().idxmax()
row['beds'] = argmax
return row
# Apply the function
df_beds_na_for_merge = df_beds_na.apply(fill_beds, axis = 1) # don't forget axis = 1
# (Variant) Complete missing values by using other correlated features instead of replacing by the mean of the missing variable
# Guess Age values using median values for Age across sets of Pclass and Gender feature combinations
guess_ages = np.zeros((2,3)) # (2,3) because Sex has 2 values and Pclass has 3
for i in range(0, 2):
for j in range(0, 3):
guess_df = df[(df['Sex'] == i) & \
(df['Pclass'] == j+1)]['Age'].dropna()
# age_mean = guess_df.mean()
# age_std = guess_df.std()
# age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)
age_guess = guess_df.median()
# Convert random age float to nearest .5 age
guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
for i in range(0, 2):
for j in range(0, 3):
df.loc[ (df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j+1),\
'Age'] = guess_ages[i,j]
df['Age'] = df['Age'].astype(int)
# Drop na from specific column
df = df.dropna(subset = ['colA'])
# Groupby year and pivot the table to get one column per year
df_evolution = df.groupby(['year', 'housing_name'])['revenu'].sum().reset_index() # reset_index to get back a flat DataFrame
df_evolution['year'] = df_evolution['year'].astype(str)
df_evolution_pivot = df_evolution.pivot(index='housing_name', columns='year', values='revenu')
# Replace date for specific row
reservation.at[reservation.loc[reservation.code == 'resa_code'].index[0], 'departure_date']= pd.Timestamp('2019-12-31')
# Take the date of the first resa
reservation['first_resa'] = reservation.groupby(['housing_id', 'housing_name'])['arrival_date'].transform("min") # Add column first_resa with the earliest resa per housing
# Split resa per day: have one row per day of reservation
reservation_day0 = reservation2[['id', 'arrival_date', 'departure_date']]
reservation_day0 = pd.melt(reservation_day0, id_vars='id', value_name='date')
reservation_day0.date = pd.to_datetime(reservation_day0.date)
reservation_day0.set_index('date', inplace=True)
reservation_day0.drop('variable', axis=1, inplace=True)
reservation_day = reservation_day0.groupby('id').resample('D').ffill().reset_index(level=0, drop=True).reset_index() # Resample per day
reservation_day = pd.merge(reservation2, reservation_day)
# Drop latest day per resa because we only count the nb of nights and not days
reservation_day= reservation_day.drop(reservation_day[reservation_day['departure_date'] == reservation_day['date']].index)
## CREATE FUNCTION with if condition
def is_studio_price(bedroom_count, nightPrice):
is_studio = 1 if bedroom_count == 0 else 0
if is_studio ==1:
nightPrice *= 0.10 # Example: decrease nightPrice by 90% if it is a studio
return nightPrice
### Categorizing
# Create ranges of age for plotting
train['Age_bins']=pd.cut(x=train['Age'], bins=[0,19, 39, 59,79], labels=['0-10s','20-30s','40-50s','60-70s']) # number of labels should be one less than bins
or (BETTER)
train['AgeBand'] = pd.cut(train['Age'], 5) # Create 5 categories of Age. We can add labels if we want. Use qcut to get balanced categories (see the sketch below)
pd.crosstab(train.Age_bins, train.Survived).plot(kind="bar") #Survived = target
# Convert age into fixed categories of Age after the pd.cut
dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
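A minimal sketch of the pd.qcut variant mentioned above; 'Fare' and the labels are just placeholders:
train['FareBand'] = pd.qcut(train['Fare'], 4, labels=['low', 'mid', 'high', 'very_high']) # quantile-based bins
train['FareBand'].value_counts() # each band holds roughly 25% of the rows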
# Show rows only if the value is in a list. Use isin() for strings and numbers
doublon_list = [0, 999]
housing[housing.housing_id.isin(doublon_list) ][['housing_id', 'housing_name']]
### Creation of new feature. Some ideas
# TITANIC. Create another feature called IsAlone. Reminder: combine = [train, test]
for dataset in combine:
dataset["IsAlone"]=0 #Creation of the new feature
dataset.loc[dataset["FamilySize"] == 1, "IsAlone"] = 1 # 1 if the person is alone
# HOUSE PRICING. Create a variable for the total size of the house. Try to group correlated variables
all_data["tot_surface"] = all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]
# Feature engineering for time series
df['rolling_average_volume_past_3_months'] = df.sort_values(['id', 'month']).groupby(['id'])['target'].apply(lambda x: x.shift().rolling(3).mean())
### TO GO FURTHER: CHECK NORMALITY. The variables should be normal
from scipy import stats
#histogram and normal probability plot
sns.distplot(y_train); # distplot is deprecated in recent seaborn: sns.histplot(y_train, kde=True) is the replacement
fig = plt.figure()
res = stats.probplot(y_train, plot=plt) #The variable has to follow the red line (that represents the normal distribution).
# In case of positive skewness (long right tail), apply the log
y_train = np.log(y_train) #Recheck now. Be careful on the submission, we need to compute the exponential
and don't forget: y_pred = np.exp(y_pred)
# Skewness measures the lack of symmetry in the data distribution. It should be close to 0.
# Kurtosis measures the weight of the tails (outliers). For a normal distribution the kurtosis is 3; values well above 3 indicate heavy tails.
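A quick way to check both on a pandas Series (assuming y_train is a Series; note that pandas .kurt() returns the excess kurtosis, so ~0 means close to normal):
print(f"Skewness: {y_train.skew():.2f}")
print(f"Excess kurtosis: {y_train.kurt():.2f}")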
## Detect outliers
plt.boxplot(df['target'])
Q1 = df['target'].quantile(0.25)
Q3 = df['target'].quantile(0.75)
IQR = Q3 - Q1
print(IQR)
df['target'] < (Q1 - 1.5 * IQR) #lower bound
df['target'] > (Q3 + 1.5 * IQR) #upper bound
# 1.5 is a convention: with 1 we flag too many outliers, with 2 too few. 1.5*IQR roughly matches the 3σ rule of the Gaussian (points beyond ~3σ are outliers)
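A minimal sketch of the actual filtering, reusing Q1, Q3 and IQR from above:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_no_outliers = df[(df['target'] >= lower_bound) & (df['target'] <= upper_bound)]
print(f"Removed {len(df) - len(df_no_outliers)} outliers")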
### GRAPH
# Gmaps
conda install -c conda-forge gmaps
import gmaps
gmaps.configure(api_key='YourAPI')
fig = gmaps.figure()
heatmap_layer_inside = gmaps.heatmap_layer(
df[['lat','lng']],
weights=df[criterion]
)
fig.add_layer(heatmap_layer_inside)
fig
# DOUBLE LINE GRAPH
ax = df.plot(x = 'month', y= ['Y1', 'Y2'], figsize = (25,10), title = 'yourTitle')
_ = plt.xticks(fontsize = 16, rotation=45) # tip for changing the ticks
# Bar plot horizontal
ax = df.sort_values('revenue', ascending=True).plot.barh(x='quartier', y='revenue', figsize = (15,15), title = 'yourTitle')
# Show text on top of bars (vertical bars, as below; for horizontal bars use rect.get_width() and rect.get_y() instead)
bars = ax.patches
# Get labels to show
labels = round(df.revenue,0).astype(int)
for rect, label in zip(bars, labels):
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width() / 2, height + 5, label,
ha='center', va='bottom')
# Bar plot from df with values as text on the graph
ax = sns.barplot(x='category', y="price", data=df_plot)
for _, row in df_plot.iterrows(): # Add text on the graph (iterate over the plotted dataframe)
ax.text(row.name,row.price, round(row.price,2), color='black', ha="center")
ax.set_title('My_Title') # Add title
# COMBINED BAR AND LINE CHART (SEABORN)
fig, ax1 = plt.subplots(figsize=(20,9))
#bar plot creation
color1 = 'lightgray'
ax1.set_title('yourTitle', fontsize=24)
ax1.set_xlabel('X', fontsize=14)
ax1 = sns.barplot(x='X', y='Y', data = df_plot, color = color1)
ax1.set_ylabel('Y', fontsize=14, color = "black")
plt.xticks(rotation=45) # for better reading
#specify we want to share the same x-axis
ax2 = ax1.twinx()
#line plot creation
color2 = "green"
ax2.set_ylabel('Y2', fontsize=14, color = color2 )
ax2 = sns.lineplot(x=range(0,len(df_plot.Y2)), y='Y2', data = df_plot, sort=False, color=color2)
# Show yearly occ_rate
ax3 = ax2.twiny()
# line plot creation
color3 = 'navy'
ax3 = sns.lineplot(x=range(0,len(df_plot.Y3)), y='Y3', data=df_plot, sort=False, color = color3)
# Round the values on the axis
from matplotlib.ticker import FormatStrFormatter
ax1.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
#show plot
plt.show()
# PIE CHART
labels = df.country_name
sizes = df.ratio
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
shadow=True, startangle=90)
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
# CONVERT PHONE NUMBER TO COUNTRY CODE
pip install phonenumbers
import phonenumbers
from phonenumbers.phonenumberutil import (
region_code_for_country_code,
region_code_for_number
)
country_code = []
phone_number = []
incorrect_number = []
def number_to_countryCode(df):
    for n in df.phone:
        try:
            pn = phonenumbers.parse(f'{n}')
            code = region_code_for_country_code(pn.country_code)
            phone_number.append(n)
            country_code.append(code)
        except phonenumbers.NumberParseException:
            incorrect_number.append(n)
## DATA MANIPULATION
# Take the value of a cell in a dataframe by referring to a specific value in a column ==> we get the string, not the array
neighborhood = df[df.housing_id == id].neighborhood.values[0]
### ENCODING
BEST PRACTICE : MANUALLY ENCODE WHEN ORDER MATTERS AND ONE-HOT ENCODE WHEN NO HIERARCHY
# Manual encoding
housing.id = np.where(housing.id > 0, 1, 0)
# Binary encoding
df['instant_bookable'] = 1 * (df['instant_bookable'] == "t") # t/f becomes True/False, and multiplying by 1 converts it to 1/0
# Converting numeric features to categorical features. Especially for the year or month --> need to create categories
from sklearn.preprocessing import LabelEncoder
str_cols = ['YrSold','YearRemodAdd','YearBuilt','MoSold','MSSubClass']
for col in str_cols:
all_data[col] = all_data[col].astype(str)
le = LabelEncoder()
arr = le.fit_transform(all_data[col])
all_data[col] = arr
# Count the occurence for each value for categorical feature
train.col1.value_counts()
# Convert string categorical vars into numerical ordered manually. Important to do that manually cuz order matters (Ordinal encoding)
all_data['GarageQual'].unique()
all_data["GarageQual"]=all_data["GaraQual"].map({'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}).astype(int)
# One-hot encoding for categorical features using pd.dummies
cat_df = all_data.select_dtypes(include=['object']) # Take all the categorical features (strings); np.object is removed in recent numpy
for col in cat_df.columns:
cat_df = pd.concat([cat_df, pd.get_dummies(cat_df[col], drop_first=True)], axis=1)
cat_df.drop(col, axis=1, inplace=True)
data_encoded = pd.concat([all_data.select_dtypes(include=[np.number]), # Concat with the numerical features
                          cat_df], axis=1) # cat_df now only holds the dummy columns, the original categorical features were dropped in the loop
data_encoded.head(2)
# One-hot encoding using pandas (easier)
all_data = pd.get_dummies(all_data)
# Encode with LabelEncoder. THE DIFFERENCE BETWEEN ONE-HOT AND LABEL ENCODING IS THAT ONE-HOT DOES NOT IMPLY A HIERARCHY.
from sklearn.preprocessing import LabelEncoder
# Select all cat feature to encode: variables with categories and a hierarchy
cat_features = ['MSSubClass', 'MSZoning', 'Street',
'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
'OverallQual', 'OverallCond', 'RoofStyle',
'RoofMatl', 'MasVnrType',
'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC',
'CentralAir', 'Electrical',
'KitchenQual',
'Functional', 'Fireplaces', 'GarageType', 'GarageFinish',
'GarageQual', 'GarageCond', 'PavedDrive',
'SaleType',
'SaleCondition']
le_name_mapping = {}
for feature in cat_features:
    enc = LabelEncoder()
    all_data[feature] = enc.fit_transform(all_data[feature]) # Not the best way, I still have to find a better one. Not good to fit the encoder on the test set
    # Keep the mapping
    le_name_mapping[feature] = dict(zip(enc.classes_, enc.transform(enc.classes_)))
# Hierarchical label clustering
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
def hc_label_encoding(df, feature):
def pearson_dist(x, y):
r = stats.pearsonr(x, y)[0]
return (1 - abs(r))
def encode_espece(df):
data = pd.pivot_table(data=df,columns="month",values="target",index=feature).fillna(0)
data = data.loc[(data.transpose().sum()!=0)] # drop the all-zero rows (correlation not defined for them)
distance_matrix = data.transpose().corr().applymap(lambda x:1-x)
k=len(data.index)
model = AgglomerativeClustering(
n_clusters=k,
affinity = "precomputed",
linkage = "average",
distance_threshold=None,
compute_distances = True
)
clustering = model.fit(distance_matrix)
# labels is the new encoding
labels = clustering.labels_
data["cluster"]=labels
return labels,data
df_hc_label = df[df.level == level_chosen]
labels,data = encode_espece(df_hc_label)
data = data.reset_index()
df_cluster_feature = data[[f'{feature}', "cluster"]]
return df_cluster_feature
# Using Robust Scaler to transform X_train. Aim: scale features to be robust to outliers
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
X_train_scaled = robust_scaler.fit_transform(X)
X_test_scaled = robust_scaler.transform(test)
GOOD PRACTICE: USE A CLASS TRANSFORMER ==> define a class that learns the data preprocessing on the train set and then applies it to the test set (see the sketch below)
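A minimal sketch of such a transformer following the sklearn fit/transform convention; the column names and the imputation logic are placeholders:
from sklearn.base import BaseEstimator, TransformerMixin
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, num_cols, cat_cols):
        self.num_cols = num_cols
        self.cat_cols = cat_cols
    def fit(self, X, y=None):
        # Everything learned here comes from the train set only
        self.means_ = X[self.num_cols].mean()
        self.modes_ = X[self.cat_cols].mode().iloc[0]
        return self
    def transform(self, X):
        X = X.copy()
        X[self.num_cols] = X[self.num_cols].fillna(self.means_)
        X[self.cat_cols] = X[self.cat_cols].fillna(self.modes_)
        return X
# Usage: fit on train only, then apply the same transformation to both sets
prep = Preprocessor(num_cols=['surface'], cat_cols=['quartier']) # placeholder columns
X_train_prep = prep.fit_transform(X_train)
X_test_prep = prep.transform(X_test)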
4) Modeling
# split the dataset into training and validation datasets
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=94,
                                                      stratify=y) # random_state makes the split reproducible; stratify keeps the label balance
## GROUPSHUFFLESPLIT
# Sometimes we need to split dataset by group (listing_id, years, months etc). Otherwise we'd have some data both in train and test set
# For ex: Given listing_id = 94, if we split "classically", we probably have some prices for some weeks in train while prices for other weeks in test!
from sklearn.model_selection import GroupShuffleSplit
# next() is needed because split() yields one (train_index, test_index) pair per split; next() just takes the first pair from that generator
train_index, test_index = next(GroupShuffleSplit(test_size=.30, n_splits=2, random_state = 94).split(df, groups=df['listing_id']))
train = df.iloc[train_index]
test = df.iloc[test_index]
###########
# GroupShuffleSplit for hyperparameter optimization and CV (because we also need to split the data there). Example with 20 random group splits for CV
gss = GroupShuffleSplit(n_splits=20, test_size=0.3, random_state=94)
scores_train, scores_test = [], [] # initialize the score lists before the loop
for train_index, test_index in gss.split(df, groups=df['listing_id']):
X = df.drop('target', axis = 1)
y = df['target'].astype(float)
X_fold_train = X.iloc[train_index, :]
y_fold_train = y.iloc[train_index]
X_fold_test = X.iloc[test_index, :]
y_fold_test = y.iloc[test_index]
# Fit
model = LGBMRegressor()
model.fit(X_fold_train, y_fold_train, eval_metric = 'mae')
# Predict on train and test
y_fold_pred_train = model.predict(X_fold_train)
y_fold_pred_test = model.predict(X_fold_test)
# Evaluate
scores_train.append(
round(mean_absolute_error(y_fold_train, y_fold_pred_train),2)
)
scores_test.append(
round(mean_absolute_error(y_fold_test, y_fold_pred_test),2)
)
final_test_score = round(np.mean(scores_test),2)
# Display results
print(f"MAE Scores train are: {scores_train}")
print()
print(f"MAE Scores test are: {scores_test}")
print()
print(f"MAE final test score is {final_test_score}")
TIPS:
We can do cross-validation on the whole dataset to EVALUATE the model at the end (after fit/predict, checking overfitting by comparing train and test results).
For a small dataset (<100K rows), use cross-validation.
How to choose K? The value of K shouldn't be too small or too high, ideally 5 to 10 depending on the data size.
A higher value of K leads to a less biased model, but the larger variance might lead to overfitting. A minimal sketch is shown below.
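A minimal sketch of a 5-fold evaluation on the whole dataset; model, X and y are placeholders for an already-defined estimator and data:
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=5, shuffle=True, random_state=94)
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=kf)
print(f"MAE per fold: {-scores.round(2)}")
print(f"Mean MAE: {-scores.mean():.2f} (+/- {scores.std():.2f})")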
REGRESSION METRICS:
3 main possibilities: R2, (R)MSE and MAE.
- R2 = how well the model fits the dependent variable. Not good for detecting overfitting. Solution: adjusted R2
- RMSE = how far off your predictions are. Use RMSE rather than MSE because it is in the same unit as the observations
- MAE = similar, but takes the absolute error instead of the squared error. RMSE penalises large prediction errors more, whereas MAE treats them all equally
Conclusion: prefer RMSE to compare models and when you want to penalise large prediction errors more
Code using sklearn:
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(y_test, y_pred)))
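Since sklearn has no built-in adjusted R2 (mentioned above), here is a minimal sketch computing all three metrics plus adjusted R2; y_test, y_pred and X_test are placeholders:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
n, p = X_test.shape # n = number of rows, p = number of features
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"MAE={mae:.2f}  RMSE={rmse:.2f}  R2={r2:.3f}  adjusted R2={adj_r2:.3f}")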
Steps: Gridsearch (optimization) on X_train and cross-validation on X !
## Blent.ai: evaluate model
# Import regressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
# Import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Set model to test
lr = LinearRegression()
rf = RandomForestRegressor()
xgb = XGBRegressor()
models = [lr, rf, xgb]
names = ['Linear Regression','Random Forest','XGBoost']
# Evaluate each model using minmax scaler
for model, name in zip(models, names):
# Fit on train
model.fit(X_train_scaled_minmax, y_train)
# Predict on train and test
y_pred_train = model.predict(X_train_scaled_minmax)
y_pred_test = model.predict(X_test_scaled_minmax)
# Evaluate and watch overfitting
print(f'{name}:')
print(f'MAE train: {round(mean_absolute_error(np.exp(y_train), np.exp(y_pred_train)),2)}€') # exp cuz log_price
print(f'MAE test: {round(mean_absolute_error(np.exp(y_test), np.exp(y_pred_test)),2)}€')
print(f'R2 train: {round(r2_score(np.exp(y_train), np.exp(y_pred_train)),2)*100} %')
print(f'R2 test: {round(r2_score(np.exp(y_test), np.exp(y_pred_test)),2)*100} %')
print(f'RMSE train: {round(np.sqrt(mean_squared_error(np.exp(y_train), np.exp(y_pred_train))),2)}€')
print(f'RMSE test: {round(np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred_test))),2)}€')
print()
# Optimization: GridSearchCV (for Gradient Boosting, do it by pairs). GridSearch on X_train
from sklearn.model_selection import GridSearchCV
grid_parameters = {
'max_depth': [3, 8, 12, 15],
'min_child_weight': [1, 3, 5, 7]}
grid_cv = GridSearchCV(
estimator=XGBRegressor(**params), # params = dict of hyperparameters already fixed from previous searches
param_grid=grid_parameters,
scoring="neg_mean_absolute_error",
cv=2,
verbose=1)
# Launch the grid search (can take around 5 minutes)
grid_cv.fit(X_train, y_train)
# Show results
from matplotlib.ticker import PercentFormatter
score_matrix = np.asarray(grid_cv.cv_results_["mean_test_score"]).reshape((4, 4))
sns.set(font_scale=1.5)
fig = plt.figure(figsize=(16, 12))
plt.title("Scores MAE")
ax = sns.heatmap(
    score_matrix, # Matrix of scores
    annot=True, # Display the scores
    fmt='.02f', # Display with 2 decimals
    yticklabels=grid_parameters["max_depth"], # Labels of the vertical axis
    xticklabels=grid_parameters["min_child_weight"], # Labels of the horizontal axis
    cmap="Blues", # Color palette
)
sns.set(font_scale=1)
ax.set(ylabel="max_depth", xlabel="min_child_weight")
plt.show()
# summarize result
print('Best Score: %s' % grid_cv.best_score_)
print('Best Hyperparameters: %s' % grid_cv.best_params_)
# Define a root mean square error function using cross-validation
from sklearn.model_selection import KFold, cross_val_score
def rmse(model, X, y):
rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
return rmse
or
accuracy = cross_val_score(model, X, y, scoring = "accuracy", cv=5) # for accuracy
# Perform 5-folds cross-validation to evaluate the models
for model, name in zip(models, names):
# Root mean square error
score = rmse(model, X_train_scaled, y)