A. FULL guideline for Machine Learning
PERSONAL TIPS COLLECTED DURING DS PROJECTS
1) IMPORTATION
FILE MANAGEMENT
import os
os.getcwd() #to get current working directory
os.chdir(path) #change directory to another path
# Go 3 levels up to get a data file stored in another directory named data
dataset = pd.read_csv('../../../data/dataset.csv')
# Import csv from gcp bucket into jupyter notebook
import pandas as pd
from google.cloud import storage
from io import BytesIO
PROJECT_ID = "your_project_id"
BUCKET_NAME = "your_bucket_name"
FILE_NAME = "your_file_name.csv"
client = storage.Client(project=PROJECT_ID)
bucket = client.get_bucket(BUCKET_NAME)
blob = bucket.get_blob(FILE_NAME)
content = blob.download_as_string()
df = pd.read_csv(BytesIO(content))
# Export csv from notebook to gcp bucket
from google.cloud import storage
def upload_to_bucket(bucket_name, blob_path, local_path):
bucket = storage.Client(project=PROJECT_ID).bucket(bucket_name)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_path)
return "Done!"
# method call
bucket_name = 'your_bucket_name' # do not give gs:// ,just bucket name
blob_path = 'data_intermediate/df_merge.csv' # path where you want to store the file
local_path = '../data/df_merge.csv' #local file path
upload_to_bucket(bucket_name, blob_path, local_path)
# Import and perform operations on multiple dataframes
month_number = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
df_dict = {m: pd.DataFrame() for m in month_number}
for k in df_dict:
# Import multiple dataframes in one line of code
df_dict[k] = pd.read_csv('../../Datasets/inside_airbnb/calendar/2019/calendar_' + k + '.csv')
print(f'shape before filter for month {k} is {df_dict[k].shape}')
df_dict[k]['date'] = pd.to_datetime(df_dict[k]['date'])
df_dict[k] = df_dict[k][df_dict[k].date.dt.month == int(k)]
df_dict[k] = df_dict[k][(df_dict[k].minimum_nights <= 3) & (df_dict[k].maximum_nights >= 2)]
print(f'shape after filter for month {k} is {df_dict[k].shape}')
# Load a json file and convert it to a pandas dataframe, preserving tuples as values
import json
with open('path_to_your_json_file.json') as json_file:
    dict_input = json.load(json_file)
data = {k:[v] for k,v in zip(dict_input.keys(), dict_input.values())}
df_input = pd.DataFrame(data)
# Read compressed csv and filter on rows
iter_csv = pd.read_csv('test.csv.gz', usecols=cols_to_keep_listings, compression='gzip', na_values='\\N', iterator=True, chunksize=1000)
listings = pd.concat([chunk[(chunk['data_type'] == 'ENTIRE_HOME') &
(chunk['data_address_city'] == 'Paris') &
(chunk['data_subtype'].isin(['APARTMENT', 'HOTEL', 'OTHER', 'APARTHOTEL', 'BED_&_BREAKFAST'])) &
(chunk['unifiedId'].isin(listings_ids))
] for chunk in iter_csv])
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
pd.set_option('display.max_columns', None) # To show all columns
## Connect to a database-backed API and make HTTP requests with an authentication token, using the requests library
import requests
# Get a token (valid for 1 day)
url_token = 'API_token'
credentials = {'mail': 'user_mail',
               'password': 'user_password'}
res = requests.post(url_token, data=credentials)
token = res.json()['token']
# Get housing with the token and transform to DataFrame to be able to manipulate data in Jupyter Notebook. Get specific columns, not all columns in database
url_housing = 'API_housing?columns=host_id;housing_id;housing_name'
headers = {'Authorization': f'Bearer {token}'} # Need to use authorization header in every requests
housing_req = requests.get(url_housing, headers=headers)
housing = housing_req.json()
housing = pd.DataFrame(housing)
# Put new housing neighborhood (modify the value directly in the database)
for id in update_housing.housing_id:
url_housing_put = f'API_housing/{id}'
param = {'neighborhood': f'{update_housing.loc[update_housing.housing_id == id].neighborhood.values[0]}'}
housing_put = requests.put(url_housing_put, data = param, headers = headers)
# Date
from datetime import datetime, date, timedelta
import calendar
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
Import Datasets
# from Kaggle
import os
os.environ['KAGGLE_USERNAME'] = "your_kaggle_username" # username from the kaggle.json file
os.environ['KAGGLE_KEY'] = "your_kaggle_key" # key from the kaggle.json file
!kaggle competitions download -c hecmontrealdeeplearningcourse # API command copied from Kaggle (get it from the competition page)
train = pd.read_csv("train.csv", index_col = "chosen column)
# on Colab
from google.colab import files
uploaded = files.upload()
import io
train=pd.read_csv(io.BytesIO(uploaded['train.csv']))
# on Jupyter
train = pd.read_csv('../Datasets/train.csv')
train.describe() # train.describe(include=['O']) to get only the string columns; include='all' is also possible
# Check that the target variable looks sensible; sometimes there are very bad mistakes that would ruin the model (like a negative price)
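A minimal sanity check, assuming the target is a hypothetical 'price' column:
(train['price'] <= 0).sum() # count of impossible values, e.g. negative or zero prices
train['price'].describe() # min / max / quartiles help spot absurd values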
train["col"].nunique() # To see how many unique values in a column
train["col"].value_counts() # To see the value distribution of a column
2) UNDERSTAND DATA
# Some useful definitions
List is a collection which is ordered and changeable (can add/remove items). Allows duplicate members.
Tuple is a collection which is ordered and unchangeable (cannot add/remove items). Allows duplicate members.
Set is a collection which is unordered and unindexed. No duplicate members.
Dictionary is a collection which is ordered (since Python 3.7) and changeable. No duplicate keys.
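A quick illustration of the four collection types (values are arbitrary):
my_list = [1, 2, 2, 3]       # ordered, mutable, duplicates allowed
my_tuple = (1, 2, 2, 3)      # ordered, immutable, duplicates allowed
my_set = {1, 2, 2, 3}        # becomes {1, 2, 3}: duplicates dropped, no order guaranteed
my_dict = {'a': 1, 'a': 2}   # becomes {'a': 2}: keys are unique, the last value wins
my_list.append(4)            # OK
# my_tuple.append(4)         # AttributeError: tuples cannot be modified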
Select the most important features to check the outliers and to check multicollinearity (reduce the number of features)
# Heatmap to see how features are correlated
corrmat = train.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
# Plot the Spearman correlation (better than Pearson when the relationship is not linear or the data is not Gaussian, but less powerful when the relationship is truly linear)
def display_correlation(df):
    r = df.corr(method="spearman")
    plt.figure(figsize=(10,6))
    heatmap = sns.heatmap(r, vmin=-1, vmax=1, annot=True) # plot the Spearman matrix r, not the default (Pearson) df.corr()
    plt.title("Spearman Correlation")
    return r
# Heatmap to check the features that contribute the most to the target variable
import seaborn as sns
k = 25 #number of variables for heatmap
cols = df.corr().nlargest(k, 'price')['price'].index
cm = np.corrcoef(df[cols].values.T)
sns.set(font_scale=1.25)
fig, ax = plt.subplots(figsize=(12,12)) # Sample figsize in inches
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
# Check feature distribution
df = df_preprocessed_final
target = 'surface'
plt.figure(figsize=(10,5)) # Set figure size before calling plt.plot()
df[f'{target}'].plot.hist(bins=60)
plt.xlabel('Target')
plt.title('Target distribution')
# Spot outliers. Strategy: delete only the very bad outliers, not all of them, so the model stays robust to outliers
fig, ax = plt.subplots()
feature = "TotalBsmtSF"
target = 'SalePrice'
ax.scatter(x = train[feature], y = train[target])
plt.ylabel(target, fontsize=13)
plt.xlabel(feature, fontsize=13)
plt.show()
# Drop chosen outliers
train = train.drop(train[(train[feature]>4000) & (train[target]<300000)].index) # Example for the house pricing
# Multiple plots TO COMPLETE
g = sns.FacetGrid(train, col='y') # Initialize the FacetGrid with the data (train) and the target var
g.map(plt.hist, 'X', bins=20) # Call plotting function .map with interval of 20 (for age for ex)
# Groupby using different aggregation methods: mean and sum and custom function (here keep the mode of the cluster feature)
df_quartier = df.groupby(['region','quartier']).agg(revenue_brut_sum = ('revenue_brut_year', 'sum'),
revenue_brut_night_mean = ('revenue_brut_moyen_night', 'mean'),
cluster_mode = ('cluster', lambda x: x.value_counts().index[0])).reset_index()
# Groupby and show unique values within subgroup
df.groupby("region")["departement"].apply(set).to_frame()
# Move specific col to specific place. col = df.pop(col) => keep the col and remove it from df
y_name = 'arnd'
y = df.pop(y_name)
df.insert(1, y_name, y) # Put it as the 2nd column
# Find elements not in a list
set(update_df.housing_id) - set(start_df.housing_id) # gives IDs from update_df that are not in start_df
# Groupby and compute percentage within group / subgroup
df_tmp = df.groupby(['Country', 'City']).agg({'revenue':'sum'})
df_tmp.groupby(level=0).apply(lambda x:100 * x / float(x.sum()))
# Groupby weighted average
import functools
# Custom function for weighted average
def my_w_avg(s, df, wcol):
    return np.average(s, weights=df.loc[s.index, wcol])
df_agg = df.groupby(groupby_cols).agg(
VOLUME = ('VOLUME', 'sum'),
weights = ('weights', 'mean'),
score = ('score', functools.partial(my_w_avg, df=df, wcol='weights')),
).reset_index()
# Create dataframe using dictionary
col = ['col1', 'col2', 'col3']
#val = [[0 for k in range(10)], [1 for k in range(10)], [2 for k in range(10)]]
val = [0, 1, 2]
d = {k: [v] for k,v in zip(col,val)}
df = pd.DataFrame(d)
# Create dataframe using three lists
df = pd.DataFrame({'lst1Tite' : lst1,
                   'lst2Tite' : lst2,
                   'lst3Tite' : lst3},
                  columns=['lst1Tite', 'lst2Tite', 'lst3Tite'])
# Split a column of tuples
df['a'], df['b'] = zip(*df['col']) # unpack each tuple into two new columns (df.col.str no longer works in recent pandas)
3) DATA PREPROCESSING
Data pre-processing must be applied to both the train and test sets, but WORK ON THE TRAIN FIRST, THEN APPLY THE SAME TRANSFORMATIONS TO THE TEST SET
# CONCATENATE train and test set to apply some data preprocessing
combine=pd.concat([train,test])
# MERGE 2 dataframes using the same column
new_train = pd.merge(train, text1, on='id', how='outer')
# MAP one column but need unique index
housing['quartier'] = housing.housing_id.map(df_quartier.set_index('housing_id')['quartier'])
# CREATE NEW COLUMN ON CONDITION
conditions = [(housing.postal_code.str.startswith('75')),(housing.postal_code.str.startswith(('78','91', '92', '93', '94', '95')))]
values = ['paris', 'banlieue']
housing['region'] = np.select(conditions, values)
# Extract first n characters from left of columns in pandas
df['zipcode'].str[:5]
# Keep only rows whose strings start with '75'
df_paris = df.loc[df.zipcode.str.startswith('75', na=False)]
# Set max value in a column to 30
maxVal = 30
df['col'] = df['col'].where(df['col'] <= maxVal, maxVal)
# Use function map() to apply a function for every items in a list
list_with_int = list(map(int, list_with_strings))
# VARIANT to compute the ratio per group (subtotal)
sub_total = df.groupby('cluster')['count'].sum().reset_index().rename(columns={'count':'sub_total'})
df = pd.merge(df, sub_total, on='cluster', how='left')
df['ratio_per_group'] = round(df['count']/df['sub_total'], 2)*100 # use bracket access: df.count is the DataFrame method, not the column
# Compute subtotal (sub total) to have the ratio per group
total_resa_year_region_sleeps = df_booking_moments.groupby(['year','region', 'sleeps_max_cat'])['resa_count'].transform('sum')
df_booking_moments['sub_total'] = total_resa_year_region_sleeps
df_booking_moments['ratio'] = round(df_booking_moments.resa_count / df_booking_moments.sub_total,2)*100
# Select rows that contain a value in a specific column
df[df['col'].str.contains("hello")]
# Replace the value for one specific id (replace condition) by another chosen value
df.loc[df['First Season'] > 1990, 'First Season'] = 1 # Need to select the column you want to change in the condition
# Convert a column and handle error. Here convert postal_code to int and return 999 if impossible to convert to int
def convert_postal_code(x):
    try:
        return int(x)
    except (ValueError, TypeError):
        return 999
df.col = df.col.map(convert_postal_code)
# Find out how many different values for a specific column after a group by
df_duplicates = df[['destination', 'month', 'client']]\
.reset_index()\
.groupby(['destination', 'month'])['index']\
.agg(list)\
.reset_index(name='list_client')
df_duplicates['len'] = df_duplicates['list_client'].str.len()
df_duplicates.sort_values(by='len', ascending=False)
# Delete only true duplicates
train = train.drop_duplicates("col", keep = "first")
or PERSONALIZED
duplicates = train[train.duplicated('text')]
duplicates = duplicates[['id', 'text', 'keyword', 'location', 'target']].groupby(['text'], as_index=False).count().sort_values(by='id', ascending=False)
true_duplicate = duplicates["id"] > 1 # Can adapt
duplicates[true_duplicate]
index_false_duplicate = duplicates[duplicates["id"] == 1].index
duplicates.drop(index_false_duplicate, inplace=True)
# Duplicates that are differently labeled
df_train['target_relabeled'] = df_train['target'].copy()
df_train.loc[df_train['text'] == 'original text', 'target_relabeled'] = 0 # or 1: choose the correct label
# Create a column of tuples. Useful for GPS coordinates using lat and lng
housing['GPS'] = tuple(zip(housing.lat, housing.lng))
# Fix issues with too many spaces between words in a column
df.date.map(lambda x: ' '.join(filter(None, x.split(' ')))) # split on spaces, filter out the empty strings, then join the remaining words with a single space
# Use enumerate to create dictionary for creating dataframe
map_admin = {k:v for k,v in enumerate(df_admin.admin_id.unique())} # enumerate returns the index (0,1,2,3, etc.) and pairs it with a value (admin_id)
# Use dict.items() to have a list of a dictionary's tuple pairs (key:value)
--> Useful for changing/mapping keys and values of a dict
output = {map_admin[k]:list(map(lambda x: map_housings[x], v)) for k,v in dict_labelers.items()}
# Use sub_df in a for loop to create a dataframe. Needed when creating a df from a dict whose keys hold multiple values of different sizes
for i, x in enumerate(my_dict.items()): # reminder: enumerate gives (index, value) pairs and dict.items() gives (key, value) pairs
    sub_dict = dict([x]) # create a one-key sub_dict per item of the original dict (named my_dict to avoid shadowing the dict builtin)
    sub_df = pd.DataFrame(sub_dict)
    if i == 0:
        df_final = sub_df.copy()
    else:
        df_final = pd.concat((df_final, sub_df))
# Select all columns starting the 4th column
df.iloc[:,4:]
# Check if intersection of two lists is null
L = [1,2,3,4,5]
M = [5,6,7]
bool(set(L) & set(M))
# CALCULATE DISTANCE (haversine distance)
pip install haversine # install module
from haversine import haversine
name = quartiers_paris.L_QU.unique() # Distance of each house from every quarters in Paris
for q,n in zip(quartiers_paris.GPS, name):
distances = []
for h in housing_paris.GPS:
distances.append(haversine(h,q))
housing_paris[n] = distances
# Create function assign_quartier
def assign_quartier(df):
    min_val_col_name = df.idxmin(axis=1) # Take the quartier with the minimum distance
    df['quartier'] = min_val_col_name
# Keep rows with specific values in a specific column
new_amenities = new_amenities.loc[new_amenities.title.isin(selected_amenities)] # Set selected_amenities list before
# Check amenities distribution (multiple items per row that are separated by ",")
pd.Series(np.concatenate(listings['amenities'].map(lambda x: x.split(",")))).value_counts()
# If multiple items are separated by other characters, we can create a cleaning function and use it with apply
def split_amenities(amenities):
amenity_list = amenities \
.replace("\"", "") \
.replace("{", "") \
.replace("}", "") \
.lower().split(",")
return amenity_list
listings['amenities'] = listings['amenities'].apply(split_amenities)
# Convert date to day of week name and add isWE
df['date'] = pd.to_datetime(df['date'])
df['weekday'] = df['date'].dt.weekday
df['weekday_name'] = df['date'].dt.day_name()
df['isWE'] = np.where((df.weekday == 4)|(df.weekday == 5), 1, 0) # weekday: Monday=0 ... Sunday=6; here Friday (4) and Saturday (5) nights count as the weekend
# Separate weekdays and weekend days
df_week = df[df.isWE == 0]
df_weekend = df[df.isWE == 1]
# Sanity check: no rows were lost or duplicated in the split
assert df_week.shape[0] + df_weekend.shape[0] == df.shape[0] # Use assert to catch any error early
## USING TRY EXCEPTION TO HANDLE EXCEPTION
# Function to convert the date format; if parsing fails (ParserError from the pendulum lib), return a default date
import maya
import pendulum
def convert_date_format(x):
    try:
        return maya.parse(x).datetime(to_timezone='Europe/Paris', naive=False).date() # Do this if no error
    except pendulum.parsing.exceptions.ParserError:
        return '2000-01-01' # Do something else if there is an error
housing['first_booking'] = housing['first_booking'].apply(convert_date_format)
## Keep rows from a df according to a percentage. Here: keep the first 80% (already sorted by OccRate) per year and per month
df_80 = df.groupby(['year', 'month'], group_keys=False).apply(lambda g: g.head(int(g['nb_rows_keep_per_month_80'].iloc[0]))) # head() needs an int; assumes nb_rows_keep_per_month_80 is constant within each (year, month) group
## Create additional rows to have the same length for date
prices = prices.set_index(['date', 'listing_id'])\
.unstack(fill_value=0)\
.asfreq('D', fill_value=0)\
.stack().sort_index(level=1).reset_index()
## Fillna by mean within group
df['price_adjusted'] = df['price'].fillna(df.groupby(['listing_id', 'isWE'])['price'].transform('mean'))
## Fillna by the most frequent value within group / subgroup
df["DEBUT_ECHEANCE"] = df.groupby(['NUMERO_CONTRAT', 'NUM_SOUS_CONTRAT', 'MOIS_ANNEE'])['DEBUT_ECHEANCE'].transform(lambda x: x.fillna(x.mode()[0]))
## Round up a column: use ceil function from math
import math
df['bathrooms_adjusted'] = df.bathrooms.apply(lambda x: math.ceil(x))
## Create a dict from two lists
output = dict(zip(keys_list, values_list))
### MISSING VALUES
# Print only the columns with missing values and their corresponding %
df_mv = reservation2
total_miss = df_mv.isnull().sum().sort_values(ascending=False)
percent_miss = (df_mv.isnull().sum()/df_mv.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total_miss, percent_miss], axis=1, keys=['Total_miss', 'Percent_miss'])
missing_data[missing_data.Percent_miss > 0]
THEN
mv_features = ['tax', 'fees']
for f in mv_features:
reservation2[f] = reservation2[f].fillna(0)
# Replace MV for one col depending on another column. Here replace MV "beds" by the most frequent value for beds in observations with the same value of bedrooms and bathrooms
df_na = listings[(listings.beds.isna()) & (~listings.bedrooms.isna()) & (~listings.bathrooms.isna())]
def fill_beds(row):
df_filtered = listings[(listings.bedrooms == row.bedrooms) & (listings.bathrooms == row.bathrooms)]
argmax = df_filtered['beds'].value_counts().idxmax() # argmax is the most frequent value for beds per subgroup
row['beds'] = argmax
return row
listings_imputed = df_na.apply(fill_beds, axis = 1)
listings.beds = listings.beds.fillna(listings_imputed.beds)
# (Variant) Replace MV for one col depending on another column. Here replace MV "beds" by "accommodates"/2 if accommodates > 1, else 1
c1 = (listings.accommodates > 1)
c2 = (listings.accommodates <= 1)
listings.loc[c1,'beds'] = listings.loc[c1,'beds'].fillna(listings.loc[c1,'accommodates']/2)
listings.loc[c2,'beds'] = listings.loc[c2,'beds'].fillna(1)
# (Variant) Replace MV by a specific value in one col for rows that have a specific value
id_list = [x1, x2]
df.loc[df.c1.isin(id_list), 'occ_rate_year'] = 100
# Get names of columns with missing values
cols_with_missing = [col for col in train.columns
if train[col].isnull().any()]
# Get columns for categorical features (replace by the most frequent occurence) and numerical features (replace by the mean)
categorical_cols = [cname for cname in all_data.columns if
all_data[cname].nunique() < 10 and #to limit the size of the encoding
all_data[cname].dtype == "object" and
all_data[cname].isnull().sum() > 0
]
numerical_cols = [cname for cname in all_data.columns if
all_data[cname].dtype in ['int64', 'float64'] and
all_data[cname].isnull().sum() > 0
]
# Replace by the mean (num) and the most frequent values (cat). For num, not always by the mean (median or 0)
for feature in categorical_cols:
all_data[feature] = all_data[feature].fillna(all_data[feature].dropna().mode()[0])
for feature in numerical_cols:
all_data[feature] = all_data[feature].fillna(all_data[feature].mean())
# (Variant) Complete missing values with the median (numerical) or with a constant like "Unknown" (categorical)
train['Age'] = train['Age'].fillna(train['Age'].median()) # or .fillna("Unknown") for a categorical column
# (Variant) Complete missing values with the most frequent occurrence (letter or numerical category). Ex: most frequent port of embarkation
argmax = train.Embarked.value_counts().idxmax()
train["Embarked"] = train["Embarked"].fillna(argmax)
## Create a function that fills MV for beds with the most frequent value for beds in observations with the same value of bedrooms and bathrooms
# First, keep the rows that have NA for beds and not NA for bedrooms and bathrooms
df_beds_na = df[(df.beds.isna()) & (~df.bedrooms.isna()) & (~df.bathrooms.isna())]
# Create the function
def fill_beds(row):
df_filtered = df[(df.bedrooms == row.bedrooms) & (df.bathrooms == row.bathrooms)]
argmax = df_filtered.beds.value_counts().idxmax()
row['beds'] = argmax
return row
# Apply the function
df_beds_na_for_merge = df_beds_na.apply(fill_beds, axis = 1) # don't forget axis = 1
# (Variant) Complete missing values by using other correlated features instead of replacing by the mean of the missing variable
# Guess Age values using median values for Age across sets of Pclass and Gender feature combinations
guess_ages = np.zeros((2,3)) # (2,3) because Sex has 2 values and Pclass has 3
for i in range(0, 2):
for j in range(0, 3):
guess_df = df[(df['Sex'] == i) & \
(df['Pclass'] == j+1)]['Age'].dropna()
# age_mean = guess_df.mean()
# age_std = guess_df.std()
# age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)
age_guess = guess_df.median()
# Convert random age float to nearest .5 age
guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
for i in range(0, 2):
for j in range(0, 3):
df.loc[ (df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j+1),\
'Age'] = guess_ages[i,j]
df['Age'] = df['Age'].astype(int)
# Drop na from specific column
df = df.dropna(subset = ['colA'])
# Groupby year and pivot the table to get one column per year
df_evolution = df.groupby(['year', 'housing_name'])['revenu'].sum().reset_index() # reset_index to get back a flat DataFrame
df_evolution['year'] = df_evolution['year'].astype(str)
df_evolution_pivot = df_evolution.pivot(index='housing_name', columns='year', values='revenu')
# Replace date for specific row
reservation.at[reservation.loc[reservation.code == 'resa_code'].index[0], 'departure_date']= pd.Timestamp('2019-12-31')
# Take the date of the first resa
reservation['first_resa'] = reservation.groupby(['housing_id', 'housing_name'])['arrival_date'].transform("min") # Add column first_resa with the earliest resa per housing
# Split resa per day: have one row per day of reservation
reservation_day0 = reservation2[['id', 'arrival_date', 'departure_date']]
reservation_day0 = pd.melt(reservation_day0, id_vars='id', value_name='date')
reservation_day0.date = pd.to_datetime(reservation_day0.date)
reservation_day0.set_index('date', inplace=True)
reservation_day0.drop('variable', axis=1, inplace=True)
reservation_day = reservation_day0.groupby('id').resample('D').ffill().reset_index(level=0, drop=True).reset_index() # Resample per day
reservation_day = pd.merge(reservation2, reservation_day)
# Drop latest day per resa because we only count the nb of nights and not days
reservation_day= reservation_day.drop(reservation_day[reservation_day['departure_date'] == reservation_day['date']].index)
## CREATE FUNCTION with if condition
def is_studio_price(bedroom_count, nightPrice):
is_studio = 1 if bedroom_count == 0 else 0
if is_studio ==1:
nightPrice *= 0.10 # Example: decrease nightPrice by 90% if it is a studio
return nightPrice
### Categorizing
# Create ranges of age for plotting
train['Age_bins']=pd.cut(x=train['Age'], bins=[0,19, 39, 59,79], labels=['0-10s','20-30s','40-50s','60-70s']) # number of labels should be one less than bins
or (BETTER)
train['AgeBand'] = pd.cut(train['Age'], 5) # Create 5 categories of Age. We can add labels if we want. Use qcut to get balanced categories (see the sketch below)
pd.crosstab(train.Age_bins, train.Survived).plot(kind="bar") #Survived = target
# Convert age into fixed categories of Age after the pd.cut
dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
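A minimal sketch of the pd.qcut variant mentioned above; 'Fare' and the labels are just placeholders:
train['FareBand'] = pd.qcut(train['Fare'], 4, labels=['low', 'mid', 'high', 'very_high']) # quantile-based bins
train['FareBand'].value_counts() # each band holds roughly 25% of the rows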
# Show rows only if the value is in a list. Use isin() for strings and numbers
doublon_list = [0, 999]
housing[housing.housing_id.isin(doublon_list) ][['housing_id', 'housing_name']]
### Creation of new feature. Some ideas
# TITANIC. Create another feature called IsAlone. Reminder: combine = [train, test]
for dataset in combine:
dataset["IsAlone"]=0 #Creation of the new feature
dataset.loc[dataset["FamilySize"] == 1, "IsAlone"] = 1 # 1 if the person is alone
# HOUSE PRICING. Create a variable for the total size of the house. Try to group correlated variables
all_data["tot_surface"] = all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]
# Feature engineering for time series
df['rolling_average_volume_past_3_months'] = df.sort_values(['id', 'month']).groupby(['id'])['target'].apply(lambda x: x.shift().rolling(3).mean())
### TO GO FURTHER: CHECK NORMALITY. The variables should be normal
from scipy import stats
#histogram and normal probability plot
sns.distplot(y_train); # distplot is deprecated in recent seaborn: sns.histplot(y_train, kde=True) is the replacement
fig = plt.figure()
res = stats.probplot(y_train, plot=plt) #The variable has to follow the red line (that represents the normal distribution).
# In case of positive skewness (long right tail), apply the log
y_train = np.log(y_train) #Recheck now. Be careful on the submission, we need to compute the exponential
and don't forget: y_pred = np.exp(y_pred)
# Skewness measures the lack of symmetry in the data distribution. It should be close to 0.
# Kurtosis measures the weight of the tails (outliers). For a normal distribution the kurtosis is 3; values well above 3 indicate heavy tails.
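A quick way to check both on a pandas Series (assuming y_train is a Series; note that pandas .kurt() returns the excess kurtosis, so ~0 means close to normal):
print(f"Skewness: {y_train.skew():.2f}")
print(f"Excess kurtosis: {y_train.kurt():.2f}")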
## Detect outliers
plt.boxplot(df['target'])
Q1 = df['target'].quantile(0.25)
Q3 = df['target'].quantile(0.75)
IQR = Q3 - Q1
print(IQR)
df['target'] < (Q1 - 1.5 * IQR) #lower bound
df['target'] > (Q3 + 1.5 * IQR) #upper bound
# 1.5 is a convention: with 1 we flag too many outliers, with 2 too few. 1.5*IQR roughly matches the 3σ rule of the Gaussian (points beyond ~3σ are outliers)
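A minimal sketch of the actual filtering, reusing Q1, Q3 and IQR from above:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_no_outliers = df[(df['target'] >= lower_bound) & (df['target'] <= upper_bound)]
print(f"Removed {len(df) - len(df_no_outliers)} outliers")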
### GRAPH
# Gmaps
conda install -c conda-forge gmaps
import gmaps
gmaps.configure(api_key='YourAPI')
fig = gmaps.figure()
heatmap_layer_inside = gmaps.heatmap_layer(
df[['lat','lng']],
weights=df[criterion]
)
fig.add_layer(heatmap_layer_inside)
fig
# DOUBLE LINE GRAPH
ax = df.plot(x = 'month', y= ['Y1', 'Y2'], figsize = (25,10), title = 'yourTitle')
_ = plt.xticks(fontsize = 16, rotation=45) # tip for changing the ticks
# Bar plot horizontal
ax = df.sort_values('revenue', ascending=True).plot.barh(x='quartier', y='revenue', figsize = (15,15), title = 'yourTitle')
# Show text on top of bars (vertical bars, as below; for horizontal bars use rect.get_width() and rect.get_y() instead)
bars = ax.patches
# Get labels to show
labels = round(df.revenue,0).astype(int)
for rect, label in zip(bars, labels):
height = rect.get_height()
ax.text(rect.get_x() + rect.get_width() / 2, height + 5, label,
ha='center', va='bottom')
# Bar plot from df with values as text on the graph
ax = sns.barplot(x='category', y="price", data=df_plot)
for _, row in df_plot.iterrows(): # Add text on the graph (iterate over the plotted dataframe)
ax.text(row.name,row.price, round(row.price,2), color='black', ha="center")
ax.set_title('My_Title') # Add title
# COMBINED BAR AND LINE CHART (SEABORN)
fig, ax1 = plt.subplots(figsize=(20,9))
#bar plot creation
color1 = 'lightgray'
ax1.set_title('yourTitle', fontsize=24)
ax1.set_xlabel('X', fontsize=14)
ax1 = sns.barplot(x='X', y='Y', data = df_plot, color = color1)
ax1.set_ylabel('Y', fontsize=14, color = "black")
plt.xticks(rotation=45) # for better reading
#specify we want to share the same x-axis
ax2 = ax1.twinx()
#line plot creation
color2 = "green"
ax2.set_ylabel('Y2', fontsize=14, color = color2 )
ax2 = sns.lineplot(x=range(0,len(df_plot.Y2)), y='Y2', data = df_plot, sort=False, color=color2)
# Show yearly occ_rate
ax3 = ax2.twiny()
# line plot creation
color3 = 'navy'
ax3 = sns.lineplot(x=range(0,len(df_plot.Y3)), y='Y3', data=df_plot, sort=False, color = color3)
# Round the values on the axis
from matplotlib.ticker import FormatStrFormatter
ax1.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
#show plot
plt.show()
# PIE CHART
labels = df.country_name
sizes = df.ratio
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
shadow=True, startangle=90)
ax1.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
# CONVERT PHONE NUMBER TO COUNTRY CODE
pip install phonenumbers
import phonenumbers
from phonenumbers.phonenumberutil import (
region_code_for_country_code,
region_code_for_number
)
country_code = []
phone_number = []
incorrect_number = []
def number_to_countryCode(df):
    for n in df.phone:
        try:
            pn = phonenumbers.parse(f'{n}')
            code = region_code_for_country_code(pn.country_code)
            phone_number.append(n)
            country_code.append(code)
        except phonenumbers.NumberParseException:
            incorrect_number.append(n)
## DATA MANIPULATION
# Take the value of a cell in a dataframe by referring to a specific value in a column ==> we get the string, not the array
neighborhood = df[df.housing_id == id].neighborhood.values[0]
### ENCODING
BEST PRACTICE : MANUALLY ENCODE WHEN ORDER MATTERS AND ONE-HOT ENCODE WHEN NO HIERARCHY
# Manual encoding
housing.id = np.where(housing.id > 0, 1, 0)
# Binary encoding
df['instant_bookable'] = 1 * (df['instant_bookable'] == "t") # t/f becomes True/False, and multiplying by 1 converts it to 1/0
# Converting numeric features to categorical features. Especially for the year or month --> need to create categories
from sklearn.preprocessing import LabelEncoder
str_cols = ['YrSold','YearRemodAdd','YearBuilt','MoSold','MSSubClass']
for col in str_cols:
all_data[col] = all_data[col].astype(str)
le = LabelEncoder()
arr = le.fit_transform(all_data[col])
all_data[col] = arr
# Count the occurence for each value for categorical feature
train.col1.value_counts()
# Convert string categorical vars into numerical ordered manually. Important to do that manually cuz order matters (Ordinal encoding)
all_data['GarageQual'].unique()
all_data["GarageQual"]=all_data["GaraQual"].map({'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}).astype(int)
# One-hot encoding for categorical features using pd.dummies
cat_df = all_data.select_dtypes(include=['object']) # Take all the categorical features (strings); np.object is removed in recent numpy
for col in cat_df.columns:
cat_df = pd.concat([cat_df, pd.get_dummies(cat_df[col], drop_first=True)], axis=1)
cat_df.drop(col, axis=1, inplace=True)
data_encoded = pd.concat([all_data.select_dtypes(include=[np.number]), # Concat with the numerical features
                          cat_df], axis=1) # cat_df now only holds the dummy columns, the original categorical features were dropped in the loop
data_encoded.head(2)
# One-hot encoding using pandas (easier)
all_data = pd.get_dummies(all_data)
# Encode with LabelEncoder. THE DIFFERENCE BETWEEN ONE-HOT AND LABEL ENCODING IS THAT ONE-HOT DOES NOT IMPLY A HIERARCHY.
from sklearn.preprocessing import LabelEncoder
# Select all cat feature to encode: variables with categories and a hierarchy
cat_features = ['MSSubClass', 'MSZoning', 'Street',
'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
'OverallQual', 'OverallCond', 'RoofStyle',
'RoofMatl', 'MasVnrType',
'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
'Heating', 'HeatingQC',
'CentralAir', 'Electrical',
'KitchenQual',
'Functional', 'Fireplaces', 'GarageType', 'GarageFinish',
'GarageQual', 'GarageCond', 'PavedDrive',
'SaleType',
'SaleCondition']
le_name_mapping = {}
for feature in cat_features:
    enc = LabelEncoder()
    all_data[feature] = enc.fit_transform(all_data[feature]) # Not the best way, I still have to find a better one. Not good to fit the encoder on the test set
    # Keep the mapping
    le_name_mapping[feature] = dict(zip(enc.classes_, enc.transform(enc.classes_)))
# Hierarchical label clustering
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
def hc_label_encoding(df, feature):
def pearson_dist(x, y):
r = stats.pearsonr(x, y)[0]
return (1 - abs(r))
def encode_espece(df):
data = pd.pivot_table(data=df,columns="month",values="target",index=feature).fillna(0)
data = data.loc[(data.transpose().sum()!=0)] # drop the all-zero rows (correlation not defined for them)
distance_matrix = data.transpose().corr().applymap(lambda x:1-x)
k=len(data.index)
model = AgglomerativeClustering(
n_clusters=k,
affinity = "precomputed",
linkage = "average",
distance_threshold=None,
compute_distances = True
)
clustering = model.fit(distance_matrix)
# labels is the new encoding
labels = clustering.labels_
data["cluster"]=labels
return labels,data
df_hc_label = df[df.level == level_chosen]
labels,data = encode_espece(df_hc_label)
data = data.reset_index()
df_cluster_feature = data[[f'{feature}', "cluster"]]
return df_cluster_feature
# Using Robust Scaler to transform X_train. Aim: scale features to be robust to outliers
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()
X_train_scaled = robust_scaler.fit_transform(X)
X_test_scaled = robust_scaler.transform(test)
GOOD PRACTICE: USE A CLASS TRANSFORMER ==> define a class that learns the data preprocessing on the train set and then applies it to the test set (see the sketch below)
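A minimal sketch of such a transformer following the sklearn fit/transform convention; the column names and the imputation logic are placeholders:
from sklearn.base import BaseEstimator, TransformerMixin
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, num_cols, cat_cols):
        self.num_cols = num_cols
        self.cat_cols = cat_cols
    def fit(self, X, y=None):
        # Everything learned here comes from the train set only
        self.means_ = X[self.num_cols].mean()
        self.modes_ = X[self.cat_cols].mode().iloc[0]
        return self
    def transform(self, X):
        X = X.copy()
        X[self.num_cols] = X[self.num_cols].fillna(self.means_)
        X[self.cat_cols] = X[self.cat_cols].fillna(self.modes_)
        return X
# Usage: fit on train only, then apply the same transformation to both sets
prep = Preprocessor(num_cols=['surface'], cat_cols=['quartier']) # placeholder columns
X_train_prep = prep.fit_transform(X_train)
X_test_prep = prep.transform(X_test)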
4) Modeling
# split the dataset into training and validation datasets
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=94,
                                                      stratify=y) # random_state makes the split reproducible; stratify keeps the label balance
## GROUPSHUFFLESPLIT
# Sometimes we need to split dataset by group (listing_id, years, months etc). Otherwise we'd have some data both in train and test set
# For ex: Given listing_id = 94, if we split "classically", we probably have some prices for some weeks in train while prices for other weeks in test!
from sklearn.model_selection import GroupShuffleSplit
# next() is needed because split() yields one (train_index, test_index) pair per split; next() just takes the first pair from that generator
train_index, test_index = next(GroupShuffleSplit(test_size=.30, n_splits=2, random_state = 94).split(df, groups=df['listing_id']))
train = df.iloc[train_index]
test = df.iloc[test_index]
###########
# GroupShuffleSplit for hyperparameter optimization and CV (because we also need to split the data there). Example with 20 random group splits for CV
gss = GroupShuffleSplit(n_splits=20, test_size=0.3, random_state=94)
scores_train, scores_test = [], [] # initialize the score lists before the loop
for train_index, test_index in gss.split(df, groups=df['listing_id']):
X = df.drop('target', axis = 1)
y = df['target'].astype(float)
X_fold_train = X.iloc[train_index, :]
y_fold_train = y.iloc[train_index]
X_fold_test = X.iloc[test_index, :]
y_fold_test = y.iloc[test_index]
# Fit
model = LGBMRegressor()
model.fit(X_fold_train, y_fold_train, eval_metric = 'mae')
# Predict on train and test
y_fold_pred_train = model.predict(X_fold_train)
y_fold_pred_test = model.predict(X_fold_test)
# Evaluate
scores_train.append(
round(mean_absolute_error(y_fold_train, y_fold_pred_train),2)
)
scores_test.append(
round(mean_absolute_error(y_fold_test, y_fold_pred_test),2)
)
final_test_score = round(np.mean(scores_test),2)
# Display results
print(f"MAE Scores train are: {scores_train}")
print()
print(f"MAE Scores test are: {scores_test}")
print()
print(f"MAE final test score is {final_test_score}")
TIPS:
We can do cross-validation on the whole dataset to EVALUATE the model at the end (after fit/predict, checking overfitting by comparing train and test results).
For a small dataset (<100K rows), use cross-validation.
How to choose K? The value of K shouldn't be too small or too high, ideally 5 to 10 depending on the data size.
A higher value of K leads to a less biased model, but the larger variance might lead to overfitting. A minimal sketch is shown below.
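A minimal sketch of a 5-fold evaluation on the whole dataset; model, X and y are placeholders for an already-defined estimator and data:
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=5, shuffle=True, random_state=94)
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=kf)
print(f"MAE per fold: {-scores.round(2)}")
print(f"Mean MAE: {-scores.mean():.2f} (+/- {scores.std():.2f})")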
REGRESSION METRICS:
3 main possibilities: R2, (R)MSE and MAE.
- R2 = how well the model fits the dependent variable. Not good for detecting overfitting. Solution: adjusted R2
- RMSE = how far off your predictions are. Use RMSE rather than MSE because it is in the same unit as the observations
- MAE = similar, but takes the absolute error instead of the squared error. RMSE penalises large prediction errors more, whereas MAE treats them all equally
Conclusion: prefer RMSE to compare models and when you want to penalise large prediction errors more
Code using sklearn:
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(y_test, y_pred)))
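Since sklearn has no built-in adjusted R2 (mentioned above), here is a minimal sketch computing all three metrics plus adjusted R2; y_test, y_pred and X_test are placeholders:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
n, p = X_test.shape # n = number of rows, p = number of features
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"MAE={mae:.2f}  RMSE={rmse:.2f}  R2={r2:.3f}  adjusted R2={adj_r2:.3f}")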
Steps: Gridsearch (optimization) on X_train and cross-validation on X !
## Blent.ai: evaluate model
# Import regressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
# Import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Set model to test
lr = LinearRegression()
rf = RandomForestRegressor()
xgb = XGBRegressor()
models = [lr, rf, xgb]
names = ['Linear Regression','Random Forest','XGBoost']
# Evaluate each model using minmax scaler
for model, name in zip(models, names):
# Fit on train
model.fit(X_train_scaled_minmax, y_train)
# Predict on train and test
y_pred_train = model.predict(X_train_scaled_minmax)
y_pred_test = model.predict(X_test_scaled_minmax)
# Evaluate and watch overfitting
print(f'{name}:')
print(f'MAE train: {round(mean_absolute_error(np.exp(y_train), np.exp(y_pred_train)),2)}€') # exp cuz log_price
print(f'MAE test: {round(mean_absolute_error(np.exp(y_test), np.exp(y_pred_test)),2)}€')
print(f'R2 train: {round(r2_score(np.exp(y_train), np.exp(y_pred_train)),2)*100} %')
print(f'R2 test: {round(r2_score(np.exp(y_test), np.exp(y_pred_test)),2)*100} %')
print(f'RMSE train: {round(np.sqrt(mean_squared_error(np.exp(y_train), np.exp(y_pred_train))),2)}€')
print(f'RMSE test: {round(np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred_test))),2)}€')
print()
# Optimization: GridSearchCV (for Gradient Boosting, do it by pairs). GridSearch on X_train
from sklearn.model_selection import GridSearchCV
grid_parameters = {
'max_depth': [3, 8, 12, 15],
'min_child_weight': [1, 3, 5, 7]}
grid_cv = GridSearchCV(
estimator=XGBRegressor(**params), # params = dict of hyperparameters already fixed from previous searches
param_grid=grid_parameters,
scoring="neg_mean_absolute_error",
cv=2,
verbose=1)
# Launch the grid search (can take around 5 minutes)
grid_cv.fit(X_train, y_train)
# Show results
from matplotlib.ticker import PercentFormatter
score_matrix = np.asarray(grid_cv.cv_results_["mean_test_score"]).reshape((4, 4))
sns.set(font_scale=1.5)
fig = plt.figure(figsize=(16, 12))
plt.title("Scores MAE")
ax = sns.heatmap(
    score_matrix, # Matrix of scores
    annot=True, # Display the scores
    fmt='.02f', # Display with 2 decimals
    yticklabels=grid_parameters["max_depth"], # Labels of the vertical axis
    xticklabels=grid_parameters["min_child_weight"], # Labels of the horizontal axis
    cmap="Blues", # Color palette
)
sns.set(font_scale=1)
ax.set(ylabel="max_depth", xlabel="min_child_weight")
plt.show()
# summarize result
print('Best Score: %s' % grid_cv.best_score_)
print('Best Hyperparameters: %s' % grid_cv.best_params_)
# Define a root mean square error function using cross-validation
from sklearn.model_selection import KFold, cross_val_score
def rmse(model, X, y):
rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
return rmse
or
accuracy = cross_val_score(model, X, y, scoring = "accuracy", cv=5) # for accuracy
# Perform 5-folds cross-validation to evaluate the models
for model, name in zip(models, names):
# Root mean square error
score = rmse(model, X_train_scaled, y)