-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraphing_methods.py
126 lines (108 loc) · 4.97 KB
/
graphing_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits import mplot3d
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
def figure_one(driver_full):
    """Scatter lifetime value against ride count and overlay a ridge fit.

    Reads the 'num_rides' and 'lifetime_val' columns of ``driver_full``,
    fits ``Ridge(alpha=.01)`` with 'num_rides' as the only feature, draws
    the fitted line over the fixed x range [-100, 1200], shows the figure
    and then prints the model's R^2 score on the same data.
    """
    axes = sns.scatterplot(x='num_rides', y='lifetime_val', data=driver_full, marker="+")

    # Ridge regression of lifetime value on the number of rides.
    X = driver_full[['num_rides']]
    y = driver_full['lifetime_val']
    ridge = linear_model.Ridge(alpha=.01).fit(X, y)
    slope = ridge.coef_
    intercept = ridge.intercept_

    # Regression line over a hard-coded grid of 100 points.
    grid = np.linspace(-100, 1200, 100)
    axes.plot(grid, grid * slope + intercept, c='black')

    axes.set(title='Lifetime Value vs Total Number of Rides')
    axes.set_ylabel('Lifetime Value ($)')
    axes.set_xlabel('Total Number of Rides')
    plt.show()

    print('R^2 value for this model is:', ridge.score(X, y))
def figure_two(driver_full):
    """Cluster drivers into three groups with k-means and plot them in 3D.

    Every column of ``driver_full`` (except a 'group' column left by an
    earlier call) is standardized to zero mean / unit standard deviation
    and passed to ``KMeans(n_clusters=3, random_state=0)``.  The resulting
    cluster ids are translated to names (0 -> 'Average', 1 -> 'Starting',
    2 -> 'Experienced') and stored in a 'group' column on ``driver_full``
    itself, so the caller's frame is modified in place.

    The clusters are drawn as a 3D scatter of 'rides_per_day',
    'pickup_time' and 'total_fares', and the figure is shown.

    Parameters
    ----------
    driver_full : pandas.DataFrame
        Must contain the three plotted columns; the feature columns must
        support ``mean()``/``std()`` arithmetic (i.e. be numeric).

    Returns
    -------
    pandas.DataFrame
        The same ``driver_full`` object, now carrying the 'group' column.
    """
    sns.set(font_scale=1.2)

    # Normalize the features.  A 'group' column written by a previous call
    # holds strings and cannot be z-scored, so it is excluded; this makes
    # the function safe to re-run on its own output.
    features = driver_full.drop(columns='group', errors='ignore')
    driver_norm = features.transform(lambda x: (x - x.mean()) / x.std())

    # Perform the k-means algorithm.
    kmeans = KMeans(n_clusters=3, random_state=0).fit(driver_norm.values)

    # Label the groups.
    # NOTE(review): k-means cluster ids are arbitrary; this id -> name
    # mapping presumably was chosen by inspecting one particular run on one
    # dataset -- confirm it still holds if the data or library version changes.
    groups = {2: 'Experienced', 0: 'Average', 1: 'Starting'}
    driver_full['group'] = [groups[label] for label in kmeans.labels_]

    # Plot the clusters, one scatter series per group.
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    for name in ('Starting', 'Average', 'Experienced'):
        members = driver_full[driver_full['group'] == name]
        ax.scatter3D(members['rides_per_day'], members['pickup_time'],
                     members['total_fares'], label=name)
    ax.view_init(20, 255)
    ax.legend(loc='center right')
    ax.set(title='K-means Clusters', xlabel='Rides per Day',
           ylabel='Average Pickup Time (min)', zlabel='Total Fares ($)')
    plt.gca().patch.set_facecolor('white')
    # Use ax.xaxis/yaxis/zaxis: the old w_xaxis/w_yaxis/w_zaxis aliases
    # were removed from Matplotlib's Axes3D.
    for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
        axis.set_pane_color((0.8, 0.8, 0.8, 1.0))
    plt.show()

    # Return the table with groups.
    return driver_full
def figure_three(driver_full):
    """Draw three side-by-side boxplots of driver metrics split by group.

    Uses the 'group' column of ``driver_full`` (ordered 'Starting',
    'Average', 'Experienced') on the x axis and plots 'num_rides',
    'rides_per_day' and 'lifetime_val' on the y axes of the three panels,
    then shows the figure.
    """
    sns.set(font_scale=1.8)
    group_order = ['Starting', 'Average', 'Experienced']

    # (column to plot, panel title, y-axis label), left to right.
    panels = (
        ("num_rides", 'Total Number of Rides by Group', 'Total Number of Rides'),
        ("rides_per_day", 'Rides per Day by Group', 'Rides per Day'),
        ("lifetime_val", 'Estimated Lifetime Value by Group', 'Lifetime Value ($)'),
    )

    fig, axes = plt.subplots(1, 3, figsize=(25, 6))
    for panel_ax, (column, title, y_label) in zip(axes, panels):
        sns.boxplot(x="group", y=column, data=driver_full, ax=panel_ax, order=group_order)
        panel_ax.set(title=title)
        panel_ax.set_xlabel('Group')
        panel_ax.set_ylabel(y_label)

    plt.show()
def figure_four(driver_full):
    """Draw three bar charts summarizing the driver groups.

    Left: number of drivers per group, annotated with each group's share of
    all rows.  Middle: summed 'total_fares' per group, annotated with each
    group's share of the overall fare total.  Right: median 'pickup_time'
    per group, annotated with the bar value followed by 'min'.  Groups are
    ordered 'Starting', 'Average', 'Experienced'.  The figure is shown at
    the end.
    """
    group_order = ['Starting', 'Average', 'Experienced']

    def add_percent(ax, total, height_change, suffix):
        # Write "<height / total * 100> <suffix>" centered above every bar,
        # raised by ``height_change`` in data units.
        for bar in ax.patches:
            bar_height = bar.get_height()
            x_center = bar.get_x() + bar.get_width() / 2.
            caption = '{:1.2f} {}'.format(bar_height / total * 100, suffix)
            ax.text(x_center, bar_height + height_change, caption, ha="center")

    sns.set(font_scale=1.8)
    fig, (size_ax, fare_ax, pickup_ax) = plt.subplots(1, 3, figsize=(25, 6))

    # Count of drivers in each group.
    sns.countplot(x='group', data=driver_full, order=group_order, ax=size_ax)
    size_ax.set(title='Group Sizes')
    add_percent(size_ax, len(driver_full), 3, '%')
    size_ax.set_xlabel('Group')
    size_ax.set_ylabel('Drivers in Group')

    # Total fares contributed by each group.
    sns.barplot(x='group', y='total_fares', estimator=np.sum, data=driver_full,
                ci=None, order=group_order, ax=fare_ax)
    fare_ax.set(title='Fare Contribution by Group')
    add_percent(fare_ax, np.sum(driver_full['total_fares']), 3, '%')
    fare_ax.set_xlabel('Group')
    fare_ax.set_ylabel('Total Fares ($)')

    # Median pickup time of each group.  A total of 100 cancels the
    # percentage scaling, so the annotation is the raw bar height.
    sns.barplot(x='group', y='pickup_time', estimator=np.median, data=driver_full,
                ci=None, order=group_order, ax=pickup_ax)
    pickup_ax.set(title='Median Pickup Time by Group')
    add_percent(pickup_ax, 100, 0, 'min')
    pickup_ax.set_xlabel('Group')
    pickup_ax.set_ylabel('Median Pickup Time (min)')

    plt.show()