import os
import datetime
# Print a timestamped separator bar
def printbar():
    """Print a blank line, an 80-character ``=`` rule and the current time.

    The local time is formatted as ``YYYY-MM-DD HH:MM:SS`` and appended to the
    rule on the same line.

    Returns:
        None. The only effect is the text written to standard output.
    """
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + nowtime)
# On macOS, running PyTorch and matplotlib together in Jupyter requires this
# environment variable to be set.
# NOTE(review): presumably works around a duplicate OpenMP runtime error - confirm.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
The goal of the Titanic dataset is to predict, from passenger information, whether a given passenger survived the sinking of the Titanic.
Structured data is generally preprocessed using DataFrame in Pandas.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader,TensorDataset
# Load the raw train/test splits from CSV (paths are relative to the working directory).
dftrain_raw = pd.read_csv('./data/titanic/train.csv')
dftest_raw = pd.read_csv('./data/titanic/test.csv')
# Show the first 10 training rows (rendered as cell output in a notebook).
dftrain_raw.head(10)
Field description:
- Survived: 0 means death, 1 means survival [y label]
- Pclass: The ticket class held by the passenger, there are three values (1,2,3) [convert to onehot code]
- Name: Passenger's name [discard]
- Sex: passenger gender [converted to bool features]
- Age: Passenger age (missing) [Numerical feature, add "Age is missing" as an auxiliary feature]
- SibSp: Number of siblings/spouses of the passenger (integer value) [Numerical Features]
- Parch: the number of parents/children of the passenger (integer value) [numerical feature]
- Ticket: ticket number (character string) [discard]
- Fare: The price of the ticket held by the passenger (floating point number, ranging from 0-500) [Numerical Features]
- Cabin: The cabin where the passenger is located (missing) [Add "Is the cabin missing" as an auxiliary feature]
- Embarked: Port of embarkation: S, C, Q (has missing values) [convert to one-hot code with four dimensions: S, C, Q, nan]
Using Pandas' data visualization function, we can simply perform exploratory data analysis (EDA).
label distribution
# IPython magics: render figures inline, as PNG.
%matplotlib inline
%config InlineBackend.figure_format ='png'
# Bar chart of the number of rows per 'Survived' value.
ax = dftrain_raw['Survived'].value_counts().plot(kind ='bar',
figsize = (12,8), fontsize=15, rot = 0)
ax.set_ylabel('Counts',fontsize = 15)
ax.set_xlabel('Survived',fontsize = 15)
plt.show()
Age distribution
# IPython magics: render figures inline, as PNG.
%matplotlib inline
%config InlineBackend.figure_format ='png'
# Histogram of the 'Age' column using 20 bins.
ax = dftrain_raw['Age'].plot(kind ='hist',bins = 20,color ='purple',
figsize = (12,8),fontsize=15)
ax.set_ylabel('Frequency',fontsize = 15)
ax.set_xlabel('Age',fontsize = 15)
plt.show()
Correlation between age and label
# IPython magics: render figures inline, as PNG.
%matplotlib inline
%config InlineBackend.figure_format ='png'
# Density plot of 'Age' for rows with Survived == 0 ...
ax = dftrain_raw.query('Survived == 0')['Age'].plot(kind ='density',
figsize = (12,8),fontsize=15)
# ... and for rows with Survived == 1.
# NOTE(review): presumably drawn on the same axes because no new figure is created - confirm.
dftrain_raw.query('Survived == 1')['Age'].plot(kind ='density',
figsize = (12,8),fontsize=15)
# Legend entries follow the order in which the two curves were plotted.
ax.legend(['Survived==0','Survived==1'],fontsize = 12)
ax.set_ylabel('Density',fontsize = 15)
ax.set_xlabel('Age',fontsize = 15)
plt.show()
Next comes the actual data preprocessing.
def preprocessing(dfdata):
    """Convert a raw Titanic DataFrame into an all-numeric feature DataFrame.

    Args:
        dfdata: DataFrame containing the columns ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare``, ``Cabin`` and ``Embarked``.

    Returns:
        A new DataFrame with, in order: one-hot ``Pclass_*`` columns, one-hot
        ``Sex`` columns (named after the raw values), ``Age`` (missing -> 0),
        ``Age_null`` (1 where Age was missing), ``SibSp``, ``Parch``, ``Fare``,
        ``Cabin_null`` (1 where Cabin was missing) and one-hot ``Embarked_*``
        columns including an ``Embarked_nan`` indicator.

    Note:
        The one-hot columns are derived from the values present in ``dfdata``,
        so two inputs with different sets of categories yield different
        column sets.
    """
    # pandas >= 2.0 makes get_dummies return bool columns by default; mixed
    # with the float columns below, `.values` would then be an object array
    # that torch.tensor() cannot convert. Pin the former uint8 default.
    dummy_dtype = 'uint8'

    # Pclass
    dfPclass = pd.get_dummies(dfdata['Pclass'], dtype=dummy_dtype)
    dfPclass.columns = ['Pclass_' + str(x) for x in dfPclass.columns]

    # Sex
    dfSex = pd.get_dummies(dfdata['Sex'], dtype=dummy_dtype)
    dfresult = pd.concat([dfPclass, dfSex], axis=1)

    # Age: fill missing values with 0 and keep a separate "was missing" flag
    dfresult['Age'] = dfdata['Age'].fillna(0)
    dfresult['Age_null'] = pd.isna(dfdata['Age']).astype('int32')

    # SibSp, Parch, Fare: numeric features copied through unchanged
    dfresult['SibSp'] = dfdata['SibSp']
    dfresult['Parch'] = dfdata['Parch']
    dfresult['Fare'] = dfdata['Fare']

    # Cabin: only whether the value is missing is used as a feature
    dfresult['Cabin_null'] = pd.isna(dfdata['Cabin']).astype('int32')

    # Embarked: dummy_na=True adds an extra indicator column for missing values
    dfEmbarked = pd.get_dummies(dfdata['Embarked'], dummy_na=True,
                                dtype=dummy_dtype)
    dfEmbarked.columns = ['Embarked_' + str(x) for x in dfEmbarked.columns]
    dfresult = pd.concat([dfresult, dfEmbarked], axis=1)

    return dfresult
# Feature matrices: preprocessed frames converted to plain arrays.
x_train = preprocessing(dftrain_raw).values
# Double brackets keep the label as a single-column 2-D array.
y_train = dftrain_raw[['Survived']].values
x_test = preprocessing(dftest_raw).values
y_test = dftest_raw[['Survived']].values
# Print the array shapes as a sanity check.
print("x_train.shape =", x_train.shape)
print("x_test.shape =", x_test.shape)
print("y_train.shape =", y_train.shape)
print("y_test.shape =", y_test.shape)
x_train.shape = (712, 15)
x_test.shape = (179, 15)
y_train.shape = (712, 1)
y_test.shape = (179, 1)
Next, wrap the arrays with TensorDataset and DataLoader to build an iterable data pipeline.
# Wrap features and labels as float tensors and serve them in batches of 8.
# Only the training loader shuffles.
dl_train = DataLoader(
    TensorDataset(torch.tensor(x_train).float(), torch.tensor(y_train).float()),
    shuffle=True, batch_size=8)
dl_valid = DataLoader(
    TensorDataset(torch.tensor(x_test).float(), torch.tensor(y_test).float()),
    shuffle=False, batch_size=8)

# Test data pipeline: print the first training batch, then stop.
for features, labels in dl_train:
    print(features, labels)
    break
tensor([[ 0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 1.0000,
0.0000, 0.0000, 7.8958, 1.0000, 0.0000, 0.0000, 1.0000,
0.0000],
[1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000,
0.0000, 0.0000, 30.5000, 0.0000, 0.0000, 0.0000, 1.0000,
0.0000],
[1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 31.0000, 0.0000,
1.0000, 0.0000, 113.2750, 0.0000, 1.0000, 0.0000, 0.0000,
0.0000],
[1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 60.0000, 0.0000,
0.0000, 0.0000, 26.5500, 1.0000, 0.0000, 0.0000, 1.0000,
0.0000],
[0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 28.0000, 0.0000,
0.0000, 0.0000, 22.5250, 1.0000, 0.0000, 0.0000, 1.0000,
0.0000],
[0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 32.0000, 0.0000,
0.0000, 0.0000, 8.3625, 1.0000, 0.0000, 0.0000, 1.0000,
0.0000],
[0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 28.0000, 0.0000,
0.0000, 0.0000, 13.0000, 1.0000, 0.0000, 0.0000, 1.0000,
0.0000],
[1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 36.0000, 0.0000,
0.0000, 1.0000, 512.3292, 0.0000, 1.0000, 0.0000, 0.0000,
0.0000]]) tensor([[0.],
[1.],
[1.],
[0.],
[0.],
[0.],
[1.],
[1.]])
There are usually three ways to build a model with PyTorch: use nn.Sequential to stack layers in order, subclass nn.Module to define a custom model, or subclass nn.Module and additionally use model containers to encapsulate parts of the model.
Here we choose the simplest option, nn.Sequential, which builds the model layer by layer.
def create_net(in_features=15):
    """Build the feed-forward binary classifier.

    Layers, in order: Linear(in_features, 20) -> ReLU -> Linear(20, 15)
    -> ReLU -> Linear(15, 1) -> Sigmoid, registered under the names
    ``linear1``, ``relu1``, ``linear2``, ``relu2``, ``linear3``, ``sigmoid``.

    Args:
        in_features: Width of the input feature vector. Defaults to 15, the
            value that was previously hard-coded.

    Returns:
        An ``nn.Sequential`` whose output has one column; the final sigmoid
        keeps every value between 0 and 1.
    """
    net = nn.Sequential()
    net.add_module("linear1", nn.Linear(in_features, 20))
    net.add_module("relu1", nn.ReLU())
    net.add_module("linear2", nn.Linear(20, 15))
    net.add_module("relu2", nn.ReLU())
    net.add_module("linear3", nn.Linear(15, 1))
    net.add_module("sigmoid", nn.Sigmoid())
    return net
# Instantiate the model and print its layer structure.
net = create_net()
print(net)
Sequential(
(linear1): Linear(in_features=15, out_features=20, bias=True)
(relu1): ReLU()
(linear2): Linear(in_features=20, out_features=15, bias=True)
(relu2): ReLU()
(linear3): Linear(in_features=15, out_features=1, bias=True)
(sigmoid): Sigmoid()
)
# torchkeras is a third-party package; summary() is called here for a single
# 15-feature input.
from torchkeras import summary
summary(net,input_shape=(15,))
-------------------------------------------------- --------------
Layer (type) Output Shape Param #
================================================= ==============
Linear-1 [-1, 20] 320
ReLU-2 [-1, 20] 0
Linear-3 [-1, 15] 315
ReLU-4 [-1, 15] 0
Linear-5 [-1, 1] 16
Sigmoid-6 [-1, 1] 0
================================================= ==============
Total params: 651
Trainable params: 651
Non-trainable params: 0
-------------------------------------------------- --------------
Input size (MB): 0.000057
Forward/backward pass size (MB): 0.000549
Params size (MB): 0.002483
Estimated Total Size (MB): 0.003090
-------------------------------------------------- --------------
Pytorch usually requires users to write custom training loops, and the code style of training loops varies from person to person.
There are three typical training loop code styles: script form training loop, function form training loop, and class form training loop.
Here we use the script form, which is the most general of the three.
from sklearn.metrics import accuracy_score

# Binary cross-entropy; `net` ends in a sigmoid, so its outputs are probabilities.
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(params=net.parameters(),lr = 0.01)
# Accuracy after thresholding the predicted probabilities at 0.5.
metric_func = lambda y_pred,y_true: accuracy_score(y_true.data.numpy(),y_pred.data.numpy()>0.5)
metric_name = "accuracy"

epochs = 10
log_step_freq = 30  # print a running-average log line every 30 training batches

# One row per epoch: training and validation averages.
dfhistory = pd.DataFrame(columns = ["epoch","loss",metric_name,"val_loss","val_"+metric_name])
print("Start Training...")
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("=========="*8 + "%s"%nowtime)

for epoch in range(1,epochs+1):

    # 1, training loop -------------------------------------------------
    net.train()
    loss_sum = 0.0
    metric_sum = 0.0
    # Fallback so the averages below never divide by zero if the loader
    # yields no batches; enumerate() overwrites it otherwise.
    step = 1

    for step, (features,labels) in enumerate(dl_train, 1):

        # Clear gradients accumulated by the previous batch
        optimizer.zero_grad()

        # Forward pass: loss and metric for this batch
        predictions = net(features)
        loss = loss_func(predictions,labels)
        metric = metric_func(predictions,labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Accumulate batch-level values; the log shows running averages.
        loss_sum += loss.item()
        # accuracy_score may return a plain Python float (which has no
        # .item()) depending on the scikit-learn version; float() accepts
        # both NumPy scalars and Python floats.
        metric_sum += float(metric)
        if step%log_step_freq == 0:
            print(("[step = %d] loss: %.3f, "+metric_name+": %.3f") %
                  (step, loss_sum/step, metric_sum/step))

    # 2, validation loop -------------------------------------------------
    net.eval()
    val_loss_sum = 0.0
    val_metric_sum = 0.0
    val_step = 1  # same empty-loader fallback as `step` above

    for val_step, (features,labels) in enumerate(dl_valid, 1):
        # No gradients are needed for evaluation
        with torch.no_grad():
            predictions = net(features)
            val_loss = loss_func(predictions,labels)
            val_metric = metric_func(predictions,labels)

        val_loss_sum += val_loss.item()
        val_metric_sum += float(val_metric)

    # 3, record the log -------------------------------------------------
    # Averages are means over batches (each batch weighted equally).
    info = (epoch, loss_sum/step, metric_sum/step,
            val_loss_sum/val_step, val_metric_sum/val_step)
    dfhistory.loc[epoch-1] = info

    # Print epoch-level log
    print(("\nEPOCH = %d, loss = %.3f,"+ metric_name + \
          "= %.3f, val_loss = %.3f, "+"val_"+ metric_name+" = %.3f")
          %info)
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n"+"=========="*8 + "%s"%nowtime)

print('Finished Training...')
Start Training...
================================================= ==============================2020-06-17 20:53:49
[step = 30] loss: 0.703, accuracy: 0.583
[step = 60] loss: 0.629, accuracy: 0.675
EPOCH = 1, loss = 0.643, accuracy = 0.673, val_loss = 0.621, val_accuracy = 0.725
================================================= ==============================2020-06-17 20:53:49
[step = 30] loss: 0.653, accuracy: 0.662
[step = 60] loss: 0.624, accuracy: 0.673
EPOCH = 2, loss = 0.621, accuracy = 0.669, val_loss = 0.519, val_accuracy = 0.708
================================================= ==============================2020-06-17 20:53:49
[step = 30] loss: 0.582, accuracy: 0.688
[step = 60] loss: 0.555, accuracy: 0.723
EPOCH = 3, loss = 0.543, accuracy = 0.740, val_loss = 0.516, val_accuracy = 0.741
================================================= ==============================2020-06-17 20:53:49
[step = 30] loss: 0.563, accuracy: 0.721
[step = 60] loss: 0.528, accuracy: 0.752
EPOCH = 4, loss = 0.515, accuracy = 0.764, val_loss = 0.471, val_accuracy = 0.777
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.433, accuracy: 0.783
[step = 60] loss: 0.477, accuracy: 0.785
EPOCH = 5, loss = 0.489, accuracy = 0.785, val_loss = 0.447, val_accuracy = 0.804
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.460, accuracy: 0.812
[step = 60] loss: 0.477, accuracy: 0.798
EPOCH = 6, loss = 0.474, accuracy = 0.798, val_loss = 0.451, val_accuracy = 0.772
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.516, accuracy: 0.792
[step = 60] loss: 0.496, accuracy: 0.779
EPOCH = 7, loss = 0.473, accuracy = 0.794, val_loss = 0.485, val_accuracy = 0.783
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.472, accuracy: 0.779
[step = 60] loss: 0.487, accuracy: 0.794
EPOCH = 8, loss = 0.474, accuracy = 0.791, val_loss = 0.446, val_accuracy = 0.788
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.492, accuracy: 0.771
[step = 60] loss: 0.445, accuracy: 0.800
EPOCH = 9, loss = 0.464, accuracy = 0.796, val_loss = 0.519, val_accuracy = 0.746
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.436, accuracy: 0.796
[step = 60] loss: 0.460, accuracy: 0.794
EPOCH = 10, loss = 0.462, accuracy = 0.787, val_loss = 0.415, val_accuracy = 0.810
================================================= ==============================2020-06-17 20:53:51
Finished Training...
We first evaluate the effect of the model on the training set and validation set.
# Show the per-epoch history table (rendered as cell output in a notebook).
dfhistory
# IPython magics: render figures inline, as SVG.
%matplotlib inline
%config InlineBackend.figure_format ='svg'
import matplotlib.pyplot as plt
def plot_metric(dfhistory, metric):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        dfhistory: Table indexable by column name that contains both
            ``metric`` and ``'val_' + metric``, one row per epoch.
        metric: Name of the training column, e.g. ``"loss"``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    train_metrics = dfhistory[metric]
    val_metrics = dfhistory['val_' + metric]
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    # The space before the metric name was missing, giving e.g.
    # "Training and validationloss".
    plt.title('Training and validation ' + metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_" + metric, 'val_' + metric])
    plt.show()
# Plot the training/validation curves for the loss and for the accuracy.
plot_metric(dfhistory,"loss")
plot_metric(dfhistory,"accuracy")
# Predicted probabilities for the first 10 test rows
# (.data returns the values without autograd tracking).
y_pred_probs = net(torch.tensor(x_test[0:10]).float()).data
y_pred_probs
tensor([[0.0119],
[0.6029],
[0.2970],
[0.5717],
[0.5034],
[0.8655],
[0.0572],
[0.9182],
[0.5038],
[0.1739]])
# Predicted class: 1 where the probability exceeds 0.5, otherwise 0
# (same shape and dtype as y_pred_probs).
y_pred = torch.where(y_pred_probs>0.5,
torch.ones_like(y_pred_probs),torch.zeros_like(y_pred_probs))
y_pred
tensor([[0.],
[1.],
[0.],
[1.],
[1.],
[1.],
[0.],
[1.],
[1.],
[0.]])
Pytorch has two ways to save models, both of which are implemented by calling the pickle serialization method.
The first method only saves the model parameters.
The second method saves the complete model.
The first method is recommended, and the second method may cause various problems when switching devices and directories.
1. Save only the model parameters (recommended)
# List the names of the tensors stored in the model's state dict.
print(net.state_dict().keys())
odict_keys(['linear1.weight','linear1.bias','linear2.weight','linear2.bias','linear3.weight','linear3.bias'])
# Save model parameters (the state dict only, not the module object).
torch.save(net.state_dict(), "./data/net_parameter.pkl")
# Rebuild the same architecture, then load the saved parameters into it.
net_clone = create_net()
net_clone.load_state_dict(torch.load("./data/net_parameter.pkl"))
# Run the restored model on the first 10 test rows.
net_clone.forward(torch.tensor(x_test[0:10]).float()).data
tensor([[0.0119],
[0.6029],
[0.2970],
[0.5717],
[0.5034],
[0.8655],
[0.0572],
[0.9182],
[0.5038],
[0.1739]])
2. Save the complete model (not recommended)
# Pickle the whole module object (architecture and parameters).
torch.save(net,'./data/net_model.pkl')
# Recent PyTorch releases default torch.load to weights_only=True, which
# refuses to unpickle a full nn.Module. The file was written on the line
# above and is trusted, so opt out explicitly.
# NOTE: weights_only=False executes arbitrary pickle code; never use it on
# files from an untrusted source.
net_loaded = torch.load('./data/net_model.pkl', weights_only=False)
# Run the restored model on the first 10 test rows.
net_loaded(torch.tensor(x_test[0:10]).float()).data
tensor([[0.0119],
[0.6029],
[0.2970],
[0.5717],
[0.5034],
[0.8655],
[0.0572],
[0.9182],
[0.5038],
[0.1739]])
If this book is helpful to you and want to encourage the author, remember to add a star⭐️ to this project and share it with your friends 😊!
If you need to further communicate with the author on the understanding of the content of this book, please leave a message under the public account "Algorithm Food House". The author has limited time and energy and will respond as appropriate.
You can also reply with the keyword "Add group" in the official account's chat to join the reader discussion group.