import os
import datetime
#Print Time
def printbar():
    """Print a separator bar followed by the current local time.

    Writes a blank line, then eighty ``=`` characters immediately followed
    by a ``YYYY-MM-DD HH:MM:SS`` timestamp, to standard output.
    """
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % nowtime)
# On macOS, running PyTorch and matplotlib together in Jupyter requires
# changing an environment variable.
# NOTE(review): the assignment itself was missing from this file; the variable
# below is the usual duplicate-OpenMP-runtime workaround -- confirm it is the
# one intended here.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
The goal of the Titanic dataset task is to predict, from passenger information, whether a passenger survived the sinking of the Titanic.
Structured data is generally preprocessed using DataFrame in Pandas.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
# Dataset / DataLoader utilities used to build the batched data pipeline.
from torch.utils.data import Dataset,DataLoader,TensorDataset
# Load the raw Titanic training and test splits from local CSV files
# (paths are relative to the working directory).
dftrain_raw = pd.read_csv('./data/titanic/train.csv')
dftest_raw = pd.read_csv('./data/titanic/test.csv')
Field description:
- Survived: 0 means death, 1 means survival [y label]
- Pclass: The ticket class held by the passenger, there are three values (1,2,3) [convert to onehot code]
- Name: Passenger's name [discard]
- Sex: passenger gender [converted to bool features]
- Age: Passenger age (missing) [Numerical feature, add "Age is missing" as an auxiliary feature]
- SibSp: Number of siblings/spouses of the passenger (integer value) [Numerical Features]
- Parch: the number of parents/children of the passenger (integer value) [numerical feature]
- Ticket: ticket number (character string) [discard]
- Fare: The price of the ticket held by the passenger (floating point number, ranging from 0-500) [Numerical Features]
- Cabin: The cabin where the passenger is located (missing) [Add "Is the cabin missing" as an auxiliary feature]
- Embarked: Passenger embarkation port: S, C, Q (with missing) [converted to onehot code, four dimensions S, C, Q, nan]
Using Pandas' data visualization function, we can simply perform exploratory data analysis (EDA).
label distribution
%matplotlib inline
%config InlineBackend.figure_format ='png'
# Bar chart of how many training rows fall into each 'Survived' class.
ax = dftrain_raw['Survived'].value_counts().plot.bar(
    figsize=(12, 8),
    fontsize=15,
    rot=0,
)
ax.set_ylabel('Counts', fontsize=15)
ax.set_xlabel('Survived', fontsize=15)
Age distribution
%matplotlib inline
%config InlineBackend.figure_format ='png'
# Histogram (20 bins) of the 'Age' column of the training data.
ax = dftrain_raw['Age'].plot.hist(
    bins=20,
    color='purple',
    figsize=(12, 8),
    fontsize=15,
)
ax.set_ylabel('Frequency', fontsize=15)
ax.set_xlabel('Age', fontsize=15)
Correlation between age and label
%matplotlib inline
%config InlineBackend.figure_format ='png'
# Overlay the age density curves of the two 'Survived' classes on one axes;
# the first plot call creates the axes, the second draws onto the current one.
ax = dftrain_raw.loc[dftrain_raw['Survived'] == 0, 'Age'].plot.density(
    figsize=(12, 8),
    fontsize=15,
)
dftrain_raw.loc[dftrain_raw['Survived'] == 1, 'Age'].plot.density(
    figsize=(12, 8),
    fontsize=15,
)
ax.legend(['Survived==0', 'Survived==1'], fontsize=12)
ax.set_ylabel('Density', fontsize=15)
ax.set_xlabel('Age', fontsize=15)
The following is the formal data preprocessing
def preprocessing(dfdata):
    """Turn a raw Titanic DataFrame into a purely numeric feature DataFrame.

    Args:
        dfdata: DataFrame containing the columns 'Pclass', 'Sex', 'Age',
            'SibSp', 'Parch', 'Fare', 'Cabin' and 'Embarked'.

    Returns:
        A new DataFrame sharing ``dfdata``'s index, with columns in this
        order: one-hot 'Pclass_*', one-hot sex columns, 'Age' (missing
        values replaced by 0), 'Age_null', 'SibSp', 'Parch', 'Fare',
        'Cabin_null', and one-hot 'Embarked_*' including an
        'Embarked_nan' indicator.
    """
    # Dummy columns are pinned to uint8: when get_dummies yields bool columns
    # (its default in newer pandas), mixing them with float columns makes
    # `.values` an object array, which torch.tensor() cannot consume.
    dfPclass = pd.get_dummies(dfdata['Pclass'], dtype='uint8')
    dfPclass.columns = ['Pclass_' + str(x) for x in dfPclass.columns]

    dfSex = pd.get_dummies(dfdata['Sex'], dtype='uint8')

    # Numeric columns, plus "is missing" indicator features.
    dfnumeric = pd.DataFrame(index=dfdata.index)
    dfnumeric['Age'] = dfdata['Age'].fillna(0)
    dfnumeric['Age_null'] = pd.isna(dfdata['Age']).astype('int32')
    dfnumeric['SibSp'] = dfdata['SibSp']
    dfnumeric['Parch'] = dfdata['Parch']
    dfnumeric['Fare'] = dfdata['Fare']
    dfnumeric['Cabin_null'] = pd.isna(dfdata['Cabin']).astype('int32')

    # dummy_na=True adds an explicit column for a missing embarkation port.
    dfEmbarked = pd.get_dummies(dfdata['Embarked'], dummy_na=True, dtype='uint8')
    dfEmbarked.columns = ['Embarked_' + str(x) for x in dfEmbarked.columns]

    return pd.concat([dfPclass, dfSex, dfnumeric, dfEmbarked], axis=1)
# Feature matrices come from preprocessing(); labels are the 'Survived'
# column, selected with a list so the resulting array stays two-dimensional.
x_train = preprocessing(dftrain_raw).to_numpy()
x_test = preprocessing(dftest_raw).to_numpy()
y_train = dftrain_raw.loc[:, ['Survived']].to_numpy()
y_test = dftest_raw.loc[:, ['Survived']].to_numpy()

# Report the array shapes.
print(f"x_train.shape = {x_train.shape}")
print(f"x_test.shape = {x_test.shape}")
print(f"y_train.shape = {y_train.shape}")
print(f"y_test.shape = {y_test.shape}")
x_train.shape = (712, 15)
x_test.shape = (179, 15)
y_train.shape = (712, 1)
y_test.shape = (179, 1)
Further use DataLoader and TensorDataset to encapsulate into an iterable data pipeline.
# Training pipeline: float32 feature/label tensors, reshuffled each pass,
# served in batches of 8.
dl_train = DataLoader(
    TensorDataset(
        torch.tensor(x_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32),
    ),
    batch_size=8,
    shuffle=True,
)
# Validation pipeline: same batch size, fixed order.
dl_valid = DataLoader(
    TensorDataset(
        torch.tensor(x_test, dtype=torch.float32),
        torch.tensor(y_test, dtype=torch.float32),
    ),
    batch_size=8,
    shuffle=False,
)
# Test data pipeline: print the first batch only, then stop iterating.
for features,labels in dl_train:
    print(features,labels)
    break
tensor([[ 0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 1.0000,
0.0000, 0.0000, 7.8958, 1.0000, 0.0000, 0.0000, 1.0000,
[1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 1.0000,
0.0000, 0.0000, 30.5000, 0.0000, 0.0000, 0.0000, 1.0000,
[1.0000, 0.0000, 0.0000, 1.0000, 0.0000, 31.0000, 0.0000,
1.0000, 0.0000, 113.2750, 0.0000, 1.0000, 0.0000, 0.0000,
[1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 60.0000, 0.0000,
0.0000, 0.0000, 26.5500, 1.0000, 0.0000, 0.0000, 1.0000,
[0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 28.0000, 0.0000,
0.0000, 0.0000, 22.5250, 1.0000, 0.0000, 0.0000, 1.0000,
[0.0000, 0.0000, 1.0000, 0.0000, 1.0000, 32.0000, 0.0000,
0.0000, 0.0000, 8.3625, 1.0000, 0.0000, 0.0000, 1.0000,
[0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 28.0000, 0.0000,
0.0000, 0.0000, 13.0000, 1.0000, 0.0000, 0.0000, 1.0000,
[1.0000, 0.0000, 0.0000, 0.0000, 1.0000, 36.0000, 0.0000,
0.0000, 1.0000, 512.3292, 0.0000, 1.0000, 0.0000, 0.0000,
0.0000]]) tensor([[0.],
There are usually three ways to build a model with PyTorch: use nn.Sequential to stack layers in order, subclass nn.Module to write a custom model, or subclass nn.Module and use model containers to help assemble the model.
Here we choose the simplest option, nn.Sequential, and build the model layer by layer.
def create_net():
    """Build the feed-forward binary classifier used in this tutorial.

    The layer names and sizes follow the printed model structure:
    15 input features -> 20 -> 15 -> 1, with ReLU after the first two
    linear layers and a Sigmoid on the output.

    Returns:
        A freshly initialised ``nn.Sequential`` instance.
    """
    net = nn.Sequential()
    net.add_module("linear1", nn.Linear(15, 20))
    net.add_module("relu1", nn.ReLU())
    net.add_module("linear2", nn.Linear(20, 15))
    net.add_module("relu2", nn.ReLU())
    net.add_module("linear3", nn.Linear(15, 1))
    net.add_module("sigmoid", nn.Sigmoid())
    return net
net = create_net()
(linear1): Linear(in_features=15, out_features=20, bias=True)
(relu1): ReLU()
(linear2): Linear(in_features=20, out_features=15, bias=True)
(relu2): ReLU()
(linear3): Linear(in_features=15, out_features=1, bias=True)
(sigmoid): Sigmoid()
from torchkeras import summary
-------------------------------------------------- --------------
Layer (type) Output Shape Param #
================================================= ==============
Linear-1 [-1, 20] 320
ReLU-2 [-1, 20] 0
Linear-3 [-1, 15] 315
ReLU-4 [-1, 15] 0
Linear-5 [-1, 1] 16
Sigmoid-6 [-1, 1] 0
================================================= ==============
Total params: 651
Trainable params: 651
Non-trainable params: 0
-------------------------------------------------- --------------
Input size (MB): 0.000057
Forward/backward pass size (MB): 0.000549
Params size (MB): 0.002483
Estimated Total Size (MB): 0.003090
-------------------------------------------------- --------------
Pytorch usually requires users to write custom training loops, and the code style of training loops varies from person to person.
There are three typical training loop code styles: script form training loop, function form training loop, and class form training loop.
Here is a more general form of script.
from sklearn.metrics import accuracy_score

# Binary cross-entropy, applied to the probabilities the model outputs.
loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(params=net.parameters(),lr = 0.01)


def metric_func(y_pred, y_true):
    """Return the accuracy of ``y_pred`` thresholded at 0.5 against ``y_true``."""
    return accuracy_score(y_true.data.numpy(), y_pred.data.numpy() > 0.5)


metric_name = "accuracy"

epochs = 10
# Print a running batch-level log every `log_step_freq` training steps.
log_step_freq = 30

# One row per epoch: training and validation loss / metric.
dfhistory = pd.DataFrame(columns = ["epoch","loss",metric_name,"val_loss","val_"+metric_name])
print("Start Training...")
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("=========="*8 + "%s"%nowtime)
for epoch in range(1,epochs+1):

    #1, training cycle-------------------------------------------------
    net.train()
    loss_sum = 0.0
    metric_sum = 0.0
    step = 1

    for step, (features,labels) in enumerate(dl_train, 1):

        # Gradient clear
        optimizer.zero_grad()

        # Forward propagation for loss
        predictions = net(features)
        loss = loss_func(predictions,labels)
        metric = metric_func(predictions,labels)

        # Backpropagation for gradient, then update the parameters
        loss.backward()
        optimizer.step()

        # Print batch level log (running averages over the epoch so far)
        loss_sum += loss.item()
        # float() works whether the metric is a NumPy scalar or a plain float.
        metric_sum += float(metric)
        if step%log_step_freq == 0:
            print(("[step = %d] loss: %.3f, "+metric_name+": %.3f") %
                  (step, loss_sum/step, metric_sum/step))

    #2, verification loop-------------------------------------------------
    net.eval()
    val_loss_sum = 0.0
    val_metric_sum = 0.0
    val_step = 1

    for val_step, (features,labels) in enumerate(dl_valid, 1):
        # Close gradient calculation
        with torch.no_grad():
            predictions = net(features)
            val_loss = loss_func(predictions,labels)
            val_metric = metric_func(predictions,labels)
        val_loss_sum += val_loss.item()
        val_metric_sum += float(val_metric)

    #3, record the log-------------------------------------------------
    info = (epoch, loss_sum/step, metric_sum/step,
            val_loss_sum/val_step, val_metric_sum/val_step)
    dfhistory.loc[epoch-1] = info

    # Print epoch level log
    print(("\nEPOCH = %d, loss = %.3f, " + metric_name +
           " = %.3f, val_loss = %.3f, " + "val_" + metric_name + " = %.3f")
          % info)
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n"+"=========="*8 + "%s"%nowtime)

print('Finished Training...')
Start Training...
================================================= ==============================2020-06-17 20:53:49
[step = 30] loss: 0.703, accuracy: 0.583
[step = 60] loss: 0.629, accuracy: 0.675
EPOCH = 1, loss = 0.643, accuracy = 0.673, val_loss = 0.621, val_accuracy = 0.725
================================================= ==============================2020-06-17 20:53:49
[step = 30] loss: 0.653, accuracy: 0.662
[step = 60] loss: 0.624, accuracy: 0.673
EPOCH = 2, loss = 0.621, accuracy = 0.669, val_loss = 0.519, val_accuracy = 0.708
================================================= ==============================2020-06-17 20:53:49
[step = 30] loss: 0.582, accuracy: 0.688
[step = 60] loss: 0.555, accuracy: 0.723
EPOCH = 3, loss = 0.543, accuracy = 0.740, val_loss = 0.516, val_accuracy = 0.741
================================================= ==============================2020-06-17 20:53:49
[step = 30] loss: 0.563, accuracy: 0.721
[step = 60] loss: 0.528, accuracy: 0.752
EPOCH = 4, loss = 0.515, accuracy = 0.764, val_loss = 0.471, val_accuracy = 0.777
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.433, accuracy: 0.783
[step = 60] loss: 0.477, accuracy: 0.785
EPOCH = 5, loss = 0.489, accuracy = 0.785, val_loss = 0.447, val_accuracy = 0.804
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.460, accuracy: 0.812
[step = 60] loss: 0.477, accuracy: 0.798
EPOCH = 6, loss = 0.474, accuracy = 0.798, val_loss = 0.451, val_accuracy = 0.772
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.516, accuracy: 0.792
[step = 60] loss: 0.496, accuracy: 0.779
EPOCH = 7, loss = 0.473, accuracy = 0.794, val_loss = 0.485, val_accuracy = 0.783
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.472, accuracy: 0.779
[step = 60] loss: 0.487, accuracy: 0.794
EPOCH = 8, loss = 0.474, accuracy = 0.791, val_loss = 0.446, val_accuracy = 0.788
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.492, accuracy: 0.771
[step = 60] loss: 0.445, accuracy: 0.800
EPOCH = 9, loss = 0.464, accuracy = 0.796, val_loss = 0.519, val_accuracy = 0.746
================================================= ==============================2020-06-17 20:53:50
[step = 30] loss: 0.436, accuracy: 0.796
[step = 60] loss: 0.460, accuracy: 0.794
EPOCH = 10, loss = 0.462, accuracy = 0.787, val_loss = 0.415, val_accuracy = 0.810
================================================= ==============================2020-06-17 20:53:51
Finished Training...
We first evaluate the effect of the model on the training set and validation set.
%matplotlib inline
%config InlineBackend.figure_format ='svg'
import matplotlib.pyplot as plt
def plot_metric(dfhistory, metric):
    """Plot the training and validation curves of one metric per epoch.

    Args:
        dfhistory: DataFrame with one row per epoch, containing a column
            named ``metric`` and a column named ``'val_' + metric``.
        metric: Name of the metric column to plot, e.g. ``"loss"``.
    """
    train_metrics = dfhistory[metric]
    val_metrics = dfhistory['val_' + metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    # Trailing space keeps the metric name from being glued onto "validation".
    plt.title('Training and validation ' + metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_" + metric, 'val_' + metric])
#Prediction probability for the first ten test rows
y_pred_probs = net(torch.tensor(x_test[0:10]).float()).data
#Prediction category: 1 where the probability exceeds 0.5, otherwise 0
y_pred = torch.where(y_pred_probs>0.5,
        torch.ones_like(y_pred_probs),torch.zeros_like(y_pred_probs))
Pytorch has two ways to save models, both of which are implemented by calling the pickle serialization method.
The first method only saves the model parameters.
The second method saves the complete model.
The first method is recommended, and the second method may cause various problems when switching devices and directories.
1, save model parameters (recommended)
# Save model parameters
torch.save(net.state_dict(), "./data/net_parameter.pkl")

# Rebuild the same architecture and load the saved parameters into it.
net_clone = create_net()
net_clone.load_state_dict(torch.load("./data/net_parameter.pkl"))
# 2, save the complete model (not recommended)
torch.save(net, './data/net_model.pkl')
# NOTE(review): recent PyTorch releases load with weights_only=True by
# default, which rejects a fully pickled model; passing weights_only=False
# may be required there -- confirm against the installed version.
net_loaded = torch.load('./data/net_model.pkl')
