# A04.py

import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import v2
import cv2
import numpy as np
import os
import sys
from prettytable import PrettyTable
from PrettyText import *

# early stopping
# https://stackoverflow.com/a/73704579
class EarlyStopper:
    """Decide when training should stop because validation loss stopped improving.

    The lowest validation loss seen so far is remembered. A loss strictly
    below it resets the patience counter; a loss more than ``min_delta``
    above it increments the counter. Anything in between leaves the
    counter untouched.

    Args:
        patience: Number of counter increments that triggers a stop.
        min_delta: Margin above the best loss that is tolerated before a
            result counts as "worse".
    """

    def __init__(self, patience:int=1, min_delta:int=0):
        self.min_validation_loss = float('inf')
        self.counter = 0
        self.patience = patience
        self.min_delta = min_delta

    @staticmethod
    def _announce(*message):
        """Print ``message`` behind the highlighted "early stop:" tag."""
        print(pretty_text("early stop:",bg="yellow",bold=True), *message)

    def early_stop(self, validation_loss):
        """Record ``validation_loss`` and report whether patience has run out.

        Args:
            validation_loss: Loss measured after the latest epoch.

        Returns:
            True once the counter has reached ``patience``, otherwise False.
        """
        # New best: remember it and start counting from scratch.
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            self._announce("reset patience counter")
            return False

        # Only a loss beyond the tolerated margin counts against patience.
        tolerated_ceiling = self.min_validation_loss + self.min_delta
        if validation_loss > tolerated_ceiling:
            self.counter += 1
            self._announce("patience counter increased to", self.counter)
            if self.counter >= self.patience:
                self._announce("out of patience!")
                return True
        return False

# attempt at AlexNet mentioned in the in-class slides.
# layers are kept effectively 2D: each Conv3d uses a (1, k, k) kernel, so frames are never mixed in time
# with help from the following:
#   detailed layer breakdown - https://www.kaggle.com/code/blurredmachine/alexnet-architecture-a-complete-guide
#   tutorial for pytorch - https://www.digitalocean.com/community/tutorials/alexnet-pytorch
class ALEX(nn.Module):
    """AlexNet-style video classifier built from per-frame convolutions.

    Every convolution and pooling layer uses a temporal extent of 1, so the
    spatial AlexNet pipeline is applied to each frame independently. The
    convolutions do pad the time axis by 1 on each side while using a
    temporal kernel of 1, so each of the five convolutions lengthens the
    time axis by 2.

    Args:
        class_cnt: Number of output classes (width of the final layer).
        batch_size: Used here as the width of the hidden fully connected
            layer, not as a batch dimension.
    """

    @staticmethod
    def _conv_bn_relu(in_ch, out_ch, kernel, stride, padding):
        """Return one ``[Conv3d, BatchNorm3d, ReLU]`` stage as a list."""
        return [
            nn.Conv3d(in_ch, out_ch, kernel, stride=stride, padding=padding),
            nn.BatchNorm3d(out_ch),
            nn.ReLU(),
        ]

    @staticmethod
    def _spatial_pool():
        """Return the 3x3 / stride-2 max pool that leaves the time axis alone."""
        return nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2))

    def __init__(self, class_cnt:int, batch_size:int):
        super().__init__()
        unit_stride = (1,1,1)

        # Layers are collected in a flat list (then a ModuleList) so that
        # forward() can walk them one by one; the positions also define the
        # parameter names in the state dict.
        stages = []
        # convolution 1 + max pool
        stages += self._conv_bn_relu(3, 96, (1,11,11), (1,4,4), (1,1,1))
        stages.append(self._spatial_pool())
        # convolution 2 + max pool
        stages += self._conv_bn_relu(96, 256, (1,5,5), unit_stride, (1,2,2))
        stages.append(self._spatial_pool())
        # convolutions 3, 4 and 5 (no pooling in between)
        stages += self._conv_bn_relu(256, 384, (1,3,3), unit_stride, (1,1,1))
        stages += self._conv_bn_relu(384, 384, (1,3,3), unit_stride, (1,1,1))
        stages += self._conv_bn_relu(384, 256, (1,3,3), unit_stride, (1,1,1))
        # final max pool, then dropout before the classifier
        stages.append(self._spatial_pool())
        stages.append(nn.Dropout(0.5))
        self.feature_extract = nn.ModuleList(stages)

        # Classifier head.
        self.flatten = nn.Flatten()
        # Hard-coded flattened feature count (the original notes 4096 for a
        # 2D AlexNet). It equals 256 * 40 * 2 * 4, which is what a 30-frame
        # 100x180 clip produces -- TODO confirm against the data loader.
        flat_features = 81920
        self.classifier_stack = nn.Sequential(
            nn.Linear(flat_features, batch_size),  # fc6
            nn.ReLU(),
            nn.Linear(batch_size, class_cnt),      # fc7
        )

    def forward(self, x):
        """Return class logits for a batch of clips.

        Args:
            x: Tensor whose dims 1 and 2 are swapped before the
                convolutions; per the original comments this is
                (batch, time, channels, height, width).

        Returns:
            Output of the classifier head, one row per batch element.
        """
        # (b, t, c, h, w) -> (b, c, t, h, w), the layout Conv3d expects.
        features = x.transpose(1, 2)
        for stage in self.feature_extract:
            features = stage(features)
        return self.classifier_stack(self.flatten(features))

class CNN(nn.Module):
    """Small 3D-convolutional video classifier with two selectable layouts.

    * ``version == 0``: three strided ``Conv3d`` + ReLU stages; stride 2 in
      every dimension takes the place of pooling. Padding is (1, 2, 2),
      i.e. it is applied in time as well as in space.
    * ``version == 1``: three ``Conv3d`` + ReLU stages with 2x2x2 max
      pooling after the first two, followed by a ``ConvTranspose3d``,
      a sigmoid and a last max pool.

    Args:
        class_cnt: Number of output classes (width of the final layer).
        batch_size: Used here as the width of the hidden fully connected
            layer, not as a batch dimension.
        version: Layout selector, 0 or 1.

    Raises:
        ValueError: If ``version`` is neither 0 nor 1. Previously such a
            value produced a module without any layers that only failed
            later, inside ``forward``.
    """

    def __init__(self, class_cnt:int, batch_size:int, version : int):
        super().__init__()
        if version == 0:
            layers = [
                nn.Conv3d(3, 16, (3,7,7), padding=(1,2,2), stride=(2,2,2)),
                nn.ReLU(),
                nn.Conv3d(16, 32, (3,7,7), padding=(1,2,2), stride=(2,2,2)),
                nn.ReLU(),
                nn.Conv3d(32, 64, (3,7,7), padding=(1,2,2), stride=(2,2,2)),
                nn.ReLU(),
            ]
            # 64 * 4 * 11 * 21, the flattened size a 30-frame 100x180 clip
            # yields with this layout -- TODO confirm against the data loader.
            flat_features = 59136
        elif version == 1:
            layers = [
                nn.Conv3d(3, 16, (3,5,5), padding=(2,2,2)),
                nn.ReLU(),
                nn.MaxPool3d((2,2,2)),
                nn.Conv3d(16, 32, (3,5,5), padding=(2,2,2)),
                nn.ReLU(),
                nn.MaxPool3d((2,2,2)),
                nn.Conv3d(32, 64, (3,5,5), padding=(2,2,2)),
                nn.ReLU(),
                nn.ConvTranspose3d(64, 64, (3,5,5), padding=(2,2,2)),
                nn.Sigmoid(),
                nn.MaxPool3d((2,2,2)),
            ]
            # 64 * 4 * 12 * 22, the flattened size a 30-frame 100x180 clip
            # yields with this layout -- TODO confirm against the data loader.
            flat_features = 67584
        else:
            raise ValueError(
                f"Unsupported CNN version {version!r}; expected 0 or 1."
            )

        # Kept as a ModuleList (not Sequential) so forward() can walk the
        # layers individually.
        self.feature_extract = nn.ModuleList(layers)

        # Classifier head, identical for both layouts apart from its input width.
        self.flatten = nn.Flatten()
        self.classifier_stack = nn.Sequential(
            nn.Linear(flat_features, batch_size),
            nn.ReLU(),
            nn.Linear(batch_size, class_cnt),
        )

    def forward(self, x):
        """Return class logits for a batch of clips.

        Args:
            x: Tensor whose dims 1 and 2 are swapped before the
                convolutions; per the original comments this is
                (batch, time, channels, height, width).

        Returns:
            Output of the classifier head, one row per batch element.
        """
        # (b, t, c, h, w) -> (b, c, t, h, w), the layout Conv3d expects.
        x = torch.transpose(x, 1, 2)
        for layer in self.feature_extract:
            x = layer(x)
        x = self.flatten(x)
        return self.classifier_stack(x)

    
class RNN(nn.Module):
    """Convolutional feature extractor followed by a bidirectional RNN.

    Four ``Conv3d`` stages (all with "same" padding) are interleaved with
    adaptive pooling layers that each reduce the clip to an output size of
    (1, 2, 2). Because the first of those pools already reduces the time
    axis to length 1, the recurrent layer receives a one-step sequence.

    Args:
        class_cnt: Number of output classes (width of the final layer).
        batch_size: Accepted for signature parity with the other models;
            not used by this class.
    """

    def __init__(self, class_cnt:int, batch_size:int):
        super().__init__()
        pooled_shape = (1,2,2)
        cube = (3,3,3)

        stages = []
        # conv 1 + normalization + ReLU
        stages.extend([
            nn.Conv3d(in_channels=3, out_channels=8, kernel_size=(3,9,9), padding="same"),
            nn.BatchNorm3d(8),
            nn.ReLU(),
        ])
        # conv 2 + normalization + ReLU
        stages.extend([
            nn.Conv3d(8, 64, cube, padding="same"),
            nn.BatchNorm3d(64),
            nn.ReLU(),
        ])
        # pool 1 (adaptive max)
        stages.append(nn.AdaptiveMaxPool3d(pooled_shape))
        # conv 3 + ReLU (no normalization on this stage)
        stages.extend([
            nn.Conv3d(64, 256, cube, padding="same"),
            nn.ReLU(),
        ])
        # pool 2 (adaptive average)
        stages.append(nn.AdaptiveAvgPool3d(pooled_shape))
        # conv 4 + normalization + sigmoid; the author chose sigmoid because
        # the task is effectively binary (walking vs. running)
        stages.extend([
            nn.Conv3d(256, 256, cube, padding="same"),
            nn.BatchNorm3d(256),
            nn.Sigmoid(),
        ])
        # pool 3 (adaptive max)
        stages.append(nn.AdaptiveMaxPool3d(pooled_shape))
        self.feature_extract = nn.ModuleList(stages)

        # Keep batch and time, merge (c, h, w) into one feature axis.
        self.flatten = nn.Flatten(start_dim=2)

        # 256 channels * 2 * 2 after the final pool.
        expected_size = 1024

        self.rnn = nn.RNN(input_size=expected_size,
                          hidden_size=expected_size,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=True)

        # Bidirectional output concatenates both directions: 2 * 1024 = 2048.
        self.classifier_stack = nn.Sequential(
            nn.Linear(2048, class_cnt)
        )

    def forward(self, x):
        """Return class logits for a batch of clips.

        Args:
            x: Tensor whose dims 1 and 2 are swapped before the
                convolutions; per the original comments this is
                (batch, time, channels, height, width).

        Returns:
            Classifier output computed from the RNN output at the last
            time step.
        """
        PRINT_DEBUG = False
        # (b, t, c, h, w) -> (b, c, t, h, w) for the 3D convolutions.
        clip = x.transpose(1, 2)
        for stage in self.feature_extract:
            clip = stage(clip)
        if PRINT_DEBUG: print("FEATURES:", clip.shape)
        # Back to (b, t, c, h, w), then flatten to (b, t, c*h*w) for the RNN.
        sequence = self.flatten(clip.transpose(1, 2))
        if PRINT_DEBUG: print("FLATTENED:", sequence.shape)
        out, _ = self.rnn(sequence)
        if PRINT_DEBUG: print("OUT:", out.shape)
        last_step = out[:,-1,:]
        return self.classifier_stack(last_step)

def get_approach_names():
    """Return the names of all supported approaches as a fresh list."""
    ordered = ("CNN0", "CNN1", "RNN", "ALEX")
    return list(ordered)

def get_approach_description(approach_name):
    """Return the human-readable description of an approach.

    Args:
        approach_name: One of the approach names ("CNN0", "CNN1", "RNN",
            "ALEX").

    Returns:
        The description string, or a formatted "Invalid Approach" error
        message when the name is unknown. The error message is only built
        when it is actually needed; valid lookups no longer pay for it.
    """
    desc = {
        "CNN0":"CNN w/ true 3d convolution (3,7,7), padding only in space, & 3d stride instead of pooling. Training Data is augmented with randomly applied grayscale, h/v flips, and solarization. Strange results, test accuracy never goes beyond 68.5%. Possible it's learning to always guess more frequently appearing value.",
        "CNN1":"CNN w/ true 3d convolution (3,5,5), 3d padding (2,2,2), pooling, and an additional ConvTranspose3d layer. Training Data is augmented with randomly applied grayscale, h/v flips, and solarization. Strange results, test accuracy never goes beyond 68.5%. Possible it's learning to always guess more frequently appearing value.",
        "RNN":"Modified RNNVideoNet example from class and added some layers encountered when implementing ALEX. Kernel size (3,3,3), BiDirectional RNN. Training Data is augmented with randomly applied grayscale, h/v flips, and solarization. Very, very poor results, but I ran out of time to experiment with it.",
        "ALEX":"Attempting to use in-class slides & online resources to implement AlexNet for video. Still uses 2D convolutions, did not experiment with true 3D convolutions. Possibly encountering overfitting as the training accuracy hit 1.000 even though it never triggers the Early Stop. Retraining on non-augmented data resulted in higher accuracy. It has the best accuracy of all 4 approaches."
    }
    try:
        return desc[approach_name]
    except KeyError:
        # Unknown name: fall back to a highlighted error message.
        return ''.join([pretty_text("ERROR!","white",bold=True,underline=True)," ",pretty_text("Invalid Approach",bold=True)])

def get_data_transform(approach_name, training):
    """Build the torchvision v2 transform pipeline for an approach.

    Every pipeline converts to an image tensor, scales to float32 and ends
    with a resize to 100x180 (height, width). Random augmentations are
    inserted before the resize only for training data of approaches other
    than "ALEX".

    Args:
        approach_name: Approach the data is prepared for.
        training: Truthy when the pipeline is for the training split.

    Returns:
        A ``v2.Compose`` with the selected steps.
    """
    # height, width -- kept small because of the author's 8 GB GPU.
    target_size = (100,180)

    steps = [
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
    ]

    # ALEX is trained on non-augmented data; per the original note this is
    # the only difference between the two ALEX models provided.
    if training and approach_name != "ALEX":
        steps += [
            v2.RandomGrayscale(p=0.3),
            # NOTE(review): the first positional argument of RandomSolarize
            # is the threshold, not the probability (p keeps its default of
            # 0.5). Behaviour is preserved here -- confirm 0.3 was meant as
            # the threshold.
            v2.RandomSolarize(threshold=0.3),
            v2.RandomHorizontalFlip(p=0.3),
            v2.RandomVerticalFlip(p=0.3),
        ]

    steps.append(v2.Resize(target_size))
    return v2.Compose(steps)

def get_batch_size(approach_name):
    """Return the batch size configured for an approach (32 if unknown)."""
    fallback = 32
    configured = {
        "CNN0": 25,
        "CNN1": 25,
        "RNN": 25,
        "ALEX": 25,
    }
    try:
        return configured[approach_name]
    except KeyError:
        return fallback

def create_model(approach_name, class_cnt):
    """Instantiate the network that belongs to an approach name.

    Args:
        approach_name: "CNN0", "CNN1", "RNN" or "ALEX".
        class_cnt: Number of output classes passed to the model.

    Returns:
        The freshly constructed model, or None (after printing an error)
        when the approach name is not recognised.
    """
    if approach_name == "CNN0":
        return CNN(class_cnt, get_batch_size("CNN0"), 0)
    if approach_name == "CNN1":
        return CNN(class_cnt, get_batch_size("CNN1"), 1)
    if approach_name == "RNN":
        return RNN(class_cnt, get_batch_size("RNN"))
    if approach_name == "ALEX":
        return ALEX(class_cnt, get_batch_size("ALEX"))

    # Nothing matched: report the problem and hand back None.
    error_tag = pretty_text("ERROR!",bg="red",bold=True)
    error_body = pretty_text("Invalid Approach Provided.",bold=True)
    print(''.join([error_tag, " ", error_body]))
    return None

def train_one_epoch(dataloader, model, loss_fn, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(inputs, <ignored>, labels)`` batches that
            also exposes ``.dataset`` (HMDB-style triples per the original
            comment).
        model: Network to train; switched to training mode.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning the loss.
        optimizer: Optimizer that updates ``model``.
        device: Device the batches are moved to.

    Returns:
        None. The loss is printed every 100th batch.
    """
    total_samples = len(dataloader.dataset)
    model.train()
    for step, (clips, _, targets) in enumerate(dataloader):
        clips = clips.to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss.
        batch_loss = loss_fn(model(clips), targets)

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue
        # Progress estimate: assumes all batches so far had this batch's size.
        seen = (step + 1) * len(clips)
        loss_value = batch_loss.item()
        print(f"loss: {loss_value:>7f}  [{seen:>5d}/{total_samples:>5d}]")
            
def test_one_epoch(dataloader, model, loss_fn, data_name, device):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        # For HMDB: X, _, y
        for input, _, label in dataloader:            
            input, label = input.to(device), label.to(device)
            pred = model(input)
            test_loss += loss_fn(pred, label).item()
            correct += (pred.argmax(1) == label).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(data_name + f" Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss

def train_model(approach_name, model, device, train_dataloader, test_dataloader, *, min_delta=10):
    """Train ``model`` with the schedule configured for ``approach_name``.

    Each epoch runs one training pass followed by one evaluation pass on
    ``test_dataloader``; the evaluation loss feeds an ``EarlyStopper``.

    Args:
        approach_name: Selects epoch count, patience and optimizer.
            "ALEX" uses SGD with momentum, everything else uses Adam.
        model: Network to train.
        device: Device the batches are moved to.
        train_dataloader: Loader used for the optimisation pass.
        test_dataloader: Loader used for the per-epoch evaluation.
        min_delta: Margin handed to ``EarlyStopper``; the loss must exceed
            the best loss by more than this to count against patience.
            Defaults to the previously hard-coded 10.
            NOTE(review): 10 is a very wide margin for a cross-entropy
            loss, so with the default the stopper will presumably fire
            only when the loss blows up -- consider a much smaller value.

    Returns:
        The same ``model`` object after training.
    """
    # Epoch budget per approach; RNN is capped at 15 because the author
    # ran out of memory around epoch 17.
    epochs_by_approach = {
        "CNN0": 32,
        "CNN1": 50,
        "RNN": 15,
        "ALEX": 100,
    }
    epochs = epochs_by_approach.get(approach_name, 32)
    patience = 2 if approach_name == "ALEX" else 4

    early_stopper = EarlyStopper(patience=patience, min_delta=min_delta)

    # Loss function and optimizer.
    loss_fn = nn.CrossEntropyLoss()
    if approach_name == "ALEX":
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
    else:
        # Higher learning rate for the self-made networks.
        optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_one_epoch(train_dataloader, model, loss_fn, optimizer, device)
        validation_loss = test_one_epoch(test_dataloader, model, loss_fn, "Test", device)
        if early_stopper.early_stop(validation_loss):
            print(pretty_text("Stopping early!",bg="yellow",bold=True),"trying to avoid over overfit")
            break
    return model

def main():
    """Print library versions and a description of every approach."""
    versions = (
        ("torch:", torch.__version__),
        ("torchvision:", torchvision.__version__),
    )
    for label, version in versions:
        print(pretty_text(label,bg="yellow",bold=True), version)

    print(pretty_text("attempted approaches:",bg="blue",bold=True))
    for name in get_approach_names():
        heading = pretty_text(name + ":",color="blue",italic=True)
        summary = get_approach_description(name)
        print(heading, summary)


# Only run the overview when executed as a script, not on import.
if __name__ == "__main__":
    main()