import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, random_split, DataLoader
from torchvision import datasets, transforms, models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, random_split, DataLoader
from torchvision.utils import save_image
from torchsummary import summary
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
import math
from PIL import Image
import glob
from IPython.display import display
Handwritten Image Classification with Vision Transformers (ViT)
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
cuda
# Fix the PyTorch and NumPy random seeds so repeated runs start from the
# same random state.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
# Training hyperparameters.
BATCH_SIZE = 200   # samples per mini-batch for both data loaders
LR = 5e-5          # learning rate passed to the optimizer
NUM_EPOCHES = 10   # number of full passes over the training set
# Per-channel normalisation statistics; one entry each because the images
# have a single channel.
mean, std = (0.5,), (0.5,)
# Preprocessing applied to every image: convert to a tensor, then
# normalise each channel as (x - mean) / std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
# MNIST train/test splits (downloaded on first use) and their loaders.
# Only the training loader shuffles; the test loader keeps a fixed order.
DATA_ROOT = '../data/MNIST/'

trainset = datasets.MNIST(DATA_ROOT, download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)

testset = datasets.MNIST(DATA_ROOT, download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/MNIST/raw/train-images-idx3-ubyte.gz
Extracting ../data/MNIST/MNIST/raw/train-images-idx3-ubyte.gz to ../data/MNIST/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/MNIST/raw/train-labels-idx1-ubyte.gz
Extracting ../data/MNIST/MNIST/raw/train-labels-idx1-ubyte.gz to ../data/MNIST/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/MNIST/raw/t10k-images-idx3-ubyte.gz
Extracting ../data/MNIST/MNIST/raw/t10k-images-idx3-ubyte.gz to ../data/MNIST/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/MNIST/raw/t10k-labels-idx1-ubyte.gz
Extracting ../data/MNIST/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/MNIST/MNIST/raw
100%|██████████| 9912422/9912422 [00:00<00:00, 132524476.35it/s]
100%|██████████| 28881/28881 [00:00<00:00, 40378564.61it/s]
100%|██████████| 1648877/1648877 [00:00<00:00, 38440411.96it/s]
100%|██████████| 4542/4542 [00:00<00:00, 22842360.63it/s]
Model
# Notebook shell escape (IPython `!` syntax, not plain Python): installs the
# third-party package that provides the ViT implementation.
!pip install transformer-implementations
# ViT is the model class instantiated later in this file.
from transformer_package.models import ViT
Collecting transformer-implementations
Downloading transformer_implementations-0.0.9-py3-none-any.whl (9.4 kB)
Installing collected packages: transformer-implementations
Successfully installed transformer-implementations-0.0.9
# ViT configuration for 28x28 single-channel images.
image_size = 28     # input height/width in pixels
channel_size = 1    # number of image channels
patch_size = 7      # 7x7 patches -> 49 values per flattened patch
embed_size = 512    # width of the patch embedding / encoder layers
num_heads = 4       # attention heads per encoder layer
classes = 10        # number of output classes
num_layers = 2      # number of stacked encoder blocks
hidden_size = 256   # NOTE(review): role inside ViT not visible here; the
                    # printed model shows a 2048-wide MLP - confirm usage
dropout = 0.2       # dropout probability used throughout the model
# Build the Vision Transformer from the configuration values and move it to
# the selected device.
model = ViT(
    image_size,
    channel_size,
    patch_size,
    embed_size,
    num_heads,
    classes,
    num_layers,
    hidden_size,
    dropout=dropout,
).to(device)

# Show the module structure; print() works both in a notebook and a script.
print(model)
ViT(
(dropout_layer): Dropout(p=0.2, inplace=False)
(embeddings): Linear(in_features=49, out_features=512, bias=True)
(encoders): ModuleList(
(0-1): 2 x VisionEncoder(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attention): MultiHeadAttention(
(dropout_layer): Dropout(p=0.2, inplace=False)
(Q): Linear(in_features=512, out_features=512, bias=True)
(K): Linear(in_features=512, out_features=512, bias=True)
(V): Linear(in_features=512, out_features=512, bias=True)
(linear): Linear(in_features=512, out_features=512, bias=True)
)
(mlp): Sequential(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.2, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.2, inplace=False)
)
)
)
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(classifier): Sequential(
(0): Linear(in_features=512, out_features=10, bias=True)
)
)
# Shape sanity check: push the first training batch through the model and
# print the input, label and output dimensions. Only one batch is needed,
# hence the unconditional break.
for img, label in trainloader:
    img = img.to(device)
    label = label.to(device)

    print("Input Image Dimensions: {}".format(img.size()))
    print("Label Dimensions: {}".format(label.size()))
    print("-"*100)

    # No gradients are needed just to inspect the output shape.
    with torch.no_grad():
        out = model(img)

    print("Output Dimensions: {}".format(out.size()))
    break
Input Image Dimensions: torch.Size([200, 1, 28, 28])
Label Dimensions: torch.Size([200])
----------------------------------------------------------------------------------------------------
Output Dimensions: torch.Size([200, 10])
# Negative log-likelihood loss: expects log-probabilities as input.
# NOTE(review): assumes ViT.forward already applies log-softmax - confirm
# against the transformer_package implementation.
criterion = nn.NLLLoss()

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)
# Per-epoch training metrics, appended to by the training loop and read by
# the plotting code.
loss_hist = {
    "train accuracy": [],
    "train loss": [],
}
# Training loop: one optimisation pass over `trainloader` per epoch.
# After each epoch the mean batch loss and the training accuracy (in %) are
# appended to `loss_hist` and printed.
for epoch in range(1, NUM_EPOCHES + 1):
    model.train()

    running_loss = 0.0   # sum of the per-batch losses in this epoch
    num_batches = 0      # batches seen so far in this epoch

    y_true_train = []
    y_pred_train = []

    # `ip` is the 1-based step counter within the epoch.
    for ip, (img, labels) in enumerate(trainloader, start=1):
        img = img.to(device)
        labels = labels.to(device)

        preds = model(img)
        loss = criterion(preds, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest score along the last axis.
        y_pred_train.extend(preds.detach().argmax(dim=-1).tolist())
        y_true_train.extend(labels.detach().tolist())

        running_loss += loss.item()
        num_batches = ip

        if ip % 100 == 0:
            # Both counters are integers, so print them as such.
            print("Epoch: {} Step: {}".format(epoch, ip))

    # Report the mean of the batch losses, matching the printed label; the
    # guard keeps an empty loader from raising ZeroDivisionError.
    epoch_train_loss = running_loss / num_batches if num_batches else 0.0
    loss_hist["train loss"].append(epoch_train_loss)

    total_correct = sum(1 for x, y in zip(y_pred_train, y_true_train) if x == y)
    total = len(y_pred_train)
    accuracy = total_correct * 100 / total if total else 0.0

    loss_hist["train accuracy"].append(accuracy)

    print("-------------------------------------------------")
    print("Epoch: {} Train mean loss: {:.8f}".format(epoch, epoch_train_loss))
    print(" Train Accuracy%: ", accuracy, "==", total_correct, "/", total)
    print("-------------------------------------------------")
Step: 1.00000000 100.00000000
Step: 1.00000000 200.00000000
Step: 1.00000000 300.00000000
-------------------------------------------------
Epoch: 1 Train mean loss: 323.08200696
Train Accuracy%: 63.598333333333336 == 38159 / 60000
-------------------------------------------------
Step: 2.00000000 100.00000000
Step: 2.00000000 200.00000000
Step: 2.00000000 300.00000000
-------------------------------------------------
Epoch: 2 Train mean loss: 131.86424088
Train Accuracy%: 86.045 == 51627 / 60000
-------------------------------------------------
Step: 3.00000000 100.00000000
Step: 3.00000000 200.00000000
Step: 3.00000000 300.00000000
-------------------------------------------------
Epoch: 3 Train mean loss: 102.83779885
Train Accuracy%: 89.11833333333334 == 53471 / 60000
-------------------------------------------------
Step: 4.00000000 100.00000000
Step: 4.00000000 200.00000000
Step: 4.00000000 300.00000000
-------------------------------------------------
Epoch: 4 Train mean loss: 87.88007259
Train Accuracy%: 90.72666666666667 == 54436 / 60000
-------------------------------------------------
Step: 5.00000000 100.00000000
Step: 5.00000000 200.00000000
Step: 5.00000000 300.00000000
-------------------------------------------------
Epoch: 5 Train mean loss: 78.15140389
Train Accuracy%: 91.76166666666667 == 55057 / 60000
-------------------------------------------------
Step: 6.00000000 100.00000000
Step: 6.00000000 200.00000000
Step: 6.00000000 300.00000000
-------------------------------------------------
Epoch: 6 Train mean loss: 69.25521898
Train Accuracy%: 92.69333333333333 == 55616 / 60000
-------------------------------------------------
Step: 7.00000000 100.00000000
Step: 7.00000000 200.00000000
Step: 7.00000000 300.00000000
-------------------------------------------------
Epoch: 7 Train mean loss: 63.42148008
Train Accuracy%: 93.325 == 55995 / 60000
-------------------------------------------------
Step: 8.00000000 100.00000000
Step: 8.00000000 200.00000000
Step: 8.00000000 300.00000000
-------------------------------------------------
Epoch: 8 Train mean loss: 58.87239636
Train Accuracy%: 93.83333333333333 == 56300 / 60000
-------------------------------------------------
Step: 9.00000000 100.00000000
Step: 9.00000000 200.00000000
Step: 9.00000000 300.00000000
-------------------------------------------------
Epoch: 9 Train mean loss: 54.04052846
Train Accuracy%: 94.25166666666667 == 56551 / 60000
-------------------------------------------------
Step: 10.00000000 100.00000000
Step: 10.00000000 200.00000000
Step: 10.00000000 300.00000000
-------------------------------------------------
Epoch: 10 Train mean loss: 50.72075617
Train Accuracy%: 94.59333333333333 == 56756 / 60000
-------------------------------------------------
# Test

# Training curves, one figure per metric recorded in `loss_hist`.
# The x axis is 1-based so that tick N corresponds to epoch N.

# Training accuracy (in %) per epoch.
accuracy_curve = loss_hist["train accuracy"]
plt.plot(range(1, len(accuracy_curve) + 1), accuracy_curve)
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.show()

# Training loss per epoch.
loss_curve = loss_hist["train loss"]
plt.plot(range(1, len(loss_curve) + 1), loss_curve)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()
# Evaluate on the test set: switch the model to evaluation mode, predict
# every test batch without tracking gradients, and report the accuracy in %.
model.eval()

y_true_test = []
y_pred_test = []

with torch.no_grad():
    for img, labels in testloader:
        img = img.to(device)
        # Move this batch's labels; the loop variable is `labels`.
        labels = labels.to(device)

        preds = model(img)

        # Predicted class = index of the largest score along the last axis.
        y_pred_test.extend(preds.detach().argmax(dim=-1).tolist())
        y_true_test.extend(labels.detach().tolist())

total_correct = sum(1 for x, y in zip(y_pred_test, y_true_test) if x == y)
total = len(y_pred_test)
# Guard against an empty test loader.
accuracy = total_correct * 100 / total if total else 0.0

print("Test Accuracy%: ", accuracy, "==", total_correct, "/", total)
Test Accuracy%: 96.43 == 9643 / 10000