Dataset And Dataloader - PyTorch Beginner 09
Learn all the basics you need to get started with this deep learning framework! In this part we see how to use the built-in Dataset and DataLoader classes to improve our pipeline with batch training. We also see how to write our own Dataset class and how to use the available built-in datasets.
- Dataset and DataLoader
- Automatic batch calculation
- Batch optimization in training loop
All code from this course can be found on GitHub.
Dataset and DataLoader in PyTorch
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
# gradient computation etc. not efficient for whole data set
# -> divide dataset into small batches
'''
# training loop
for epoch in range(num_epochs):
    # loop over all batches
    for i in range(total_batches):
        batch_x, batch_y = ...
'''
# epoch = one forward and backward pass of ALL training samples
# batch_size = number of training samples used in one forward/backward pass
# number of iterations = number of passes, each pass (forward+backward) using [batch_size] number of samples
# e.g.: 100 samples, batch_size=20 -> 100/20 = 5 iterations for 1 epoch
# --> DataLoader can do the batch computation for us
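# Quick illustration of this automatic batching with the built-in TensorDataset
# (a minimal sketch; the random tensors here are made up just for the example):
illustration_x = torch.randn(100, 13)  # 100 samples, 13 features
illustration_y = torch.randn(100, 1)   # 100 labels
illustration_ds = torch.utils.data.TensorDataset(illustration_x, illustration_y)
illustration_loader = DataLoader(dataset=illustration_ds, batch_size=20, shuffle=True)
# 100 samples / batch_size 20 -> 5 iterations per epoch, handled by the loader
for batch_x, batch_y in illustration_loader:
    print(batch_x.shape)  # torch.Size([20, 13])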
# Implement a custom Dataset:
# inherit Dataset
# implement __init__ , __getitem__ , and __len__
class WineDataset(Dataset):

    def __init__(self):
        # Initialize data, download, etc.
        # read with numpy or pandas
        xy = np.loadtxt('./data/wine/wine.csv', delimiter=',', dtype=np.float32, skiprows=1)
        self.n_samples = xy.shape[0]

        # here the first column is the class label, the rest are the features
        self.x_data = torch.from_numpy(xy[:, 1:])  # size [n_samples, n_features]
        self.y_data = torch.from_numpy(xy[:, [0]])  # size [n_samples, 1]

    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples
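# Note that the Dataset protocol only requires __getitem__ and __len__; the data
# does not have to be preloaded into memory. A minimal sketch of a dataset that
# creates each sample on demand (synthetic data, purely for illustration):
class SyntheticDataset(Dataset):

    def __init__(self, n_samples):
        self.n_samples = n_samples

    def __getitem__(self, index):
        # generate the i-th sample lazily instead of indexing a preloaded tensor
        x = torch.full((13,), float(index))
        y = torch.tensor([float(index % 2)])
        return x, y

    def __len__(self):
        return self.n_samples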
# create dataset
dataset = WineDataset()
# get first sample and unpack
first_data = dataset[0]
features, labels = first_data
print(features, labels)
# Load whole dataset with DataLoader
# shuffle: shuffle data, good for training
# num_workers: faster loading with multiple subprocesses
# !!! IF YOU GET AN ERROR DURING LOADING, SET num_workers TO 0 !!!
train_loader = DataLoader(dataset=dataset,
                          batch_size=4,
                          shuffle=True,
                          num_workers=2)
# convert to an iterator and look at one random batch
dataiter = iter(train_loader)
data = next(dataiter)
features, labels = data
print(features, labels)
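# With batch_size=4 and the 13 wine features, the batch shapes should be
# [4, 13] for the features and [4, 1] for the labels (a quick sanity check):
print(features.shape, labels.shape)  # torch.Size([4, 13]) torch.Size([4, 1])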
# Dummy Training loop
num_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/4)
print(total_samples, n_iterations)
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # here: 178 samples, batch_size = 4, n_iters = 178/4 = 44.5 -> 45 iterations
        # Run your training process
        if (i+1) % 5 == 0:
            print(f'Epoch: {epoch+1}/{num_epochs}, Step {i+1}/{n_iterations} | Inputs {inputs.shape} | Labels {labels.shape}')
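# What "Run your training process" could look like in practice (a minimal
# sketch; the linear model, MSE loss, and SGD optimizer below are illustrative
# assumptions, not part of this tutorial's pipeline):
import torch.nn as nn

model = nn.Linear(13, 1)  # 13 wine features -> 1 output
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        outputs = model(inputs)            # forward pass on one batch
        loss = criterion(outputs, labels)
        optimizer.zero_grad()              # reset gradients from the last step
        loss.backward()                    # backward pass
        optimizer.step()                   # update weights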
# some famous datasets are available in torchvision.datasets
# e.g. MNIST, Fashion-MNIST, CIFAR10, COCO
train_dataset = torchvision.datasets.MNIST(root='./data',
                                           train=True,
                                           transform=torchvision.transforms.ToTensor(),
                                           download=True)

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=3,
                          shuffle=True)
# look at one random batch
dataiter = iter(train_loader)
data = next(dataiter)
inputs, targets = data
print(inputs.shape, targets.shape)
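# With batch_size=3, each MNIST batch has inputs of shape [3, 1, 28, 28]
# (batch, channels, height, width) and targets of shape [3]. The same
# epoch/batch training loop from above then works unchanged, for example:
for i, (inputs, targets) in enumerate(train_loader):
    # run your training process here; we stop early for this demo
    if i == 2:
        break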