Data Augmentation: Expanding Training Data for Better ML Models

Learn data augmentation techniques — image augmentation, text augmentation, tabular augmentation, SMOTE for imbalanced data, and using augmentation in training pipelines.

Data Augmentation

Data augmentation artificially expands your training set by creating modified copies of existing data. It’s one of the most effective regularization techniques available — more data is almost always better, and augmentation is cheap when you can’t collect more real examples.


Image Augmentation with torchvision

import torchvision.transforms as T
from torch.utils.data import Dataset
from PIL import Image
# Training-time pipeline: random geometric and colour perturbations are applied
# first, then the image is converted to a tensor and normalized.
train_transforms = T.Compose(
    [
        T.RandomResizedCrop(size=224, scale=(0.8, 1.0)),  # random crop, resized to 224x224
        T.RandomHorizontalFlip(p=0.5),  # mirror half of the images
        T.RandomRotation(15),  # rotate by an angle in [-15, +15] degrees
        T.ColorJitter(
            brightness=0.2,
            contrast=0.2,
            saturation=0.2,
            hue=0.1,
        ),
        T.RandomGrayscale(p=0.1),
        T.ToTensor(),
        # Channel-wise ImageNet mean / std
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Validation/test pipeline: no random transforms. Images are resized,
# center-cropped to 224, converted to a tensor, and normalized with the same
# statistics as the training pipeline.
val_transforms = T.Compose(
    [
        T.Resize(size=256),
        T.CenterCrop(size=224),
        T.ToTensor(),
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

Advanced Image Augmentation with Albumentations

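Albumentations is typically faster than torchvision's PIL-based transforms and offers a wider set of augmentations, including ones that transform masks, bounding boxes, and keypoints together with the image. Unlike torchvision, its pipelines take NumPy arrays and are called with keyword arguments, e.g. `train_transforms(image=img)["image"]`: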

import albumentations as A
from albumentations.pytorch import ToTensorV2
# Albumentations training pipeline. Call it with keyword arguments on a NumPy
# image: train_transforms(image=img)["image"].
# NOTE(review): the argument names used here (height/width, var_limit,
# max_holes, ...) follow the older Albumentations API; confirm them against
# the installed version before use.
train_transforms = A.Compose([
    A.RandomResizedCrop(height=224, width=224, scale=(0.8, 1.0)),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=15, p=0.5),
    # With probability 0.3, apply exactly one noise/blur transform.
    A.OneOf([
        A.GaussNoise(var_limit=(10, 50)),
        A.GaussianBlur(blur_limit=3),
        A.MotionBlur(blur_limit=3),
    ], p=0.3),
    # With probability 0.2, apply exactly one non-rigid distortion.
    A.OneOf([
        A.GridDistortion(p=1.0),
        A.ElasticTransform(p=1.0),
    ], p=0.2),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

# Cutout-style regularization: CoarseDropout erases up to 8 rectangular
# regions of at most 32x32 pixels. This is NOT CutMix or MixUp -- those blend
# pairs of samples (and their labels) and are applied per batch in the
# training loop rather than as a per-image transform.
# Bound to a name so it can be inserted into a Compose pipeline (place it
# before A.Normalize); it is not part of train_transforms above.
cutout = A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.3)

Text Augmentation

import random
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
# Each example below rebinds `augmented`, so after this snippet runs only the
# back-translation result is still available under that name.
# NOTE(review): depending on the nlpaug version, augment() may return a list
# of strings rather than a single string -- confirm before indexing.

# Synonym replacement; aug_p=0.3 is passed to nlpaug as the augmentation rate
# (presumably the fraction of words eligible for replacement -- confirm).
aug_synonym = naw.SynonymAug(aug_p=0.3)
text = "The model performs well on image classification tasks"
augmented = aug_synonym.augment(text)
# Random word-level edit; this instance is configured for action='swap' only.
# A separate RandomWordAug is needed for each other action.
aug_random = naw.RandomWordAug(action='swap', aug_p=0.2)
augmented = aug_random.augment(text)
# Back-translation (translate to French, then back to English)
# NOTE(review): presumably fetches the two Helsinki-NLP translation models on
# first use -- confirm network/cache requirements.
aug_bt = naw.BackTranslationAug(from_model_name='Helsinki-NLP/opus-mt-en-fr',
to_model_name='Helsinki-NLP/opus-mt-fr-en')
augmented = aug_bt.augment(text)
# Easy Data Augmentation (EDA): simple random operations
def _eda_replace(words, synonyms, n_edits):
    """Replace up to ``n_edits`` words that have an entry in ``synonyms``."""
    out = list(words)
    candidates = [i for i, word in enumerate(out) if synonyms.get(word)]
    random.shuffle(candidates)
    for i in candidates[:n_edits]:
        out[i] = random.choice(list(synonyms[out[i]]))
    return out


def _eda_insert(words, synonyms, n_edits):
    """Insert ``n_edits`` synonyms of random words at random positions."""
    out = list(words)
    sources = [word for word in words if synonyms.get(word)]
    if not sources:
        return out
    for _ in range(n_edits):
        new_word = random.choice(list(synonyms[random.choice(sources)]))
        out.insert(random.randint(0, len(out)), new_word)
    return out


def _eda_swap(words, n_edits):
    """Swap two randomly chosen positions, ``n_edits`` times."""
    out = list(words)
    if len(out) < 2:
        return out
    for _ in range(n_edits):
        i, j = random.sample(range(len(out)), 2)
        out[i], out[j] = out[j], out[i]
    return out


def _eda_delete(words, p):
    """Drop each word with probability ``p``, always keeping at least one."""
    if len(words) <= 1:
        return list(words)
    kept = [word for word in words if random.random() >= p]
    return kept or [random.choice(words)]


def eda_augment(text, n_aug=4, synonyms=None, alpha=0.1):
    """Create ``n_aug`` variants of ``text`` using Easy Data Augmentation (EDA).

    For every variant one operation is picked at random: synonym replacement,
    random insertion, random swap, or random deletion. Randomness comes from
    the global ``random`` module, so ``random.seed`` makes runs reproducible.

    Args:
        text: Input sentence. It is tokenized with ``str.split`` and re-joined
            with single spaces, so runs of whitespace are normalized.
        n_aug: Number of augmented sentences to return.
        synonyms: Optional mapping ``word -> sequence of synonyms`` (exact,
            case-sensitive match). Synonym replacement and insertion need a
            synonym source; when this is ``None`` or empty only swap and
            delete are used.
        alpha: Edit strength. Replacement, insertion, and swap perform
            ``max(1, int(alpha * n_words))`` edits; deletion removes each
            word with probability ``alpha``.

    Returns:
        A list of ``n_aug`` strings. A variant may equal the input when the
        chosen operation cannot change it (e.g. a one-word sentence).
    """
    words = text.split()
    n_edits = max(1, int(alpha * len(words)))
    if synonyms:
        actions = ['synonym', 'insert', 'swap', 'delete']
    else:
        # Without a synonym source the two synonym-based operations would be
        # no-ops, so they are left out of the draw.
        actions = ['swap', 'delete']

    augmented = []
    for _ in range(n_aug):
        action = random.choice(actions)
        if action == 'synonym':
            new_words = _eda_replace(words, synonyms, n_edits)
        elif action == 'insert':
            new_words = _eda_insert(words, synonyms, n_edits)
        elif action == 'swap':
            new_words = _eda_swap(words, n_edits)
        else:
            new_words = _eda_delete(words, alpha)
        augmented.append(" ".join(new_words))
    return augmented

Tabular Augmentation

import numpy as np
def gaussian_noise_augmentation(X, y, factor=5, noise_std=0.01):
    """Append ``factor`` noisy copies of a tabular dataset to the original.

    Each copy is ``X`` plus zero-mean Gaussian noise drawn from the global
    NumPy random state. Noise is added to every column, so ``X`` should hold
    numerical features only, ideally on comparable scales.

    Args:
        X: Array-like of shape ``(n_samples, n_features)``.
        y: Array-like whose first dimension has length ``n_samples``
            (1-D class labels or multi-dimensional targets such as one-hot
            rows).
        factor: Number of noisy copies to generate (non-negative).
        noise_std: Standard deviation of the noise; a scalar or anything that
            broadcasts against ``(factor * n_samples, n_features)``.

    Returns:
        ``(X_out, y_out)`` with ``(factor + 1) * n_samples`` rows each: the
        untouched originals first, followed by the noisy copies.

    Raises:
        ValueError: If ``X`` is not 2-D, ``X`` and ``y`` differ in length, or
            ``factor`` is negative.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
    if y.ndim == 0 or len(y) != len(X):
        raise ValueError("X and y must have the same number of samples")
    if factor < 0:
        raise ValueError("factor must be non-negative")

    noise = np.random.normal(0, noise_std, (factor * len(X), X.shape[1]))
    X_aug = np.tile(X, (factor, 1)) + noise
    # Repeat labels along the sample axis only; a bare np.tile(y, factor)
    # would widen the last axis of multi-dimensional targets instead.
    y_aug = np.tile(y, (factor,) + (1,) * (y.ndim - 1))
    return np.vstack([X, X_aug]), np.concatenate([y, y_aug])
def mixup(X, y, alpha=0.4, n_samples=None, *, return_labels=False):
    """Create MixUp samples as convex combinations of random training pairs.

    For each output sample two rows are drawn with replacement and blended
    with a weight ``lambda ~ Beta(alpha, alpha)``:
    ``lambda * X[i] + (1 - lambda) * X[j]``. Randomness comes from the global
    NumPy random state.

    Args:
        X: Array-like of features; the first axis indexes samples. Any number
            of trailing dimensions is supported.
        y: Array-like of targets aligned with ``X``. Only read when
            ``return_labels`` is true. Use numeric targets that can be
            interpolated (one-hot rows or regression values); blending raw
            integer class ids is not meaningful.
        alpha: Concentration of the Beta distribution; small values keep most
            mixes close to one of the two originals.
        n_samples: Number of mixed samples to create. Defaults to ``len(X)``.
        return_labels: When true, also return the targets blended with the
            same weights and pairs as the features.

    Returns:
        ``X_mix`` by default, or ``(X_mix, y_mix)`` when ``return_labels`` is
        true.
    """
    X = np.asarray(X)
    if n_samples is None:
        n_samples = len(X)
    idx1 = np.random.randint(0, len(X), n_samples)
    idx2 = np.random.randint(0, len(X), n_samples)
    lambda_ = np.random.beta(alpha, alpha, n_samples)

    def _blend(values):
        # Reshape the weights to (n_samples, 1, ..., 1) so they broadcast over
        # however many trailing dimensions `values` has.
        weights = lambda_.reshape((-1,) + (1,) * (values.ndim - 1))
        return weights * values[idx1] + (1 - weights) * values[idx2]

    X_mix = _blend(X)
    if not return_labels:
        return X_mix
    # Labels use the same pairs and weights, yielding soft targets.
    return X_mix, _blend(np.asarray(y))

SMOTE for Imbalanced Datasets

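SMOTE (Synthetic Minority Over-sampling TEchnique) creates synthetic samples for underrepresented classes by interpolating between a minority-class sample and one of its nearest minority-class neighbors: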

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek
import pandas as pd  # used below to print the class counts

# X_train / y_train are the training-split features and labels; they must be
# defined before this snippet runs. Every example rebinds X_resampled and
# y_resampled, so keep only the resampler you actually want.

# Basic SMOTE: 'auto' oversamples every class except the majority until the
# classes are balanced; each synthetic point is built from 5 nearest neighbors.
smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(f"Before: {pd.Series(y_train).value_counts().to_dict()}")
print(f"After: {pd.Series(y_resampled).value_counts().to_dict()}")

# Combine oversampling + undersampling: SMOTE followed by Tomek-link removal
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

# ADASYN: adaptive -- focuses on hard-to-classify samples.
# A float sampling_strategy is the target minority/majority ratio and is only
# valid for binary problems; use 'auto' or a dict for multi-class data.
adasyn = ADASYN(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

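Important: Apply SMOTE only to training data, never to validation or test sets. Split the data first (or resample inside each cross-validation fold); otherwise synthetic points derived from held-out samples leak into training and inflate your metrics.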


Test-Time Augmentation (TTA)

During inference, run the model on multiple augmented versions of the same input and average predictions:

def tta_predict(model, image, n_augments=5, transform=val_transforms, augment=None):
    """Average class probabilities over several views of one image (TTA).

    The first view uses the deterministic ``transform``; the remaining
    ``n_augments - 1`` views are produced by the random ``augment`` pipeline.
    Each view is passed through the model on its own, softmaxed over dim 1,
    and the probabilities are averaged.

    Args:
        model: Module called on a batch of one view. It is switched to eval
            mode and left that way.
        image: A single input accepted by both pipelines
            (presumably a PIL image -- confirm against the caller).
        n_augments: Total number of views, including the un-augmented one.
        transform: Deterministic preprocessing; must return a tensor without
            a batch dimension.
        augment: Random preprocessing called as ``augment(image)`` and
            returning a tensor like ``transform``. Defaults to the
            module-level ``train_transforms``, looked up at call time.

    Returns:
        Tensor of averaged probabilities with a leading batch dimension of 1.
    """
    # Local import: the visible module-level imports never bind the name `torch`.
    import torch

    if augment is None:
        augment = train_transforms

    model.eval()
    predictions = []
    with torch.no_grad():
        # Un-augmented reference view.
        predictions.append(torch.softmax(model(transform(image).unsqueeze(0)), dim=1))
        # Randomly augmented views.
        for _ in range(n_augments - 1):
            aug_img = augment(image).unsqueeze(0)
            predictions.append(torch.softmax(model(aug_img), dim=1))
    return torch.stack(predictions).mean(dim=0)

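TTA often improves accuracy by roughly 0.5–2% on image classification tasks. The cost is inference time: each input needs n_augments forward passes instead of one, so use it when accuracy matters more than latency.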