Data Augmentation
Data augmentation artificially expands your training set by creating modified copies of existing data. It’s one of the most effective regularization techniques available — more data is almost always better, and augmentation is cheap when you can’t collect more real examples.
Image Augmentation with torchvision
import torchvision.transforms as T
from torch.utils.data import Dataset
from PIL import Image

# Standard augmentation pipeline for training.
# The geometric and color transforms operate on the PIL image; ToTensor and
# Normalize must stay last because Normalize expects a tensor.
train_transforms = T.Compose([
    T.RandomResizedCrop(224, scale=(0.8, 1.0)),  # Random crop and resize
    T.RandomHorizontalFlip(p=0.5),               # 50% chance of flipping
    T.RandomRotation(degrees=15),                # ±15° rotation
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    T.RandomGrayscale(p=0.1),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet stats
])

# No random augmentation for validation/test: deterministic resize,
# center crop, and the same normalization as training.
val_transforms = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Advanced Image Augmentation with Albumentations
Albumentations is faster and more flexible than torchvision:
import albumentations as A
from albumentations.pytorch import ToTensorV2

# NOTE: this rebinds the name `train_transforms`. Unlike a torchvision
# pipeline, an Albumentations Compose is called with keyword arguments on a
# NumPy array and returns a dict: train_transforms(image=img)["image"].
train_transforms = A.Compose([
    A.RandomResizedCrop(height=224, width=224, scale=(0.8, 1.0)),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=15, p=0.5),
    # Apply at most one noise/blur transform, 30% of the time.
    A.OneOf([
        A.GaussNoise(var_limit=(10, 50)),
        A.GaussianBlur(blur_limit=3),
        A.MotionBlur(blur_limit=3),
    ], p=0.3),
    # Apply at most one non-rigid distortion, 20% of the time.
    A.OneOf([
        A.GridDistortion(p=1.0),
        A.ElasticTransform(p=1.0),
    ], p=0.2),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

# Cutout-style regularization: CoarseDropout masks out random rectangles.
# (CutMix and MixUp combine pairs of samples and are not shown here.)
# As written the transform is created and discarded; add it to the Compose
# list above to actually apply it.
A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.3)  # CutOut

# Text Augmentation
import random
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

# Synonym replacement
aug_synonym = naw.SynonymAug(aug_p=0.3)
text = "The model performs well on image classification tasks"
augmented = aug_synonym.augment(text)

# Random word-level perturbation (here: swapping words)
aug_random = naw.RandomWordAug(action='swap', aug_p=0.2)
augmented = aug_random.augment(text)

# Back-translation (translate to French, then back to English)
aug_bt = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-fr',
    to_model_name='Helsinki-NLP/opus-mt-fr-en',
)
augmented = aug_bt.augment(text)
# Easy Data Augmentation (EDA): simple random operations
def eda_augment(text, n_aug=4, synonyms=None, rng=None):
    """Return ``n_aug`` randomly perturbed variants of ``text``.

    Each variant is produced by applying one randomly chosen EDA operation
    to the whitespace-split words of ``text``:

    * ``'synonym'`` - replace one word that has an entry in ``synonyms``.
    * ``'insert'``  - insert a synonym of a random word (or the word itself
      when no synonym is known) at a random position.
    * ``'swap'``    - swap two distinct word positions.
    * ``'delete'``  - remove one random word (never the last remaining one).

    An operation that cannot be applied (e.g. swapping in a one-word text,
    or synonym replacement without a matching entry) leaves the words
    unchanged for that variant.

    Args:
        text: Input sentence. Words are obtained with ``str.split()``, so
            runs of whitespace are normalized to single spaces in the output.
        n_aug: Number of variants to generate.
        synonyms: Optional mapping from lower-cased word to a non-empty list
            of replacement strings. Lookups use the exact token, so a word
            with attached punctuation will not match.
        rng: Optional ``random.Random`` instance for reproducible output.
            Defaults to the module-level ``random`` functions.

    Returns:
        A list of ``n_aug`` strings.
    """
    # The `random` module exposes the same choice/randrange/sample functions
    # as a Random instance, so it can stand in as the default generator.
    rng = random if rng is None else rng
    synonyms = {} if synonyms is None else synonyms
    words = text.split()
    augmented = []

    for _ in range(n_aug):
        new_words = list(words)
        action = rng.choice(['synonym', 'insert', 'swap', 'delete'])

        if not new_words:
            pass  # Nothing to perturb in an empty text.
        elif action == 'synonym':
            candidates = [
                i for i, word in enumerate(new_words)
                if synonyms.get(word.lower())
            ]
            if candidates:
                i = rng.choice(candidates)
                new_words[i] = rng.choice(synonyms[new_words[i].lower()])
        elif action == 'insert':
            source = rng.choice(new_words)
            options = synonyms.get(source.lower()) or [source]
            position = rng.randrange(len(new_words) + 1)
            new_words.insert(position, rng.choice(options))
        elif action == 'swap':
            if len(new_words) >= 2:
                i, j = rng.sample(range(len(new_words)), 2)
                new_words[i], new_words[j] = new_words[j], new_words[i]
        elif len(new_words) >= 2:  # 'delete'; keep at least one word
            del new_words[rng.randrange(len(new_words))]

        augmented.append(' '.join(new_words))

    return augmented


# Tabular Augmentation
import numpy as np
def gaussian_noise_augmentation(X, y, factor=5, noise_std=0.01, rng=None):
    """Append ``factor`` noisy copies of a dataset to the original.

    Every copy of ``X`` receives independent zero-mean Gaussian noise with
    standard deviation ``noise_std`` on every feature; labels are repeated
    unchanged. The noise scale is absolute, so features should be on
    comparable scales (e.g. standardized) for it to be meaningful.

    Args:
        X: 2-D array-like of numerical features, shape (n_samples, n_features).
        y: Array-like of targets whose first axis has length n_samples
            (1-D labels or 2-D one-hot / multi-output targets).
        factor: Number of noisy copies to generate.
        noise_std: Standard deviation of the added noise.
        rng: Optional ``numpy.random.Generator`` (or ``RandomState``) used to
            draw the noise. Defaults to the global ``np.random`` state.

    Returns:
        ``(X_out, y_out)`` with ``(factor + 1) * n_samples`` rows each; the
        first ``n_samples`` rows are the untouched originals.

    Raises:
        ValueError: If ``X`` and ``y`` have different numbers of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {len(X)} and {len(y)}"
        )

    draw_normal = np.random.normal if rng is None else rng.normal
    noise = draw_normal(0, noise_std, (factor * len(X), X.shape[1]))
    X_aug = np.tile(X, (factor, 1)) + noise

    # Repeat along the sample axis only, so 2-D targets keep their columns.
    y_aug = np.tile(y, (factor,) + (1,) * (y.ndim - 1))

    return np.vstack([X, X_aug]), np.concatenate([y, y_aug])
def mixup(X, y, alpha=0.4, n_samples=None, return_labels=False, rng=None):
    """Mix random pairs of training examples with a Beta-distributed weight.

    Each output row is ``lam * X[i] + (1 - lam) * X[j]`` where ``i`` and
    ``j`` are drawn uniformly with replacement and
    ``lam ~ Beta(alpha, alpha)``.

    Args:
        X: Array-like of features; the first axis indexes samples. Any
            number of trailing dimensions is supported.
        y: Array-like of targets aligned with ``X``. Only used when
            ``return_labels`` is true. For classification pass one-hot or
            probability rows so the mixed result is a valid soft label;
            mixing integer class ids is not meaningful.
        alpha: Concentration of the Beta distribution (must be > 0).
        n_samples: Number of mixed examples to create; defaults to ``len(X)``.
        return_labels: When true, also return targets mixed with the same
            pairs and weights.
        rng: Optional ``numpy.random.Generator`` for reproducible draws.
            Defaults to the global ``np.random`` state.

    Returns:
        ``X_mix`` alone by default, or ``(X_mix, y_mix)`` when
        ``return_labels`` is true.

    Raises:
        ValueError: If ``return_labels`` is true and ``y`` does not have the
            same number of samples as ``X``.
    """
    X = np.asarray(X)
    if n_samples is None:
        n_samples = len(X)

    # Draw order (idx1, idx2, lambda) is kept fixed so that results under a
    # given np.random.seed do not depend on which options are used.
    if rng is None:
        idx1 = np.random.randint(0, len(X), n_samples)
        idx2 = np.random.randint(0, len(X), n_samples)
        lambda_ = np.random.beta(alpha, alpha, n_samples)
    else:
        idx1 = rng.integers(0, len(X), n_samples)
        idx2 = rng.integers(0, len(X), n_samples)
        lambda_ = rng.beta(alpha, alpha, n_samples)

    def _blend(values):
        # Give the weight one trailing singleton axis per non-sample axis so
        # it broadcasts over rows of any dimensionality.
        weights = lambda_.reshape((len(lambda_),) + (1,) * (values.ndim - 1))
        return weights * values[idx1] + (1 - weights) * values[idx2]

    X_mix = _blend(X)
    if not return_labels:
        return X_mix

    y = np.asarray(y)
    if len(y) != len(X):
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {len(X)} and {len(y)}"
        )
    # For classification, mixed labels work with soft labels or label smoothing
    return X_mix, _blend(y)


# SMOTE for Imbalanced Datasets
SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic samples for underrepresented classes:
import pandas as pd
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek

# X_train / y_train are expected to be provided by the surrounding training
# code; they are not defined in this snippet.

# Basic SMOTE
smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
print(f"Before: {pd.Series(y_train).value_counts().to_dict()}")
print(f"After: {pd.Series(y_resampled).value_counts().to_dict()}")

# Combine oversampling + undersampling
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

# ADASYN: adaptive — focuses on hard-to-classify samples.
# A float sampling_strategy is the target minority/majority ratio and is only
# accepted for binary problems; use 'auto' or a dict for multi-class targets.
adasyn = ADASYN(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Important: Apply SMOTE only to training data, never to validation or test sets.
Test-Time Augmentation (TTA)
During inference, run the model on multiple augmented versions of the same input and average predictions:
def tta_predict(model, image, n_augments=5, transform=None, aug_transform=None):
    """Average softmax predictions over several augmented views of one image.

    The first view is always ``transform(image)``; the remaining
    ``n_augments - 1`` views are independent calls to
    ``aug_transform(image)``.

    Args:
        model: Callable mapping a batched tensor to logits with the class
            axis at dim 1. ``model.eval()`` is called and the mode is not
            restored afterwards.
        image: A single input accepted by both transforms.
        n_augments: Total number of views to average, including the
            un-augmented one. Values below 1 still evaluate the
            un-augmented view once.
        transform: Deterministic preprocessing returning an unbatched
            tensor. Defaults to the module-level ``val_transforms``,
            resolved at call time.
        aug_transform: Random augmentation returning an unbatched tensor.
            Defaults to the module-level ``train_transforms``. NOTE: that
            default only works while ``train_transforms`` is a
            torchvision-style callable; an Albumentations ``Compose`` bound
            to the same name is called as ``t(image=...)`` and returns a
            dict, so pass ``aug_transform`` explicitly in that case.

    Returns:
        Tensor of class probabilities with a leading batch axis of size 1.
    """
    # Local import: nothing else in this file binds the name `torch`.
    import torch

    # Resolve defaults lazily so the function does not depend on the
    # pipelines being defined before it.
    if transform is None:
        transform = val_transforms
    if aug_transform is None:
        aug_transform = train_transforms

    model.eval()
    predictions = []

    with torch.no_grad():
        # Original
        predictions.append(torch.softmax(model(transform(image).unsqueeze(0)), dim=1))

        # Augmented versions (horizontal flip, slight rotations, etc.)
        for _ in range(n_augments - 1):
            aug_img = aug_transform(image).unsqueeze(0)
            predictions.append(torch.softmax(model(aug_img), dim=1))

    return torch.stack(predictions).mean(dim=0)


# TTA consistently improves accuracy by 0.5–2% on image classification tasks with minimal extra compute.