N-gram Program in NLP
An n-gram program generates sequences of n consecutive words (or characters) from text. This tutorial builds progressively from basic generation to a frequency analyzer and language model.
Basic N-gram Generator
from nltk.util import ngramsfrom nltk.tokenize import word_tokenizeimport nltknltk.download('punkt_tab')
def generate_ngrams(text, n):
    """Return the word-level n-grams of *text* as a list of tuples.

    The text is lower-cased and tokenized with ``word_tokenize``; only
    tokens for which ``str.isalpha`` is true are kept, so punctuation
    (and any token containing digits or symbols) is discarded before
    the n-grams are formed.
    """
    lowered = text.lower()
    # Keep alphabetic tokens only.
    words = list(filter(str.isalpha, word_tokenize(lowered)))
    return [*ngrams(words, n)]
text = "Large language models transform how developers build NLP applications efficiently."

# Show the first few n-grams of each order for the sample sentence.
for label, order, limit in (
    ("Unigrams (n=1):", 1, 5),
    ("Bigrams (n=2):", 2, 5),
    ("Trigrams (n=3):", 3, 5),
    ("4-grams (n=4):", 4, 4),
):
    print(label, generate_ngrams(text, order)[:limit])

# Character N-grams
def char_ngrams(text, n, word_boundary=True):
    """Return the character n-grams of *text* as a list of strings.

    Args:
        text: Input string; it is lower-cased before extraction.
        n: Length of each n-gram; must be at least 1.
        word_boundary: When true, every whitespace-separated word is
            padded with one leading and one trailing space and n-grams
            never span two words. When false, all whitespace is removed
            and n-grams are taken across the whole remaining string.

    Returns:
        The n-grams in order of appearance. A word (or text) shorter
        than ``n`` contributes no n-grams.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    def windows(chars):
        # All length-n slices of chars; empty when chars is shorter than n.
        return [chars[i:i + n] for i in range(len(chars) - n + 1)]

    words = text.lower().split()

    if word_boundary:
        grams = []
        for word in words:
            # Pad with spaces so n-grams record word starts and ends.
            grams.extend(windows(f" {word} "))
        return grams

    # Joining the split words removes tabs and newlines as well as spaces,
    # so both branches agree on what counts as a separator.
    return windows("".join(words))
word = "transformer"
# Whole-string n-grams for a single word (no boundary padding).
for label, size in (("Char bigrams:", 2), ("Char trigrams:", 3)):
    print(label, char_ngrams(word, size, word_boundary=False))

# Multiple words
text = "nlp model"
print("Word-boundary char-4grams:", char_ngrams(text, 4))

# N-gram Frequency Analysis
from nltk.util import ngramsfrom nltk.tokenize import word_tokenize, sent_tokenizefrom nltk import FreqDistfrom collections import Counterimport nltk
def analyze_ngrams(corpus, n=2, top_k=15):
    """Return the *top_k* most frequent word n-grams found in *corpus*.

    The corpus is split into sentences and n-grams are built one
    sentence at a time, so no n-gram spans two sentences. Within a
    sentence only alphabetic tokens are kept, lower-cased.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs, most frequent first.
    """
    tally = Counter()
    for sentence in sent_tokenize(corpus):
        words = [tok.lower() for tok in word_tokenize(sentence) if tok.isalpha()]
        tally.update(ngrams(words, n))
    return tally.most_common(top_k)
# Sample corpus for the frequency analysis. Each sentence sits on its own
# line so that sentence boundaries are followed by whitespace and can be
# detected when the corpus is split into sentences.
corpus = """
Machine learning models have fundamentally transformed natural language processing.
Language models learn statistical patterns from vast text corpora.
Modern NLP systems use transformer models for language understanding tasks.
Deep learning has improved accuracy across all NLP tasks significantly.
Transformer models use attention mechanisms to process text sequences efficiently.
Language understanding requires both syntactic and semantic knowledge.
"""
# Report the most common bigrams, then trigrams, as aligned columns.
print("=== Bigram Frequency Analysis ===")
top_bigrams = analyze_ngrams(corpus, n=2, top_k=10)
for bigram, count in top_bigrams:
    print(f" {' '.join(bigram):<30} {count}")

print("\n=== Trigram Frequency Analysis ===")
top_trigrams = analyze_ngrams(corpus, n=3, top_k=8)
for trigram, count in top_trigrams:
    print(f" {' '.join(trigram):<40} {count}")

# N-gram Language Model
A simple language model that predicts the next word based on the preceding n-1 words:
from collections import defaultdict, Counterfrom nltk.tokenize import word_tokenize, sent_tokenizeimport nltk, random
class NgramLanguageModel:
    """Count-based n-gram language model using relative frequencies.

    For every context of ``n - 1`` preceding tokens the model records how
    often each token followed it during training. Training sentences are
    prefixed with ``n - 1`` ``<s>`` markers and terminated with ``</s>``.
    """

    def __init__(self, n=2):
        """Create an untrained model of order *n* (2 = bigram, 3 = trigram).

        Raises:
            ValueError: If ``n`` is less than 1.
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        # Maps a context tuple to a Counter of the tokens that followed it.
        self.ngram_counts = defaultdict(Counter)
        # Every token seen in training, including the <s> and </s> markers.
        self.vocab = set()

    def _context_key(self, tokens):
        """Return the lower-cased last ``n - 1`` items of *tokens* as a tuple."""
        size = self.n - 1
        # tokens[-0:] would be the whole sequence, so a unigram model needs
        # an explicit empty context to match the keys written by train().
        if size == 0:
            return ()
        return tuple(token.lower() for token in tokens[-size:])

    def train(self, corpus):
        """Add the n-gram counts of *corpus* to the model.

        The corpus is split into sentences; within each sentence only
        alphabetic tokens are kept, lower-cased. Counts accumulate, so
        calling this more than once extends the existing model.
        """
        for sentence in sent_tokenize(corpus):
            words = [t.lower() for t in word_tokenize(sentence) if t.isalpha()]
            tokens = ['<s>'] * (self.n - 1) + words + ['</s>']
            self.vocab.update(tokens)

            for i in range(len(tokens) - self.n + 1):
                context = tuple(tokens[i:i + self.n - 1])
                next_word = tokens[i + self.n - 1]
                self.ngram_counts[context][next_word] += 1

    def probability(self, word, context):
        """Return the relative frequency of *word* following *context*.

        Only the last ``n - 1`` items of *context* are used, and both
        *word* and *context* are lower-cased to match the training data.
        Returns ``0.0`` for a context never seen in training.
        """
        # .get() avoids inserting an empty Counter for unseen contexts,
        # which indexing the defaultdict would do.
        followers = self.ngram_counts.get(self._context_key(context))
        if not followers:
            return 0.0
        return followers[word.lower()] / sum(followers.values())

    def predict_next(self, context, top_k=5):
        """Return up to *top_k* ``(word, probability)`` pairs for *context*.

        Pairs are ordered from most to least frequent. Probabilities are
        relative to all continuations of the context (not just the ones
        returned) and are rounded to 4 decimals. An unseen context yields
        an empty list.
        """
        followers = self.ngram_counts.get(self._context_key(context))
        if not followers:
            return []
        # Normalise over every continuation so values agree with probability().
        total = sum(followers.values())
        return [
            (word, round(count / total, 4))
            for word, count in followers.most_common(top_k)
        ]

    def generate(self, seed_words, max_words=20):
        """Greedily extend *seed_words* by at most *max_words* tokens.

        Each step appends the most frequent continuation of the current
        context. Generation stops when the context is unknown, when the
        most frequent continuation is the ``</s>`` marker, or when
        *max_words* tokens have been added. Returns the seed plus the
        generated words joined by single spaces.
        """
        tokens = ['<s>'] * (self.n - 1) + [w.lower() for w in seed_words]

        for _ in range(max_words):
            followers = self.ngram_counts.get(self._context_key(tokens))
            if not followers:
                break
            next_word = max(followers, key=followers.get)
            # Stop only when ending the sentence is the best choice, not
            # merely because some training sentence ended after this context.
            if next_word == '</s>':
                break
            tokens.append(next_word)

        return ' '.join(tokens[self.n - 1:])
# Train and test
# Training corpus: one sentence per line so that every sentence boundary
# is followed by whitespace.
corpus = """
Natural language processing enables machines to understand human language.
Language models learn from text and generate coherent sequences.
Machine learning transforms how computers process and analyze text data.
Deep learning models achieve remarkable accuracy on language tasks.
Modern NLP uses transformer architectures for language generation.
"""
model = NgramLanguageModel(n=3)  # Trigram model
model.train(corpus)

# Predict next words
context = ["language", "models"]
predictions = model.predict_next(context, top_k=5)
print(f"Context: '{' '.join(context)}'")
print("Top predictions:")
for word, prob in predictions:
    print(f" {word:<20} {prob:.4f}")

# Generate text
generated = model.generate(["natural", "language"])
print(f"\nGenerated: {generated}")

# Visualization
import matplotlib.pyplot as pltfrom collections import Counterfrom nltk.util import ngramsfrom nltk.tokenize import word_tokenizeimport nltknltk.download('punkt_tab')
# Sample passage for the bigram chart. Line breaks separate the words at
# the end of one line from the first word of the next.
text = """
Transformers use attention mechanisms that allow models to relate different positions
of a sequence when computing a representation. Attention lets the model focus on
relevant parts of the input when producing an output. Language models trained on
large corpora learn rich linguistic representations.
"""
# Count adjacent word pairs over the lower-cased alphabetic tokens.
tokens = [t.lower() for t in word_tokenize(text) if t.isalpha()]
bigrams = Counter(ngrams(tokens, 2))
top_bigrams = bigrams.most_common(10)

labels = [f"{a} {b}" for (a, b), _ in top_bigrams]
counts = [count for _, count in top_bigrams]

# Pass the bars in reverse order, as barh draws the first entry at the bottom.
plt.figure(figsize=(10, 5))
plt.barh(list(reversed(labels)), list(reversed(counts)))
plt.xlabel('Frequency')
plt.title('Top 10 Bigrams')
plt.tight_layout()
plt.savefig('bigram_frequency.png', dpi=150)
plt.show()
print("Chart saved to bigram_frequency.png")

# Practical Applications Summary
| Application | N-gram Type | How |
|---|---|---|
| Autocomplete / autocorrect | Word bigrams/trigrams | Predict most likely next word |
| Spam detection | Character 4-6-grams | Catch obfuscation like “V1agra” |
| Language detection | Character bigrams | Each language has a unique n-gram fingerprint |
| Plagiarism detection | Word n-grams | Compare overlap between documents |
| Keyword extraction | Bigrams/trigrams | “machine learning” is more meaningful than “machine” alone |
| Sentiment features | Bigrams | “not good”, “very bad” vs “not bad”, “very good” |