N-gram Program in NLP

An n-gram program generates sequences of n consecutive words (or characters) from text. This tutorial builds progressively from basic generation to a frequency analyzer and language model.

Basic N-gram Generator

from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

def generate_ngrams(text, n):
    """Return the word n-grams of *text* as a list of tuples.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``.
    Only purely alphabetic tokens are kept, so punctuation is removed, and
    so are numbers and tokens containing digits, hyphens or apostrophes.

    Args:
        text: Input string.
        n: Number of consecutive words per n-gram.

    Returns:
        A list of ``n``-tuples of lower-cased words, in order of appearance.
    """
    words = []
    for token in word_tokenize(text.lower()):
        # isalpha() rejects punctuation as well as any non-letter token
        if token.isalpha():
            words.append(token)
    return list(ngrams(words, n))

text = "Large language models transform how developers build NLP applications efficiently."

# Print the first few n-grams for each order; (label, order, how many to show)
for label, order, limit in (
    ("Unigrams (n=1):", 1, 5),
    ("Bigrams  (n=2):", 2, 5),
    ("Trigrams (n=3):", 3, 5),
    ("4-grams  (n=4):", 4, 4),
):
    print(label, generate_ngrams(text, order)[:limit])

Character N-grams

def char_ngrams(text, n, word_boundary=True):
    """Return the character n-grams of *text*.

    Args:
        text: Input string; it is lower-cased before processing.
        n: Number of characters per n-gram. Must be at least 1.
        word_boundary: If true, the text is split on whitespace and each
            word is padded with one leading and one trailing space, so that
            n-grams never span two words and word starts/ends are marked.
            If false, space characters are removed and the n-grams slide
            over the remaining text as one continuous string.

    Returns:
        A list of strings of length ``n``, in order of appearance. Words (or
        texts) shorter than ``n`` characters contribute no n-grams.

    Raises:
        ValueError: If ``n`` is less than 1. Without this check the slicing
            silently produced empty or truncated strings.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    def windows(chars):
        # All contiguous substrings of length n; empty when len(chars) < n.
        return [chars[i:i + n] for i in range(len(chars) - n + 1)]

    lowered = text.lower()
    if not word_boundary:
        return windows(lowered.replace(' ', ''))

    grams = []
    for word in lowered.split():
        # Pad each word with spaces to respect word boundaries
        grams.extend(windows(f" {word} "))
    return grams

word = "transformer"
# Continuous (no word-boundary padding) bigrams and trigrams of a single word
for label, size in (("Char bigrams:", 2), ("Char trigrams:", 3)):
    print(label, char_ngrams(word, size, word_boundary=False))

# Multiple words: default mode pads each word with spaces
text = "nlp model"
print("Word-boundary char-4grams:", char_ngrams(text, 4))

N-gram Frequency Analysis

from nltk.util import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import FreqDist
from collections import Counter
import nltk

def analyze_ngrams(corpus, n=2, top_k=15):
    """Return the ``top_k`` most frequent word n-grams in *corpus*.

    The corpus is split into sentences first, so n-grams never cross a
    sentence boundary. Within each sentence only alphabetic tokens are kept
    and they are lower-cased.

    Args:
        corpus: Text to analyse.
        n: Number of words per n-gram.
        top_k: How many of the most common n-grams to return.

    Returns:
        A list of ``(ngram_tuple, count)`` pairs, most frequent first.
    """
    counts = Counter()
    for sent in sent_tokenize(corpus):
        words = [tok.lower() for tok in word_tokenize(sent) if tok.isalpha()]
        # Accumulate per sentence instead of building one big list first
        counts.update(ngrams(words, n))
    return counts.most_common(top_k)

corpus = """
Machine learning models have fundamentally transformed natural language processing.
Language models learn statistical patterns from vast text corpora.
Modern NLP systems use transformer models for language understanding tasks.
Deep learning has improved accuracy across all NLP tasks significantly.
Transformer models use attention mechanisms to process text sequences efficiently.
Language understanding requires both syntactic and semantic knowledge.
"""

# Ten most frequent bigrams, phrase left-aligned in a 30-character column
print("=== Bigram Frequency Analysis ===")
bigram_table = analyze_ngrams(corpus, n=2, top_k=10)
for bigram, count in bigram_table:
    phrase = ' '.join(bigram)
    print(f"  {phrase:<30} {count}")

# Eight most frequent trigrams, phrase left-aligned in a 40-character column
print("\n=== Trigram Frequency Analysis ===")
trigram_table = analyze_ngrams(corpus, n=3, top_k=8)
for trigram, count in trigram_table:
    phrase = ' '.join(trigram)
    print(f"  {phrase:<40} {count}")

N-gram Language Model

A simple language model that predicts the next word based on the preceding n-1 words:

from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk, random

class NgramLanguageModel:
    """Count-based n-gram language model with greedy text generation.

    The model stores, for every context of ``n - 1`` lower-cased words, a
    ``Counter`` of the words observed immediately after it. Sentences are
    padded with ``n - 1`` ``'<s>'`` start markers and one ``'</s>'`` end
    marker during training. No smoothing is applied: unseen contexts and
    unseen continuations have probability 0.

    Attributes are plain and public:
        n             -- order of the model (1 = unigram, 2 = bigram, ...).
        ngram_counts  -- ``defaultdict(Counter)`` mapping context tuple to
                         next-word counts.
        vocab         -- set of all tokens seen, including the markers.
    """

    def __init__(self, n=2):
        """Create an untrained model of order *n*.

        Raises:
            ValueError: If ``n`` is less than 1.
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.vocab = set()

    def _context_key(self, words):
        """Return the lower-cased last ``n - 1`` items of *words* as a tuple.

        For a unigram model the context is always the empty tuple; a plain
        ``words[-(n - 1):]`` slice would be ``words[-0:]`` there, which is
        the whole sequence rather than nothing.
        """
        if self.n == 1:
            return ()
        return tuple(w.lower() for w in words[-(self.n - 1):])

    def train(self, corpus):
        """Add the n-gram counts of every sentence in *corpus* to the model.

        Sentences and words are split with NLTK; only alphabetic tokens are
        kept and they are lower-cased. Calling ``train`` repeatedly
        accumulates counts.
        """
        order = self.n
        for sentence in sent_tokenize(corpus):
            words = [t.lower() for t in word_tokenize(sentence) if t.isalpha()]
            tokens = ['<s>'] * (order - 1) + words + ['</s>']
            self.vocab.update(tokens)

            for i in range(len(tokens) - order + 1):
                context = tuple(tokens[i:i + order - 1])
                next_word = tokens[i + order - 1]
                self.ngram_counts[context][next_word] += 1

    def probability(self, word, context):
        """Return the maximum-likelihood estimate P(word | context).

        Only the last ``n - 1`` words of *context* are used. Both *word* and
        *context* are lower-cased to match the training data.

        Returns:
            ``count(context, word) / count(context)``, or ``0.0`` when the
            context was never seen.
        """
        # .get() avoids inserting an empty Counter for unseen contexts,
        # which indexing the defaultdict would do.
        followers = self.ngram_counts.get(self._context_key(context))
        if not followers:
            return 0.0
        return followers[word.lower()] / sum(followers.values())

    def predict_next(self, context, top_k=5):
        """Return the *top_k* most likely next words after *context*.

        Returns:
            A list of ``(word, probability)`` pairs, most likely first, with
            probabilities rounded to 4 decimals. Probabilities are relative
            to all continuations of the context (the same value
            ``probability`` returns), not just the ``top_k`` shown. The list
            is empty when the context was never seen.
        """
        followers = self.ngram_counts.get(self._context_key(context))
        if not followers:
            return []
        total = sum(followers.values())
        return [(word, round(count / total, 4))
                for word, count in followers.most_common(top_k)]

    def generate(self, seed_words, max_words=20):
        """Greedily extend *seed_words* with the most likely next word.

        Generation stops after *max_words* additions, when the current
        context was never seen, or when the most likely continuation is the
        end-of-sentence marker.

        Returns:
            The lower-cased seed words followed by the generated words,
            joined by single spaces.
        """
        tokens = ['<s>'] * (self.n - 1) + [w.lower() for w in seed_words]

        for _ in range(max_words):
            followers = self.ngram_counts.get(self._context_key(tokens))
            if not followers:
                break
            next_word = max(followers, key=followers.get)
            # Stop only when '</s>' is the best choice, not merely possible.
            if next_word == '</s>':
                break
            tokens.append(next_word)

        return ' '.join(tokens[self.n - 1:])

# Train and test
# Train and test
corpus = """
Natural language processing enables machines to understand human language.
Language models learn from text and generate coherent sequences.
Machine learning transforms how computers process and analyze text data.
Deep learning models achieve remarkable accuracy on language tasks.
Modern NLP uses transformer architectures for language generation.
"""

# Trigram model: each prediction is conditioned on the two preceding words
model = NgramLanguageModel(n=3)
model.train(corpus)

# Predict next words for a two-word context
context = ["language", "models"]
predictions = model.predict_next(context, top_k=5)
context_text = ' '.join(context)
print(f"Context: '{context_text}'")
print("Top predictions:")
for word, prob in predictions:
    print(f"  {word:<20} {prob:.4f}")

# Generate text greedily from a two-word seed
generated = model.generate(["natural", "language"])
print(f"\nGenerated: {generated}")

Visualization

import matplotlib.pyplot as plt
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

text = """
Transformers use attention mechanisms that allow models to relate different positions
of a sequence when computing a representation. Attention lets the model focus on
relevant parts of the input when producing an output. Language models trained on
large corpora learn rich linguistic representations.
"""

# Count bigrams over the lower-cased alphabetic tokens of the whole text
tokens = [t.lower() for t in word_tokenize(text) if t.isalpha()]
bigrams = Counter(ngrams(tokens, 2))
top_bigrams = bigrams.most_common(10)

# Split the (bigram, count) pairs into axis labels and bar lengths
labels = []
counts = []
for pair, freq in top_bigrams:
    labels.append(" ".join(pair))
    counts.append(freq)

# barh draws from the bottom up, so reverse to put the top bigram first
plt.figure(figsize=(10, 5))
plt.barh(list(reversed(labels)), list(reversed(counts)))
plt.xlabel('Frequency')
plt.title('Top 10 Bigrams')
plt.tight_layout()
plt.savefig('bigram_frequency.png', dpi=150)
plt.show()
print("Chart saved to bigram_frequency.png")

Practical Applications Summary

Application	N-gram Type	How
Autocomplete / autocorrect	Word bigrams/trigrams	Predict most likely next word
Spam detection	Character 4-6-grams	Catch obfuscation like “V1agra”
Language detection	Character bigrams	Each language has a unique n-gram fingerprint
Plagiarism detection	Word n-grams	Compare overlap between documents
Keyword extraction	Bigrams/trigrams	“machine learning” is more meaningful than “machine” alone
Sentiment features	Bigrams	“not good”, “very bad” vs. “not bad”, “very good”