Extract the Meaning of a Paragraph Using NLP

Meaning extraction converts raw paragraphs into structured knowledge — key topics, entities, intent, and relationships. This guide covers a progression from lightweight rule-based extraction to LLM-powered semantic understanding.

Keyword Extraction with TF-IDF

The simplest approach is keyword extraction: terms whose TF-IDF scores are high relative to a background corpus are the most distinctive for the paragraph, and usually the most meaningful.

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Text whose most distinctive terms we want to surface.
paragraph = """
Retrieval-Augmented Generation (RAG) combines dense vector retrieval with
large language models to produce accurate, grounded answers. Instead of
relying solely on parameters stored in the model weights, RAG systems
dynamically fetch relevant documents from an external knowledge base and
provide them as context to the LLM during inference. This approach
significantly reduces hallucination and keeps the model's knowledge up to date.
"""

# Reference documents: terms that also occur here receive a lower IDF weight.
background = [
    "Language models are trained on large text corpora.",
    "Neural networks learn representations from data.",
    "Vector search enables fast retrieval from large datasets.",
    "Machine learning models make predictions from input features.",
]

# The paragraph goes first so that row 0 of the TF-IDF matrix belongs to it.
all_docs = [paragraph, *background]
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(all_docs)

# Dense score vector for the paragraph, aligned with the vocabulary.
paragraph_tfidf = tfidf_matrix[0].toarray()[0]
feature_names = vectorizer.get_feature_names_out()

# Ten highest-scoring features, best first; features the paragraph does not
# contain (score 0) are dropped.
top_indices = np.argsort(paragraph_tfidf)[::-1][:10]
keywords = []
for idx in top_indices:
    weight = paragraph_tfidf[idx]
    if weight > 0:
        keywords.append((feature_names[idx], round(weight, 4)))

print("Extracted keywords:")
for keyword, score in keywords:
    print(f"  {keyword:<35} {score}")

Named Entity and Noun Chunk Extraction

spaCy extracts the “who”, “what”, and “where” from a paragraph by identifying its named entities, noun phrases, and main verbs:

import spacy

# Load the English pipeline once and run it over the sample paragraph.
nlp = spacy.load("en_core_web_sm")

paragraph = """
In March 2025, NVIDIA announced the Blackwell Ultra GPU architecture at GTC conference in San Jose.
CEO Jensen Huang showcased performance improvements of 40x over previous H100 models,
targeting AI training workloads for companies like Google, Microsoft, and Amazon.
"""

doc = nlp(paragraph)

# Every entity span the pipeline recognised, with its label.
print("=== Named Entities ===")
for ent in doc.ents:
    print(f"  {ent.text:<30} [{ent.label_}]")

# Noun chunks, skipping those that are a single whitespace-separated word.
print("\n=== Key Noun Phrases ===")
for chunk in doc.noun_chunks:
    if len(chunk.text.split()) <= 1:
        continue
    print(f"  {chunk.text:<40} root: {chunk.root.text}")

# Verbs that are not stop words, shown with their lemma.
print("\n=== Main Verbs (actions) ===")
content_verbs = (tok for tok in doc if tok.pos_ == "VERB" and not tok.is_stop)
for token in content_verbs:
    print(f"  {token.text:<20} lemma: {token.lemma_}")

Extracting the Main Claim

To extract the main claim, find each sentence's root verb in the dependency parse, together with its subject and object:

import spacy

# Load the spaCy English pipeline once at module level; extract_main_claim
# below reuses this object on every call instead of reloading the model.
nlp = spacy.load("en_core_web_sm")

def extract_main_claim(text):
    """Extract one subject-verb-object claim per sentence root in *text*.

    Args:
        text: Raw text; it is parsed with the module-level ``nlp`` pipeline.

    Returns:
        A list of dicts, one per qualifying ROOT token, with the keys:
        ``"subject"`` - the noun chunk containing the root's grammatical
        subject (the bare subject token if no chunk covers it, or
        ``"unknown"`` when the root has no subject to its left);
        ``"verb"`` - the root's lemma;
        ``"object"`` - the text of the first direct object / attribute to the
        right of the root, or ``""`` if there is none.
    """
    doc = nlp(text)
    # doc.noun_chunks is consulted once per root, so materialise it a single
    # time instead of re-iterating it for every sentence.
    noun_chunks = list(doc.noun_chunks)
    claims = []

    for token in doc:
        # Copular roots ("X is Y") are tagged AUX by spaCy's current English
        # models; accepting only VERB would yield no claim for such sentences
        # even though "attr" is handled below.
        if token.dep_ != "ROOT" or token.pos_ not in ("VERB", "AUX"):
            continue

        subjects = [w for w in token.lefts if w.dep_ in ("nsubj", "nsubjpass")]
        # NOTE(review): "pobj" normally hangs off a preposition rather than
        # directly off the verb, so it is unlikely to match here; it is kept
        # for backward compatibility.
        objects = [w for w in token.rights if w.dep_ in ("dobj", "pobj", "attr")]

        subject_full = "unknown"
        if subjects:
            head = subjects[0]
            # Expand the subject token to the noun chunk whose token span
            # contains it. Matching by position (not by substring, and not
            # against every left child such as auxiliaries or adverbs)
            # guarantees the chunk returned really is the subject phrase.
            subject_full = next(
                (
                    chunk.text
                    for chunk in noun_chunks
                    if chunk.start <= head.i < chunk.end
                ),
                head.text,
            )

        claims.append({
            "subject": subject_full,
            "verb": token.lemma_,
            "object": objects[0].text if objects else ""
        })

    return claims

# Demo: run the extractor on a single sentence and print each claim found.
paragraph = "Large language models have fundamentally changed how developers build software applications."
claims = extract_main_claim(paragraph)
for claim in claims:
    # One print call, three lines of output.
    print(
        f"Subject: {claim['subject']}",
        f"Action:  {claim['verb']}",
        f"Object:  {claim['object']}",
        sep="\n",
    )

Automatic Summarization

from transformers import pipeline

# Abstractive summarizer backed by the BART checkpoint named below.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

long_paragraph = """
The development of large language models has accelerated dramatically since the introduction
of the transformer architecture in 2017. These models, trained on hundreds of billions of words
from the internet, books, and code, have demonstrated remarkable capabilities in understanding
and generating human language. They can write essays, solve math problems, generate code,
translate between languages, and answer complex questions. However, they also suffer from
significant limitations, including hallucination of false information, sensitivity to prompt
phrasing, and difficulty with tasks requiring true reasoning rather than pattern matching.
Researchers continue to work on alignment techniques, better evaluation benchmarks, and
more efficient training methods to address these shortcomings.
"""

# Length bounds and sampling switch, passed through to the pipeline call.
generation_options = {"max_length": 80, "min_length": 30, "do_sample": False}
summary = summarizer(long_paragraph, **generation_options)

# The pipeline returns a list; the first item holds the summary text.
print("Summary:", summary[0]['summary_text'], sep="\n")

LLM-Powered Meaning Extraction

For the most comprehensive extraction, use an LLM to produce structured output:

from openai import OpenAI
import json

# Shared API client, created once and reused by extract_paragraph_meaning.
# NOTE(review): no credentials are passed here, so the SDK presumably reads
# them from the environment (e.g. OPENAI_API_KEY) - confirm before running.
client = OpenAI()

def extract_paragraph_meaning(paragraph):
    """Ask the chat model for a structured JSON reading of *paragraph*.

    Args:
        paragraph: Text to analyse. It is interpolated verbatim into the
            prompt, between double quotes.

    Returns:
        The model's reply parsed from JSON. The prompt requests the keys
        ``main_topic``, ``key_points``, ``entities``, ``sentiment`` and
        ``intent``; their presence is not validated here.

    Raises:
        ValueError: If the reply carries no text content.
        json.JSONDecodeError: If the reply text is not valid JSON (a
            subclass of ``ValueError``).
    """
    prompt = f"""Extract the following from this paragraph and return as JSON:
1. "main_topic": The primary subject in 3-5 words
2. "key_points": List of 3-5 main points as bullet strings
3. "entities": Named entities as {{name: type}} dict
4. "sentiment": Overall tone (positive/negative/neutral)
5. "intent": What is the author trying to communicate? (1 sentence)

Paragraph:
"{paragraph}"
"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: ask the API to return a single JSON object.
        response_format={"type": "json_object"},
        temperature=0
    )
    choice = response.choices[0]
    content = choice.message.content
    # message.content is optional in the SDK; json.loads(None) would fail with
    # an opaque TypeError, so report the condition explicitly instead.
    if content is None:
        raise ValueError(
            f"model returned no text content (finish_reason={choice.finish_reason!r})"
        )
    return json.loads(content)

# Demo input for the LLM-based extractor.
paragraph = """
Anthropic's Constitutional AI approach trains language models to be helpful, harmless,
and honest by having the model critique and revise its own outputs based on a set of
principles. Released in 2022, this technique has shown promising results in reducing
harmful outputs compared to standard RLHF methods, while maintaining model capability
on standard benchmarks.
"""

# Run the extraction, then pretty-print the parsed reply with a 2-space indent.
result = extract_paragraph_meaning(paragraph)
formatted = json.dumps(result, indent=2)
print(formatted)

Complete Pipeline

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the spaCy English pipeline once at module level; analyze_paragraph
# below reuses this object on every call instead of reloading the model.
nlp = spacy.load("en_core_web_sm")

def analyze_paragraph(text):
    """Summarise *text* as entities, key phrases, main verbs and keywords.

    Args:
        text: Raw paragraph; it is parsed with the module-level ``nlp``
            pipeline.

    Returns:
        A dict with four keys:
        ``"entities"`` - mapping of entity text to entity label;
        ``"key_phrases"`` - up to 5 distinct multi-word noun chunks, in
        document order;
        ``"main_verbs"`` - up to 5 distinct lemmas of non-stop-word verbs, in
        document order;
        ``"keywords"`` - up to 8 unigrams/bigrams ranked by TF-IDF score,
        highest first (empty if the text has no usable terms).
    """
    doc = nlp(text)

    # Entities (a repeated mention keeps the label of its last occurrence)
    entities = {ent.text: ent.label_ for ent in doc.ents}

    # dict.fromkeys de-duplicates while preserving first-seen order. A set
    # would make the [:5] slices below pick an arbitrary subset whose order
    # changes between runs, because string hashing is randomised per process.
    key_phrases = list(dict.fromkeys(
        chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) > 1
    ))
    main_verbs = list(dict.fromkeys(
        token.lemma_ for token in doc if token.pos_ == "VERB" and not token.is_stop
    ))

    # Keywords via TF-IDF fitted on the paragraph alone. With a single
    # document this amounts to ranking terms by how often they occur.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=10)
    try:
        scores = vectorizer.fit_transform([text]).toarray()[0]
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # the text is empty or contains only stop words.
        keywords = []
    else:
        terms = vectorizer.get_feature_names_out()
        # The vocabulary comes back alphabetically; order it by score so the
        # final slice keeps the strongest terms. sorted() is stable, so ties
        # stay in alphabetical order.
        ranked = sorted(zip(terms, scores), key=lambda pair: -pair[1])
        keywords = [str(term) for term, _ in ranked]

    return {
        "entities": entities,
        "key_phrases": key_phrases[:5],
        "main_verbs": main_verbs[:5],
        "keywords": keywords[:8]
    }

# Demo: analyse one sentence and print each field of the result on its own line.
para = "Mistral AI released Mixtral 8x7B, a sparse mixture-of-experts model that matches GPT-3.5 quality at a fraction of the computational cost."
result = analyze_paragraph(para)
for key in result:
    value = result[key]
    print(f"{key}: {value}")