Extracting Email Addresses Using NLP

Email extraction pulls valid email addresses from unstructured text — an essential step for contact management, data enrichment, lead generation, and compliance workflows.

Basic Regex Extraction

Regular expressions are the most reliable approach for standard email formats:

import re

def extract_emails_basic(text):
    """Return every email-like substring of ``text``, in order of appearance.

    A match is a local part made of letters, digits and ``._%+-``, an ``@``,
    a domain of letters, digits, dots and hyphens, and a final alphabetic
    TLD of at least two letters.  Duplicates are kept.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so the
    # TLD class must be [A-Za-z]; otherwise "user@example.co|m" matches whole.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)

# Sample input: four well-formed addresses, plus fragments on the last line
# ("not-an-email", "@missinglocal", ".com") that the pattern does not match.
text = """
Contact our team at support@example.com for general inquiries.
For sales, reach alice.jones@company.co.uk or bob_smith@tech.org.
Technical support: dev+nlp@api.service.io
Invalid examples: not-an-email @missinglocal .com
"""

# Matches are returned in the order they appear in the text.
emails = extract_emails_basic(text)
print("Found emails:", emails)
# Found emails: ['support@example.com', 'alice.jones@company.co.uk', 'bob_smith@tech.org', 'dev+nlp@api.service.io']

Robust Email Pattern with Validation

import re

# Verbose-mode pattern: whitespace and "#" comments outside character classes
# are ignored by the regex engine, so the layout below is documentation only.
# Every group is non-capturing, so each match is the whole address.
EMAIL_REGEX = re.compile(
    r"""
    (?:[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+     # local part: standard chars
    (?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*) # local part: dot segments
    @                                          # at sign
    (?:[a-zA-Z0-9]                            # domain: first char
    (?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?       # domain: middle
    \.)+                                       # domain: dots
    [a-zA-Z]{2,}                              # TLD: 2+ chars
    """,
    flags=re.X,
)


def extract_emails_robust(text):
    """Return all substrings of ``text`` matched by ``EMAIL_REGEX``, in order.

    Duplicates are kept; an input without matches yields an empty list.
    """
    hits = []
    for found in EMAIL_REGEX.finditer(text):
        hits.append(found.group(0))
    return hits

# Sample input: three ordinary addresses, one IP-literal address and one
# malformed "@@" string.  EMAIL_REGEX requires letter/digit domain labels and
# a non-empty local part, so the last two lines produce no match.
text = """
Technical contact: engineer@startup.io
Support: help@service.co.uk
Admin: admin@subdomain.company.com
Old-style: user@[127.0.0.1]
Bogus: @@notvalid.com
"""

emails = extract_emails_robust(text)
print("Extracted:", emails)
# Extracted: ['engineer@startup.io', 'help@service.co.uk', 'admin@subdomain.company.com']

Handling Obfuscated Emails

People often obfuscate emails to avoid scraping:

import re

def deobfuscate_email(text):
    """Normalize common obfuscation patterns.

    Two kinds of marker are rewritten, case-insensitively:

    * Bracketed markers -- ``[at]``, ``(at)``, ``[dot]``, ``(dot)`` -- are
      replaced everywhere, together with any whitespace padding around
      them, so ``info [at] company [dot] com`` becomes ``info@company.com``.
    * The bare words ``at`` and ``dot`` are ordinary English, so they are
      only rewritten inside a span shaped like a whole address
      (``local at domain dot tld``).  ``Contact us at ...`` and
      ``Meet me at noon`` are left untouched.

    Returns the text with those markers replaced; everything else is
    unchanged.
    """
    flags = re.IGNORECASE

    # Bracketed markers are unambiguous.  Swallow the surrounding whitespace,
    # otherwise "info [at] company" turns into "info @ company" and never
    # matches an email pattern afterwards.
    text = re.sub(r'\s*(?:\[at\]|\(at\))\s*', '@', text, flags=flags)
    text = re.sub(r'\s*(?:\[dot\]|\(dot\))\s*', '.', text, flags=flags)

    local_label = r'[A-Za-z0-9_%+-]+'
    domain_label = r'[A-Za-z0-9-]+'
    dot = r'(?:\s+dot\s+|\.)'
    at = r'(?:\s+at\s+|@)'
    # A candidate address: dot-separated local labels, one "at", then at
    # least two dot-separated domain labels.  The lookbehind stops a match
    # from starting in the middle of a word.
    candidate = re.compile(
        rf'(?<![A-Za-z0-9_%+-])'
        rf'(?P<local>{local_label}(?:{dot}{local_label})*)'
        rf'{at}'
        rf'(?P<domain>{domain_label}(?:{dot}{domain_label})+)',
        flags,
    )

    def _rebuild(match):
        # Normalize each side separately so a label that happens to be the
        # word "at" is never mistaken for the separator.
        local = re.sub(r'\s+dot\s+', '.', match.group('local'), flags=flags)
        domain = re.sub(r'\s+dot\s+', '.', match.group('domain'), flags=flags)
        return f'{local}@{domain}'

    return candidate.sub(_rebuild, text)

# One sample per obfuscation style handled by deobfuscate_email: square
# brackets, parentheses, bare words, and a plain address as a control.
obfuscated_examples = [
    "Contact us at info [at] company [dot] com",
    "Reach admin(at)example(dot)org for support",
    "Email: sales at domain dot io",
    "Reach out to hello@normal.com normally",
]

# Plain-address pattern applied to the text after normalization.
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Normalize each sample first, then extract, and print both for comparison.
for example in obfuscated_examples:
    cleaned = deobfuscate_email(example)
    emails = EMAIL_PATTERN.findall(cleaned)
    print(f"Original: {example}")
    print(f"Extracted: {emails}\n")

Bulk Processing Multiple Documents

import re
from pathlib import Path

EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

def extract_emails_from_text(text):
    """Return the unique emails found in ``text``, in first-seen order.

    Matching is case-sensitive, so ``A@x.com`` and ``a@x.com`` are kept as
    two entries.  Returns an empty list when nothing matches.
    """
    # dict.fromkeys removes duplicates while keeping insertion order; a set
    # would return the addresses in an order that changes between runs.
    return list(dict.fromkeys(EMAIL_PATTERN.findall(text)))

# Simulate processing multiple documents: an in-memory mapping of
# file name -> file content stands in for files read from disk.
documents = {
    "contract.txt": "Parties: Alice Smith (alice@lawfirm.com) and Bob Jones (bjones@corp.io). Questions to legal@example.com.",
    "readme.md": "Report bugs to bugs@github.com or contact maintainers at dev@opensource.org.",
    "newsletter.html": "Unsubscribe: unsubscribe@newsletter.com | Help: help@newsletter.com",
    "support_ticket.txt": "User reported by: jane.doe@customer.co.uk. Assigned to: support-team@internal.company.com"
}

# Per-document results, keyed by file name.
all_results = {}
for filename, content in documents.items():
    emails = extract_emails_from_text(content)
    all_results[filename] = emails
    print(f"{filename}: {emails}")

# Aggregate unique emails across all documents; the set removes addresses
# that appear in more than one document.
all_emails = set()
for emails in all_results.values():
    all_emails.update(emails)
print(f"\nTotal unique emails found: {len(all_emails)}")
# Sorted so the final listing is printed in a stable, alphabetical order.
print("All emails:", sorted(all_emails))

Validating Extracted Emails

import re
import socket

def is_valid_email_format(email):
    """Return True when the whole of ``email`` has the shape ``local@domain.tld``.

    The local part may contain letters, digits and ``._%+-``; the domain may
    contain letters, digits, dots and hyphens; the TLD is two or more
    letters.  This is a shape check only -- it does not verify that the
    domain or mailbox exists.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "user@example.com\n" through.
    return re.fullmatch(pattern, email) is not None

def has_valid_domain_syntax(email):
    """Check the domain has at least one dot, no empty labels, and a valid TLD length.

    The domain is the text after the ``@``.  Returns False when there is no
    ``@``, when more than one ``@`` is present, when any dot-separated label
    is empty (``user@.com``, ``user@a..com``), or when the last label is
    shorter than 2 or longer than 63 characters.
    """
    _, at_sign, domain = email.partition('@')
    if not at_sign or '@' in domain:
        return False

    labels = domain.split('.')
    if len(labels) < 2 or not all(labels):
        return False

    # 63 is the DNS limit for a single label; a cap of 6 would reject real
    # TLDs such as ".photography".
    return 2 <= len(labels[-1]) <= 63

# Optional: DNS lookup (network required)
def domain_exists(email):
    """Return True when the domain part of ``email`` resolves to an address.

    Uses ``socket.getaddrinfo``, so it needs network access and checks host
    address resolution only (it does not look up MX records).  Returns False
    when the address has no ``@``, when the domain is empty, when the name
    cannot be encoded as a host name, or when resolution fails.
    """
    pieces = email.split('@')
    if len(pieces) < 2 or not pieces[1]:
        # No "@" or nothing after it: there is no domain to look up.
        return False

    try:
        socket.getaddrinfo(pieces[1], None)
    except (socket.gaierror, UnicodeError):
        # UnicodeError is raised while encoding the host name, before any
        # lookup, for an empty or over-long label such as "a..com".
        return False
    return True

# Two well-formed addresses followed by four malformed ones: missing domain,
# missing local part, missing "@", and a doubled "@".
test_emails = [
    "valid@example.com",
    "user.name+tag@subdomain.company.co.uk",
    "invalid@",
    "@nodomain.com",
    "no-at-sign.com",
    "double@@at.com"
]

for email in test_emails:
    fmt = is_valid_email_format(email)
    # The domain check only runs when the format check passed; otherwise it
    # is reported as False without being evaluated.
    dom = has_valid_domain_syntax(email) if fmt else False
    # A check mark is printed only when both checks pass.
    print(f"{'✓' if fmt and dom else '✗'} {email:<40} format: {fmt}, domain: {dom}")

Using spaCy for Contextual Email Extraction

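For emails embedded in running text, the words around each match help you filter false positives. The example below loads a spaCy model and returns each regex match together with a window of surrounding characters: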

import spacy
import re

# Load the spaCy pipeline named "en_core_web_sm" once, at module import time.
nlp = spacy.load("en_core_web_sm")

EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

def extract_emails_with_context(text, window=40):
    """Return each email found in ``text`` together with its surrounding text.

    Args:
        text: The string to search.
        window: Number of characters to keep on each side of the match.
            Defaults to 40.

    Returns:
        A list of ``{"email": ..., "context": ...}`` dicts, one per match and
        in order of appearance.  ``context`` is the slice of ``text`` from
        ``window`` characters before the match to ``window`` characters after
        it, clipped to the bounds of ``text`` and stripped of leading and
        trailing whitespace.
    """
    # Only the regex matches and plain string slicing are needed here, so the
    # text is not run through a spaCy pipeline: the parsed document was never
    # read and only added cost to every call.
    emails_with_context = []

    for match in EMAIL_PATTERN.finditer(text):
        # Clip the window so it never reaches outside the text.
        start = max(0, match.start() - window)
        end = min(len(text), match.end() + window)

        emails_with_context.append({
            "email": match.group(),
            "context": text[start:end].strip()
        })

    return emails_with_context

# Sample input: three addresses, each on its own line with descriptive text
# before or after it.
text = """
For customer inquiries: support@company.com (Monday-Friday 9-5 EST).
Engineering lead: jane.chen@company.com
Partnerships: biz@company.io - response within 24 hours guaranteed.
"""

# Each result is a dict with "email" and "context" keys.
results = extract_emails_with_context(text)
for r in results:
    print(f"Email: {r['email']}")
    print(f"Context: ...{r['context']}...\n")

Complete Email Extraction Pipeline

import re
from collections import defaultdict

EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

def pipeline_extract_emails(text):
    """Full extraction pipeline: deobfuscate, extract, deduplicate, validate.

    Args:
        text: Free text that may contain plain or obfuscated addresses
            (``[at]``/``(at)``/`` at `` and ``[dot]``/``(dot)``/`` dot ``,
            matched case-insensitively).

    Returns:
        A dict with three keys:
        ``"emails"``: the unique, lower-cased addresses, sorted;
        ``"by_domain"``: a dict mapping each domain to its sorted addresses;
        ``"count"``: the number of unique addresses.
    """

    # Step 1: Deobfuscate.  Bracketed markers also swallow the whitespace
    # around them; otherwise "sales [at] company [dot] com" turns into
    # "sales @ company . com" and is never extracted.  Bare " at " / " dot "
    # are replaced wherever they occur; that is harmless here because only
    # the extracted addresses are returned, not the rewritten text.
    text = re.sub(r'\s*(?:\[at\]|\(at\))\s*|\s+at\s+', '@', text, flags=re.IGNORECASE)
    text = re.sub(r'\s*(?:\[dot\]|\(dot\))\s*|\s+dot\s+', '.', text, flags=re.IGNORECASE)

    # Step 2: Extract
    raw_emails = EMAIL_PATTERN.findall(text)

    # Step 3: Normalize and deduplicate.  Sorting here (rather than only in
    # the return value) keeps the per-domain lists in a stable order too.
    normalized = sorted({e.lower() for e in raw_emails})

    # Step 4: Basic validation: non-empty local part and a dot in the domain.
    valid = []
    for candidate in normalized:
        local, _, domain = candidate.partition('@')
        if local and '.' in domain:
            valid.append(candidate)

    # Step 5: Categorize by domain
    by_domain = defaultdict(list)
    for email in valid:
        by_domain[email.partition('@')[2]].append(email)

    return {"emails": valid, "by_domain": dict(by_domain), "count": len(valid)}

# Sample input mixing plain addresses with bracketed ("[at]", "[dot]") and
# bare-word ("at", "dot") obfuscation.
text = """
Please reach out to:
- Sales team: sales [at] company [dot] com
- HR Department: hr@company.com or recruiting@company.com
- CEO: ceo@company.io (for press inquiries only)
- Support: help at support dot io
"""

# The result dict carries "count", "emails" and "by_domain".
result = pipeline_extract_emails(text)
print(f"Found {result['count']} email(s):")
for email in result['emails']:
    print(f"  {email}")
print("\nBy domain:")
for domain, emails in result['by_domain'].items():
    print(f"  {domain}: {emails}")