Extracting Email Addresses Using NLP
Email extraction pulls valid email addresses from unstructured text — an essential step for contact management, data enrichment, lead generation, and compliance workflows.
Basic Regex Extraction
Regular expressions are the most reliable approach for standard email formats:
import re
def extract_emails_basic(text):
    """Return every email-like substring of *text*, in order of appearance.

    Matches a local part of letters, digits and ``._%+-``, an ``@``, a domain
    of letters, digits, dots and hyphens, and a final label of two or more
    letters. Duplicates are kept; an empty list is returned when nothing
    matches.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so the
    # TLD class must be plain [A-Za-z]; otherwise "a@b.co|uk" matches whole.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(pattern, text)
# Sample input: each contact sits on its own line so that neighbouring lines
# cannot fuse into a single bogus match (e.g. "...service.ioInvalid").
text = """Contact our team at support@example.com for general inquiries.
For sales, reach alice.jones@company.co.uk or bob_smith@tech.org.
Technical support: dev+nlp@api.service.io
Invalid examples: not-an-email @missinglocal .com"""

emails = extract_emails_basic(text)
print("Found emails:", emails)
# ['support@example.com', 'alice.jones@company.co.uk', 'bob_smith@tech.org', 'dev+nlp@api.service.io']

# Robust Email Pattern with Validation
import re
# Verbose pattern: every inline "#" comment runs to the end of ITS line, so the
# pattern must stay one element per line. Collapsed onto a single line, the
# first comment would swallow the rest of the pattern and compilation fails.
# ("#" inside a character class is a literal and is unaffected.)
EMAIL_REGEX = re.compile(
    r"""
    (?:[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+          # local part: standard chars
       (?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*)  # local part: dot segments
    @                                           # at sign
    (?:[a-zA-Z0-9]                              # domain: first char
       (?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?       # domain: middle
       \.)+                                     # domain: dots
    [a-zA-Z]{2,}                                # TLD: 2+ chars
    """,
    re.VERBOSE,
)


def extract_emails_robust(text):
    """Return all substrings of *text* matched by ``EMAIL_REGEX``, in order.

    The pattern has no capturing groups, so ``findall`` yields the full
    matched address for each hit. Address literals such as
    ``user@[127.0.0.1]`` and strings with an empty local part are not matched.
    """
    return EMAIL_REGEX.findall(text)
# Sample input: one contact per line, so "startup.io" is not glued to the
# following "Support:" label.
text = """Technical contact: engineer@startup.io
Support: help@service.co.uk
Admin: admin@subdomain.company.com
Old-style: user@[127.0.0.1]
Bogus: @@notvalid.com"""

emails = extract_emails_robust(text)
print("Extracted:", emails)

# Handling Obfuscated Emails
People often obfuscate emails to avoid scraping:
import re
def deobfuscate_email(text):
    """Normalize common "at"/"dot" obfuscation patterns in *text*.

    Two families are handled, case-insensitively:

    * Bracketed markers ``[at]``, ``(at)``, ``[dot]``, ``(dot)`` are replaced
      everywhere, together with the spaces/tabs around them, so
      ``info [at] company [dot] com`` becomes ``info@company.com``.
    * Bare words ``at`` / ``dot`` are rewritten only when they form an address
      whose domain contains a spelled-out ``dot`` (``sales at domain dot io``).
      Ordinary prose such as ``Contact us at info`` is left untouched.

    Returns the rewritten text; text without obfuscation is returned unchanged.
    """
    # Bracketed markers are unambiguous. Swallow horizontal padding so the
    # result is a contiguous address, but never join text across lines.
    text = re.sub(r'[ \t]*(?:\[at\]|\(at\))[ \t]*', '@', text, flags=re.IGNORECASE)
    text = re.sub(r'[ \t]*(?:\[dot\]|\(dot\))[ \t]*', '.', text, flags=re.IGNORECASE)

    spelled_dot = re.compile(r'[ \t]+dot[ \t]+', re.IGNORECASE)
    # local part, then "@" or the word "at", then two or more domain labels
    # joined by "." or by the word "dot".
    candidate = re.compile(
        r'(?P<local>[A-Za-z0-9._%+-]+)'
        r'(?:@|[ \t]+at[ \t]+)'
        r'(?P<domain>[A-Za-z0-9-]+(?:(?:\.|[ \t]+dot[ \t]+)[A-Za-z0-9-]+)+)',
        re.IGNORECASE,
    )

    def _join(match):
        domain = match.group('domain')
        # Without a spelled-out "dot" the candidate is either already a plain
        # address or just prose ("look at example.com"); keep it verbatim.
        if not spelled_dot.search(domain):
            return match.group(0)
        return match.group('local') + '@' + spelled_dot.sub('.', domain)

    return candidate.sub(_join, text)
# Inputs covering bracketed, parenthesised and spelled-out obfuscation, plus
# one plain address as a control.
obfuscated_examples = [
    "Contact us at info [at] company [dot] com",
    "Reach admin(at)example(dot)org for support",
    "Email: sales at domain dot io",
    "Reach out to hello@normal.com normally",
]

EMAIL_PATTERN = re.compile(
    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
)

# Normalize each example first, then run the ordinary extractor on the result.
for example in obfuscated_examples:
    cleaned = deobfuscate_email(example)
    emails = EMAIL_PATTERN.findall(cleaned)
    print(f"Original: {example}")
    print(f"Extracted: {emails}\n")

# Bulk Processing Multiple Documents
import refrom pathlib import Path
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def extract_emails_from_text(text):
    """Return the distinct email addresses found in *text* as a list.

    Addresses appear in the order they are first seen. Comparison is exact
    (case-sensitive), so ``A@x.com`` and ``a@x.com`` are both kept.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the addresses in an arbitrary order that can differ between runs.
    return list(dict.fromkeys(EMAIL_PATTERN.findall(text)))
# Simulate processing multiple documents
documents = {
    "contract.txt": "Parties: Alice Smith (alice@lawfirm.com) and Bob Jones (bjones@corp.io). Questions to legal@example.com.",
    "readme.md": "Report bugs to bugs@github.com or contact maintainers at dev@opensource.org.",
    "newsletter.html": "Unsubscribe: unsubscribe@newsletter.com | Help: help@newsletter.com",
    "support_ticket.txt": "User reported by: jane.doe@customer.co.uk. Assigned to: support-team@internal.company.com",
}

# Per-document results, keyed by file name.
all_results = {}
for filename, content in documents.items():
    emails = extract_emails_from_text(content)
    all_results[filename] = emails
    print(f"{filename}: {emails}")

# Aggregate unique emails
all_emails = set().union(*all_results.values())
print(f"\nTotal unique emails found: {len(all_emails)}")
print("All emails:", sorted(all_emails))

# Validating Extracted Emails
import reimport socket
def is_valid_email_format(email):
    """Return True when the whole of *email* looks like ``local@domain.tld``.

    The local part may contain letters, digits and ``._%+-``. The domain must
    be one or more non-empty labels of letters, digits and hyphens, each
    followed by a dot, ending in a label of two or more letters.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}'
    # fullmatch, not match + "$": "$" also matches just before a trailing
    # newline, which would let "user@example.com\n" through.
    return re.fullmatch(pattern, email) is not None
def has_valid_domain_syntax(email, max_tld_length=63):
    """Check that the domain has at least two non-empty labels and a sane TLD.

    Args:
        email: Address to inspect; it must contain exactly one ``@``.
        max_tld_length: Longest accepted final label. Defaults to 63, the DNS
            label limit, so long TLDs such as ``.photography`` are accepted.

    Returns:
        True when the part after ``@`` splits on ``.`` into two or more
        non-empty labels and the last one is 2..max_tld_length characters.
    """
    pieces = email.split('@')
    if len(pieces) != 2:
        return False
    labels = pieces[1].split('.')
    # all(labels) rejects empty labels produced by "a@b..com" or "a@.com".
    return (
        len(labels) >= 2
        and all(labels)
        and 2 <= len(labels[-1]) <= max_tld_length
    )
# Optional: DNS lookup (network required)
def domain_exists(email):
    """Return True when the domain of *email* resolves via ``getaddrinfo``.

    The domain is the text between the first and second ``@`` (or the end of
    the string). Returns False, without touching the network, when there is no
    ``@`` or the domain is empty. This is an address lookup only; it does not
    check for MX records.
    """
    parts = email.split('@')
    if len(parts) < 2 or not parts[1]:
        return False
    try:
        socket.getaddrinfo(parts[1], None)
    except (socket.gaierror, UnicodeError):
        # UnicodeError: the host name is IDNA-encoded before the lookup, and
        # that step rejects empty or over-long labels (e.g. "bad..example").
        return False
    return True
# A mix of well-formed and malformed candidates.
test_emails = [
    "valid@example.com",
    "user.name+tag@subdomain.company.co.uk",
    "invalid@",
    "@nodomain.com",
    "no-at-sign.com",
    "double@@at.com",
]

for email in test_emails:
    fmt = is_valid_email_format(email)
    # The domain check is only meaningful once the overall format passed.
    dom = fmt and has_valid_domain_syntax(email)
    print(f"{'✓' if fmt and dom else '✗'} {email:<40} format: {fmt}, domain: {dom}")

# Using spaCy for Contextual Email Extraction
For emails embedded in running text, capturing the characters around each match gives a reviewer (or a downstream spaCy step) the context needed to filter false positives:
import spacyimport re
nlp = spacy.load("en_core_web_sm")
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def extract_emails_with_context(text, window=40):
    """Return each email found in *text* together with its surrounding text.

    Args:
        text: String to scan.
        window: Number of characters kept on each side of a match
            (default 40).

    Returns:
        A list of ``{"email": ..., "context": ...}`` dicts in order of
        appearance. ``context`` is the slice of *text* from ``window``
        characters before the match to ``window`` characters after it,
        clamped to the string bounds and stripped of outer whitespace.
    """
    # No language-model parse is needed here: the context is a plain character
    # window, so the previously computed (and never read) document object was
    # dropped.
    emails_with_context = []
    for match in EMAIL_PATTERN.finditer(text):
        start = max(0, match.start() - window)
        end = min(len(text), match.end() + window)
        emails_with_context.append({
            "email": match.group(),
            "context": text[start:end].strip(),
        })
    return emails_with_context
# Sample input: one contact per line, so "company.com" is not glued to the
# following "Partnerships:" label.
text = """For customer inquiries: support@company.com (Monday-Friday 9-5 EST).
Engineering lead: jane.chen@company.com
Partnerships: biz@company.io - response within 24 hours guaranteed."""

results = extract_emails_with_context(text)
for r in results:
    print(f"Email: {r['email']}")
    print(f"Context: ...{r['context']}...\n")

# Complete Email Extraction Pipeline
import refrom collections import defaultdict
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def _deobfuscate(text):
    """Rewrite "[at]"/"(dot)" markers and spelled-out "x at y dot z" forms."""
    # Bracketed markers are unambiguous; swallow the spaces/tabs around them so
    # "sales [at] company [dot] com" becomes the contiguous "sales@company.com".
    text = re.sub(r'[ \t]*(?:\[at\]|\(at\))[ \t]*', '@', text, flags=re.IGNORECASE)
    text = re.sub(r'[ \t]*(?:\[dot\]|\(dot\))[ \t]*', '.', text, flags=re.IGNORECASE)

    spelled_dot = re.compile(r'[ \t]+dot[ \t]+', re.IGNORECASE)
    candidate = re.compile(
        r'(?P<local>[A-Za-z0-9._%+-]+)'
        r'(?:@|[ \t]+at[ \t]+)'
        r'(?P<domain>[A-Za-z0-9-]+(?:(?:\.|[ \t]+dot[ \t]+)[A-Za-z0-9-]+)+)',
        re.IGNORECASE,
    )

    def _join(match):
        domain = match.group('domain')
        # Only rewrite when the domain really contains a spelled-out "dot";
        # otherwise this is a plain address or ordinary prose ("meet at noon").
        if not spelled_dot.search(domain):
            return match.group(0)
        return match.group('local') + '@' + spelled_dot.sub('.', domain)

    return candidate.sub(_join, text)


def _has_plausible_parts(email):
    """Return True for a non-empty local part and 2+ non-empty domain labels."""
    local, _, domain = email.partition('@')
    labels = domain.split('.')
    return bool(local) and len(labels) >= 2 and all(labels)


def pipeline_extract_emails(text):
    """Full extraction pipeline: deobfuscate, extract, deduplicate, validate.

    Returns:
        A dict with three keys:
        ``"emails"``: sorted list of distinct, lower-cased valid addresses;
        ``"by_domain"``: dict mapping each domain to its addresses (sorted);
        ``"count"``: number of valid addresses.
    """
    # Step 1: Deobfuscate
    text = _deobfuscate(text)

    # Step 2: Extract
    raw_emails = EMAIL_PATTERN.findall(text)

    # Step 3: Normalize and deduplicate
    normalized = {e.lower() for e in raw_emails}

    # Step 4: Basic validation (the extraction pattern still admits empty
    # domain labels such as "a@b..com", so filter those out here). Sorting
    # makes every part of the result deterministic.
    valid = sorted(e for e in normalized if _has_plausible_parts(e))

    # Step 5: Categorize by domain
    by_domain = {}
    for email in valid:
        by_domain.setdefault(email.partition('@')[2], []).append(email)

    return {"emails": valid, "by_domain": by_domain, "count": len(valid)}
# Sample input: one contact per bullet line, mixing plain, bracketed and
# spelled-out addresses.
text = """Please reach out to:
- Sales team: sales [at] company [dot] com
- HR Department: hr@company.com or recruiting@company.com
- CEO: ceo@company.io (for press inquiries only)
- Support: help at support dot io"""

result = pipeline_extract_emails(text)
print(f"Found {result['count']} email(s):")
for email in result['emails']:
    print(f" {email}")
print("\nBy domain:")
for domain, emails in result['by_domain'].items():
    print(f" {domain}: {emails}")