What Are Regular Expressions?

Regular expressions (regex) are patterns that describe sets of strings. They let you search, match, and manipulate text with precision that string methods alone can't achieve.

import re

# Find all email addresses in text.
# The domain is written as one label followed by one or more ".label" parts,
# so a full stop that ends a sentence ("... alice@example.com.") is not
# swallowed into the match the way a trailing [\w.]+ would swallow it.
email_finder = r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+"
text = "Contact alice@example.com or bob@company.org for info"
emails = re.findall(email_finder, text)
print(emails)    # ['alice@example.com', 'bob@company.org']
💡
Always use raw strings

Prefix regex patterns with r (e.g., r"\d+") to prevent Python from interpreting backslashes as escape characters.

Basic Pattern Syntax

# Literal characters match themselves
re.search(r"hello", "say hello world")    # Matches "hello"

# Special characters (metacharacters)
.       # Any single character (except newline)
^       # Start of string
$       # End of string (or just before a trailing newline)
\       # Escape special characters

Character Classes

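\d      # Any digit ([0-9]; for str patterns also other Unicode digits unless re.ASCII is set)
\D      # Any non-digit
\w      # Any word character ([a-zA-Z0-9_]; for str patterns also Unicode letters/digits unless re.ASCII is set)
\W      # Any non-word character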
\s      # Any whitespace (space, tab, newline)
\S      # Any non-whitespace

[abc]   # Any of: a, b, or c
[a-z]   # Any lowercase letter
[0-9]   # Any digit
[^abc]  # Any character EXCEPT a, b, c

Quantifiers

*       # 0 or more
+       # 1 or more
?       # 0 or 1 (optional)
{3}     # Exactly 3
{2,5}   # Between 2 and 5
{3,}    # 3 or more

Core re Module Functions

re.search() — Find First Match

import re

# re.search() scans the string and stops at the first place the pattern
# matches; it returns a Match object, or None when nothing matches.
order_pattern = r"#(\d+)"    # "#" followed by one or more captured digits
text = "Order #12345 was placed on 2026-03-04"
match = re.search(order_pattern, text)

if match is not None:
    whole, order_id = match.group(0), match.group(1)
    print(whole)              # #12345 (entire match)
    print(order_id)           # 12345  (first capture group)
    print(match.start())      # 6      (index where the match begins)

re.findall() — Find All Matches

text = "Prices: $10.99, $25.50, and $7.00"

# No capture group: each result is the full matched text.
price_with_sign = r"\$\d+\.\d{2}"
prices = re.findall(price_with_sign, text)
print(prices)    # ['$10.99', '$25.50', '$7.00']

# One capture group: findall returns only what the group captured,
# so the leading "$" is dropped from every result.
price_amount = r"\$(\d+\.\d{2})"
numbers = re.findall(price_amount, text)
print(numbers)   # ['10.99', '25.50', '7.00']

re.sub() — Search and Replace

# Replace phone numbers with [REDACTED]
phone_number = r"\d{3}-\d{4}"
text = "Call 555-1234 or 555-5678 for support"
cleaned = re.sub(phone_number, "[REDACTED]", text)
print(cleaned)   # Call [REDACTED] or [REDACTED] for support

# Use groups in replacement: \1, \2 and \3 refer back to the captured
# year, month and day, so the parts can be written out in a new order.
iso_date = r"(\d{4})-(\d{2})-(\d{2})"
month_day_year = r"\2/\3/\1"
text = "2026-03-04"
us_format = re.sub(iso_date, month_day_year, text)
print(us_format)  # 03/04/2026

re.split() — Split by Pattern

# Split by multiple delimiters: any run of commas, semicolons and/or
# whitespace counts as a single separator.
delimiters = r"[,;\s]+"
text = "apple, banana; cherry  grape"
items = re.split(delimiters, text)
print(items)     # ['apple', 'banana', 'cherry', 'grape']

Groups and Capturing

import re

# Named groups: (?P<name>...) lets you fetch a group by name instead of number
pattern = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})"
match = re.search(pattern, "Date: 2026-03-04")

if match is not None:
    for part in ("year", "month", "day"):
        print(match.group(part))   # 2026, then 03, then 04
    print(match.groupdict())       # {'year': '2026', 'month': '03', 'day': '04'}

# Non-capturing group (?:...)
# Groups part of a pattern (e.g. so a quantifier like ? applies to all of it)
# without creating a numbered capture, so the domain below is still group 1.
pattern = r"(?:https?://)?(?:www\.)?(\w+\.\w+)"
match = re.search(pattern, "Visit www.example.com")
# re.search() returns None when nothing matches; check before calling .group()
if match:
    print(match.group(1))    # example.com

Common Patterns

import re

# Email validation (basic)
# \Z anchors at the very end of the string. "$" would also match just before
# a trailing newline, so "user@example.com\n" would wrongly validate.
# The domain is one label plus one or more ".label" parts, which rejects
# empty labels such as "user@example..com" or "user@example.com.".
email_pattern = r"^[\w.+-]+@[\w-]+(?:\.[\w-]+)+\Z"
print(bool(re.match(email_pattern, "user@example.com")))    # True
print(bool(re.match(email_pattern, "invalid@")))             # False

# IP address (IPv4)
# Each octet is limited to 0-255 (no leading zeros), so look-alikes such as
# 999.300.1.1 are not reported. For validating a single address, the standard
# library's ipaddress module is the more robust tool.
octet = r"(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])"
ip_pattern = rf"\b{octet}(?:\.{octet}){{3}}\b"
text = "Server at 192.168.1.1 responded, backup at 10.0.0.1"
ips = re.findall(ip_pattern, text)
print(ips)    # ['192.168.1.1', '10.0.0.1']

# URL extraction: pull every http(s) link out of free text
url_pattern = (
    r"https?://"        # scheme: http:// or https://
    r"[\w./\-?=&#]+"    # host, path and query-string characters
)
text = "Check https://example.com/page?id=1 and http://test.org"
urls = re.findall(url_pattern, text)
print(urls)    # ['https://example.com/page?id=1', 'http://test.org']

# Password validation (8+ chars, uppercase, lowercase, digit)
def check_password(password):
    """Return True if *password* meets the basic strength rules.

    The password must be at least 8 characters long and contain at least
    one uppercase ASCII letter, one lowercase ASCII letter and one digit.
    """
    if len(password) < 8:
        return False
    # One pattern per required character kind; every one must be found.
    required_kinds = (r"[A-Z]", r"[a-z]", r"\d")
    return all(re.search(kind, password) for kind in required_kinds)

Flags

import re

# Case-insensitive matching: letters match regardless of case
re.findall(r"python", "Python PYTHON python", flags=re.IGNORECASE)
# ['Python', 'PYTHON', 'python']

# Multiline: ^ and $ match at the start and end of every line,
# not only at the start and end of the whole string
text = "Line 1\nLine 2\nLine 3"
re.findall(r"^Line \d", text, flags=re.MULTILINE)
# ['Line 1', 'Line 2', 'Line 3']

# Verbose mode: whitespace in the pattern is ignored and "#" starts a
# comment, so a long pattern can be laid out and annotated piece by piece
phone_pattern = re.compile(
    r"""
    (\d{3})     # Area code
    [-.\s]?     # Optional separator
    (\d{3})     # Exchange
    [-.\s]?     # Optional separator
    (\d{4})     # Number
""",
    flags=re.VERBOSE,
)

Practical Example: Log Parser

import re

log_lines = [
    '2026-03-04 10:30:15 [ERROR] Failed to connect to 192.168.1.100:3306',
    '2026-03-04 10:30:16 [INFO] Retrying connection (attempt 2/3)',
    '2026-03-04 10:30:17 [ERROR] Connection timeout after 5000ms',
    '2026-03-04 10:30:20 [INFO] Connected successfully'
]

# One capture group each for the date, the time, the [LEVEL] and the message
pattern = r"(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)"
log_entry = re.compile(pattern)    # compiled once, reused for every line

for line in log_lines:
    match = log_entry.match(line)
    if match is None:
        continue    # line is not in the expected format; skip it
    date, time, level, message = match.groups()
    if level != "ERROR":
        continue    # only errors are reported
    print(f"ERROR at {time}: {message}")

# Extract all IP:port combinations
all_text = "\n".join(log_lines)
ip_and_port = r"(\d+\.\d+\.\d+\.\d+):(\d+)"
connections = re.findall(ip_and_port, all_text)
for ip, port in connections:
    print(f"  Target: {ip} port {port}")

Summary

  • re.search() finds the first match; re.findall() finds all matches
  • re.sub() does search-and-replace; re.split() splits by pattern
  • \d, \w, \s match digits, word chars, whitespace
  • +, *, ?, {n} control how many times to match
  • Parentheses (...) create capture groups; (?P<name>...) creates named groups
  • Always use raw strings r"..." for regex patterns
  • Flags: re.IGNORECASE, re.MULTILINE, re.VERBOSE
🎉
Regex unlocked!

You can now match, extract, and transform text with powerful patterns. Next up: virtual environments and packages — managing dependencies like a pro.