Inefficient Regular Expression Complexity

Description

Inefficient Regular Expression Complexity occurs when the product employs a regular expression with an inefficient, potentially exponential worst-case computational complexity that consumes excessive CPU resources. Certain regex engines support "backtracking," where the engine retreats to earlier positions when a token fails to match, attempting alternative matches. This becomes problematic when backtracking attempts scale exponentially with input length, input can fail to match the expression, and input length is sufficient to trigger the issue. Attackers craft inputs specifically designed to trigger excessive backtracking, spiking CPU consumption.

Risk

ReDoS has severe implications. CPU exhaustion through malicious input. Application denial of service. Server unresponsiveness. Request timeouts cascade. Service degradation for all users. Thread pool exhaustion. High likelihood when regex processes untrusted input and contains vulnerable patterns.

Solution

Remove nested quantifiers and use non-backtracking regex engines during architecture and design phase (high effectiveness). Set backtracking limits during system configuration phase (e.g., PHP's pcre.backtrack_limit) (moderate effectiveness). Avoid backtracking patterns and reject untrusted regex inputs during implementation phase (high effectiveness). Limit input length for regex processing (moderate effectiveness).

Common Consequences

ImpactDetails
AvailabilityScope: Availability

Denial of service through CPU resource consumption from exponential backtracking.

Example Code

Vulnerable Code

// Vulnerable: Regex with catastrophic backtracking

// VULNERABLE: Nested quantifiers
const vulnerableEmailRegex = /^([a-zA-Z0-9]+)*@example\.com$/;

function vulnerableValidateEmail(input) {
    // VULNERABLE: Exponential backtracking on inputs like
    // "aaaaaaaaaaaaaaaaaaaaaaaaaaaa!"
    return vulnerableEmailRegex.test(input);
}

// VULNERABLE: Overlapping alternation with quantifier
const vulnerableUrlRegex = /^(https?:\/\/)?([\da-z\.-]+)*\.([a-z\.]{2,6})([\/\w \.-]*)*\/?$/;

function vulnerableValidateUrl(input) {
    // Attack input: "http://aaaaaaaaaaaaaaaaaaaaaaaaaaa!"
    return vulnerableUrlRegex.test(input);
}

// VULNERABLE: Greedy quantifiers with overlapping patterns
const vulnerableHtmlRegex = /<([a-z]+)([^>]*)>(.*?)<\/\1>/gi;

function vulnerableParseHtml(input) {
    // Can hang on malformed HTML
    return input.match(vulnerableHtmlRegex);
}

// VULNERABLE: User-controlled regex
function vulnerableSearch(userPattern, text) {
    // VULNERABLE: User can provide malicious regex
    const regex = new RegExp(userPattern);
    return regex.test(text);
    // Attack: userPattern = "(a+)+$", text = "aaaaaaaaaaaaaaaaaa!"
}

// VULNERABLE: Common dangerous patterns
const dangerousPatterns = [
    /^(a+)+$/,           // Nested quantifiers
    /^([a-zA-Z]+)*$/,    // Group with quantifier, quantified again
    /^(a|aa)+$/,         // Overlapping alternation
    /^(.*a){20}$/,       // Quantified group with greedy wildcard
    /^(a|a?)+$/,         // Optional in quantified alternation
];
# Vulnerable: Python regex with backtracking issues

import re

# VULNERABLE: Nested quantifiers
vulnerable_pattern = r'^([a-z]+)+$'

def vulnerable_validate(text):
    # VULNERABLE: Exponential time on "aaaa...a!"
    return re.match(vulnerable_pattern, text) is not None

# VULNERABLE: Markdown parser with ReDoS
markdown_link_regex = r'\[([^\]]*)\]\(([^)]*)\)'

def vulnerable_parse_markdown(text):
    # Can be exploited with crafted input
    return re.findall(markdown_link_regex, text)

# VULNERABLE: XML attribute parsing
xml_attr_regex = r'(\w+)\s*=\s*["\']([^"\']*)["\']'

def vulnerable_parse_xml_attrs(text):
    # Vulnerable to backtracking attacks
    return re.findall(xml_attr_regex, text)

# VULNERABLE: User-agent parsing (CVE-like pattern)
ua_regex = r'^([\w\-/\.]+)(?: \(([^)]+)\))*(?: (\w+)/)?'

def vulnerable_parse_user_agent(ua_string):
    # Complex pattern vulnerable to crafted user agents
    return re.match(ua_regex, ua_string)
// Vulnerable: Java regex patterns

import java.util.regex.*;

public class VulnerableRegex {

    // VULNERABLE: Nested quantifiers
    private static final Pattern VULNERABLE_PATTERN =
        Pattern.compile("^([a-zA-Z0-9]+)+@example\\.com$");

    public boolean vulnerableValidate(String input) {
        // Exponential backtracking possible
        return VULNERABLE_PATTERN.matcher(input).matches();
    }

    // VULNERABLE: Processing untrusted regex
    public boolean vulnerableUserSearch(String userRegex, String text) {
        // User can provide malicious regex
        Pattern pattern = Pattern.compile(userRegex);
        return pattern.matcher(text).find();
    }

    // VULNERABLE: No timeout on regex matching
    public String vulnerableReplace(String input, String pattern, String replacement) {
        // Can hang indefinitely
        return input.replaceAll(pattern, replacement);
    }
}

Fixed Code

// Fixed: Safe regex patterns and validation

// FIXED: Atomic groups / possessive quantifiers where supported
// Using simpler, non-backtracking pattern
const safeEmailRegex = /^[a-zA-Z0-9._%+-]+@example\.com$/;

function safeValidateEmail(input) {
    // FIXED: Limit input length
    if (input.length > 254) {  // RFC 5321 max email length
        return false;
    }

    return safeEmailRegex.test(input);
}

// FIXED: Simplified URL pattern
const safeUrlRegex = /^https?:\/\/[^\s/$.?#].[^\s]*$/i;

function safeValidateUrl(input) {
    // FIXED: Length limit
    if (input.length > 2048) {
        return false;
    }

    return safeUrlRegex.test(input);
}

// FIXED: Use dedicated HTML parser instead of regex
const { JSDOM } = require('jsdom');

function safeParseHtml(input) {
    // FIXED: Use proper HTML parser, not regex
    const dom = new JSDOM(input);
    return dom.window.document;
}

// FIXED: Never allow user-controlled regex
function safeSearch(userQuery, text) {
    // FIXED: Escape user input instead of using as regex
    const escaped = userQuery.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const regex = new RegExp(escaped, 'gi');
    return regex.test(text);
}

// FIXED: Regex with timeout (Node.js with worker)
const { Worker } = require('worker_threads');

async function regexWithTimeout(pattern, text, timeoutMs = 1000) {
    return new Promise((resolve, reject) => {
        const worker = new Worker(`
            const { parentPort, workerData } = require('worker_threads');
            const regex = new RegExp(workerData.pattern);
            const result = regex.test(workerData.text);
            parentPort.postMessage(result);
        `, { eval: true, workerData: { pattern, text } });

        const timer = setTimeout(() => {
            worker.terminate();
            reject(new Error('Regex timeout'));
        }, timeoutMs);

        worker.on('message', (result) => {
            clearTimeout(timer);
            resolve(result);
        });
    });
}

// FIXED: Use RE2 (non-backtracking engine)
const RE2 = require('re2');

function safeRegexMatch(pattern, text) {
    // RE2 guarantees linear time complexity
    const regex = new RE2(pattern);
    return regex.test(text);
}
# Fixed: Safe Python regex handling

import re
import re2  # Google's RE2 library
from typing import Optional

# FIXED: Simple, efficient patterns
SAFE_EMAIL_PATTERN = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

def safe_validate_email(text: str) -> bool:
    # FIXED: Length limit
    if len(text) > 254:
        return False

    # FIXED: Simple pattern without nested quantifiers
    return re.match(SAFE_EMAIL_PATTERN, text) is not None

# FIXED: Use RE2 for user input
def safe_search_re2(pattern: str, text: str) -> Optional[re2.Match]:
    """Use RE2 which has guaranteed linear time complexity."""
    try:
        regex = re2.compile(pattern)
        return regex.search(text)
    except re2.error:
        return None

# FIXED: Timeout wrapper for regex
import signal

class TimeoutError(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutError("Regex operation timed out")

def safe_regex_with_timeout(pattern: str, text: str, timeout_sec: int = 1) -> bool:
    """Execute regex with timeout protection."""
    # FIXED: Set timeout
    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(timeout_sec)

    try:
        result = re.match(pattern, text) is not None
        signal.alarm(0)  # Cancel alarm
        return result
    except TimeoutError:
        return False
    finally:
        signal.signal(signal.SIGALRM, old_handler)

# FIXED: Input validation before regex
def safe_validate_input(text: str, max_length: int = 10000) -> bool:
    """Validate input before applying regex."""
    # FIXED: Reject oversized input
    if len(text) > max_length:
        return False

    # FIXED: Reject obviously malicious patterns
    suspicious_patterns = ['aaaa' * 10, 'a' * 100]
    for pattern in suspicious_patterns:
        if pattern in text:
            return False

    return True

# FIXED: Use dedicated parser for complex formats
from email.utils import parseaddr

def safe_parse_email(email: str) -> tuple:
    """Use stdlib parser instead of regex."""
    # FIXED: Built-in parser is more robust and efficient
    return parseaddr(email)
// Fixed: Safe Java regex handling

import java.util.regex.*;
import java.util.concurrent.*;

public class SafeRegex {

    // FIXED: Simple, efficient pattern
    private static final Pattern SAFE_EMAIL_PATTERN =
        Pattern.compile("^[\\w.%+-]+@[\\w.-]+\\.[a-zA-Z]{2,}$");

    // FIXED: Input length validation
    private static final int MAX_INPUT_LENGTH = 10000;

    public boolean safeValidateEmail(String input) {
        // FIXED: Length check first
        if (input == null || input.length() > 254) {
            return false;
        }

        return SAFE_EMAIL_PATTERN.matcher(input).matches();
    }

    // FIXED: Regex with timeout
    public boolean safeMatchWithTimeout(String pattern, String input, long timeoutMs)
            throws InterruptedException, ExecutionException, TimeoutException {

        // FIXED: Reject oversized input
        if (input.length() > MAX_INPUT_LENGTH) {
            return false;
        }

        ExecutorService executor = Executors.newSingleThreadExecutor();
        Future<Boolean> future = executor.submit(() -> {
            Pattern p = Pattern.compile(pattern);
            return p.matcher(input).matches();
        });

        try {
            // FIXED: Timeout prevents indefinite hanging
            return future.get(timeoutMs, TimeUnit.MILLISECONDS);
        } finally {
            executor.shutdownNow();
        }
    }

    // FIXED: Escape user input for literal matching
    public boolean safeLiteralSearch(String userQuery, String text) {
        // FIXED: Quote user input to prevent regex interpretation
        String escaped = Pattern.quote(userQuery);
        Pattern pattern = Pattern.compile(escaped, Pattern.CASE_INSENSITIVE);
        return pattern.matcher(text).find();
    }

    // FIXED: Validate regex pattern before use
    public boolean isPatternSafe(String pattern) {
        // FIXED: Reject dangerous patterns
        String[] dangerous = {"(.*)+", "([a-z]+)+", "(a|aa)+", "(a|a?)+"};
        for (String d : dangerous) {
            if (pattern.contains(d)) {
                return false;
            }
        }

        // FIXED: Check for nested quantifiers
        if (Pattern.compile("\\([^)]*[+*]\\)[+*]").matcher(pattern).find()) {
            return false;
        }

        return true;
    }
}

CVE Examples

  • CVE-2020-5243: User-Agent parsing with overlapping capture groups causing ReDoS.
  • CVE-2021-21317: npm user-agent parser ReDoS vulnerability.
  • CVE-2019-16215: Markdown parser CPU exhaustion through regex backtracking.
  • CVE-2017-16021: URL validation ReDoS in node-url-parse.

  • CWE-407: Inefficient Algorithmic Complexity (parent)
  • CWE-1226: Complexity Issues (category)
  • CAPEC-492: Regular Expression Exponential Blowup

References

  1. MITRE Corporation. "CWE-1333: Inefficient Regular Expression Complexity." https://cwe.mitre.org/data/definitions/1333.html
  2. OWASP. "Regular Expression Denial of Service - ReDoS"
  3. Davis, J. et al. "The Impact of Regular Expression Denial of Service (ReDoS)"