Emails rules extractor
-----------------------

Title: Emails rules extractor
Author: chatgpt

An AI-written Python 3 interactive script that does the following:
Features:
- Reads email addresses from a file.
- Extracts:
  - numbers at the end of the username (e.g., user123@gmail.com → 123)
  - the domain (e.g., gmail.com)
- Filters for specific domains (gmail.com, yahoo.com, etc., given as user input).
- Sorts results by frequency (how often each number + domain pair appears).
- Creates Hashcat rules like $1$2$3$@$g$m$a$i$l$.$c$o$m (see the sketch below).
- Writes the output to a file of your choice.
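The core transformation is small: take any trailing digits from the username, keep the domain, and prefix every character of the resulting string with `$`, Hashcat's append-character rule. A minimal sketch of just that step, assuming a single well-formed address (the function name `email_to_rule` is illustrative and not part of the script below):

```python
import re

def email_to_rule(email):
    # Split into username and domain, then keep only addresses whose
    # username ends in digits (e.g. user123@gmail.com -> "123" + "gmail.com").
    user, _, domain = email.strip().partition('@')
    digits = re.search(r'(\d+)$', user)
    if not digits or not domain:
        return None
    # Prefix every character with '$' so Hashcat appends "123@gmail.com".
    return ''.join(f'${c}' for c in digits.group(1) + '@' + domain.lower())

print(email_to_rule('user123@gmail.com'))  # -> $1$2$3$@$g$m$a$i$l$.$c$o$m
```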
```python
#!/usr/bin/env python3
import re
from collections import Counter, defaultdict


def extract_data_from_email(email):
    # Validate the address and return (trailing digits, domain), or None.
    match = re.match(r'^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})$', email.strip())
    if not match:
        return None
    user, domain = match.groups()
    digits_match = re.search(r'(\d+)$', user)
    if digits_match:
        digits = digits_match.group(1)
        return digits, domain.lower()
    return None


def string_to_hashcat_rule(s):
    # Prefix every character with '$' (Hashcat's append-character rule).
    return ''.join(f"${c}" for c in s)


def main():
    input_path = input("Enter path to input file containing emails: ").strip()
    domains_input = input("Enter comma-separated domains to filter (e.g., gmail.com,yahoo.com): ").strip()
    output_path = input("Enter path to save generated hashcat rules: ").strip()

    domains_to_include = set(domain.strip().lower() for domain in domains_input.split(',') if domain.strip())

    counter = Counter()
    examples = defaultdict(list)

    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            email = line.strip()
            extracted = extract_data_from_email(email)
            if extracted:
                digits, domain = extracted
                if domain in domains_to_include:
                    key = (digits, domain)
                    counter[key] += 1
                    if len(examples[key]) < 3:
                        examples[key].append(email)

    sorted_items = counter.most_common()

    with open(output_path, 'w', encoding='utf-8') as out:
        for (digits, domain), count in sorted_items:
            rule = string_to_hashcat_rule(digits + '@' + domain)
            out.write(f"{rule}\n")

    print(f"\nDone! {len(sorted_items)} rules written to {output_path}")
    print("Top 5 extracted rules with examples:")
    for (digits, domain), count in sorted_items[:5]:
        rule = string_to_hashcat_rule(digits + '@' + domain)
        print(f"  Rule: {rule} | Count: {count} | Examples: {examples[(digits, domain)]}")


if __name__ == '__main__':
    main()
```
Example

Input file contains:

```
user123@gmail.com
anna123@gmail.com
john456@gmail.com
user123@yahoo.com
mark123@gmail.com
bad.email.com
```

Output (with gmail.com,yahoo.com selected as filter domains):

```
$1$2$3$@$g$m$a$i$l$.$c$o$m
$4$5$6$@$g$m$a$i$l$.$c$o$m
$1$2$3$@$y$a$h$o$o$.$c$o$m
```
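In Hashcat's rule syntax, each `$X` token appends the character `X` to the candidate word, so the first rule above turns a wordlist entry such as `password` into `password123@gmail.com`. A rough way to preview that locally, simulating only the `$` append rule rather than Hashcat's full rule engine:

```python
def apply_append_rule(rule, word):
    # Interpret only '$X' (append character X) tokens, two characters at a time.
    suffix = ''.join(rule[i + 1] for i in range(0, len(rule) - 1, 2) if rule[i] == '$')
    return word + suffix

print(apply_append_rule('$1$2$3$@$g$m$a$i$l$.$c$o$m', 'password'))
# -> password123@gmail.com
```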
- Updated version: analyzes and displays the top 20 overall domains found in the input file (NEW); the limit is customizable via the `limit` parameter of `print_top_domains`. Ready to use in the script below.
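The domain preview boils down to counting everything after the '@' and taking `Counter.most_common`; a standalone sketch of that idea, assuming one address per line (the file name `emails.txt` is illustrative):

```python
from collections import Counter

domain_counts = Counter()
with open('emails.txt', encoding='utf-8') as f:
    for line in f:
        email = line.strip()
        if '@' in email:
            domain = email.rsplit('@', 1)[1].lower()  # split once, from the right
            if domain:
                domain_counts[domain] += 1

# The 20 most frequent domains, mirroring print_top_domains(limit=20)
for domain, count in domain_counts.most_common(20):
    print(f'{domain}: {count}')
```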
```python
#!/usr/bin/env python3
"""
Email-Based Hashcat Rule Extractor

This script processes a file containing email addresses, extracts trailing
digit sequences from the usernames (before the '@'), filters them by
specified domains, and generates Hashcat-compatible rule strings.

It first displays the top 20 most frequent domains in the input file to help
the user decide which domains to target for rule extraction.

Functionality:
- Reads email addresses from an input file.
- **Analyzes and displays the top 20 overall domains (NEW).**
- Filters addresses by user-specified domain(s) (e.g., 'gmail.com', 'yahoo.com').
- Extracts trailing digits from the local-part (e.g., 'user123' -> '123').
- Groups and counts digit-domain combinations.
- Generates Hashcat rules from the most common combinations.
- Saves rules to an output file.
- Displays the top 5 most frequent patterns with example emails.

Usage:
- Run the script and provide:
    1. Path to the input file containing emails (one per line).
    2. Comma-separated list of domains to filter (after seeing the top domains).
    3. Path to save the output Hashcat rules.
"""

import sys
import re
from collections import Counter, defaultdict


def get_file_path(prompt):
    """Handles continuous prompting until a non-empty path is entered."""
    while True:
        path = input(prompt).strip()
        if path:
            return path
        print("Path cannot be empty. Please try again.")


def extract_data_from_email(email):
    """
    Extracts the local part and domain from an email.
    Returns (digits, domain) if the local part ends in digits, otherwise None.
    """
    # Regex to validate email structure and separate user/domain
    match = re.match(r'^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})$', email.strip())
    if not match:
        return None

    user, domain = match.groups()
    domain = domain.lower()  # Normalize domain to lowercase

    # Check for trailing digits in the local part (username)
    digits_match = re.search(r'(\d+)$', user)
    if digits_match:
        digits = digits_match.group(1)
        return digits, domain
    return None


def string_to_hashcat_rule(s):
    """Converts a string (e.g., '123@gmail.com') to a Hashcat rule (e.g., '$1$2$3$@$g$m$a$i$l$.$c$o$m')."""
    return ''.join(f"${c}" for c in s)


def print_top_domains(file_path, limit=20):  # edit limit if required
    """Reads the file, counts all domains, and prints the top 'limit' domains."""
    domain_counts = Counter()
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                email = line.strip()
                if '@' in email:
                    # rsplit ensures we only split once from the right
                    _, domain = email.rsplit('@', 1)
                    if domain:
                        domain_counts[domain.lower()] += 1
    except FileNotFoundError:
        print(f"\nError: File not found at '{file_path}'", file=sys.stderr)
        return False
    except Exception as e:
        print(f"\nAn error occurred while reading the file: {e}", file=sys.stderr)
        return False

    print("\n" + "=" * 50)
    print(f"🥇 Top {limit} Domains Found in the Input File:")
    print("=" * 50)
    # Print the top domains comma-separated, without spaces
    print(','.join(domain for domain, count in domain_counts.most_common(limit)))
    print("=" * 50 + "\n")

    return True


def main():
    """Main function to handle user input, file processing, and rule generation."""
    print("--- Hashcat Rule Extractor for Email Patterns ---")

    # 1. Get Input Path and Print Top Domains
    input_path = get_file_path("Enter path to input file containing emails: ")

    # Display the top domains before proceeding
    if not print_top_domains(input_path, limit=20):
        # Exit if file reading failed in print_top_domains
        sys.exit(1)

    # 2. Get Domains to Filter
    domains_input = input("Enter comma-separated domains to filter (e.g., gmail.com,yahoo.com): ").strip()

    # 3. Get Output Path
    output_path = get_file_path("Enter path to save generated hashcat rules: ")

    domains_to_include = set(domain.strip().lower() for domain in domains_input.split(',') if domain.strip())

    if not domains_to_include:
        print("\nWarning: No domains were specified for filtering. Exiting.", file=sys.stderr)
        sys.exit(0)

    counter = Counter()
    examples = defaultdict(list)

    print("\nProcessing file...")
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            for line in f:
                email = line.strip()
                extracted = extract_data_from_email(email)
                if extracted:
                    digits, domain = extracted
                    if domain in domains_to_include:
                        key = (digits, domain)
                        counter[key] += 1
                        # Store a few examples for display later
                        if len(examples[key]) < 3:
                            examples[key].append(email)
    except Exception as e:
        print(f"An error occurred during file processing: {e}", file=sys.stderr)
        sys.exit(1)

    sorted_items = counter.most_common()

    # 4. Write Rules to Output File
    try:
        with open(output_path, 'w', encoding='utf-8') as out:
            for (digits, domain), count in sorted_items:
                rule = string_to_hashcat_rule(digits + '@' + domain)
                out.write(f"{rule}\n")
    except Exception as e:
        print(f"Error writing to output file '{output_path}': {e}", file=sys.stderr)
        sys.exit(1)

    # 5. Display Summary
    print(f"\nDone! {len(sorted_items)} rules written to {output_path}")
    print("\nTop 5 extracted rules with examples:")

    if not sorted_items:
        print("  No patterns found matching the specified domains and having trailing digits.")
        return

    for (digits, domain), count in sorted_items[:5]:
        rule = string_to_hashcat_rule(digits + '@' + domain)
        print(f"  Rule: {rule} | Count: {count} | Examples: {', '.join(examples[(digits, domain)])}")


if __name__ == '__main__':
    main()
```

https://raw.githubusercontent.com/A113L/Bucket/refs/heads/main/emailr.py