# Generate obfuscated-word rules for spamassassin
#
# Usage: perl obfusc.pl wordlistfile [wordlistfile ...] 2>/dev/null >sarulefile
#
# Wordlistfile format:
#   word [score]
# Comments (text beginning with #) will be ignored.
# End word with \S to suppress word-boundary at end
# (in other words, match on variant endings)
# Underscores in a word are converted to spaces (for phrases).
# Default score is 1.0
# Example:
#   diploma 1.5
#   unaccredited 2
#   prestig\S
#   free
#   viagra 5.0
#
# Obtain current version from:
#   http://www.impsec.org/~jhardin/antispam/obfusc.pl
#
# (C) 2004-2007 John Hardin
# Released under the GNU General Public License
# Contact the author for commercial licensing.
# Copyright is not claimed on the output of this script.
#
# $Id: obfusc.pl,v 1.13 2007-09-20 12:56:31-07 jhardin Exp jhardin $
#

use strict;
use warnings;

# Optional junk allowed between two letters of the word in the generated
# regex: either one punctuation/whitespace character or " xx " (two letters
# surrounded by whitespace).
my $INTER_LETTER_TRASH =
      "(?:"
    . "[-'\\.,!_~\"\*\^\\s\\+\\#]"    # embedded punctuation mark obfusc
    . "|"
    . "\\s[a-z]{2}\\s"                # embedded " xx " obfusc
    . ")?";

# Letters whose complete pattern is fixed: look-alike characters plus a
# named HTML entity alternative (&e...; &a...; and so on).  These letters get
# no numeric-entity alternative.
my %NAMED_ENTITY_PATTERN = (
    "e" => "(?:[e3\\xBC\\xBD\\xC6\\xC8-\\xCB\\xE6\\xE8-\\xEB]|\&e[a-z]+;)",
    "a" => "(?:[a4\\\@\\xC0-\\xC6\\xE0-\\xE6]|\\/\\\\|\&a[a-z]+;)",
    "o" => "(?:[o0\\x05\\xA9\\xAE\\xBC\\xBD\\xD2-\\xD6\\xD8\\xF0\\xF2-\\xF6\\xF8]|\&o[a-z]+;|([(][)]))",
    "i" => "(?:[i!l1j\\|\\/\\xA1\\xCC-\\xCF\\xEC-\\xEF]|\&i[a-z]+;)",
    "u" => "(?:[u\\xB5\\xD9-\\xDC\\xF9-\\xFC]|\&u[a-z]+;)",
);

# Look-alike alternatives for the remaining letters.  letter_pattern() wraps
# each of these in (?:...) and appends the numeric-entity alternative
# &\#(?:<ord of uppercase>|<ord of lowercase>);
my %LOOKALIKES_FOR = (
    "t" => "[t\\xA3\\xB1]",
    "n" => "[n\\xD1\\xF1]|(\\|\\\\\\|)",
    "s" => "[s5z\\\$\\xA6\\xA7\\xA8]",
    "h" => "h|(\\|-\\|)",
    "d" => "[d\\xD0]|(\\|[)])",
    "l" => "[l1i!\\|\\xCC-\\xCF]|(\\|_)",
    "y" => "[yv\\xA5\\xBE\\xDD\\xFD\\xFF]",
    "p" => "[pq\\xB6\\xDE\\xFE]",
    "z" => "[z2\\xB4\\xB8]",
    "c" => "[c\\xA2\\xA9\\xAB\\xC7\\xE7]",
    "m" => "m|rn|([\\/\\|]\\\\\\/[\\|\\\\])",
    "w" => "w|(\\\\\\/\\\\\\/)",
    "k" => "k|(\\|<)",
    "v" => "v|(\\\\\\/)",
    "x" => "[x\\xD7]|(><)",
    "g" => "[gq]",
    "q" => "[gq]",
    "b" => "[b8\\xDF]",
);

# Return the regex fragment that matches one (lowercase) character of the
# word together with its obfuscated variants.
#   $ltr - a single character taken from the word
sub letter_pattern {
    my ($ltr) = @_;

    return $NAMED_ENTITY_PATTERN{$ltr} if exists $NAMED_ENTITY_PATTERN{$ltr};

    # A space (written as "_" in the word list) matches any run of whitespace.
    return "\\s+" if $ltr eq " ";

    # These characters are emitted verbatim.
    # NOTE(review): presumably meant to allow regex syntax in the word list;
    # the quantifiers and trash inserted between letters are not adjusted for
    # them -- confirm intended usage before relying on it.
    return $ltr if $ltr eq "[" || $ltr eq "]" || $ltr eq "+";

    # Characters without a table entry match only themselves or their
    # numeric entity.
    my $variants = exists $LOOKALIKES_FOR{$ltr} ? $LOOKALIKES_FOR{$ltr} : $ltr;
    return "(?:$variants|\&\\#(?:" . ord(uc $ltr) . "|" . ord($ltr) . ");)";
}

# Build the two regex bodies used in a rule.
#   $word - the lowercased word or phrase
# Returns ($re, $re_spc):
#   $re     - per-letter look-alike patterns with optional inter-letter trash
#   $re_spc - the plain letters, each pair separated by an optional \s
# Progress messages are written to STDERR via warn.
sub build_patterns {
    my ($word) = @_;

    my $re      = "";
    my $re_spc  = "";
    my $last    = "";    # previous character; "" only before the first one
    my $repeats = 0;     # how many times $last has been repeated so far

    LETTER:
    for my $ltr (split //, $word) {
        warn "$word: checking '$ltr' (RE: \"$re\" last: $last)\n";

        # Compare against "" explicitly: a previous character of "0" is a
        # false value in Perl and must still count as "there was a letter".
        if ($last ne "") {
            $re_spc .= "\\s?";

            if ($ltr eq $last) {
                warn "$word: string of letters detected...\n";
                $repeats++;

                # Quantify the previous pattern once per run of repeats.
                # "\s+" is never quantified: "\s+{2,4}" is a nested
                # quantifier and does not compile.
                $re .= "{2,4}" if $repeats < 2 && $ltr ne " ";
                $re_spc .= $ltr;
                next LETTER;
            }

            # double-letter obfusc: a single letter may appear twice
            # (skipped after "\s+" for the nested-quantifier reason above)
            $re .= "{1,2}" if $repeats == 0 && $last ne " ";

            # inter-letter trash
            $re .= $INTER_LETTER_TRASH;
        }

        $last    = $ltr;
        $repeats = 0;
        $re_spc .= $ltr;
        $re     .= letter_pattern($ltr);
    }

    warn "$word: $re\n";
    return ($re, $re_spc);
}

# Parse one line of a word list.
#   $line - raw input line
# Returns ($word, $score, $boundend), or an empty list when the line holds no
# word.  $word is lowercased with "_" turned into spaces; $score defaults to
# "1.0" when the line has none; $boundend is "\b", or "" when the word ended
# in \S, \S+ or \S* (match on trailing text, this is only the beginning of
# the word).
sub parse_entry {
    my ($line) = @_;

    $line =~ s/#.*//;
    return unless $line =~ /^\s*(\S+)(?:\s+([\d.]+))?/;
    my ($word, $score) = ($1, $2);

    # Default only when no score was given, so an explicit "0" is kept
    # (a plain truth test would discard "0" but keep "0.0").
    $score = "1.0" unless defined $score;

    my $boundend = "\\b";
    if ($word =~ s/\\S[*+]?$//) {
        # trailing \S \S+ \S* means: don't end this on a word boundary
        $boundend = "";
    }

    $word = lc $word;
    $word =~ s/_/ /g;

    return ($word, $score, $boundend);
}

# Produce the text of one rule (comment, describe, body, optional score,
# blank line).
#   $rule_name - rule identifier, e.g. OBFU_WRD_001
#   $word      - word as returned by parse_entry
#   $score     - score; the score line is omitted when it equals 1
#   $boundend  - regex suffix ("\b" or "")
# The body regex begins with (?!$word) so that the word as written in the
# list does not satisfy the rule.
sub format_rule {
    my ($rule_name, $word, $score, $boundend) = @_;

    my ($re, $re_spc) = build_patterns($word);

    my $text = "# $word @ $score\n";
    $text .= "# open-ended, may FP.\n" unless $boundend;
    $text .= "describe\t$rule_name\tobfuscated \"$word\"\n";
    $text .= "body\t$rule_name\t/\\b(?!$word)(?:(?:$re)|(?:$re_spc))$boundend/i\n";
    $text .= "score\t$rule_name\t$score\n" unless ($score == 1.00);
    $text .= "\n";

    return $text;
}

# Read word lists from the files named on the command line (or STDIN) and
# print one rule per distinct word to STDOUT.  Rules are numbered from 1 in
# the order the words are first seen; repeated words are skipped.
sub main {
    my $rule_number = 1;
    my %seen;

    while (my $line = <>) {
        warn "read $line";

        my @entry = parse_entry($line);
        next unless @entry;
        my ($word, $score, $boundend) = @entry;

        warn "parsed \"$word\" @ $score\n";
        next if $seen{$word};

        my $rule_name = sprintf("OBFU_WRD_%03d", $rule_number++);
        print format_rule($rule_name, $word, $score, $boundend);

        $seen{$word} = 1;
    }

    return;
}

# Run when executed as a script; stay quiet when loaded with require/do.
main() unless caller;