#!/usr/bin/perl -sw 
##
## Whiplash 
##
## Author: Vipul Ved Prakash <mail@vipul.net>.
## $Id: Whiplash.pm,v 1.1 2004/06/15 00:46:40 vipul Exp $

package Razor2::Signature::Whiplash; 

use Digest::SHA1;

sub new { 

    my ($class, %args) = @_;
    my %self = ( 
        uri_terminators => "/><\"",
        length_error    => 100,
        al_terminators  => " /><\"\r\n",
    );
    return bless \%self, $class;

}


sub whiplash { 

    my ($self, $text) = @_;

    # Wrap all the text in case the URL is broken up on multiple lines.

    # $text =~ s/[\r\n]//g;

    return unless $text;

    my @hosts = $self->extract_hosts($text);

    unless (scalar @hosts) { 

        # No hostnames were found in the text, 
        # return undef;

        debug("No hosts found in the message.");

        return;

    }

    # We have one or more hosts. Generate one signature for each host.

    my $length = length($text);
    my $corrected_length = $length - ($length % $$self{length_error});

    my @sigs;
    my %sig_meta;

    for my $host (@hosts) { 

        # Compute a SHA1 of host and corrected length.  The corrected length is 
        # the value of length to the nearest multiple of ``length_error''.
        # Take the first 20 hex chars from SHA1 and call it the signature.

        my $sha1 = Digest::SHA1->new();
        $sha1->add($host);
        $sig = substr $sha1->hexdigest, 0, 12;

        $sha1 = Digest::SHA1->reset();
        $sha1->add($corrected_length);
        $sig .= substr $sha1->hexdigest, 0, 4;

        push @sigs, $sig;
        $sig_meta{$sig} = [$host, $corrected_length];

        debug("$sig ($host + $corrected_length)");

    }

    return (\@sigs, \%sig_meta);

}


sub extract_hosts { 

    my ($self, $text) = @_;

    #
    # Test Vectors:
    #
    #  1. http://www.nodg.com@www.geocities.com/nxcisdsfdfdsy/off
    #  2. http://www.ksleybiuh.com@213.171.60.74/getoff/
    #  3. <http://links.verotel.com/cgi-bin/showsite.verotel?vercode=12372:9804000000374206> 
    #  4. http://217.12.4.7/rmi/http://definethis.net/526/index.html
    #  5. http://magalygr8sex.free-host.com/h.html
    #  6. http://%3CVenkatrs%3E@218.80.74.102/thecard/4index.htm
    #  7. http://EBCDVKIGURGGCEOKXHINOCANVQOIDOXJWTWGPC@218.80.74.102/thecard/5in
    #  8. http://g.india2.bag.gs/remove_page.htm
    #  9. https://220.97.40.149
    # 10. http://&#109;j&#97;k&#101;d.b&#105;z/u&#110;&#115;&#117;bscr&#105;&#98;e&#46;d&#100;d?leaving
    # 11. http://g5j99m8@it.rd.yahoo.com/bassi/*http://www.lekobas.com/c/index.php
    # 12. <a href="http://Chettxuydyhv   vwyyrcmgbxzj  n as ecq kkurxtrvaug nfsygjjjwhfkpaklh t a qsc  exinscfjtxr
    #     jobg @www.mmv9.org?affil=19">look great / feel great</a> 
    # 13. <A
    #      HREF="http://href=www.churchwomen.comhref=www.cairn.nethref=www.teeter.orghr
    #      ef=www.lefty.bizhref=wwwbehold.pitfall@www.mmstong5f.com/host/index.asp?ID=0
    #      1910?href=www.corrode.comhref=www.ode.nethref=www.clergy.orghref=www.aberrat
    #      e.biz" >
    # 14.  www.pillzthatwork.com  # anything that starts with www.
    # 

    # Decode Hex URI encoding (TV #6) (SPEC-REF: UNESCAPE)
    $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    # Decode Decimal URI encoding (TV #10) (SPEC-REF: UNESCAPE)
    $text =~ s/\&\#([0-9]{2,3})\;/chr($1)/eg;

    debug("host_tokens(): will attempt to extract host names");

    my @hosts;
    my @autolinks = $text =~ m|\s+(www.[^$$self{al_terminators}]+)|ig; # Outlook with autolink these URLs
    push @hosts, @autolinks;

    #
    # We extract host portions from all HTTP/HTTPS URIs found on the text.
    # URIs are decoded if they are encoded, usernames (usually random) are
    # thrown away and all unique hosts are extracted.
    #

    if ($text =~ m|^.*?https?://?(.*)$|si) { 
        $text = "http://$1";
    } else { 
        return;
    }

    while ($host = next_host($text)) {  

        last unless $host;

        # Strip to the domain or IP 

        my $normalized_host;
        
        if ($host =~ /^[\d\.]+$/) { 
    
            # This is an IP address, just use it.
            $normalized_host = $host;

        } else { 

            # See if it's a non country domain.  If so, 
            # we'll extract the hostname. (SPEC-REF: NORMALIZE)

            if ($host =~ m:\.([^\.]*\.[^\.]{2,4})$:) { 
                $normalized_host = $1;
            }  else { 
                $normalized_host = $host;
            }
    
        }

        # Ensure the hostname is not already in the list and that it is
        # potentially a routable hostname: length > 1 and contains
        # atleast one "."

        unless (grep { /^\Q$normalized_host\E$/ } @hosts) {
            if ((length($normalized_host) > 1) and ($normalized_host =~ /\./)) {
                push @hosts, $normalized_host;
            }
        }

        last unless $text =~ m"http://";
        $text = $';

    }

    return @hosts;
   
}


sub next_host { 

        ($_) = @_;

        my ($host, $authority);

        # Algorithm:
        # 1. Find http://
        # 2. Find [@"></]
        # 3. If found @, ignore everything before it and look for ["></]
        # 4. Everything from @ to [">/?] is the host. 
        # 5. If @ was not found, the whole thing is the host
        # 

        # print "next_host: input: $_\n";
        # Remove the protocol name
        s|^http://||i;

        # Find a terminator 
        if (m|(.*?)[>\"\/\?\<]|s) { 
            $_ = $1;
        }

        # Remove the authority section if the URL has one
        s/^[^@]*@//si;

        # The host name is everything after the last `='
        s/.*=//si;
        $host = $_;
        
        # The host part cannot contains whitespace or linefeeds.
        # Everything including and beyond these characters should be
        # throw away.

        $host =~ s/[\r\n\s].*$//s;

        # />
      
        # Lowercase the hostname and remove ``='' chars (which can be part
        # of the hostname sometimes when deQP didn't work correctly.

        $host = lc($host);
        $host =~ s/=//g;   
        $host =~ s/\s*$//g;

        # Throw away the TCP port spec

        $host =~ s/:.*$//;

        # Throw away ``.'' at the end

        $host =~ s/\.$//;

        return $host;

}


sub debug { 
    my $message = shift;
    # print "debug: $message\n";
}


1;

