package Lire::Extensions::WWW::RobotSchema;

# vim:syntax=perl

use strict;

use Lire::Config;
use Lire::Utils qw/ tempfile /;
use Lire::Logger qw/ lr_info /;

use Lire::WWW::Domain;
use Lire::WWW::UserAgent;

use Carp;

use base qw/ Lire::AsciiDlf::ExtendedFieldsCreator /;

# Prepare the per-run state used by create_extended_fields().
#
# Reads the default value of the schema's "robot" field, creates the
# domain and user agent analyzers with that default, caches the positions
# of the DLF fields that are read for every record and resets the sets of
# hosts and user agents that could not be identified.
#
# Parameters:
#   $self     - the extended fields creator.
#   $dlf_info - accepted as part of the method's interface; not used here.
sub init_computation {
    my ( $self, $dlf_info ) = @_;

    $self->{'robot_default'} = $self->schema->field( "robot" )->default;

    # Direct method calls instead of the indirect object syntax
    # ("new Class(...)"), whose parsing depends on which subs happen to
    # be visible in the current package.
    $self->{'domain_analyzer'} =
      Lire::WWW::Domain->new( 'robot_default' => $self->{'robot_default'} );
    $self->{'ua_analyzer'} =
      Lire::WWW::UserAgent->new( 'robot_default' => $self->{'robot_default'} );

    # Cache the positions of the fields read from every DLF record
    $self->{'useragent_idx'} = $self->schema->field( "useragent" )->pos;
    $self->{'host_idx'}      = $self->schema->field( "client_host" )->pos;
    $self->{'url_idx'}       = $self->schema->field( "requested_page" )->pos;

    # Hosts and user agents which requested /robots.txt without being
    # recognized as a robot; filled in by create_extended_fields()
    $self->{'unknown_host'}  = {};
    $self->{'unknown_agent'} = {};
}

# Compute the value of the "robot" extended field for one DLF record.
#
# The client host is analyzed first; the user agent is only consulted
# when the host is missing or is not recognized as a robot.  Hosts (other
# than plain dotted numeric addresses) and user agents that requested
# /robots.txt without being recognized are remembered in
# $self->{'unknown_host'} and $self->{'unknown_agent'}.
#
# Parameters:
#   $self - the extended fields creator, as set up by init_computation().
#   $dlf  - array reference holding the fields of the DLF record.
#
# Returns an array reference with a single element:
#   - undef when neither the client host nor the user agent is available;
#   - the robot reported by the domain analyzer when it differs from the
#     default value;
#   - otherwise the robot reported by the user agent analyzer, or the
#     default value when there is no user agent.
sub create_extended_fields {
    my ( $self, $dlf ) = @_;

    my $host = $dlf->[ $self->{'host_idx'} ];
    my $ua   = $dlf->[ $self->{'useragent_idx'} ];
    my $url  = $dlf->[ $self->{'url_idx'} ];

    # Give up only when there is nothing at all to analyze.  Requiring
    # both values made the per-value checks below redundant and the final
    # fallback unreachable.
    return [ undef ] unless defined $host || defined $ua;

    my $default = $self->{'robot_default'};

    # Only well-behaved robots are expected to fetch this file, so an
    # unrecognized client requesting it is worth remembering.
    my $wants_robots_txt = defined $url && $url eq '/robots.txt';

    if ( defined $host ) {
        $self->{'domain_analyzer'}->setDomain( $host );
        my $robot = $self->{'domain_analyzer'}->getRobot();
        return [ $robot ] if $robot ne $default;

        # Skip hosts made only of digits and dots (unresolved addresses)
        $self->{'unknown_host'}{$host} = 1
          if $wants_robots_txt && $host !~ /^[\d.]+$/;
    }

    if ( defined $ua ) {
        $self->{'ua_analyzer'}->setUserAgent( $ua );
        my $robot = $self->{'ua_analyzer'}->getRobot();
        $self->{'unknown_agent'}{$ua} = 1
          if $wants_robots_txt && $robot eq $default;
        return [ $robot ];
    }

    return [ $default ];
}

# Finish the computation: optionally dump the unidentified robot
# candidates and release the per-run state set up by init_computation().
#
# When the 'lr_debug' configuration variable is true, the hosts and the
# user agents which requested /robots.txt without being recognized as a
# robot are written, one per line, to two files obtained from the
# tempfile() function imported from Lire::Utils.  A file that ends up
# empty is removed again.
#
# Parameters:
#   $self - the extended fields creator.
#   Any remaining arguments are passed on to the parent's
#   end_computation().
sub end_computation {
    my ( $self ) = shift;

    $self->SUPER::end_computation( @_ );

    # Save values that should have been matched
    if ( Lire::Config->get( 'lr_debug' ) ) {
        foreach my $kind ( qw/ host agent / ) {
            my ( $fh, $filename ) = tempfile( "unknown_robot_${kind}-XXXXXX",
                                              'SUFFIX' => '.txt' );
            lr_info( "saving unknown robot ${kind}s in ", $filename );
            foreach my $value ( keys %{ $self->{"unknown_${kind}"} } ) {
                print $fh $value, "\n";
            }
            close $fh;

            # Nothing was recorded: do not leave an empty file behind
            unlink $filename unless -s $filename;
        }
    }

    # Free memory.  The user agent analyzer is stored under 'ua_analyzer'
    # by init_computation(); deleting 'robot_analyzer' never released it.
    delete $self->{'unknown_host'};
    delete $self->{'unknown_agent'};
    delete $self->{'ua_analyzer'};
    delete $self->{'domain_analyzer'};
}

# keep perl happy
1;

__END__

=pod

=head1 NAME

Lire::Extensions::WWW::RobotSchema - compute the robot extended field from the client host and user agent

=head1 SYNOPSIS

 use Lire::Extensions::WWW::RobotSchema;

=head1 DESCRIPTION

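Interface to the Lire::WWW::Domain and Lire::WWW::UserAgent packages.
For each DLF record, the C<robot> extended field is computed from the
client host and, when the host is not recognized as a robot, from the
user agent.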

=head1 VERSION

$Id: RobotSchema.pm,v 1.13 2004/03/20 00:26:29 flacoste Exp $

=head1 COPYRIGHT

Copyright (C) 2001 Stichting LogReport Foundation LogReport@LogReport.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html or write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.

=head1 AUTHOR

E.L. Willighagen <egonw@logreport.org>

=cut
