#!/usr/bin/perl
use strict;
use warnings;
use MKDoc::Init::DBI;
use flo::Standard;
use MKDoc::Util::Text2HTML;
use MKDoc::Stemmer;

# First command-line argument: how far back, in seconds, a document's
# last modification may lie for it to be re-indexed (see reindex_date).
# A missing or false value aborts with the usage message.
our $INDEXABLE_RECENTNESS = shift (@ARGV)
    or die "Usage: $0 <indexable_recentness_in_seconds>";

# site_list() builds its configuration path from MKDOC_DIR, so refuse to
# start without it.
die 'MKDOC_DIR is not defined!' unless $ENV{MKDOC_DIR};

main();
##
# main;
# -----
#   Entry point. When $ENV{SITE_DIR} is defined, initialises the database
#   layer and refreshes the index of that single site (stale entries are
#   removed first, then recently modified documents are re-indexed).
#   Otherwise walks every site returned by site_list(), exporting each one
#   as $ENV{SITE_DIR} and calling itself to process it.
##
sub main
{
    if (defined $ENV{SITE_DIR})
    {
        MKDoc::Init::DBI->init();
        index_remove();
        return index_update();
    }

    for my $site_dir (site_list())
    {
        print "INDEXING: $site_dir\n";
        # The recursive call takes the single-site branch above.
        $ENV{SITE_DIR} = $site_dir;
        main();
    }
}


##
# site_list;
# ----------
#   Reads $ENV{MKDOC_DIR}/conf/httpd.conf and collects, from every line
#   that starts with 'Include', the included path with its trailing
#   '/httpd.conf' (and anything after it) stripped. Include lines that do
#   not reference an httpd.conf file are ignored.
#
#   @returns - The collected paths, sorted as strings
#   @dies    - If the configuration file cannot be opened for reading
##
sub site_list
{
    my $file = $ENV{MKDOC_DIR} . '/conf/httpd.conf';
    my @res  = ();

    # Low-precedence 'or' so that the check applies to open() itself; with
    # '||' it would bind to the file name and the die could never trigger.
    open my $fh, '<', $file or die "Cannot read-open $file: $!";
    while (my $line = <$fh>)
    {
        chomp $line;
        $line =~ s/^Include\s+//     or next;
        $line =~ s/\/httpd\.conf.*// or next;
        push @res, $line;
    }
    close $fh;

    return sort @res;
}


##
# index_remove;
# -------------
#   Deletes from Document_Index every row whose Record_ID is reported by
#   removed_document_ids(), printing one progress line per removed ID.
##
sub index_remove
{
    # Compute the list once: each call to removed_document_ids() runs its
    # own queries, so it must not be called a second time for the loop.
    my @to_remove = removed_document_ids();

    my $dbh = MKDoc::SQL::DBH->get();
    my $sql = 'DELETE FROM Document_Index WHERE Record_ID = ?';
    my $sth = $dbh->prepare ($sql);
    for my $record_id (@to_remove)
    {
        print "\tRemoving $record_id from index\n";
        $sth->execute ($record_id);
    }
}


##
# index_update;
# -------------
#   Selects every row of Document whose Date_Last_Modified is greater than
#   reindex_date() and rebuilds its index entries: the existing keywords
#   are deleted, then fresh ones are inserted. Prints one progress line
#   (the document's Full_Path) per document.
##
sub index_update
{
    my $dbh = MKDoc::SQL::DBH->get();
    my $sql = 'SELECT ID, Full_Path From Document WHERE Date_Last_Modified > ?';
    my $sth = $dbh->prepare ($sql);
    $sth->execute (reindex_date());

    while (my $row = $sth->fetchrow_arrayref())
    {
        # Copy the values into lexicals straight away: the ID has to
        # survive both helper calls below, which a shared global such as
        # $_ cannot guarantee.
        my ($doc_id, $full_path) = @{$row};

        print "\tIndexing $full_path\n";
        index_delete_keywords ($doc_id);
        index_insert_keywords ($doc_id);
    }
}


##
# removed_document_ids;
# ---------------------
#   Computes which record IDs are present in the index but have no
#   matching document any more.
#
#   @returns - The IDs returned by distinct_index_document_ids() minus
#              those returned by distinct_document_ids(), in no
#              particular order
##
sub removed_document_ids
{
    my %stale;

    # Start from every ID the index knows about...
    @stale{ distinct_index_document_ids() } = ();

    # ...and drop the ones that still belong to a document.
    delete @stale{ distinct_document_ids() };

    return keys %stale;
}


##
# distinct_document_ids;
# ----------------------
#   @returns - The distinct ID values found in the Document table, in the
#              order the query yields them
##
sub distinct_document_ids
{
    my $dbh = MKDoc::SQL::DBH->get();
    my $sql = 'SELECT DISTINCT ID FROM Document';
    my $sth = $dbh->prepare ($sql);
    $sth->execute();

    my @res = ();
    # A lexical row variable keeps the global $_ untouched; assigning to
    # $_ here would overwrite whatever the caller currently has aliased
    # to it (e.g. the element of an enclosing for loop).
    while (my $row = $sth->fetchrow_arrayref()) { push @res, $row->[0] }
    return @res;
}

##
# distinct_index_document_ids;
# ----------------------------
#   @returns - The distinct Record_ID values found in the Document_Index
#              table, in the order the query yields them
##
sub distinct_index_document_ids
{
    my $dbh = MKDoc::SQL::DBH->get();
    my $sql = 'SELECT DISTINCT Record_ID FROM Document_Index';
    my $sth = $dbh->prepare ($sql);
    $sth->execute();

    my @res = ();
    # A lexical row variable keeps the global $_ untouched; assigning to
    # $_ here would overwrite whatever the caller currently has aliased
    # to it (e.g. the element of an enclosing for loop).
    while (my $row = $sth->fetchrow_arrayref()) { push @res, $row->[0] }
    return @res;
}


##
# index_delete_keywords ($doc_id);
# --------------------------------
#   Removes every Document_Index row recorded for the given document.
#
#   @param - $doc_id : value matched against Document_Index.Record_ID
#   @returns - Whatever the statement's execute() call returns
##
sub index_delete_keywords
{
    my ($record_id) = @_;

    my $delete = MKDoc::SQL::DBH->get()->prepare ('DELETE FROM Document_Index WHERE Record_ID=?');
    return $delete->execute ($record_id);
}


##
# index_insert_keywords ($doc_id);
# --------------------------------
#   Fetches the document from the 'Document' table object and, for each of
#   its columns that has a defined entry in the table's {weight} hash,
#   splits the column value into keywords (see _data_split) and inserts
#   one Document_Index row per distinct keyword.
#
#   If the document cannot be fetched, a warning with a stack trace is
#   emitted and nothing is inserted.
#
#   @param - $doc_id : ID of the document to index
##
sub index_insert_keywords
{
    my ($doc_id) = @_;

    my $doc_t = flo::Standard::table ('Document');
    my $doc   = $doc_t->get ($doc_id);
    unless ($doc)
    {
        Carp::cluck ("Cannot get document with ID: $doc_id");
        return;
    }

    my $insert = MKDoc::SQL::DBH->get()->prepare ('INSERT INTO Document_Index VALUES (NULL, ?, ?, ?)');
    foreach my $col (sort keys %{$doc})
    {
        # Only columns that carry a weight are indexed.
        defined $doc_t->{weight}->{$col} or next;

        # Going through a hash collapses repeated keywords, so each
        # (document, column, keyword) triple is inserted once.
        my %seen = map { $_ => 1 } _data_split ($doc, $doc->{$col});
        $insert->execute ($doc_id, $col, $_) for keys %seen;
    }
}


##
# reindex_date;
# -------------
#   Returns the reindex cut-off date given $INDEXABLE_RECENTNESS, i.e. the
#   local time $INDEXABLE_RECENTNESS seconds before now.
#
#   @returns - A 'YYYY-MM-DD HH:MM:SS' string
##
sub reindex_date
{
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime (time - $INDEXABLE_RECENTNESS);

    # localtime() counts months from 0 and years from 1900. Every field is
    # zero-padded, the time part included, so that the result always has
    # the same fixed-width shape (e.g. '03:07:09' rather than '3:7:9').
    return sprintf (
        '%04d-%02d-%02d %02d:%02d:%02d',
        $year + 1900, $mon + 1, $mday, $hour, $min, $sec
    );
}


##
# _data_split ($doc, $value);
# ---------------------------
#   Turns a column value into a list of keywords: the value is first
#   passed through html2text, then handed to MKDoc::Stemmer::split.
#
#   @param - $doc   : accepted for the caller's convenience but not used
#   @param - $value : string that needs to be split
#   @returns - Whatever MKDoc::Stemmer::split returns for the converted
#              text (the caller treats it as a list of keywords)
##
sub _data_split
{
    my (undef, $value) = @_;

    my $text = html2text ($value);
    return MKDoc::Stemmer::split ($text);
}


1;


