# ----------------------------------------------------------------------------
# MKDoc::Util::LinkParser
# ----------------------------------------------------------------------------
# Author: Sam Tregar
# Copyright: (c) MKDoc Holdings Ltd, 2005
#
# This file is part of MKDoc. 
# 
# MKDoc is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# MKDoc is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with MKDoc; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# ---------------------------------------------------------------------------
package MKDoc::Util::LinkParser;
use strict;
use warnings;

=head1 NAME

MKDoc::Util::LinkParser - parses links to differentiate internal and external links

=head1 SYNOPSIS

  use MKDoc::Util::LinkParser;

  # setup a new parser
  $parser = MKDoc::Util::LinkParser->new();
  
  # parse a link
  $parser->parse('http://www.example.com/foo');

  # deal with results
  if ($parser->is_internal) {
     if ($parser->is_valid) {
        $document_id = $parser->document_id;
        print "Found a link to document ID $document_id.\n";
     } else {
        die "Invalid internal link found!";
     }
  } else {
     print "External link found.\n";
  }

=head1 DESCRIPTION

This module parses links and determines whether they are internal
(pointing to the site served by MKDoc) or external.  For internal
links it determines if they are valid (pointing to an active document).

=head1 CLASS METHODS

=head2 new

Creates a new parser.  Accepts no arguments.

=head2 parse

Parse a link.  Takes a single argument, the link to parse.

=head2 thaw

Loads data produced by store() just as though the link had been parsed
with parse().

=head1 INSTANCE METHODS

=head2 is_internal

Returns true if the last link parsed was internal.

=head2 is_external

Returns true if the last link parsed was external.

=head2 is_valid

Returns true if the last link was a valid internal link.  When this is
true document_id will be available.

=head2 document_id

Returns the document_id for the last link parsed.

=head2 uri

Returns the URI object for the last link parsed.  See L<URI> for more
details.

=head2 as_path

Produce a string containing just the path for an internal link.

=head2 as_string

Produce a string containing a canonical version of the link.

=head2 operation

Returns the extra MKDoc-specific operation portion of the URI for
internal links.  This is the part of the URI that points to an
operation rather than a documemt.  For example, given this link:

  http://example.com/test-doc/.admin.content

operation() would return '.admin.content'.

This part of the URI will not be present in the object returned from
uri() and it will not affect is_valid() and document_id() matches.

=head2 freeze

Returns a hash-ref representiting all the relevent details of the
link.  This is the data which may be passed later to thaw().

This method may only be called on valid internal links.

=head1 AUTHOR

Sam Tregar <sam@tregar.com>

=cut

use URI;
use Carp qw(croak);
use lib::sql::DBH;

sub new { 
    my $self = bless {}, shift;

    # make sure the environment is properly setup
    for my $var qw(PUBLIC_DOMAIN USER_DOMAIN) {
        croak("$var not set - needed by " . __PACKAGE__) unless $ENV{$var};
    }

    # setup default base URI
    $self->{base} = URI->new($ENV{PUBLIC_DOMAIN})->canonical;

    # setup list of internals base URLs
    $self->{internal_bases} = [ $self->{base},
                                URI->new($ENV{USER_DOMAIN})->canonical ];
    
    # add in foo.com for www.foo.com
    my $pub = $ENV{PUBLIC_DOMAIN};
    $pub =~ s!^(\w+://)\w+\.!$1!;
    push @{$self->{internal_bases}}, URI->new($pub)->canonical;
   
    # produce a list of canonical internal prefixes minus scheme for
    # fast internal matching
    my @prefixes = map { $_->opaque } @{$self->{internal_bases}};

    # compile a regex from the internal prefixes
    my $re = join('|', map { qr/\Q$_\E/ } @prefixes);
    $self->{internal_re} = qr!^(?:$re)!;

    return $self;
}

sub parse {
    my ($self, $link) = @_;
    $self->_clear();

    my $uri;
    if ($link =~ m!^[^:/?#]+:!) {
        # if it's a full URI parse it as-is
        $uri = URI->new($link)->canonical;
    } else {
        # otherwise base it off the default base
        $uri = URI->new_abs($link, $self->{base})->canonical;
    }
    $self->{uri} = $uri;

    # pull off the operation part of internal paths
    my $path = $uri->path;
    if ($self->is_internal and $path =~ s!/(\.[a-z\.]+)$!!) {
        $self->{operation} = $1;
        $path = '/' unless length $path;
        $self->{uri}->path($path);
    }
}

# clear link state
sub _clear {
    my $self = shift;
    delete $self->{$_} for qw(is_internal document_id operation uri);
}

sub as_string {
    my $self = shift;
    my $uri  = $self->{uri}->clone;

    # the URI object has everything already unless this is an internal
    # link or has an operation
    return $uri->as_string 
      unless $self->{document_id} or $self->{operation};

    # setup path based on document_id if we've got one
    if ($self->{document_id}) {
        my $dbh    = lib::sql::DBH->get();
        my ($path) = $dbh->selectrow_array('SELECT Full_Path FROM Document WHERE ID = ?', undef, $self->{document_id});
        $uri->path($path);
    }

    # add in operation if there is one
    if ($self->{operation}) {
        my $path = $uri->path();
        $path .= '/' unless $path =~ m!/$!;
        $uri->path($path . $self->{operation});
    }

    return $uri->as_string;
}

sub as_path {
    my $self = shift;
    # setup path based on document_id if we've got one
    if ($self->{document_id}) {
        my $dbh    = lib::sql::DBH->get();
        my ($path) = $dbh->selectrow_array('SELECT Full_Path FROM Document WHERE ID = ?', undef, $self->{document_id});
        return $path;
    }
    return '';
}

sub operation { shift->{operation} }

sub is_internal {
    my $self = shift;
    return $self->{is_internal} if defined $self->{is_internal};
    my $uri  = $self->uri;

    # only http and https URLs can be internals
    my $protocol = $uri->scheme;
    return $self->{is_internal} = 0 
      unless  $protocol eq 'http' or $protocol eq 'https';

    my $prefix = $self->{uri}->opaque;
    if ($prefix =~ $self->{internal_re}) {
        return $self->{is_internal} = 1;
    } else {
        return $self->{is_internal} = 0;
    }
}

sub is_external {
    my $self = shift;
    return not $self->is_internal;
}

sub is_valid {
    my $self = shift;
    return 0 if defined $self->{document_id} and not $self->{document_id};
    return 1 if $self->document_id;
}

sub document_id {
    my $self = shift;
    return $self->{document_id} if $self->{document_id};

    my $uri  = $self->uri;
    my $dbh  = lib::sql::DBH->get();
    
    # fix path to end with a /
    my $path = $uri->path;
    $path .= '/' unless $path =~ m!/$!;

    # limit redirection to 100 links to avoid infinite loops
    my $redirect_limit = 100;

    # find an ID, following redirects as needed
    my $id;
    while (1) {
        # does this path exist in the DB?
        last if $id = $self->lookup_path($path, $dbh);

        # look for a redirect and loop if we found one
        last unless $path = $self->find_redirect($path, $dbh);
        
        # don't keep looping forever
        last unless --$redirect_limit;
    } 

    # return what we found, 0 or a real ID
    return $self->{document_id} = $id;
}

# lookup a path in Document and Redirect.  Returns the ID or a
# corrected path if a redirect was found.
sub lookup_path {
    my ($self, $path, $dbh) = @_;

    my ($id) = $dbh->selectrow_array('SELECT ID
                                      FROM Document 
                                      WHERE Full_Path = ?', undef, $path);
    return $id || 0;
}


sub find_redirect {
    my ($self, $path, $dbh) = @_;

    # pull all paths that might pertain to this path
    my @parts = grep { defined and length } split('/', $path);
    my @paths;
    foreach my $x (0 .. $#parts) {
        push(@paths, '/' . join('/', @parts[0 .. $x]) . '/');
    }

    # find the one which matches the longest part of this path, which
    # will be the most specific
    my ($from, $to) = $dbh->selectrow_array(
        'SELECT Old_Path, New_Path FROM Redirect WHERE Old_Path IN (' . 
        join(',', ('?') x @paths) . ') '.
        'ORDER BY LENGTH(Old_Path) DESC LIMIT 1', undef, @paths);
    if ($from) {
        # apply the redirect and return it
        $path =~ s!^\Q$from\E!$to!;
        return $path;
    }
    
    # no redirect found
    return;
}

sub uri { shift->{uri} }

sub freeze {
    my $self = shift;
    croak("Called freeze on an external or invalid link!")
      unless $self->{is_internal} and $self->{document_id};
    my %return;

    # store relevent details for an internal link
    my $uri = $self->{uri};
    $return{is_internal} = $self->{is_internal};
    $return{operation}   = $self->{operation};
    $return{document_id} = $self->{document_id};

    # save necessary URI parts
    $return{scheme}      = $uri->scheme;
    $return{query}       = $uri->query;
    $return{fragment}    = $uri->fragment;
    $return{authority}   = $uri->authority;

    return \%return;
}

sub thaw {
    my ($self, $data) = @_;
    $self->_clear();

    my $uri = URI->new();
    $uri->scheme($data->{scheme});
    $uri->query($data->{query});
    $uri->fragment($data->{fragment});
    $uri->authority($data->{authority});
    $self->{uri} = $uri;

    $self->{is_internal} = $data->{is_internal};
    $self->{operation}   = $data->{operation};
    $self->{document_id} = $data->{document_id};
}

1;

