#!/usr/bin/perl -w
# 
# rss-sitemap2mail - Grabs an rss-1.0 sitemap (typically from an mkdoc
# site) and prints an email message containing recent headlines on
# stdout, ready to be piped to sendmail.  utf-8 xml is expected as input
# 
# You need to tell it: the location of the rss; the number of days; the
# 'From:' address; the 'To:' address and a regular expression matching
# the URLs that you want to exclude.
# 
# ./rss-sitemap2mail.pl \
#     http://testers.mkdoc.com/rss100sitemap.rdf \
#     7 \
#     'Jane Doe <sender@example.com>' \
#     'John Doe <recipient@example.com>' \
#     '(testing|draft)' \
#     | /usr/sbin/sendmail -t
# 
# Bruno Postle <bruno@mkdoc.com>
# 
# loosely based on rss2html by Jonathan Eisenzopf
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.

use strict;
use utf8;

use XML::RSS;
use Encode qw /encode decode _utf8_on is_utf8/;
use LWP::Simple;
use Date::Manip;

die "Usage: rss-sitemap2mail.pl (<RSS file> | <URL>) <days> <from-address> <to-address> <ignore-regex>\n"
    unless @ARGV == 5;

my $arg = shift;
my $days = shift;
my $from = shift;
my $to = shift;
my $ignore = shift;

# the age limit is compared numerically later on, so refuse anything
# that is not a plain non-negative number

die "<days> must be a non-negative number, got \"$days\"\n"
    unless $days =~ /^\d+(?:\.\d+)?$/;

# the message is declared as charset=utf-8; without an encoding layer
# perl writes character strings that only hold code points below 256
# as latin-1 bytes

binmode (STDOUT, ':encoding(UTF-8)');

# parser object; declared at file level, before get_rss is called

my $rss = XML::RSS->new;

$rss = get_rss ($arg);

# get all the recent items

my $items = recent_items ($rss, $days, $ignore);
die "No recent items" if (@{$items} == 0);

# build the email headers

my $headers = headers (
    { subject => "$rss->{'channel'}->{'title'} headlines",
         from => $from,
           to => $to });

# build the text version of the email body

my $text_version = text_version (
      { items => $items,
        title => $rss->{'channel'}->{'title'},
       byline => "Recently created documents from the last $days days:",
  description => $rss->{'channel'}->{'description'},
         link => $rss->{'channel'}->{'link'} });

# output message: headers first, then the body

print STDOUT $headers;

print STDOUT $text_version;


# assemble the email headers
#
# Takes a hash ref with the keys 'subject', 'from' and 'to' and returns
# the complete header block as one string, including the blank line that
# separates the headers from the body.

sub headers
{
    my $self = shift;
    my %field;

    # non-ascii headers _really_ need encoding like this:
    # Subject: =?utf-8?B?6ZuF6JmO5Lit?=
    # so this script only works with ascii sitenames

    # the subject comes from the feed; a line break inside any of these
    # values would end the header early or smuggle in extra headers, so
    # every value is forced onto a single line

    foreach my $name (qw(subject from to))
    {
        my $value = defined $self->{$name} ? $self->{$name} : '';
        $value =~ s/\s*[\r\n]+\s*/ /g;
        $value =~ s/^\s+//;
        $value =~ s/\s+$//;
        $field{$name} = $value;
    }

    return "Subject: $field{'subject'}\n"
    . "From: $field{'from'}\n"
    . "To: $field{'to'}\n"
    . "Mime-Version: 1.0\n"
    . "Content-Type: text/plain; charset=utf-8\n"
    . "Content-Transfer-Encoding: 8bit\n"
    . "Content-Disposition: inline\n"
    . "User-Agent: MkDoc Headlines Mailer 0.0.1\n"
    . "\n";
}


# retrieve and parse an rss feed
#
# The argument is either an http(s) URL, which is downloaded with
# LWP::Simple's get, or the path of a local file.  Returns the XML::RSS
# object holding the parsed feed; dies if the feed cannot be fetched or
# the file does not exist.

sub get_rss
{
    my $arg = shift;

    # only a leading scheme makes the argument a URL; this also lets
    # https:// through instead of treating it as a file name
    my $is_url = ($arg =~ m{^https?://}i);

    unless ($is_url)
    {
        die "File \"$arg\" doesn't exist.\n" unless -e $arg;
    }

    # a parser of our own, so nothing outside this sub has to exist first
    my $feed = XML::RSS->new;

    if ($is_url)
    {
        my $string = get ($arg);
        die "Could not retrieve $arg" unless (defined $string and length $string);
        $feed->parse ($string);
    }
    else
    {
        $feed->parsefile ($arg);
    }

    return $feed;
}


# get all the recent items/stories/documents
#
# Arguments: the parsed feed (anything with an 'items' array ref), the
# maximum age in days and a regex matching the links to leave out
# (matched case-insensitively; an empty or undefined pattern leaves
# nothing out).
#
# Returns an array ref of the items that are left, sorted by link.

sub recent_items
{
    my $rss = shift;
    my $maxdays = shift;
    my $ignore = shift;
    my @items;

    # compile the pattern once; an empty pattern must never reach m//,
    # where it means "reuse the last successful pattern"

    my $skip = (defined $ignore and length $ignore) ? qr/$ignore/i : undef;

    my $now = time;

    # loop through the entire original rss feed

    foreach my $item (@{$rss->{'items'}})
    {
        my $link = defined $item->{'link'} ? $item->{'link'} : '';
        next if ($skip and $link =~ $skip);

        # work out the item's timestamp in epoch seconds; this does not
        # depend on how DateCalc lays out its delta string

        my $date = $item->{'dc'}->{'date'};
        my $then;
        if (defined $date)
        {
            $date =~ s/T//;
            $then = UnixDate ($date, '%s');
        }

        # without a usable date the age is unknown, so the item cannot
        # be called recent

        unless (defined $then and length $then)
        {
            warn "Skipping item without a usable dc:date: $link\n";
            next;
        }

        # calculate the item's age in whole days; as before, only the
        # distance from now counts, whichever side of now the date is on

        my $days = int (abs ($now - $then) / 86400);

        push @items, $item unless ($days > $maxdays);
    }

    # this sorts the items by URL, so related documents should get grouped together

    @items = sort { $a->{'link'} cmp $b->{'link'} } @items;

    return \@items;
}


# assemble a text version, we can do a html version another day..
#
# Takes a hash ref with 'title', 'byline', 'description' and 'items' (an
# array ref of feed items) and returns the plain-text body as one string.

sub text_version
{
    my $self = shift;
    my @parts;

    # the newsfeed title, then the byline on an indented line of its own

    push @parts, u_rap ($self->{'title'}, 72);
    push @parts, "\n    ";
    push @parts, u_rap ($self->{'byline'}, 72);
    push @parts, "\n\n";

    # every news item is followed by a newline

    foreach my $item (@{$self->{'items'}})
    {
        push @parts, item_text ($item) . "\n";
    }

    # the newsfeed description goes below the "-- " signature marker,
    # with any trailing punctuation or whitespace replaced by one full stop

    my $sig = $self->{'description'};
    $sig =~ s/(\?|\!|\s|\.)*$/./;

    push @parts, "-- \n";
    push @parts, u_rap ($sig, 72);

    return join ('', @parts);
}


# build an individual news/document item
#
# Takes one feed item (a hash ref with 'title', 'link', 'description' and
# the dc 'date' and 'creator') and returns its text block: the title, the
# link in angle brackets, the indented description and a creator/date line.

sub item_text
{
    my $item = shift;

    # get a human readable date; the first 'T' is dropped before parsing
    # (presumably the ISO 8601 date/time separator) and a date that
    # cannot be formatted is fatal

    (my $stamp = $item->{'dc'}->{'date'}) =~ s/T//;
    my $when = UnixDate ($stamp, "%b %e, %Y. %R.") || die "Bad date format";

    # end the description with a single full stop, wrap it at 68 columns
    # and indent every line by four spaces

    my $body = $item->{'description'};
    $body =~ s/(\?|\!|\s|\.)*$/./;
    $body = u_rap ($body, 68);
    $body =~ s/(^|\n)\s*/${1}    /g;

    # stick it together, one line at a time

    my $headline = u_rap ($item->{'title'}, 72) . "\n";
    my $url_line = "    <". $item->{'link'} .">" . "\n";
    my $credits  = "    ". $item->{'dc'}->{'creator'} ."  ". $when . "\n";

    return $headline . $url_line . $body . "\n" . $credits;
}


# UTF-8 wrapping function
#
# Takes a string and a column count and returns the string re-flowed so
# that no line is longer than that many characters, with lines joined by
# "\n" and no trailing newline.  Runs of whitespace (including leading
# and trailing whitespace) are collapsed.  A single word that is wider
# than the column count is put on a line of its own rather than split.
# An undefined or empty string gives ''.

sub u_rap
{
    my $text = shift;
    my $cols = shift;

    return '' unless defined $text;

    # lengths must be counted in characters, not bytes; decode validates
    # the input instead of just switching the internal flag on
    $text = decode ('UTF-8', $text) unless is_utf8 ($text);

    my @res = ();
    my $line = '';

    # split ' ' skips leading whitespace, so no empty first word
    foreach my $word (split ' ', $text)
    {
        if ($line eq '')
        {
            # first word of a line always goes in, even when it is too
            # wide - retrying it on a fresh line would never terminate
            $line = $word;
        }
        elsif (length ($line) + 1 + length ($word) > $cols)
        {
            push @res, $line;
            $line = $word;
        }
        else
        {
            $line .= ' ' . $word;
        }
    }
    push @res, $line if length ($line);

    return join ("\n", @res);
}

1;
