package MKDoc::XML::Tagger;
use strict;
use warnings;
use utf8;

our $Ignorable_RE = qr /(?:\r|\n|\s|(?:\&\(\d+\)))*/;


##
# Found on http://www.cs.sfu.ca/~cameron/REX.html
# -----------------------------------------------
# REX/Perl 1.0
# Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
# Technical Report TR 1998-17, School of Computing Science, Simon Fraser
# University, November, 1998.
# Copyright (c) 1998, Robert D. Cameron.
# The following code may be freely used and distributed provided that
# this copyright and citation notice remains intact and that modifications
# or additions are clearly identified
##
my $TextSE = "[^<]+";
my $UntilHyphen = "[^-]*-";
my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
my $CommentCE = "$Until2Hyphens>?";
my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
my $S = "[ \\n\\t\\r]+";
my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
my $Name = "(?:$NameStrt)(?:$NameChar)*";
my $QuoteSE = "\"[^\"]*\"|'[^']*'";
my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
my $S1 = "[\\n\\r\\t ]";
my $UntilQMs = "[^?]*\\?+";
my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
my $PI_CE = "$Name(?:$PI_Tail)?";
my $EndTagCE = "$Name(?:$S)?>?";
my $AttValSE = "\"[^<\"]*\"|'[^<']*'";
my $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?";
my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";
our $XML_SPE = "$TextSE|$MarkupSPE";


sub new
{
    my $class = shift;
    my $self  = bless {}, $class;
    $self->{data} = shift || die '$self->{data} is not defined';
    $self->_initialize();
    return $self;
}


sub _initialize
{
    my $self   = shift;
    my @tokens = $self->{data} =~ /$XML_SPE/go;
    
    my @tags   = ();
    
    my $res    = '';
    
    for (@tokens)
    {
	/^</ and do {
	    push @tags, $_;
	    $res .= '&(' . @tags . ')';
	    next;
	};
	$res .= $_;
    }
    
    $self->{tags} = \@tags;
    $self->{text} = $res;
}


sub text
{
    my $self = shift;
    return $self->{text};
}


sub InKana
{
    return "3040 309F\n30A0 30FF";
}


sub replace
{
    my $self = shift;
    my $text = $self->text();
    
    while (@_)
    {
	my $attr = \%{ shift() };
	my $tag  = delete $attr->{_tag}  || next;
	my $expr = delete $attr->{_expr} || next;
	$text = $self->_text_replace ($text, $expr, $tag, $attr);
    }
    
    for (my $i = 0; $i < @{$self->{tags}}; $i++)
    {
	my $c   = $i + 1;
	my $tag = $self->{tags}->[$i];
	$text =~ s/\&\($c\)/$tag/g;
    }
    
    return $text;
}


sub _text_replace
{
    my $class = shift;
    my $text  = shift;
    my $expr  = shift;
    my $tag   = shift;
    my $attr  = shift;

    my $re    = $class->_regexify ($expr);
    my $tag1  = $class->_tag_start ($tag, $attr);
    my $tag2  = $class->_tag_stop  ($tag, $attr);
    
    my %expr  = map { $_ => 1 } " $text " =~
    /(?<=\p{IsSpace}|\p{IsPunct}|\&|\p{InKana})($re)(?=\p{IsSpace}|\p{IsPunct}|\&|\p{InKana})/gi;

    foreach (keys %expr)
    {
	my $to_replace  = quotemeta ($_);
	my $replacement = $_;
	
	$replacement =~ s/(\&\(\d+\))/$tag2$1$tag1/g;
	$replacement = "$tag1$replacement$tag2";
	$text =~ s/$to_replace/$replacement/g;
    }
    
    return $text;
}


sub _regexify
{
    my $class = shift;
    my $text = shift;
    $text    = lc ($text);
    $text    =~ s/(?:\s|\r|\n)+/$Ignorable_RE/g;
    return $text;
}


sub _tag_start
{
    my $class = shift;
    my $tag  = shift;
    my $attr = shift;
    
    my $attr_str = join ' ', map { $_ . '=' . do {
	my $val = $attr->{$_};
	$val =~ s/\&/&amp;/g;
	$val =~ s/\</&lt;/g;
	$val =~ s/\>/&gt;/g;
	$val =~ s/\"/&quot;/g;
	"\"$val\"";
    } } keys %{$attr};
    
    return $attr_str ? "<$tag $attr_str>" : "<$tag>";
}


sub _tag_stop
{
    my $class = shift;
    my $tag  = shift;
    return "</$tag>";
}


package main;
use strict;
use warnings;
use Test::More 'no_plan';


=cut

my $example = <<'EOF';
Abstract

The Extensible Markup Language (XML) is a subset of <strong>SGML</strong>
that is <a href="foo">completely described</a> in this document.
EOF


my $p = new MKDoc::XML::Tagger ($example);


# checking that text() works and replaces tags correctly
{
    my $text = $p->text();
    like ($text, qr/\&\(1\)SGML\&\(2\)/);
    like ($text, qr/\&\(3\)completely described\&\(4\)/);
}


# this regex should match any amount of consecutive whitespace,
# or \&(214) like tags, or carriage returns
{
    my $sample_text = <<EOF;
   \&(214) \&(214)
\&(22)  \&(214)  \&(33)
 \&(2142343432432) 
EOF
    
    if (0) { $MKDoc::XML::Tagger::Ignorable_RE = $MKDoc::XML::Tagger::Ignorable_RE } # no silly warnings
    my $re = '^' . $MKDoc::XML::Tagger::Ignorable_RE . '$';
    like ($sample_text, qr/$re/);
    unlike ('hello world', qr /$re/);
}


# let's test tag_start
{
    my $tag = $p->_tag_start ('a', { href => 'http://www.disney.com' } );
    like ($tag, qr/<a href="http:\/\/www.disney.com">/);
}


# let's test tag_stop
{
    my $tag = $p->_tag_stop ('a', { href => 'http://www.disney.com' } );
    like ($tag, qr/<\/a>/);
}


# now scary stuff... let's try the text_replace() method
# make a few tests...
{
    my $p = undef;
    my $r = undef;
    
    $p = new MKDoc::XML::Tagger ('Hello, World!');
    $r = $p->_text_replace ($p->text(), 'HeLLo, wOrLd!', 'strong');
    is ($r, '<strong>Hello, World!</strong>');
    
    $p = new MKDoc::XML::Tagger ('YO: Hello, World!');
    $r = $p->_text_replace ($p->text(), 'HEllO, WOrLd!', 'strong');
    is ($r, 'YO: <strong>Hello, World!</strong>');

    $p = new MKDoc::XML::Tagger ('Hello, World! :OY');
    $r = $p->_text_replace ($p->text(), 'HEllO, WOrLd!', 'strong');
    is ($r, '<strong>Hello, World!</strong> :OY');
    
    $p = new MKDoc::XML::Tagger ('YO: Hello, World! :OY');
    $r = $p->_text_replace ($p->text(), 'HEllO, WOrLd!', 'strong');
    is ($r, 'YO: <strong>Hello, World!</strong> :OY');
    
    # now let's be a bit nasty
    $p = new MKDoc::XML::Tagger ('YO: Hello, my <strong>Cool</strong> World! :OY');
    $r = $p->_text_replace ($p->text(), 'my cool', 'em');
    is ($r, 'YO: Hello, <em>my </em>&(1)<em>Cool</em>&(2) World! :OY');
}

=cut

process();
sub process {
    my $test_data = <<'EOF';
<p>This specification, together with associated standards
(Unicode and ISO/IEC 10646 for characters, Internet
RFC 1766 for language identification tags, ISO 639 for
language name codes, and ISO 3166 for country name codes),
provides all the information necessary to understand XML
Version 1.0 and construct computer programs to process it.
</p>
<p>This version of the XML specification may be distributed
freely, as long as all text and legal notices remain intact.</p>
</div> <div class="div2">
<h3><a name="sec-terminology"></a>1.2 Terminology</h3>]
EOF


    $test_data = <<'EOF';
Foo <abbr>Bar Baz</abbr>
EOF




    my $tagger = new MKDoc::XML::Tagger ($test_data);
    print $tagger->replace (
	{ _expr => 'unicode',     _tag => 'a', href => 'http://www.unicode.org/' },
	{ _expr => 'xml',         _tag => 'a', href => 'http://www.xml.com/'     },
	{ _expr => 'it. This',    _tag => 'strong'                               },
	{ _expr => 'Bar Baz',    _tag => 'strong'                               },
	# { _expr => 'fOO BaR',     _tag => 'a', href => '#foo'                    },
       );
}


1;
