# -------------------------------------------------------------------------------------
# MKDoc::Util::Text2HTML
# -------------------------------------------------------------------------------------
#
#       Author : Jean-Michel Hiver (jhiver@mkdoc.com).
#    Copyright : (c) MKDoc Holdings Ltd, 2001
# 
#      Unauthorized modification, use, reuse, distribution or redistribution
#      of this module is strictly forbidden.
#
#    Description:
#
#      Converts plain text to HTML/XML and, conversely, HTML/XML to plain text.
#
# -------------------------------------------------------------------------------------
package MKDoc::Util::Text2HTML;
use Exporter;
use strict;
use 5.008_000;
use utf8;
use Carp;

# Inherit import() from Exporter so that 'use'-ing this package can install
# the functions listed in @EXPORT into the caller's namespace.
our @ISA = qw /Exporter/;
# All four conversion functions are exported by default (no import list
# is required on the caller's side).
our @EXPORT = qw /text2html html2text text2xml xml2text/;

##
# $DECIMAL_TO_HTML;
# -----------------
#   A hash reference which maps a decimal character code (e.g. 60 for
#   the character '<') to its equivalent named HTML entity (in this
#   case, '&lt;').
#
#   Keys 38, 60 and 62 are the markup-significant ASCII characters;
#   keys 160 to 255 are the Latin-1 range. The euro sign is listed
#   twice: 8364 is its Unicode code point, while 128 is the position it
#   occupies in Windows-1252 and is kept only so that existing users of
#   this table see no change.
##
our $DECIMAL_TO_HTML = {
    38   => '&amp;',
    60   => '&lt;',
    62   => '&gt;',
    128  => '&euro;',    # legacy Windows-1252 position, see above
    160  => '&nbsp;',
    161  => '&iexcl;',
    162  => '&cent;',
    163  => '&pound;',
    164  => '&curren;',
    165  => '&yen;',
    166  => '&brvbar;',
    167  => '&sect;',
    168  => '&uml;',
    169  => '&copy;',
    170  => '&ordf;',
    171  => '&laquo;',
    172  => '&not;',
    173  => '&shy;',
    174  => '&reg;',
    175  => '&macr;',
    176  => '&deg;',
    177  => '&plusmn;',
    178  => '&sup2;',
    179  => '&sup3;',
    180  => '&acute;',
    181  => '&micro;',
    182  => '&para;',
    183  => '&middot;',
    184  => '&cedil;',
    185  => '&sup1;',
    186  => '&ordm;',
    187  => '&raquo;',
    188  => '&frac14;',
    189  => '&frac12;',
    190  => '&frac34;',
    191  => '&iquest;',
    192  => '&Agrave;',
    193  => '&Aacute;',
    194  => '&Acirc;',
    195  => '&Atilde;',
    196  => '&Auml;',
    197  => '&Aring;',
    198  => '&AElig;',
    199  => '&Ccedil;',
    200  => '&Egrave;',
    201  => '&Eacute;',
    202  => '&Ecirc;',
    203  => '&Euml;',
    204  => '&Igrave;',
    205  => '&Iacute;',
    206  => '&Icirc;',
    207  => '&Iuml;',
    208  => '&ETH;',
    209  => '&Ntilde;',
    210  => '&Ograve;',
    211  => '&Oacute;',
    212  => '&Ocirc;',
    213  => '&Otilde;',
    214  => '&Ouml;',
    215  => '&times;',
    216  => '&Oslash;',
    217  => '&Ugrave;',
    218  => '&Uacute;',
    219  => '&Ucirc;',
    220  => '&Uuml;',
    221  => '&Yacute;',
    222  => '&THORN;',
    223  => '&szlig;',
    224  => '&agrave;',
    225  => '&aacute;',
    226  => '&acirc;',
    227  => '&atilde;',
    228  => '&auml;',
    229  => '&aring;',
    230  => '&aelig;',
    231  => '&ccedil;',
    232  => '&egrave;',
    233  => '&eacute;',
    234  => '&ecirc;',
    235  => '&euml;',
    236  => '&igrave;',
    237  => '&iacute;',
    238  => '&icirc;',
    239  => '&iuml;',
    240  => '&eth;',
    241  => '&ntilde;',
    242  => '&ograve;',
    243  => '&oacute;',
    244  => '&ocirc;',
    245  => '&otilde;',
    246  => '&ouml;',
    247  => '&divide;',
    248  => '&oslash;',
    249  => '&ugrave;',
    250  => '&uacute;',
    251  => '&ucirc;',
    252  => '&uuml;',
    253  => '&yacute;',
    254  => '&thorn;',
    255  => '&yuml;',
    8364 => '&euro;',
};


##
# $HTML_TO_DECIMAL;
# -----------------
#   Reverse mapping of $DECIMAL_TO_HTML: named entity (including the
#   leading '&' and trailing ';') to decimal character code.
#
#   The pairs listed after the reversed table take precedence over it
#   (the last occurrence of a key wins in a hash constructor):
#
#     - '&euro;' is pinned to 8364, because $DECIMAL_TO_HTML holds two
#       keys for it and a plain reverse would pick one at random;
#     - '&quot;' and '&apos;' are decode-only additions, so that quotes
#       escaped by text2xml() are restored instead of being discarded
#       as unknown entities.
##
our $HTML_TO_DECIMAL = {
    ( reverse %{$DECIMAL_TO_HTML} ),
    '&euro;' => 8364,
    '&quot;' => 34,
    '&apos;' => 39,
};


##
# text2xml (@data);
# -----------------
#   @data - one or more pieces of text; undefined pieces count as
#           empty strings
#
#   Concatenates @data and returns the result with the four characters
#   '&', '<', '>' and '"' replaced by &amp;, &lt;, &gt; and &quot;
#   respectively. Every other character is returned untouched.
##
sub text2xml
{
    my %escape_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );

    my $text = join '', grep { defined $_ } @_;

    # One pass over the text: each special character is looked up once,
    # so the '&' introduced by a replacement is never escaped again.
    $text =~ s/([&<>"])/$escape_for{$1}/g;
    return $text;
}


##
# xml2text (@data);
# -----------------
#   @data - one or more pieces of 'XML' text
#
#   Inverse function of text2xml(): concatenates @data and returns the
#   result with &quot;, &gt;, &lt; and &amp; turned back into '"', '>',
#   '<' and '&'. Any other entity is left as it is.
##
sub xml2text
{
    my %char_for = (
        quot => '"',
        gt   => '>',
        lt   => '<',
        amp  => '&',
    );

    my $xml = join '', @_;

    # One pass over the text: the '&' produced by decoding &amp; is not
    # looked at again, so '&amp;lt;' becomes '&lt;' and not '<'.
    $xml =~ s/&(quot|gt|lt|amp);/$char_for{$1}/g;
    return $xml;
}


##
# text2html (@data);
# ------------------
#   @data - one or more pieces of text; undefined pieces are ignored
#
#   DEPRECATED. Emits a deprecation warning through carp() on every
#   call, then returns what text2xml() returns for the concatenation
#   of @data.
##
sub text2html
{
    carp ("function " . __PACKAGE__ . "::text2html is deprecated");
    return text2xml (join '', grep { defined $_ } @_);
}


##
# html2text (@data);
# ------------------
#   @data - one or more pieces of HTML; undefined pieces are ignored
#
#   Strips down the concatenation of @data to text and returns it
#   (VERY ROUGH):
#
#     - everything between a '<' and the next '>' is removed, the
#       angle brackets included;
#     - decimal character references (&#233;) are replaced by the
#       character they designate;
#     - named entities found in $HTML_TO_DECIMAL are replaced by the
#       character they designate;
#     - any other '&word;' sequence, as well as a decimal reference
#       which is zero or above 0x10FFFF, is removed.
#
#   Hexadecimal references (&#xE9;) are not recognised and are left
#   in place.
#
#   NOTE(review): the returned value is a character string. A byte
#   string holding UTF-8 encoded text appears to get its bytes read as
#   Latin-1 here, so callers presumably need to decode first - confirm
#   against the callers.
##
sub html2text
{
    my $data = join '', grep { defined $_ } @_;

    # Appending a character above 255 upgrades $data to perl's
    # character-string representation; chop() then takes that marker
    # off again once the tags are gone.
    $data .= "\x{FEFF}";

    # roughly strip out tags
    $data =~ s/<[^>]*>//gs;
    chop ($data);

    # Decode every entity in a single pass. The replacement text is not
    # scanned again, so the result does not depend on the order in which
    # entities are met: '&amp;lt;' always yields '&lt;', never '<'.
    $data =~ s/(\&(?:\#([0-9]+)|\w+)\;)/_html2text_entity ($1, $2)/ge;

    return $data;
}


##
# _html2text_entity ($entity, $decimal);
# --------------------------------------
#   $entity  - the complete entity, e.g. '&eacute;' or '&#233;'
#   $decimal - the digits of a decimal reference, undef for a named
#              entity
#
#   Private helper of html2text(). Returns the character for the
#   entity, or the empty string if the entity is unknown or does not
#   designate a usable code point.
##
sub _html2text_entity
{
    my ($entity, $decimal) = @_;

    # named entity (or '&#0;'): fall back on the lookup table
    $decimal = $HTML_TO_DECIMAL->{$entity} unless ($decimal);

    # chr() may die when handed an absurdly large number, and markup is
    # untrusted input: drop anything outside 1 .. 0x10FFFF instead.
    return '' unless ($decimal and $decimal > 0 and $decimal <= 0x10FFFF);

    # chr() returns a well-formed character string for every code point
    # in that range, so the UTF-8 flag must not be forced on by hand.
    return chr ($decimal);
}


1;
