# The LearningOnline Network with CAPA
# gateway for html input/output to be properly parsed and handled
#
# $Id: lonhtmlgateway.pm,v 1.5 2010/05/24 23:47:22 raeburn Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
######################################################################
######################################################################

=pod

=head1 NAME

Apache::lonhtmlgateway - properly parse and handle HTML input and output

=head1 SYNOPSIS

This is part of the LearningOnline Network with CAPA project
described at http://www.lon-capa.org.

=head1 INTRODUCTION

lonhtmlgateway is an object-oriented module used to parse and correct
malformed HTML input from the client, as well as to perform processing
of custom LON-CAPA HTML output before it is sent along to the end-user.
It replaces a number of subroutines in various modules, and adds new
code to tidy and process malformed HTML using XML::LibXML. 

This module is intended to be used for all non-authoring perspectives
in the system.

New to LON-CAPA version 3.0.

=head2 Example Usage

Below is intended code to be invoked and called for use outside 
of this module:

    $gateway = Apache::lonhtmlgateway->new();
    $gateway = Apache::lonhtmlgateway->new($target);
    
    $xhtml = $gateway->process_incoming_html($html);
    $xhtml = $gateway->process_incoming_html($html, $legacy);
    
    $xml = $gateway->process_html_to_xml($html);
    $xhtml = $gateway->process_xml_to_html($xml); 
    
    $bool = Apache::lonhtmlgateway->contains_block_level_tags($input);
    
=head1 GLOBAL VARIABLES

=over 4

=cut 

######################################################################
######################################################################

package Apache::lonhtmlgateway;

use strict;
use utf8;
use Time::Local;
use Time::HiRes;
use Apache::lonlocal;
use Apache::lonnet;
use Apache::lonhtmlcommon;
use Apache::lonxml;
use Apache::lontexconvert;
use lib '/home/httpd/lib/perl/';
use LONCAPA;
use XML::LibXML;
use Encode;
use HTML::Entities;
use HTML::LCParser();
use Safe();

local $XML::LibXML::skipXMLDeclaration = 1;
local $XML::LibXML::skipDTD = 1;
local $XML::LibXML::setTagCompression = 1;

##############################################
##############################################

=item %LONCAPA_ALLOWED_STANDARD_TAGS

This is a hash of all tags, both HTML and custom LON-CAPA tags that
are allowed in non-authoring spaces.  Examples of this include
course documents, bulletin boards, discussion posts, templated pages,
etc.  In addition, in the event of rich text editing, the WYSIWYG
editor needs to know how to display LON-CAPA custom tags as either
inline-level (<span>) or block-level (<div>). Therefore, the hash is
set up with uppercase tag names as keys ("H1"), and the corresponding
entry an integer constant indicating that tag's role or purpose:

=over 4

=item 0 =

Tag is explictly not allowed.  Currently not used anywhere in this
module, but reserved for the future in case certain tags would like
to be explicitly blacklisted.

=item 1 =

Tag is allowed, and in cases where it is unclear, is rendered as an
inline-level element.  Example: <algebra> should be rendered as an 
inline element.

=item 2 =

Tag is allowed, and in cases where it is unclear, is rendered as a
block-level element.  Example: <md> should be rendered as a block
element.

=back

=back

=cut

##############################################
##############################################

our %LONCAPA_ALLOWED_STANDARD_TAGS = (
    # standard html header tags
    H1 => 2, H2 => 2, H3 => 2, H4 => 2, H5 => 2, H6 => 2,
    # basic inline formatting and phrases
    B => 1, I => 1, U => 1, STRONG => 1, EM => 1, STRIKE => 1,
    BIG => 1, SMALL => 1, INS => 1, DEL => 1, S => 1,
    Q => 1, DFN => 1, CODE => 1, SAMP => 1, KBD => 1, VAR => 1,
    SUB => 1, SUP => 1,
    # linking and embedding
    A => 1, IMG => 1, 
    # block level tags
    P => 2, DIV => 2, OL => 2, UL => 2, LI => 2, ADDRESS => 2,
    BR => 2, HR => 2, BLOCKQUOTE => 2, PRE => 2, 
    # table-related tags
    TABLE => 2, CAPTION => 2, TBODY => 2, TR => 2, TD => 2,
    TH => 2, 
    # LON-CAPA custom tags
    M => 1, MI => 1, MD => 2, ALGEBRA => 1,
    CHEM => 1
);

##############################################
##############################################

=head1 PARSING LON-CAPA CUSTOM TAGS

This module maintains a hash %custom_tag_parsers, containing 
lowercase tag names as keys and function references as entries.
Convention used here names the actual parsing function whose
reference is stored here to be of the name &parse_tagname_tag().
These functions are called during the processing of outgoing 
HTML output in the &process_outgoing_html() function.

Each of these functions is passed the following arguments:

=over 4

=item self

Reference to Apache::lonhtmlgateway object calling the function.

=item input

Textual context extracted between the <tag> and </tag> tags.
Note that this text I<could> contain HTML entities.  Thus, for 
functions that cannot handle entitized input, 
&HTML::Entities::decode_entities() should be called on this data
before further handing it off.

=back

Example hash entry:

    mi => \&parse_mi_tag,

=head2 Currently Supported Custom Tags

=over 4

=item <algebra>

Intended to convert and simplify simple algebraic functions into
readable output.  Corrects cases such as double negatives or 
eliminates coefficients of 1 where appropriate.  The actual
handling of content contained in this tag takes place inside
L<Apache::lontexconvert>, which in turn uses the AlgParser 
module to actually process the input.

Usage:
    <algebra>2*x+(-5)</algebra>

=item <chem>

Formatter for chemical equations, adding superscripts, subscripts,
and appropriate arrow characters as appropriate.  This parser is
wholly contained inside this module, but is a copy of a routine
found in homework/default_homework.lcpm.

Usage:
    <chem>CH3CO2H + H2O <=> CH3CO2- + H3O+</chem>

=back

=head3 Math Mode Tags

These tags are intended for LaTeX math mode input, in order to
produce complex mathematical and scientific constructs, which
normal HTML cannot produce.  The output is later rendered by
a user-defined TeX engine in web target, or handled directly
in the case of tex target.  The only difference between the tags
below is determining the author's intent on how to appropriately
render the contents within the tag - this intent is
important in preserving the What You See Is What You Get philosophy
of the rich text editor.

=over 4

=item <mi>

Inline math mode tag.  Content is surrounded by "$" characters and
passed to the parser for the <m> tag.

I<New for LON-CAPA 3.0>.

=item <md>

Display block math mode tag.  Content is surrounded by "\[" and 
"\]" characters and passed to the parser for the <m> tag.

I<New for LON-CAPA 3.0>.

=item <m>

Math mode tag.  Allows author to fully specify the display of their
TeX input, and contain mixed inline-and-block content within a single
tag.  

Due to tools such as the rich text editor needing to know whether a
custom tag is block-level or inline-level on render, the use of this
tag is discouraged starting with LON-CAPA 3.0 although it will continue
to function.  Fully compatible with legacy LON-CAPA 2.x content.

=back

=cut

##############################################
##############################################

my %custom_tag_parsers = (
    mi => \&parse_mi_tag,
    md => \&parse_md_tag,
    m => \&parse_m_tag,
    algebra => \&parse_algebra_tag,
    chem => \&parse_chem_tag
);

##############################################
##############################################

=head1 CLASS OBJECT CONSTRUCTOR

=over 4

=item new

    $gateway = Apache::libhtmlgateway->new();
    $gateway = Apache::libhtmlgateway->new($target);

Constructs and returns a new gateway object.  An optional argument
allows one to specify the target of the output, defaults to 'web'.
Behind the scenes, a single XML::LibXML parser object is created
behind the scenes.  On destroy, this parser object is destroyed
as well.

=back

=cut

##############################################
##############################################

sub new {
    my $invocant = shift;
    my $class = ref($invocant) || $invocant;
    my $target = shift;
    # create a new parser instance for libxml
    my $self = {
        parser => XML::LibXML->new(),
        target => ($target) ? $target : 'web'
    };
    # options for the libxml parser
    $self->{parser}->recover(1);
    $self->{parser}->recover_silently(1);
    bless($self, $class);  # bless = pray that it works
    return $self;
}

sub DESTROY {
    my $self = shift;
    my $parser = $self->{parser};
    undef $parser;  # destroy the parser instance
}

##############################################
##############################################

=head1 PUBLIC OBJECT METHODS

=over 4

=item process_html_to_xml

    $xml = $gateway->process_html_to_xml($html);

Takes presumably-malformed HTML, encodes ampersands characters 
and passes the result to the Xml::LibXML parser, which creates
a DOM tree in memory of the content.  This parse is as error-tolerant
as can be set, and libxml attempts to recover from any errors as much
as possible. This DOM tree is then taken and serialized,
eliminating unbalanced and malformed tags along the way. This
XML code (without any header tags) is then returned to the caller.

=cut

##############################################
##############################################

sub process_html_to_xml {
    my $self = shift;
    my $input = shift;
    my $parser = $self->{parser};

    if (length($input) < 1) { return ""; }
    
    # only encode ampersands -- brackets may be valid tags
    my $encoded = &HTML::Entities::encode_entities($input, '&');
    
    # for the <chem> tag, we want the strings "<=>", "<-", "->" to be properly
    # entitized so the parser doesn't destroy it
    $encoded =~ s/(\<\s*chem\s*>.*)\<\=\>(.*\<\s*\/chem\s*>)/$1\&lt\;\&\#61\;\&gt\;$2/gi;
    $encoded =~ s/(\<\s*chem\s*>.*)\-\>(.*\<\s*\/chem\s*>)/$1\-\&gt\;$2/gi;
    $encoded =~ s/(\<\s*chem\s*>.*)\<\-(.*\<\s*\/chem\s*>)/$1\&lt\;\-$2/gi;
    
    # parse into libXML to tidy tags, we suppress any errors
    # because otherwise the parser complains about non-HTML
    # tags to STDERR and the Apache error logs
    my $dom = $parser->parse_html_string($encoded,
        {
            suppress_errors => 1,
            suppress_warnings => 1,
            recover => 2
        }
    );
    # the dom returns a full <html> structure, so just get
    # all the child nodes of the <body> tag and put them together
    my @body_nodes = $dom->findnodes('/html/body');
    my @body_children = $body_nodes[0]->childNodes;
    my $xml = "";
    foreach my $child (@body_children) {
        $xml .= $child->toString();
    }
    # entities passed into $input are in the form of '&amp;lt;'
    # they are double entities
    return $xml;    
}

##############################################
##############################################

=item process_xml_to_html

    $xhtml = $gateway->process_xml_to_html($xml);

Takes XML input, decodes ampersands characters 
and passes the result then to the caller.

=cut

##############################################
##############################################

sub process_xml_to_html {
    my $self = shift;
    my $input = shift;
    # decode one level of entities (XML) such that the
    # output is returned to the original level of entities
    # $input "&lt;" --> $xml "&amp;lt;" --> "&lt;"
    my $xhtml = &HTML::Entities::decode_entities($input);
    # now we have valid XHTML that can be stored and parsed
    return $xhtml;
}

##############################################
##############################################

=item process_incoming_html

    $xhtml = $gateway->process_incoming_html($html);
    $xhtml = $gateway->process_incoming_html($html, $legacy);

Designed to be called for all raw HTML inputs from the client
side before storing or rendering data.  Decodes UTF-8 data,
trims leading and trailing "\n" and "<br />" tags.  Processes
the result through the XML parser, converts this back to
balanced well-formed XHTML, re-encodes the result as UTF-8,
and returns the result to the caller.

=over 4

=item legacy

    $legacy = 0;
    $legacy = 1; 

I<(optional)> If true, adds additional processing intended
to emulate LON-CAPA 2.x parsing of the content.

=back

=cut

##############################################
##############################################

sub process_incoming_html {
    # this should be called by all HTML inputs before storing
    # data --> for consistency's sake, call process_html_to_xml
    # afterwards if you need to embed this in XML later on
    my $self = shift;
    my $input = shift;
    my $legacy = shift;
    
    # no idea why i have to call this to get unicode characters
    # working, but i do, so here it is.
    $input = &Encode::decode_utf8($input);
    
    # trim leading and trailing whitespace and HTML breaks
    chomp($input);
    $input =~ s/\s+$//s;
    $input =~ s/^\s+//s;
    $input =~ s/\<br\s*\/*\>$//s;
    my $no_p_input = (length($input) > 0 && $input !~ m/.*\<[\s]*p[\s]*\>.*/is);
    my $xml = $self->process_html_to_xml($input);
    if ($legacy && !&contains_block_level_tags($input)) {
        # the xml returns content inside a <p> tag
        # if there are no block tags... thus to preserve
        # old behavior, we strip out that <p></p>
        if ($no_p_input) {
            $xml =~ s/^\<p\>(.*)\<\/p\>/$1/si;
        }
    }
    my $xhtml = $self->process_xml_to_html($xml);
    # see above unicode encoding comment
    $xhtml = &Encode::encode_utf8($xhtml);
    return $xhtml;
}

##############################################
##############################################

=item process_outgoing_html

    $html = $gateway->process_outgoing_html($xhtml);
    $html = $gateway->process_outgoing_html($xhtml, $legacy);

Designed to be called for all HTML outputs to the client
side before rendering data.  This entitizes all non-allowed
tags, as was previously done in Apache::lonfeedback, and
processes and converts all LON-CAPA supported custom tags (see
above) to their respective output HTML.

=over 4

=item legacy

    $legacy = 0;
    $legacy = 1; 

I<(optional)> If true, adds additional processing intended
to emulate LON-CAPA 2.x parsing of the content.  This includes
behavior to convert "\n" to "<br />" if there are no block-level
tags detected in the input.  In addition, raw URLs are converted
automatically to <a> links.

=back

=back

=cut

##############################################
##############################################

sub process_outgoing_html {
    # this should be called on all HTML outputs before displaying
    # because it will filter out all non-HTML+LONCAPA tags.
    # tags are not filtered at input stage for greater backwards
    # compatibility.  note that this disregards course preference.
    my $self = shift;
    my $input = shift;
    my $legacy = shift;
    
    my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
    # entitize all tags that are not explicitly allowed
    $input =~ s/\<(\/?\s*(\w+)[^\>\<]*)/
        {($html{uc($2)}&&(length($1)<1000))?"\<$1":"\&lt;$1"}/ge;
    $input =~ s/(\<?\s*(\w+)[^\<\>]*)\>/
        {($html{uc($2)}&&(length($1)<1000))?"$1\>":"$1\&gt;"}/ge;
    if ($legacy) {
        unless (&contains_block_level_tags($self, $input)) {
            $input = $self->legacy_newline_to_br($input); 
        }
        $input = $self->legacy_raw_href_to_link($input);
    }
    # at this point, we need to convert our own custom tags
    # into the appropriate output
    # see above for supported tags
    my $output = "";
    my $parser = HTML::LCParser->new(\$input);
    while (my $token = $parser->get_token()) {
    	if ($token->[0] eq 'T') {
            if ($self->{target} ne 'tex') {
    	        $output .= &Apache::lontexconvert::smiley($token->[1]);
    	    } else {
                my $t = $token->[1];
                $t =~ s/([^\n\r\t &<>!\#%\(-;=?-~])/num_entity($1)/ge;
                $output .= $t;
            }
        } elsif ($token->[0] eq 'D' || $token->[0] eq 'C') {
    	    $output .= $token->[1];
    	} elsif ($token->[0] eq 'PI' || $token->[0] eq 'E') {
    	    $output .= $token->[2];
    	} elsif ($token->[0] eq 'S') {
    	    my $tag = lc($token->[1]);
    	    if (exists($custom_tag_parsers{$tag})) {
    	        my $text = $parser->get_text();
    	        $output .= $custom_tag_parsers{$tag}(
    	            $self, $text, $self->{target});
    	    } else {
    	        $output .= $token->[4];
    	    }
    	}
    }
    return $output;
}

##############################################
##############################################

=head1 STATIC CLASS METHODS

The following are static class methods that can be called
by any object.

=over 4

=item contains_block_level_tags

    $bool = Apache::lonhtmlgateway::contains_block_level_tags($input);
    
Uses a regular expression to find, in the input data, any tags 
described in %LONCAPA_ALLOWED_STANDARD_TAGS as block-level.
Returns 1 if true, 0 if false. 

=cut

##############################################
##############################################

sub contains_block_level_tags {
    my $class = shift;
    my $input = shift;
    my @block_level_tags = @{&get_block_level_tags($class)};
    foreach my $tag (@block_level_tags) {
        if ($input =~ m/\<\/?\s*$tag[^\>\<]*/gi) {
            # if your input loves this regular expression
            # as much as i do, then return true.
            # it searches for either a <tag> or <tag />
            return 1;
        }
    }
    return 0;
}

##############################################
##############################################

=item get_block_level_tags

    @tags = Apache::lonhtmlgateway::get_block_level_tags();
    
Return an array with any tags described in 
%LONCAPA_ALLOWED_STANDARD_TAGS as block-level. Note that these
tags are returned in no particular order, and the tag names
are returned in uppercase.


=cut

##############################################
##############################################

sub get_block_level_tags {
    my $class = shift;
    my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
    my @block = [];
    foreach my $tag (keys(%html)) {
        if ($html{$tag} == 2) {
            push(@block, $tag);
        }
    }
    return \@block;
}

sub num_entity {
    sprintf "&#x%X;", ord($_[0]);
}

##############################################
##############################################

=head2 Legacy Functions

These functions are intended to process input in the same or
a similar way to how it was processed in LON-CAPA 2.x.

=item legacy_newline_to_br

I<(formerly Apache::lonfeedback::newline_to_br)>

    $converted = Apache::lonhtmlgateway::legacy_newline_to_br($input);
    
Parse the input using HTML::LCParser, and in any text nodes
which contain "\n" characters, replace those characters with
an HTML "<br />" tag.

=cut

##############################################
##############################################

sub legacy_newline_to_br {
    my $class = shift;
    my $input = shift;
    my $output;
    my $parser = HTML::LCParser->new(\$input);
    while (my $token = $parser->get_token()) {
    	if ($token->[0] eq 'T') {
    	    my $text = $token->[1];
    	    $text =~ s/\n/\<br \/\>/g;
    	    $output .= $text;
    	} elsif ($token->[0] eq 'D' || $token->[0] eq 'C') {
    	    $output .= $token->[1];
    	} elsif ($token->[0] eq 'PI' || $token->[0] eq 'E') {
    	    $output .= $token->[2];
    	} elsif ($token->[0] eq 'S') {
    	    $output .= $token->[4];
    	}
    }
    return $output;
}

##############################################
##############################################

=item legacy_raw_href_to_link

I<(formerly Apache::lonhtmlcommon::raw_href_to_link)>
    
    $converted = Apache::lonhtmlgateway::legacy_raw_href_to_link($input);
    
Search for any links/URLs within the input text, and convert them
to <a> tags whose content is embedded inside a <tt> tag.

=back

=cut

##############################################
##############################################

sub legacy_raw_href_to_link {
    my $class = shift;
    my $input = shift;
    $input =~ s/(https?\:\/\/[^\s\'\"\<]+)([\s\<]|$)/<a href="$1"><tt>$1<\/tt><\/a>$2/gi;
    return $input;
}

sub parse_algebra_tag {
    my $self = shift;
    my $input = shift;
    # the <algebra> parser does NOT handle entities,
    # unlike the general <m> parser; thus we run
    # the content of this tag through HTML::Entities,
    # decoding it first. we also just get the tex, and
    # feed it through as if it were an <mi> tag.
    $input = &HTML::Entities::decode($input);
    my $algebra = 
        &Apache::lontexconvert::algebra($input,'tex',undef,undef,undef,'tth');
    return &parse_m_tag($self, $algebra);
}

sub parse_mi_tag {
    my $self = shift;
    my $input = shift;
    return &parse_m_tag($self, '\ensuremath{'.$input.'}');
}

sub parse_md_tag {
    my $self = shift;
    my $input = shift;
    return &parse_m_tag($self, '\['.$input.'\]');
}

sub parse_m_tag {
    my $self = shift;
    my $input = shift;
    if ($self->{target} ne 'tex') {
        return &Apache::lontexconvert::to_convert($input, $self->{target});
    } else {
        return '<m>'.$input.'</m>';
    }
}

sub parse_chem_tag {
    my $self = shift;
    my $input = shift;
    my $target = $self->{target};
    # as with the <algebra> tag, some portions of the
    # <chem> input may be coming in encoded, especially
    # arrows -- so decode it in HTML::Entities
    $input = &HTML::Entities::decode($input);
    my @tokens = split(/(\s\+|\->|<=>|<\-|\.)/,$input);
    my $formula = '';
    foreach my $token (@tokens) {
    	if ($token eq '->' ) {
    	    if ($target eq 'web') {
    	        $formula .= '&#8594; ';
    	    } else {
    	        $formula .= '<m>\ensuremath{\rightarrow}</m> ';
    	    }
    	    next;
    	}
    	if ($token eq '<-' ) {
    	    if ($target eq 'web') {
    	        $formula .= '&#8592; ';
    	    } else {
    	        $formula .= '<m>\ensuremath{\leftarrow}</m> ';
    	    }
    	    next;
    	}  
    	if ($token eq '<=>') {
    	    if ($target eq 'web') {
    		$formula .= '&#8652; ';
    	    } else {
    		$formula .= '<m>\ensuremath{\rightleftharpoons}</m> ';
    	    }
    	    next;
    	}
    	if ($token eq '.') {
    	  $formula =~ s/(\&nbsp\;| )$//;
    	  $formula .= '&middot;';
    	  next;
    	}
    	$token =~ /^\s*([\d|\/]*(?:&frac\d\d)?)(.*)/;
            $formula .= $1 if ($1 ne '1');  # stoichiometric coefficient
    	my $molecule = $2;
    	# subscripts
    	$molecule =~ s|(?<=[a-zA-Z\)\]\s])(\d+)|<sub>$1</sub>|g;
    	# superscripts
    	$molecule =~ s|\^(\d*[+\-]*)|<sup>$1</sup>|g;
    	# strip whitespace
    	$molecule =~ s/\s*//g;
    	# forced space
    	$molecule =~ s/_/ /g;
    	$molecule =~ s/-/&minus;/g;
    	$formula .= $molecule.'&nbsp;';
    }
    # get rid of trailing space
    $formula =~ s/(\&nbsp\;| )$//;
    return $formula;
}

##############################################
##############################################

=head1 AUTHORS

Phil Fazio

=head1 VERSION

$Id: lonhtmlgateway.pm,v 1.5 2010/05/24 23:47:22 raeburn Exp $

=cut

##############################################
##############################################
