X-Git-Url: http://git.maemo.org/git/?p=dh-make-perl;a=blobdiff_plain;f=dev%2Fi386%2Flibhtml-tree-perl%2Flibhtml-tree-perl-3.23%2Flib%2FHTML%2FTreeBuilder.pm;fp=dev%2Fi386%2Flibhtml-tree-perl%2Flibhtml-tree-perl-3.23%2Flib%2FHTML%2FTreeBuilder.pm;h=93c8bd67ac4dd5866c137ed0c749d19656e7ae05;hp=0000000000000000000000000000000000000000;hb=8977e561d8a9eae6959218b0306c9df2056a38a9;hpb=df794b845212301ea0d267c919232538bfef356a
diff --git a/dev/i386/libhtml-tree-perl/libhtml-tree-perl-3.23/lib/HTML/TreeBuilder.pm b/dev/i386/libhtml-tree-perl/libhtml-tree-perl-3.23/lib/HTML/TreeBuilder.pm
new file mode 100644
index 0000000..93c8bd6
--- /dev/null
+++ b/dev/i386/libhtml-tree-perl/libhtml-tree-perl-3.23/lib/HTML/TreeBuilder.pm
@@ -0,0 +1,1869 @@
+package HTML::TreeBuilder;
+
+use strict;
+use integer; # vroom vroom!
+use Carp ();
+use vars qw(@ISA $VERSION $DEBUG);
+$VERSION = '3.23';
+
+#---------------------------------------------------------------------------
+# Make a 'DEBUG' constant...
+
+BEGIN {
+ # We used to have things like
+ # print $indent, "lalala" if $Debug;
+ # But there were an awful lot of having to evaluate $Debug's value.
+ # If we make that depend on a constant, like so:
+ # sub DEBUG () { 1 } # or whatever value.
+ # ...
+ # print $indent, "lalala" if DEBUG;
+ # Which at compile-time (thru the miracle of constant folding) turns into:
+ # print $indent, "lalala";
+ # or, if DEBUG is a constant with a true value, then that print statement
+ # is simply optimized away, and doesn't appear in the target code at all.
+ # If you don't believe me, run:
+ # perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \
+ # $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder'
+ # and see for yourself (substituting whatever value you want for $DEBUG
+ # there).
+
+ if(defined &DEBUG) {
+ # Already been defined! Do nothing.
+ } elsif($] < 5.00404) {
+ # Grudgingly accomodate ancient (pre-constant) versions.
+ eval 'sub DEBUG { $Debug } ';
+ } elsif(!$DEBUG) {
+ eval 'sub DEBUG () {0}'; # Make it a constant.
+ } elsif($DEBUG =~ m<^\d+$>s) {
+ eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant.
+ } else { # WTF?
+ warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG";
+ eval 'sub DEBUG () { $DEBUG }'; # I guess.
+ }
+}
+
+#---------------------------------------------------------------------------
+
+use HTML::Entities ();
+use HTML::Tagset 3.02 ();
+
+use HTML::Element ();
+use HTML::Parser ();
+@ISA = qw(HTML::Element HTML::Parser);
+ # This looks schizoid, I know.
+ # It's not that we ARE an element AND a parser.
+ # We ARE an element, but one that knows how to handle signals
+ # (method calls) from Parser in order to elaborate its subtree.
+
+# Legacy aliases:
+*HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown;
+*HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten;
+*HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement;
+*HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement;
+*HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup;
+*HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
+*HTML::TreeBuilder::isList = \%HTML::Tagset::isList;
+*HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement;
+*HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement;
+*HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers;
+
+#==========================================================================
+# Two little shortcut constructors:
+
+sub new_from_file { # or from a FH
+ my $class = shift;
+ Carp::croak("new_from_file takes only one argument")
+ unless @_ == 1;
+ Carp::croak("new_from_file is a class method only")
+ if ref $class;
+ my $new = $class->new();
+ $new->parse_file($_[0]);
+ return $new;
+}
+
+sub new_from_content { # from any number of scalars
+ my $class = shift;
+ Carp::croak("new_from_content is a class method only")
+ if ref $class;
+ my $new = $class->new();
+ foreach my $whunk (@_) {
+ if(ref($whunk) eq 'SCALAR') {
+ $new->parse($$whunk);
+ } else {
+ $new->parse($whunk);
+ }
+ last if $new->{'_stunted'}; # might as well check that.
+ }
+ $new->eof();
+ return $new;
+}
+
+# TODO: document more fully?
+sub parse_content { # from any number of scalars
+ my $tree = shift;
+ my $retval;
+ foreach my $whunk (@_) {
+ if(ref($whunk) eq 'SCALAR') {
+ $retval = $tree->parse($$whunk);
+ } else {
+ $retval = $tree->parse($whunk);
+ }
+ last if $tree->{'_stunted'}; # might as well check that.
+ }
+ $tree->eof();
+ return $retval;
+}
+
+
+#---------------------------------------------------------------------------
+
+sub new { # constructor!
+ my $class = shift;
+ $class = ref($class) || $class;
+
+ my $self = HTML::Element->new('html'); # Initialize HTML::Element part
+ {
+ # A hack for certain strange versions of Parser:
+ my $other_self = HTML::Parser->new();
+ %$self = (%$self, %$other_self); # copy fields
+ # Yes, multiple inheritance is messy. Kids, don't try this at home.
+ bless $other_self, "HTML::TreeBuilder::_hideyhole";
+ # whack it out of the HTML::Parser class, to avoid the destructor
+ }
+
+ # The root of the tree is special, as it has these funny attributes,
+ # and gets reblessed into this class.
+
+ # Initialize parser settings
+ $self->{'_implicit_tags'} = 1;
+ $self->{'_implicit_body_p_tag'} = 0;
+ # If true, trying to insert text, or any of %isPhraseMarkup right
+ # under 'body' will implicate a 'p'. If false, will just go there.
+
+ $self->{'_tighten'} = 1;
+ # whether ignorable WS in this tree should be deleted
+
+ $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
+
+ $self->{'_element_class'} = 'HTML::Element';
+ $self->{'_ignore_unknown'} = 1;
+ $self->{'_ignore_text'} = 0;
+ $self->{'_warn'} = 0;
+ $self->{'_no_space_compacting'}= 0;
+ $self->{'_store_comments'} = 0;
+ $self->{'_store_declarations'} = 1;
+ $self->{'_store_pis'} = 0;
+ $self->{'_p_strict'} = 0;
+
+ # Parse attributes passed in as arguments
+ if(@_) {
+ my %attr = @_;
+ for (keys %attr) {
+ $self->{"_$_"} = $attr{$_};
+ }
+ }
+
+ # rebless to our class
+ bless $self, $class;
+
+ $self->{'_element_count'} = 1;
+ # undocumented, informal, and maybe not exactly correct
+
+ $self->{'_head'} = $self->insert_element('head',1);
+ $self->{'_pos'} = undef; # pull it back up
+ $self->{'_body'} = $self->insert_element('body',1);
+ $self->{'_pos'} = undef; # pull it back up again
+
+ return $self;
+}
+
+#==========================================================================
+
+sub _elem # universal accessor...
+{
+ my($self, $elem, $val) = @_;
+ my $old = $self->{$elem};
+ $self->{$elem} = $val if defined $val;
+ return $old;
+}
+
+# accessors....
+sub implicit_tags { shift->_elem('_implicit_tags', @_); }
+sub implicit_body_p_tag { shift->_elem('_implicit_body_p_tag', @_); }
+sub p_strict { shift->_elem('_p_strict', @_); }
+sub no_space_compacting { shift->_elem('_no_space_compacting', @_); }
+sub ignore_unknown { shift->_elem('_ignore_unknown', @_); }
+sub ignore_text { shift->_elem('_ignore_text', @_); }
+sub ignore_ignorable_whitespace { shift->_elem('_tighten', @_); }
+sub store_comments { shift->_elem('_store_comments', @_); }
+sub store_declarations { shift->_elem('_store_declarations', @_); }
+sub store_pis { shift->_elem('_store_pis', @_); }
+sub warn { shift->_elem('_warn', @_); }
+
+
+#==========================================================================
+
+sub warning {
+ my $self = shift;
+ CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
+ # should maybe say HTML::TreeBuilder instead
+}
+
+#==========================================================================
+
+{
+ # To avoid having to rebuild these lists constantly...
+ my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
+ my $indent;
+
+ sub start {
+ return if $_[0]{'_stunted'};
+
+ # Accept a signal from HTML::Parser for start-tags.
+ my($self, $tag, $attr) = @_;
+ # Parser passes more, actually:
+ # $self->start($tag, $attr, $attrseq, $origtext)
+ # But we can merrily ignore $attrseq and $origtext.
+
+ if($tag eq 'x-html') {
+ print "Ignoring open-x-html tag.\n" if DEBUG;
+ # inserted by some lame code-generators.
+ return; # bypass tweaking.
+ }
+
+ $tag =~ s{/$}{}s; # So turns into . Silently forgive.
+
+ unless($tag =~ m/^[-_a-zA-Z0-9:%]+$/s) {
+ DEBUG and print "Start-tag name $tag is no good. Skipping.\n";
+ return;
+ # This avoids having Element's new() throw an exception.
+ }
+
+ my $ptag = (
+ my $pos = $self->{'_pos'} || $self
+ )->{'_tag'};
+ my $already_inserted;
+ #my($indent);
+ if(DEBUG) {
+ # optimization -- don't figure out indenting unless we're in debug mode
+ my @lineage = $pos->lineage;
+ $indent = ' ' x (1 + @lineage);
+ print
+ $indent, "Proposing a new \U$tag\E under ",
+ join('/', map $_->{'_tag'}, reverse($pos, @lineage)) || 'Root',
+ ".\n";
+ #} else {
+ # $indent = ' ';
+ }
+
+ #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
+ # $attr = {%$attr};
+
+ foreach my $k (keys %$attr) {
+ # Make sure some stooge doesn't have "".
+ # That happens every few million Web pages.
+ $attr->{' ' . $k} = delete $attr->{$k}
+ if length $k and substr($k,0,1) eq '_';
+ # Looks bad, but is fine for round-tripping.
+ }
+
+ my $e =
+ ($self->{'_element_class'} || 'HTML::Element')->new($tag, %$attr);
+ # Make a new element object.
+ # (Only rarely do we end up just throwing it away later in this call.)
+
+ # Some prep -- custom messiness for those damned tables, and strict P's.
+ if($self->{'_implicit_tags'}) { # wallawallawalla!
+
+ unless($HTML::TreeBuilder::isTableElement{$tag}) {
+ if ($ptag eq 'table') {
+ print $indent,
+ " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
+ if DEBUG > 1;
+ $self->insert_element('tr', 1);
+ $pos = $self->insert_element('td', 1); # yes, needs updating
+ } elsif ($ptag eq 'tr') {
+ print $indent,
+ " * Phrasal \U$tag\E right under TR makes an implicit TD\n"
+ if DEBUG > 1;
+ $pos = $self->insert_element('td', 1); # yes, needs updating
+ }
+ $ptag = $pos->{'_tag'}; # yes, needs updating
+ }
+ # end of table-implication block.
+
+
+ # Now maybe do a little dance to enforce P-strictness.
+ # This seems like it should be integrated with the big
+ # "ALL HOPE..." block, further below, but that doesn't
+ # seem feasable.
+ if(
+ $self->{'_p_strict'}
+ and $HTML::TreeBuilder::isKnown{$tag}
+ and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag}
+ ) {
+ my $here = $pos;
+ my $here_tag = $ptag;
+ while(1) {
+ if($here_tag eq 'p') {
+ print $indent,
+ " * Inserting $tag closes strict P.\n" if DEBUG > 1;
+ $self->end(\q{p});
+ # NB: same as \'q', but less confusing to emacs cperl-mode
+ last;
+ }
+
+ #print("Lasting from $here_tag\n"),
+ last if
+ $HTML::TreeBuilder::isKnown{$here_tag}
+ and not $HTML::Tagset::is_Possible_Strict_P_Content{$here_tag};
+ # Don't keep looking up the tree if we see something that can't
+ # be strict-P content.
+
+ $here_tag = ($here = $here->{'_parent'} || last)->{'_tag'};
+ }# end while
+ $ptag = ($pos = $self->{'_pos'} || $self)->{'_tag'}; # better update!
+ }
+ # end of strict-p block.
+ }
+
+ # And now, get busy...
+ #----------------------------------------------------------------------
+ if (!$self->{'_implicit_tags'}) { # bimskalabim
+ # do nothing
+ print $indent, " * _implicit_tags is off. doing nothing\n"
+ if DEBUG > 1;
+
+ #----------------------------------------------------------------------
+ } elsif ($HTML::TreeBuilder::isHeadOrBodyElement{$tag}) {
+ if ($pos->is_inside('body')) { # all is well
+ print $indent,
+ " * ambilocal element \U$tag\E is fine under BODY.\n"
+ if DEBUG > 1;
+ } elsif ($pos->is_inside('head')) {
+ print $indent,
+ " * ambilocal element \U$tag\E is fine under HEAD.\n"
+ if DEBUG > 1;
+ } else {
+ # In neither head nor body! mmmmm... put under head?
+
+ if ($ptag eq 'html') { # expected case
+ # TODO?? : would there ever be a case where _head would be
+ # absent from a tree that would ever be accessed at this
+ # point?
+ die "Where'd my head go?" unless ref $self->{'_head'};
+ if ($self->{'_head'}{'_implicit'}) {
+ print $indent,
+ " * ambilocal element \U$tag\E makes an implicit HEAD.\n"
+ if DEBUG > 1;
+ # or rather, points us at it.
+ $self->{'_pos'} = $self->{'_head'}; # to insert under...
+ } else {
+ $self->warning(
+ "Ambilocal element <$tag> not under HEAD or BODY!?");
+ # Put it under HEAD by default, I guess
+ $self->{'_pos'} = $self->{'_head'}; # to insert under...
+ }
+
+ } else {
+ # Neither under head nor body, nor right under html... pass thru?
+ $self->warning(
+ "Ambilocal element <$tag> neither under head nor body, nor right under html!?");
+ }
+ }
+
+ #----------------------------------------------------------------------
+ } elsif ($HTML::TreeBuilder::isBodyElement{$tag}) {
+
+ # Ensure that we are within
+ if($ptag eq 'body') {
+ # We're good.
+ } elsif($HTML::TreeBuilder::isBodyElement{$ptag} # glarg
+ and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag}
+ ) {
+ # Special case: Save ourselves a call to is_inside further down.
+ # If our $ptag is an isBodyElement element (but not an
+ # isHeadOrBodyElement element), then we must be under body!
+ print $indent, " * Inferring that $ptag is under BODY.\n",
+ if DEBUG > 3;
+ # I think this and the test for 'body' trap everything
+ # bodyworthy, except the case where the parent element is
+ # under an unknown element that's a descendant of body.
+ } elsif ($pos->is_inside('head')) {
+ print $indent,
+ " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
+ if DEBUG > 1;
+ $ptag = (
+ $pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating
+ || die "Where'd my body go?"
+ )->{'_tag'}; # yes, needs updating
+ } elsif (! $pos->is_inside('body')) {
+ print $indent,
+ " * body-element \U$tag\E makes implicit BODY.\n"
+ if DEBUG > 1;
+ $ptag = (
+ $pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating
+ || die "Where'd my body go?"
+ )->{'_tag'}; # yes, needs updating
+ }
+ # else we ARE under body, so okay.
+
+
+ # Handle implicit endings and insert based on and position
+ # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
+ if ($tag eq 'p' or
+ $tag eq 'h1' or $tag eq 'h2' or $tag eq 'h3' or
+ $tag eq 'h4' or $tag eq 'h5' or $tag eq 'h6' or
+ $tag eq 'form'
+ # Hm, should