X-Git-Url: http://git.maemo.org/git/?p=dh-make-perl;a=blobdiff_plain;f=dev%2Fi386%2Flibhtml-tree-perl%2Flibhtml-tree-perl-3.23%2Flib%2FHTML%2FTreeBuilder.pm;fp=dev%2Fi386%2Flibhtml-tree-perl%2Flibhtml-tree-perl-3.23%2Flib%2FHTML%2FTreeBuilder.pm;h=93c8bd67ac4dd5866c137ed0c749d19656e7ae05;hp=0000000000000000000000000000000000000000;hb=8977e561d8a9eae6959218b0306c9df2056a38a9;hpb=df794b845212301ea0d267c919232538bfef356a diff --git a/dev/i386/libhtml-tree-perl/libhtml-tree-perl-3.23/lib/HTML/TreeBuilder.pm b/dev/i386/libhtml-tree-perl/libhtml-tree-perl-3.23/lib/HTML/TreeBuilder.pm new file mode 100644 index 0000000..93c8bd6 --- /dev/null +++ b/dev/i386/libhtml-tree-perl/libhtml-tree-perl-3.23/lib/HTML/TreeBuilder.pm @@ -0,0 +1,1869 @@ +package HTML::TreeBuilder; + +use strict; +use integer; # vroom vroom! +use Carp (); +use vars qw(@ISA $VERSION $DEBUG); +$VERSION = '3.23'; + +#--------------------------------------------------------------------------- +# Make a 'DEBUG' constant... + +BEGIN { + # We used to have things like + # print $indent, "lalala" if $Debug; + # But there were an awful lot of having to evaluate $Debug's value. + # If we make that depend on a constant, like so: + # sub DEBUG () { 1 } # or whatever value. + # ... + # print $indent, "lalala" if DEBUG; + # Which at compile-time (thru the miracle of constant folding) turns into: + # print $indent, "lalala"; + # or, if DEBUG is a constant with a true value, then that print statement + # is simply optimized away, and doesn't appear in the target code at all. + # If you don't believe me, run: + # perl -MO=Deparse,-uHTML::TreeBuilder -e 'BEGIN { \ + # $HTML::TreeBuilder::DEBUG = 4} use HTML::TreeBuilder' + # and see for yourself (substituting whatever value you want for $DEBUG + # there). + + if(defined &DEBUG) { + # Already been defined! Do nothing. + } elsif($] < 5.00404) { + # Grudgingly accomodate ancient (pre-constant) versions. + eval 'sub DEBUG { $Debug } '; + } elsif(!$DEBUG) { + eval 'sub DEBUG () {0}'; # Make it a constant. + } elsif($DEBUG =~ m<^\d+$>s) { + eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant. + } else { # WTF? + warn "Non-numeric value \"$DEBUG\" in \$HTML::Element::DEBUG"; + eval 'sub DEBUG () { $DEBUG }'; # I guess. + } +} + +#--------------------------------------------------------------------------- + +use HTML::Entities (); +use HTML::Tagset 3.02 (); + +use HTML::Element (); +use HTML::Parser (); +@ISA = qw(HTML::Element HTML::Parser); + # This looks schizoid, I know. + # It's not that we ARE an element AND a parser. + # We ARE an element, but one that knows how to handle signals + # (method calls) from Parser in order to elaborate its subtree. + +# Legacy aliases: +*HTML::TreeBuilder::isKnown = \%HTML::Tagset::isKnown; +*HTML::TreeBuilder::canTighten = \%HTML::Tagset::canTighten; +*HTML::TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement; +*HTML::TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement; +*HTML::TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; +*HTML::TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; +*HTML::TreeBuilder::isList = \%HTML::Tagset::isList; +*HTML::TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement; +*HTML::TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement; +*HTML::TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers; + +#========================================================================== +# Two little shortcut constructors: + +sub new_from_file { # or from a FH + my $class = shift; + Carp::croak("new_from_file takes only one argument") + unless @_ == 1; + Carp::croak("new_from_file is a class method only") + if ref $class; + my $new = $class->new(); + $new->parse_file($_[0]); + return $new; +} + +sub new_from_content { # from any number of scalars + my $class = shift; + Carp::croak("new_from_content is a class method only") + if ref $class; + my $new = $class->new(); + foreach my $whunk (@_) { + if(ref($whunk) eq 'SCALAR') { + $new->parse($$whunk); + } else { + $new->parse($whunk); + } + last if $new->{'_stunted'}; # might as well check that. + } + $new->eof(); + return $new; +} + +# TODO: document more fully? +sub parse_content { # from any number of scalars + my $tree = shift; + my $retval; + foreach my $whunk (@_) { + if(ref($whunk) eq 'SCALAR') { + $retval = $tree->parse($$whunk); + } else { + $retval = $tree->parse($whunk); + } + last if $tree->{'_stunted'}; # might as well check that. + } + $tree->eof(); + return $retval; +} + + +#--------------------------------------------------------------------------- + +sub new { # constructor! + my $class = shift; + $class = ref($class) || $class; + + my $self = HTML::Element->new('html'); # Initialize HTML::Element part + { + # A hack for certain strange versions of Parser: + my $other_self = HTML::Parser->new(); + %$self = (%$self, %$other_self); # copy fields + # Yes, multiple inheritance is messy. Kids, don't try this at home. + bless $other_self, "HTML::TreeBuilder::_hideyhole"; + # whack it out of the HTML::Parser class, to avoid the destructor + } + + # The root of the tree is special, as it has these funny attributes, + # and gets reblessed into this class. + + # Initialize parser settings + $self->{'_implicit_tags'} = 1; + $self->{'_implicit_body_p_tag'} = 0; + # If true, trying to insert text, or any of %isPhraseMarkup right + # under 'body' will implicate a 'p'. If false, will just go there. + + $self->{'_tighten'} = 1; + # whether ignorable WS in this tree should be deleted + + $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag + + $self->{'_element_class'} = 'HTML::Element'; + $self->{'_ignore_unknown'} = 1; + $self->{'_ignore_text'} = 0; + $self->{'_warn'} = 0; + $self->{'_no_space_compacting'}= 0; + $self->{'_store_comments'} = 0; + $self->{'_store_declarations'} = 1; + $self->{'_store_pis'} = 0; + $self->{'_p_strict'} = 0; + + # Parse attributes passed in as arguments + if(@_) { + my %attr = @_; + for (keys %attr) { + $self->{"_$_"} = $attr{$_}; + } + } + + # rebless to our class + bless $self, $class; + + $self->{'_element_count'} = 1; + # undocumented, informal, and maybe not exactly correct + + $self->{'_head'} = $self->insert_element('head',1); + $self->{'_pos'} = undef; # pull it back up + $self->{'_body'} = $self->insert_element('body',1); + $self->{'_pos'} = undef; # pull it back up again + + return $self; +} + +#========================================================================== + +sub _elem # universal accessor... +{ + my($self, $elem, $val) = @_; + my $old = $self->{$elem}; + $self->{$elem} = $val if defined $val; + return $old; +} + +# accessors.... +sub implicit_tags { shift->_elem('_implicit_tags', @_); } +sub implicit_body_p_tag { shift->_elem('_implicit_body_p_tag', @_); } +sub p_strict { shift->_elem('_p_strict', @_); } +sub no_space_compacting { shift->_elem('_no_space_compacting', @_); } +sub ignore_unknown { shift->_elem('_ignore_unknown', @_); } +sub ignore_text { shift->_elem('_ignore_text', @_); } +sub ignore_ignorable_whitespace { shift->_elem('_tighten', @_); } +sub store_comments { shift->_elem('_store_comments', @_); } +sub store_declarations { shift->_elem('_store_declarations', @_); } +sub store_pis { shift->_elem('_store_pis', @_); } +sub warn { shift->_elem('_warn', @_); } + + +#========================================================================== + +sub warning { + my $self = shift; + CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'}; + # should maybe say HTML::TreeBuilder instead +} + +#========================================================================== + +{ + # To avoid having to rebuild these lists constantly... + my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)]; + my $indent; + + sub start { + return if $_[0]{'_stunted'}; + + # Accept a signal from HTML::Parser for start-tags. + my($self, $tag, $attr) = @_; + # Parser passes more, actually: + # $self->start($tag, $attr, $attrseq, $origtext) + # But we can merrily ignore $attrseq and $origtext. + + if($tag eq 'x-html') { + print "Ignoring open-x-html tag.\n" if DEBUG; + # inserted by some lame code-generators. + return; # bypass tweaking. + } + + $tag =~ s{/$}{}s; # So turns into . Silently forgive. + + unless($tag =~ m/^[-_a-zA-Z0-9:%]+$/s) { + DEBUG and print "Start-tag name $tag is no good. Skipping.\n"; + return; + # This avoids having Element's new() throw an exception. + } + + my $ptag = ( + my $pos = $self->{'_pos'} || $self + )->{'_tag'}; + my $already_inserted; + #my($indent); + if(DEBUG) { + # optimization -- don't figure out indenting unless we're in debug mode + my @lineage = $pos->lineage; + $indent = ' ' x (1 + @lineage); + print + $indent, "Proposing a new \U$tag\E under ", + join('/', map $_->{'_tag'}, reverse($pos, @lineage)) || 'Root', + ".\n"; + #} else { + # $indent = ' '; + } + + #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2; + # $attr = {%$attr}; + + foreach my $k (keys %$attr) { + # Make sure some stooge doesn't have "". + # That happens every few million Web pages. + $attr->{' ' . $k} = delete $attr->{$k} + if length $k and substr($k,0,1) eq '_'; + # Looks bad, but is fine for round-tripping. + } + + my $e = + ($self->{'_element_class'} || 'HTML::Element')->new($tag, %$attr); + # Make a new element object. + # (Only rarely do we end up just throwing it away later in this call.) + + # Some prep -- custom messiness for those damned tables, and strict P's. + if($self->{'_implicit_tags'}) { # wallawallawalla! + + unless($HTML::TreeBuilder::isTableElement{$tag}) { + if ($ptag eq 'table') { + print $indent, + " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n" + if DEBUG > 1; + $self->insert_element('tr', 1); + $pos = $self->insert_element('td', 1); # yes, needs updating + } elsif ($ptag eq 'tr') { + print $indent, + " * Phrasal \U$tag\E right under TR makes an implicit TD\n" + if DEBUG > 1; + $pos = $self->insert_element('td', 1); # yes, needs updating + } + $ptag = $pos->{'_tag'}; # yes, needs updating + } + # end of table-implication block. + + + # Now maybe do a little dance to enforce P-strictness. + # This seems like it should be integrated with the big + # "ALL HOPE..." block, further below, but that doesn't + # seem feasable. + if( + $self->{'_p_strict'} + and $HTML::TreeBuilder::isKnown{$tag} + and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} + ) { + my $here = $pos; + my $here_tag = $ptag; + while(1) { + if($here_tag eq 'p') { + print $indent, + " * Inserting $tag closes strict P.\n" if DEBUG > 1; + $self->end(\q{p}); + # NB: same as \'q', but less confusing to emacs cperl-mode + last; + } + + #print("Lasting from $here_tag\n"), + last if + $HTML::TreeBuilder::isKnown{$here_tag} + and not $HTML::Tagset::is_Possible_Strict_P_Content{$here_tag}; + # Don't keep looking up the tree if we see something that can't + # be strict-P content. + + $here_tag = ($here = $here->{'_parent'} || last)->{'_tag'}; + }# end while + $ptag = ($pos = $self->{'_pos'} || $self)->{'_tag'}; # better update! + } + # end of strict-p block. + } + + # And now, get busy... + #---------------------------------------------------------------------- + if (!$self->{'_implicit_tags'}) { # bimskalabim + # do nothing + print $indent, " * _implicit_tags is off. doing nothing\n" + if DEBUG > 1; + + #---------------------------------------------------------------------- + } elsif ($HTML::TreeBuilder::isHeadOrBodyElement{$tag}) { + if ($pos->is_inside('body')) { # all is well + print $indent, + " * ambilocal element \U$tag\E is fine under BODY.\n" + if DEBUG > 1; + } elsif ($pos->is_inside('head')) { + print $indent, + " * ambilocal element \U$tag\E is fine under HEAD.\n" + if DEBUG > 1; + } else { + # In neither head nor body! mmmmm... put under head? + + if ($ptag eq 'html') { # expected case + # TODO?? : would there ever be a case where _head would be + # absent from a tree that would ever be accessed at this + # point? + die "Where'd my head go?" unless ref $self->{'_head'}; + if ($self->{'_head'}{'_implicit'}) { + print $indent, + " * ambilocal element \U$tag\E makes an implicit HEAD.\n" + if DEBUG > 1; + # or rather, points us at it. + $self->{'_pos'} = $self->{'_head'}; # to insert under... + } else { + $self->warning( + "Ambilocal element <$tag> not under HEAD or BODY!?"); + # Put it under HEAD by default, I guess + $self->{'_pos'} = $self->{'_head'}; # to insert under... + } + + } else { + # Neither under head nor body, nor right under html... pass thru? + $self->warning( + "Ambilocal element <$tag> neither under head nor body, nor right under html!?"); + } + } + + #---------------------------------------------------------------------- + } elsif ($HTML::TreeBuilder::isBodyElement{$tag}) { + + # Ensure that we are within + if($ptag eq 'body') { + # We're good. + } elsif($HTML::TreeBuilder::isBodyElement{$ptag} # glarg + and not $HTML::TreeBuilder::isHeadOrBodyElement{$ptag} + ) { + # Special case: Save ourselves a call to is_inside further down. + # If our $ptag is an isBodyElement element (but not an + # isHeadOrBodyElement element), then we must be under body! + print $indent, " * Inferring that $ptag is under BODY.\n", + if DEBUG > 3; + # I think this and the test for 'body' trap everything + # bodyworthy, except the case where the parent element is + # under an unknown element that's a descendant of body. + } elsif ($pos->is_inside('head')) { + print $indent, + " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n" + if DEBUG > 1; + $ptag = ( + $pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating + || die "Where'd my body go?" + )->{'_tag'}; # yes, needs updating + } elsif (! $pos->is_inside('body')) { + print $indent, + " * body-element \U$tag\E makes implicit BODY.\n" + if DEBUG > 1; + $ptag = ( + $pos = $self->{'_pos'} = $self->{'_body'} # yes, needs updating + || die "Where'd my body go?" + )->{'_tag'}; # yes, needs updating + } + # else we ARE under body, so okay. + + + # Handle implicit endings and insert based on and position + # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ... + if ($tag eq 'p' or + $tag eq 'h1' or $tag eq 'h2' or $tag eq 'h3' or + $tag eq 'h4' or $tag eq 'h5' or $tag eq 'h6' or + $tag eq 'form' + # Hm, should
really be here?! + ) { + # Can't have

, or inside these + $self->end($_Closed_by_structurals, + @HTML::TreeBuilder::p_closure_barriers + # used to be just li! + ); + + } elsif ($tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl') { + # Can't have lists inside -- in the unlikely + # event anyone tries to put them there! + if ( + $ptag eq 'h1' or $ptag eq 'h2' or $ptag eq 'h3' or + $ptag eq 'h4' or $ptag eq 'h5' or $ptag eq 'h6' + ) { + $self->end(\$ptag); + } + # TODO: Maybe keep closing up the tree until + # the ptag isn't any of the above? + # But anyone that says

    ... + # deserves what they get anyway. + + } elsif ($tag eq 'li') { # list item + # Get under a list tag, one way or another + unless( + exists $HTML::TreeBuilder::isList{$ptag} or + $self->end(\q{*}, keys %HTML::TreeBuilder::isList) #' + ) { + print $indent, + " * inserting implicit UL for lack of containing ", + join('|', keys %HTML::TreeBuilder::isList), ".\n" + if DEBUG > 1; + $self->insert_element('ul', 1); + } + + } elsif ($tag eq 'dt' or $tag eq 'dd') { + # Get under a DL, one way or another + unless($ptag eq 'dl' or $self->end(\q{*}, 'dl')) { #' + print $indent, + " * inserting implicit DL for lack of containing DL.\n" + if DEBUG > 1; + $self->insert_element('dl', 1); + } + + } elsif ($HTML::TreeBuilder::isFormElement{$tag}) { + if($self->{'_ignore_formies_outside_form'} # TODO: document this + and not $pos->is_inside('form') + ) { + print $indent, + " * ignoring \U$tag\E because not in a FORM.\n" + if DEBUG > 1; + return; # bypass tweaking. + } + if($tag eq 'option') { + # return unless $ptag eq 'select'; + $self->end(\q{option}); + $ptag = ($self->{'_pos'} || $self)->{'_tag'}; + unless($ptag eq 'select' or $ptag eq 'optgroup') { + print $indent, " * \U$tag\E makes an implicit SELECT.\n" + if DEBUG > 1; + $pos = $self->insert_element('select', 1); + # but not a very useful select -- has no 'name' attribute! + # is $pos's value used after this? + } + } + } elsif ($HTML::TreeBuilder::isTableElement{$tag}) { + if(!$pos->is_inside('table')) { + print $indent, " * \U$tag\E makes an implicit TABLE\n" + if DEBUG > 1; + $self->insert_element('table', 1); + } + + if($tag eq 'td' or $tag eq 'th') { + # Get under a tr one way or another + unless( + $ptag eq 'tr' # either under a tr + or $self->end(\q{*}, 'tr', 'table') #or we can get under one + ) { + print $indent, + " * \U$tag\E under \U$ptag\E makes an implicit TR\n" + if DEBUG > 1; + $self->insert_element('tr', 1); + # presumably pos's value isn't used after this. + } + } else { + $self->end(\$tag, 'table'); #' + } + # Hmm, I guess this is right. To work it out: + # tr closes any open tr (limited at a table) + # thead closes any open thead (limited at a table) + # tbody closes any open tbody (limited at a table) + # tfoot closes any open tfoot (limited at a table) + # colgroup closes any open colgroup (limited at a table) + # col can try, but will always fail, at the enclosing table, + # as col is empty, and therefore never open! + # But! + # td closes any open td OR th (limited at a table) + # th closes any open th OR td (limited at a table) + # ...implementable as "close to a tr, or make a tr" + + } elsif ($HTML::TreeBuilder::isPhraseMarkup{$tag}) { + if($ptag eq 'body' and $self->{'_implicit_body_p_tag'}) { + print + " * Phrasal \U$tag\E right under BODY makes an implicit P\n" + if DEBUG > 1; + $pos = $self->insert_element('p', 1); + # is $pos's value used after this? + } + } + # End of implicit endings logic + + # End of "elsif ($HTML::TreeBuilder::isBodyElement{$tag}" + #---------------------------------------------------------------------- + + } elsif ($HTML::TreeBuilder::isHeadElement{$tag}) { + if ($pos->is_inside('body')) { + print $indent, " * head element \U$tag\E found inside BODY!\n" + if DEBUG; + $self->warning("Header element <$tag> in body"); # [sic] + } elsif (!$pos->is_inside('head')) { + print $indent, " * head element \U$tag\E makes an implicit HEAD.\n" + if DEBUG > 1; + } else { + print $indent, + " * head element \U$tag\E goes inside existing HEAD.\n" + if DEBUG > 1; + } + $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?"; + + #---------------------------------------------------------------------- + } elsif ($tag eq 'html') { + if(delete $self->{'_implicit'}) { # first time here + print $indent, " * good! found the real HTML element!\n" + if DEBUG > 1; + } else { + print $indent, " * Found a second HTML element\n" + if DEBUG; + $self->warning("Found a nested element"); + } + + # in either case, migrate attributes to the real element + for (keys %$attr) { + $self->attr($_, $attr->{$_}); + } + $self->{'_pos'} = undef; + return $self; # bypass tweaking. + + #---------------------------------------------------------------------- + } elsif ($tag eq 'head') { + my $head = $self->{'_head'} || die "Where'd my head go?"; + if(delete $head->{'_implicit'}) { # first time here + print $indent, " * good! found the real HEAD element!\n" + if DEBUG > 1; + } else { # been here before + print $indent, " * Found a second HEAD element\n" + if DEBUG; + $self->warning("Found a second element"); + } + + # in either case, migrate attributes to the real element + for (keys %$attr) { + $head->attr($_, $attr->{$_}); + } + return $self->{'_pos'} = $head; # bypass tweaking. + + #---------------------------------------------------------------------- + } elsif ($tag eq 'body') { + my $body = $self->{'_body'} || die "Where'd my body go?"; + if(delete $body->{'_implicit'}) { # first time here + print $indent, " * good! found the real BODY element!\n" + if DEBUG > 1; + } else { # been here before + print $indent, " * Found a second BODY element\n" + if DEBUG; + $self->warning("Found a second element"); + } + + # in either case, migrate attributes to the real element + for (keys %$attr) { + $body->attr($_, $attr->{$_}); + } + return $self->{'_pos'} = $body; # bypass tweaking. + + #---------------------------------------------------------------------- + } elsif ($tag eq 'frameset') { + if( + !($self->{'_frameset_seen'}++) # first frameset seen + and !$self->{'_noframes_seen'} + # otherwise it'll be under the noframes already + and !$self->is_inside('body') + ) { + # The following is a bit of a hack. We don't use the normal + # insert_element because 1) we don't want it as _pos, but instead + # right under $self, and 2), more importantly, that we don't want + # this inserted at the /end/ of $self's content_list, but instead + # in the middle of it, specifiaclly right before the body element. + # + my $c = $self->{'_content'} || die "Contentless root?"; + my $body = $self->{'_body'} || die "Where'd my BODY go?"; + for(my $i = 0; $i < @$c; ++$i) { + if($c->[$i] eq $body) { + splice(@$c, $i, 0, $self->{'_pos'} = $pos = $e); + $e->{'_parent'} = $self; + $already_inserted = 1; + print $indent, " * inserting 'frameset' right before BODY.\n" + if DEBUG > 1; + last; + } + } + die "BODY not found in children of root?" unless $already_inserted; + } + + } elsif ($tag eq 'frame') { + # Okay, fine, pass thru. + # Should probably enforce that these should be under a frameset. + # But hey. Ditto for enforcing that 'noframes' should be under + # a 'frameset', as the DTDs say. + + } elsif ($tag eq 'noframes') { + # This basically assumes there'll be exactly one 'noframes' element + # per document. At least, only the first one gets to have the + # body under it. And if there are no noframes elements, then + # the body pretty much stays where it is. Is that ever a problem? + if($self->{'_noframes_seen'}++) { + print $indent, " * ANOTHER noframes element?\n" if DEBUG; + } else { + if($pos->is_inside('body')) { + print $indent, " * 'noframes' inside 'body'. Odd!\n" if DEBUG; + # In that odd case, we /can't/ make body a child of 'noframes', + # because it's an ancestor of the 'noframes'! + } else { + $e->push_content( $self->{'_body'} || die "Where'd my body go?" ); + print $indent, " * Moving body to be under noframes.\n" if DEBUG; + } + } + + #---------------------------------------------------------------------- + } else { + # unknown tag + if ($self->{'_ignore_unknown'}) { + print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG; + $self->warning("Skipping unknown tag $tag"); + return; + } else { + print $indent, " * Accepting unknown tag \U$tag\E\n" + if DEBUG; + } + } + #---------------------------------------------------------------------- + # End of mumbo-jumbo + + + print + $indent, "(Attaching ", $e->{'_tag'}, " under ", + ($self->{'_pos'} || $self)->{'_tag'}, ")\n" + # because if _pos isn't defined, it goes under self + if DEBUG; + + + # The following if-clause is to delete /some/ ignorable whitespace + # nodes, as we're making the tree. + # This'd be a node we'd catch later anyway, but we might as well + # nip it in the bud now. + # This doesn't catch /all/ deletable WS-nodes, so we do have to call + # the tightener later to catch the rest. + + if($self->{'_tighten'} and !$self->{'_ignore_text'}) { # if tightenable + my($sibs, $par); + if( + ($sibs = ( $par = $self->{'_pos'} || $self )->{'_content'}) + and @$sibs # parent already has content + and !ref($sibs->[-1]) # and the last one there is a text node + and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace + + and ( # one of these has to be eligible... + $HTML::TreeBuilder::canTighten{$tag} + or + ( + (@$sibs == 1) + ? # WS is leftmost -- so parent matters + $HTML::TreeBuilder::canTighten{$par->{'_tag'}} + : # WS is after another node -- it matters + (ref $sibs->[-2] + and $HTML::TreeBuilder::canTighten{$sibs->[-2]{'_tag'}} + ) + ) + ) + + and !$par->is_inside('pre', 'xmp', 'textarea', 'plaintext') + # we're clear + ) { + pop @$sibs; + print $indent, "Popping a preceding all-WS node\n" if DEBUG; + } + } + + $self->insert_element($e) unless $already_inserted; + + if(DEBUG) { + if($self->{'_pos'}) { + print + $indent, "(Current lineage of pos: \U$tag\E under ", + join('/', + reverse( + # $self->{'_pos'}{'_tag'}, # don't list myself! + $self->{'_pos'}->lineage_tag_names + ) + ), + ".)\n"; + } else { + print $indent, "(Pos points nowhere!?)\n"; + } + } + + unless(($self->{'_pos'} || '') eq $e) { + # if it's an empty element -- i.e., if it didn't change the _pos + &{ $self->{"_tweak_$tag"} + || $self->{'_tweak_*'} + || return $e + }(map $_, $e, $tag, $self); # make a list so the user can't clobber + } + + return $e; + } +} + +#========================================================================== + +{ + my $indent; + + sub end { + return if $_[0]{'_stunted'}; + + # Either: Acccept an end-tag signal from HTML::Parser + # Or: Method for closing currently open elements in some fairly complex + # way, as used by other methods in this class. + my($self, $tag, @stop) = @_; + if($tag eq 'x-html') { + print "Ignoring close-x-html tag.\n" if DEBUG; + # inserted by some lame code-generators. + return; + } + + unless(ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s) { + DEBUG and print "End-tag name $tag is no good. Skipping.\n"; + return; + # This avoids having Element's new() throw an exception. + } + + # This method accepts two calling formats: + # 1) from Parser: $self->end('tag_name', 'origtext') + # in which case we shouldn't mistake origtext as a blocker tag + # 2) from myself: $self->end(\q{tagname1}, 'blk1', ... ) + # from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... ) + + # End the specified tag, but don't move above any of the blocker tags. + # The tag can also be a reference to an array. Terminate the first + # tag found. + + my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'}; + # $p and $ptag are sort-of stratch + + if(ref($tag)) { + # First param is a ref of one sort or another -- + # THE CALL IS COMING FROM INSIDE THE HOUSE! + $tag = $$tag if ref($tag) eq 'SCALAR'; + # otherwise it's an arrayref. + } else { + # the call came from Parser -- just ignore origtext + @stop = (); + } + + #my($indent); + if(DEBUG) { + # optimization -- don't figure out depth unless we're in debug mode + my @lineage_tags = $p->lineage_tag_names; + $indent = ' ' x (1 + @lineage_tags); + + # now announce ourselves + print $indent, "Ending ", + ref($tag) ? ('[', join(' ', @$tag ), ']') : "\U$tag\E", + scalar(@stop) ? (" no higher than [", join(' ', @stop), "]" ) + : (), ".\n" + ; + + print $indent, " (Current lineage: ", join('/', @lineage_tags), ".)\n" + if DEBUG > 1; + + if(DEBUG > 3) { + #my( + # $package, $filename, $line, $subroutine, + # $hasargs, $wantarray, $evaltext, $is_require) = caller; + print $indent, + " (Called from ", (caller(1))[3], ' line ', (caller(1))[2], + ")\n"; + } + + #} else { + # $indent = ' '; + } + # End of if DEBUG + + # Now actually do it + my @to_close; + if($tag eq '*') { + # Special -- close everything up to (but not including) the first + # limiting tag, or return if none found. Somewhat of a special case. + PARENT: + while (defined $p) { + $ptag = $p->{'_tag'}; + print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; + for (@stop) { + if($ptag eq $_) { + print $indent, " (Hit a $_; closing everything up to here.)\n" + if DEBUG > 2; + last PARENT; + } + } + push @to_close, $p; + $p = $p->{'_parent'}; # no match so far? keep moving up + print + $indent, + " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n" + if DEBUG > 1; + ; + } + unless(defined $p) { # We never found what we were looking for. + print $indent, " (We never found a limit.)\n" if DEBUG > 1; + return; + } + #print + # $indent, + # " (To close: ", join('/', map $_->tag, @to_close), ".)\n" + # if DEBUG > 4; + + # Otherwise update pos and fall thru. + $self->{'_pos'} = $p; + } elsif (ref $tag) { + # Close the first of any of the matching tags, giving up if you hit + # any of the stop-tags. + PARENT: + while (defined $p) { + $ptag = $p->{'_tag'}; + print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; + for (@$tag) { + if($ptag eq $_) { + print $indent, " (Closing $_.)\n" if DEBUG > 2; + last PARENT; + } + } + for (@stop) { + if($ptag eq $_) { + print $indent, " (Hit a limiting $_ -- bailing out.)\n" + if DEBUG > 1; + return; # so it was all for naught + } + } + push @to_close, $p; + $p = $p->{'_parent'}; + } + return unless defined $p; # We went off the top of the tree. + # Otherwise specified element was found; set pos to its parent. + push @to_close, $p; + $self->{'_pos'} = $p->{'_parent'}; + } else { + # Close the first of the specified tag, giving up if you hit + # any of the stop-tags. + while (defined $p) { + $ptag = $p->{'_tag'}; + print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; + if($ptag eq $tag) { + print $indent, " (Closing $tag.)\n" if DEBUG > 2; + last; + } + for (@stop) { + if($ptag eq $_) { + print $indent, " (Hit a limiting $_ -- bailing out.)\n" + if DEBUG > 1; + return; # so it was all for naught + } + } + push @to_close, $p; + $p = $p->{'_parent'}; + } + return unless defined $p; # We went off the top of the tree. + # Otherwise specified element was found; set pos to its parent. + push @to_close, $p; + $self->{'_pos'} = $p->{'_parent'}; + } + + $self->{'_pos'} = undef if $self eq ($self->{'_pos'} || ''); + print $indent, "(Pos now points to ", + $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n" + if DEBUG > 1; + + ### EXPENSIVE, because has to check that it's not under a pre + ### or a CDATA-parent. That's one more method call per end()! + ### Might as well just do this at the end of the tree-parse, I guess, + ### at which point we'd be parsing top-down, and just not traversing + ### under pre's or CDATA-parents. + ## + ## Take this opportunity to nix any terminal whitespace nodes. + ## TODO: consider whether this (plus the logic in start(), above) + ## would ever leave any WS nodes in the tree. + ## If not, then there's no reason to have eof() call + ## delete_ignorable_whitespace on the tree, is there? + ## + #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and + # ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent) + #) { # if tightenable + # my($children, $e_tag); + # foreach my $e (reverse @to_close) { # going top-down + # last if 'pre' eq ($e_tag = $e->{'_tag'}) or + # $HTML::Tagset::isCDATA_Parent{$e_tag}; + # + # if( + # $children = $e->{'_content'} + # and @$children # has children + # and !ref($children->[-1]) + # and $children->[-1] =~ m<^\s+$>s # last node is all-WS + # and + # ( + # # has a tightable parent: + # $HTML::TreeBuilder::canTighten{ $e_tag } + # or + # ( # has a tightenable left sibling: + # @$children > 1 and + # ref($children->[-2]) + # and $HTML::TreeBuilder::canTighten{ $children->[-2]{'_tag'} } + # ) + # ) + # ) { + # pop @$children; + # #print $indent, "Popping a terminal WS node from ", $e->{'_tag'}, + # # " (", $e->address, ") while exiting.\n" if DEBUG; + # } + # } + #} + + + foreach my $e (@to_close) { + # Call the applicable callback, if any + $ptag = $e->{'_tag'}; + &{ $self->{"_tweak_$ptag"} + || $self->{'_tweak_*'} + || next + }(map $_, $e, $ptag, $self); + print $indent, "Back from tweaking.\n" if DEBUG; + last if $self->{'_stunted'}; # in case one of the handlers called stunt + } + return @to_close; + } +} + +#========================================================================== +{ + my($indent, $nugget); + + sub text { + return if $_[0]{'_stunted'}; + + # Accept a "here's a text token" signal from HTML::Parser. + my($self, $text, $is_cdata) = @_; + # the >3.0 versions of Parser may pass a cdata node. + # Thanks to Gisle Aas for pointing this out. + + return unless length $text; # I guess that's always right + + my $ignore_text = $self->{'_ignore_text'}; + my $no_space_compacting = $self->{'_no_space_compacting'}; + + my $pos = $self->{'_pos'} || $self; + + HTML::Entities::decode($text) + unless $ignore_text || $is_cdata + || $HTML::Tagset::isCDATA_Parent{$pos->{'_tag'}}; + + #my($indent, $nugget); + if(DEBUG) { + # optimization -- don't figure out depth unless we're in debug mode + my @lineage_tags = $pos->lineage_tag_names; + $indent = ' ' x (1 + @lineage_tags); + + $nugget = (length($text) <= 25) ? $text : (substr($text,0,25) . '...'); + $nugget =~ s<([\x00-\x1F])> + <'\\x'.(unpack("H2",$1))>eg; + print + $indent, "Proposing a new text node ($nugget) under ", + join('/', reverse($pos->{'_tag'}, @lineage_tags)) || 'Root', + ".\n"; + + #} else { + # $indent = ' '; + } + + + my $ptag; + if ($HTML::Tagset::isCDATA_Parent{$ptag = $pos->{'_tag'}} + #or $pos->is_inside('pre') + or $pos->is_inside('pre', 'textarea') + ) { + return if $ignore_text; + $pos->push_content($text); + } else { + # return unless $text =~ /\S/; # This is sometimes wrong + + if (!$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/) { + # don't change anything + } elsif ($ptag eq 'head' or $ptag eq 'noframes') { + if($self->{'_implicit_body_p_tag'}) { + print $indent, + " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n" + if DEBUG > 1; + $self->end(\$ptag); + $pos = + $self->{'_body'} + ? ($self->{'_pos'} = $self->{'_body'}) # expected case + : $self->insert_element('body', 1); + $pos = $self->insert_element('p', 1); + } else { + print $indent, + " * Text node under \U$ptag\E closes, implicates BODY.\n" + if DEBUG > 1; + $self->end(\$ptag); + $pos = + $self->{'_body'} + ? ($self->{'_pos'} = $self->{'_body'}) # expected case + : $self->insert_element('body', 1); + } + } elsif ($ptag eq 'html') { + if($self->{'_implicit_body_p_tag'}) { + print $indent, + " * Text node under HTML implicates BODY and P.\n" + if DEBUG > 1; + $pos = + $self->{'_body'} + ? ($self->{'_pos'} = $self->{'_body'}) # expected case + : $self->insert_element('body', 1); + $pos = $self->insert_element('p', 1); + } else { + print $indent, + " * Text node under HTML implicates BODY.\n" + if DEBUG > 1; + $pos = + $self->{'_body'} + ? ($self->{'_pos'} = $self->{'_body'}) # expected case + : $self->insert_element('body', 1); + #print "POS is $pos, ", $pos->{'_tag'}, "\n"; + } + } elsif ($ptag eq 'body') { + if($self->{'_implicit_body_p_tag'}) { + print $indent, + " * Text node under BODY implicates P.\n" + if DEBUG > 1; + $pos = $self->insert_element('p', 1); + } + } elsif ($ptag eq 'table') { + print $indent, + " * Text node under TABLE implicates TR and TD.\n" + if DEBUG > 1; + $self->insert_element('tr', 1); + $pos = $self->insert_element('td', 1); + # double whammy! + } elsif ($ptag eq 'tr') { + print $indent, + " * Text node under TR implicates TD.\n" + if DEBUG > 1; + $pos = $self->insert_element('td', 1); + } + # elsif ( + # # $ptag eq 'li' || + # # $ptag eq 'dd' || + # $ptag eq 'form') { + # $pos = $self->insert_element('p', 1); + #} + + + # Whatever we've done above should have had the side + # effect of updating $self->{'_pos'} + + + #print "POS is now $pos, ", $pos->{'_tag'}, "\n"; + + return if $ignore_text; + $text =~ s/[\n\r\f\t ]+/ /g # canonical space + unless $no_space_compacting ; + + print + $indent, " (Attaching text node ($nugget) under ", + # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'}, + $pos->{'_tag'}, + ").\n" + if DEBUG > 1; + + $pos->push_content($text); + } + + &{ $self->{'_tweak_~text'} || return }($text, $pos, $pos->{'_tag'} . ''); + # Note that this is very exceptional -- it doesn't fall back to + # _tweak_*, and it gives its tweak different arguments. + return; + } +} + +#========================================================================== + +# TODO: test whether comment(), declaration(), and process(), do the right +# thing as far as tightening and whatnot. +# Also, currently, doctypes and comments that appear before head or body +# show up in the tree in the wrong place. Something should be done about +# this. Tricky. Maybe this whole business of pre-making the body and +# whatnot is wrong. + +sub comment { + return if $_[0]{'_stunted'}; + # Accept a "here's a comment" signal from HTML::Parser. + + my($self, $text) = @_; + my $pos = $self->{'_pos'} || $self; + return unless $self->{'_store_comments'} + || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }; + + if(DEBUG) { + my @lineage_tags = $pos->lineage_tag_names; + my $indent = ' ' x (1 + @lineage_tags); + + my $nugget = (length($text) <= 25) ? $text : (substr($text,0,25) . '...'); + $nugget =~ s<([\x00-\x1F])> + <'\\x'.(unpack("H2",$1))>eg; + print + $indent, "Proposing a Comment ($nugget) under ", + join('/', reverse($pos->{'_tag'}, @lineage_tags)) || 'Root', + ".\n"; + } + + (my $e = ( + $self->{'_element_class'} || 'HTML::Element' + )->new('~comment'))->{'text'} = $text; + $pos->push_content($e); + ++($self->{'_element_count'}); + + &{ $self->{'_tweak_~comment'} + || $self->{'_tweak_*'} + || return $e + }(map $_, $e, '~comment', $self); + + return $e; +} + +sub declaration { + return if $_[0]{'_stunted'}; + # Accept a "here's a markup declaration" signal from HTML::Parser. + + my($self, $text) = @_; + my $pos = $self->{'_pos'} || $self; + + if(DEBUG) { + my @lineage_tags = $pos->lineage_tag_names; + my $indent = ' ' x (1 + @lineage_tags); + + my $nugget = (length($text) <= 25) ? $text : (substr($text,0,25) . '...'); + $nugget =~ s<([\x00-\x1F])> + <'\\x'.(unpack("H2",$1))>eg; + print + $indent, "Proposing a Declaration ($nugget) under ", + join('/', reverse($pos->{'_tag'}, @lineage_tags)) || 'Root', + ".\n"; + } + (my $e = ( + $self->{'_element_class'} || 'HTML::Element' + )->new('~declaration'))->{'text'} = $text; + + $self->{_decl} = $e; + return $e; +} + +#========================================================================== + +sub process { + return if $_[0]{'_stunted'}; + # Accept a "here's a PI" signal from HTML::Parser. + + return unless $_[0]->{'_store_pis'}; + my($self, $text) = @_; + my $pos = $self->{'_pos'} || $self; + + if(DEBUG) { + my @lineage_tags = $pos->lineage_tag_names; + my $indent = ' ' x (1 + @lineage_tags); + + my $nugget = (length($text) <= 25) ? $text : (substr($text,0,25) . '...'); + $nugget =~ s<([\x00-\x1F])> + <'\\x'.(unpack("H2",$1))>eg; + print + $indent, "Proposing a PI ($nugget) under ", + join('/', reverse($pos->{'_tag'}, @lineage_tags)) || 'Root', + ".\n"; + } + (my $e = ( + $self->{'_element_class'} || 'HTML::Element' + )->new('~pi'))->{'text'} = $text; + $pos->push_content($e); + ++($self->{'_element_count'}); + + &{ $self->{'_tweak_~pi'} + || $self->{'_tweak_*'} + || return $e + }(map $_, $e, '~pi', $self); + + return $e; +} + + +#========================================================================== + +#When you call $tree->parse_file($filename), and the +#tree's ignore_ignorable_whitespace attribute is on (as it is +#by default), HTML::TreeBuilder's logic will manage to avoid +#creating some, but not all, nodes that represent ignorable +#whitespace. However, at the end of its parse, it traverses the +#tree and deletes any that it missed. (It does this with an +#around-method around HTML::Parser's eof method.) +# +#However, with $tree->parse($content), the cleanup-traversal step +#doesn't happen automatically -- so when you're done parsing all +#content for a document (regardless of whether $content is the only +#bit, or whether it's just another chunk of content you're parsing into +#the tree), call $tree->eof() to signal that you're at the end of the +#text you're inputting to the tree. Besides properly cleaning any bits +#of ignorable whitespace from the tree, this will also ensure that +#HTML::Parser's internal buffer is flushed. + +sub eof { + # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user. + + return if $_[0]->{'_done'}; # we've already been here + + return $_[0]->SUPER::eof() if $_[0]->{'_stunted'}; + + my $x = $_[0]; + print "EOF received.\n" if DEBUG; + my(@rv); + if(wantarray) { + # I don't think this makes any difference for this particular + # method, but let's be scrupulous, for once. + @rv = $x->SUPER::eof(); + } else { + $rv[0] = $x->SUPER::eof(); + } + + $x->end('html') unless $x eq ($x->{'_pos'} || $x); + # That SHOULD close everything, and will run the appropriate tweaks. + # We /could/ be running under some insane mode such that there's more + # than one HTML element, but really, that's just insane to do anyhow. + + unless($x->{'_implicit_tags'}) { + # delete those silly implicit head and body in case we put + # them there in implicit tags mode + foreach my $node ($x->{'_head'}, $x->{'_body'}) { + $node->replace_with_content + if defined $node and ref $node + and $node->{'_implicit'} and $node->{'_parent'}; + # I think they should be empty anyhow, since the only + # logic that'd insert under them can apply only, I think, + # in the case where _implicit_tags is on + } + # this may still leave an implicit 'html' at the top, but there's + # nothing we can do about that, is there? + } + + $x->delete_ignorable_whitespace() + # this's why we trap this -- an after-method + if $x->{'_tighten'} and ! $x->{'_ignore_text'}; + $x->{'_done'} = 1; + + return @rv if wantarray; + return $rv[0]; +} + +#========================================================================== + +# TODO: document + +sub stunt { + my $self = $_[0]; + print "Stunting the tree.\n" if DEBUG; + $self->{'_done'} = 1; + + if($HTML::Parser::VERSION < 3) { + #This is a MEAN MEAN HACK. And it works most of the time! + $self->{'_buf'} = ''; + my $fh = *HTML::Parser::F{IO}; + # the local'd FH used by parse_file loop + if(defined $fh) { + print "Closing Parser's filehandle $fh\n" if DEBUG; + close($fh); + } + + # But if they called $tree->parse_file($filehandle) + # or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO} + # to close. Ahwell. Not a problem for most users these days. + + } else { + $self->SUPER::eof(); + # Under 3+ versions, calling eof from inside a parse will abort the + # parse / parse_file + } + + # In the off chance that the above didn't work, we'll throw + # this flag to make any future events be no-ops. + $self->stunted(1); + return; +} + +# TODO: document +sub stunted { shift->_elem('_stunted', @_); } +sub done { shift->_elem('_done', @_); } + +#========================================================================== + +sub delete { + # Override Element's delete method. + # This does most, if not all, of what Element's delete does anyway. + # Deletes content, including content in some special attributes. + # But doesn't empty out the hash. + + $_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct + + delete @{$_[0]}{'_body', '_head', '_pos'}; + for (@{ delete($_[0]->{'_content'}) + || [] + }, # all/any content +# delete @{$_[0]}{'_body', '_head', '_pos'} + # ...and these, in case these elements don't appear in the + # content, which is possible. If they did appear (as they + # usually do), then calling $_->delete on them again is harmless. +# I don't think that's such a hot idea now. Thru creative reattachment, +# those could actually now point to elements in OTHER trees (which we do +# NOT want to delete!). +## Reasoned out: +# If these point to elements not in the content list of any element in this +# tree, but not in the content list of any element in any OTHER tree, then +# just deleting these will make their refcounts hit zero. +# If these point to elements in the content lists of elements in THIS tree, +# then we'll get to deleting them when we delete from the top. +# If these point to elements in the content lists of elements in SOME OTHER +# tree, then they're not to be deleted. + ) + { + $_->delete + if defined $_ and ref $_ # Make sure it's an object. + and $_ ne $_[0]; # And avoid hitting myself, just in case! + } + + $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'}; + # An 'html' element having a parent is quite unlikely. + + return undef; +} + +sub tighten_up { # legacy + shift->delete_ignorable_whitespace(@_); +} + +sub elementify { + # Rebless this object down into the normal element class. + my $self = $_[0]; + my $to_class = ($self->{'_element_class'} || 'HTML::Element'); + delete @{$self}{ grep {; + length $_ and substr($_,0,1) eq '_' + # The private attributes that we'll retain: + and $_ ne '_tag' and $_ ne '_parent' and $_ ne '_content' + and $_ ne '_implicit' and $_ ne '_pos' + and $_ ne '_element_class' + } keys %$self }; + bless $self, $to_class; # Returns the same object we were fed +} + +#-------------------------------------------------------------------------- + +sub guts { + my @out; + my @stack = ($_[0]); + my $destructive = $_[1]; + my $this; + while(@stack) { + $this = shift @stack; + if(!ref $this) { + push @out, $this; # yes, it can include text nodes + } elsif(! $this->{'_implicit'}) { + push @out, $this; + delete $this->{'_parent'} if $destructive; + } else { + # it's an implicit node. Delete it and recurse + delete $this->{'_parent'} if $destructive; + unshift @stack, @{ + ( $destructive ? + delete($this->{'_content'}) + : $this->{'_content'} + ) + || [] + }; + } + } + # Doesn't call a real $root->delete on the (when implicit) root, + # but I don't think it needs to. + + return @out if wantarray; # one simple normal case. + return undef unless @out; + return $out[0] if @out == 1 and ref($out[0]); + my $x = HTML::Element->new('div', '_implicit' => 1); + $x->push_content(@out); + return $x; +} + +sub disembowel { $_[0]->guts(1) } + +#-------------------------------------------------------------------------- +1; + +__END__ + +=head1 NAME + +HTML::TreeBuilder - Parser that builds a HTML syntax tree + +=head1 SYNOPSIS + + foreach my $file_name (@ARGV) { + my $tree = HTML::TreeBuilder->new; # empty tree + $tree->parse_file($file_name); + print "Hey, here's a dump of the parse tree of $file_name:\n"; + $tree->dump; # a method we inherit from HTML::Element + print "And here it is, bizarrely rerendered as HTML:\n", + $tree->as_HTML, "\n"; + + # Now that we're done with it, we must destroy it. + $tree = $tree->delete; + } + +=head1 DESCRIPTION + +(This class is part of the L dist.) + +This class is for HTML syntax trees that get built out of HTML +source. The way to use it is to: + +1. start a new (empty) HTML::TreeBuilder object, + +2. then use one of the methods from HTML::Parser (presumably with +$tree->parse_file($filename) for files, or with +$tree->parse($document_content) and $tree->eof if you've got +the content in a string) to parse the HTML +document into the tree $tree. + +(You can combine steps 1 and 2 with the "new_from_file" or +"new_from_content" methods.) + +2b. call $root-Eelementify() if you want. + +3. do whatever you need to do with the syntax tree, presumably +involving traversing it looking for some bit of information in it, + +4. and finally, when you're done with the tree, call $tree->delete() to +erase the contents of the tree from memory. This kind of thing +usually isn't necessary with most Perl objects, but it's necessary for +TreeBuilder objects. See L for a more verbose +explanation of why this is the case. + +=head1 METHODS AND ATTRIBUTES + +Objects of this class inherit the methods of both HTML::Parser and +HTML::Element. The methods inherited from HTML::Parser are used for +building the HTML tree, and the methods inherited from HTML::Element +are what you use to scrutinize the tree. Besides this +(HTML::TreeBuilder) documentation, you must also carefully read the +HTML::Element documentation, and also skim the HTML::Parser +documentation -- probably only its parse and parse_file methods are of +interest. + +Most of the following methods native to HTML::TreeBuilder control how +parsing takes place; they should be set I you try parsing into +the given object. You can set the attributes by passing a TRUE or +FALSE value as argument. E.g., $root->implicit_tags returns the current +setting for the implicit_tags option, $root->implicit_tags(1) turns that +option on, and $root->implicit_tags(0) turns it off. + +=over 4 + +=item $root = HTML::TreeBuilder->new_from_file(...) + +This "shortcut" constructor merely combines constructing a new object +(with the "new" method, below), and calling $new->parse_file(...) on +it. Returns the new object. Note that this provides no way of +setting any parse options like store_comments (for that, call new, and +then set options, before calling parse_file). See the notes (below) +on parameters to parse_file. + +=item $root = HTML::TreeBuilder->new_from_content(...) + +This "shortcut" constructor merely combines constructing a new object +(with the "new" method, below), and calling for(...){$new->parse($_)} +and $new->eof on it. Returns the new object. Note that this provides +no way of setting any parse options like store_comments (for that, +call new, and then set options, before calling parse_file). Example +usages: HTML::TreeBuilder->new_from_content(@lines), or +HTML::TreeBuilder->new_from_content($content) + +=item $root = HTML::TreeBuilder->new() + +This creates a new HTML::TreeBuilder object. This method takes no +attributes. + +=item $root->parse_file(...) + +[An important method inherited from L, which +see. Current versions of HTML::Parser can take a filespec, or a +filehandle object, like *FOO, or some object from class IO::Handle, +IO::File, IO::Socket) or the like. +I think you should check that a given file exists I calling +$root->parse_file($filespec).] + +=item $root->parse(...) + +[A important method inherited from L, which +see. See the note below for $root->eof().] + +=item $root->eof() + +This signals that you're finished parsing content into this tree; this +runs various kinds of crucial cleanup on the tree. This is called +I when you call $root->parse_file(...), but not when +you call $root->parse(...). So if you call +$root->parse(...), then you I call $root->eof() +once you've finished feeding all the chunks to parse(...), and +before you actually start doing anything else with the tree in C<$root>. + +=item C<< $root->parse_content(...) >> + +Basically a happly alias for C<< $root->parse(...); $root->eof >>. +Takes the exact same arguments as C<< $root->parse() >>. + +=item $root->delete() + +[An important method inherited from L, which +see.] + +=item $root->elementify() + +This changes the class of the object in $root from +HTML::TreeBuilder to the class used for all the rest of the elements +in that tree (generally HTML::Element). Returns $root. + +For most purposes, this is unnecessary, but if you call this after +(after!!) +you've finished building a tree, then it keeps you from accidentally +trying to call anything but HTML::Element methods on it. (I.e., if +you accidentally call C<$root-Eparse_file(...)> on the +already-complete and elementified tree, then instead of charging ahead +and I, it'll throw a fatal error -- since C<$root> is +now an object just of class HTML::Element which has no C +method. + +Note that elementify currently deletes all the private attributes of +$root except for "_tag", "_parent", "_content", "_pos", and +"_implicit". If anyone requests that I change this to leave in yet +more private attributes, I might do so, in future versions. + +=item @nodes = $root->guts() + +=item $parent_for_nodes = $root->guts() + +In list context (as in the first case), this method returns the topmost +non-implicit nodes in a tree. This is useful when you're parsing HTML +code that you know doesn't expect an HTML document, but instead just +a fragment of an HTML document. For example, if you wanted the parse +tree for a file consisting of just this: + +
  • I like pie! + +Then you would get that with C<< @nodes = $root->guts(); >>. +It so happens that in this case, C<@nodes> will contain just one +element object, representing the "li" node (with "I like pie!" being +its text child node). However, consider if you were parsing this: + +
    Hooboy!
    + +In that case, C<< $root->guts() >> would return three items: +an element object for the first "hr", a text string "Hooboy!", and +another "hr" element object. + +For cases where you want definitely one element (so you can treat it as +a "document fragment", roughly speaking), call C in scalar +context, as in C<< $parent_for_nodes = $root->guts() >>. That works like +C in list context; in fact, C in list context would +have returned exactly one value, and if it would have been an object (as +opposed to a text string), then that's what C in scalar context +will return. Otherwise, if C in list context would have returned +no values at all, then C in scalar context returns undef. In +all other cases, C in scalar context returns an implicit 'div' +element node, with children consisting of whatever nodes C +in list context would have returned. Note that that may detach those +nodes from C<$root>'s tree. + +=item @nodes = $root->disembowel() + +=item $parent_for_nodes = $root->disembowel() + +The C method works just like the C method, except +that disembowel definitively destroys the tree above the nodes that +are returned. Usually when you want the guts from a tree, you're just +going to toss out the rest of the tree anyway, so this saves you the +bother. (Remember, "disembowel" means "remove the guts from".) + +=item $root->implicit_tags(value) + +Setting this attribute to true will instruct the parser to try to +deduce implicit elements and implicit end tags. If it is false you +get a parse tree that just reflects the text as it stands, which is +unlikely to be useful for anything but quick and dirty parsing. +(In fact, I'd be curious to hear from anyone who finds it useful to +have implicit_tags set to false.) +Default is true. + +Implicit elements have the implicit() attribute set. + +=item $root->implicit_body_p_tag(value) + +This controls an aspect of implicit element behavior, if implicit_tags +is on: If a text element (PCDATA) or a phrasal element (such as +"EemE") is to be inserted under "EbodyE", two things +can happen: if implicit_body_p_tag is true, it's placed under a new, +implicit "EpE" tag. (Past DTDs suggested this was the only +correct behavior, and this is how past versions of this module +behaved.) But if implicit_body_p_tag is false, nothing is implicated +-- the PCDATA or phrasal element is simply placed under +"EbodyE". Default is false. + +=item $root->ignore_unknown(value) + +This attribute controls whether unknown tags should be represented as +elements in the parse tree, or whether they should be ignored. +Default is true (to ignore unknown tags.) + +=item $root->ignore_text(value) + +Do not represent the text content of elements. This saves space if +all you want is to examine the structure of the document. Default is +false. + +=item $root->ignore_ignorable_whitespace(value) + +If set to true, TreeBuilder will try to avoid +creating ignorable whitespace text nodes in the tree. Default is +true. (In fact, I'd be interested in hearing if there's ever a case +where you need this off, or where leaving it on leads to incorrect +behavior.) + +=item $root->no_space_compacting(value) + +This determines whether TreeBuilder compacts all whitespace strings +in the document (well, outside of PRE or TEXTAREA elements), or +leaves them alone. Normally (default, value of 0), each string of +contiguous whitespace in the document is turned into a single space. +But that's not done if no_space_compacting is set to 1. + +Setting no_space_compacting to 1 might be useful if you want +to read in a tree just to make some minor changes to it before +writing it back out. + +This method is experimental. If you use it, be sure to report +any problems you might have with it. + +=item $root->p_strict(value) + +If set to true (and it defaults to false), TreeBuilder will take a +narrower than normal view of what can be under a "p" element; if it sees +a non-phrasal element about to be inserted under a "p", it will close that +"p". Otherwise it will close p elements only for other "p"'s, headings, +and "form" (altho the latter may be removed in future versions). + +For example, when going thru this snippet of code, + +

    stuff +

      + +TreeBuilder will normally (with C false) put the "ul" element +under the "p" element. However, with C set to true, it will +close the "p" first. + +In theory, there should be strictness options like this for other/all +elements besides just "p"; but I treat this as a specal case simply +because of the fact that "p" occurs so frequently and its end-tag is +omitted so often; and also because application of strictness rules +at parse-time across all elements often makes tiny errors in HTML +coding produce drastically bad parse-trees, in my experience. + +If you find that you wish you had an option like this to enforce +content-models on all elements, then I suggest that what you want is +content-model checking as a stage after TreeBuilder has finished +parsing. + +=item $root->store_comments(value) + +This determines whether TreeBuilder will normally store comments found +while parsing content into C<$root>. Currently, this is off by default. + +=item $root->store_declarations(value) + +This determines whether TreeBuilder will normally store markup +declarations found while parsing content into C<$root>. This is on +by default. + +=item $root->store_pis(value) + +This determines whether TreeBuilder will normally store processing +instructions found while parsing content into C<$root> -- assuming a +recent version of HTML::Parser (old versions won't parse PIs +correctly). Currently, this is off (false) by default. + +It is somewhat of a known bug (to be fixed one of these days, if +anyone needs it?) that PIs in the preamble (before the "html" +start-tag) end up actually I the "html" element. + +=item $root->warn(value) + +This determines whether syntax errors during parsing should generate +warnings, emitted via Perl's C function. + +This is off (false) by default. + +=back + +=head1 HTML AND ITS DISCONTENTS + +HTML is rather harder to parse than people who write it generally +suspect. + +Here's the problem: HTML is a kind of SGML that permits "minimization" +and "implication". In short, this means that you don't have to close +every tag you open (because the opening of a subsequent tag may +implicitly close it), and if you use a tag that can't occur in the +context you seem to using it in, under certain conditions the parser +will be able to realize you mean to leave the current context and +enter the new one, that being the only one that your code could +correctly be interpreted in. + +Now, this would all work flawlessly and unproblematically if: 1) all +the rules that both prescribe and describe HTML were (and had been) +clearly set out, and 2) everyone was aware of these rules and wrote +their code in compliance to them. + +However, it didn't happen that way, and so most HTML pages are +difficult if not impossible to correctly parse with nearly any set of +straightforward SGML rules. That's why the internals of +HTML::TreeBuilder consist of lots and lots of special cases -- instead +of being just a generic SGML parser with HTML DTD rules plugged in. + +=head1 TRANSLATIONS? + +The techniques that HTML::TreeBuilder uses to perform what I consider +very robust parses on everyday code are not things that can work only +in Perl. To date, the algorithms at the center of HTML::TreeBuilder +have been implemented only in Perl, as far as I know; and I don't +foresee getting around to implementing them in any other language any +time soon. + +If, however, anyone is looking for a semester project for an applied +programming class (or if they merely enjoy I +masochism), they might do well to see about choosing as a topic the +implementation/adaptation of these routines to any other interesting +programming language that you feel currently suffers from a lack of +robust HTML-parsing. I welcome correspondence on this subject, and +point out that one can learn a great deal about languages by trying to +translate between them, and then comparing the result. + +The HTML::TreeBuilder source may seem long and complex, but it is +rather well commented, and symbol names are generally +self-explanatory. (You are encouraged to read the Mozilla HTML parser +source for comparison.) Some of the complexity comes from little-used +features, and some of it comes from having the HTML tokenizer +(HTML::Parser) being a separate module, requiring somewhat of a +different interface than you'd find in a combined tokenizer and +tree-builder. But most of the length of the source comes from the fact +that it's essentially a long list of special cases, with lots and lots +of sanity-checking, and sanity-recovery -- because, as Roseanne +Rosannadanna once said, "it's always I". + +Users looking to compare several HTML parsers should look at the +source for Raggett's Tidy +(Chttp://www.w3.org/People/Raggett/tidy/E>), +Mozilla +(Chttp://www.mozilla.org/E>), +and possibly root around the browsers section of Yahoo +to find the various open-source ones +(Chttp://dir.yahoo.com/Computers_and_Internet/Software/Internet/World_Wide_Web/Browsers/E>). + +=head1 BUGS + +* Framesets seem to work correctly now. Email me if you get a strange +parse from a document with framesets. + +* Really bad HTML code will, often as not, make for a somewhat +objectionable parse tree. Regrettable, but unavoidably true. + +* If you're running with implicit_tags off (God help you!), consider +that $tree->content_list probably contains the tree or grove from the +parse, and not $tree itself (which will, oddly enough, be an implicit +'html' element). This seems counter-intuitive and problematic; but +seeing as how almost no HTML ever parses correctly with implicit_tags +off, this interface oddity seems the least of your problems. + +=head1 BUG REPORTS + +When a document parses in a way different from how you think it +should, I ask that you report this to me as a bug. The first thing +you should do is copy the document, trim out as much of it as you can +while still producing the bug in question, and I email me that +mini-document I the code you're using to parse it, to the HTML::Tree +bug queue at C. + +Include a note as to how it +parses (presumably including its $tree->dump output), and then a +I explanation of where you think the parser is +going astray, and how you would prefer that it work instead. + +=head1 SEE ALSO + +L; L, L, L + +L + +=head1 COPYRIGHT + +Copyright 1995-1998 Gisle Aas, 1999-2004 Sean M. Burke, 2005 Andy Lester, +2006 Pete Krawczyk. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +This program is distributed in the hope that it will be useful, but +without any warranty; without even the implied warranty of +merchantability or fitness for a particular purpose. + +=head1 AUTHOR + +Currently maintained by Pete Krawczyk C<< >> + +Original authors: Gisle Aas, Sean Burke and Andy Lester. + +=cut