git.maemo.org Git - dh-make-perl/blob - dev/arm/libhtml-tree-perl/libhtml-tree-perl-3.23/debian/libhtml-tree-perl/usr/share/perl5/HTML/Element.pm

   1 package HTML::Element;
   2
   3 =head1 NAME
   4
   5 HTML::Element - Class for objects that represent HTML elements
   6
   7 =head1 VERSION
   8
   9 Version 3.23
  10
  11 =cut
  12
  13 use vars qw( $VERSION );
  14 $VERSION = '3.23';
  15
  16 =head1 SYNOPSIS
  17
  18     use HTML::Element;
  19     $a = HTML::Element->new('a', href => 'http://www.perl.com/');
  20     $a->push_content("The Perl Homepage");
  21
  22     $tag = $a->tag;
  23     print "$tag starts out as:",  $a->starttag, "\n";
  24     print "$tag ends as:",  $a->endtag, "\n";
  25     print "$tag\'s href attribute is: ", $a->attr('href'), "\n";
  26
  27     $links_r = $a->extract_links();
  28     print "Hey, I found ", scalar(@$links_r), " links.\n";
  29
  30     print "And that, as HTML, is: ", $a->as_HTML, "\n";
  31     $a = $a->delete;
  32
  33 =head1 DESCRIPTION
  34
  35 (This class is part of the L<HTML::Tree|HTML::Tree> dist.)
  36
  37 Objects of the HTML::Element class can be used to represent elements
  38 of HTML document trees.  These objects have attributes, notably attributes that
  39 designates each element's parent and content.  The content is an array
  40 of text segments and other HTML::Element objects.  A tree with HTML::Element
  41 objects as nodes can represent the syntax tree for a HTML document.
  42
  43 =head1 HOW WE REPRESENT TREES
  44
  45 Consider this HTML document:
  46
  47   <html lang='en-US'>
  48     <head>
  49       <title>Stuff</title>
  50       <meta name='author' content='Jojo'>
  51     </head>
  52     <body>
  53      <h1>I like potatoes!</h1>
  54     </body>
  55   </html>
  56
  57 Building a syntax tree out of it makes a tree-structure in memory
  58 that could be diagrammed as:
  59
  60                      html (lang='en-US')
  61                       / \
  62                     /     \
  63                   /         \
  64                 head        body
  65                /\               \
  66              /    \               \
  67            /        \               \
  68          title     meta              h1
  69           |       (name='author',     |
  70        "Stuff"    content='Jojo')    "I like potatoes"
  71
  72 This is the traditional way to diagram a tree, with the "root" at the
  73 top, and it's this kind of diagram that people have in mind when they
  74 say, for example, that "the meta element is under the head element
  75 instead of under the body element".  (The same is also said with
  76 "inside" instead of "under" -- the use of "inside" makes more sense
  77 when you're looking at the HTML source.)
  78
  79 Another way to represent the above tree is with indenting:
  80
  81   html (attributes: lang='en-US')
  82     head
  83       title
  84         "Stuff"
  85       meta (attributes: name='author' content='Jojo')
  86     body
  87       h1
  88         "I like potatoes"
  89
  90 Incidentally, diagramming with indenting works much better for very
  91 large trees, and is easier for a program to generate.  The C<< $tree->dump >>
  92 method uses indentation just that way.
  93
  94 However you diagram the tree, it's stored the same in memory -- it's a
  95 network of objects, each of which has attributes like so:
  96
  97   element #1:  _tag: 'html'
  98                _parent: none
  99                _content: [element #2, element #5]
 100                lang: 'en-US'
 101
 102   element #2:  _tag: 'head'
 103                _parent: element #1
 104                _content: [element #3, element #4]
 105
 106   element #3:  _tag: 'title'
 107                _parent: element #2
 108                _content: [text segment "Stuff"]
 109
 110   element #4   _tag: 'meta'
 111                _parent: element #2
 112                _content: none
 113                name: author
 114                content: Jojo
 115
 116   element #5   _tag: 'body'
 117                _parent: element #1
 118                _content: [element #6]
 119
 120   element #6   _tag: 'h1'
 121                _parent: element #5
 122                _content: [text segment "I like potatoes"]
 123
 124 The "treeness" of the tree-structure that these elements comprise is
 125 not an aspect of any particular object, but is emergent from the
 126 relatedness attributes (_parent and _content) of these element-objects
 127 and from how you use them to get from element to element.
 128
 129 While you could access the content of a tree by writing code that says
 130 "access the 'src' attribute of the root's I<first> child's I<seventh>
 131 child's I<third> child", you're more likely to have to scan the contents
 132 of a tree, looking for whatever nodes, or kinds of nodes, you want to
 133 do something with.  The most straightforward way to look over a tree
 134 is to "traverse" it; an HTML::Element method (C<< $h->traverse >>) is
 135 provided for this purpose; and several other HTML::Element methods are
 136 based on it.
 137
 138 (For everything you ever wanted to know about trees, and then some,
 139 see Niklaus Wirth's I<Algorithms + Data Structures = Programs> or
 140 Donald Knuth's I<The Art of Computer Programming, Volume 1>.)
 141
 142 =cut
 143
 144
 145 use strict;
 146 use Carp ();
 147 use HTML::Entities ();
 148 use HTML::Tagset ();
 149 use integer; # vroom vroom!
 150
 151 use vars qw($html_uc $Debug $ID_COUNTER %list_type_to_sub);
 152
 153 $Debug = 0 unless defined $Debug;
 154 sub Version { $VERSION; }
 155
 156 my $nillio = [];
 157
 158 *HTML::Element::emptyElement = \%HTML::Tagset::emptyElement; # legacy
 159 *HTML::Element::optionalEndTag = \%HTML::Tagset::optionalEndTag; # legacy
 160 *HTML::Element::linkElements = \%HTML::Tagset::linkElements; # legacy
 161 *HTML::Element::boolean_attr = \%HTML::Tagset::boolean_attr; # legacy
 162 *HTML::Element::canTighten = \%HTML::Tagset::canTighten; # legacy
 163
 164 # Constants for signalling back to the traverser:
 165 my $travsignal_package = __PACKAGE__ . '::_travsignal';
 166 my(
 167   $ABORT, $PRUNE, $PRUNE_SOFTLY, $OK, $PRUNE_UP
 168 ) =
 169   map
 170    {my $x = $_ ; bless \$x, $travsignal_package;}
 171    qw(
 172      ABORT  PRUNE   PRUNE_SOFTLY   OK   PRUNE_UP
 173    )
 174 ;
 175 sub ABORT           () {$ABORT}
 176 sub PRUNE           () {$PRUNE}
 177 sub PRUNE_SOFTLY    () {$PRUNE_SOFTLY}
 178 sub OK              () {$OK}
 179 sub PRUNE_UP        () {$PRUNE_UP}
 180
 181 $html_uc = 0;
 182 # set to 1 if you want tag and attribute names from starttag and endtag
 183 #  to be uc'd
 184
 185 # Elements that does not have corresponding end tags (i.e. are empty)
 186
 187 #==========================================================================
 188
 189
 190 =head1 BASIC METHODS
 191
 192 =head2 $h = HTML::Element->new('tag', 'attrname' => 'value', ... )
 193
 194 This constructor method returns a new HTML::Element object.  The tag
 195 name is a required argument; it will be forced to lowercase.
 196 Optionally, you can specify other initial attributes at object
 197 creation time.
 198
 199 =cut
 200
 201 #
 202 # An HTML::Element is represented by blessed hash reference, much like
 203 # Tree::DAG_Node objects.  Key-names not starting with '_' are reserved
 204 # for the SGML attributes of the element.
 205 # The following special keys are used:
 206 #
 207 #    '_tag':    The tag name (i.e., the generic identifier)
 208 #    '_parent': A reference to the HTML::Element above (when forming a tree)
 209 #    '_pos':    The current position (a reference to a HTML::Element) is
 210 #               where inserts will be placed (look at the insert_element
 211 #               method)  If not set, the implicit value is the object itself.
 212 #    '_content': A ref to an array of nodes under this.
 213 #                It might not be set.
 214 #
 215 # Example: <img src="gisle.jpg" alt="Gisle's photo"> is represented like this:
 216 #
 217 #  bless {
 218 #     _tag => 'img',
 219 #     src  => 'gisle.jpg',
 220 #     alt  => "Gisle's photo",
 221 #  }, 'HTML::Element';
 222 #
 223
 224 sub new {
 225     my $class = shift;
 226     $class = ref($class) || $class;
 227
 228     my $tag   = shift;
 229     Carp::croak("No tagname") unless defined $tag and length $tag;
 230     Carp::croak "\"$tag\" isn't a good tag name!"
 231      if $tag =~ m/[<>\/\x00-\x20]/; # minimal sanity, certainly!
 232     my $self  = bless { _tag => scalar($class->_fold_case($tag)) }, $class;
 233     my($attr, $val);
 234     while (($attr, $val) = splice(@_, 0, 2)) {
 235         $val = $attr unless defined $val;
 236         $self->{$class->_fold_case($attr)} = $val;
 237     }
 238     if ($tag eq 'html') {
 239         $self->{'_pos'} = undef;
 240     }
 241     return $self;
 242 }
 243
 244
 245 =head2 $h->attr('attr') or $h->attr('attr', 'value')
 246
 247 Returns (optionally sets) the value of the given attribute of $h.  The
 248 attribute name (but not the value, if provided) is forced to
 249 lowercase.  If trying to read the value of an attribute not present
 250 for this element, the return value is undef.
 251 If setting a new value, the old value of that attribute is
 252 returned.
 253
 254 If methods are provided for accessing an attribute (like C<< $h->tag >> for
 255 "_tag", C<< $h->content_list >>, etc. below), use those instead of calling
 256 attr C<< $h->attr >>, whether for reading or setting.
 257
 258 Note that setting an attribute to C<undef> (as opposed to "", the empty
 259 string) actually deletes the attribute.
 260
 261 =cut
 262
 263 sub attr {
 264     my $self = shift;
 265     my $attr = scalar($self->_fold_case(shift));
 266     if (@_) {  # set
 267         if(defined $_[0]) {
 268             my $old = $self->{$attr};
 269             $self->{$attr} = $_[0];
 270             return $old;
 271         }
 272         else {  # delete, actually
 273             return delete $self->{$attr};
 274         }
 275     }
 276     else {   # get
 277         return $self->{$attr};
 278     }
 279 }
 280
 281
 282 =head2 $h->tag() or $h->tag('tagname')
 283
 284 Returns (optionally sets) the tag name (also known as the generic
 285 identifier) for the element $h.  In setting, the tag name is always
 286 converted to lower case.
 287
 288 There are four kinds of "pseudo-elements" that show up as
 289 HTML::Element objects:
 290
 291 =over
 292
 293 =item Comment pseudo-elements
 294
 295 These are element objects with a C<$h-E<gt>tag> value of "~comment",
 296 and the content of the comment is stored in the "text" attribute
 297 (C<$h-E<gt>attr("text")>).  For example, parsing this code with
 298 HTML::TreeBuilder...
 299
 300   <!-- I like Pie.
 301      Pie is good
 302   -->
 303
 304 produces an HTML::Element object with these attributes:
 305
 306   "_tag",
 307   "~comment",
 308   "text",
 309   " I like Pie.\n     Pie is good\n  "
 310
 311 =item Declaration pseudo-elements
 312
 313 Declarations (rarely encountered) are represented as HTML::Element
 314 objects with a tag name of "~declaration", and content in the "text"
 315 attribute.  For example, this:
 316
 317   <!DOCTYPE foo>
 318
 319 produces an element whose attributes include:
 320
 321   "_tag", "~declaration", "text", "DOCTYPE foo"
 322
 323 =item Processing instruction pseudo-elements
 324
 325 PIs (rarely encountered) are represented as HTML::Element objects with
 326 a tag name of "~pi", and content in the "text" attribute.  For
 327 example, this:
 328
 329   <?stuff foo?>
 330
 331 produces an element whose attributes include:
 332
 333   "_tag", "~pi", "text", "stuff foo?"
 334
 335 (assuming a recent version of HTML::Parser)
 336
 337 =item ~literal pseudo-elements
 338
 339 These objects are not currently produced by HTML::TreeBuilder, but can
 340 be used to represent a "super-literal" -- i.e., a literal you want to
 341 be immune from escaping.  (Yes, I just made that term up.)
 342
 343 That is, this is useful if you want to insert code into a tree that
 344 you plan to dump out with C<as_HTML>, where you want, for some reason,
 345 to suppress C<as_HTML>'s normal behavior of amp-quoting text segments.
 346
 347 For example, this:
 348
 349   my $literal = HTML::Element->new('~literal',
 350     'text' => 'x < 4 & y > 7'
 351   );
 352   my $span = HTML::Element->new('span');
 353   $span->push_content($literal);
 354   print $span->as_HTML;
 355
 356 prints this:
 357
 358   <span>x < 4 & y > 7</span>
 359
 360 Whereas this:
 361
 362   my $span = HTML::Element->new('span');
 363   $span->push_content('x < 4 & y > 7');
 364     # normal text segment
 365   print $span->as_HTML;
 366
 367 prints this:
 368
 369   <span>x &lt; 4 &amp; y &gt; 7</span>
 370
 371 Unless you're inserting lots of pre-cooked code into existing trees,
 372 and dumping them out again, it's not likely that you'll find
 373 C<~literal> pseudo-elements useful.
 374
 375 =back
 376
 377 =cut
 378
 379 sub tag {
 380     my $self = shift;
 381     if (@_) { # set
 382         $self->{'_tag'} = $self->_fold_case($_[0]);
 383     }
 384     else { # get
 385         $self->{'_tag'};
 386     }
 387 }
 388
 389
 390 =head2 $h->parent() or $h->parent($new_parent)
 391
 392 Returns (optionally sets) the parent (aka "container") for this element.
 393 The parent should either be undef, or should be another element.
 394
 395 You B<should not> use this to directly set the parent of an element.
 396 Instead use any of the other methods under "Structure-Modifying
 397 Methods", below.
 398
 399 Note that not($h->parent) is a simple test for whether $h is the
 400 root of its subtree.
 401
 402 =cut
 403
 404 sub parent {
 405     my $self = shift;
 406     if (@_) { # set
 407         Carp::croak "an element can't be made its own parent"
 408          if defined $_[0] and ref $_[0] and $self eq $_[0]; # sanity
 409         $self->{'_parent'} = $_[0];
 410     }
 411     else {
 412         $self->{'_parent'}; # get
 413     }
 414 }
 415
 416
 417 =head2 $h->content_list()
 418
 419 Returns a list of the child nodes of this element -- i.e., what
 420 nodes (elements or text segments) are inside/under this element. (Note
 421 that this may be an empty list.)
 422
 423 In a scalar context, this returns the count of the items,
 424 as you may expect.
 425
 426 =cut
 427
 428 sub content_list {
 429     return
 430       wantarray ?        @{shift->{'_content'} || return()}
 431                 : scalar @{shift->{'_content'} || return 0};
 432 }
 433
 434
 435 =head2 $h->content()
 436
 437 This somewhat deprecated method returns the content of this element;
 438 but unlike content_list, this returns either undef (which you should
 439 understand to mean no content), or a I<reference to the array> of
 440 content items, each of which is either a text segment (a string, i.e.,
 441 a defined non-reference scalar value), or an HTML::Element object.
 442 Note that even if an arrayref is returned, it may be a reference to an
 443 empty array.
 444
 445 While older code should feel free to continue to use C<< $h->content >>,
 446 new code should use C<< $h->content_list >> in almost all conceivable
 447 cases.  It is my experience that in most cases this leads to simpler
 448 code anyway, since it means one can say:
 449
 450     @children = $h->content_list;
 451
 452 instead of the inelegant:
 453
 454     @children = @{$h->content || []};
 455
 456 If you do use C<< $h->content >> (or C<< $h->content_array_ref >>), you should not
 457 use the reference returned by it (assuming it returned a reference,
 458 and not undef) to directly set or change the content of an element or
 459 text segment!  Instead use L<content_refs_list> or any of the other
 460 methods under "Structure-Modifying Methods", below.
 461
 462 =cut
 463
 464 # a read-only method!  can't say $h->content( [] )!
 465 sub content {
 466     return shift->{'_content'};
 467 }
 468
 469
 470 =head2 $h->content_array_ref()
 471
 472 This is like C<content> (with all its caveats and deprecations) except
 473 that it is guaranteed to return an array reference.  That is, if the
 474 given node has no C<_content> attribute, the C<content> method would
 475 return that undef, but C<content_array_ref> would set the given node's
 476 C<_content> value to C<[]> (a reference to a new, empty array), and
 477 return that.
 478
 479 =cut
 480
 481 sub content_array_ref {
 482     return shift->{'_content'} ||= [];
 483 }
 484
 485
 486 =head2 $h->content_refs_list
 487
 488 This returns a list of scalar references to each element of C<$h>'s
 489 content list.  This is useful in case you want to in-place edit any
 490 large text segments without having to get a copy of the current value
 491 of that segment value, modify that copy, then use the
 492 C<splice_content> to replace the old with the new.  Instead, here you
 493 can in-place edit:
 494
 495     foreach my $item_r ($h->content_refs_list) {
 496         next if ref $$item_r;
 497         $$item_r =~ s/honour/honor/g;
 498     }
 499
 500 You I<could> currently achieve the same affect with:
 501
 502     foreach my $item (@{ $h->content_array_ref }) {
 503         # deprecated!
 504         next if ref $item;
 505         $item =~ s/honour/honor/g;
 506     }
 507
 508 ...except that using the return value of C<< $h->content >> or
 509 C<< $h->content_array_ref >> to do that is deprecated, and just might stop
 510 working in the future.
 511
 512 =cut
 513
 514 sub content_refs_list {
 515     return \( @{ shift->{'_content'} || return() } );
 516 }
 517
 518
 519 =head2 $h->implicit() or $h->implicit($bool)
 520
 521 Returns (optionally sets) the "_implicit" attribute.  This attribute is
 522 a flag that's used for indicating that the element was not originally
 523 present in the source, but was added to the parse tree (by
 524 HTML::TreeBuilder, for example) in order to conform to the rules of
 525 HTML structure.
 526
 527 =cut
 528
 529 sub implicit {
 530     return shift->attr('_implicit', @_);
 531 }
 532
 533
 534 =head2 $h->pos() or $h->pos($element)
 535
 536 Returns (and optionally sets) the "_pos" (for "current I<pos>ition")
 537 pointer of C<$h>.  This attribute is a pointer used during some
 538 parsing operations, whose value is whatever HTML::Element element
 539 at or under C<$h> is currently "open", where C<< $h->insert_element(NEW) >>
 540 will actually insert a new element.
 541
 542 (This has nothing to do with the Perl function called "pos", for
 543 controlling where regular expression matching starts.)
 544
 545 If you set C<< $h->pos($element) >>, be sure that C<$element> is
 546 either C<$h>, or an element under C<$h>.
 547
 548 If you've been modifying the tree under C<$h> and are no longer
 549 sure C<< $h->pos >> is valid, you can enforce validity with:
 550
 551     $h->pos(undef) unless $h->pos->is_inside($h);
 552
 553 =cut
 554
 555 sub pos {
 556     my $self = shift;
 557     my $pos = $self->{'_pos'};
 558     if (@_) {  # set
 559         my $parm = shift;
 560         if(defined $parm and $parm ne $self) {
 561             $self->{'_pos'} = $parm; # means that element
 562         }
 563         else {
 564             $self->{'_pos'} = undef; # means $self
 565         }
 566     }
 567     return $pos if defined($pos);
 568     return $self;
 569 }
 570
 571
 572 =head2 $h->all_attr()
 573
 574 Returns all this element's attributes and values, as key-value pairs.
 575 This will include any "internal" attributes (i.e., ones not present
 576 in the original element, and which will not be represented if/when you
 577 call C<< $h->as_HTML >>).  Internal attributes are distinguished by the fact
 578 that the first character of their key (not value! key!) is an
 579 underscore ("_").
 580
 581 Example output of C<< $h->all_attr() >> :
 582 C<'_parent', >I<[object_value]>C< , '_tag', 'em', 'lang', 'en-US',
 583 '_content', >I<[array-ref value]>.
 584
 585 =head2 $h->all_attr_names()
 586
 587 Like all_attr, but only returns the names of the attributes.
 588
 589 Example output of C<< $h->all_attr_names() >> :
 590 C<'_parent', '_tag', 'lang', '_content', >.
 591
 592 =cut
 593
 594 sub all_attr {
 595   return %{$_[0]};
 596   # Yes, trivial.  But no other way for the user to do the same
 597   #  without breaking encapsulation.
 598   # And if our object representation changes, this method's behavior
 599   #  should stay the same.
 600 }
 601
 602 sub all_attr_names {
 603     return keys %{$_[0]};
 604 }
 605
 606
 607 =head2 $h->all_external_attr()
 608
 609 Like C<all_attr>, except that internal attributes are not present.
 610
 611 =head2 $h->all_external_attr_names()
 612
 613 Like C<all_external_attr_names>, except that internal attributes' names
 614 are not present.
 615
 616 =cut
 617
 618 sub all_external_attr {
 619   my $self = $_[0];
 620   return
 621     map(
 622         (length($_) && substr($_,0,1) eq '_') ? () : ($_, $self->{$_}),
 623         keys %$self
 624        );
 625 }
 626
 627 sub all_external_attr_names {
 628   return
 629     grep
 630       !(length($_) && substr($_,0,1) eq '_'),
 631       keys %{$_[0]}
 632   ;
 633 }
 634
 635
 636
 637 =head2 $h->id() or $h->id($string)
 638
 639 Returns (optionally sets to C<$string>) the "id" attribute.
 640 C<< $h->id(undef) >> deletes the "id" attribute.
 641
 642 =cut
 643
 644 sub id {
 645   if(@_ == 1) {
 646     return $_[0]{'id'};
 647   } elsif(@_ == 2) {
 648     if(defined $_[1]) {
 649       return $_[0]{'id'} = $_[1];
 650     } else {
 651       return delete $_[0]{'id'};
 652     }
 653   } else {
 654     Carp::croak '$node->id can\'t take ' . scalar(@_) . ' parameters!';
 655   }
 656 }
 657
 658
 659 =head2 $h->idf() or $h->idf($string)
 660
 661 Just like the C<id> method, except that if you call C<< $h->idf() >> and
 662 no "id" attribute is defined for this element, then it's set to a
 663 likely-to-be-unique value, and returned.  (The "f" is for "force".)
 664
 665 =cut
 666
 667 sub _gensym {
 668   unless(defined $ID_COUNTER) {
 669     # start it out...
 670     $ID_COUNTER = sprintf('%04x', rand(0x1000));
 671     $ID_COUNTER =~ tr<0-9a-f><J-NP-Z>; # yes, skip letter "oh"
 672     $ID_COUNTER .= '00000';
 673   }
 674   ++$ID_COUNTER;
 675 }
 676
 677 sub idf {
 678     my $nparms = scalar @_;
 679
 680     if ($nparms == 1) {
 681         my $x;
 682         if (defined($x = $_[0]{'id'}) and length $x) {
 683             return $x;
 684         }
 685         else {
 686             return $_[0]{'id'} = _gensym();
 687         }
 688     }
 689     if ($nparms == 2) {
 690         if (defined $_[1]) {
 691             return $_[0]{'id'} = $_[1];
 692         }
 693         else {
 694             return delete $_[0]{'id'};
 695         }
 696     }
 697     Carp::croak '$node->idf can\'t take ' . scalar(@_) . ' parameters!';
 698 }
 699
 700
 701 =head1 STRUCTURE-MODIFYING METHODS
 702
 703 These methods are provided for modifying the content of trees
 704 by adding or changing nodes as parents or children of other nodes.
 705
 706 =head2 $h->push_content($element_or_text, ...)
 707
 708 Adds the specified items to the I<end> of the content list of the
 709 element C<$h>.  The items of content to be added should each be either a
 710 text segment (a string), an HTML::Element object, or an arrayref.
 711 Arrayrefs are fed thru C<< $h->new_from_lol(that_arrayref) >> to
 712 convert them into elements, before being added to the content
 713 list of C<$h>.  This means you can say things concise things like:
 714
 715   $body->push_content(
 716     ['br'],
 717     ['ul',
 718       map ['li', $_], qw(Peaches Apples Pears Mangos)
 719     ]
 720   );
 721
 722 See C<new_from_lol> method's documentation, far below, for more
 723 explanation.
 724
 725 The push_content method will try to consolidate adjacent text segments
 726 while adding to the content list.  That's to say, if $h's content_list is
 727
 728   ('foo bar ', $some_node, 'baz!')
 729
 730 and you call
 731
 732    $h->push_content('quack?');
 733
 734 then the resulting content list will be this:
 735
 736   ('foo bar ', $some_node, 'baz!quack?')
 737
 738 and not this:
 739
 740   ('foo bar ', $some_node, 'baz!', 'quack?')
 741
 742 If that latter is what you want, you'll have to override the
 743 feature of consolidating text by using splice_content,
 744 as in:
 745
 746   $h->splice_content(scalar($h->content_list),0,'quack?');
 747
 748 Similarly, if you wanted to add 'Skronk' to the beginning of
 749 the content list, calling this:
 750
 751    $h->unshift_content('Skronk');
 752
 753 then the resulting content list will be this:
 754
 755   ('Skronkfoo bar ', $some_node, 'baz!')
 756
 757 and not this:
 758
 759   ('Skronk', 'foo bar ', $some_node, 'baz!')
 760
 761 What you'd to do get the latter is:
 762
 763   $h->splice_content(0,0,'Skronk');
 764
 765 =cut
 766
 767 sub push_content {
 768     my $self = shift;
 769     return $self unless @_;
 770
 771     my $content = ($self->{'_content'} ||= []);
 772     for (@_) {
 773         if (ref($_) eq 'ARRAY') {
 774             # magically call new_from_lol
 775             push @$content, $self->new_from_lol($_);
 776             $content->[-1]->{'_parent'} = $self;
 777         }
 778         elsif (ref($_)) {  # insert an element
 779             $_->detach if $_->{'_parent'};
 780             $_->{'_parent'} = $self;
 781             push(@$content, $_);
 782         }
 783         else {  # insert text segment
 784             if (@$content && !ref $content->[-1]) {
 785                 # last content element is also text segment -- append
 786                 $content->[-1] .= $_;
 787             } else {
 788                 push(@$content, $_);
 789             }
 790         }
 791     }
 792     return $self;
 793 }
 794
 795
 796 =head2 $h->unshift_content($element_or_text, ...)
 797
 798 Just like C<push_content>, but adds to the I<beginning> of the $h
 799 element's content list.
 800
 801 The items of content to be added should each be
 802 either a text segment (a string), an HTML::Element object, or
 803 an arrayref (which is fed thru C<new_from_lol>).
 804
 805 The unshift_content method will try to consolidate adjacent text segments
 806 while adding to the content list.  See above for a discussion of this.
 807
 808 =cut
 809
 810 sub unshift_content {
 811     my $self = shift;
 812     return $self unless @_;
 813
 814     my $content = ($self->{'_content'} ||= []);
 815     for (reverse @_) { # so they get added in the order specified
 816         if (ref($_) eq 'ARRAY') {
 817             # magically call new_from_lol
 818             unshift @$content, $self->new_from_lol($_);
 819             $content->[0]->{'_parent'} = $self;
 820         }
 821         elsif (ref $_) {  # insert an element
 822             $_->detach if $_->{'_parent'};
 823             $_->{'_parent'} = $self;
 824             unshift(@$content, $_);
 825         }
 826         else {  # insert text segment
 827             if (@$content && !ref $content->[0]) {
 828                 # last content element is also text segment -- prepend
 829                 $content->[0]  = $_ . $content->[0];
 830             }
 831             else {
 832                 unshift(@$content, $_);
 833             }
 834         }
 835     }
 836     return $self;
 837 }
 838
 839 # Cf.  splice ARRAY,OFFSET,LENGTH,LIST
 840
 841 =head2 $h->splice_content($offset, $length, $element_or_text, ...)
 842
 843 Detaches the elements from $h's list of content-nodes, starting at
 844 $offset and continuing for $length items, replacing them with the
 845 elements of the following list, if any.  Returns the elements (if any)
 846 removed from the content-list.  If $offset is negative, then it starts
 847 that far from the end of the array, just like Perl's normal C<splice>
 848 function.  If $length and the following list is omitted, removes
 849 everything from $offset onward.
 850
 851 The items of content to be added (if any) should each be either a text
 852 segment (a string), an arrayref (which is fed thru C<new_from_lol>),
 853 or an HTML::Element object that's not already
 854 a child of $h.
 855
 856 =cut
 857
 858 sub splice_content {
 859     my($self, $offset, $length, @to_add) = @_;
 860     Carp::croak "splice_content requires at least one argument"
 861         if @_ < 2;  # at least $h->splice_content($offset);
 862     return $self unless @_;
 863
 864     my $content = ($self->{'_content'} ||= []);
 865     # prep the list
 866
 867     my @out;
 868     if (@_ > 2) {  # self, offset, length, ...
 869         foreach my $n (@to_add) {
 870             if (ref($n) eq 'ARRAY') {
 871                 $n = $self->new_from_lol($n);
 872                 $n->{'_parent'} = $self;
 873             }
 874             elsif (ref($n)) {
 875                 $n->detach;
 876                 $n->{'_parent'} = $self;
 877             }
 878         }
 879         @out = splice @$content, $offset, $length, @to_add;
 880     }
 881     else {  #  self, offset
 882         @out = splice @$content, $offset;
 883     }
 884     foreach my $n (@out) {
 885         $n->{'_parent'} = undef if ref $n;
 886     }
 887     return @out;
 888 }
 889
 890
 891 =head2 $h->detach()
 892
 893 This unlinks $h from its parent, by setting its 'parent' attribute to
 894 undef, and by removing it from the content list of its parent (if it
 895 had one).  The return value is the parent that was detached from (or
 896 undef, if $h had no parent to start with).  Note that neither $h nor
 897 its parent are explicitly destroyed.
 898
 899 =cut
 900
 901 sub detach {
 902     my $self = $_[0];
 903     return undef unless(my $parent = $self->{'_parent'});
 904     $self->{'_parent'} = undef;
 905     my $cohort = $parent->{'_content'} || return $parent;
 906     @$cohort = grep { not( ref($_) and $_ eq $self) } @$cohort;
 907     # filter $self out, if parent has any evident content
 908
 909     return $parent;
 910 }
 911
 912
 913 =head2 $h->detach_content()
 914
 915 This unlinks all of $h's children from $h, and returns them.
 916 Note that these are not explicitly destroyed; for that, you
 917 can just use $h->delete_content.
 918
 919 =cut
 920
 921 sub detach_content {
 922     my $c = $_[0]->{'_content'} || return(); # in case of no content
 923     for (@$c) {
 924         $_->{'_parent'} = undef if ref $_;
 925     }
 926     return splice @$c;
 927 }
 928
 929
 930 =head2 $h->replace_with( $element_or_text, ... )
 931
 932 This replaces C<$h> in its parent's content list with the nodes
 933 specified.  The element C<$h> (which by then may have no parent)
 934 is returned.  This causes a fatal error if C<$h> has no parent.
 935 The list of nodes to insert may contain C<$h>, but at most once.
 936 Aside from that possible exception, the nodes to insert should not
 937 already be children of C<$h>'s parent.
 938
 939 Also, note that this method does not destroy C<$h> -- use
 940 C<< $h->replace_with(...)->delete >> if you need that.
 941
 942 =cut
 943
 944 sub replace_with {
 945     my ($self, @replacers) = @_;
 946     Carp::croak "the target node has no parent"
 947         unless my($parent) = $self->{'_parent'};
 948
 949     my $parent_content = $parent->{'_content'};
 950     Carp::croak "the target node's parent has no content!?"
 951     unless $parent_content and @$parent_content;
 952
 953     my $replacers_contains_self;
 954     for(@replacers) {
 955         if (!ref $_) {
 956             # noop
 957         }
 958         elsif($_ eq $self) {
 959             # noop, but check that it's there just once.
 960             Carp::croak
 961             "Replacement list contains several copies of target!"
 962             if $replacers_contains_self++;
 963         }
 964         elsif($_ eq $parent) {
 965             Carp::croak "Can't replace an item with its parent!";
 966         }
 967         elsif(ref($_) eq 'ARRAY') {
 968             $_ = $self->new_from_lol($_);
 969         }
 970         else {
 971             $_->detach;
 972             $_->{'_parent'} = $parent;
 973             # each of these are necessary
 974         }
 975     } # for @replacers
 976     @$parent_content = map { ( ref($_) and $_ eq $self) ? @replacers : $_ } @$parent_content;
 977
 978     $self->{'_parent'} = undef unless $replacers_contains_self;
 979     # if replacers does contain self, then the parent attribute is fine as-is
 980
 981     return $self;
 982 }
 983
 984 =head2 $h->preinsert($element_or_text...)
 985
 986 Inserts the given nodes right BEFORE C<$h> in C<$h>'s parent's
 987 content list.  This causes a fatal error if C<$h> has no parent.
 988 None of the given nodes should be C<$h> or other children of C<$h>.
 989 Returns C<$h>.
 990
 991 =cut
 992
 993 sub preinsert {
 994     my $self = shift;
 995     return $self unless @_;
 996     return $self->replace_with(@_, $self);
 997 }
 998
 999 =head2 $h->postinsert($element_or_text...)
1000
1001 Inserts the given nodes right AFTER C<$h> in C<$h>'s parent's content
1002 list.  This causes a fatal error if C<$h> has no parent.  None of
1003 the given nodes should be C<$h> or other children of C<$h>.  Returns
1004 C<$h>.
1005
1006 =cut
1007
1008 sub postinsert {
1009     my $self = shift;
1010     return $self unless @_;
1011     return $self->replace_with($self, @_);
1012 }
1013
1014
1015 =head2 $h->replace_with_content()
1016
1017 This replaces C<$h> in its parent's content list with its own content.
1018 The element C<$h> (which by then has no parent or content of its own) is
1019 returned.  This causes a fatal error if C<$h> has no parent.  Also, note
1020 that this does not destroy C<$h> -- use
1021 C<< $h->replace_with_content->delete >> if you need that.
1022
1023 =cut
1024
1025 sub replace_with_content {
1026   my $self = $_[0];
1027   Carp::croak "the target node has no parent"
1028     unless my($parent) = $self->{'_parent'};
1029
1030   my $parent_content = $parent->{'_content'};
1031   Carp::croak "the target node's parent has no content!?"
1032    unless $parent_content and @$parent_content;
1033
1034   my $content_r = $self->{'_content'} || [];
1035   @$parent_content
1036    = map { ( ref($_) and $_ eq $self) ? @$content_r : $_ }
1037          @$parent_content
1038   ;
1039
1040   $self->{'_parent'} = undef; # detach $self from its parent
1041
1042   # Update parentage link, removing from $self's content list
1043   for (splice @$content_r) {  $_->{'_parent'} = $parent if ref $_ }
1044
1045   return $self;  # note: doesn't destroy it.
1046 }
1047
1048
1049
1050 =head2 $h->delete_content()
1051
1052 Clears the content of C<$h>, calling C<< $h->delete >> for each content
1053 element.  Compare with C<< $h->detach_content >>.
1054
1055 Returns C<$h>.
1056
1057 =cut
1058
1059 sub delete_content {
1060     for (splice @{ delete($_[0]->{'_content'})
1061               # Deleting it here (while holding its value, for the moment)
1062               #  will keep calls to detach() from trying to uselessly filter
1063               #  the list (as they won't be able to see it once it's been
1064               #  deleted)
1065             || return($_[0]) # in case of no content
1066           },
1067           0
1068            # the splice is so we can null the array too, just in case
1069            # something somewhere holds a ref to it
1070         )
1071     {
1072         $_->delete if ref $_;
1073     }
1074     $_[0];
1075 }
1076
1077
1078
1079 =head2 $h->delete()
1080
1081 Detaches this element from its parent (if it has one) and explicitly
1082 destroys the element and all its descendants.  The return value is
1083 undef.
1084
1085 Perl uses garbage collection based on reference counting; when no
1086 references to a data structure exist, it's implicitly destroyed --
1087 i.e., when no value anywhere points to a given object anymore, Perl
1088 knows it can free up the memory that the now-unused object occupies.
1089
1090 But this fails with HTML::Element trees, because a parent element
1091 always holds references to its children, and its children elements
1092 hold references to the parent, so no element ever looks like it's
1093 I<not> in use.  So, to destroy those elements, you need to call
1094 C<< $h->delete >> on the parent.
1095
1096 =cut
1097
1098 # two handy aliases
1099 sub destroy { shift->delete(@_) }
1100 sub destroy_content { shift->delete_content(@_) }
1101
1102 sub delete {
1103     my $self = $_[0];
1104     $self->delete_content   # recurse down
1105      if $self->{'_content'} && @{$self->{'_content'}};
1106
1107     $self->detach if $self->{'_parent'} and $self->{'_parent'}{'_content'};
1108      # not the typical case
1109
1110     %$self = (); # null out the whole object on the way out
1111     return undef;
1112 }
1113
1114
1115 =head2 $h->clone()
1116
1117 Returns a copy of the element (whose children are clones (recursively)
1118 of the original's children, if any).
1119
1120 The returned element is parentless.  Any '_pos' attributes present in the
1121 source element/tree will be absent in the copy.  For that and other reasons,
1122 the clone of an HTML::TreeBuilder object that's in mid-parse (i.e, the head
1123 of a tree that HTML::TreeBuilder is elaborating) cannot (currently) be used
1124 to continue the parse.
1125
1126 You are free to clone HTML::TreeBuilder trees, just as long as:
1127 1) they're done being parsed, or 2) you don't expect to resume parsing
1128 into the clone.  (You can continue parsing into the original; it is
1129 never affected.)
1130
1131 =cut
1132
1133 sub clone {
1134   #print "Cloning $_[0]\n";
1135   my $it = shift;
1136   Carp::croak "clone() can be called only as an object method" unless ref $it;
1137   Carp::croak "clone() takes no arguments" if @_;
1138
1139   my $new = bless { %$it }, ref($it);     # COPY!!! HOOBOY!
1140   delete @$new{'_content', '_parent', '_pos', '_head', '_body'};
1141
1142   # clone any contents
1143   if($it->{'_content'} and @{$it->{'_content'}}) {
1144     $new->{'_content'} = [  ref($it)->clone_list( @{$it->{'_content'}} )  ];
1145     for(@{$new->{'_content'}}) {
1146       $_->{'_parent'} = $new if ref $_;
1147     }
1148   }
1149
1150   return $new;
1151 }
1152
1153 =head2 HTML::Element->clone_list(...nodes...)
1154
1155 Returns a list consisting of a copy of each node given.
1156 Text segments are simply copied; elements are cloned by
1157 calling $it->clone on each of them.
1158
1159 Note that this must be called as a class method, not as an instance
1160 method.  C<clone_list> will croak if called as an instance method.
1161 You can also call it like so:
1162
1163     ref($h)->clone_list(...nodes...)
1164
1165 =cut
1166
1167 sub clone_list {
1168   Carp::croak "clone_list can be called only as a class method" if ref shift @_;
1169
1170    # all that does is get me here
1171   return
1172     map
1173       {
1174         ref($_)
1175           ? $_->clone   # copy by method
1176           : $_  # copy by evaluation
1177       }
1178       @_
1179   ;
1180 }
1181
1182
1183 =head2 $h->normalize_content
1184
1185 Normalizes the content of C<$h> -- i.e., concatenates any adjacent
1186 text nodes.  (Any undefined text segments are turned into empty-strings.)
1187 Note that this does not recurse into C<$h>'s descendants.
1188
1189 =cut
1190
1191 sub normalize_content {
1192   my $start = $_[0];
1193   my $c;
1194   return unless $c = $start->{'_content'} and ref $c and @$c; # nothing to do
1195   # TODO: if we start having text elements, deal with catenating those too?
1196   my @stretches = (undef); # start with a barrier
1197
1198   # I suppose this could be rewritten to treat stretches as it goes, instead
1199   #  of at the end.  But feh.
1200
1201   # Scan:
1202   for(my $i = 0; $i < @$c; ++$i) {
1203     if(defined $c->[$i] and ref $c->[$i]) { # not a text segment
1204       if($stretches[0]) {
1205         # put in a barrier
1206         if($stretches[0][1] == 1) {
1207           #print "Nixing stretch at ", $i-1, "\n";
1208           undef $stretches[0]; # nix the previous one-node "stretch"
1209         } else {
1210           #print "End of stretch at ", $i-1, "\n";
1211           unshift @stretches, undef
1212         }
1213       }
1214       # else no need for a barrier
1215     } else { # text segment
1216       $c->[$i] = '' unless defined $c->[$i];
1217       if($stretches[0]) {
1218         ++$stretches[0][1]; # increase length
1219       } else {
1220         #print "New stretch at $i\n";
1221         unshift @stretches, [$i,1]; # start and length
1222       }
1223     }
1224   }
1225
1226   # Now combine.  Note that @stretches is in reverse order, so the indexes
1227   # still make sense as we work our way thru (i.e., backwards thru $c).
1228   foreach my $s (@stretches) {
1229     if($s and $s->[1] > 1) {
1230       #print "Stretch at ", $s->[0], " for ", $s->[1], "\n";
1231       $c->[$s->[0]] .= join('', splice(@$c, $s->[0] + 1, $s->[1] - 1))
1232         # append the subsequent ones onto the first one.
1233     }
1234   }
1235   return;
1236 }
1237
1238 =head2 $h->delete_ignorable_whitespace()
1239
1240 This traverses under C<$h> and deletes any text segments that are ignorable
1241 whitespace.  You should not use this if C<$h> under a 'pre' element.
1242
1243 =cut
1244
1245 sub delete_ignorable_whitespace {
1246   # This doesn't delete all sorts of whitespace that won't actually
1247   #  be used in rendering, tho -- that's up to the rendering application.
1248   # For example:
1249   #   <input type='text' name='foo'>
1250   #     [some whitespace]
1251   #   <input type='text' name='bar'>
1252   # The WS between the two elements /will/ get used by the renderer.
1253   # But here:
1254   #   <input type='hidden' name='foo' value='1'>
1255   #     [some whitespace]
1256   #   <input type='text' name='bar' value='2'>
1257   # the WS between them won't be rendered in any way, presumably.
1258
1259   #my $Debug = 4;
1260   die "delete_ignorable_whitespace can be called only as an object method"
1261    unless ref $_[0];
1262
1263   print "About to tighten up...\n" if $Debug > 2;
1264   my(@to_do) = ($_[0]);  # Start off.
1265   my($i, $sibs, $ptag, $this); # scratch for the loop...
1266   while(@to_do) {
1267     if(
1268        ( $ptag = ($this = shift @to_do)->{'_tag'} ) eq 'pre'
1269        or $ptag eq 'textarea'
1270        or $HTML::Tagset::isCDATA_Parent{$ptag}
1271     ) {
1272       # block the traversal under those
1273        print "Blocking traversal under $ptag\n" if $Debug;
1274        next;
1275     }
1276     next unless($sibs = $this->{'_content'} and @$sibs);
1277     for($i = $#$sibs; $i >= 0; --$i) { # work backwards thru the list
1278       if(ref $sibs->[$i]) {
1279         unshift @to_do, $sibs->[$i];
1280         # yes, this happens in pre order -- we're going backwards
1281         # thru this sibling list.  I doubt it actually matters, tho.
1282         next;
1283       }
1284       next if $sibs->[$i] =~ m<[^\n\r\f\t ]>s; # it's /all/ whitespace
1285
1286       print "Under $ptag whose canTighten ",
1287           "value is ", 0 + $HTML::Element::canTighten{$ptag}, ".\n"
1288        if $Debug > 3;
1289
1290       # It's all whitespace...
1291
1292       if($i == 0) {
1293         if(@$sibs == 1) { # I'm an only child
1294           next unless $HTML::Element::canTighten{$ptag}; # parent
1295         } else { # I'm leftmost of many
1296           # if either my parent or sib are eligible, I'm good.
1297           next unless
1298            $HTML::Element::canTighten{$ptag} # parent
1299            or
1300             (ref $sibs->[1]
1301              and $HTML::Element::canTighten{$sibs->[1]{'_tag'}} # right sib
1302             );
1303         }
1304       } elsif ($i == $#$sibs) { # I'm rightmost of many
1305         # if either my parent or sib are eligible, I'm good.
1306         next unless
1307            $HTML::Element::canTighten{$ptag} # parent
1308            or
1309             (ref $sibs->[$i - 1]
1310             and $HTML::Element::canTighten{$sibs->[$i - 1]{'_tag'}} # left sib
1311             )
1312       } else { # I'm the piggy in the middle
1313         # My parent doesn't matter -- it all depends on my sibs
1314         next
1315           unless
1316             ref $sibs->[$i - 1] or ref $sibs->[$i + 1];
1317          # if NEITHER sib is a node, quit
1318
1319         next if
1320           # bailout condition: if BOTH are INeligible nodes
1321           #  (as opposed to being text, or being eligible nodes)
1322             ref $sibs->[$i - 1]
1323             and ref $sibs->[$i + 1]
1324             and !$HTML::Element::canTighten{$sibs->[$i - 1]{'_tag'}} # left sib
1325             and !$HTML::Element::canTighten{$sibs->[$i + 1]{'_tag'}} # right sib
1326         ;
1327       }
1328       # Unknown tags aren't in canTighten and so AREN'T subject to tightening
1329
1330       print "  delendum: child $i of $ptag\n" if $Debug > 3;
1331       splice @$sibs, $i, 1;
1332     }
1333      # end of the loop-over-children
1334   }
1335    # end of the while loop.
1336
1337   return;
1338 }
1339
1340
1341 =head2 $h->insert_element($element, $implicit)
1342
1343 Inserts (via push_content) a new element under the element at
1344 C<< $h->pos() >>.  Then updates C<< $h->pos() >> to point to the inserted
1345 element, unless $element is a prototypically empty element like
1346 "br", "hr", "img", etc.  The new C<< $h->pos() >> is returned.  This
1347 method is useful only if your particular tree task involves setting
1348 C<< $h->pos() >>.
1349
1350 =cut
1351
1352 sub insert_element {
1353     my($self, $tag, $implicit) = @_;
1354     return $self->pos() unless $tag; # noop if nothing to insert
1355
1356     my $e;
1357     if (ref $tag) {
1358         $e = $tag;
1359         $tag = $e->tag;
1360     } else { # just a tag name -- so make the element
1361         $e = ($self->{'_element_class'} || __PACKAGE__)->new($tag);
1362         ++($self->{'_element_count'}) if exists $self->{'_element_count'};
1363          # undocumented.  see TreeBuilder.
1364     }
1365
1366     $e->{'_implicit'} = 1 if $implicit;
1367
1368     my $pos = $self->{'_pos'};
1369     $pos = $self unless defined $pos;
1370
1371     $pos->push_content($e);
1372
1373     $self->{'_pos'} = $pos = $e
1374       unless $self->_empty_element_map->{$tag} || $e->{'_empty_element'};
1375
1376     $pos;
1377 }
1378
1379 #==========================================================================
1380 # Some things to override in XML::Element
1381
1382 sub _empty_element_map {
1383   \%HTML::Element::emptyElement;
1384 }
1385
1386 sub _fold_case_LC {
1387   if(wantarray) {
1388     shift;
1389     map lc($_), @_;
1390   } else {
1391     return lc($_[1]);
1392   }
1393 }
1394
1395 sub _fold_case_NOT {
1396   if(wantarray) {
1397     shift;
1398     @_;
1399   } else {
1400     return $_[1];
1401   }
1402 }
1403
1404 *_fold_case = \&_fold_case_LC;
1405
1406 #==========================================================================
1407
1408 =head1 DUMPING METHODS
1409
1410 =head2 $h->dump()
1411
1412 =head2 $h->dump(*FH)  ; # or *FH{IO} or $fh_obj
1413
1414 Prints the element and all its children to STDOUT (or to a specified
1415 filehandle), in a format useful
1416 only for debugging.  The structure of the document is shown by
1417 indentation (no end tags).
1418
1419 =cut
1420
1421 sub dump {
1422     my($self, $fh, $depth) = @_;
1423     $fh = *STDOUT{IO} unless defined $fh;
1424     $depth = 0 unless defined $depth;
1425     print $fh
1426       "  " x $depth,   $self->starttag,   " \@", $self->address,
1427       $self->{'_implicit'} ? " (IMPLICIT)\n" : "\n";
1428     for (@{$self->{'_content'}}) {
1429         if (ref $_) {  # element
1430             $_->dump($fh, $depth+1);  # recurse
1431         } else {  # text node
1432             print $fh "  " x ($depth + 1);
1433             if(length($_) > 65 or m<[\x00-\x1F]>) {
1434               # it needs prettyin' up somehow or other
1435               my $x = (length($_) <= 65) ? $_ : (substr($_,0,65) . '...');
1436               $x =~ s<([\x00-\x1F])>
1437                      <'\\x'.(unpack("H2",$1))>eg;
1438               print $fh qq{"$x"\n};
1439             } else {
1440               print $fh qq{"$_"\n};
1441             }
1442         }
1443     }
1444 }
1445
1446
1447 =head2 $h->as_HTML() or $h->as_HTML($entities)
1448
1449 =head2 or $h->as_HTML($entities, $indent_char)
1450
1451 =head2 or $h->as_HTML($entities, $indent_char, \%optional_end_tags)
1452
1453 Returns a string representing in HTML the element and its
1454 descendants.  The optional argument C<$entities> specifies a string of
1455 the entities to encode.  For compatibility with previous versions,
1456 specify C<'E<lt>E<gt>&'> here.  If omitted or undef, I<all> unsafe
1457 characters are encoded as HTML entities.  See L<HTML::Entities> for
1458 details.  If passed an empty string, no entities are encoded.
1459
1460 If $indent_char is specified and defined, the HTML to be output is
1461 intented, using the string you specify (which you probably should
1462 set to "\t", or some number of spaces, if you specify it).
1463
1464 If C<\%optional_end_tags> is specified and defined, it should be
1465 a reference to a hash that holds a true value for every tag name
1466 whose end tag is optional.  Defaults to
1467 C<\%HTML::Element::optionalEndTag>, which is an alias to
1468 C<%HTML::Tagset::optionalEndTag>, which, at time of writing, contains
1469 true values for C<p, li, dt, dd>.  A useful value to pass is an empty
1470 hashref, C<{}>, which means that no end-tags are optional for this dump.
1471 Otherwise, possibly consider copying C<%HTML::Tagset::optionalEndTag> to a
1472 hash of your own, adding or deleting values as you like, and passing
1473 a reference to that hash.
1474
1475 =cut
1476
1477 sub as_HTML {
1478   my($self, $entities, $indent, $omissible_map) = @_;
1479   #my $indent_on = defined($indent) && length($indent);
1480   my @html = ();
1481
1482   $omissible_map ||= \%HTML::Element::optionalEndTag;
1483   my $empty_element_map = $self->_empty_element_map;
1484
1485   my $last_tag_tightenable = 0;
1486   my $this_tag_tightenable = 0;
1487   my $nonindentable_ancestors = 0;  # count of nonindentible tags over us.
1488
1489   my($tag, $node, $start, $depth); # per-iteration scratch
1490
1491   if(defined($indent) && length($indent)) {
1492     $self->traverse(
1493       sub {
1494         ($node, $start, $depth) = @_;
1495         if(ref $node) { # it's an element
1496
1497            $tag = $node->{'_tag'};
1498
1499            if($start) { # on the way in
1500              if(
1501                 ($this_tag_tightenable = $HTML::Element::canTighten{$tag})
1502                 and !$nonindentable_ancestors
1503                 and $last_tag_tightenable
1504              ) {
1505                push
1506                  @html,
1507                  "\n",
1508                  $indent x $depth,
1509                  $node->starttag($entities),
1510                ;
1511              } else {
1512                push(@html, $node->starttag($entities));
1513              }
1514              $last_tag_tightenable = $this_tag_tightenable;
1515
1516              ++$nonindentable_ancestors
1517                if $tag eq 'pre' or $HTML::Tagset::isCDATA_Parent{$tag};             ;
1518
1519            } elsif (not($empty_element_map->{$tag} or $omissible_map->{$tag})) {
1520              # on the way out
1521              if($tag eq 'pre' or $HTML::Tagset::isCDATA_Parent{$tag}) {
1522                --$nonindentable_ancestors;
1523                $last_tag_tightenable = $HTML::Element::canTighten{$tag};
1524                push @html, $node->endtag;
1525
1526              } else { # general case
1527                if(
1528                   ($this_tag_tightenable = $HTML::Element::canTighten{$tag})
1529                   and !$nonindentable_ancestors
1530                   and $last_tag_tightenable
1531                ) {
1532                  push
1533                    @html,
1534                    "\n",
1535                    $indent x $depth,
1536                    $node->endtag,
1537                  ;
1538                } else {
1539                  push @html, $node->endtag;
1540                }
1541                $last_tag_tightenable = $this_tag_tightenable;
1542                #print "$tag tightenable: $this_tag_tightenable\n";
1543              }
1544            }
1545         } else {  # it's a text segment
1546
1547           $last_tag_tightenable = 0;  # I guess this is right
1548           HTML::Entities::encode_entities($node, $entities)
1549             # That does magic things if $entities is undef.
1550            unless (
1551                 (defined($entities) && !length($entities))
1552                 # If there's no entity to encode, don't call it
1553                 || $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }
1554                 # To keep from amp-escaping children of script et al.
1555                 # That doesn't deal with descendants; but then, CDATA
1556                 #  parents shouldn't /have/ descendants other than a
1557                 #  text children (or comments?)
1558                 );
1559           if($nonindentable_ancestors) {
1560             push @html, $node; # say no go
1561           } else {
1562             if($last_tag_tightenable) {
1563               $node =~ s<[\n\r\f\t ]+>< >s;
1564               #$node =~ s< $><>s;
1565               $node =~ s<^ ><>s;
1566               push
1567                 @html,
1568                 "\n",
1569                 $indent x $depth,
1570                 $node,
1571                 #Text::Wrap::wrap($indent x $depth, $indent x $depth, "\n" . $node)
1572               ;
1573             } else {
1574               push
1575                 @html,
1576                 $node,
1577                 #Text::Wrap::wrap('', $indent x $depth, $node)
1578               ;
1579             }
1580           }
1581         }
1582         1; # keep traversing
1583       }
1584     ); # End of parms to traverse()
1585   } else { # no indenting -- much simpler code
1586     $self->traverse(
1587       sub {
1588           ($node, $start) = @_;
1589           if(ref $node) {
1590             $tag = $node->{'_tag'};
1591             if($start) { # on the way in
1592               push(@html, $node->starttag($entities));
1593             } elsif (not($empty_element_map->{$tag} or $omissible_map->{$tag})) {
1594               # on the way out
1595               push(@html, $node->endtag);
1596             }
1597           } else {
1598             # simple text content
1599             HTML::Entities::encode_entities($node, $entities)
1600               # That does magic things if $entities is undef.
1601              unless (
1602                   (defined($entities) && !length($entities))
1603                   # If there's no entity to encode, don't call it
1604                   || $HTML::Tagset::isCDATA_Parent{ $_[3]{'_tag'} }
1605                   # To keep from amp-escaping children of script et al.
1606                   # That doesn't deal with descendants; but then, CDATA
1607                   #  parents shouldn't /have/ descendants other than a
1608                   #  text children (or comments?)
1609                   );
1610             push(@html, $node);
1611           }
1612          1; # keep traversing
1613         }
1614     ); # End of parms to traverse()
1615   }
1616
1617   if ( $self->{_store_declarations} && defined $self->{_decl} ) {
1618     unshift @html, sprintf "<!%s>\n", $self->{_decl}->{text} ;
1619   }
1620
1621
1622   return join('', @html, "\n");
1623 }
1624
1625
1626 =head2 $h->as_text()
1627
1628 =head2 $h->as_text(skip_dels => 1)
1629
1630 Returns a string consisting of only the text parts of the element's
1631 descendants.
1632
1633 Text under 'script' or 'style' elements is never included in what's
1634 returned.  If C<skip_dels> is true, then text content under "del"
1635 nodes is not included in what's returned.
1636
1637 =head2 $h->as_trimmed_text(...)
1638
1639 This is just like as_text(...) except that leading and trailing
1640 whitespace is deleted, and any internal whitespace is collapsed.
1641
1642 =cut
1643
1644 sub as_text {
1645   # Yet another iteratively implemented traverser
1646   my($this,%options) = @_;
1647   my $skip_dels = $options{'skip_dels'} || 0;
1648   my(@pile) = ($this);
1649   my $tag;
1650   my $text = '';
1651   while(@pile) {
1652     if(!defined($pile[0])) { # undef!
1653       # no-op
1654     } elsif(!ref($pile[0])) { # text bit!  save it!
1655       $text .= shift @pile;
1656     } else { # it's a ref -- traverse under it
1657       unshift @pile, @{$this->{'_content'} || $nillio}
1658         unless
1659           ($tag = ($this = shift @pile)->{'_tag'}) eq 'style'
1660           or $tag eq 'script'
1661           or ($skip_dels and $tag eq 'del');
1662     }
1663   }
1664   return $text;
1665 }
1666
1667 sub as_trimmed_text {
1668   my $text = shift->as_text(@_);
1669   $text =~ s/[\n\r\f\t ]+$//s;
1670   $text =~ s/^[\n\r\f\t ]+//s;
1671   $text =~ s/[\n\r\f\t ]+/ /g;
1672   return $text;
1673 }
1674
1675 sub as_text_trimmed { shift->as_trimmed_text(@_) } # alias, because I forget
1676
1677 =head2 $h->as_XML()
1678
1679 Returns a string representing in XML the element and its descendants.
1680
1681 The XML is not indented.
1682
1683 =cut
1684
1685 # TODO: make it wrap, if not indent?
1686
1687 sub as_XML {
1688   # based an as_HTML
1689   my($self) = @_;
1690   #my $indent_on = defined($indent) && length($indent);
1691   my @xml = ();
1692   my $empty_element_map = $self->_empty_element_map;
1693
1694   my($tag, $node, $start); # per-iteration scratch
1695   $self->traverse(
1696     sub {
1697         ($node, $start) = @_;
1698         if(ref $node) { # it's an element
1699           $tag = $node->{'_tag'};
1700           if($start) { # on the way in
1701             if($empty_element_map->{$tag}
1702                and !@{$node->{'_content'} || $nillio}
1703             ) {
1704               push(@xml, $node->starttag_XML(undef,1));
1705             } else {
1706               push(@xml, $node->starttag_XML(undef));
1707             }
1708           } else { # on the way out
1709             unless($empty_element_map->{$tag}
1710                    and !@{$node->{'_content'} || $nillio}
1711             ) {
1712               push(@xml, $node->endtag_XML());
1713             } # otherwise it will have been an <... /> tag.
1714           }
1715         } else { # it's just text
1716           _xml_escape($node);
1717           push(@xml, $node);
1718         }
1719        1; # keep traversing
1720       }
1721   );
1722
1723   join('', @xml, "\n");
1724 }
1725
1726
1727 sub _xml_escape {  # DESTRUCTIVE (a.k.a. "in-place")
1728   # Five required escapes: http://www.w3.org/TR/2006/REC-xml11-20060816/#syntax
1729   # We allow & if it's part of a valid escape already: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-references
1730   foreach my $x (@_) {
1731     $x =~ s/(                   # Escape...
1732                 < |             # Less than, or
1733                 > |             # Greater than, or
1734                 ' |             # Single quote, or
1735                 " |             # Double quote, or
1736                 &(?!            # An ampersand that isn't followed by...
1737                   (\#\d+; |             # A hash mark, digits and semicolon, or
1738                    \#x[\da-f]+; |       # A hash mark, "x", hex digits and semicolon, or
1739                    [A-Za-z0-9]+; ))     # alphanums (not underscore, hence not \w) and a semicolon
1740              )/'&#'.ord($1).";"/sgex;  # And replace them with their XML digit counterpart
1741   }
1742   return;
1743 }
1744
1745 =head2 $h->as_Lisp_form()
1746
1747 Returns a string representing the element and its descendants as a
1748 Lisp form.  Unsafe characters are encoded as octal escapes.
1749
1750 The Lisp form is indented, and contains external ("href", etc.)  as
1751 well as internal attributes ("_tag", "_content", "_implicit", etc.),
1752 except for "_parent", which is omitted.
1753
1754 Current example output for a given element:
1755
1756   ("_tag" "img" "border" "0" "src" "pie.png" "usemap" "#main.map")
1757
1758 =cut
1759
1760 # NOTES:
1761 #
1762 # It's been suggested that attribute names be made :-keywords:
1763 #   (:_tag "img" :border 0 :src "pie.png" :usemap "#main.map")
1764 # However, it seems that Scheme has no such data type as :-keywords.
1765 # So, for the moment at least, I tend toward simplicity, uniformity,
1766 #  and universality, where everything a string or a list.
1767
1768 sub as_Lisp_form {
1769   my @out;
1770
1771   my $sub;
1772   my $depth = 0;
1773   my(@list, $val);
1774   $sub = sub {  # Recursor
1775     my $self = $_[0];
1776     @list = ('_tag', $self->{'_tag'});
1777     @list = () unless defined $list[-1]; # unlikely
1778
1779     for (sort keys %$self) { # predictable ordering
1780       next if $_ eq '_content' or $_ eq '_tag' or $_ eq '_parent' or $_ eq '/';
1781        # Leave the other private attributes, I guess.
1782       push @list, $_, $val if defined($val = $self->{$_}); # and !ref $val;
1783     }
1784
1785     for (@list) {
1786         # octal-escape it
1787         s<([^\x20\x21\x23\x27-\x5B\x5D-\x7E])>
1788          <sprintf('\\%03o',ord($1))>eg;
1789         $_ = qq{"$_"};
1790     }
1791     push @out, ('  ' x $depth) . '(' . join ' ', splice @list;
1792     if(@{$self->{'_content'} || $nillio}) {
1793       $out[-1] .= " \"_content\" (\n";
1794       ++$depth;
1795       foreach my $c (@{$self->{'_content'}}) {
1796         if(ref($c)) {
1797           # an element -- recurse
1798           $sub->($c);
1799         } else {
1800           # a text segment -- stick it in and octal-escape it
1801           push @out, $c;
1802           $out[-1] =~
1803             s<([^\x20\x21\x23\x27-\x5B\x5D-\x7E])>
1804              <sprintf('\\%03o',ord($1))>eg;
1805           # And quote and indent it.
1806           $out[-1] .= "\"\n";
1807           $out[-1] = ('  ' x $depth) . '"' . $out[-1];
1808         }
1809       }
1810       --$depth;
1811       substr($out[-1],-1) = "))\n"; # end of _content and of the element
1812     } else {
1813       $out[-1] .= ")\n";
1814     }
1815     return;
1816   };
1817
1818   $sub->($_[0]);
1819   undef $sub;
1820   return join '', @out;
1821 }
1822
1823
1824 sub format {
1825     my($self, $formatter) = @_;
1826     unless (defined $formatter) {
1827         require HTML::FormatText;
1828         $formatter = HTML::FormatText->new();
1829     }
1830     $formatter->format($self);
1831 }
1832
1833
1834
1835 =head2 $h->starttag() or $h->starttag($entities)
1836
1837 Returns a string representing the complete start tag for the element.
1838 I.e., leading "<", tag name, attributes, and trailing ">".
1839 All values are surrounded with
1840 double-quotes, and appropriate characters are encoded.  If C<$entities>
1841 is omitted or undef, I<all> unsafe characters are encoded as HTML
1842 entities.  See L<HTML::Entities> for details.  If you specify some
1843 value for C<$entities>, remember to include the double-quote character in
1844 it.  (Previous versions of this module would basically behave as if
1845 C<'&"E<gt>'> were specified for C<$entities>.)  If C<$entities> is
1846 an empty string, no entity is escaped.
1847
1848 =cut
1849
1850 sub starttag {
1851     my($self, $entities) = @_;
1852
1853     my $name = $self->{'_tag'};
1854
1855     return        $self->{'text'}        if $name eq '~literal';
1856     return "<!" . $self->{'text'} . ">"  if $name eq '~declaration';
1857     return "<?" . $self->{'text'} . ">"  if $name eq '~pi';
1858
1859     if($name eq '~comment') {
1860       if(ref($self->{'text'} || '') eq 'ARRAY') {
1861         # Does this ever get used?  And is this right?
1862         return
1863           "<!" .
1864           join(' ', map("--$_--", @{$self->{'text'}}))
1865           .  ">"
1866        ;
1867       } else {
1868         return "<!--" . $self->{'text'} . "-->"
1869       }
1870     }
1871
1872     my $tag = $html_uc ? "<\U$name" : "<\L$name";
1873     my $val;
1874     for (sort keys %$self) { # predictable ordering
1875         next if !length $_ or m/^_/s or $_ eq '/';
1876         $val = $self->{$_};
1877         next if !defined $val; # or ref $val;
1878         if ($_ eq $val &&   # if attribute is boolean, for this element
1879             exists($HTML::Element::boolean_attr{$name}) &&
1880             (ref($HTML::Element::boolean_attr{$name})
1881               ? $HTML::Element::boolean_attr{$name}{$_}
1882               : $HTML::Element::boolean_attr{$name} eq $_)
1883         ) {
1884             $tag .= $html_uc ? " \U$_" : " \L$_";
1885         }
1886         else { # non-boolean attribute
1887
1888           if (ref $val eq 'HTML::Element' and
1889                   $val->{_tag} eq '~literal') {
1890             $val = $val->{text};
1891           }
1892           else {
1893             HTML::Entities::encode_entities($val, $entities) unless (defined($entities) && !length($entities));
1894           }
1895
1896           $val = qq{"$val"};
1897           $tag .= $html_uc ? qq{ \U$_\E=$val} : qq{ \L$_\E=$val};
1898         }
1899     } # for keys
1900     if ( scalar $self->content_list == 0 && $self->_empty_element_map->{ $self->tag } ) {
1901         return $tag . " />";
1902     }
1903     else {
1904         return $tag . ">";
1905     }
1906 }
1907
1908
1909 sub starttag_XML {
1910     my($self) = @_;
1911      # and a third parameter to signal emptiness?
1912
1913     my $name = $self->{'_tag'};
1914
1915     return        $self->{'text'}        if $name eq '~literal';
1916     return '<!' . $self->{'text'}. '>'   if $name eq '~declaration';
1917     return "<?" . $self->{'text'} . "?>" if $name eq '~pi';
1918
1919     if($name eq '~comment') {
1920       if(ref($self->{'text'} || '') eq 'ARRAY') {
1921         # Does this ever get used?  And is this right?
1922         $name = join(' ', @{$self->{'text'}});
1923       } else {
1924         $name = $self->{'text'};
1925       }
1926       $name =~ s/--/-&#45;/g; # can't have double --'s in XML comments
1927       return "<!-- $name -->";
1928     }
1929
1930     my $tag = "<$name";
1931     my $val;
1932     for (sort keys %$self) { # predictable ordering
1933         next if !length $_ or  m/^_/s or $_ eq '/';
1934         # Hm -- what to do if val is undef?
1935         # I suppose that shouldn't ever happen.
1936         next if !defined($val = $self->{$_}); # or ref $val;
1937         _xml_escape($val);
1938         $tag .= qq{ $_="$val"};
1939     }
1940     @_ == 3 ? "$tag />" : "$tag>";
1941 }
1942
1943
1944
1945 =head2 $h->endtag()
1946
1947 Returns a string representing the complete end tag for this element.
1948 I.e., "</", tag name, and ">".
1949
1950 =cut
1951
1952 sub endtag {
1953     $html_uc ? "</\U$_[0]->{'_tag'}>" : "</\L$_[0]->{'_tag'}>";
1954 }
1955
1956 # TODO: document?
1957 sub endtag_XML {
1958     "</$_[0]->{'_tag'}>";
1959 }
1960
1961
1962 #==========================================================================
1963 # This, ladies and germs, is an iterative implementation of a
1964 # recursive algorithm.  DON'T TRY THIS AT HOME.
1965 # Basically, the algorithm says:
1966 #
1967 # To traverse:
1968 #   1: pre-order visit this node
1969 #   2: traverse any children of this node
1970 #   3: post-order visit this node, unless it's a text segment,
1971 #       or a prototypically empty node (like "br", etc.)
1972 # Add to that the consideration of the callbacks' return values,
1973 # so you can block visitation of the children, or siblings, or
1974 # abort the whole excursion, etc.
1975 #
1976 # So, why all this hassle with making the code iterative?
1977 # It makes for real speed, because it eliminates the whole
1978 # hassle of Perl having to allocate scratch space for each
1979 # instance of the recursive sub.  Since the algorithm
1980 # is basically simple (and not all recursive ones are!) and
1981 # has few necessary lexicals (basically just the current node's
1982 # content list, and the current position in it), it was relatively
1983 # straightforward to store that information not as the frame
1984 # of a sub, but as a stack, i.e., a simple Perl array (well, two
1985 # of them, actually: one for content-listrefs, one for indexes of
1986 # current position in each of those).
1987
1988 my $NIL = [];
1989 sub traverse {
1990   my($start, $callback, $ignore_text) = @_;
1991
1992   Carp::croak "traverse can be called only as an object method"
1993    unless ref $start;
1994
1995   Carp::croak('must provide a callback for traverse()!')
1996    unless defined $callback and ref $callback;
1997
1998   # Elementary type-checking:
1999   my($c_pre, $c_post);
2000   if(UNIVERSAL::isa($callback, 'CODE')) {
2001     $c_pre = $c_post = $callback;
2002   } elsif(UNIVERSAL::isa($callback,'ARRAY')) {
2003     ($c_pre, $c_post) = @$callback;
2004     Carp::croak("pre-order callback \"$c_pre\" is true but not a coderef!")
2005      if $c_pre and not UNIVERSAL::isa($c_pre, 'CODE');
2006     Carp::croak("pre-order callback \"$c_post\" is true but not a coderef!")
2007      if $c_post and not UNIVERSAL::isa($c_post, 'CODE');
2008     return $start unless $c_pre or $c_post;
2009      # otherwise there'd be nothing to actually do!
2010   } else {
2011     Carp::croak("$callback is not a known kind of reference")
2012      unless ref($callback);
2013   }
2014
2015   my $empty_element_map = $start->_empty_element_map;
2016
2017   my(@C) = [$start]; # a stack containing lists of children
2018   my(@I) = (-1); # initial value must be -1 for each list
2019     # a stack of indexes to current position in corresponding lists in @C
2020   # In each of these, 0 is the active point
2021
2022   # scratch:
2023   my(
2024     $rv,   # return value of callback
2025     $this, # current node
2026     $content_r, # child list of $this
2027   );
2028
2029   # THE BIG LOOP
2030   while(@C) {
2031     # Move to next item in this frame
2032     if(!defined($I[0]) or ++$I[0] >= @{$C[0]}) {
2033       # We either went off the end of this list, or aborted the list
2034       # So call the post-order callback:
2035       if($c_post
2036          and defined $I[0]
2037          and @C > 1
2038           # to keep the next line from autovivifying
2039          and defined($this = $C[1][ $I[1] ]) # sanity, and
2040           # suppress callbacks on exiting the fictional top frame
2041          and ref($this) # sanity
2042          and not(
2043                  $this->{'_empty_element'}
2044                  || $empty_element_map->{$this->{'_tag'} || ''}
2045                 ) # things that don't get post-order callbacks
2046       ) {
2047         shift @I;
2048         shift @C;
2049         #print "Post! at depth", scalar(@I), "\n";
2050         $rv = $c_post->(
2051            #map $_, # copy to avoid any messiness
2052            $this,           # 0: this
2053            0,               # 1: startflag (0 for post-order call)
2054            @I - 1,          # 2: depth
2055         );
2056
2057         if(defined($rv) and ref($rv) eq $travsignal_package) {
2058           $rv = $$rv; #deref
2059           if($rv eq 'ABORT') {
2060             last; # end of this excursion!
2061           } elsif($rv eq 'PRUNE') {
2062             # NOOP on post!!
2063           } elsif($rv eq 'PRUNE_SOFTLY') {
2064             # NOOP on post!!
2065           } elsif($rv eq 'OK') {
2066             # noop
2067           } elsif($rv eq 'PRUNE_UP') {
2068             $I[0] = undef;
2069           } else {
2070             die "Unknown travsignal $rv\n";
2071             # should never happen
2072           }
2073         }
2074       }
2075       else {
2076         shift @I;
2077         shift @C;
2078       }
2079       next;
2080     }
2081
2082     $this = $C[0][ $I[0] ];
2083
2084     if($c_pre) {
2085       if(defined $this and ref $this) { # element
2086         $rv = $c_pre->(
2087            #map $_, # copy to avoid any messiness
2088            $this,           # 0: this
2089            1,               # 1: startflag (1 for pre-order call)
2090            @I - 1,          # 2: depth
2091         );
2092       } else { # text segment
2093         next if $ignore_text;
2094         $rv = $c_pre->(
2095            #map $_, # copy to avoid any messiness
2096            $this,           # 0: this
2097            1,               # 1: startflag (1 for pre-order call)
2098            @I - 1,          # 2: depth
2099            $C[1][ $I[1] ],  # 3: parent
2100                # And there will always be a $C[1], since
2101                #  we can't start traversing at a text node
2102            $I[0]            # 4: index of self in parent's content list
2103         );
2104       }
2105       if(not $rv) { # returned false.  Same as PRUNE.
2106         next; # prune
2107       } elsif(ref($rv) eq $travsignal_package) {
2108         $rv = $$rv; # deref
2109         if($rv eq 'ABORT') {
2110           last; # end of this excursion!
2111         } elsif($rv eq 'PRUNE') {
2112           next;
2113         } elsif($rv eq 'PRUNE_SOFTLY') {
2114           if(ref($this)
2115              and
2116              not($this->{'_empty_element'}
2117                  || $empty_element_map->{$this->{'_tag'} || ''})
2118           ) {
2119             # push a dummy empty content list just to trigger a post callback
2120             unshift @I, -1;
2121             unshift @C, $NIL;
2122           }
2123           next;
2124         } elsif($rv eq 'OK') {
2125           # noop
2126         } elsif($rv eq 'PRUNE_UP') {
2127           $I[0] = undef;
2128           next;
2129
2130           # equivalent of last'ing out of the current child list.
2131
2132         # Used to have PRUNE_UP_SOFTLY and ABORT_SOFTLY here, but the code
2133         # for these was seriously upsetting, served no particularly clear
2134         # purpose, and could not, I think, be easily implemented with a
2135         # recursive routine.  All bad things!
2136         } else {
2137           die "Unknown travsignal $rv\n";
2138           # should never happen
2139         }
2140       }
2141       # else fall thru to meaning same as \'OK'.
2142     }
2143     # end of pre-order calling
2144
2145     # Now queue up content list for the current element...
2146     if(ref $this
2147        and
2148        not( # ...except for those which...
2149          not($content_r = $this->{'_content'} and @$content_r)
2150             # ...have empty content lists...
2151          and $this->{'_empty_element'} || $empty_element_map->{$this->{'_tag'} || ''}
2152             # ...and that don't get post-order callbacks
2153        )
2154     ) {
2155       unshift @I, -1;
2156       unshift @C, $content_r || $NIL;
2157       #print $this->{'_tag'}, " ($this) adds content_r ", $C[0], "\n";
2158     }
2159   }
2160   return $start;
2161 }
2162
2163
2164 =head1 SECONDARY STRUCTURAL METHODS
2165
2166 These methods all involve some structural aspect of the tree;
2167 either they report some aspect of the tree's structure, or they involve
2168 traversal down the tree, or walking up the tree.
2169
2170 =head2 $h->is_inside('tag', ...) or $h->is_inside($element, ...)
2171
2172 Returns true if the $h element is, or is contained anywhere inside an
2173 element that is any of the ones listed, or whose tag name is any of
2174 the tag names listed.
2175
2176 =cut
2177
2178 sub is_inside {
2179   my $self = shift;
2180   return undef unless @_; # if no items specified, I guess this is right.
2181
2182   my $current = $self;
2183       # the loop starts by looking at the given element
2184   while (defined $current and ref $current) {
2185     for (@_) {
2186       if(ref) { # element
2187         return 1 if $_ eq $current;
2188       } else { # tag name
2189         return 1 if $_ eq $current->{'_tag'};
2190       }
2191     }
2192     $current = $current->{'_parent'};
2193   }
2194   0;
2195 }
2196
2197 =head2 $h->is_empty()
2198
2199 Returns true if $h has no content, i.e., has no elements or text
2200 segments under it.  In other words, this returns true if $h is a leaf
2201 node, AKA a terminal node.  Do not confuse this sense of "empty" with
2202 another sense that it can have in SGML/HTML/XML terminology, which
2203 means that the element in question is of the type (like HTML's "hr",
2204 "br", "img", etc.) that I<can't> have any content.
2205
2206 That is, a particular "p" element may happen to have no content, so
2207 $that_p_element->is_empty will be true -- even though the prototypical
2208 "p" element isn't "empty" (not in the way that the prototypical "hr"
2209 element is).
2210
2211 If you think this might make for potentially confusing code, consider
2212 simply using the clearer exact equivalent:  not($h->content_list)
2213
2214 =cut
2215
2216 sub is_empty {
2217   my $self = shift;
2218   !$self->{'_content'} || !@{$self->{'_content'}};
2219 }
2220
2221
2222 =head2 $h->pindex()
2223
2224 Return the index of the element in its parent's contents array, such
2225 that $h would equal
2226
2227   $h->parent->content->[$h->pindex]
2228   or
2229   ($h->parent->content_list)[$h->pindex]
2230
2231 assuming $h isn't root.  If the element $h is root, then
2232 $h->pindex returns undef.
2233
2234 =cut
2235
2236 sub pindex {
2237   my $self = shift;
2238
2239   my $parent = $self->{'_parent'} || return undef;
2240   my $pc =  $parent->{'_content'} || return undef;
2241   for(my $i = 0; $i < @$pc; ++$i) {
2242     return $i  if  ref $pc->[$i] and $pc->[$i] eq $self;
2243   }
2244   return undef; # we shouldn't ever get here
2245 }
2246
2247 #--------------------------------------------------------------------------
2248
2249 =head2 $h->left()
2250
2251 In scalar context: returns the node that's the immediate left sibling
2252 of $h.  If $h is the leftmost (or only) child of its parent (or has no
2253 parent), then this returns undef.
2254
2255 In list context: returns all the nodes that're the left siblings of $h
2256 (starting with the leftmost).  If $h is the leftmost (or only) child
2257 of its parent (or has no parent), then this returns empty-list.
2258
2259 (See also $h->preinsert(LIST).)
2260
2261 =cut
2262
2263 sub left {
2264   Carp::croak "left() is supposed to be an object method"
2265    unless ref $_[0];
2266   my $pc =
2267     (
2268      $_[0]->{'_parent'} || return
2269     )->{'_content'} || die "parent is childless?";
2270
2271   die "parent is childless" unless @$pc;
2272   return if @$pc == 1; # I'm an only child
2273
2274   if(wantarray) {
2275     my @out;
2276     foreach my $j (@$pc) {
2277       return @out if ref $j and $j eq $_[0];
2278       push @out, $j;
2279     }
2280   } else {
2281     for(my $i = 0; $i < @$pc; ++$i) {
2282       return $i ? $pc->[$i - 1] : undef
2283        if ref $pc->[$i] and $pc->[$i] eq $_[0];
2284     }
2285   }
2286
2287   die "I'm not in my parent's content list?";
2288   return;
2289 }
2290
2291 =head2 $h->right()
2292
2293 In scalar context: returns the node that's the immediate right sibling
2294 of $h.  If $h is the rightmost (or only) child of its parent (or has
2295 no parent), then this returns undef.
2296
2297 In list context: returns all the nodes that're the right siblings of
2298 $h, starting with the leftmost.  If $h is the rightmost (or only) child
2299 of its parent (or has no parent), then this returns empty-list.
2300
2301 (See also $h->postinsert(LIST).)
2302
2303 =cut
2304
2305 sub right {
2306   Carp::croak "right() is supposed to be an object method"
2307    unless ref $_[0];
2308   my $pc =
2309     (
2310      $_[0]->{'_parent'} || return
2311     )->{'_content'} || die "parent is childless?";
2312
2313   die "parent is childless" unless @$pc;
2314   return if @$pc == 1; # I'm an only child
2315
2316   if(wantarray) {
2317     my(@out, $seen);
2318     foreach my $j (@$pc) {
2319       if($seen) {
2320         push @out, $j;
2321       } else {
2322         $seen = 1 if ref $j and $j eq $_[0];
2323       }
2324     }
2325     die "I'm not in my parent's content list?" unless $seen;
2326     return @out;
2327   } else {
2328     for(my $i = 0; $i < @$pc; ++$i) {
2329       return +($i == $#$pc) ? undef : $pc->[$i+1]
2330        if ref $pc->[$i] and $pc->[$i] eq $_[0];
2331     }
2332     die "I'm not in my parent's content list?";
2333     return;
2334   }
2335 }
2336
2337 #--------------------------------------------------------------------------
2338
2339 =head2 $h->address()
2340
2341 Returns a string representing the location of this node in the tree.
2342 The address consists of numbers joined by a '.', starting with '0',
2343 and followed by the pindexes of the nodes in the tree that are
2344 ancestors of $h, starting from the top.
2345
2346 So if the way to get to a node starting at the root is to go to child
2347 2 of the root, then child 10 of that, and then child 0 of that, and
2348 then you're there -- then that node's address is "0.2.10.0".
2349
2350 As a bit of a special case, the address of the root is simply "0".
2351
2352 I forsee this being used mainly for debugging, but you may
2353 find your own uses for it.
2354
2355 =head2 $h->address($address)
2356
2357 This returns the node (whether element or text-segment) at
2358 the given address in the tree that $h is a part of.  (That is,
2359 the address is resolved starting from $h->root.)
2360
2361 If there is no node at the given address, this returns undef.
2362
2363 You can specify "relative addressing" (i.e., that indexing is supposed
2364 to start from $h and not from $h->root) by having the address start
2365 with a period -- e.g., $h->address(".3.2") will look at child 3 of $h,
2366 and child 2 of that.
2367
2368 =cut
2369
2370 sub address {
2371   if(@_ == 1) { # report-address form
2372     return
2373       join('.',
2374         reverse( # so it starts at the top
2375           map($_->pindex() || '0', # so that root's undef -> '0'
2376             $_[0], # self and...
2377             $_[0]->lineage
2378           )
2379         )
2380       )
2381     ;
2382   } else { # get-node-at-address
2383     my @stack = split(/\./, $_[1]);
2384     my $here;
2385
2386     if(@stack and !length $stack[0]) { # relative addressing
2387       $here = $_[0];
2388       shift @stack;
2389     } else { # absolute addressing
2390       return undef unless 0 == shift @stack; # to pop the initial 0-for-root
2391       $here = $_[0]->root;
2392     }
2393
2394     while(@stack) {
2395       return undef
2396        unless
2397          $here->{'_content'}
2398          and @{$here->{'_content'}} > $stack[0];
2399             # make sure the index isn't too high
2400       $here = $here->{'_content'}[ shift @stack ];
2401       return undef if @stack and not ref $here;
2402         # we hit a text node when we expected a non-terminal element node
2403     }
2404
2405     return $here;
2406   }
2407 }
2408
2409
2410 =head2 $h->depth()
2411
2412 Returns a number expressing C<$h>'s depth within its tree, i.e., how many
2413 steps away it is from the root.  If C<$h> has no parent (i.e., is root),
2414 its depth is 0.
2415
2416 =cut
2417
2418 sub depth {
2419   my $here = $_[0];
2420   my $depth = 0;
2421   while(defined($here = $here->{'_parent'}) and ref($here)) {
2422     ++$depth;
2423   }
2424   return $depth;
2425 }
2426
2427
2428 =head2 $h->root()
2429
2430 Returns the element that's the top of C<$h>'s tree.  If C<$h> is
2431 root, this just returns C<$h>.  (If you want to test whether C<$h>
2432 I<is> the root, instead of asking what its root is, just test
2433 C<< not($h->parent) >>.)
2434
2435 =cut
2436
2437 sub root {
2438   my $here = my $root = shift;
2439   while(defined($here = $here->{'_parent'}) and ref($here)) {
2440     $root = $here;
2441   }
2442   return $root;
2443 }
2444
2445
2446 =head2 $h->lineage()
2447
2448 Returns the list of C<$h>'s ancestors, starting with its parent,
2449 and then that parent's parent, and so on, up to the root.  If C<$h>
2450 is root, this returns an empty list.
2451
2452 If you simply want a count of the number of elements in C<$h>'s lineage,
2453 use $h->depth.
2454
2455 =cut
2456
2457 sub lineage {
2458   my $here = shift;
2459   my @lineage;
2460   while(defined($here = $here->{'_parent'}) and ref($here)) {
2461     push @lineage, $here;
2462   }
2463   return @lineage;
2464 }
2465
2466
2467 =head2 $h->lineage_tag_names()
2468
2469 Returns the list of the tag names of $h's ancestors, starting
2470 with its parent, and that parent's parent, and so on, up to the
2471 root.  If $h is root, this returns an empty list.
2472 Example output: C<('em', 'td', 'tr', 'table', 'body', 'html')>
2473
2474 =cut
2475
2476 sub lineage_tag_names {
2477   my $here = my $start = shift;
2478   my @lineage_names;
2479   while(defined($here = $here->{'_parent'}) and ref($here)) {
2480     push @lineage_names, $here->{'_tag'};
2481   }
2482   return @lineage_names;
2483 }
2484
2485
2486 =head2 $h->descendants()
2487
2488 In list context, returns the list of all $h's descendant elements,
2489 listed in pre-order (i.e., an element appears before its
2490 content-elements).  Text segments DO NOT appear in the list.
2491 In scalar context, returns a count of all such elements.
2492
2493 =head2 $h->descendents()
2494
2495 This is just an alias to the C<descendants> method.
2496
2497 =cut
2498
2499 sub descendents { shift->descendants(@_) }
2500
2501 sub descendants {
2502   my $start = shift;
2503   if(wantarray) {
2504     my @descendants;
2505     $start->traverse(
2506       [ # pre-order sub only
2507         sub {
2508           push(@descendants, $_[0]);
2509           return 1;
2510         },
2511         undef # no post
2512       ],
2513       1, # ignore text
2514     );
2515     shift @descendants; # so $self doesn't appear in the list
2516     return @descendants;
2517   } else { # just returns a scalar
2518     my $descendants = -1; # to offset $self being counted
2519     $start->traverse(
2520       [ # pre-order sub only
2521         sub {
2522           ++$descendants;
2523           return 1;
2524         },
2525         undef # no post
2526       ],
2527       1, # ignore text
2528     );
2529     return $descendants;
2530   }
2531 }
2532
2533
2534 =head2 $h->find_by_tag_name('tag', ...)
2535
2536 In list context, returns a list of elements at or under $h that have
2537 any of the specified tag names.  In scalar context, returns the first
2538 (in pre-order traversal of the tree) such element found, or undef if
2539 none.
2540
2541 =head2 $h->find('tag', ...)
2542
2543 This is just an alias to C<find_by_tag_name>.  (There was once
2544 going to be a whole find_* family of methods, but then look_down
2545 filled that niche, so there turned out not to be much reason for the
2546 verboseness of the name "find_by_tag_name".)
2547
2548 =cut
2549
2550 sub find { shift->find_by_tag_name( @_ ) }
2551  # yup, a handy alias
2552
2553
2554 sub find_by_tag_name {
2555   my(@pile) = shift(@_); # start out the to-do stack for the traverser
2556   Carp::croak "find_by_tag_name can be called only as an object method"
2557    unless ref $pile[0];
2558   return() unless @_;
2559   my(@tags) = $pile[0]->_fold_case(@_);
2560   my(@matching, $this, $this_tag);
2561   while(@pile) {
2562     $this_tag = ($this = shift @pile)->{'_tag'};
2563     foreach my $t (@tags) {
2564       if($t eq $this_tag) {
2565         if(wantarray) {
2566           push @matching, $this;
2567           last;
2568         } else {
2569           return $this;
2570         }
2571       }
2572     }
2573     unshift @pile, grep ref($_), @{$this->{'_content'} || next};
2574   }
2575   return @matching if wantarray;
2576   return;
2577 }
2578
2579 =head2 $h->find_by_attribute('attribute', 'value')
2580
2581 In a list context, returns a list of elements at or under $h that have
2582 the specified attribute, and have the given value for that attribute.
2583 In a scalar context, returns the first (in pre-order traversal of the
2584 tree) such element found, or undef if none.
2585
2586 This method is B<deprecated> in favor of the more expressive
2587 C<look_down> method, which new code should use instead.
2588
2589 =cut
2590
2591
2592 sub find_by_attribute {
2593   # We could limit this to non-internal attributes, but hey.
2594   my($self, $attribute, $value) = @_;
2595   Carp::croak "Attribute must be a defined value!" unless defined $attribute;
2596   $attribute =  $self->_fold_case($attribute);
2597
2598   my @matching;
2599   my $wantarray = wantarray;
2600   my $quit;
2601   $self->traverse(
2602     [ # pre-order only
2603       sub {
2604         if( exists $_[0]{$attribute}
2605              and $_[0]{$attribute} eq $value
2606         ) {
2607           push @matching, $_[0];
2608           return HTML::Element::ABORT unless $wantarray; # only take the first
2609         }
2610         1; # keep traversing
2611       },
2612       undef # no post
2613     ],
2614     1, # yes, ignore text nodes.
2615   );
2616
2617   if($wantarray) {
2618     return @matching;
2619   } else {
2620     return undef unless @matching;
2621     return $matching[0];
2622   }
2623 }
2624
2625 #--------------------------------------------------------------------------
2626
2627 =head2 $h->look_down( ...criteria... )
2628
2629 This starts at $h and looks thru its element descendants (in
2630 pre-order), looking for elements matching the criteria you specify.
2631 In list context, returns all elements that match all the given
2632 criteria; in scalar context, returns the first such element (or undef,
2633 if nothing matched).
2634
2635 There are three kinds of criteria you can specify:
2636
2637 =over
2638
2639 =item (attr_name, attr_value)
2640
2641 This means you're looking for an element with that value for that
2642 attribute.  Example: C<"alt", "pix!">.  Consider that you can search
2643 on internal attribute values too: C<"_tag", "p">.
2644
2645 =item (attr_name, qr/.../)
2646
2647 This means you're looking for an element whose value for that
2648 attribute matches the specified Regexp object.
2649
2650 =item a coderef
2651
2652 This means you're looking for elements where coderef->(each_element)
2653 returns true.  Example:
2654
2655   my @wide_pix_images
2656     = $h->look_down(
2657                     "_tag", "img",
2658                     "alt", "pix!",
2659                     sub { $_[0]->attr('width') > 350 }
2660                    );
2661
2662 =back
2663
2664 Note that C<(attr_name, attr_value)> and C<(attr_name, qr/.../)>
2665 criteria are almost always faster than coderef
2666 criteria, so should presumably be put before them in your list of
2667 criteria.  That is, in the example above, the sub ref is called only
2668 for elements that have already passed the criteria of having a "_tag"
2669 attribute with value "img", and an "alt" attribute with value "pix!".
2670 If the coderef were first, it would be called on every element, and
2671 I<then> what elements pass that criterion (i.e., elements for which
2672 the coderef returned true) would be checked for their "_tag" and "alt"
2673 attributes.
2674
2675 Note that comparison of string attribute-values against the string
2676 value in C<(attr_name, attr_value)> is case-INsensitive!  A criterion
2677 of C<('align', 'right')> I<will> match an element whose "align" value
2678 is "RIGHT", or "right" or "rIGhT", etc.
2679
2680 Note also that C<look_down> considers "" (empty-string) and undef to
2681 be different things, in attribute values.  So this:
2682
2683   $h->look_down("alt", "")
2684
2685 will find elements I<with> an "alt" attribute, but where the value for
2686 the "alt" attribute is "".  But this:
2687
2688   $h->look_down("alt", undef)
2689
2690 is the same as:
2691
2692   $h->look_down(sub { !defined($_[0]->attr('alt')) } )
2693
2694 That is, it finds elements that do not have an "alt" attribute at all
2695 (or that do have an "alt" attribute, but with a value of undef --
2696 which is not normally possible).
2697
2698 Note that when you give several criteria, this is taken to mean you're
2699 looking for elements that match I<all> your criterion, not just I<any>
2700 of them.  In other words, there is an implicit "and", not an "or".  So
2701 if you wanted to express that you wanted to find elements with a
2702 "name" attribute with the value "foo" I<or> with an "id" attribute
2703 with the value "baz", you'd have to do it like:
2704
2705   @them = $h->look_down(
2706     sub {
2707       # the lcs are to fold case
2708       lc($_[0]->attr('name')) eq 'foo'
2709       or lc($_[0]->attr('id')) eq 'baz'
2710     }
2711   );
2712
2713 Coderef criteria are more expressive than C<(attr_name, attr_value)>
2714 and C<(attr_name, qr/.../)>
2715 criteria, and all C<(attr_name, attr_value)>
2716 and C<(attr_name, qr/.../)>
2717 criteria could be
2718 expressed in terms of coderefs.  However, C<(attr_name, attr_value)>
2719 and C<(attr_name, qr/.../)>
2720 criteria are a convenient shorthand.  (In fact, C<look_down> itself is
2721 basically "shorthand" too, since anything you can do with C<look_down>
2722 you could do by traversing the tree, either with the C<traverse>
2723 method or with a routine of your own.  However, C<look_down> often
2724 makes for very concise and clear code.)
2725
2726 =cut
2727
2728 sub look_down {
2729   ref($_[0]) or Carp::croak "look_down works only as an object method";
2730
2731   my @criteria;
2732   for(my $i = 1; $i < @_;) {
2733     Carp::croak "Can't use undef as an attribute name" unless defined $_[$i];
2734     if(ref $_[$i]) {
2735       Carp::croak "A " . ref($_[$i]) . " value is not a criterion"
2736         unless ref $_[$i] eq 'CODE';
2737       push @criteria, $_[ $i++ ];
2738     } else {
2739       Carp::croak "param list to look_down ends in a key!" if $i == $#_;
2740       push @criteria, [ scalar($_[0]->_fold_case($_[$i])),
2741                         defined($_[$i+1])
2742                           ? ( ( ref $_[$i+1] ? $_[$i+1] : lc( $_[$i+1] )), ref( $_[$i+1] ) )
2743                                                # yes, leave that LC!
2744                           : undef
2745                       ];
2746       $i += 2;
2747     }
2748   }
2749   Carp::croak "No criteria?" unless @criteria;
2750
2751   my(@pile) = ($_[0]);
2752   my(@matching, $val, $this);
2753  Node:
2754   while(defined($this = shift @pile)) {
2755     # Yet another traverser implemented with merely iterative code.
2756     foreach my $c (@criteria) {
2757       if(ref($c) eq 'CODE') {
2758         next Node unless $c->($this);  # jump to the continue block
2759       } else { # it's an attr-value pair
2760         next Node  # jump to the continue block
2761           if # two values are unequal if:
2762             (defined($val = $this->{ $c->[0] }))
2763               ? (
2764                   !defined $c->[1]  # actual is def, critval is undef => fail
2765                      # allow regex matching
2766                     # allow regex matching
2767                   or (
2768                   $c->[2] eq 'Regexp'
2769                     ? $val !~ $c->[1]
2770                     : ( ref $val ne $c->[2]
2771                    # have unequal ref values => fail
2772                   or lc($val) ne lc($c->[1])
2773                    # have unequal lc string values => fail
2774                   ))
2775                 )
2776               : (defined $c->[1]) # actual is undef, critval is def => fail
2777       }
2778     }
2779     # We make it this far only if all the criteria passed.
2780     return $this unless wantarray;
2781     push @matching, $this;
2782   } continue {
2783     unshift @pile, grep ref($_), @{$this->{'_content'} || $nillio};
2784   }
2785   return @matching if wantarray;
2786   return;
2787 }
2788
2789
2790 =head2 $h->look_up( ...criteria... )
2791
2792 This is identical to $h->look_down, except that whereas $h->look_down
2793 basically scans over the list:
2794
2795    ($h, $h->descendants)
2796
2797 $h->look_up instead scans over the list
2798
2799    ($h, $h->lineage)
2800
2801 So, for example, this returns all ancestors of $h (possibly including
2802 $h itself) that are "td" elements with an "align" attribute with a
2803 value of "right" (or "RIGHT", etc.):
2804
2805    $h->look_up("_tag", "td", "align", "right");
2806
2807 =cut
2808
2809 sub look_up {
2810   ref($_[0]) or Carp::croak "look_up works only as an object method";
2811
2812   my @criteria;
2813   for(my $i = 1; $i < @_;) {
2814     Carp::croak "Can't use undef as an attribute name" unless defined $_[$i];
2815     if(ref $_[$i]) {
2816       Carp::croak "A " . ref($_[$i]) . " value is not a criterion"
2817         unless ref $_[$i] eq 'CODE';
2818       push @criteria, $_[ $i++ ];
2819     } else {
2820       Carp::croak "param list to look_up ends in a key!" if $i == $#_;
2821       push @criteria, [ scalar($_[0]->_fold_case($_[$i])),
2822                         defined($_[$i+1])
2823                           ? ( ( ref $_[$i+1] ? $_[$i+1] : lc( $_[$i+1] )), ref( $_[$i+1] ) )
2824                           : undef  # Yes, leave that LC!
2825                       ];
2826       $i += 2;
2827     }
2828   }
2829   Carp::croak "No criteria?" unless @criteria;
2830
2831   my(@matching, $val);
2832   my $this = $_[0];
2833  Node:
2834   while(1) {
2835     # You'll notice that the code here is almost the same as for look_down.
2836     foreach my $c (@criteria) {
2837       if(ref($c) eq 'CODE') {
2838         next Node unless $c->($this);  # jump to the continue block
2839       } else { # it's an attr-value pair
2840         next Node  # jump to the continue block
2841           if # two values are unequal if:
2842             (defined($val = $this->{ $c->[0] }))
2843               ? (
2844                   !defined $c->[1]  # actual is def, critval is undef => fail
2845                   or (
2846                   $c->[2] eq 'Regexp'
2847                     ? $val !~ $c->[1]
2848                     : ( ref $val ne $c->[2]
2849                    # have unequal ref values => fail
2850                   or lc($val) ne $c->[1]
2851                    # have unequal lc string values => fail
2852                   ))
2853                 )
2854               : (defined $c->[1]) # actual is undef, critval is def => fail
2855       }
2856     }
2857     # We make it this far only if all the criteria passed.
2858     return $this unless wantarray;
2859     push @matching, $this;
2860   } continue {
2861     last unless defined($this = $this->{'_parent'}) and ref $this;
2862   }
2863
2864   return @matching if wantarray;
2865   return;
2866 }
2867
2868 #--------------------------------------------------------------------------
2869
2870 =head2 $h->traverse(...options...)
2871
2872 Lengthy discussion of HTML::Element's unnecessary and confusing
2873 C<traverse> method has been moved to a separate file:
2874 L<HTML::Element::traverse>
2875
2876 =head2 $h->attr_get_i('attribute')
2877
2878 In list context, returns a list consisting of the values of the given
2879 attribute for $self and for all its ancestors starting from $self and
2880 working its way up.  Nodes with no such attribute are skipped.
2881 ("attr_get_i" stands for "attribute get, with inheritance".)
2882 In scalar context, returns the first such value, or undef if none.
2883
2884 Consider a document consisting of:
2885
2886    <html lang='i-klingon'>
2887      <head><title>Pati Pata</title></head>
2888      <body>
2889        <h1 lang='la'>Stuff</h1>
2890        <p lang='es-MX' align='center'>
2891          Foo bar baz <cite>Quux</cite>.
2892        </p>
2893        <p>Hooboy.</p>
2894      </body>
2895    </html>
2896
2897 If $h is the "cite" element, $h->attr_get_i("lang") in list context
2898 will return the list ('es-MX', 'i-klingon').  In scalar context, it
2899 will return the value 'es-MX'.
2900
2901 If you call with multiple attribute names...
2902
2903 =head2 $h->attr_get_i('a1', 'a2', 'a3')
2904
2905 ...in list context, this will return a list consisting of
2906 the values of these attributes which exist in $self and its ancestors.
2907 In scalar context, this returns the first value (i.e., the value of
2908 the first existing attribute from the first element that has
2909 any of the attributes listed).  So, in the above example,
2910
2911   $h->attr_get_i('lang', 'align');
2912
2913 will return:
2914
2915    ('es-MX', 'center', 'i-klingon') # in list context
2916   or
2917    'es-MX' # in scalar context.
2918
2919 But note that this:
2920
2921  $h->attr_get_i('align', 'lang');
2922
2923 will return:
2924
2925    ('center', 'es-MX', 'i-klingon') # in list context
2926   or
2927    'center' # in scalar context.
2928
2929 =cut
2930
2931 sub attr_get_i {
2932   if(@_ > 2) {
2933     my $self = shift;
2934     Carp::croak "No attribute names can be undef!"
2935      if grep !defined($_), @_;
2936     my @attributes = $self->_fold_case(@_);
2937     if(wantarray) {
2938       my @out;
2939       foreach my $x ($self, $self->lineage) {
2940         push @out, map { exists($x->{$_}) ? $x->{$_} : () } @attributes;
2941       }
2942       return @out;
2943     } else {
2944       foreach my $x ($self, $self->lineage) {
2945         foreach my $attribute (@attributes) {
2946           return $x->{$attribute} if exists $x->{$attribute}; # found
2947         }
2948       }
2949       return undef; # never found
2950     }
2951   } else {
2952     # Single-attribute search.  Simpler, most common, so optimize
2953     #  for the most common case
2954     Carp::croak "Attribute name must be a defined value!" unless defined $_[1];
2955     my $self = $_[0];
2956     my $attribute =  $self->_fold_case($_[1]);
2957     if(wantarray) { # list context
2958       return
2959         map {
2960           exists($_->{$attribute}) ? $_->{$attribute} : ()
2961         } $self, $self->lineage;
2962       ;
2963     } else { # scalar context
2964       foreach my $x ($self, $self->lineage) {
2965         return $x->{$attribute} if exists $x->{$attribute}; # found
2966       }
2967       return undef; # never found
2968     }
2969   }
2970 }
2971
2972
2973 =head2 $h->tagname_map()
2974
2975 Scans across C<$h> and all its descendants, and makes a hash (a
2976 reference to which is returned) where each entry consists of a key
2977 that's a tag name, and a value that's a reference to a list to all
2978 elements that have that tag name.  I.e., this method returns:
2979
2980    {
2981      # Across $h and all descendants...
2982      'a'   => [ ...list of all 'a'   elements... ],
2983      'em'  => [ ...list of all 'em'  elements... ],
2984      'img' => [ ...list of all 'img' elements... ],
2985    }
2986
2987 (There are entries in the hash for only those tagnames that occur
2988 at/under C<$h> -- so if there's no "img" elements, there'll be no
2989 "img" entry in the hashr(ref) returned.)
2990
2991 Example usage:
2992
2993     my $map_r = $h->tagname_map();
2994     my @heading_tags = sort grep m/^h\d$/s, keys %$map_r;
2995     if(@heading_tags) {
2996       print "Heading levels used: @heading_tags\n";
2997     } else {
2998       print "No headings.\n"
2999     }
3000
3001 =cut
3002
3003 sub tagname_map {
3004   my(@pile) = $_[0]; # start out the to-do stack for the traverser
3005   Carp::croak "find_by_tag_name can be called only as an object method"
3006    unless ref $pile[0];
3007   my(%map, $this_tag, $this);
3008   while(@pile) {
3009     $this_tag = ''
3010       unless defined(
3011        $this_tag = (
3012         $this = shift @pile
3013        )->{'_tag'}
3014       )
3015     ; # dance around the strange case of having an undef tagname.
3016     push @{ $map{$this_tag} ||= [] }, $this; # add to map
3017     unshift @pile, grep ref($_), @{$this->{'_content'} || next}; # traverse
3018   }
3019   return \%map;
3020 }
3021
3022
3023 =head2 $h->extract_links() or $h->extract_links(@wantedTypes)
3024
3025 Returns links found by traversing the element and all of its children
3026 and looking for attributes (like "href" in an "a" element, or "src" in
3027 an "img" element) whose values represent links.  The return value is a
3028 I<reference> to an array.  Each element of the array is reference to
3029 an array with I<four> items: the link-value, the element that has the
3030 attribute with that link-value, and the name of that attribute, and
3031 the tagname of that element.
3032 (Example: C<['http://www.suck.com/',> I<$elem_obj> C<, 'href', 'a']>.)
3033 You may or may not end up using the
3034 element itself -- for some purposes, you may use only the link value.
3035
3036 You might specify that you want to extract links from just some kinds
3037 of elements (instead of the default, which is to extract links from
3038 I<all> the kinds of elements known to have attributes whose values
3039 represent links).  For instance, if you want to extract links from
3040 only "a" and "img" elements, you could code it like this:
3041
3042   for (@{  $e->extract_links('a', 'img')  }) {
3043       my($link, $element, $attr, $tag) = @$_;
3044       print
3045         "Hey, there's a $tag that links to ",
3046         $link, ", in its $attr attribute, at ",
3047         $element->address(), ".\n";
3048   }
3049
3050 =cut
3051
3052
3053 sub extract_links {
3054     my $start = shift;
3055
3056     my %wantType;
3057     @wantType{$start->_fold_case(@_)} = (1) x @_; # if there were any
3058     my $wantType = scalar(@_);
3059
3060     my @links;
3061
3062     # TODO: add xml:link?
3063
3064     my($link_attrs, $tag, $self, $val); # scratch for each iteration
3065     $start->traverse(
3066       [
3067         sub { # pre-order call only
3068           $self = $_[0];
3069
3070           $tag = $self->{'_tag'};
3071           return 1 if $wantType && !$wantType{$tag};  # if we're selective
3072
3073           if(defined(  $link_attrs = $HTML::Element::linkElements{$tag}  )) {
3074             # If this is a tag that has any link attributes,
3075             #  look over possibly present link attributes,
3076             #  saving the value, if found.
3077             for (ref($link_attrs) ? @$link_attrs : $link_attrs) {
3078               if(defined(  $val = $self->attr($_)  )) {
3079                 push(@links, [$val, $self, $_, $tag])
3080               }
3081             }
3082           }
3083           1; # return true, so we keep recursing
3084         },
3085         undef
3086       ],
3087       1, # ignore text nodes
3088     );
3089     \@links;
3090 }
3091
3092
3093 =head2 $h->simplify_pres
3094
3095 In text bits under PRE elements that are at/under $h, this routine
3096 nativizes all newlines, and expands all tabs.
3097
3098 That is, if you read a file with lines delimited by C<\cm\cj>'s, the
3099 text under PRE areas will have C<\cm\cj>'s instead of C<\n>'s. Calling
3100 $h->nativize_pre_newlines on such a tree will turn C<\cm\cj>'s into
3101 C<\n>'s.
3102
3103 Tabs are expanded to however many spaces it takes to get
3104 to the next 8th column -- the usual way of expanding them.
3105
3106 =cut
3107
3108 sub simplify_pres {
3109   my $pre = 0;
3110
3111   my $sub;
3112   my $line;
3113   $sub = sub {
3114     ++$pre if $_[0]->{'_tag'} eq 'pre';
3115     foreach my $it (@{ $_[0]->{'_content'} || return }) {
3116       if(ref $it) {
3117         $sub->( $it );  # recurse!
3118       } elsif($pre) {
3119         #$it =~ s/(?:(?:\cm\cj*)|(?:\cj))/\n/g;
3120
3121         $it =
3122           join "\n",
3123           map {;
3124             $line = $_;
3125             while($line =~
3126              s/^([^\t]*)(\t+)/$1.(" " x ((length($2)<<3)-(length($1)&7)))/e
3127               # Sort of adapted from Text::Tabs -- yes, it's hardwired-in that
3128               # tabs are at every EIGHTH column.
3129             ){}
3130             $line;
3131           }
3132           split /(?:(?:\cm\cj*)|(?:\cj))/, $it, -1
3133         ;
3134       }
3135     }
3136     --$pre if $_[0]->{'_tag'} eq 'pre';
3137     return;
3138   };
3139   $sub->( $_[0] );
3140
3141   undef $sub;
3142   return;
3143
3144 }
3145
3146
3147
3148 =head2 $h->same_as($i)
3149
3150 Returns true if $h and $i are both elements representing the same tree
3151 of elements, each with the same tag name, with the same explicit
3152 attributes (i.e., not counting attributes whose names start with "_"),
3153 and with the same content (textual, comments, etc.).
3154
3155 Sameness of descendant elements is tested, recursively, with
3156 C<$child1-E<gt>same_as($child_2)>, and sameness of text segments is tested
3157 with C<$segment1 eq $segment2>.
3158
3159 =cut
3160
3161 sub same_as {
3162   die 'same_as() takes only one argument: $h->same_as($i)' unless @_ == 2;
3163   my($h,$i) = @_[0,1];
3164   die "same_as() can be called only as an object method" unless ref $h;
3165
3166   return 0 unless defined $i and ref $i;
3167    # An element can't be same_as anything but another element!
3168    # They needn't be of the same class, tho.
3169
3170   return 1 if $h eq $i;
3171    # special (if rare) case: anything is the same as... itself!
3172
3173   # assumes that no content lists in/under $h or $i contain subsequent
3174   #  text segments, like: ['foo', ' bar']
3175
3176   # compare attributes now.
3177   #print "Comparing tags of $h and $i...\n";
3178
3179   return 0 unless $h->{'_tag'} eq $i->{'_tag'};
3180     # only significant attribute whose name starts with "_"
3181
3182   #print "Comparing attributes of $h and $i...\n";
3183   # Compare attributes, but only the real ones.
3184   {
3185     # Bear in mind that the average element has very few attributes,
3186     #  and that element names are rather short.
3187     # (Values are a different story.)
3188
3189     # XXX I would think that /^[^_]/ would be faster, at least easier to read.
3190     my @keys_h = sort grep {length $_ and substr($_,0,1) ne '_'} keys %$h;
3191     my @keys_i = sort grep {length $_ and substr($_,0,1) ne '_'} keys %$i;
3192
3193     return 0 unless @keys_h == @keys_i;
3194      # different number of real attributes?  they're different.
3195     for(my $x = 0; $x < @keys_h; ++$x) {
3196       return 0 unless
3197        $keys_h[$x] eq $keys_i[$x] and  # same key name
3198        $h->{$keys_h[$x]} eq $i->{$keys_h[$x]}; # same value
3199        # Should this test for definedness on values?
3200        # People shouldn't be putting undef in attribute values, I think.
3201     }
3202   }
3203
3204   #print "Comparing children of $h and $i...\n";
3205   my $hcl = $h->{'_content'} || [];
3206   my $icl = $i->{'_content'} || [];
3207
3208   return 0 unless @$hcl == @$icl;
3209    # different numbers of children?  they're different.
3210
3211   if(@$hcl) {
3212     # compare each of the children:
3213     for(my $x = 0; $x < @$hcl; ++$x) {
3214       if(ref $hcl->[$x]) {
3215         return 0 unless ref($icl->[$x]);
3216          # an element can't be the same as a text segment
3217         # Both elements:
3218         return 0 unless $hcl->[$x]->same_as($icl->[$x]);  # RECURSE!
3219       } else {
3220         return 0 if ref($icl->[$x]);
3221          # a text segment can't be the same as an element
3222         # Both text segments:
3223         return 0 unless $hcl->[$x] eq $icl->[$x];
3224       }
3225     }
3226   }
3227
3228   return 1; # passed all the tests!
3229 }
3230
3231
3232 =head2 $h = HTML::Element->new_from_lol(ARRAYREF)
3233
3234 Resursively constructs a tree of nodes, based on the (non-cyclic)
3235 data structure represented by ARRAYREF, where that is a reference
3236 to an array of arrays (of arrays (of arrays (etc.))).
3237
3238 In each arrayref in that structure, different kinds of values are
3239 treated as follows:
3240
3241 =over
3242
3243 =item * Arrayrefs
3244
3245 Arrayrefs are considered to
3246 designate a sub-tree representing children for the node constructed
3247 from the current arrayref.
3248
3249 =item * Hashrefs
3250
3251 Hashrefs are considered to contain
3252 attribute-value pairs to add to the element to be constructed from
3253 the current arrayref
3254
3255 =item * Text segments
3256
3257 Text segments at the start of any arrayref
3258 will be considered to specify the name of the element to be
3259 constructed from the current araryref; all other text segments will
3260 be considered to specify text segments as children for the current
3261 arrayref.
3262
3263 =item * Elements
3264
3265 Existing element objects are either inserted into the treelet
3266 constructed, or clones of them are.  That is, when the lol-tree is
3267 being traversed and elements constructed based what's in it, if
3268 an existing element object is found, if it has no parent, then it is
3269 added directly to the treelet constructed; but if it has a parent,
3270 then C<$that_node-E<gt>clone> is added to the treelet at the
3271 appropriate place.
3272
3273 =back
3274
3275 An example will hopefully make this more obvious:
3276
3277   my $h = HTML::Element->new_from_lol(
3278     ['html',
3279       ['head',
3280         [ 'title', 'I like stuff!' ],
3281       ],
3282       ['body',
3283         {'lang', 'en-JP', _implicit => 1},
3284         'stuff',
3285         ['p', 'um, p < 4!', {'class' => 'par123'}],
3286         ['div', {foo => 'bar'}, '123'],
3287       ]
3288     ]
3289   );
3290   $h->dump;
3291
3292 Will print this:
3293
3294   <html> @0
3295     <head> @0.0
3296       <title> @0.0.0
3297         "I like stuff!"
3298     <body lang="en-JP"> @0.1 (IMPLICIT)
3299       "stuff"
3300       <p class="par123"> @0.1.1
3301         "um, p < 4!"
3302       <div foo="bar"> @0.1.2
3303         "123"
3304
3305 And printing $h->as_HTML will give something like:
3306
3307   <html><head><title>I like stuff!</title></head>
3308   <body lang="en-JP">stuff<p class="par123">um, p &lt; 4!
3309   <div foo="bar">123</div></body></html>
3310
3311 You can even do fancy things with C<map>:
3312
3313   $body->push_content(
3314     # push_content implicitly calls new_from_lol on arrayrefs...
3315     ['br'],
3316     ['blockquote',
3317       ['h2', 'Pictures!'],
3318       map ['p', $_],
3319       $body2->look_down("_tag", "img"),
3320         # images, to be copied from that other tree.
3321     ],
3322     # and more stuff:
3323     ['ul',
3324       map ['li', ['a', {'href'=>"$_.png"}, $_ ] ],
3325       qw(Peaches Apples Pears Mangos)
3326     ],
3327   );
3328
3329 =head2 @elements = HTML::Element->new_from_lol(ARRAYREFS)
3330
3331 Constructs I<several> elements, by calling
3332 new_from_lol for every arrayref in the ARRAYREFS list.
3333
3334   @elements = HTML::Element->new_from_lol(
3335     ['hr'],
3336     ['p', 'And there, on the door, was a hook!'],
3337   );
3338    # constructs two elements.
3339
3340 =cut
3341
3342 sub new_from_lol {
3343   my $class = shift;
3344   $class = ref($class) || $class;
3345    # calling as an object method is just the same as ref($h)->new_from_lol(...)
3346   my $lol = $_[1];
3347
3348   my @ancestor_lols;
3349    # So we can make sure there's no cyclicities in this lol.
3350    # That would be perverse, but one never knows.
3351   my($sub, $k, $v, $node); # last three are scratch values
3352   $sub = sub {
3353     #print "Building for $_[0]\n";
3354     my $lol = $_[0];
3355     return unless @$lol;
3356     my(@attributes, @children);
3357     Carp::croak "Cyclicity detected in source LOL tree, around $lol?!?"
3358      if grep($_ eq $lol, @ancestor_lols);
3359     push @ancestor_lols, $lol;
3360
3361     my $tag_name = 'null';
3362
3363     # Recursion in in here:
3364     for(my $i = 0; $i < @$lol; ++$i) { # Iterate over children
3365       if(ref($lol->[$i]) eq 'ARRAY') { # subtree: most common thing in loltree
3366         push @children, $sub->($lol->[$i]);
3367       } elsif(! ref($lol->[$i])) {
3368         if($i == 0) { # name
3369           $tag_name = $lol->[$i];
3370           Carp::croak "\"$tag_name\" isn't a good tag name!"
3371            if $tag_name =~ m/[<>\/\x00-\x20]/; # minimal sanity, certainly!
3372         } else { # text segment child
3373           push @children, $lol->[$i];
3374         }
3375       } elsif(ref($lol->[$i]) eq 'HASH') { # attribute hashref
3376         keys %{$lol->[$i]}; # reset the each-counter, just in case
3377         while(($k,$v) = each %{$lol->[$i]}) {
3378           push @attributes, $class->_fold_case($k), $v
3379             if defined $v and $k ne '_name' and $k ne '_content' and
3380             $k ne '_parent';
3381           # enforce /some/ sanity!
3382         }
3383       } elsif(UNIVERSAL::isa($lol->[$i], __PACKAGE__)) {
3384         if($lol->[$i]->{'_parent'}) { # if claimed
3385           #print "About to clone ", $lol->[$i], "\n";
3386           push @children, $lol->[$i]->clone();
3387         } else {
3388           push @children, $lol->[$i]; # if unclaimed...
3389           #print "Claiming ", $lol->[$i], "\n";
3390           $lol->[$i]->{'_parent'} = 1; # claim it NOW
3391            # This WILL be replaced by the correct value once we actually
3392            #  construct the parent, just after the end of this loop...
3393         }
3394       } else {
3395         Carp::croak "new_from_lol doesn't handle references of type "
3396           . ref($lol->[$i]);
3397       }
3398     }
3399
3400     pop @ancestor_lols;
3401     $node = $class->new($tag_name);
3402
3403     #print "Children: @children\n";
3404
3405     if($class eq __PACKAGE__) {  # Special-case it, for speed:
3406       %$node = (%$node, @attributes) if @attributes;
3407       #print join(' ', $node, ' ' , map("<$_>", %$node), "\n");
3408       if(@children) {
3409         $node->{'_content'} = \@children;
3410         foreach my $c (@children) { $c->{'_parent'} = $node if ref $c }
3411       }
3412     } else {  # Do it the clean way...
3413       #print "Done neatly\n";
3414       while(@attributes) { $node->attr(splice @attributes,0,2) }
3415       $node->push_content(@children) if @children;
3416     }
3417
3418     return $node;
3419   };
3420   # End of sub definition.
3421
3422
3423   if(wantarray) {
3424     my(@nodes) = map {; (ref($_) eq 'ARRAY') ? $sub->($_) : $_ } @_;
3425       # Let text bits pass thru, I guess.  This makes this act more like
3426       #  unshift_content et al.  Undocumented.
3427     undef $sub;
3428       # so it won't be in its own frame, so its refcount can hit 0
3429     return @nodes;
3430   } else {
3431     Carp::croak "new_from_lol in scalar context needs exactly one lol"
3432      unless @_ == 1;
3433     return $_[0] unless ref($_[0]) eq 'ARRAY';
3434      # used to be a fatal error.  still undocumented tho.
3435     $node = $sub->($_[0]);
3436     undef $sub;
3437       # so it won't be in its own frame, so its refcount can hit 0
3438     return $node;
3439   }
3440 }
3441
3442 =head2 $h->objectify_text()
3443
3444 This turns any text nodes under $h from mere text segments (strings)
3445 into real objects, pseudo-elements with a tag-name of "~text", and the
3446 actual text content in an attribute called "text".  (For a discussion
3447 of pseudo-elements, see the "tag" method, far above.)  This method is
3448 provided because, for some purposes, it is convenient or necessary to
3449 be able, for a given text node, to ask what element is its parent; and
3450 clearly this is not possible if a node is just a text string.
3451
3452 Note that these "~text" objects are not recognized as text nodes by
3453 methods like as_text.  Presumably you will want to call
3454 $h->objectify_text, perform whatever task that you needed that for,
3455 and then call $h->deobjectify_text before calling anything like
3456 $h->as_text.
3457
3458 =head2 $h->deobjectify_text()
3459
3460 This undoes the effect of $h->objectify_text.  That is, it takes any
3461 "~text" pseudo-elements in the tree at/under $h, and deletes each one,
3462 replacing each with the content of its "text" attribute.
3463
3464 Note that if $h itself is a "~text" pseudo-element, it will be
3465 destroyed -- a condition you may need to treat specially in your
3466 calling code (since it means you can't very well do anything with $h
3467 after that).  So that you can detect that condition, if $h is itself a
3468 "~text" pseudo-element, then this method returns the value of the
3469 "text" attribute, which should be a defined value; in all other cases,
3470 it returns undef.
3471
3472 (This method assumes that no "~text" pseudo-element has any children.)
3473
3474 =cut
3475
3476 sub objectify_text {
3477   my(@stack) = ($_[0]);
3478
3479   my($this);
3480   while(@stack) {
3481     foreach my $c (@{( $this = shift @stack )->{'_content'}}) {
3482       if(ref($c)) {
3483         unshift @stack, $c;  # visit it later.
3484       } else {
3485         $c = ( $this->{'_element_class'} || __PACKAGE__
3486              )->new('~text', 'text' => $c, '_parent' => $this);
3487       }
3488     }
3489   }
3490   return;
3491 }
3492
3493 sub deobjectify_text {
3494   my(@stack) = ($_[0]);
3495   my($old_node);
3496
3497   if( $_[0]{'_tag'} eq '~text') { # special case
3498     # Puts the $old_node variable to a different purpose
3499     if($_[0]{'_parent'}) {
3500       $_[0]->replace_with( $old_node = delete $_[0]{'text'} )->delete;
3501     } else {  # well, that's that, then!
3502       $old_node = delete $_[0]{'text'};
3503     }
3504
3505     if(ref($_[0]) eq __PACKAGE__) { # common case
3506       %{$_[0]} = ();  # poof!
3507     } else {
3508       # play nice:
3509       delete $_[0]{'_parent'};
3510       $_[0]->delete;
3511     }
3512     return '' unless defined $old_node; # sanity!
3513     return $old_node;
3514   }
3515
3516   while(@stack) {
3517     foreach my $c (@{(shift @stack)->{'_content'}}) {
3518       if(ref($c)) {
3519         if($c->{'_tag'} eq '~text') {
3520           $c = ($old_node = $c)->{'text'};
3521           if(ref($old_node) eq __PACKAGE__) { # common case
3522             %$old_node = ();  # poof!
3523           } else {
3524             # play nice:
3525             delete $old_node->{'_parent'};
3526             $old_node->delete;
3527           }
3528         } else {
3529           unshift @stack, $c;  # visit it later.
3530         }
3531       }
3532     }
3533   }
3534
3535   return undef;
3536 }
3537
3538
3539 =head2 $h->number_lists()
3540
3541 For every UL, OL, DIR, and MENU element at/under $h, this sets a
3542 "_bullet" attribute for every child LI element.  For LI children of an
3543 OL, the "_bullet" attribute's value will be something like "4.", "d.",
3544 "D.", "IV.", or "iv.", depending on the OL element's "type" attribute.
3545 LI children of a UL, DIR, or MENU get their "_bullet" attribute set
3546 to "*".
3547 There should be no other LIs (i.e., except as children of OL, UL, DIR,
3548 or MENU elements), and if there are, they are unaffected.
3549
3550 =cut
3551
3552 {
3553   # The next three subs are basically copied from Number::Latin,
3554   # based on a one-liner by Abigail.  Yes, I could simply require that
3555   # module, and a Roman numeral module too, but really, HTML-Tree already
3556   # has enough dependecies as it is; and anyhow, I don't need the functions
3557   # that do latin2int or roman2int.
3558   no integer;
3559
3560   sub _int2latin {
3561     return undef unless defined $_[0];
3562     return '0' if $_[0] < 1 and $_[0] > -1;
3563     return '-' . _i2l( abs int $_[0] ) if $_[0] <= -1; # tolerate negatives
3564     return       _i2l(     int $_[0] );
3565   }
3566
3567   sub _int2LATIN {
3568     # just the above plus uc
3569     return undef unless defined $_[0];
3570     return '0' if $_[0] < 1 and $_[0] > -1;
3571     return '-' . uc(_i2l( abs int $_[0] )) if $_[0] <= -1;  # tolerate negs
3572     return       uc(_i2l(     int $_[0] ));
3573   }
3574
3575   my @alpha = ('a' .. 'z');
3576   sub _i2l { # the real work
3577     my $int = $_[0] || return "";
3578     _i2l(int (($int - 1) / 26)) . $alpha[$int % 26 - 1];  # yes, recursive
3579     # Yes, 26 => is (26 % 26 - 1), which is -1 => Z!
3580   }
3581 }
3582
3583 {
3584   # And now, some much less impressive Roman numerals code:
3585
3586   my(@i) = ('', qw(I II III IV V VI VII VIII IX));
3587   my(@x) = ('', qw(X XX XXX XL L LX LXX LXXX XC));
3588   my(@c) = ('', qw(C CC CCC CD D DC DCC DCCC CM));
3589   my(@m) = ('', qw(M MM MMM));
3590
3591   sub _int2ROMAN {
3592     my($i, $pref);
3593     return '0' if 0 == ($i = int($_[0] || 0)); # zero is a special case
3594     return $i + 0 if $i <= -4000 or $i >= 4000;
3595     # Because over 3999 would require non-ASCII chars, like D-with-)-inside
3596     if($i < 0) { # grumble grumble tolerate negatives grumble
3597       $pref = '-'; $i = abs($i);
3598     } else {
3599       $pref = '';  # normal case
3600     }
3601
3602     my($x,$c,$m) = (0,0,0);
3603     if(     $i >= 10) { $x = $i / 10; $i %= 10;
3604       if(   $x >= 10) { $c = $x / 10; $x %= 10;
3605         if( $c >= 10) { $m = $c / 10; $c %= 10; } } }
3606     #print "m$m c$c x$x i$i\n";
3607
3608     return join('', $pref, $m[$m], $c[$c], $x[$x], $i[$i] );
3609   }
3610
3611   sub _int2roman { lc(_int2ROMAN($_[0])) }
3612 }
3613
3614 sub _int2int { $_[0] } # dummy
3615
3616 %list_type_to_sub = (
3617   'I' => \&_int2ROMAN,  'i' => \&_int2roman,
3618   'A' => \&_int2LATIN,  'a' => \&_int2latin,
3619   '1' => \&_int2int,
3620 );
3621
3622 sub number_lists {
3623   my(@stack) = ($_[0]);
3624   my($this, $tag, $counter, $numberer); # scratch
3625   while(@stack) { # yup, pre-order-traverser idiom
3626     if(($tag = ($this = shift @stack)->{'_tag'}) eq 'ol') {
3627       # Prep some things:
3628       $counter = (($this->{'start'} || '') =~ m<^\s*(\d{1,7})\s*$>s) ? $1 : 1;
3629       $numberer = $list_type_to_sub{ $this->{'type'} || ''}
3630                || $list_type_to_sub{'1'};
3631
3632       # Immeditately iterate over all children
3633       foreach my $c (@{ $this->{'_content'} || next}) {
3634         next unless ref $c;
3635         unshift @stack, $c;
3636         if($c->{'_tag'} eq 'li') {
3637           $counter = $1 if(($c->{'value'} || '') =~ m<^\s*(\d{1,7})\s*$>s);
3638           $c->{'_bullet'} = $numberer->($counter) . '.';
3639           ++$counter;
3640         }
3641       }
3642
3643     } elsif($tag eq 'ul' or $tag eq 'dir' or $tag eq 'menu') {
3644       # Immeditately iterate over all children
3645       foreach my $c (@{ $this->{'_content'} || next}) {
3646         next unless ref $c;
3647         unshift @stack, $c;
3648         $c->{'_bullet'} = '*' if $c->{'_tag'} eq 'li';
3649       }
3650
3651     } else {
3652       foreach my $c (@{ $this->{'_content'} || next}) {
3653         unshift @stack, $c if ref $c;
3654       }
3655     }
3656   }
3657   return;
3658 }
3659
3660
3661
3662 =head2 $h->has_insane_linkage
3663
3664 This method is for testing whether this element or the elements
3665 under it have linkage attributes (_parent and _content) whose values
3666 are deeply aberrant: if there are undefs in a content list; if an
3667 element appears in the content lists of more than one element;
3668 if the _parent attribute of an element doesn't match its actual
3669 parent; or if an element appears as its own descendant (i.e.,
3670 if there is a cyclicity in the tree).
3671
3672 This returns empty list (or false, in scalar context) if the subtree's
3673 linkage methods are sane; otherwise it returns two items (or true, in
3674 scalar context): the element where the error occurred, and a string
3675 describing the error.
3676
3677 This method is provided is mainly for debugging and troubleshooting --
3678 it should be I<quite impossible> for any document constructed via
3679 HTML::TreeBuilder to parse into a non-sane tree (since it's not
3680 the content of the tree per se that's in question, but whether
3681 the tree in memory was properly constructed); and it I<should> be
3682 impossible for you to produce an insane tree just thru reasonable
3683 use of normal documented structure-modifying methods.  But if you're
3684 constructing your own trees, and your program is going into infinite
3685 loops as during calls to traverse() or any of the secondary
3686 structural methods, as part of debugging, consider calling is_insane
3687 on the tree.
3688
3689 =cut
3690
3691 sub has_insane_linkage {
3692   my @pile = ($_[0]);
3693   my($c, $i, $p, $this); # scratch
3694
3695   # Another iterative traverser; this time much simpler because
3696   #  only in pre-order:
3697   my %parent_of = ($_[0], 'TOP-OF-SCAN');
3698   while(@pile) {
3699     $this = shift @pile;
3700     $c = $this->{'_content'} || next;
3701     return($this, "_content attribute is true but nonref.")
3702      unless ref($c) eq 'ARRAY';
3703     next unless @$c;
3704     for($i = 0; $i < @$c; ++$i) {
3705       return($this, "Child $i is undef")
3706        unless defined $c->[$i];
3707       if(ref($c->[$i])) {
3708         return($c->[$i], "appears in its own content list")
3709          if $c->[$i] eq $this;
3710         return($c->[$i],
3711           "appears twice in the tree: once under $this, once under $parent_of{$c->[$i]}"
3712         )
3713          if exists $parent_of{$c->[$i]};
3714         $parent_of{$c->[$i]} = ''.$this;
3715           # might as well just use the stringification of it.
3716
3717         return($c->[$i], "_parent attribute is wrong (not defined)")
3718          unless defined($p = $c->[$i]{'_parent'});
3719         return($c->[$i], "_parent attribute is wrong (nonref)")
3720          unless ref($p);
3721         return($c->[$i],
3722           "_parent attribute is wrong (is $p; should be $this)"
3723         )
3724          unless $p eq $this;
3725       }
3726     }
3727     unshift @pile, grep ref($_), @$c;
3728      # queue up more things on the pile stack
3729   }
3730   return; #okay
3731 }
3732
3733
3734 sub _asserts_fail {  # to be run on trusted documents only
3735   my(@pile) = ($_[0]);
3736   my(@errors, $this, $id, $assert, $parent, $rv);
3737   while(@pile) {
3738     $this = shift @pile;
3739     if(defined($assert = $this->{'assert'})) {
3740       $id = ($this->{'id'} ||= $this->address); # don't use '0' as an ID, okay?
3741       unless(ref($assert)) {
3742         package main;
3743         $assert = $this->{'assert'} = (
3744           $assert =~ m/\bsub\b/ ? eval($assert) : eval("sub {  $assert\n}")
3745         );
3746         if($@) {
3747           push @errors, [$this, "assertion at $id broke in eval: $@"];
3748           $assert = $this->{'assert'} = sub {};
3749         }
3750       }
3751       $parent = $this->{'_parent'};
3752       $rv = undef;
3753       eval {
3754         $rv =
3755          $assert->(
3756            $this, $this->{'_tag'}, $this->{'_id'}, # 0,1,2
3757            $parent ? ($parent, $parent->{'_tag'}, $parent->{'id'}) : () # 3,4,5
3758          )
3759       };
3760       if($@) {
3761         push @errors, [$this, "assertion at $id died: $@"];
3762       } elsif(!$rv) {
3763         push @errors, [$this, "assertion at $id failed"]
3764       }
3765        # else OK
3766     }
3767     push @pile, grep ref($_), @{$this->{'_content'} || next};
3768   }
3769   return @errors;
3770 }
3771
3772 1;
3773
3774 =head1 BUGS
3775
3776 * If you want to free the memory associated with a tree built of
3777 HTML::Element nodes, then you will have to delete it explicitly.
3778 See the $h->delete method, above.
3779
3780 * There's almost nothing to stop you from making a "tree" with
3781 cyclicities (loops) in it, which could, for example, make the
3782 traverse method go into an infinite loop.  So don't make
3783 cyclicities!  (If all you're doing is parsing HTML files,
3784 and looking at the resulting trees, this will never be a problem
3785 for you.)
3786
3787 * There's no way to represent comments or processing directives
3788 in a tree with HTML::Elements.  Not yet, at least.
3789
3790 * There's (currently) nothing to stop you from using an undefined
3791 value as a text segment.  If you're running under C<perl -w>, however,
3792 this may make HTML::Element's code produce a slew of warnings.
3793
3794 =head1 NOTES ON SUBCLASSING
3795
3796 You are welcome to derive subclasses from HTML::Element, but you
3797 should be aware that the code in HTML::Element makes certain
3798 assumptions about elements (and I'm using "element" to mean ONLY an
3799 object of class HTML::Element, or of a subclass of HTML::Element):
3800
3801 * The value of an element's _parent attribute must either be undef or
3802 otherwise false, or must be an element.
3803
3804 * The value of an element's _content attribute must either be undef or
3805 otherwise false, or a reference to an (unblessed) array.  The array
3806 may be empty; but if it has items, they must ALL be either mere
3807 strings (text segments), or elements.
3808
3809 * The value of an element's _tag attribute should, at least, be a
3810 string of printable characters.
3811
3812 Moreover, bear these rules in mind:
3813
3814 * Do not break encapsulation on objects.  That is, access their
3815 contents only thru $obj->attr or more specific methods.
3816
3817 * You should think twice before completely overriding any of the
3818 methods that HTML::Element provides.  (Overriding with a method that
3819 calls the superclass method is not so bad, though.)
3820
3821 =head1 SEE ALSO
3822
3823 L<HTML::Tree>; L<HTML::TreeBuilder>; L<HTML::AsSubs>; L<HTML::Tagset>;
3824 and, for the morbidly curious, L<HTML::Element::traverse>.
3825
3826 =head1 COPYRIGHT
3827
3828 Copyright 1995-1998 Gisle Aas, 1999-2004 Sean M. Burke, 2005 Andy Lester,
3829 2006 Pete Krawczyk.
3830
3831 This library is free software; you can redistribute it and/or
3832 modify it under the same terms as Perl itself.
3833
3834 This program is distributed in the hope that it will be useful, but
3835 without any warranty; without even the implied warranty of
3836 merchantability or fitness for a particular purpose.
3837
3838 =head1 AUTHOR
3839
3840 Currently maintained by Pete Krawczyk C<< <petek@cpan.org> >>
3841
3842 Original authors: Gisle Aas, Sean Burke and Andy Lester.
3843
3844 Thanks to Mark-Jason Dominus for a POD suggestion.
3845
3846 =cut
3847
3848 1;