X-Git-Url: http://git.maemo.org/git/?p=dh-make-perl;a=blobdiff_plain;f=dev%2Farm%2Flibhtml-parser-perl%2Flibhtml-parser-perl-3.56%2FParser.pm;fp=dev%2Farm%2Flibhtml-parser-perl%2Flibhtml-parser-perl-3.56%2FParser.pm;h=569013663af999f608db86fa2c51405f1007ccff;hp=0000000000000000000000000000000000000000;hb=f477fa73365d491991707e7ed9217b48d6994551;hpb=da95c414033799c3a62606f299c3c00b5c77ca11 diff --git a/dev/arm/libhtml-parser-perl/libhtml-parser-perl-3.56/Parser.pm b/dev/arm/libhtml-parser-perl/libhtml-parser-perl-3.56/Parser.pm new file mode 100644 index 0000000..5690136 --- /dev/null +++ b/dev/arm/libhtml-parser-perl/libhtml-parser-perl-3.56/Parser.pm @@ -0,0 +1,1234 @@ +package HTML::Parser; + +# Copyright 1996-2007, Gisle Aas. +# Copyright 1999-2000, Michael A. Chase. +# +# This library is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. + +use strict; +use vars qw($VERSION @ISA); + +$VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $ + +require HTML::Entities; + +require XSLoader; +XSLoader::load('HTML::Parser', $VERSION); + +sub new +{ + my $class = shift; + my $self = bless {}, $class; + return $self->init(@_); +} + + +sub init +{ + my $self = shift; + $self->_alloc_pstate; + + my %arg = @_; + my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); + if ($api_version >= 4) { + require Carp; + Carp::croak("API version $api_version not supported " . 
+ "by HTML::Parser $VERSION"); + } + + if ($api_version < 3) { + # Set up method callbacks compatible with HTML-Parser-2.xx + $self->handler(text => "text", "self,text,is_cdata"); + $self->handler(end => "end", "self,tagname,text"); + $self->handler(process => "process", "self,token0,text"); + $self->handler(start => "start", + "self,tagname,attr,attrseq,text"); + + $self->handler(comment => + sub { + my($self, $tokens) = @_; + for (@$tokens) { + $self->comment($_); + } + }, "self,tokens"); + + $self->handler(declaration => + sub { + my $self = shift; + $self->declaration(substr($_[0], 2, -1)); + }, "self,text"); + } + + if (my $h = delete $arg{handlers}) { + $h = {@$h} if ref($h) eq "ARRAY"; + while (my($event, $cb) = each %$h) { + $self->handler($event => @$cb); + } + } + + # In the end we try to assume plain attribute or handler + while (my($option, $val) = each %arg) { + if ($option =~ /^(\w+)_h$/) { + $self->handler($1 => @$val); + } + elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { + require Carp; + Carp::croak("Bad constructor option '$option'"); + } + else { + $self->$option($val); + } + } + + return $self; +} + + +sub parse_file +{ + my($self, $file) = @_; + my $opened; + if (!ref($file) && ref(\$file) ne "GLOB") { + # Assume $file is a filename + local(*F); + open(F, $file) || return undef; + binmode(F); # should we? good for byte counts + $opened++; + $file = *F; + } + my $chunk = ''; + while (read($file, $chunk, 512)) { + $self->parse($chunk) || last; + } + close($file) if $opened; + $self->eof; +} + + +sub netscape_buggy_comment # legacy +{ + my $self = shift; + require Carp; + Carp::carp("netscape_buggy_comment() is deprecated. " . 
+ "Please use the strict_comment() method instead"); + my $old = !$self->strict_comment; + $self->strict_comment(!shift) if @_; + return $old; +} + +# set up method stubs +sub text { } +*start = \&text; +*end = \&text; +*comment = \&text; +*declaration = \&text; +*process = \&text; + +1; + +__END__ + + +=head1 NAME + +HTML::Parser - HTML parser class + +=head1 SYNOPSIS + + use HTML::Parser (); + + # Create parser object + $p = HTML::Parser->new( api_version => 3, + start_h => [\&start, "tagname, attr"], + end_h => [\&end, "tagname"], + marked_sections => 1, + ); + + # Parse document text chunk by chunk + $p->parse($chunk1); + $p->parse($chunk2); + #... + $p->eof; # signal end of document + + # Parse directly from file + $p->parse_file("foo.html"); + # or + open(my $fh, "<:utf8", "foo.html") || die; + $p->parse_file($fh); + +=head1 DESCRIPTION + +Objects of the C class will recognize markup and +separate it from plain text (alias data content) in HTML +documents. As different kinds of markup and text are recognized, the +corresponding event handlers are invoked. + +C is not a generic SGML parser. We have tried to +make it able to deal with the HTML that is actually "out there", and +it normally parses as closely as possible to the way the popular web +browsers do it instead of strictly following one of the many HTML +specifications from W3C. Where there is disagreement, there is often +an option that you can enable to get the official behaviour. + +The document to be parsed may be supplied in arbitrary chunks. This +makes on-the-fly parsing as documents are received from the network +possible. + +If event driven parsing does not feel right for your application, you +might want to use C. This is an C +subclass that allows a more conventional program structure. + + +=head1 METHODS + +The following method is used to construct a new C object: + +=over + +=item $p = HTML::Parser->new( %options_and_handlers ) + +This class method creates a new C object and +returns it. 
Key/value argument pairs may be provided to assign event +handlers or initialize parser options. The handlers and parser +options can also be set or modified later by the method calls described below. + +If a top level key is in the form "_h" (e.g., "text_h") then it +assigns a handler to that event, otherwise it initializes a parser +option. The event handler specification value must be an array +reference. Multiple handlers may also be assigned with the 'handlers +=> [%handlers]' option. See examples below. + +If new() is called without any arguments, it will create a parser that +uses callback methods compatible with version 2 of C. +See the section on "version 2 compatibility" below for details. + +The special constructor option 'api_version => 2' can be used to +initialize version 2 callbacks while still setting other options and +handlers. The 'api_version => 3' option can be used if you don't want +to set any options and don't want to fall back to v2 compatible +mode. + +Examples: + + $p = HTML::Parser->new(api_version => 3, + text_h => [ sub {...}, "dtext" ]); + +This creates a new parser object with a text event handler subroutine +that receives the original text with general entities decoded. + + $p = HTML::Parser->new(api_version => 3, + start_h => [ 'my_start', "self,tokens" ]); + +This creates a new parser object with a start event handler method +that receives the $p and the tokens array. + + $p = HTML::Parser->new(api_version => 3, + handlers => { text => [\@array, "event,text"], + comment => [\@array, "event,text"], + }); + +This creates a new parser object that stores the event type and the +original text in @array for text and comment events. + +=back + +The following methods feed the HTML document +to the C object: + +=over + +=item $p->parse( $string ) + +Parse $string as the next chunk of the HTML document. The return +value is normally a reference to the parser object (i.e. $p). 
+Handlers invoked should not attempt to modify the $string in-place until +$p->parse returns. + +If an invoked event handler aborts parsing by calling $p->eof, then +$p->parse() will return a FALSE value. + +=item $p->parse( $code_ref ) + +If a code reference is passed as the argument to be parsed, then the +chunks to be parsed are obtained by invoking this function repeatedly. +Parsing continues until the function returns an empty (or undefined) +result. When this happens $p->eof is automatically signaled. + +Parsing will also abort if one of the event handlers calls $p->eof. + +The effect of this is the same as: + + while (1) { + my $chunk = &$code_ref(); + if (!defined($chunk) || !length($chunk)) { + $p->eof; + return $p; + } + $p->parse($chunk) || return undef; + } + +But it is more efficient as this loop runs internally in XS code. + +=item $p->parse_file( $file ) + +Parse text directly from a file. The $file argument can be a +filename, an open file handle, or a reference to an open file +handle. + +If $file contains a filename and the file can't be opened, then the +method returns an undefined value and $! tells why it failed. +Otherwise the return value is a reference to the parser object. + +If a file handle is passed as the $file argument, then the file will +normally be read until EOF, but not closed. + +If an invoked event handler aborts parsing by calling $p->eof, +then $p->parse_file() may not have read the entire file. + +On systems with multi-byte line terminators, the values passed for the +offset and length argspecs may be too low if parse_file() is called on +a file handle that is not in binary mode. + +If a filename is passed in, then parse_file() will open the file in +binary mode. + +=item $p->eof + +Signals the end of the HTML document. Calling the $p->eof method +outside a handler callback will flush any remaining buffered text +(which triggers the C event if there is any remaining text). 
+ +Calling $p->eof inside a handler will terminate parsing at that point +and cause $p->parse to return a FALSE value. This also terminates +parsing by $p->parse_file(). + +After $p->eof has been called, the parse() and parse_file() methods +can be invoked to feed new documents with the parser object. + +The return value from eof() is a reference to the parser object. + +=back + + +Most parser options are controlled by boolean attributes. +Each boolean attribute is enabled by calling the corresponding method +with a TRUE argument and disabled with a FALSE argument. The +attribute value is left unchanged if no argument is given. The return +value from each method is the old attribute value. + +Methods that can be used to get and/or set parser options are: + +=over + +=item $p->attr_encoded + +=item $p->attr_encoded( $bool ) + +By default, the C and C<@attr> argspecs will have general +entities for attribute values decoded. Enabling this attribute leaves +entities alone. + +=item $p->boolean_attribute_value( $val ) + +This method sets the value reported for boolean attributes inside HTML +start tags. By default, the name of the attribute is also used as its +value. This affects the values reported for C and C +argspecs. + +=item $p->case_sensitive + +=item $p->case_sensitive( $bool ) + +By default, tagnames and attribute names are down-cased. Enabling this +attribute leaves them as found in the HTML source document. + +=item $p->closing_plaintext + +=item $p->closing_plaintext( $bool ) + +By default, "plaintext" element can never be closed. Everything up to +the end of the document is parsed in CDATA mode. This historical +behaviour is what at least MSIE does. Enabling this attribute makes +closing "" tag effective and the parsing process will resume +after seeing this tag. This emulates gecko-based browsers. 
+ +=item $p->empty_element_tags + +=item $p->empty_element_tags( $bool ) + +By default, empty element tags are not recognized as such and the "/" +before ">" is just treated like a normal name character (unless +C<strict_names> is enabled). Enabling this attribute makes +C<HTML::Parser> recognize these tags. + +Empty element tags look like start tags, but end with the character +sequence "/>" instead of ">". When recognized by C<HTML::Parser> they +cause an artificial end event in addition to the start event. The +C<text> for the artificial end event will be empty and the C<tokenpos> +array will be undefined even though the token array will have one +element containing the tag name. + +=item $p->marked_sections + +=item $p->marked_sections( $bool ) + +By default, section markings like <![CDATA[...]]> are treated like +ordinary text. When this attribute is enabled section markings are +honoured. + +There are currently no events associated with the marked section +markup, but the text can be returned as C<skipped_text>. + +=item $p->strict_comment + +=item $p->strict_comment( $bool ) + +By default, comments are terminated by the first occurrence of "-->". +This is the behaviour of most popular browsers (like Mozilla, Opera and +MSIE), but it is not correct according to the official HTML +standard. Officially, you need an even number of "--" tokens before +the closing ">" is recognized and there may not be anything but +whitespace between an even and an odd "--". + +The official behaviour is enabled by enabling this attribute. + +Enabling of 'strict_comment' also disables recognizing these forms as +comments: + + </ comment> + <! comment> + +=item $p->strict_end + +=item $p->strict_end( $bool ) + +By default, attributes and other junk are allowed to be present on end tags in a +manner that emulates MSIE's behaviour. + +The official behaviour is enabled with this attribute. If enabled, +only whitespace is allowed between the tagname and the final ">". 
+ +=item $p->strict_names + +=item $p->strict_names( $bool ) + +By default, almost anything is allowed in tag and attribute names. +This is the behaviour of most popular browsers and allows us to parse +some broken tags with invalid attribute values like: + + <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> + +By default, "LIST]" is parsed as a boolean attribute, not as +part of the ALT value as was clearly intended. This is also what +Mozilla sees. + +The official behaviour is enabled by enabling this attribute. If +enabled, it will cause the tag above to be reported as text +since "LIST]" is not a legal attribute name. + +=item $p->unbroken_text + +=item $p->unbroken_text( $bool ) + +By default, blocks of text are given to the text handler as soon as +possible (but the parser takes care always to break text at a +boundary between whitespace and non-whitespace so single words and +entities can always be decoded safely). This might create breaks that +make it hard to do transformations on the text. When this attribute is +enabled, blocks of text are always reported in one piece. This will +delay the text event until the following (non-text) event has been +recognized by the parser. + +Note that the C<offset> argspec will give you the offset of the first +segment of text and C<length> is the combined length of the segments. +Since there might be ignored tags in between, these numbers can't be +used to directly index in the original document file. + +=item $p->utf8_mode + +=item $p->utf8_mode( $bool ) + +Enable this option when parsing raw undecoded UTF-8. This tells the +parser that the entities expanded for strings reported by C<attr>, +C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end +up compatible with the surrounding text. + +If C<utf8_mode> is enabled then it is an error to pass strings +containing characters with code above 255 to the parse() method, and +the parse() method will croak if you try. + +Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 +encoded. 
The character can also be represented by the entity +"&hearts;" or "&#x2665;". If we feed the parser: + + $p->parse("\xE2\x99\xA5&hearts;"); + +then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without +C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. +The latter string is what you want. + +This option is only available with perl-5.8 or better. + +=item $p->xml_mode + +=item $p->xml_mode( $bool ) + +Enabling this attribute changes the parser to allow some XML +constructs. This enables the behaviour controlled individually by +the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and +C<xml_pic> attributes and also suppresses special treatment of +elements that are parsed as CDATA for HTML. + +=item $p->xml_pic + +=item $p->xml_pic( $bool ) + +By default, I<processing instructions> are terminated by ">". When +this attribute is enabled, processing instructions are terminated by +"?>" instead. + +=back + +As markup and text are recognized, handlers are invoked. The following +method is used to set up handlers for different events: + +=over + +=item $p->handler( event => \&subroutine, $argspec ) + +=item $p->handler( event => $method_name, $argspec ) + +=item $p->handler( event => \@accum, $argspec ) + +=item $p->handler( event => "" ); + +=item $p->handler( event => undef ); + +=item $p->handler( event ); + +This method assigns a subroutine, method, or array to handle an event. + +Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, +C<process>, C<start_document>, C<end_document> or C<default>. + +The C<\&subroutine> is a reference to a subroutine which is called to handle +the event. + +The C<$method_name> is the name of a method of $p which is called to handle +the event. + +The C<@accum> is an array that will hold the event information as +sub-arrays. + +If the second argument is "", the event is ignored. +If it is undef, the default handler is invoked for the event. + +The C<$argspec> is a string that describes the information to be reported +for the event. Any requested information that does not apply to a +specific event is passed as C<undef>. If argspec is omitted, then it +is left unchanged. 
+ +The return value from $p->handler is the old callback routine or a +reference to the accumulator array. + +Any return values from handler callback routines/methods are always +ignored. A handler callback can request parsing to be aborted by +invoking the $p->eof method. A handler callback is not allowed to +invoke the $p->parse() or $p->parse_file() method. An exception will +be raised if it tries. + +Examples: + + $p->handler(start => "start", 'self, attr, attrseq, text' ); + +This causes the "start" method of object $p to be called for 'start' events. +The callback signature is $p->start(\%attr, \@attr_seq, $text). + + $p->handler(start => \&start, 'attr, attrseq, text' ); + +This causes subroutine start() to be called for 'start' events. +The callback signature is start(\%attr, \@attr_seq, $text). + + $p->handler(start => \@accum, '"S", attr, attrseq, text' ); + +This causes 'start' event information to be saved in @accum. +The array elements will be ['S', \%attr, \@attr_seq, $text]. + + $p->handler(start => ""); + +This causes 'start' events to be ignored. It also suppresses +invocations of any default handler for start events. It is in most +cases equivalent to $p->handler(start => sub {}), but is more +efficient. It is different from the empty-sub-handler in that +C is not reset by it. + + $p->handler(start => undef); + +This causes no handler to be associated with start events. +If there is a default handler it will be invoked. + +=back + +Filters based on tags can be set up to limit the number of events +reported. The main bottleneck during parsing is often the huge number +of callbacks made from the parser. Applying filters can improve +performance significantly. + +The following methods control filters: + +=over + +=item $p->ignore_elements( @tags ) + +Both the C event and the C event as well as any events that +would be reported in between are suppressed. The ignored elements can +contain nested occurrences of itself. 
Example: + + $p->ignore_elements(qw(script style)); + +The C