git.maemo.org Git - pkg-perl/blob - deb-src/libhtml-parser-perl/libhtml-parser-perl-3.56/lib/HTML/PullParser.pm

   1 package HTML::PullParser;
   2
   3 # $Id: PullParser.pm,v 2.9 2006/04/26 08:00:28 gisle Exp $
   4
   5 require HTML::Parser;
   6 @ISA=qw(HTML::Parser);
   7 $VERSION = sprintf("%d.%02d", q$Revision: 2.9 $ =~ /(\d+)\.(\d+)/);
   8
   9 use strict;
  10 use Carp ();
  11
  12 sub new
  13 {
  14     my($class, %cnf) = @_;
  15
  16     # Construct argspecs for the various events
  17     my %argspec;
  18     for (qw(start end text declaration comment process default)) {
  19         my $tmp = delete $cnf{$_};
  20         next unless defined $tmp;
  21         $argspec{$_} = $tmp;
  22     }
  23     Carp::croak("Info not collected for any events")
  24           unless %argspec;
  25
  26     my $file = delete $cnf{file};
  27     my $doc  = delete $cnf{doc};
  28     Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
  29           if defined($file) && defined($doc);
  30     Carp::croak("No 'doc' or 'file' given to parse from")
  31           unless defined($file) || defined($doc);
  32
  33     # Create object
  34     $cnf{api_version} = 3;
  35     my $self = $class->SUPER::new(%cnf);
  36
  37     my $accum = $self->{pullparser_accum} = [];
  38     while (my($event, $argspec) = each %argspec) {
  39         $self->SUPER::handler($event => $accum, $argspec);
  40     }
  41
  42     if (defined $doc) {
  43         $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
  44         $self->{pullparser_str_pos} = 0;
  45     }
  46     else {
  47         if (!ref($file) && ref(\$file) ne "GLOB") {
  48             require IO::File;
  49             $file = IO::File->new($file, "r") || return;
  50         }
  51
  52         $self->{pullparser_file} = $file;
  53     }
  54     $self;
  55 }
  56
  57
  58 sub handler
  59 {
  60     Carp::croak("Can't set handlers for HTML::PullParser");
  61 }
  62
  63
  64 sub get_token
  65 {
  66     my $self = shift;
  67     while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
  68         if (my $f = $self->{pullparser_file}) {
  69             # must try to parse more from the file
  70             my $buf;
  71             if (read($f, $buf, 512)) {
  72                 $self->parse($buf);
  73             } else {
  74                 $self->eof;
  75                 $self->{pullparser_eof}++;
  76                 delete $self->{pullparser_file};
  77             }
  78         }
  79         elsif (my $sref = $self->{pullparser_str_ref}) {
  80             # must try to parse more from the scalar
  81             my $pos = $self->{pullparser_str_pos};
  82             my $chunk = substr($$sref, $pos, 512);
  83             $self->parse($chunk);
  84             $pos += length($chunk);
  85             if ($pos < length($$sref)) {
  86                 $self->{pullparser_str_pos} = $pos;
  87             }
  88             else {
  89                 $self->eof;
  90                 $self->{pullparser_eof}++;
  91                 delete $self->{pullparser_str_ref};
  92                 delete $self->{pullparser_str_pos};
  93             }
  94         }
  95         else {
  96             die;
  97         }
  98     }
  99     shift @{$self->{pullparser_accum}};
 100 }
 101
 102
 103 sub unget_token
 104 {
 105     my $self = shift;
 106     unshift @{$self->{pullparser_accum}}, @_;
 107     $self;
 108 }
 109
 110 1;
 111
 112
 113 __END__
 114
 115 =head1 NAME
 116
 117 HTML::PullParser - Alternative HTML::Parser interface
 118
 119 =head1 SYNOPSIS
 120
 121  use HTML::PullParser;
 122
 123  $p = HTML::PullParser->new(file => "index.html",
 124                             start => 'event, tagname, @attr',
 125                             end   => 'event, tagname',
 126                             ignore_elements => [qw(script style)],
 127                            ) || die "Can't open: $!";
 128  while (my $token = $p->get_token) {
 129      #...do something with $token
 130  }
 131
 132 =head1 DESCRIPTION
 133
 134 The HTML::PullParser is an alternative interface to the HTML::Parser class.
 135 It basically turns the HTML::Parser inside out.  You associate a file
 136 (or any IO::Handle object or string) with the parser at construction time and
 137 then repeatedly call $parser->get_token to obtain the tags and text
 138 found in the parsed document.
 139
 140 The following methods are provided:
 141
 142 =over 4
 143
 144 =item $p = HTML::PullParser->new( file => $file, %options )
 145
 146 =item $p = HTML::PullParser->new( doc => \$doc, %options )
 147
 148 A C<HTML::PullParser> can be made to parse from either a file or a
 149 literal document based on whether the C<file> or C<doc> option is
 150 passed to the parser's constructor.
 151
 152 The C<file> passed in can either be a file name or a file handle
 153 object.  If a file name is passed, and it can't be opened for reading,
 154 then the constructor will return an undefined value and $!  will tell
 155 you why it failed.  Otherwise the argument is taken to be some object
 156 that the C<HTML::PullParser> can read() from when it needs more data.
 157 The stream will be read() until EOF, but not closed.
 158
 159 A C<doc> can be passed plain or as a reference
 160 to a scalar.  If a reference is passed then the value of this scalar
 161 should not be changed before all tokens have been extracted.
 162
Next the information to be returned for the different token types must
be set up.  This is done by simply associating an argspec (as defined
in L<HTML::Parser>) with the events you have an interest in.  For
instance, if you want C<start> tokens to be reported as the string
C<'S'> followed by the tagname and the attributes you might pass a
C<start> option like this:
 169
 170    $p = HTML::PullParser->new(
 171           doc   => $document_to_parse,
 172           start => '"S", tagname, @attr',
 173           end   => '"E", tagname',
 174         );
 175
Finally, other C<HTML::Parser> options, such as C<ignore_tags> and
C<unbroken_text>, can be passed in.  Note that you should not use the
I<event>_h options to set up parser handlers.  That would confuse the
inner logic of C<HTML::PullParser>.
 180
 181 =item $token = $p->get_token
 182
This method will return the next I<token> found in the HTML document,
or C<undef> at the end of the document.  The token is returned as an
array reference.  The contents of this array match the argspec set up
during C<HTML::PullParser> construction.
 187
 188 =item $p->unget_token( @tokens )
 189
 190 If you find out you have read too many tokens you can push them back,
 191 so that they are returned again the next time $p->get_token is called.
 192
 193 =back
 194
 195 =head1 EXAMPLES
 196
The 'eg/hform' script shows how we might parse the form section of
HTML documents using HTML::PullParser.
 199
 200 =head1 SEE ALSO
 201
 202 L<HTML::Parser>, L<HTML::TokeParser>
 203
 204 =head1 COPYRIGHT
 205
 206 Copyright 1998-2001 Gisle Aas.
 207
 208 This library is free software; you can redistribute it and/or
 209 modify it under the same terms as Perl itself.
 210
 211 =cut