1 package HTML::PullParser;
3 # $Id: PullParser.pm,v 2.9 2006/04/26 08:00:28 gisle Exp $
7 $VERSION = sprintf("%d.%02d", q$Revision: 2.9 $ =~ /(\d+)\.(\d+)/);
14 my($class, %cnf) = @_;
16 # Construct argspecs for the various events
18 for (qw(start end text declaration comment process default)) {
19 my $tmp = delete $cnf{$_};
20 next unless defined $tmp;
23 Carp::croak("Info not collected for any events")
26 my $file = delete $cnf{file};
27 my $doc = delete $cnf{doc};
28 Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
29 if defined($file) && defined($doc);
30 Carp::croak("No 'doc' or 'file' given to parse from")
31 unless defined($file) || defined($doc);
34 $cnf{api_version} = 3;
35 my $self = $class->SUPER::new(%cnf);
37 my $accum = $self->{pullparser_accum} = [];
38 while (my($event, $argspec) = each %argspec) {
39 $self->SUPER::handler($event => $accum, $argspec);
43 $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
44 $self->{pullparser_str_pos} = 0;
47 if (!ref($file) && ref(\$file) ne "GLOB") {
49 $file = IO::File->new($file, "r") || return;
52 $self->{pullparser_file} = $file;
60 Carp::croak("Can't set handlers for HTML::PullParser");
67 while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
68 if (my $f = $self->{pullparser_file}) {
69 # must try to parse more from the file
71 if (read($f, $buf, 512)) {
75 $self->{pullparser_eof}++;
76 delete $self->{pullparser_file};
79 elsif (my $sref = $self->{pullparser_str_ref}) {
80 # must try to parse more from the scalar
81 my $pos = $self->{pullparser_str_pos};
82 my $chunk = substr($$sref, $pos, 512);
84 $pos += length($chunk);
85 if ($pos < length($$sref)) {
86 $self->{pullparser_str_pos} = $pos;
90 $self->{pullparser_eof}++;
91 delete $self->{pullparser_str_ref};
92 delete $self->{pullparser_str_pos};
99 shift @{$self->{pullparser_accum}};
106 unshift @{$self->{pullparser_accum}}, @_;
117 HTML::PullParser - Alternative HTML::Parser interface
121 use HTML::PullParser;
123 $p = HTML::PullParser->new(file => "index.html",
124 start => 'event, tagname, @attr',
125 end => 'event, tagname',
126 ignore_elements => [qw(script style)],
127 ) || die "Can't open: $!";
128 while (my $token = $p->get_token) {
129 #...do something with $token
134 The HTML::PullParser is an alternative interface to the HTML::Parser class.
135 It basically turns the HTML::Parser inside out. You associate a file
136 (or any IO::Handle object or string) with the parser at construction time and
137 then repeatedly call $parser->get_token to obtain the tags and text
138 found in the parsed document.
140 The following methods are provided:
144 =item $p = HTML::PullParser->new( file => $file, %options )
146 =item $p = HTML::PullParser->new( doc => \$doc, %options )
148 A C<HTML::PullParser> can be made to parse from either a file or a
149 literal document based on whether the C<file> or C<doc> option is
150 passed to the parser's constructor.
152 The C<file> passed in can either be a file name or a file handle
153 object. If a file name is passed, and it can't be opened for reading,
154 then the constructor will return an undefined value and $! will tell
155 you why it failed. Otherwise the argument is taken to be some object
156 that the C<HTML::PullParser> can read() from when it needs more data.
157 The stream will be read() until EOF, but not closed.
159 The C<doc> can be passed either as a plain string or as a reference
160 to a scalar. If a reference is passed then the value of this scalar
161 should not be changed before all tokens have been extracted.
163 Next the information to be returned for the different token types must
164 be set up. This is done by simply associating an argspec (as defined
165 in L<HTML::Parser>) with the events you have an interest in. For
166 instance, if you want C<start> tokens to be reported as the string
167 C<'S'> followed by the tagname and the attributes, you might pass a
168 C<start> option like this:
170 $p = HTML::PullParser->new(
171 doc => $document_to_parse,
172 start => '"S", tagname, @attr',
173 end => '"E", tagname',
176 Finally, other C<HTML::Parser> options, like C<ignore_tags> and
177 C<unbroken_text>, can be passed in. Note that you should not use the
178 I<event>_h options to set up parser handlers. That would confuse the
179 inner logic of C<HTML::PullParser>.
181 =item $token = $p->get_token
183 This method will return the next I<token> found in the HTML document,
184 or C<undef> at the end of the document. The token is returned as an
185 array reference. The contents of this array match the argspec set up
186 during C<HTML::PullParser> construction.
188 =item $p->unget_token( @tokens )
190 If you find out you have read too many tokens you can push them back,
191 so that they are returned again the next time $p->get_token is called.
197 The 'eg/hform' script shows how we might parse the form section of
198 HTML documents using HTML::PullParser.
202 L<HTML::Parser>, L<HTML::TokeParser>
206 Copyright 1998-2001 Gisle Aas.
208 This library is free software; you can redistribute it and/or
209 modify it under the same terms as Perl itself.