git.maemo.org Git - pkg-perl/blob - deb-src/libhtml-parser-perl/libhtml-parser-perl-3.56/lib/HTML/LinkExtor.pm

   1 package HTML::LinkExtor;
   2
   3 # $Id: LinkExtor.pm,v 1.33 2003/10/10 10:20:56 gisle Exp $
   4
   5 require HTML::Parser;
   6 @ISA = qw(HTML::Parser);
   7 $VERSION = sprintf("%d.%02d", q$Revision: 1.33 $ =~ /(\d+)\.(\d+)/);
   8
   9 =head1 NAME
  10
  11 HTML::LinkExtor - Extract links from an HTML document
  12
  13 =head1 SYNOPSIS
  14
  15  require HTML::LinkExtor;
  16  $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
  17  sub cb {
  18      my($tag, %links) = @_;
  19      print "$tag @{[%links]}\n";
  20  }
  21  $p->parse_file("index.html");
  22
  23 =head1 DESCRIPTION
  24
  25 I<HTML::LinkExtor> is an HTML parser that extracts links from an
  26 HTML document.  The I<HTML::LinkExtor> is a subclass of
  27 I<HTML::Parser>. This means that the document should be given to the
  28 parser by calling the $p->parse() or $p->parse_file() methods.
  29
  30 =cut
  31
  32 use strict;
  33 use HTML::Tagset ();
  34
  35 # legacy (some applications grabs this hash directly)
  36 use vars qw(%LINK_ELEMENT);
  37 *LINK_ELEMENT = \%HTML::Tagset::linkElements;
  38
  39 =over 4
  40
  41 =item $p = HTML::LinkExtor->new
  42
  43 =item $p = HTML::LinkExtor->new( $callback )
  44
  45 =item $p = HTML::LinkExtor->new( $callback, $base )
  46
  47 The constructor takes two optional arguments. The first is a reference
  48 to a callback routine. It will be called as links are found. If a
  49 callback is not provided, then links are just accumulated internally
  50 and can be retrieved by calling the $p->links() method.
  51
  52 The $base argument is an optional base URL used to absolutize all URLs found.
  53 You need to have the I<URI> module installed if you provide $base.
  54
  55 The callback is called with the lowercase tag name as first argument,
  56 and then all link attributes as separate key/value pairs.  All
  57 non-link attributes are removed.
  58
  59 =cut
  60
  61 sub new
  62 {
  63     my($class, $cb, $base) = @_;
  64     my $self = $class->SUPER::new(
  65                     start_h => ["_start_tag", "self,tagname,attr"],
  66                     report_tags => [keys %HTML::Tagset::linkElements],
  67                );
  68     $self->{extractlink_cb} = $cb;
  69     if ($base) {
  70         require URI;
  71         $self->{extractlink_base} = URI->new($base);
  72     }
  73     $self;
  74 }
  75
  76 sub _start_tag
  77 {
  78     my($self, $tag, $attr) = @_;
  79
  80     my $base = $self->{extractlink_base};
  81     my $links = $HTML::Tagset::linkElements{$tag};
  82     $links = [$links] unless ref $links;
  83
  84     my @links;
  85     my $a;
  86     for $a (@$links) {
  87         next unless exists $attr->{$a};
  88         push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
  89                                : $attr->{$a});
  90     }
  91     return unless @links;
  92     $self->_found_link($tag, @links);
  93 }
  94
  95 sub _found_link
  96 {
  97     my $self = shift;
  98     my $cb = $self->{extractlink_cb};
  99     if ($cb) {
 100         &$cb(@_);
 101     } else {
 102         push(@{$self->{'links'}}, [@_]);
 103     }
 104 }
 105
 106 =item $p->links
 107
 108 Returns a list of all links found in the document.  The returned
 109 values will be anonymous arrays with the follwing elements:
 110
 111   [$tag, $attr => $url1, $attr2 => $url2,...]
 112
 113 The $p->links method will also truncate the internal link list.  This
 114 means that if the method is called twice without any parsing
 115 between them the second call will return an empty list.
 116
 117 Also note that $p->links will always be empty if a callback routine
 118 was provided when the I<HTML::LinkExtor> was created.
 119
 120 =cut
 121
 122 sub links
 123 {
 124     my $self = shift;
 125     exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
 126 }
 127
 128 # We override the parse_file() method so that we can clear the links
 129 # before we start a new file.
 130 sub parse_file
 131 {
 132     my $self = shift;
 133     delete $self->{'links'};
 134     $self->SUPER::parse_file(@_);
 135 }
 136
 137 =back
 138
 139 =head1 EXAMPLE
 140
 141 This is an example showing how you can extract links from a document
 142 received using LWP:
 143
 144   use LWP::UserAgent;
 145   use HTML::LinkExtor;
 146   use URI::URL;
 147
 148   $url = "http://www.perl.org/";  # for instance
 149   $ua = LWP::UserAgent->new;
 150
 151   # Set up a callback that collect image links
 152   my @imgs = ();
 153   sub callback {
 154      my($tag, %attr) = @_;
 155      return if $tag ne 'img';  # we only look closer at <img ...>
 156      push(@imgs, values %attr);
 157   }
 158
 159   # Make the parser.  Unfortunately, we don't know the base yet
 160   # (it might be diffent from $url)
 161   $p = HTML::LinkExtor->new(\&callback);
 162
 163   # Request document and parse it as it arrives
 164   $res = $ua->request(HTTP::Request->new(GET => $url),
 165                       sub {$p->parse($_[0])});
 166
 167   # Expand all image URLs to absolute ones
 168   my $base = $res->base;
 169   @imgs = map { $_ = url($_, $base)->abs; } @imgs;
 170
 171   # Print them out
 172   print join("\n", @imgs), "\n";
 173
 174 =head1 SEE ALSO
 175
 176 L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
 177
 178 =head1 COPYRIGHT
 179
 180 Copyright 1996-2001 Gisle Aas.
 181
 182 This library is free software; you can redistribute it and/or
 183 modify it under the same terms as Perl itself.
 184
 185 =cut
 186
 187 1;