git.maemo.org Git - dh-make-perl/blob - dev/arm/libwww-mechanize-perl/libwww-mechanize-perl-1.34/debian/libwww-mechanize-perl/usr/share/perl5/WWW/Mechanize/Examples.pod

   1 =head1 NAME
   2
   3 WWW::Mechanize::Examples - Sample programs that use WWW::Mechanize
   4
   5 =head1 SYNOPSIS
   6
   7 Plenty of people have learned WWW::Mechanize, and now, you can too!
   8
   9 Following are user-supplied samples of WWW::Mechanize in action.
  10 If you have samples you'd like to contribute, please send 'em to
  11 C<< <andy@petdance.com> >>.
  12
  13 You can also look at the F<t/*.t> files in the distribution.
  14
  15 Please note that these examples are not intended to do any specific task.
  16 For all I know, they're no longer functional because the sites they
  17 hit have changed.  They're here to give examples of how people have
  18 used WWW::Mechanize.
  19
  20 Note that the examples are in reverse order of my having received them,
  21 so the freshest examples are always at the top.
  22
  23 =head2 Starbucks Density Calculator, by Nat Torkington
  24
  25 Here's a pair of scripts from Nat Torkington, editor for O'Reilly Media
  26 and co-author of the I<Perl Cookbook>.
  27
  28 =over 4
  29
  30 Rael [Dornfest] discovered that you can easily find out how many Starbucks
  31 there are in an area by searching for "Starbucks".  So I wrote a silly
  32 scraper for some old census data and came up with some Starbucks density
  33 figures.  There's no meaning to these numbers thanks to errors from using
  34 old census data coupled with false positives in Yahoo search (e.g.,
  35 "Dodie Starbuck-Your Style Desgn" in Portland OR).  But it was fun to
  36 waste a night on.
  37
  38 Here are the top twenty cities in descending order of population,
  39 with the amount of territory each Starbucks has.  E.g., A New York NY
  40 Starbucks covers 1.7 square miles of ground.
  41
  42     New York, NY        1.7
  43     Los Angeles, CA     1.2
  44     Chicago, IL         1.0
  45     Houston, TX         4.6
  46     Philadelphia, PA    6.8
  47     San Diego, CA       2.7
  48     Detroit, MI        19.9
  49     Dallas, TX          2.7
  50     Phoenix, AZ         4.1
  51     San Antonio, TX    12.3
  52     San Jose, CA        1.1
  53     Baltimore, MD       3.9
  54     Indianapolis, IN   12.1
  55     San Francisco, CA   0.5
  56     Jacksonville, FL   39.9
  57     Columbus, OH        7.3
  58     Milwaukee, WI       5.1
  59     Memphis, TN        15.1
  60     Washington, DC      1.4
  61     Boston, MA          0.5
  62
  63 =back
  64
  65 C<get_pop_data>
  66
  67     #!/usr/bin/perl -w
  68
  69     use WWW::Mechanize;
  70     use Storable;
  71
  72     $url = 'http://www.census.gov/population/www/documentation/twps0027.html';
  73     $m = WWW::Mechanize->new();
  74     $m->get($url);
  75
  76     $c = $m->content;
  77
  78     $c =~ m{<A NAME=.tabA.>(.*?)</TABLE>}s
  79       or die "Can't find the population table\n";
  80     $t = $1;
  81     @outer = $t =~ m{<TR.*?>(.*?)</TR>}gs;
  82     shift @outer;
  83     foreach $r (@outer) {
  84       @bits = $r =~ m{<TD.*?>(.*?)</TD>}gs;
  85       for ($x = 0; $x < @bits; $x++) {
  86         $b = $bits[$x];
  87         @v = split /\s*<BR>\s*/, $b;
  88         foreach (@v) { s/^\s+//; s/\s+$// }
  89         push @{$data[$x]}, @v;
  90       }
  91     }
  92
  93     for ($y = 0; $y < @{$data[0]}; $y++) {
  94         $data{$data[1][$y]} = {
  95             NAME => $data[1][$y],
  96             RANK => $data[0][$y],
  97             POP  => comma_free($data[2][$y]),
  98             AREA => comma_free($data[3][$y]),
  99             DENS => comma_free($data[4][$y]),
 100         };
 101     }
 102
 103     store(\%data, "cities.dat");
 104
 105     sub comma_free {
 106       my $n = shift;
 107       $n =~ s/,//;
 108       return $n;
 109     }
 110
 111
 112 C<plague_of_coffee>
 113
 114     #!/usr/bin/perl -w
 115
 116     use WWW::Mechanize;
 117     use strict;
 118     use Storable;
 119
 120     $SIG{__WARN__} = sub {} ;  # ssssssh
 121
 122     my $Cities = retrieve("cities.dat");
 123
 124     my $m = WWW::Mechanize->new();
 125     $m->get("http://local.yahoo.com/");
 126
 127     my @cities = sort { $Cities->{$a}{RANK} <=> $Cities->{$b}{RANK} } keys %$Cities;
 128     foreach my $c ( @cities ) {
 129       my $fields = {
 130         'stx' => "starbucks",
 131         'csz' => $c,
 132       };
 133
 134       my $r = $m->submit_form(form_number => 2,
 135                               fields => $fields);
 136       die "Couldn't submit form" unless $r->is_success;
 137
 138       my $hits = number_of_hits($r);
 139       #  my $ppl  = sprintf("%d", 1000 * $Cities->{$c}{POP} / $hits);
 140       #  print "$c has $hits Starbucks.  That's one for every $ppl people.\n";
 141       my $density = sprintf("%.1f", $Cities->{$c}{AREA} / $hits);
 142       print "$c : $density\n";
 143     }
 144
 145     sub number_of_hits {
 146       my $r = shift;
 147       my $c = $r->content;
 148       if ($c =~ m{\d+ out of <b>(\d+)</b> total results for}) {
 149         return $1;
 150       }
 151       if ($c =~ m{Sorry, no .*? found in or near}) {
 152         return 0;
 153       }
 154       if ($c =~ m{Your search matched multiple cities}) {
 155         warn "Your search matched multiple cities\n";
 156         return 0;
 157       }
 158       if ($c =~ m{Sorry we couldn.t find that location}) {
 159         warn "No cities\n";
 160         return 0;
 161       }
 162       if ($c =~ m{Could not find.*?, showing results for}) {
 163         warn "No matches\n";
 164         return 0;
 165       }
 166       die "Unknown response\n$c\n";
 167     }
 168
 169
 170
 171 =head2 pb-upload, by John Beppu
 172
 173 This script takes filenames of images from the command line and
 174 uploads them to a www.photobucket.com folder.  John Beppu, the author, says:
 175
 176 =over 4
 177
 178 I had 92 pictures I wanted to upload, and doing it through a browser
 179 would've been torture.  But thanks to mech, all I had to do was
 180 `./pb.upload *.jpg` and watch it do its thing.  It felt good.
 181 If I had more time, I'd implement WWW::Photobucket on top of
 182 WWW::Mechanize.
 183
 184 =back
 185
 186     #!/usr/bin/perl -w -T
 187
 188     use strict;
 189     use WWW::Mechanize;
 190
 191     my $login    = "login_name";
 192     my $password = "password";
 193     my $folder   = "folder";
 194
 195     my $url = "http://img78.photobucket.com/albums/v281/$login/$folder/";
 196
 197     # login to your photobucket.com account
 198     my $mech = WWW::Mechanize->new();
 199     $mech->get($url);
 200     $mech->submit_form(
 201         form_number => 1,
 202         fields      => { password => $password },
 203     );
 204     die unless ($mech->success);
 205
 206     # upload image files specified on command line
 207     foreach (@ARGV) {
 208         print "$_\n";
 209         $mech->form_number(2);
 210         $mech->field('the_file[]' => $_);
 211         $mech->submit();
 212     }
 213
 214 =head2 listmod, by Ian Langworth
 215
 216 Ian Langworth contributes this little gem that will bring joy to
 217 beleagured mailing list admins.  It discards spam messages through
 218 mailman's web interface.
 219
 220
 221     #!/arch/unix/bin/perl
 222     use strict;
 223     use warnings;
 224     #
 225     # listmod - fast alternative to mailman list interface
 226     #
 227     # usage: listmod crew XXXXXXXX
 228     #
 229
 230     die "usage: $0 <listname> <password>\n" unless @ARGV == 2;
 231     my ($listname, $password) = @ARGV;
 232
 233     use CGI qw(unescape);
 234
 235     use WWW::Mechanize;
 236     my $m = WWW::Mechanize->new( autocheck => 1 );
 237
 238     use Term::ReadLine;
 239     my $term = Term::ReadLine->new($0);
 240
 241     # submit the form, get the cookie, go to the list admin page
 242     $m->get("https://lists.ccs.neu.edu/bin/admindb/$listname");
 243     $m->set_visible( $password );
 244     $m->click;
 245
 246     # exit if nothing to do
 247     print "There are no pending requests.\n" and exit
 248         if $m->content =~ /There are no pending requests/;
 249
 250     # select the first form and examine its contents
 251     $m->form_number(1);
 252     my $f = $m->current_form or die "Couldn't get first form!\n";
 253
 254     # get me the base form element for each email item
 255     my @items = map {m/^.+?-(.+)/} grep {m/senderbanp/} $f->param
 256         or die "Couldn't get items in first form!\n";
 257
 258     # iterate through items, prompt user, commit actions
 259     foreach my $item (@items) {
 260
 261         # show item info
 262         my $sender = unescape($item);
 263         my ($subject) = [$f->find_input("senderbanp-$item")->value_names]->[1]
 264             =~ /Subject:\s+(.+?)\s+Size:/g;
 265
 266         # prompt user
 267         my $choice = '';
 268         while ( $choice !~ /^[DAX]$/ ) {
 269             print "$sender\: '$subject'\n";
 270             $choice = uc $term->readline("Action: defer/accept/discard [dax]: ");
 271             print "\n\n";
 272         }
 273
 274         # set button
 275         $m->field("senderaction-$item" => {D=>0,A=>1,X=>3}->{$choice});
 276     }
 277
 278     # submit actions
 279     $m->click;
 280
 281 =head2 ccdl, by Andy Lester
 282
 283 Steve McConnell, author of the landmark I<Code Complete> has put
 284 up the chapters for the 2nd edition in PDF format on his website.
 285 I needed to download them to take to Kinko's to have printed.  This
 286 little script did it for me.
 287
 288
 289     #!/usr/bin/perl -w
 290
 291     use strict;
 292     use WWW::Mechanize;
 293
 294     my $start = "http://www.stevemcconnell.com/cc2/cc.htm";
 295
 296     my $mech = WWW::Mechanize->new( autocheck => 1 );
 297     $mech->get( $start );
 298
 299     my @links = $mech->find_all_links( url_regex => qr/\d+.+\.pdf$/ );
 300
 301     for my $link ( @links ) {
 302         my $url = $link->url_abs;
 303         my $filename = $url;
 304         $filename =~ s[^.+/][];
 305
 306         print "Fetching $url";
 307         $mech->get( $url, ':content_file' => $filename );
 308
 309         print "   ", -s $filename, " bytes\n";
 310     }
 311
 312 =head2 quotes.pl, by Andy Lester
 313
 314 This was a script that was going to get a hack in I<Spidering Hacks>,
 315 but got cut at the last minute, probably because it's against IMDB's TOS
 316 to scrape from it.  I present it here as an example, not a suggestion
 317 that you break their TOS.
 318
 319 Last I checked, it didn't work because their HTML didn't match, but it's
 320 still good as sample code.
 321
 322     #!/usr/bin/perl -w
 323
 324     use strict;
 325
 326     use WWW::Mechanize;
 327     use Getopt::Long;
 328     use Text::Wrap;
 329
 330     my $match = undef;
 331     my $random = undef;
 332     GetOptions(
 333         "match=s" => \$match,
 334         "random" => \$random,
 335     ) or exit 1;
 336
 337     my $movie = shift @ARGV or die "Must specify a movie\n";
 338
 339     my $quotes_page = get_quotes_page( $movie );
 340     my @quotes = extract_quotes( $quotes_page );
 341
 342     if ( $match ) {
 343         $match = quotemeta($match);
 344         @quotes = grep /$match/i, @quotes;
 345     }
 346
 347     if ( $random ) {
 348         print $quotes[rand @quotes];
 349     } else {
 350         print join( "\n", @quotes );
 351     }
 352
 353
 354     sub get_quotes_page {
 355         my $movie = shift;
 356
 357         my $mech = new WWW::Mechanize;
 358         $mech->get( "http://www.imdb.com/search" );
 359         $mech->success or die "Can't get the search page";
 360
 361         $mech->submit_form(
 362         form_number => 2,
 363         fields => {
 364             title       => $movie,
 365             restrict    => "Movies only",
 366         },
 367         );
 368
 369         my @links = $mech->find_all_links( url_regex => qr[^/Title] )
 370         or die "No matches for \"$movie\" were found.\n";
 371
 372         # Use the first link
 373         my ( $url, $title ) = @{$links[0]};
 374
 375         warn "Checking $title...\n";
 376
 377         $mech->get( $url );
 378         my $link = $mech->find_link( text_regex => qr/Memorable Quotes/i )
 379         or die qq{"$title" has no quotes in IMDB!\n};
 380
 381         warn "Fetching quotes...\n\n";
 382         $mech->get( $link->[0] );
 383
 384         return $mech->content;
 385     }
 386
 387
 388     sub extract_quotes {
 389         my $page = shift;
 390
 391         # Nibble away at the unwanted HTML at the beginnning...
 392         $page =~ s/.+Memorable Quotes//si;
 393         $page =~ s/.+?(<a name)/$1/si;
 394
 395         # ... and the end of the page
 396         $page =~ s/Browse titles in the movie quotes.+$//si;
 397         $page =~ s/<p.+$//g;
 398
 399         # Quotes separated by an <HR> tag
 400         my @quotes = split( /<hr.+?>/, $page );
 401
 402         for my $quote ( @quotes ) {
 403         my @lines = split( /<br>/, $quote );
 404         for ( @lines ) {
 405             s/<[^>]+>//g;   # Strip HTML tags
 406             s/\s+/ /g;      # Squash whitespace
 407             s/^ //;         # Strip leading space
 408             s/ $//;         # Strip trailing space
 409             s/&#34;/"/g;    # Replace HTML entity quotes
 410
 411             # Word-wrap to fit in 72 columns
 412             $Text::Wrap::columns = 72;
 413             $_ = wrap( '', '    ', $_ );
 414         }
 415         $quote = join( "\n", @lines );
 416         }
 417
 418         return @quotes;
 419     }
 420
 421 =head2 cpansearch.pl, by Ed Silva
 422
 423 A quick little utility to search the CPAN and fire up a browser
 424 with a results page.
 425
 426     #!/usr/bin/perl
 427
 428     # turn on perl's safety features
 429     use strict;
 430     use warnings;
 431
 432     # work out the name of the module we're looking for
 433     my $module_name = $ARGV[0]
 434       or die "Must specify module name on command line";
 435
 436     # create a new browser
 437     use WWW::Mechanize;
 438     my $browser = WWW::Mechanize->new();
 439
 440     # tell it to get the main page
 441     $browser->get("http://search.cpan.org/");
 442
 443     # okay, fill in the box with the name of the
 444     # module we want to look up
 445     $browser->form_number(1);
 446     $browser->field("query", $module_name);
 447     $browser->click();
 448
 449     # click on the link that matches the module name
 450     $browser->follow_link( text_regex => $module_name );
 451
 452     my $url = $browser->uri;
 453
 454     # launch a browser...
 455     system('galeon', $url);
 456
 457     exit(0);
 458
 459
 460 =head2 lj_friends.cgi, by Matt Cashner
 461
 462     #!/usr/bin/perl
 463
 464     # Provides an rss feed of a paid user's LiveJournal friends list
 465     # Full entries, protected entries, etc.
 466     # Add to your favorite rss reader as
 467     # http://your.site.com/cgi-bin/lj_friends.cgi?user=USER&password=PASSWORD
 468
 469     use warnings;
 470     use strict;
 471
 472     use WWW::Mechanize;
 473     use CGI;
 474
 475     my $cgi = CGI->new();
 476     my $form = $cgi->Vars;
 477
 478     my $agent = WWW::Mechanize->new();
 479
 480     $agent->get('http://www.livejournal.com/login.bml');
 481     $agent->form_number('3');
 482     $agent->field('user',$form->{user});
 483     $agent->field('password',$form->{password});
 484     $agent->submit();
 485     $agent->get('http://www.livejournal.com/customview.cgi?user='.$form->{user}.'&styleid=225596&checkcookies=1');
 486     print "Content-type: text/plain\n\n";
 487     print $agent->content();
 488
 489 =head2 Hacking Movable Type, by Dan Rinzel
 490
 491     use WWW::Mechanize;
 492
 493     # a tool to automatically post entries to a moveable type weblog, and set arbitary creation dates
 494
 495     my $mech = WWW::Mechanize->new();
 496     my %entry;
 497     $entry->{title} = "Test AutoEntry Title";
 498     $entry->{btext} = "Test AutoEntry Body";
 499     $entry->{date} = '2002-04-15 14:18:00';
 500     my $start = qq|http://my.blog.site/mt.cgi|;
 501
 502     $mech->get($start);
 503     $mech->field('username','und3f1n3d');
 504     $mech->field('password','obscur3d');
 505     $mech->submit(); # to get login cookie
 506     $mech->get(qq|$start?__mode=view&_type=entry&blog_id=1|);
 507     $mech->form_name('entry_form');
 508     $mech->field('title',$entry->{title});
 509     $mech->field('category_id',1); # adjust as needed
 510     $mech->field('text',$entry->{btext});
 511     $mech->field('status',2); # publish, or 1 = draft
 512     $results = $mech->submit();
 513
 514     # if we're ok with this entry being datestamped "NOW" (no {date} in %entry)
 515     # we're done. Otherwise, time to be tricksy
 516     # MT returns a 302 redirect from this form. the redirect itself contains a <body onload=""> handler
 517     # which takes the user to an editable version of the form where the create date can be edited
 518     # MT date format of YYYY-MM-DD HH:MI:SS is the only one that won't error out
 519
 520     if ($entry->{date} && $entry->{date} =~ /^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}/) {
 521         # travel the redirect
 522         $results = $mech->get($results->{_headers}->{location});
 523         $results->{_content} =~ /<body onLoad="([^\"]+)"/is;
 524         my $js = $1;
 525         $js =~ /\'([^']+)\'/;
 526         $results = $mech->get($start.$1);
 527         $mech->form_name('entry_form');
 528         $mech->field('created_on_manual',$entry->{date});
 529         $mech->submit();
 530     }
 531
 532 =head2 get-despair, by Randal Schwartz
 533
 534 Randal submitted this bot that walks the despair.com site sucking down
 535 all the pictures.
 536
 537     use strict;
 538     $|++;
 539
 540     use WWW::Mechanize;
 541     use File::Basename;
 542
 543     my $m = WWW::Mechanize->new;
 544
 545     $m->get("http://www.despair.com/indem.html");
 546
 547     my @top_links = @{$m->links};
 548
 549     for my $top_link_num (0..$#top_links) {
 550         next unless $top_links[$top_link_num][0] =~ /^http:/;
 551
 552         $m->follow_link( n=>$top_link_num ) or die "can't follow $top_link_num";
 553
 554         print $m->uri, "\n";
 555         for my $image (grep m{^http://store4}, map $_->[0], @{$m->links}) {
 556             my $local = basename $image;
 557             print " $image...", $m->mirror($image, $local)->message, "\n"
 558         }
 559
 560         $m->back or die "can't go back";
 561     }