X-Git-Url: http://git.maemo.org/git/?a=blobdiff_plain;ds=sidebyside;f=dev%2Fi386%2Flibwww-perl%2Flibwww-perl-5.813%2Fbin%2Flwp-rget;fp=dev%2Fi386%2Flibwww-perl%2Flibwww-perl-5.813%2Fbin%2Flwp-rget;h=aaf899b8460ba15df451c0479e89396f6be4b665;hb=8977e561d8a9eae6959218b0306c9df2056a38a9;hp=0000000000000000000000000000000000000000;hpb=df794b845212301ea0d267c919232538bfef356a;p=dh-make-perl

diff --git a/dev/i386/libwww-perl/libwww-perl-5.813/bin/lwp-rget b/dev/i386/libwww-perl/libwww-perl-5.813/bin/lwp-rget
new file mode 100755
index 0000000..aaf899b
--- /dev/null
+++ b/dev/i386/libwww-perl/libwww-perl-5.813/bin/lwp-rget
@@ -0,0 +1,607 @@
#!/usr/bin/perl -w

=head1 NAME

lwp-rget - Retrieve web documents recursively

=head1 SYNOPSIS

 lwp-rget [--verbose] [--auth=USER:PASS] [--depth=N] [--hier] [--iis]
          [--keepext=mime/type[,mime/type]] [--limit=N] [--nospace]
          [--prefix=URL] [--referer=URL] [--sleep=N] [--tolower] <URL>
 lwp-rget --version

=head1 DESCRIPTION

This program will retrieve a document and store it in a local file.  It
will follow any links found in the document and store these documents
as well, patching the links so that they refer to the local copies.
This process continues until there are no more unvisited links or the
process is stopped by one or more of the limits which can be
controlled by the command line arguments.

This program is useful if you want to make a local copy of a
collection of documents or want to do web reading off-line.

All documents are stored as plain files in the current directory.  The
file names chosen are derived from the last component of URL paths.

The options are:

=over 3

=item --auth=USER:PASS

Set the authentication credentials to user "USER" and password "PASS" if
any restricted parts of the web site are hit.  If there are restricted
parts of the web site and authentication credentials are not available,
those pages will not be downloaded.

=item --depth=I<n>

Limit the recursion level.  Embedded images are always loaded, even if
they fall outside the I<--depth>.  This means that one can use
I<--depth=0> in order to fetch a single document together with all
inline graphics.

The default depth is 5.

=item --hier

Download files into a hierarchy that mimics the web site structure.
The default is to put all files in the current directory.

=item --referer=I<URI>

Set the value of the Referer header for the initial request.  The
special value C<"NONE"> can be used to suppress the Referer header in
any subsequent requests.  The Referer header will always be suppressed
in all normal C<http> requests if the referring page was transmitted over
C<https>, as recommended in RFC 2616.

=item --iis

Sends an "Accept: */*" header on all URL requests as a workaround for a
bug in IIS 2.0.  If no Accept MIME header is present, IIS 2.0 returns a
"406 No acceptable objects were found" error.  Also converts any back
slashes (\) in URLs to forward slashes (/).

=item --keepext=I<mime/type[,mime/type]>

Keeps the current extension for the listed MIME types.  Useful when
downloading text/plain documents that shouldn't all be translated to
*.txt files.

=item --limit=I<n>

Limit the number of documents to get.  The default limit is 50.

=item --nospace

Changes spaces in all URLs to underscore characters (_).  Useful when
downloading files from sites serving URLs with spaces in them.  Does not
remove spaces from fragments, e.g., "file.html#somewhere in here".

=item --prefix=I<URL>

Limit the links to follow.  Only URLs that start with the prefix string
are followed (see the example following this option list).

The default prefix is set as the "directory" of the initial URL to
follow.  For instance, if we start lwp-rget with the URL
C<http://www.sn.no/foo/bar.html>, then the prefix will be set to
C<http://www.sn.no/foo/>.

Use C<--prefix=''> if you don't want the fetching to be limited by any
prefix.

=item --sleep=I<n>

Sleep I<n> seconds before retrieving each document.  This option allows
you to go slowly and avoid putting too much load on the server you are
visiting.

=item --tolower

Translates all links to lowercase.  Useful when downloading files from
IIS, since it does not serve files in a case-sensitive manner.

=item --verbose

Make more noise while running.

=item --quiet

Don't make any noise.

=item --version

Print program version number and quit.

=item --help

Print the usage message and quit.

=back
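
As an example, the following invocation (the site name and path are purely
illustrative) mirrors a set of pages two levels deep into a local directory
hierarchy, pausing one second between requests.  Because no I<--prefix> is
given, the prefix defaults to C<http://www.example.com/manual/>, so links
leading elsewhere are not followed:

  lwp-rget --verbose --hier --depth=2 --sleep=1 \
      http://www.example.com/manual/index.html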

Before the program exits, the name of the file where the initial URL is
stored is printed on stdout.  All used filenames are also printed on
stderr as they are loaded.  This printing can be suppressed with the
I<--quiet> option.

=head1 SEE ALSO

L<lwp-request>, L<LWP>

=head1 AUTHOR

Gisle Aas

=cut

use strict;

use Getopt::Long    qw(GetOptions);
use URI::URL        qw(url);
use LWP::MediaTypes qw(media_suffix);
use HTML::Entities  ();

use vars qw($VERSION);
use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $REFERER $VERBOSE $QUIET $SLEEP $HIER $AUTH $IIS $TOLOWER $NOSPACE %KEEPEXT);

my $progname = $0;
$progname =~ s|.*/||;            # only basename left
$progname =~ s/\.\w*$//;         # strip extension, if any

$VERSION = "5.810";

#$Getopt::Long::debug = 1;
#$Getopt::Long::ignorecase = 0;

# Defaults
$MAX_DEPTH = 5;
$MAX_DOCS  = 50;

GetOptions('version'   => \&print_version,
           'help'      => \&usage,
           'depth=i'   => \$MAX_DEPTH,
           'limit=i'   => \$MAX_DOCS,
           'verbose!'  => \$VERBOSE,
           'quiet!'    => \$QUIET,
           'sleep=i'   => \$SLEEP,
           'prefix:s'  => \$PREFIX,
           'referer:s' => \$REFERER,
           'hier'      => \$HIER,
           'auth=s'    => \$AUTH,
           'iis'       => \$IIS,
           'tolower'   => \$TOLOWER,
           'nospace'   => \$NOSPACE,
           'keepext=s' => \$KEEPEXT{'OPT'},
          ) || usage();

sub print_version {
    require LWP;
    my $DISTNAME = 'libwww-perl-' . LWP::Version();
    print <<"EOT";
This is lwp-rget version $VERSION ($DISTNAME)

Copyright 1996-1998, Gisle Aas.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
EOT
    exit 0;
}

my $start_url = shift || usage();
usage() if @ARGV;

require LWP::UserAgent;
my $ua = new LWP::UserAgent;
$ua->agent("$progname/$VERSION " . $ua->agent);
$ua->env_proxy;

unless (defined $PREFIX) {
    $PREFIX = url($start_url);   # limit to URLs below this one
    eval {
        $PREFIX->eparams(undef);
        $PREFIX->equery(undef);
    };

    $_ = $PREFIX->epath;
    s|[^/]+$||;
    $PREFIX->epath($_);
    $PREFIX = $PREFIX->as_string;
}
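
# A worked example of the block above (the URL is hypothetical): starting
# lwp-rget on http://www.example.com/docs/index.html?lang=en drops the query
# and parameters and trims the last path component, so $PREFIX becomes
# "http://www.example.com/docs/"; only URLs beginning with that string
# (plus inline images) will be fetched.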

%KEEPEXT = map { lc($_) => 1 } split(/\s*,\s*/, ($KEEPEXT{'OPT'}||0));

my $SUPPRESS_REFERER;
$SUPPRESS_REFERER++ if ($REFERER || "") eq "NONE";

print <<"" if $VERBOSE;
START     = $start_url
MAX_DEPTH = $MAX_DEPTH
MAX_DOCS  = $MAX_DOCS
PREFIX    = $PREFIX

my $no_docs = 0;
my %seen = ();     # mapping from URL => local_file

my $filename = fetch($start_url, undef, $REFERER);
print "$filename\n" unless $QUIET;

sub fetch
{
    my($url, $type, $referer, $depth) = @_;

    # Fix http://sitename.com/../blah/blah.html to
    #     http://sitename.com/blah/blah.html
    $url = $url->as_string if (ref($url));
    while ($url =~ s#(https?://[^/]+/)\.\.\/#$1#) {}

    # Fix backslashes (\) in URL if $IIS defined
    $url = fix_backslashes($url) if (defined $IIS);

    $url = url($url);
    $type ||= 'a';
    # Might be the background attribute
    $type = 'img' if ($type eq 'body' || $type eq 'td');
    $depth ||= 0;

    # Print the URL before we start checking...
    my $out = (" " x $depth) . $url . " ";
    $out .= "." x (60 - length($out));
    print STDERR $out . " " if $VERBOSE;

    # Can't get mailto things
    if ($url->scheme eq 'mailto') {
        print STDERR "*skipping mailto*\n" if $VERBOSE;
        return $url->as_string;
    }

    # The $plain_url is a URL without the fragment part
    my $plain_url = $url->clone;
    $plain_url->frag(undef);

    # Check PREFIX, but not for image links
    if ($type ne 'img' and $url->as_string !~ /^\Q$PREFIX/o) {
        print STDERR "*outsider*\n" if $VERBOSE;
        return $url->as_string;
    }

    # Translate URL to lowercase if $TOLOWER defined
    $plain_url = to_lower($plain_url) if (defined $TOLOWER);

    # If we already have it, then there is nothing to be done
    my $seen = $seen{$plain_url->as_string};
    if ($seen) {
        my $frag = $url->frag;
        $seen .= "#$frag" if defined($frag);
        $seen = protect_frag_spaces($seen);
        print STDERR "$seen (again)\n" if $VERBOSE;
        return $seen;
    }

    # Too much or too deep
    if ($depth > $MAX_DEPTH and $type ne 'img') {
        print STDERR "*too deep*\n" if $VERBOSE;
        return $url;
    }
    if ($no_docs > $MAX_DOCS) {
        print STDERR "*too many*\n" if $VERBOSE;
        return $url;
    }

    # Fetch document
    $no_docs++;
    sleep($SLEEP) if $SLEEP;
    my $req = HTTP::Request->new(GET => $url);
    # See: http://ftp.sunet.se/pub/NT/mirror-microsoft/kb/Q163/7/74.TXT
    $req->header('Accept', '*/*') if (defined $IIS);  # GIF/JPG from IIS 2.0
    $req->authorization_basic(split(/:/, $AUTH)) if (defined $AUTH);
    if ($referer && !$SUPPRESS_REFERER) {
        if ($req->url->scheme eq 'http') {
            # RFC 2616, section 15.1.3
            $referer = url($referer) unless ref($referer);
            undef $referer if ($referer->scheme || '') eq 'https';
        }
        $req->referer($referer) if $referer;
    }
    my $res = $ua->request($req);

    # Check outcome
    if ($res->is_success) {
        my $doc = $res->content;
        my $ct = $res->content_type;
        my $name = find_name($res->request->url, $ct);
        print STDERR "$name\n" unless $QUIET;
        $seen{$plain_url->as_string} = $name;

        # If the file is HTML, then we look for internal links
        if ($ct eq "text/html") {
            # Save an unprocessed version of the HTML document.  This
            # both reserves the name used, and it also ensures that we
            # don't lose everything if this program is killed before
            # we finish.
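            # (The document is therefore written twice: this first save()
            # stores it verbatim, and a second save() near the end of this
            # block stores it again after the substitution below has
            # rewritten its links to point at the local copies.)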
+ save($name, $doc); + my $base = $res->base; + + # Follow and substitute links... + $doc =~ +s/ + ( + <(img|a|body|area|frame|td)\b # some interesting tag + [^>]+ # still inside tag (not strictly correct) + \b(?:src|href|background) # some link attribute + \s*=\s* # = + ) + (?: # scope of OR-ing + (")([^"]*)" | # value in double quotes OR + (')([^']*)' | # value in single quotes OR + ([^\s>]+) # quoteless value + ) +/ + new_link($1, lc($2), $3||$5, HTML::Entities::decode($4||$6||$7), + $base, $name, "$url", $depth+1) +/giex; + # XXX + # The regular expression above is not strictly correct. + # It is not really possible to parse HTML with a single + # regular expression, but it is faster. Tags that might + # confuse us include: + # + # + # + } + save($name, $doc); + return $name; + } + else { + print STDERR $res->code . " " . $res->message . "\n" if $VERBOSE; + $seen{$plain_url->as_string} = $url->as_string; + return $url->as_string; + } +} + +sub new_link +{ + my($pre, $type, $quote, $url, $base, $localbase, $referer, $depth) = @_; + + $url = protect_frag_spaces($url); + + $url = fetch(url($url, $base)->abs, $type, $referer, $depth); + $url = url("file:$url", "file:$localbase")->rel + unless $url =~ /^[.+\-\w]+:/; + + $url = unprotect_frag_spaces($url); + + return $pre . $quote . $url . $quote; +} + + +sub protect_frag_spaces +{ + my ($url) = @_; + + $url = $url->as_string if (ref($url)); + + if ($url =~ m/^([^#]*#)(.+)$/) + { + my ($base, $frag) = ($1, $2); + $frag =~ s/ /%20/g; + $url = $base . $frag; + } + + return $url; +} + + +sub unprotect_frag_spaces +{ + my ($url) = @_; + + $url = $url->as_string if (ref($url)); + + if ($url =~ m/^([^#]*#)(.+)$/) + { + my ($base, $frag) = ($1, $2); + $frag =~ s/%20/ /g; + $url = $base . $frag; + } + + return $url; +} + + +sub fix_backslashes +{ + my ($url) = @_; + my ($base, $frag); + + $url = $url->as_string if (ref($url)); + + if ($url =~ m/([^#]+)(#.*)/) + { + ($base, $frag) = ($1, $2); + } + else + { + $base = $url; + $frag = ""; + } + + $base =~ tr/\\/\//; + $base =~ s/%5[cC]/\//g; # URL-encoded back slash is %5C + + return $base . $frag; +} + + +sub to_lower +{ + my ($url) = @_; + my $was_object = 0; + + if (ref($url)) + { + $url = $url->as_string; + $was_object = 1; + } + + if ($url =~ m/([^#]+)(#.*)/) + { + $url = lc($1) . $2; + } + else + { + $url = lc($url); + } + + if ($was_object == 1) + { + return url($url); + } + else + { + return $url; + } +} + + +sub translate_spaces +{ + my ($url) = @_; + my ($base, $frag); + + $url = $url->as_string if (ref($url)); + + if ($url =~ m/([^#]+)(#.*)/) + { + ($base, $frag) = ($1, $2); + } + else + { + $base = $url; + $frag = ""; + } + + $base =~ s/^ *//; # Remove initial spaces from base + $base =~ s/ *$//; # Remove trailing spaces from base + + $base =~ tr/ /_/; + $base =~ s/%20/_/g; # URL-encoded space is %20 + + return $base . $frag; +} + + +sub mkdirp +{ + my($directory, $mode) = @_; + my @dirs = split(/\//, $directory); + my $path = shift(@dirs); # build it as we go + my $result = 1; # assume it will work + + unless (-d $path) { + $result &&= mkdir($path, $mode); + } + + foreach (@dirs) { + $path .= "/$_"; + if ( ! 
-d $path) {
            $result &&= mkdir($path, $mode);
        }
    }

    return $result;
}


sub find_name
{
    my($url, $type) = @_;
    #print "find_name($url, $type)\n";

    # Translate spaces in URL to underscores (_) if $NOSPACE defined
    $url = translate_spaces($url) if (defined $NOSPACE);

    # Translate URL to lowercase if $TOLOWER defined
    $url = to_lower($url) if (defined $TOLOWER);

    $url = url($url) unless ref($url);

    my $path = $url->path;

    # trim path until only the basename is left
    $path =~ s|(.*/)||;
    my $dirname = ".$1";
    if (!$HIER) {
        $dirname = "";
    }
    elsif (! -d $dirname) {
        mkdirp($dirname, 0775);
    }

    my $extra = "";  # something to make the name unique
    my $suffix;

    if ($KEEPEXT{lc($type)}) {
        $suffix = ($path =~ m/\.(.*)/) ? $1 : "";
    }
    else {
        $suffix = media_suffix($type);
    }

    $path =~ s|\..*||;  # trim suffix
    $path = "index" unless length $path;

    while (1) {
        # Construct a new file name
        my $file = $dirname . $path . $extra;
        $file .= ".$suffix" if $suffix;
        # Check if it is unique
        return $file unless -f $file;

        # Try something extra
        unless ($extra) {
            $extra = "001";
            next;
        }
        $extra++;
    }
}


sub save
{
    my $name = shift;
    #print "save($name,...)\n";
    open(FILE, ">$name") || die "Can't save $name: $!";
    binmode FILE;
    print FILE $_[0];
    close(FILE);
}


sub usage
{
    print <<""; exit 1;
Usage: $progname [options] <URL>
Allowed options are:
  --auth=USER:PASS  Set authentication credentials for web site
  --depth=N         Maximum depth to traverse (default is $MAX_DEPTH)
  --hier            Download into hierarchy (not all files into cwd)
  --referer=URI     Set initial referer header (or "NONE")
  --iis             Workaround IIS 2.0 bug by sending "Accept: */*" MIME
                    header; translates backslashes (\\) to forward slashes (/)
  --keepext=type    Keep file extension for MIME types (comma-separated list)
  --limit=N         A limit on the number of documents to get (default is $MAX_DOCS)
  --nospace         Translate spaces in URLs (not #fragments) to underscores (_)
  --version         Print version number and quit
  --verbose         More output
  --quiet           No output
  --sleep=SECS      Sleep between gets, i.e. go slowly
  --prefix=PREFIX   Limit URLs to follow to those which begin with PREFIX
  --tolower         Translate all URLs to lowercase (useful with IIS servers)

}