git.maemo.org Git - pkg-perl/blob - deb-src/libhtml-parser-perl/libhtml-parser-perl-3.56/t/tokeparser.t

   1 use Test::More tests => 17;
   2
   3 use strict;
   4 use HTML::TokeParser;
   5
   6 # First we create an HTML document to test
   7
   8 my $file = "ttest$$.htm";
   9 die "$file already exists" if -e $file;
  10
  11 open(F, ">$file") or die "Can't create $file: $!";
  12 print F <<'EOT';  close(F);
  13
  14 <!--This is a test-->
  15 <html><head><title>
  16   This is the &lt;title&gt;
  17 </title>
  18
  19   <base href="http://www.perl.com">
  20 </head>
  21
  22 <body background="bg.gif">
  23
  24     <h1>This is the <b>title</b> again
  25     </h1>
  26
  27     And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl">&nbsp;<!--nice isn't it-->Institute</a>
  28
  29    <br/><? process instruction >
  30
  31 </body>
  32 </html>
  33
  34 EOT
  35
  36 END { unlink($file) || warn "Can't unlink $file: $!"; }
  37
  38
  39 my $p;
  40
  41
  42 $p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
  43 ok($p->unbroken_text);
  44 if ($p->get_tag("foo", "title")) {
  45     my $title = $p->get_trimmed_text;
  46     #diag "Title: $title";
  47     is($title, "This is the <title>");
  48 }
  49 undef($p);
  50
  51 # Test with reference to glob
  52 open(F, $file) || die "Can't open $file: $!";
  53 $p = HTML::TokeParser->new(\*F);
  54 my $scount = 0;
  55 my $ecount = 0;
  56 my $tcount = 0;
  57 my $pcount = 0;
  58 while (my $token = $p->get_token) {
  59     $scount++ if $token->[0] eq "S";
  60     $ecount++ if $token->[0] eq "E";
  61     $pcount++ if $token->[0] eq "PI";
  62 }
  63 undef($p);
  64 close F;
  65
  66 # Test with glob
  67 open(F, $file) || die "Can't open $file: $!";
  68 $p = HTML::TokeParser->new(*F);
  69 $tcount++ while $p->get_tag;
  70 undef($p);
  71 close F;
  72
  73 # Test with plain file name
  74 $p = HTML::TokeParser->new($file) || die;
  75 $tcount++ while $p->get_tag;
  76 undef($p);
  77
  78 #diag "Number of tokens found: $tcount/2 = $scount + $ecount";
  79 is($tcount, 34);
  80 is($scount, 10);
  81 is($ecount, 7);
  82 is($pcount, 1);
  83 is($tcount/2, $scount + $ecount);
  84
  85 ok(!HTML::TokeParser->new("/noT/thEre/$$"));
  86
  87
  88 $p = HTML::TokeParser->new($file) || die;
  89 $p->get_tag("a");
  90 my $atext = $p->get_text;
  91 undef($p);
  92
  93 is($atext, "Perl\240Institute");
  94
  95 # test parsing of embeded document
  96 $p = HTML::TokeParser->new(\<<HTML);
  97 <title>Title</title>
  98 <H1>
  99 Heading
 100 </h1>
 101 HTML
 102
 103 ok($p->get_tag("h1"));
 104 is($p->get_trimmed_text, "Heading");
 105 undef($p);
 106
 107 # test parsing of large embedded documents
 108 my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022;
 109
 110 #use Time::HiRes qw(time);
 111 my $start = time;
 112 $p = HTML::TokeParser->new(\$doc);
 113 #diag "Construction time: ", time - $start;
 114
 115 my $count;
 116 while (my $t = $p->get_token) {
 117     $count++ if $t->[0] eq "S";
 118 }
 119 #diag "Parse time: ", time - $start;
 120
 121 is($count, 2022);
 122
 123 $p = HTML::TokeParser->new(\<<'EOT');
 124 <H1>This is a heading</H1>
 125 This is s<b>o</b>me<hr>text.
 126 <br />
 127 This is some more text.
 128 <p>
 129 This is even some more.
 130 EOT
 131
 132 $p->get_tag("/h1");
 133
 134 my $t = $p->get_trimmed_text("br", "p");
 135 is($t, "This is some text.");
 136
 137 $p->get_tag;
 138
 139 $t = $p->get_trimmed_text("br", "p");
 140 is($t,"This is some more text.");
 141
 142 undef($p);
 143
 144 $p = HTML::TokeParser->new(\<<'EOT');
 145 <H1>This is a <b>bold</b> heading</H1>
 146 This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>.
 147 <p>
 148 This is even some more.
 149 EOT
 150
 151 $p->get_tag("h1");
 152
 153 $t = $p->get_phrase;
 154 is($t, "This is a bold heading");
 155
 156 $t = $p->get_phrase;
 157 is($t, "");
 158
 159 $p->get_tag;
 160
 161 $t = $p->get_phrase;
 162 is($t, "This is some italic text. This is some more text.");
 163
 164 undef($p);