git.maemo.org Git - quicknewsreader/blob - qml/QuickNewsReader/content/js/Readability.js

   1 /*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */
   2 /*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */
   3
   4 var dbg = function(s) {
   5     window.console.log("Readability: " + s);
   6 };
   7
   8 /*
   9  * Readability. An Arc90 Lab Experiment.
  10  * Website: http://lab.arc90.com/experiments/readability
  11  * Source:  http://code.google.com/p/arc90labs-readability
  12  *
  13  * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
  14  *
  15  * Copyright (c) 2010 Arc90 Inc
  16  * Readability is licensed under the Apache License, Version 2.0.
  17 **/
  18 var readability = {
  19     version:                '1.7.1',
  20     emailSrc:               'http://lab.arc90.com/experiments/readability/email.php',
  21     iframeLoads:             0,
  22     convertLinksToFootnotes: false,
  23     reversePageScroll:       false, /* If they hold shift and hit space, scroll up */
  24     frameHack:               false, /**
  25                                       * The frame hack is to workaround a firefox bug where if you
  26                                       * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
  27                                       * So we fake a scrollbar in the wrapping div.
  28                                      **/
  29     biggestFrame:            false,
  30     bodyCache:               null,   /* Cache the body HTML in case we need to re-use it later */
  31     flags:                   0x1 | 0x2 | 0x4,   /* Start with all flags set. */
  32
  33     /* constants */
  34     FLAG_STRIP_UNLIKELYS:     0x1,
  35     FLAG_WEIGHT_CLASSES:      0x2,
  36     FLAG_CLEAN_CONDITIONALLY: 0x4,
  37
  38     maxPages:    30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
  39     parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
  40     pageETags:   {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
  41
  42     /**
  43      * All of the regular expressions in use within readability.
  44      * Defined up here so we don't instantiate them repeatedly in loops.
  45      **/
  46     regexps: {
  47         unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
  48         okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,
  49         positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  50         negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
  51         extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
  52         divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  53         replaceBrs:            /(<br[^>]*>[ \n\r\t]*){2,}/gi,
  54         replaceFonts:          /<(\/?)font[^>]*>/gi,
  55         trim:                  /^\s+|\s+$/g,
  56         normalize:             /\s{2,}/g,
  57         killBreaks:            /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
  58         videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
  59         skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
  60         nextLink:              /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
  61         prevLink:              /(prev|earl|old|new|<|«)/i
  62     },
  63
  64     /**
  65      * Runs readability.
  66      *
  67      * Workflow:
  68      *  1. Prep the document by removing script tags, css, etc.
  69      *  2. Build readability's DOM tree.
  70      *  3. Grab the article content from the current dom tree.
  71      *  4. Replace the current DOM tree with the new one.
  72      *  5. Read peacefully.
  73      *
  74      * @return void
  75      **/
  76     init: function() {
  77         /* Before we do anything, remove all scripts that are not readability. */
  78         window.onload = window.onunload = function() {};
  79
  80         readability.removeScripts(document);
  81
  82         if(document.body && !readability.bodyCache) {
  83             readability.bodyCache = document.body.innerHTML;
  84
  85         }
  86         /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
  87         readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
  88
  89         /* Pull out any possible next page link first */
  90         var nextPageLink = readability.findNextPageLink(document.body);
  91
  92         readability.prepDocument();
  93
  94         /* Build readability's DOM tree */
  95         var overlay        = document.createElement("DIV");
  96         var innerDiv       = document.createElement("DIV");
  97         var articleTools   = readability.getArticleTools();
  98         var articleTitle   = readability.getArticleTitle();
  99         var articleContent = readability.grabArticle();
 100         var articleFooter  = readability.getArticleFooter();
 101
 102         if(!articleContent) {
 103             articleContent    = document.createElement("DIV");
 104             articleContent.id = "readability-content";
 105             articleContent.innerHTML = [
 106                 "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
 107                 (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames. You may want to try running readability itself on this source page: <a href='" + readability.biggestFrame.src + "'>" + readability.biggestFrame.src + "</a></p>" : ""),
 108                 "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
 109             ].join('');
 110
 111             nextPageLink = null;
 112         }
 113
 114         overlay.id              = "readOverlay";
 115         innerDiv.id             = "readInner";
 116
 117         /* Apply user-selected styling */
 118         document.body.className = readStyle;
 119         document.dir            = readability.getSuggestedDirection(articleTitle.innerHTML);
 120
 121         if (readStyle === "style-athelas" || readStyle === "style-apertura"){
 122             overlay.className = readStyle + " rdbTypekit";
 123         }
 124         else {
 125             overlay.className = readStyle;
 126         }
 127         innerDiv.className    = readMargin + " " + readSize;
 128
 129         if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
 130             readability.convertLinksToFootnotes = true;
 131         }
 132
 133         /* Glue the structure of our document together. */
 134         innerDiv.appendChild( articleTitle   );
 135         innerDiv.appendChild( articleContent );
 136         innerDiv.appendChild( articleFooter  );
 137          overlay.appendChild( articleTools   );
 138          overlay.appendChild( innerDiv       );
 139
 140         /* Clear the old HTML, insert the new content. */
 141         document.body.innerHTML = "";
 142         document.body.insertBefore(overlay, document.body.firstChild);
 143         document.body.removeAttribute('style');
 144
 145         if(readability.frameHack)
 146         {
 147             var readOverlay = document.getElementById('readOverlay');
 148             readOverlay.style.height = '100%';
 149             readOverlay.style.overflow = 'auto';
 150         }
 151
 152         /**
 153          * If someone tries to use Readability on a site's root page, give them a warning about usage.
 154         **/
 155         if((window.location.protocol + "//" + window.location.host + "/") === window.location.href)
 156         {
 157             articleContent.style.display = "none";
 158             var rootWarning = document.createElement('p');
 159                 rootWarning.id = "readability-warning";
 160                 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
 161                     "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
 162
 163             innerDiv.insertBefore( rootWarning, articleContent );
 164         }
 165
 166         readability.postProcessContent(articleContent);
 167
 168         window.scrollTo(0, 0);
 169
 170         /* If we're using the Typekit library, select the font */
 171         if (readStyle === "style-athelas" || readStyle === "style-apertura") {
 172             readability.useRdbTypekit();
 173         }
 174
 175         if (nextPageLink) {
 176             /**
 177              * Append any additional pages after a small timeout so that people
 178              * can start reading without having to wait for this to finish processing.
 179             **/
 180             window.setTimeout(function() {
 181                 readability.appendNextPage(nextPageLink);
 182             }, 500);
 183         }
 184
 185         /** Smooth scrolling **/
 186         document.onkeydown = function(e) {
 187             var code = (window.event) ? event.keyCode : e.keyCode;
 188             if (code === 16) {
 189                 readability.reversePageScroll = true;
 190                 return;
 191             }
 192
 193             if (code === 32) {
 194                 readability.curScrollStep = 0;
 195                 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
 196
 197                 if(readability.reversePageScroll) {
 198                     readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
 199                 }
 200                 else {
 201                     readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
 202                 }
 203
 204                 return false;
 205             }
 206         };
 207
 208         document.onkeyup = function(e) {
 209             var code = (window.event) ? event.keyCode : e.keyCode;
 210             if (code === 16) {
 211                 readability.reversePageScroll = false;
 212                 return;
 213             }
 214         };
 215     },
 216
 217     /**
 218      * Run any post-process modifications to article content as necessary.
 219      *
 220      * @param Element
 221      * @return void
 222     **/
 223     postProcessContent: function(articleContent) {
 224         if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
 225             readability.addFootnotes(articleContent);
 226         }
 227
 228         readability.fixImageFloats(articleContent);
 229     },
 230
 231     /**
 232      * Some content ends up looking ugly if the image is too large to be floated.
 233      * If the image is wider than a threshold (currently 55%), no longer float it,
 234      * center it instead.
 235      *
 236      * @param Element
 237      * @return void
 238     **/
 239     fixImageFloats: function (articleContent) {
 240         var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
 241             images              = articleContent.getElementsByTagName('img');
 242
 243         for(var i=0, il = images.length; i < il; i+=1) {
 244             var image = images[i];
 245
 246             if(image.offsetWidth > imageWidthThreshold) {
 247                 image.className += " blockImage";
 248             }
 249         }
 250     },
 251
    /**
     * Build the article tools Element, which holds the reload, print and email links.
     *
     * @return Element
     **/
 257     getArticleTools: function () {
 258         var articleTools = document.createElement("DIV");
 259
 260         articleTools.id        = "readTools";
 261         articleTools.innerHTML =
 262             "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
 263             "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
 264             "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
 265
 266         return articleTools;
 267     },
 268
    /**
     * Returns the suggested text direction for the given string.
     *
     * @param text the string to inspect
     * @return "rtl" || "ltr"
     **/
 274     getSuggestedDirection: function(text) {
 275         function sanitizeText() {
 276             return text.replace(/@\w+/, "");
 277         }
 278
 279         function countMatches(match) {
 280             var matches = text.match(new RegExp(match, "g"));
 281             return matches !== null ? matches.length : 0;
 282         }
 283
 284         function isRTL() {
 285             var count_heb =  countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
 286             var count_arb =  countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
 287
 288             // if 20% of chars are Hebrew or Arbic then direction is rtl
 289             return  (count_heb + count_arb) * 100 / text.length > 20;
 290         }
 291
 292         text  = sanitizeText(text);
 293         return isRTL() ? "rtl" : "ltr";
 294     },
 295
 296
    /**
     * Get the article title as an H1.
     *
     * @return Element the H1 element holding the title
     **/
 302     getArticleTitle: function () {
 303         var curTitle = "",
 304             origTitle = "";
 305
 306         try {
 307             curTitle = origTitle = document.title;
 308
 309             if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
 310                 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
 311             }
 312         }
 313         catch(e) {}
 314
 315         if(curTitle.match(/ [\|\-] /))
 316         {
 317             curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
 318
 319             if(curTitle.split(' ').length < 3) {
 320                 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
 321             }
 322         }
 323         else if(curTitle.indexOf(': ') !== -1)
 324         {
 325             curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
 326
 327             if(curTitle.split(' ').length < 3) {
 328                 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
 329             }
 330         }
 331         else if(curTitle.length > 150 || curTitle.length < 15)
 332         {
 333             var hOnes = document.getElementsByTagName('h1');
 334             if(hOnes.length === 1)
 335             {
 336                 curTitle = readability.getInnerText(hOnes[0]);
 337             }
 338         }
 339
 340         curTitle = curTitle.replace( readability.regexps.trim, "" );
 341
 342         if(curTitle.split(' ').length <= 4) {
 343             curTitle = origTitle;
 344         }
 345
 346         var articleTitle = document.createElement("H1");
 347         articleTitle.innerHTML = curTitle;
 348
 349         return articleTitle;
 350     },
 351
    /**
     * Get the footer with the readability mark etc.
     *
     * @return Element
     **/
 357     getArticleFooter: function () {
 358         var articleFooter = document.createElement("DIV");
 359
 360         /**
 361          * For research purposes, generate an img src that contains the chosen readstyle etc,
 362          * so we can generate aggregate stats and change styles based on them in the future
 363          **/
 364         // var statsQueryParams = "?readStyle=" + encodeURIComponent(readStyle) + "&readMargin=" + encodeURIComponent(readMargin) + "&readSize=" + encodeURIComponent(readSize);
 365         /* TODO: attach this to an image */
 366
 367         articleFooter.id = "readFooter";
 368         articleFooter.innerHTML = [
 369         "<div id='rdb-footer-print'>Excerpted from <cite>" + document.title + "</cite><br />" + window.location.href + "</div>",
 370         "<div id='rdb-footer-wrapper'>",
 371              "<div id='rdb-footer-left'>",
 372                  "<a href='http://lab.arc90.com/experiments/readability' id='readability-logo'>Readability &mdash;&nbsp;</a>",
 373                  "<a href='http://www.arc90.com/' id='arc90-logo'> An Arc90 Laboratory Experiment&nbsp;</a>",
 374                  " <span id='readability-url'> http://lab.arc90.com/experiments/readability</span>",
 375              "</div>",
 376              "<div id='rdb-footer-right'>",
 377                  "<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter &raquo;</a>",
 378                  "<span class='version'>Readability version " + readability.version + "</span>",
 379              "</div>",
 380         "</div>"].join('');
 381
 382         return articleFooter;
 383     },
 384
 385     /**
 386      * Prepare the HTML document for readability to scrape it.
 387      * This includes things like stripping javascript, CSS, and handling terrible markup.
 388      *
 389      * @return void
 390      **/
 391     prepDocument: function () {
 392         /**
 393          * In some cases a body element can't be found (if the HTML is totally hosed for example)
 394          * so we create a new body node and append it to the document.
 395          */
 396         if(document.body === null)
 397         {
 398             var body = document.createElement("body");
 399             try {
 400                 document.body = body;
 401             }
 402             catch(e) {
 403                 document.documentElement.appendChild(body);
 404                 dbg(e);
 405             }
 406         }
 407
 408         document.body.id = "readabilityBody";
 409
 410         var frames = document.getElementsByTagName('frame');
 411         if(frames.length > 0)
 412         {
 413             var bestFrame = null;
 414             var bestFrameSize = 0;    /* The frame to try to run readability upon. Must be on same domain. */
 415             var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
 416             for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1)
 417             {
 418                 var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
 419                 var canAccessFrame = false;
 420                 try {
 421                     var frameBody = frames[frameIndex].contentWindow.document.body;
 422                     canAccessFrame = true;
 423                 }
 424                 catch(eFrames) {
 425                     dbg(eFrames);
 426                 }
 427
 428                 if(frameSize > biggestFrameSize) {
 429                     biggestFrameSize         = frameSize;
 430                     readability.biggestFrame = frames[frameIndex];
 431                 }
 432
 433                 if(canAccessFrame && frameSize > bestFrameSize)
 434                 {
 435                     readability.frameHack = true;
 436
 437                     bestFrame = frames[frameIndex];
 438                     bestFrameSize = frameSize;
 439                 }
 440             }
 441
 442             if(bestFrame)
 443             {
 444                 var newBody = document.createElement('body');
 445                 newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
 446                 newBody.style.overflow = 'scroll';
 447                 document.body = newBody;
 448
 449                 var frameset = document.getElementsByTagName('frameset')[0];
 450                 if(frameset) {
 451                     frameset.parentNode.removeChild(frameset); }
 452             }
 453         }
 454
 455         /* Remove all stylesheets */
 456         for (var k=0;k < document.styleSheets.length; k+=1) {
 457             if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {
 458                 document.styleSheets[k].disabled = true;
 459             }
 460         }
 461
 462         /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
 463         var styleTags = document.getElementsByTagName("style");
 464         for (var st=0;st < styleTags.length; st+=1) {
 465             styleTags[st].textContent = "";
 466         }
 467
 468         /* Turn all double br's into p's */
 469         /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
 470         document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
 471     },
 472
 473     /**
 474      * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
 475      * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
 476      *
 477      * @return void
 478     **/
 479     addFootnotes: function(articleContent) {
 480         var footnotesWrapper = document.getElementById('readability-footnotes'),
 481             articleFootnotes = document.getElementById('readability-footnotes-list');
 482
 483         if(!footnotesWrapper) {
 484             footnotesWrapper               = document.createElement("DIV");
 485             footnotesWrapper.id            = 'readability-footnotes';
 486             footnotesWrapper.innerHTML     = '<h3>References</h3>';
 487             footnotesWrapper.style.display = 'none'; /* Until we know we have footnotes, don't show the references block. */
 488
 489             articleFootnotes    = document.createElement('ol');
 490             articleFootnotes.id = 'readability-footnotes-list';
 491
 492             footnotesWrapper.appendChild(articleFootnotes);
 493
 494             var readFooter = document.getElementById('readFooter');
 495
 496             if(readFooter) {
 497                 readFooter.parentNode.insertBefore(footnotesWrapper, readFooter);
 498             }
 499         }
 500
 501         var articleLinks = articleContent.getElementsByTagName('a');
 502         var linkCount    = articleFootnotes.getElementsByTagName('li').length;
 503         for (var i = 0; i < articleLinks.length; i+=1)
 504         {
 505             var articleLink  = articleLinks[i],
 506                 footnoteLink = articleLink.cloneNode(true),
 507                 refLink      = document.createElement('a'),
 508                 footnote     = document.createElement('li'),
 509                 linkDomain   = footnoteLink.host ? footnoteLink.host : document.location.host,
 510                 linkText     = readability.getInnerText(articleLink);
 511
 512             if(articleLink.className && articleLink.className.indexOf('readability-DoNotFootnote') !== -1 || linkText.match(readability.regexps.skipFootnoteLink)) {
 513                 continue;
 514             }
 515
 516             linkCount+=1;
 517
 518             /** Add a superscript reference after the article link */
 519             refLink.href      = '#readabilityFootnoteLink-' + linkCount;
 520             refLink.innerHTML = '<small><sup>[' + linkCount + ']</sup></small>';
 521             refLink.className = 'readability-DoNotFootnote';
 522             try { refLink.style.color = 'inherit'; } catch(e) {} /* IE7 doesn't like inherit. */
 523
 524             if(articleLink.parentNode.lastChild === articleLink) {
 525                 articleLink.parentNode.appendChild(refLink);
 526             } else {
 527                 articleLink.parentNode.insertBefore(refLink, articleLink.nextSibling);
 528             }
 529
 530             articleLink.name        = 'readabilityLink-' + linkCount;
 531             try { articleLink.style.color = 'inherit'; } catch(err) {} /* IE7 doesn't like inherit. */
 532
 533             footnote.innerHTML      = "<small><sup><a href='#readabilityLink-" + linkCount + "' title='Jump to Link in Article'>^</a></sup></small> ";
 534
 535             footnoteLink.innerHTML  = (footnoteLink.title ? footnoteLink.title : linkText);
 536             footnoteLink.name       = 'readabilityFootnoteLink-' + linkCount;
 537
 538             footnote.appendChild(footnoteLink);
 539             footnote.innerHTML = footnote.innerHTML + "<small> (" + linkDomain + ")</small>";
 540
 541             articleFootnotes.appendChild(footnote);
 542         }
 543
 544         if(linkCount > 0) {
 545             footnotesWrapper.style.display = 'block';
 546         }
 547     },
 548
 549     useRdbTypekit: function () {
 550         var rdbHead      = document.getElementsByTagName('head')[0];
 551         var rdbTKScript  = document.createElement('script');
 552         var rdbTKCode    = null;
 553
 554         var rdbTKLink    = document.createElement('a');
 555             rdbTKLink.setAttribute('class','rdbTK-powered');
 556             rdbTKLink.setAttribute('title','Fonts by Typekit');
 557             rdbTKLink.innerHTML = "Fonts by <span class='rdbTK'>Typekit</span>";
 558
 559         if (readStyle === "style-athelas") {
 560             rdbTKCode = "sxt6vzy";
 561             dbg("Using Athelas Theme");
 562
 563             rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=athelas');
 564             rdbTKLink.setAttribute('id','rdb-athelas');
 565             document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
 566         }
 567         if (readStyle === "style-apertura") {
 568             rdbTKCode = "bae8ybu";
 569             dbg("Using Inverse Theme");
 570
 571             rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=inverse');
 572             rdbTKLink.setAttribute('id','rdb-inverse');
 573             document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
 574         }
 575
 576         /**
 577          * Setting new script tag attributes to pull Typekits libraries
 578         **/
 579         rdbTKScript.setAttribute('type','text/javascript');
 580         rdbTKScript.setAttribute('src',"http://use.typekit.com/" + rdbTKCode + ".js");
 581         rdbTKScript.setAttribute('charset','UTF-8');
 582         rdbHead.appendChild(rdbTKScript);
 583
 584         /**
 585          * In the future, maybe try using the following experimental Callback function?:
 586          * http://gist.github.com/192350
 587          * &
 588          * http://getsatisfaction.com/typekit/topics/support_a_pre_and_post_load_callback_function
 589         **/
 590         var typekitLoader = function() {
 591             dbg("Looking for Typekit.");
 592             if(typeof Typekit !== "undefined") {
 593                 try {
 594                     dbg("Caught typekit");
 595                     Typekit.load();
 596                     clearInterval(window.typekitInterval);
 597                 } catch(e) {
 598                     dbg("Typekit error: " + e);
 599                 }
 600             }
 601         };
 602
 603         window.typekitInterval = window.setInterval(typekitLoader, 100);
 604     },
 605
 606     /**
 607      * Prepare the article node for display. Clean out any inline styles,
 608      * iframes, forms, strip extraneous <p> tags, etc.
 609      *
 610      * @param Element
 611      * @return void
 612      **/
 613     prepArticle: function (articleContent) {
 614         readability.cleanStyles(articleContent);
 615         readability.killBreaks(articleContent);
 616
 617         /* Clean out junk from the article content */
 618         readability.cleanConditionally(articleContent, "form");
 619         readability.clean(articleContent, "object");
 620         readability.clean(articleContent, "h1");
 621
 622         /**
 623          * If there is only one h2, they are probably using it
 624          * as a header and not a subheader, so remove it since we already have a header.
 625         ***/
 626         if(articleContent.getElementsByTagName('h2').length === 1) {
 627             readability.clean(articleContent, "h2");
 628         }
 629         readability.clean(articleContent, "iframe");
 630
 631         readability.cleanHeaders(articleContent);
 632
 633         /* Do these last as the previous stuff may have removed junk that will affect these */
 634         readability.cleanConditionally(articleContent, "table");
 635         readability.cleanConditionally(articleContent, "ul");
 636         readability.cleanConditionally(articleContent, "div");
 637
 638         /* Remove extra paragraphs */
 639         var articleParagraphs = articleContent.getElementsByTagName('p');
 640         for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
 641             var imgCount    = articleParagraphs[i].getElementsByTagName('img').length;
 642             var embedCount  = articleParagraphs[i].getElementsByTagName('embed').length;
 643             var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
 644
 645             if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
 646                 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
 647             }
 648         }
 649
 650         try {
 651             articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
 652         }
 653         catch (e) {
 654             dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
 655         }
 656     },
 657
 658     /**
 659      * Initialize a node with the readability object. Also checks the
 660      * className/id for special names to add to its score.
 661      *
 662      * @param Element
 663      * @return void
 664     **/
 665     initializeNode: function (node) {
 666         node.readability = {"contentScore": 0};
 667
 668         switch(node.tagName) {
 669             case 'DIV':
 670                 node.readability.contentScore += 5;
 671                 break;
 672
 673             case 'PRE':
 674             case 'TD':
 675             case 'BLOCKQUOTE':
 676                 node.readability.contentScore += 3;
 677                 break;
 678
 679             case 'ADDRESS':
 680             case 'OL':
 681             case 'UL':
 682             case 'DL':
 683             case 'DD':
 684             case 'DT':
 685             case 'LI':
 686             case 'FORM':
 687                 node.readability.contentScore -= 3;
 688                 break;
 689
 690             case 'H1':
 691             case 'H2':
 692             case 'H3':
 693             case 'H4':
 694             case 'H5':
 695             case 'H6':
 696             case 'TH':
 697                 node.readability.contentScore -= 5;
 698                 break;
 699         }
 700
 701         node.readability.contentScore += readability.getClassWeight(node);
 702     },
 703
 704     /***
 705      * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
 706      *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
 707      *
 708      * @param page a document to run upon. Needs to be a full document, complete with body.
 709      * @return Element
 710     **/
 711     grabArticle: function (page) {
 712         var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
 713             isPaging = (page !== null) ? true: false;
 714
 715         page = page ? page : document.body;
 716
 717         var pageCacheHtml = page.innerHTML;
 718
 719         var allElements = page.getElementsByTagName('*');
 720
 721         /**
 722          * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
 723          * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
 724          *
 725          * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
 726          * TODO: Shouldn't this be a reverse traversal?
 727         **/
 728         var node = null;
 729         var nodesToScore = [];
 730         for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
 731             /* Remove unlikely candidates */
 732             if (stripUnlikelyCandidates) {
 733                 var unlikelyMatchString = node.className + node.id;
 734                 if (
 735                     (
 736                         unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
 737                         unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
 738                         node.tagName !== "BODY"
 739                     )
 740                 )
 741                 {
 742                     dbg("Removing unlikely candidate - " + unlikelyMatchString);
 743                     node.parentNode.removeChild(node);
 744                     nodeIndex-=1;
 745                     continue;
 746                 }
 747             }
 748
 749             if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
 750                 nodesToScore[nodesToScore.length] = node;
 751             }
 752
 753             /* Turn all divs that don't have children block level elements into p's */
 754             if (node.tagName === "DIV") {
 755                 if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
 756                     var newNode = document.createElement('p');
 757                     try {
 758                         newNode.innerHTML = node.innerHTML;
 759                         node.parentNode.replaceChild(newNode, node);
 760                         nodeIndex-=1;
 761
 762                         nodesToScore[nodesToScore.length] = node;
 763                     }
 764                     catch(e) {
 765                         dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
 766                     }
 767                 }
 768                 else
 769                 {
 770                     /* EXPERIMENTAL */
 771                     for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
 772                         var childNode = node.childNodes[i];
 773                         if(childNode.nodeType === 3) { // Node.TEXT_NODE
 774                             var p = document.createElement('p');
 775                             p.innerHTML = childNode.nodeValue;
 776                             p.style.display = 'inline';
 777                             p.className = 'readability-styled';
 778                             childNode.parentNode.replaceChild(p, childNode);
 779                         }
 780                     }
 781                 }
 782             }
 783         }
 784
 785         /**
 786          * Loop through all paragraphs, and assign a score to them based on how content-y they look.
 787          * Then add their score to their parent node.
 788          *
 789          * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
 790         **/
 791         var candidates = [];
 792         for (var pt=0; pt < nodesToScore.length; pt+=1) {
 793             var parentNode      = nodesToScore[pt].parentNode;
 794             var grandParentNode = parentNode ? parentNode.parentNode : null;
 795             var innerText       = readability.getInnerText(nodesToScore[pt]);
 796
 797             if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
 798                 continue;
 799             }
 800
 801             /* If this paragraph is less than 25 characters, don't even count it. */
 802             if(innerText.length < 25) {
 803                 continue; }
 804
 805             /* Initialize readability data for the parent. */
 806             if(typeof parentNode.readability === 'undefined') {
 807                 readability.initializeNode(parentNode);
 808                 candidates.push(parentNode);
 809             }
 810
 811             /* Initialize readability data for the grandparent. */
 812             if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
 813                 readability.initializeNode(grandParentNode);
 814                 candidates.push(grandParentNode);
 815             }
 816
 817             var contentScore = 0;
 818
 819             /* Add a point for the paragraph itself as a base. */
 820             contentScore+=1;
 821
 822             /* Add points for any commas within this paragraph */
 823             contentScore += innerText.split(',').length;
 824
 825             /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
 826             contentScore += Math.min(Math.floor(innerText.length / 100), 3);
 827
 828             /* Add the score to the parent. The grandparent gets half. */
 829             parentNode.readability.contentScore += contentScore;
 830
 831             if(grandParentNode) {
 832                 grandParentNode.readability.contentScore += contentScore/2;
 833             }
 834         }
 835
 836         /**
 837          * After we've calculated scores, loop through all of the possible candidate nodes we found
 838          * and find the one with the highest score.
 839         **/
 840         var topCandidate = null;
 841         for(var c=0, cl=candidates.length; c < cl; c+=1)
 842         {
 843             /**
 844              * Scale the final candidates score based on link density. Good content should have a
 845              * relatively small link density (5% or less) and be mostly unaffected by this operation.
 846             **/
 847             candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
 848
 849             dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
 850
 851             if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
 852                 topCandidate = candidates[c]; }
 853         }
 854
 855         /**
 856          * If we still have no top candidate, just use the body as a last resort.
 857          * We also have to copy the body node so it is something we can modify.
 858          **/
 859         if (topCandidate === null || topCandidate.tagName === "BODY")
 860         {
 861             topCandidate = document.createElement("DIV");
 862             topCandidate.innerHTML = page.innerHTML;
 863             page.innerHTML = "";
 864             page.appendChild(topCandidate);
 865             readability.initializeNode(topCandidate);
 866         }
 867
 868         /**
 869          * Now that we have the top candidate, look through its siblings for content that might also be related.
 870          * Things like preambles, content split by ads that we removed, etc.
 871         **/
 872         var articleContent        = document.createElement("DIV");
 873         if (isPaging) {
 874             articleContent.id     = "readability-content";
 875         }
 876         var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
 877         var siblingNodes          = topCandidate.parentNode.childNodes;
 878
 879
 880         for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
 881             var siblingNode = siblingNodes[s];
 882             var append      = false;
 883
 884             /**
 885              * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
 886              * Example of error visible here: http://www.esquire.com/features/honesty0707
 887             **/
 888             if(!siblingNode) {
 889                 continue;
 890             }
 891
 892             dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
 893             dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
 894
 895             if(siblingNode === topCandidate)
 896             {
 897                 append = true;
 898             }
 899
 900             var contentBonus = 0;
 901             /* Give a bonus if sibling nodes and top candidates have the example same classname */
 902             if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
 903                 contentBonus += topCandidate.readability.contentScore * 0.2;
 904             }
 905
 906             if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
 907             {
 908                 append = true;
 909             }
 910
 911             if(siblingNode.nodeName === "P") {
 912                 var linkDensity = readability.getLinkDensity(siblingNode);
 913                 var nodeContent = readability.getInnerText(siblingNode);
 914                 var nodeLength  = nodeContent.length;
 915
 916                 if(nodeLength > 80 && linkDensity < 0.25)
 917                 {
 918                     append = true;
 919                 }
 920                 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
 921                 {
 922                     append = true;
 923                 }
 924             }
 925
 926             if(append) {
 927                 dbg("Appending node: " + siblingNode);
 928
 929                 var nodeToAppend = null;
 930                 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
 931                     /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
 932
 933                     dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
 934                     nodeToAppend = document.createElement("DIV");
 935                     try {
 936                         nodeToAppend.id = siblingNode.id;
 937                         nodeToAppend.innerHTML = siblingNode.innerHTML;
 938                     }
 939                     catch(er) {
 940                         dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
 941                         nodeToAppend = siblingNode;
 942                         s-=1;
 943                         sl-=1;
 944                     }
 945                 } else {
 946                     nodeToAppend = siblingNode;
 947                     s-=1;
 948                     sl-=1;
 949                 }
 950
 951                 /* To ensure a node does not interfere with readability styles, remove its classnames */
 952                 nodeToAppend.className = "";
 953
 954                 /* Append sibling and subtract from our list because it removes the node when you append to another node */
 955                 articleContent.appendChild(nodeToAppend);
 956             }
 957         }
 958
 959         /**
 960          * So we have all of the content that we need. Now we clean it up for presentation.
 961         **/
 962         readability.prepArticle(articleContent);
 963
 964         if (readability.curPageNum === 1) {
 965             articleContent.innerHTML = '<div id="readability-page-1" class="page">' + articleContent.innerHTML + '</div>';
 966         }
 967
 968         /**
 969          * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
 970          * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
 971          * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
 972          * finding the -right- content.
 973         **/
 974         if(readability.getInnerText(articleContent, false).length < 250) {
 975         page.innerHTML = pageCacheHtml;
 976
 977             if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
 978                 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
 979                 return readability.grabArticle(page);
 980             }
 981             else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
 982                 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
 983                 return readability.grabArticle(page);
 984             }
 985             else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
 986                 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
 987                 return readability.grabArticle(page);
 988             } else {
 989                 return null;
 990             }
 991         }
 992
 993         return articleContent;
 994     },
 995
 996     /**
 997      * Removes script tags from the document.
 998      *
 999      * @param Element
1000     **/
1001     removeScripts: function (doc) {
1002         var scripts = doc.getElementsByTagName('script');
1003         for(var i = scripts.length-1; i >= 0; i-=1)
1004         {
1005             if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
1006             {
1007                 scripts[i].nodeValue="";
1008                 scripts[i].removeAttribute('src');
1009                 if (scripts[i].parentNode) {
1010                         scripts[i].parentNode.removeChild(scripts[i]);
1011                 }
1012             }
1013         }
1014     },
1015
1016     /**
1017      * Get the inner text of a node - cross browser compatibly.
1018      * This also strips out any excess whitespace to be found.
1019      *
1020      * @param Element
1021      * @return string
1022     **/
1023     getInnerText: function (e, normalizeSpaces) {
1024         var textContent    = "";
1025
1026         if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
1027             return "";
1028         }
1029
1030         normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
1031
1032         if (navigator.appName === "Microsoft Internet Explorer") {
1033             textContent = e.innerText.replace( readability.regexps.trim, "" ); }
1034         else {
1035             textContent = e.textContent.replace( readability.regexps.trim, "" ); }
1036
1037         if(normalizeSpaces) {
1038             return textContent.replace( readability.regexps.normalize, " "); }
1039         else {
1040             return textContent; }
1041     },
1042
1043     /**
1044      * Get the number of times a string s appears in the node e.
1045      *
1046      * @param Element
1047      * @param string - what to split on. Default is ","
1048      * @return number (integer)
1049     **/
1050     getCharCount: function (e,s) {
1051         s = s || ",";
1052         return readability.getInnerText(e).split(s).length-1;
1053     },
1054
1055     /**
1056      * Remove the style attribute on every e and under.
1057      * TODO: Test if getElementsByTagName(*) is faster.
1058      *
1059      * @param Element
1060      * @return void
1061     **/
1062     cleanStyles: function (e) {
1063         e = e || document;
1064         var cur = e.firstChild;
1065
1066         if(!e) {
1067             return; }
1068
1069         // Remove any root styles, if we're able.
1070         if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
1071             e.removeAttribute('style'); }
1072
1073         // Go until there are no more child nodes
1074         while ( cur !== null ) {
1075             if ( cur.nodeType === 1 ) {
1076                 // Remove style attribute(s) :
1077                 if(cur.className !== "readability-styled") {
1078                     cur.removeAttribute("style");
1079                 }
1080                 readability.cleanStyles( cur );
1081             }
1082             cur = cur.nextSibling;
1083         }
1084     },
1085
1086     /**
1087      * Get the density of links as a percentage of the content
1088      * This is the amount of text that is inside a link divided by the total text in the node.
1089      *
1090      * @param Element
1091      * @return number (float)
1092     **/
1093     getLinkDensity: function (e) {
1094         var links      = e.getElementsByTagName("a");
1095         var textLength = readability.getInnerText(e).length;
1096         var linkLength = 0;
1097         for(var i=0, il=links.length; i<il;i+=1)
1098         {
1099             linkLength += readability.getInnerText(links[i]).length;
1100         }
1101
1102         return linkLength / textLength;
1103     },
1104
1105     /**
1106      * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
1107      *
1108      * @author Dan Lacy
1109      * @return string the base url
1110     **/
1111     findBaseUrl: function () {
1112         var noUrlParams     = window.location.pathname.split("?")[0],
1113             urlSlashes      = noUrlParams.split("/").reverse(),
1114             cleanedSegments = [],
1115             possibleType    = "";
1116
1117         for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
1118             var segment = urlSlashes[i];
1119
1120             // Split off and save anything that looks like a file type.
1121             if (segment.indexOf(".") !== -1) {
1122                 possibleType = segment.split(".")[1];
1123
1124                 /* If the type isn't alpha-only, it's probably not actually a file extension. */
1125                 if(!possibleType.match(/[^a-zA-Z]/)) {
1126                     segment = segment.split(".")[0];
1127                 }
1128             }
1129
1130             /**
1131              * EW-CMS specific segment replacement. Ugly.
1132              * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
1133             **/
1134             if(segment.indexOf(',00') !== -1) {
1135                 segment = segment.replace(',00', '');
1136             }
1137
1138             // If our first or second segment has anything looking like a page number, remove it.
1139             if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
1140                 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
1141             }
1142
1143
1144             var del = false;
1145
1146             /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
1147             if (i < 2 && segment.match(/^\d{1,2}$/)) {
1148                 del = true;
1149             }
1150
1151             /* If this is the first segment and it's just "index", remove it. */
1152             if(i === 0 && segment.toLowerCase() === "index") {
1153                 del = true;
1154             }
1155
1156             /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
1157             if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
1158                 del = true;
1159             }
1160
1161             /* If it's not marked for deletion, push it to cleanedSegments. */
1162             if (!del) {
1163                 cleanedSegments.push(segment);
1164             }
1165         }
1166
1167         // This is our final, cleaned, base article URL.
1168         return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
1169     },
1170
1171     /**
1172      * Look for any paging links that may occur within the document.
1173      *
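1174      * @param Element elem - the element whose links are examined
1175      * @return string|null - URL of the best-scoring next-page link (score of at least 50), or null if there is none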
1176     **/
1177     findNextPageLink: function (elem) {
1178         var possiblePages = {},
1179             allLinks = elem.getElementsByTagName('a'),
1180             articleBaseUrl = readability.findBaseUrl();
1181
1182         /**
1183          * Loop through all links, looking for hints that they may be next-page links.
1184          * Things like having "page" in their textContent, className or id, or being a child
1185          * of a node with a page-y className or id.
1186          *
1187          * Also possible: levenshtein distance? longest common subsequence?
1188          *
1189          * After we do that, assign each page a score, and
1190         **/
1191         for(var i = 0, il = allLinks.length; i < il; i+=1) {
1192             var link     = allLinks[i],
1193                 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
1194
1195             /* If we've already seen this page, ignore it */
1196             if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
1197                 continue;
1198             }
1199
1200             /* If it's on a different domain, skip it. */
1201             if(window.location.host !== linkHref.split(/\/+/g)[1]) {
1202                 continue;
1203             }
1204
1205             var linkText = readability.getInnerText(link);
1206
1207             /* If the linkText looks like it's not the next page, skip it. */
1208             if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
1209                 continue;
1210             }
1211
1212             /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
1213             var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
1214             if(!linkHrefLeftover.match(/\d/)) {
1215                 continue;
1216             }
1217
1218             if(!(linkHref in possiblePages)) {
1219                 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
1220             } else {
1221                 possiblePages[linkHref].linkText += ' | ' + linkText;
1222             }
1223
1224             var linkObj = possiblePages[linkHref];
1225
1226             /**
1227              * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
1228              * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
1229             **/
1230             if(linkHref.indexOf(articleBaseUrl) !== 0) {
1231                 linkObj.score -= 25;
1232             }
1233
1234             var linkData = linkText + ' ' + link.className + ' ' + link.id;
1235             if(linkData.match(readability.regexps.nextLink)) {
1236                 linkObj.score += 50;
1237             }
1238             if(linkData.match(/pag(e|ing|inat)/i)) {
1239                 linkObj.score += 25;
1240             }
1241             if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
1242                 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
1243                 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
1244                     linkObj.score -= 65;
1245                 }
1246             }
1247             if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
1248                 linkObj.score -= 50;
1249             }
1250             if(linkData.match(readability.regexps.prevLink)) {
1251                 linkObj.score -= 200;
1252             }
1253
1254             /* If a parentNode contains page or paging or paginat */
1255             var parentNode = link.parentNode,
1256                 positiveNodeMatch = false,
1257                 negativeNodeMatch = false;
1258             while(parentNode) {
1259                 var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
1260                 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
1261                     positiveNodeMatch = true;
1262                     linkObj.score += 25;
1263                 }
1264                 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
1265                     /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
1266                     if(!parentNodeClassAndId.match(readability.regexps.positive)) {
1267                         linkObj.score -= 25;
1268                         negativeNodeMatch = true;
1269                     }
1270                 }
1271
1272                 parentNode = parentNode.parentNode;
1273             }
1274
1275             /**
1276              * If the URL looks like it has paging in it, add to the score.
1277              * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
1278             **/
1279             if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
1280                 linkObj.score += 25;
1281             }
1282
1283             /* If the URL contains negative values, give a slight decrease. */
1284             if (linkHref.match(readability.regexps.extraneous)) {
1285                 linkObj.score -= 15;
1286             }
1287
1288             /**
1289              * Minor punishment to anything that doesn't match our current URL.
1290              * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
1291              *       Dan, can you show me a counterexample where this is necessary?
1292              * if (linkHref.indexOf(window.location.href) !== 0) {
1293              *    linkObj.score -= 1;
1294              * }
1295             **/
1296
1297             /**
1298              * If the link text can be parsed as a number, give it a minor bonus, with a slight
1299              * bias towards lower numbered pages. This is so that pages that might not have 'next'
1300              * in their text can still get scored, and sorted properly by score.
1301             **/
1302             var linkTextAsNumber = parseInt(linkText, 10);
1303             if(linkTextAsNumber) {
1304                 // Punish 1 since we're either already there, or it's probably before what we want anyways.
1305                 if (linkTextAsNumber === 1) {
1306                     linkObj.score -= 10;
1307                 }
1308                 else {
1309                     // Todo: Describe this better
1310                     linkObj.score += Math.max(0, 10 - linkTextAsNumber);
1311                 }
1312             }
1313         }
1314
1315         /**
1316          * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL.
1317          * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
1318         **/
1319         var topPage = null;
1320         for(var page in possiblePages) {
1321             if(possiblePages.hasOwnProperty(page)) {
1322                 if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
1323                     topPage = possiblePages[page];
1324                 }
1325             }
1326         }
1327
1328         if(topPage) {
1329             var nextHref = topPage.href.replace(/\/$/,'');
1330
1331             dbg('NEXT PAGE IS ' + nextHref);
1332             readability.parsedPages[nextHref] = true;
1333             return nextHref;
1334         }
1335         else {
1336             return null;
1337         }
1338     },
1339
1340     /**
1341      * Build a simple cross browser compatible XHR.
1342      *
1343      * TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk.
1344     **/
1345     xhr: function () {
1346         if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
1347             return new XMLHttpRequest();
1348         }
1349         else {
1350             try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
1351             try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
1352             try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
1353         }
1354
1355         return false;
1356     },
1357
1358     successfulRequest: function (request) {
1359         return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
1360     },
1361
1362     ajax: function (url, options) {
1363         var request = readability.xhr();
1364
1365         function respondToReadyState(readyState) {
1366             if (request.readyState === 4) {
1367                 if (readability.successfulRequest(request)) {
1368                     if (options.success) { options.success(request); }
1369                 }
1370                 else {
1371                     if (options.error) { options.error(request); }
1372                 }
1373             }
1374         }
1375
1376         if (typeof options === 'undefined') { options = {}; }
1377
1378         request.onreadystatechange = respondToReadyState;
1379
1380         request.open('get', url, true);
1381         request.setRequestHeader('Accept', 'text/html');
1382
1383         try {
1384             request.send(options.postBody);
1385         }
1386         catch (e) {
1387             if (options.error) { options.error(); }
1388         }
1389
1390         return request;
1391     },
1392
1393     /**
1394      * Make an AJAX request for each page and append it to the document.
1395     **/
1396     curPageNum: 1,
1397
1398     appendNextPage: function (nextPageLink) {
1399         readability.curPageNum+=1;
1400
1401         var articlePage       = document.createElement("DIV");
1402         articlePage.id        = 'readability-page-' + readability.curPageNum;
1403         articlePage.className = 'page';
1404         articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">&sect;</p>';
1405
1406         document.getElementById("readability-content").appendChild(articlePage);
1407
1408         if(readability.curPageNum > readability.maxPages) {
1409             var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
1410
1411             articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
1412             return;
1413         }
1414
1415         /**
1416          * Now that we've built the article page DOM element, get the page content
1417          * asynchronously and load the cleaned content into the div we created for it.
1418         **/
1419         var replaceContent = function(pageUrl, thisPage) {
1420             readability.ajax(pageUrl, {
1421                 success: function(r) {
1422
1423                     /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
1424                     var eTag = r.getResponseHeader('ETag');
1425                     if(eTag) {
1426                         if(eTag in readability.pageETags) {
1427                             dbg("Exact duplicate page found via ETag. Aborting.");
1428                             articlePage.style.display = 'none';
1429                             return;
1430                         } else {
1431                             readability.pageETags[eTag] = 1;
1432                         }
1433                     }
1434
1435                     // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
1436                     var page = document.createElement("DIV");
1437
1438                     /**
1439                      * Do some preprocessing to our HTML to make it ready for appending.
1440                      * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
1441                      * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
1442                      * • Turn all double br's into p's - was handled by prepDocument in the original view.
1443                      *   Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
1444                     **/
1445                     var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
1446                     responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
1447                     responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div');
1448                     responseHtml = responseHtml.replace(readability.regexps.replaceBrs, '</p><p>');
1449                     responseHtml = responseHtml.replace(readability.regexps.replaceFonts, '<$1span>');
1450
1451                     page.innerHTML = responseHtml;
1452
1453                     /**
1454                      * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
1455                     **/
1456                     readability.flags = 0x1 | 0x2 | 0x4;
1457
1458                     var nextPageLink = readability.findNextPageLink(page),
1459                         content      =  readability.grabArticle(page);
1460
1461                     if(!content) {
1462                         dbg("No content found in page to append. Aborting.");
1463                         return;
1464                     }
1465
1466                     /**
1467                      * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
1468                      * Compare it against all of the the previous document's we've gotten. If the previous
1469                      * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
1470                     **/
1471                     var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
1472                     if(firstP && firstP.innerHTML.length > 100) {
1473                         for(var i=1; i <= readability.curPageNum; i+=1) {
1474                             var rPage = document.getElementById('readability-page-' + i);
1475                             if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
1476                                 dbg('Duplicate of page ' + i + ' - skipping.');
1477                                 articlePage.style.display = 'none';
1478                                 readability.parsedPages[pageUrl] = true;
1479                                 return;
1480                             }
1481                         }
1482                     }
1483
1484                     readability.removeScripts(content);
1485
1486                     thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
1487
1488                     /**
1489                      * After the page has rendered, post process the content. This delay is necessary because,
1490                      * in webkit at least, offsetWidth is not set in time to determine image width. We have to
1491                      * wait a little bit for reflow to finish before we can fix floating images.
1492                     **/
1493                     window.setTimeout(
1494                         function() { readability.postProcessContent(thisPage); },
1495                         500
1496                     );
1497
1498                     if(nextPageLink) {
1499                         readability.appendNextPage(nextPageLink);
1500                     }
1501                 }
1502             });
1503         }(nextPageLink, articlePage);
1504     },
1505
1506     /**
1507      * Get an elements class/id weight. Uses regular expressions to tell if this
1508      * element looks good or bad.
1509      *
1510      * @param Element
1511      * @return number (Integer)
1512     **/
1513     getClassWeight: function (e) {
1514         if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
1515             return 0;
1516         }
1517
1518         var weight = 0;
1519
1520         /* Look for a special classname */
1521         if (typeof(e.className) === 'string' && e.className !== '')
1522         {
1523             if(e.className.search(readability.regexps.negative) !== -1) {
1524                 weight -= 25; }
1525
1526             if(e.className.search(readability.regexps.positive) !== -1) {
1527                 weight += 25; }
1528         }
1529
1530         /* Look for a special ID */
1531         if (typeof(e.id) === 'string' && e.id !== '')
1532         {
1533             if(e.id.search(readability.regexps.negative) !== -1) {
1534                 weight -= 25; }
1535
1536             if(e.id.search(readability.regexps.positive) !== -1) {
1537                 weight += 25; }
1538         }
1539
1540         return weight;
1541     },
1542
1543     nodeIsVisible: function (node) {
1544         return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
1545     },
1546
1547     /**
1548      * Remove extraneous break tags from a node.
1549      *
1550      * @param Element
1551      * @return void
1552      **/
1553     killBreaks: function (e) {
1554         try {
1555             e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaks,'<br />');
1556         }
1557         catch (eBreaks) {
1558             dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks);
1559         }
1560     },
1561
1562     /**
1563      * Clean a node of all elements of type "tag".
1564      * (Unless it's a youtube/vimeo video. People love movies.)
1565      *
1566      * @param Element
1567      * @param string tag to clean
1568      * @return void
1569      **/
1570     clean: function (e, tag) {
1571         var targetList = e.getElementsByTagName( tag );
1572         var isEmbed    = (tag === 'object' || tag === 'embed');
1573
1574         for (var y=targetList.length-1; y >= 0; y-=1) {
1575             /* Allow youtube and vimeo videos through as people usually want to see those. */
1576             if(isEmbed) {
1577                 var attributeValues = "";
1578                 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
1579                     attributeValues += targetList[y].attributes[i].value + '|';
1580                 }
1581
1582                 /* First, check the elements attributes to see if any of them contain youtube or vimeo */
1583                 if (attributeValues.search(readability.regexps.videos) !== -1) {
1584                     continue;
1585                 }
1586
1587                 /* Then check the elements inside this element for the same. */
1588                 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
1589                     continue;
1590                 }
1591
1592             }
1593
1594             targetList[y].parentNode.removeChild(targetList[y]);
1595         }
1596     },
1597
1598     /**
1599      * Clean an element of all tags of type "tag" if they look fishy.
1600      * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
1601      *
1602      * @return void
1603      **/
1604     cleanConditionally: function (e, tag) {
1605
1606         if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
1607             return;
1608         }
1609
1610         var tagsList      = e.getElementsByTagName(tag);
1611         var curTagsLength = tagsList.length;
1612
1613         /**
1614          * Gather counts for other typical elements embedded within.
1615          * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
1616          *
1617          * TODO: Consider taking into account original contentScore here.
1618         **/
1619         for (var i=curTagsLength-1; i >= 0; i-=1) {
1620             var weight = readability.getClassWeight(tagsList[i]);
1621             var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
1622
1623             dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
1624
1625             if(weight+contentScore < 0)
1626             {
1627                 tagsList[i].parentNode.removeChild(tagsList[i]);
1628             }
1629             else if ( readability.getCharCount(tagsList[i],',') < 10) {
1630                 /**
1631                  * If there are not very many commas, and the number of
1632                  * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
1633                 **/
1634                 var p      = tagsList[i].getElementsByTagName("p").length;
1635                 var img    = tagsList[i].getElementsByTagName("img").length;
1636                 var li     = tagsList[i].getElementsByTagName("li").length-100;
1637                 var input  = tagsList[i].getElementsByTagName("input").length;
1638
1639                 var embedCount = 0;
1640                 var embeds     = tagsList[i].getElementsByTagName("embed");
1641                 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
1642                     if (embeds[ei].src.search(readability.regexps.videos) === -1) {
1643                       embedCount+=1;
1644                     }
1645                 }
1646
1647                 var linkDensity   = readability.getLinkDensity(tagsList[i]);
1648                 var contentLength = readability.getInnerText(tagsList[i]).length;
1649                 var toRemove      = false;
1650
1651                 if ( img > p ) {
1652                     toRemove = true;
1653                 } else if(li > p && tag !== "ul" && tag !== "ol") {
1654                     toRemove = true;
1655                 } else if( input > Math.floor(p/3) ) {
1656                     toRemove = true;
1657                 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
1658                     toRemove = true;
1659                 } else if(weight < 25 && linkDensity > 0.2) {
1660                     toRemove = true;
1661                 } else if(weight >= 25 && linkDensity > 0.5) {
1662                     toRemove = true;
1663                 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
1664                     toRemove = true;
1665                 }
1666
1667                 if(toRemove) {
1668                     tagsList[i].parentNode.removeChild(tagsList[i]);
1669                 }
1670             }
1671         }
1672     },
1673
1674     /**
1675      * Clean out spurious headers from an Element. Checks things like classnames and link density.
1676      *
1677      * @param Element
1678      * @return void
1679     **/
1680     cleanHeaders: function (e) {
1681         for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) {
1682             var headers = e.getElementsByTagName('h' + headerIndex);
1683             for (var i=headers.length-1; i >=0; i-=1) {
1684                 if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
1685                     headers[i].parentNode.removeChild(headers[i]);
1686                 }
1687             }
1688         }
1689     },
1690
1691     /*** Smooth scrolling logic ***/
1692
1693     /**
1694      * easeInOut animation algorithm - returns an integer that says how far to move at this point in the animation.
1695      * Borrowed from jQuery's easing library.
1696      * @return integer
1697     **/
1698     easeInOut: function(start,end,totalSteps,actualStep) {
1699         var delta = end - start;
1700
1701         if ((actualStep/=totalSteps/2) < 1) {
1702             return delta/2*actualStep*actualStep + start;
1703         }
1704         actualStep -=1;
1705         return -delta/2 * ((actualStep)*(actualStep-2) - 1) + start;
1706     },
1707
1708     /**
1709      * Helper function to, in a cross compatible way, get or set the current scroll offset of the document.
1710      * @return mixed integer on get, the result of window.scrollTo on set
1711     **/
1712     scrollTop: function(scroll){
1713         var setScroll = typeof scroll !== 'undefined';
1714
1715         if(setScroll) {
1716             return window.scrollTo(0, scroll);
1717         }
1718         if(typeof window.pageYOffset !== 'undefined') {
1719             return window.pageYOffset;
1720         }
1721         else if(document.documentElement.clientHeight) {
1722             return document.documentElement.scrollTop;
1723         }
1724         else {
1725             return document.body.scrollTop;
1726         }
1727     },
1728
1729     /**
1730      * scrollTo - Smooth scroll to the point of scrollEnd in the document.
1731      * @return void
1732     **/
1733     curScrollStep: 0,
1734     scrollTo: function (scrollStart, scrollEnd, steps, interval) {
1735         if(
1736             (scrollStart < scrollEnd && readability.scrollTop() < scrollEnd) ||
1737             (scrollStart > scrollEnd && readability.scrollTop() > scrollEnd)
1738           ) {
1739             readability.curScrollStep+=1;
1740             if(readability.curScrollStep > steps) {
1741                 return;
1742             }
1743
1744             var oldScrollTop = readability.scrollTop();
1745
1746             readability.scrollTop(readability.easeInOut(scrollStart, scrollEnd, steps, readability.curScrollStep));
1747
1748             // We're at the end of the window.
1749             if(oldScrollTop === readability.scrollTop()) {
1750                 return;
1751             }
1752
1753             window.setTimeout(function() {
1754                 readability.scrollTo(scrollStart, scrollEnd, steps, interval);
1755             }, interval);
1756         }
1757     },
1758
1759
1760     /**
1761      * Show the email popup.
1762      *
1763      * @return void
1764      **/
1765     emailBox: function () {
1766         var emailContainerExists = document.getElementById('email-container');
1767         if(null !== emailContainerExists)
1768         {
1769             return;
1770         }
1771
1772         var emailContainer = document.createElement("DIV");
1773         emailContainer.setAttribute('id', 'email-container');
1774         emailContainer.innerHTML = '<iframe src="'+readability.emailSrc + '?pageUrl='+encodeURIComponent(window.location)+'&pageTitle='+encodeURIComponent(document.title)+'" scrolling="no" onload="readability.removeFrame()" style="width:500px; height: 490px; border: 0;"></iframe>';
1775
1776         document.body.appendChild(emailContainer);
1777     },
1778
1779     /**
1780      * Close the email popup. This is a hacktackular way to check if we're in a "close loop".
1781      * Since we don't have crossdomain access to the frame, we can only know when it has
1782      * loaded again. If it's loaded over 3 times, we know to close the frame.
1783      *
1784      * @return void
1785      **/
1786     removeFrame: function () {
1787         readability.iframeLoads+=1;
1788         if (readability.iframeLoads > 3)
1789         {
1790             var emailContainer = document.getElementById('email-container');
1791             if (null !== emailContainer) {
1792                 emailContainer.parentNode.removeChild(emailContainer);
1793             }
1794
1795             readability.iframeLoads = 0;
1796         }
1797     },
1798
1799     htmlspecialchars: function (s) {
1800         if (typeof(s) === "string") {
1801             s = s.replace(/&/g, "&amp;");
1802             s = s.replace(/"/g, "&quot;");
1803             s = s.replace(/'/g, "&#039;");
1804             s = s.replace(/</g, "&lt;");
1805             s = s.replace(/>/g, "&gt;");
1806         }
1807
1808         return s;
1809     },
1810
1811     flagIsActive: function(flag) {
1812         return (readability.flags & flag) > 0;
1813     },
1814
1815     addFlag: function(flag) {
1816         readability.flags = readability.flags | flag;
1817     },
1818
1819     removeFlag: function(flag) {
1820         readability.flags = readability.flags & ~flag;
1821     }
1822
1823 };
1824
/* Invoke readability.init() as soon as this script has been evaluated. */
readability.init();