mirrors
/
SingleFile
réplica de https://github.com/gildas-lormeau/SingleFile


			
				
					
						
						
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858
							/*eslint-env es6:false*/
/*
 * Copyright (c) 2010 Arc90 Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This code is heavily based on Arc90's readability.js (1.7.1) script
 * available at: http://code.google.com/p/arc90labs-readability
 */

/**
 * Public constructor.
 * @param {HTMLDocument} doc     The document to parse.
 * @param {Object}       options The options object.
 */
function Readability(doc, options) {
    // In some older versions, people passed a URI as the first argument. Cope:
    if (options && options.documentElement) {
      doc = options;
      options = arguments[2];
    } else if (!doc || !doc.documentElement) {
      throw new Error("First argument to Readability constructor should be a document object.");
    }
    options = options || {};
  
    this._doc = doc;
    this._articleTitle = null;
    this._articleByline = null;
    this._articleDir = null;
    this._articleSiteName = null;
    this._attempts = [];
  
    // Configurable options
    this._debug = !!options.debug;
    this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
    this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
    this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
    this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
    this._keepClasses = !!options.keepClasses;
  
    // Start with all flags set
    this._flags = this.FLAG_STRIP_UNLIKELYS |
                  this.FLAG_WEIGHT_CLASSES |
                  this.FLAG_CLEAN_CONDITIONALLY;
  
    var logEl;
  
    // Control whether log messages are sent to the console
    if (this._debug) {
      logEl = function(e) {
        var rv = e.nodeName + " ";
        if (e.nodeType == e.TEXT_NODE) {
          return rv + '("' + e.textContent + '")';
        }
        var classDesc = e.className && ("." + e.className.replace(/ /g, "."));
        var elDesc = "";
        if (e.id)
          elDesc = "(#" + e.id + classDesc + ")";
        else if (classDesc)
          elDesc = "(" + classDesc + ")";
        return rv + elDesc;
      };
      this.log = function () {
        if (typeof dump !== "undefined") {
          var msg = Array.prototype.map.call(arguments, function(x) {
            return (x && x.nodeName) ? logEl(x) : x;
          }).join(" ");
          dump("Reader: (Readability) " + msg + "\n");
        } else if (typeof console !== "undefined") {
          var args = ["Reader: (Readability) "].concat(arguments);
          console.log.apply(console, args);
        }
      };
    } else {
      this.log = function () {};
    }
  }
  
  Readability.prototype = {
    FLAG_STRIP_UNLIKELYS: 0x1,
    FLAG_WEIGHT_CLASSES: 0x2,
    FLAG_CLEAN_CONDITIONALLY: 0x4,
  
    // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
    ELEMENT_NODE: 1,
    TEXT_NODE: 3,
  
    // Max number of nodes supported by this parser. Default: 0 (no limit)
    DEFAULT_MAX_ELEMS_TO_PARSE: 0,
  
    // The number of top candidates to consider when analysing how
    // tight the competition is among candidates.
    DEFAULT_N_TOP_CANDIDATES: 5,
  
    // Element tags to score by default.
    DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
  
    // The default number of chars an article must have in order to return a result
    DEFAULT_CHAR_THRESHOLD: 500,
  
    // All of the regular expressions in use within readability.
    // Defined up here so we don't instantiate them repeatedly in loops.
    REGEXPS: {
      // NOTE: These two regular expressions are duplicated in
      // Readability-readerable.js. Please keep both copies in sync.
      unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
      okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
  
      positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
      negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
      extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
      byline: /byline|author|dateline|writtenby|p-author/i,
      replaceFonts: /<(\/?)font[^>]*>/gi,
      normalize: /\s{2,}/g,
      videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
      shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
      nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
      prevLink: /(prev|earl|old|new|<|«)/i,
      whitespace: /^\s*$/,
      hasContent: /\S$/,
    },
  
    DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
  
    ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
  
    PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
  
    DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
  
    // The commented out elements qualify as phrasing content but tend to be
    // removed by readability when put into paragraphs, so we ignore them here.
    PHRASING_ELEMS: [
      // "CANVAS", "IFRAME", "SVG", "VIDEO",
      "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
      "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
      "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
      "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
      "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
    ],
  
    // These are the classes that readability sets itself.
    CLASSES_TO_PRESERVE: [ "page" ],
  
    /**
     * Run any post-process modifications to article content as necessary.
     *
     * @param Element
     * @return void
    **/
    _postProcessContent: function(articleContent) {
      // Readability cannot open relative uris so we convert them to absolute uris.
      this._fixRelativeUris(articleContent);
  
      if (!this._keepClasses) {
        // Remove classes.
        this._cleanClasses(articleContent);
      }
    },
  
    /**
     * Iterates over a NodeList, calls `filterFn` for each node and removes node
     * if function returned `true`.
     *
     * If function is not passed, removes all the nodes in node list.
     *
     * @param NodeList nodeList The nodes to operate on
     * @param Function filterFn the function to use as a filter
     * @return void
     */
    _removeNodes: function(nodeList, filterFn) {
      for (var i = nodeList.length - 1; i >= 0; i--) {
        var node = nodeList[i];
        var parentNode = node.parentNode;
        if (parentNode) {
          if (!filterFn || filterFn.call(this, node, i, nodeList)) {
            parentNode.removeChild(node);
          }
        }
      }
    },
  
    /**
     * Iterates over a NodeList, and calls _setNodeTag for each node.
     *
     * @param NodeList nodeList The nodes to operate on
     * @param String newTagName the new tag name to use
     * @return void
     */
    _replaceNodeTags: function(nodeList, newTagName) {
      for (var i = nodeList.length - 1; i >= 0; i--) {
        var node = nodeList[i];
        this._setNodeTag(node, newTagName);
      }
    },
  
    /**
     * Iterate over a NodeList, which doesn't natively fully implement the Array
     * interface.
     *
     * For convenience, the current object context is applied to the provided
     * iterate function.
     *
     * @param  NodeList nodeList The NodeList.
     * @param  Function fn       The iterate function.
     * @return void
     */
    _forEachNode: function(nodeList, fn) {
      Array.prototype.forEach.call(nodeList, fn, this);
    },
  
    /**
     * Iterate over a NodeList, return true if any of the provided iterate
     * function calls returns true, false otherwise.
     *
     * For convenience, the current object context is applied to the
     * provided iterate function.
     *
     * @param  NodeList nodeList The NodeList.
     * @param  Function fn       The iterate function.
     * @return Boolean
     */
    _someNode: function(nodeList, fn) {
      return Array.prototype.some.call(nodeList, fn, this);
    },
  
    /**
     * Iterate over a NodeList, return true if all of the provided iterate
     * function calls return true, false otherwise.
     *
     * For convenience, the current object context is applied to the
     * provided iterate function.
     *
     * @param  NodeList nodeList The NodeList.
     * @param  Function fn       The iterate function.
     * @return Boolean
     */
    _everyNode: function(nodeList, fn) {
      return Array.prototype.every.call(nodeList, fn, this);
    },
  
    /**
     * Concat all nodelists passed as arguments.
     *
     * @return ...NodeList
     * @return Array
     */
    _concatNodeLists: function() {
      var slice = Array.prototype.slice;
      var args = slice.call(arguments);
      var nodeLists = args.map(function(list) {
        return slice.call(list);
      });
      return Array.prototype.concat.apply([], nodeLists);
    },
  
    _getAllNodesWithTag: function(node, tagNames) {
      if (node.querySelectorAll) {
        return node.querySelectorAll(tagNames.join(","));
      }
      return [].concat.apply([], tagNames.map(function(tag) {
        var collection = node.getElementsByTagName(tag);
        return Array.isArray(collection) ? collection : Array.from(collection);
      }));
    },
  
    /**
     * Removes the class="" attribute from every element in the given
     * subtree, except those that match CLASSES_TO_PRESERVE and
     * the classesToPreserve array from the options object.
     *
     * @param Element
     * @return void
     */
    _cleanClasses: function(node) {
      var classesToPreserve = this._classesToPreserve;
      var className = (node.getAttribute("class") || "")
        .split(/\s+/)
        .filter(function(cls) {
          return classesToPreserve.indexOf(cls) != -1;
        })
        .join(" ");
  
      if (className) {
        node.setAttribute("class", className);
      } else {
        node.removeAttribute("class");
      }
  
      for (node = node.firstElementChild; node; node = node.nextElementSibling) {
        this._cleanClasses(node);
      }
    },
  
    /**
     * Converts each <a> and <img> uri in the given element to an absolute URI,
     * ignoring #ref URIs.
     *
     * @param Element
     * @return void
     */
    _fixRelativeUris: function(articleContent) {
      var baseURI = this._doc.baseURI;
      var documentURI = this._doc.documentURI;
      function toAbsoluteURI(uri) {
        // Leave hash links alone if the base URI matches the document URI:
        if (baseURI == documentURI && uri.charAt(0) == "#") {
          return uri;
        }
        // Otherwise, resolve against base URI:
        try {
          return new URL(uri, baseURI).href;
        } catch (ex) {
          // Something went wrong, just return the original:
        }
        return uri;
      }
  
      var links = this._getAllNodesWithTag(articleContent, ["a"]);
      this._forEachNode(links, function(link) {
        var href = link.getAttribute("href");
        if (href) {
          // Replace links with javascript: URIs with text content, since
          // they won't work after scripts have been removed from the page.
          if (href.indexOf("javascript:") === 0) {
            var text = this._doc.createTextNode(link.textContent);
            link.parentNode.replaceChild(text, link);
          } else {
            link.setAttribute("href", toAbsoluteURI(href));
          }
        }
      });
  
      var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
      this._forEachNode(imgs, function(img) {
        var src = img.getAttribute("src");
        if (src) {
          img.setAttribute("src", toAbsoluteURI(src));
        }
      });
    },
  
    /**
     * Get the article title as an H1.
     *
     * @return void
     **/
    _getArticleTitle: function() {
      var doc = this._doc;
      var curTitle = "";
      var origTitle = "";
  
      try {
        curTitle = origTitle = doc.title.trim();
  
        // If they had an element with id "title" in their HTML
        if (typeof curTitle !== "string")
          curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
      } catch (e) {/* ignore exceptions setting the title. */}
  
      var titleHadHierarchicalSeparators = false;
      function wordCount(str) {
        return str.split(/\s+/).length;
      }
  
      // If there's a separator in the title, first remove the final part
      if ((/ [\|\-\\\/>»] /).test(curTitle)) {
        titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
        curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");
  
        // If the resulting title is too short (3 words or fewer), remove
        // the first part instead:
        if (wordCount(curTitle) < 3)
          curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
      } else if (curTitle.indexOf(": ") !== -1) {
        // Check if we have an heading containing this exact string, so we
        // could assume it's the full title.
        var headings = this._concatNodeLists(
          doc.getElementsByTagName("h1"),
          doc.getElementsByTagName("h2")
        );
        var trimmedTitle = curTitle.trim();
        var match = this._someNode(headings, function(heading) {
          return heading.textContent.trim() === trimmedTitle;
        });
  
        // If we don't, let's extract the title out of the original title string.
        if (!match) {
          curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);
  
          // If the title is now too short, try the first colon instead:
          if (wordCount(curTitle) < 3) {
            curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
            // But if we have too many words before the colon there's something weird
            // with the titles and the H tags so let's just use the original title instead
          } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
            curTitle = origTitle;
          }
        }
      } else if (curTitle.length > 150 || curTitle.length < 15) {
        var hOnes = doc.getElementsByTagName("h1");
  
        if (hOnes.length === 1)
          curTitle = this._getInnerText(hOnes[0]);
      }
  
      curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
      // If we now have 4 words or fewer as our title, and either no
      // 'hierarchical' separators (\, /, > or ») were found in the original
      // title or we decreased the number of words by more than 1 word, use
      // the original title.
      var curTitleWordCount = wordCount(curTitle);
      if (curTitleWordCount <= 4 &&
          (!titleHadHierarchicalSeparators ||
           curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
        curTitle = origTitle;
      }
  
      return curTitle;
    },
  
    /**
     * Prepare the HTML document for readability to scrape it.
     * This includes things like stripping javascript, CSS, and handling terrible markup.
     *
     * @return void
     **/
    _prepDocument: function() {
      var doc = this._doc;
  
      // Remove all style tags in head
      this._removeNodes(doc.getElementsByTagName("style"));
  
      if (doc.body) {
        this._replaceBrs(doc.body);
      }
  
      this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN");
    },
  
    /**
     * Finds the next element, starting from the given node, and ignoring
     * whitespace in between. If the given node is an element, the same node is
     * returned.
     */
    _nextElement: function (node) {
      var next = node;
      while (next
          && (next.nodeType != this.ELEMENT_NODE)
          && this.REGEXPS.whitespace.test(next.textContent)) {
        next = next.nextSibling;
      }
      return next;
    },
  
    /**
     * Replaces 2 or more successive <br> elements with a single <p>.
     * Whitespace between <br> elements are ignored. For example:
     *   <div>foo<br>bar<br> <br><br>abc</div>
     * will become:
     *   <div>foo<br>bar<p>abc</p></div>
     */
    _replaceBrs: function (elem) {
      this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
        var next = br.nextSibling;
  
        // Whether 2 or more <br> elements have been found and replaced with a
        // <p> block.
        var replaced = false;
  
        // If we find a <br> chain, remove the <br>s until we hit another element
        // or non-whitespace. This leaves behind the first <br> in the chain
        // (which will be replaced with a <p> later).
        while ((next = this._nextElement(next)) && (next.tagName == "BR")) {
          replaced = true;
          var brSibling = next.nextSibling;
          next.parentNode.removeChild(next);
          next = brSibling;
        }
  
        // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
        // all sibling nodes as children of the <p> until we hit another <br>
        // chain.
        if (replaced) {
          var p = this._doc.createElement("p");
          br.parentNode.replaceChild(p, br);
  
          next = p.nextSibling;
          while (next) {
            // If we've hit another <br><br>, we're done adding children to this <p>.
            if (next.tagName == "BR") {
              var nextElem = this._nextElement(next.nextSibling);
              if (nextElem && nextElem.tagName == "BR")
                break;
            }
  
            if (!this._isPhrasingContent(next))
              break;
  
            // Otherwise, make this node a child of the new <p>.
            var sibling = next.nextSibling;
            p.appendChild(next);
            next = sibling;
          }
  
          while (p.lastChild && this._isWhitespace(p.lastChild)) {
            p.removeChild(p.lastChild);
          }
  
          if (p.parentNode.tagName === "P")
            this._setNodeTag(p.parentNode, "DIV");
        }
      });
    },
  
    _setNodeTag: function (node, tag) {
      this.log("_setNodeTag", node, tag);
      if (node.__JSDOMParser__) {
        node.localName = tag.toLowerCase();
        node.tagName = tag.toUpperCase();
        return node;
      }
  
      var replacement = node.ownerDocument.createElement(tag);
      while (node.firstChild) {
        replacement.appendChild(node.firstChild);
      }
      node.parentNode.replaceChild(replacement, node);
      if (node.readability)
        replacement.readability = node.readability;
  
      for (var i = 0; i < node.attributes.length; i++) {
        try {
          replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
        } catch (ex) {
          /* it's possible for setAttribute() to throw if the attribute name
           * isn't a valid XML Name. Such attributes can however be parsed from
           * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
           * so we can hit them here and then throw. We don't care about such
           * attributes so we ignore them.
           */
        }
      }
      return replacement;
    },
  
    /**
     * Prepare the article node for display. Clean out any inline styles,
     * iframes, forms, strip extraneous <p> tags, etc.
     *
     * @param Element
     * @return void
     **/
    _prepArticle: function(articleContent) {
      this._cleanStyles(articleContent);
  
      // Check for data tables before we continue, to avoid removing items in
      // those tables, which will often be isolated even though they're
      // visually linked to other content-ful elements (text, images, etc.).
      this._markDataTables(articleContent);
  
      this._fixLazyImages(articleContent);
  
      // Clean out junk from the article content
      this._cleanConditionally(articleContent, "form");
      this._cleanConditionally(articleContent, "fieldset");
      this._clean(articleContent, "object");
      this._clean(articleContent, "embed");
      this._clean(articleContent, "h1");
      this._clean(articleContent, "footer");
      this._clean(articleContent, "link");
      this._clean(articleContent, "aside");
  
      // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
      // which means we don't remove the top candidates even they have "share".
  
      var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
  
      this._forEachNode(articleContent.children, function (topCandidate) {
        this._cleanMatchedNodes(topCandidate, function (node, matchString) {
          return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
        });
      });
  
      // If there is only one h2 and its text content substantially equals article title,
      // they are probably using it as a header and not a subheader,
      // so remove it since we already extract the title separately.
      var h2 = articleContent.getElementsByTagName("h2");
      if (h2.length === 1) {
        var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
        if (Math.abs(lengthSimilarRate) < 0.5) {
          var titlesMatch = false;
          if (lengthSimilarRate > 0) {
            titlesMatch = h2[0].textContent.includes(this._articleTitle);
          } else {
            titlesMatch = this._articleTitle.includes(h2[0].textContent);
          }
          if (titlesMatch) {
            this._clean(articleContent, "h2");
          }
        }
      }
  
      this._clean(articleContent, "iframe");
      this._clean(articleContent, "input");
      this._clean(articleContent, "textarea");
      this._clean(articleContent, "select");
      this._clean(articleContent, "button");
      this._cleanHeaders(articleContent);
  
      // Do these last as the previous stuff may have removed junk
      // that will affect these
      this._cleanConditionally(articleContent, "table");
      this._cleanConditionally(articleContent, "ul");
      this._cleanConditionally(articleContent, "div");
  
      // Remove extra paragraphs
      this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) {
        var imgCount = paragraph.getElementsByTagName("img").length;
        var embedCount = paragraph.getElementsByTagName("embed").length;
        var objectCount = paragraph.getElementsByTagName("object").length;
        // At this point, nasty iframes have been removed, only remain embedded video ones.
        var iframeCount = paragraph.getElementsByTagName("iframe").length;
        var totalCount = imgCount + embedCount + objectCount + iframeCount;
  
        return totalCount === 0 && !this._getInnerText(paragraph, false);
      });
  
      this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
        var next = this._nextElement(br.nextSibling);
        if (next && next.tagName == "P")
          br.parentNode.removeChild(br);
      });
  
      // Remove single-cell tables
      this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
        var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
        if (this._hasSingleTagInsideElement(tbody, "TR")) {
          var row = tbody.firstElementChild;
          if (this._hasSingleTagInsideElement(row, "TD")) {
            var cell = row.firstElementChild;
            cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
            table.parentNode.replaceChild(cell, table);
          }
        }
      });
    },
  
    /**
     * Initialize a node with the readability object. Also checks the
     * className/id for special names to add to its score.
     *
     * @param Element
     * @return void
    **/
    _initializeNode: function(node) {
      node.readability = {"contentScore": 0};
  
      switch (node.tagName) {
        case "DIV":
          node.readability.contentScore += 5;
          break;
  
        case "PRE":
        case "TD":
        case "BLOCKQUOTE":
          node.readability.contentScore += 3;
          break;
  
        case "ADDRESS":
        case "OL":
        case "UL":
        case "DL":
        case "DD":
        case "DT":
        case "LI":
        case "FORM":
          node.readability.contentScore -= 3;
          break;
  
        case "H1":
        case "H2":
        case "H3":
        case "H4":
        case "H5":
        case "H6":
        case "TH":
          node.readability.contentScore -= 5;
          break;
      }
  
      node.readability.contentScore += this._getClassWeight(node);
    },
  
    _removeAndGetNext: function(node) {
      var nextNode = this._getNextNode(node, true);
      node.parentNode.removeChild(node);
      return nextNode;
    },
  
    /**
     * Traverse the DOM from node to node, starting at the node passed in.
     * Pass true for the second parameter to indicate this node itself
     * (and its kids) are going away, and we want the next node over.
     *
     * Calling this in a loop will traverse the DOM depth-first.
     */
    _getNextNode: function(node, ignoreSelfAndKids) {
      // First check for kids if those aren't being ignored
      if (!ignoreSelfAndKids && node.firstElementChild) {
        return node.firstElementChild;
      }
      // Then for siblings...
      if (node.nextElementSibling) {
        return node.nextElementSibling;
      }
      // And finally, move up the parent chain *and* find a sibling
      // (because this is depth-first traversal, we will have already
      // seen the parent nodes themselves).
      do {
        node = node.parentNode;
      } while (node && !node.nextElementSibling);
      return node && node.nextElementSibling;
    },
  
    _checkByline: function(node, matchString) {
      if (this._articleByline) {
        return false;
      }
  
      if (node.getAttribute !== undefined) {
        var rel = node.getAttribute("rel");
        var itemprop = node.getAttribute("itemprop");
      }
  
      if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
        this._articleByline = node.textContent.trim();
        return true;
      }
  
      return false;
    },
  
    _getNodeAncestors: function(node, maxDepth) {
      maxDepth = maxDepth || 0;
      var i = 0, ancestors = [];
      while (node.parentNode) {
        ancestors.push(node.parentNode);
        if (maxDepth && ++i === maxDepth)
          break;
        node = node.parentNode;
      }
      return ancestors;
    },
  
    /***
     * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
     *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
     *
     * @param page a document to run upon. Needs to be a full document, complete with body.
     * @return Element
    **/
    _grabArticle: function (page) {
      this.log("**** grabArticle ****");
      var doc = this._doc;
      var isPaging = (page !== null ? true: false);
      page = page ? page : this._doc.body;
  
      // We can't grab an article if we don't have a page!
      if (!page) {
        this.log("No body found in document. Abort.");
        return null;
      }
  
      var pageCacheHtml = page.innerHTML;
  
      while (true) {
        var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
  
        // First, node prepping. Trash nodes that look cruddy (like ones with the
        // class name "comment", etc), and turn divs into P tags where they have been
        // used inappropriately (as in, where they contain no other block level elements.)
        var elementsToScore = [];
        var node = this._doc.documentElement;
  
        while (node) {
          var matchString = node.className + " " + node.id;
  
          if (!this._isProbablyVisible(node)) {
            this.log("Removing hidden node - " + matchString);
            node = this._removeAndGetNext(node);
            continue;
          }
  
          // Check to see if this node is a byline, and remove it if it is.
          if (this._checkByline(node, matchString)) {
            node = this._removeAndGetNext(node);
            continue;
          }
  
          // Remove unlikely candidates
          if (stripUnlikelyCandidates) {
            if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
                !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
                !this._hasAncestorTag(node, "table") &&
                node.tagName !== "BODY" &&
                node.tagName !== "A") {
              this.log("Removing unlikely candidate - " + matchString);
              node = this._removeAndGetNext(node);
              continue;
            }
          }
  
          // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
          if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
               node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
               node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
              this._isElementWithoutContent(node)) {
            node = this._removeAndGetNext(node);
            continue;
          }
  
          if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
            elementsToScore.push(node);
          }
  
          // Turn all divs that don't have children block level elements into p's
          if (node.tagName === "DIV") {
            // Put phrasing content into paragraphs.
            var p = null;
            var childNode = node.firstChild;
            while (childNode) {
              var nextSibling = childNode.nextSibling;
              if (this._isPhrasingContent(childNode)) {
                if (p !== null) {
                  p.appendChild(childNode);
                } else if (!this._isWhitespace(childNode)) {
                  p = doc.createElement("p");
                  node.replaceChild(p, childNode);
                  p.appendChild(childNode);
                }
              } else if (p !== null) {
                while (p.lastChild && this._isWhitespace(p.lastChild)) {
                  p.removeChild(p.lastChild);
                }
                p = null;
              }
              childNode = nextSibling;
            }
  
            // Sites like http://mobile.slate.com encloses each paragraph with a DIV
            // element. DIVs with only a P element inside and no text content can be
            // safely converted into plain P elements to avoid confusing the scoring
            // algorithm with DIVs with are, in practice, paragraphs.
            if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
              var newNode = node.children[0];
              node.parentNode.replaceChild(newNode, node);
              node = newNode;
              elementsToScore.push(node);
            } else if (!this._hasChildBlockElement(node)) {
              node = this._setNodeTag(node, "P");
              elementsToScore.push(node);
            }
          }
          node = this._getNextNode(node);
        }
  
        /**
         * Loop through all paragraphs, and assign a score to them based on how content-y they look.
         * Then add their score to their parent node.
         *
         * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
        **/
        var candidates = [];
        this._forEachNode(elementsToScore, function(elementToScore) {
          if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
            return;
  
          // If this paragraph is less than 25 characters, don't even count it.
          var innerText = this._getInnerText(elementToScore);
          if (innerText.length < 25)
            return;
  
          // Exclude nodes with no ancestor.
          var ancestors = this._getNodeAncestors(elementToScore, 3);
          if (ancestors.length === 0)
            return;
  
          var contentScore = 0;
  
          // Add a point for the paragraph itself as a base.
          contentScore += 1;
  
          // Add points for any commas within this paragraph.
          contentScore += innerText.split(",").length;
  
          // For every 100 characters in this paragraph, add another point. Up to 3 points.
          contentScore += Math.min(Math.floor(innerText.length / 100), 3);
  
          // Initialize and score ancestors.
          this._forEachNode(ancestors, function(ancestor, level) {
            if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
              return;
  
            if (typeof(ancestor.readability) === "undefined") {
              this._initializeNode(ancestor);
              candidates.push(ancestor);
            }
  
            // Node score divider:
            // - parent:             1 (no division)
            // - grandparent:        2
            // - great grandparent+: ancestor level * 3
            if (level === 0)
              var scoreDivider = 1;
            else if (level === 1)
              scoreDivider = 2;
            else
              scoreDivider = level * 3;
            ancestor.readability.contentScore += contentScore / scoreDivider;
          });
        });
  
        // After we've calculated scores, loop through all of the possible
        // candidate nodes we found and find the one with the highest score.
        var topCandidates = [];
        for (var c = 0, cl = candidates.length; c < cl; c += 1) {
          var candidate = candidates[c];
  
          // Scale the final candidates score based on link density. Good content
          // should have a relatively small link density (5% or less) and be mostly
          // unaffected by this operation.
          var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
          candidate.readability.contentScore = candidateScore;
  
          this.log("Candidate:", candidate, "with score " + candidateScore);
  
          for (var t = 0; t < this._nbTopCandidates; t++) {
            var aTopCandidate = topCandidates[t];
  
            if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
              topCandidates.splice(t, 0, candidate);
              if (topCandidates.length > this._nbTopCandidates)
                topCandidates.pop();
              break;
            }
          }
        }
  
        var topCandidate = topCandidates[0] || null;
        var neededToCreateTopCandidate = false;
        var parentOfTopCandidate;
  
        // If we still have no top candidate, just use the body as a last resort.
        // We also have to copy the body node so it is something we can modify.
        if (topCandidate === null || topCandidate.tagName === "BODY") {
          // Move all of the page's children into topCandidate
          topCandidate = doc.createElement("DIV");
          neededToCreateTopCandidate = true;
          // Move everything (not just elements, also text nodes etc.) into the container
          // so we even include text directly in the body:
          var kids = page.childNodes;
          while (kids.length) {
            this.log("Moving child out:", kids[0]);
            topCandidate.appendChild(kids[0]);
          }
  
          page.appendChild(topCandidate);
  
          this._initializeNode(topCandidate);
        } else if (topCandidate) {
          // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
          // and whose scores are quite closed with current `topCandidate` node.
          var alternativeCandidateAncestors = [];
          for (var i = 1; i < topCandidates.length; i++) {
            if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) {
              alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i]));
            }
          }
          var MINIMUM_TOPCANDIDATES = 3;
          if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
            parentOfTopCandidate = topCandidate.parentNode;
            while (parentOfTopCandidate.tagName !== "BODY") {
              var listsContainingThisAncestor = 0;
              for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) {
                listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate));
              }
              if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
                topCandidate = parentOfTopCandidate;
                break;
              }
              parentOfTopCandidate = parentOfTopCandidate.parentNode;
            }
          }
          if (!topCandidate.readability) {
            this._initializeNode(topCandidate);
          }
  
          // Because of our bonus system, parents of candidates might have scores
          // themselves. They get half of the node. There won't be nodes with higher
          // scores than our topCandidate, but if we see the score going *up* in the first
          // few steps up the tree, that's a decent sign that there might be more content
          // lurking in other places that we want to unify in. The sibling stuff
          // below does some of that - but only if we've looked high enough up the DOM
          // tree.
          parentOfTopCandidate = topCandidate.parentNode;
          var lastScore = topCandidate.readability.contentScore;
          // The scores shouldn't get too low.
          var scoreThreshold = lastScore / 3;
          while (parentOfTopCandidate.tagName !== "BODY") {
            if (!parentOfTopCandidate.readability) {
              parentOfTopCandidate = parentOfTopCandidate.parentNode;
              continue;
            }
            var parentScore = parentOfTopCandidate.readability.contentScore;
            if (parentScore < scoreThreshold)
              break;
            if (parentScore > lastScore) {
              // Alright! We found a better parent to use.
              topCandidate = parentOfTopCandidate;
              break;
            }
            lastScore = parentOfTopCandidate.readability.contentScore;
            parentOfTopCandidate = parentOfTopCandidate.parentNode;
          }
  
          // If the top candidate is the only child, use parent instead. This will help sibling
          // joining logic when adjacent content is actually located in parent's sibling node.
          parentOfTopCandidate = topCandidate.parentNode;
          while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
            topCandidate = parentOfTopCandidate;
            parentOfTopCandidate = topCandidate.parentNode;
          }
          if (!topCandidate.readability) {
            this._initializeNode(topCandidate);
          }
        }
  
        // Now that we have the top candidate, look through its siblings for content
        // that might also be related. Things like preambles, content split by ads
        // that we removed, etc.
        var articleContent = doc.createElement("DIV");
        if (isPaging)
          articleContent.id = "readability-content";
  
        var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
        // Keep potential top candidate's parent node to try to get text direction of it later.
        parentOfTopCandidate = topCandidate.parentNode;
        var siblings = parentOfTopCandidate.children;
  
        for (var s = 0, sl = siblings.length; s < sl; s++) {
          var sibling = siblings[s];
          var append = false;
  
          this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
          this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
  
          if (sibling === topCandidate) {
            append = true;
          } else {
            var contentBonus = 0;
  
            // Give a bonus if sibling nodes and top candidates have the example same classname
            if (sibling.className === topCandidate.className && topCandidate.className !== "")
              contentBonus += topCandidate.readability.contentScore * 0.2;
  
            if (sibling.readability &&
                ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
              append = true;
            } else if (sibling.nodeName === "P") {
              var linkDensity = this._getLinkDensity(sibling);
              var nodeContent = this._getInnerText(sibling);
              var nodeLength = nodeContent.length;
  
              if (nodeLength > 80 && linkDensity < 0.25) {
                append = true;
              } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 &&
                         nodeContent.search(/\.( |$)/) !== -1) {
                append = true;
              }
            }
          }
  
          if (append) {
            this.log("Appending node:", sibling);
  
            if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
              // We have a node that isn't a common block level element, like a form or td tag.
              // Turn it into a div so it doesn't get filtered out later by accident.
              this.log("Altering sibling:", sibling, "to div.");
  
              sibling = this._setNodeTag(sibling, "DIV");
            }
  
            articleContent.appendChild(sibling);
            // siblings is a reference to the children array, and
            // sibling is removed from the array when we call appendChild().
            // As a result, we must revisit this index since the nodes
            // have been shifted.
            s -= 1;
            sl -= 1;
          }
        }
  
        if (this._debug)
          this.log("Article content pre-prep: " + articleContent.innerHTML);
        // So we have all of the content that we need. Now we clean it up for presentation.
        this._prepArticle(articleContent);
        if (this._debug)
          this.log("Article content post-prep: " + articleContent.innerHTML);
  
        if (neededToCreateTopCandidate) {
          // We already created a fake div thing, and there wouldn't have been any siblings left
          // for the previous loop, so there's no point trying to create a new div, and then
          // move all the children over. Just assign IDs and class names here. No need to append
          // because that already happened anyway.
          topCandidate.id = "readability-page-1";
          topCandidate.className = "page";
        } else {
          var div = doc.createElement("DIV");
          div.id = "readability-page-1";
          div.className = "page";
          var children = articleContent.childNodes;
          while (children.length) {
            div.appendChild(children[0]);
          }
          articleContent.appendChild(div);
        }
  
        if (this._debug)
          this.log("Article content after paging: " + articleContent.innerHTML);
  
        var parseSuccessful = true;
  
        // Now that we've gone through the full algorithm, check to see if
        // we got any meaningful content. If we didn't, we may need to re-run
        // grabArticle with different flags set. This gives us a higher likelihood of
        // finding the content, and the sieve approach gives us a higher likelihood of
        // finding the -right- content.
        var textLength = this._getInnerText(articleContent, true).length;
        if (textLength < this._charThreshold) {
          parseSuccessful = false;
          page.innerHTML = pageCacheHtml;
  
          if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
            this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
            this._attempts.push({articleContent: articleContent, textLength: textLength});
          } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
            this._removeFlag(this.FLAG_WEIGHT_CLASSES);
            this._attempts.push({articleContent: articleContent, textLength: textLength});
          } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
            this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
            this._attempts.push({articleContent: articleContent, textLength: textLength});
          } else {
            this._attempts.push({articleContent: articleContent, textLength: textLength});
            // No luck after removing flags, just return the longest text we found during the different loops
            this._attempts.sort(function (a, b) {
              return b.textLength - a.textLength;
            });
  
            // But first check if we actually have something
            if (!this._attempts[0].textLength) {
              return null;
            }
  
            articleContent = this._attempts[0].articleContent;
            parseSuccessful = true;
          }
        }
  
        if (parseSuccessful) {
          // Find out text direction from ancestors of final top candidate.
          var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
          this._someNode(ancestors, function(ancestor) {
            if (!ancestor.tagName)
              return false;
            var articleDir = ancestor.getAttribute("dir");
            if (articleDir) {
              this._articleDir = articleDir;
              return true;
            }
            return false;
          });
          return articleContent;
        }
      }
    },
  
    /**
     * Check whether the input string could be a byline.
     * This verifies that the input is a string, and that the length
     * is less than 100 chars.
     *
     * @param possibleByline {string} - a string to check whether its a byline.
     * @return Boolean - whether the input string is a byline.
     */
    _isValidByline: function(byline) {
      if (typeof byline == "string" || byline instanceof String) {
        byline = byline.trim();
        return (byline.length > 0) && (byline.length < 100);
      }
      return false;
    },
  
    /**
     * Attempts to get excerpt and byline metadata for the article.
     *
     * @return Object with optional "excerpt" and "byline" properties
     */
    _getArticleMetadata: function() {
      var metadata = {};
      var values = {};
      var metaElements = this._doc.getElementsByTagName("meta");
  
      // property is a space-separated list of values
      var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi;
  
      // name is a single value
      var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;
  
      // Find description tags.
      this._forEachNode(metaElements, function(element) {
        var elementName = element.getAttribute("name");
        var elementProperty = element.getAttribute("property");
        var content = element.getAttribute("content");
        if (!content) {
          return;
        }
        var matches = null;
        var name = null;
  
        if (elementProperty) {
          matches = elementProperty.match(propertyPattern);
          if (matches) {
            for (var i = matches.length - 1; i >= 0; i--) {
              // Convert to lowercase, and remove any whitespace
              // so we can match below.
              name = matches[i].toLowerCase().replace(/\s/g, "");
              // multiple authors
              values[name] = content.trim();
            }
          }
        }
        if (!matches && elementName && namePattern.test(elementName)) {
          name = elementName;
          if (content) {
            // Convert to lowercase, remove any whitespace, and convert dots
            // to colons so we can match below.
            name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
            values[name] = content.trim();
          }
        }
      });
  
      // get title
      metadata.title = values["dc:title"] ||
                       values["dcterm:title"] ||
                       values["og:title"] ||
                       values["weibo:article:title"] ||
                       values["weibo:webpage:title"] ||
                       values["title"] ||
                       values["twitter:title"];
  
      if (!metadata.title) {
        metadata.title = this._getArticleTitle();
      }
  
      // get author
      metadata.byline = values["dc:creator"] ||
                        values["dcterm:creator"] ||
                        values["author"];
  
      // get description
      metadata.excerpt = values["dc:description"] ||
                         values["dcterm:description"] ||
                         values["og:description"] ||
                         values["weibo:article:description"] ||
                         values["weibo:webpage:description"] ||
                         values["description"] ||
                         values["twitter:description"];
  
      // get site name
      metadata.siteName = values["og:site_name"];
  
      return metadata;
    },
  
    /**
     * Removes script tags from the document.
     *
     * @param Element
    **/
    _removeScripts: function(doc) {
      this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) {
        scriptNode.nodeValue = "";
        scriptNode.removeAttribute("src");
        return true;
      });
      this._removeNodes(doc.getElementsByTagName("noscript"));
    },
  
    /**
     * Check if this node has only whitespace and a single element with given tag
     * Returns false if the DIV node contains non-empty text nodes
     * or if it contains no element with given tag or more than 1 element.
     *
     * @param Element
     * @param string tag of child element
    **/
    _hasSingleTagInsideElement: function(element, tag) {
      // There should be exactly 1 element child with given tag
      if (element.children.length != 1 || element.children[0].tagName !== tag) {
        return false;
      }
  
      // And there should be no text nodes with real content
      return !this._someNode(element.childNodes, function(node) {
        return node.nodeType === this.TEXT_NODE &&
               this.REGEXPS.hasContent.test(node.textContent);
      });
    },
  
    _isElementWithoutContent: function(node) {
      return node.nodeType === this.ELEMENT_NODE &&
        node.textContent.trim().length == 0 &&
        (node.children.length == 0 ||
         node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
    },
  
    /**
     * Determine whether element has any children block level elements.
     *
     * @param Element
     */
    _hasChildBlockElement: function (element) {
      return this._someNode(element.childNodes, function(node) {
        return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
               this._hasChildBlockElement(node);
      });
    },
  
    /***
     * Determine if a node qualifies as phrasing content.
     * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
    **/
    _isPhrasingContent: function(node) {
      return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
        ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
          this._everyNode(node.childNodes, this._isPhrasingContent));
    },
  
    _isWhitespace: function(node) {
      return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
             (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
    },
  
    /**
     * Get the inner text of a node - cross browser compatibly.
     * This also strips out any excess whitespace to be found.
     *
     * @param Element
     * @param Boolean normalizeSpaces (default: true)
     * @return string
    **/
    _getInnerText: function(e, normalizeSpaces) {
      normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces;
      var textContent = e.textContent.trim();
  
      if (normalizeSpaces) {
        return textContent.replace(this.REGEXPS.normalize, " ");
      }
      return textContent;
    },
  
    /**
     * Get the number of times a string s appears in the node e.
     *
     * @param Element
     * @param string - what to split on. Default is ","
     * @return number (integer)
    **/
    _getCharCount: function(e, s) {
      s = s || ",";
      return this._getInnerText(e).split(s).length - 1;
    },
  
    /**
     * Remove the style attribute on every e and under.
     * TODO: Test if getElementsByTagName(*) is faster.
     *
     * @param Element
     * @return void
    **/
    _cleanStyles: function(e) {
      if (!e || e.tagName.toLowerCase() === "svg")
        return;
  
      // Remove `style` and deprecated presentational attributes
      for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
        e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
      }
  
      if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
        e.removeAttribute("width");
        e.removeAttribute("height");
      }
  
      var cur = e.firstElementChild;
      while (cur !== null) {
        this._cleanStyles(cur);
        cur = cur.nextElementSibling;
      }
    },
  
    /**
     * Get the density of links as a percentage of the content
     * This is the amount of text that is inside a link divided by the total text in the node.
     *
     * @param Element
     * @return number (float)
    **/
    _getLinkDensity: function(element) {
      var textLength = this._getInnerText(element).length;
      if (textLength === 0)
        return 0;
  
      var linkLength = 0;
  
      // XXX implement _reduceNodeList?
      this._forEachNode(element.getElementsByTagName("a"), function(linkNode) {
        linkLength += this._getInnerText(linkNode).length;
      });
  
      return linkLength / textLength;
    },
  
    /**
     * Get an elements class/id weight. Uses regular expressions to tell if this
     * element looks good or bad.
     *
     * @param Element
     * @return number (Integer)
    **/
    _getClassWeight: function(e) {
      if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES))
        return 0;
  
      var weight = 0;
  
      // Look for a special classname
      if (typeof(e.className) === "string" && e.className !== "") {
        if (this.REGEXPS.negative.test(e.className))
          weight -= 25;
  
        if (this.REGEXPS.positive.test(e.className))
          weight += 25;
      }
  
      // Look for a special ID
      if (typeof(e.id) === "string" && e.id !== "") {
        if (this.REGEXPS.negative.test(e.id))
          weight -= 25;
  
        if (this.REGEXPS.positive.test(e.id))
          weight += 25;
      }
  
      return weight;
    },
  
    /**
     * Clean a node of all elements of type "tag".
     * (Unless it's a youtube/vimeo video. People love movies.)
     *
     * @param Element
     * @param string tag to clean
     * @return void
     **/
    _clean: function(e, tag) {
      var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
  
      this._removeNodes(e.getElementsByTagName(tag), function(element) {
        // Allow youtube and vimeo videos through as people usually want to see those.
        if (isEmbed) {
          // First, check the elements attributes to see if any of them contain youtube or vimeo
          for (var i = 0; i < element.attributes.length; i++) {
            if (this.REGEXPS.videos.test(element.attributes[i].value)) {
              return false;
            }
          }
  
          // For embed with <object> tag, check inner HTML as well.
          if (element.tagName === "object" && this.REGEXPS.videos.test(element.innerHTML)) {
            return false;
          }
        }
  
        return true;
      });
    },
  
    /**
     * Check if a given node has one of its ancestor tag name matching the
     * provided one.
     * @param  HTMLElement node
     * @param  String      tagName
     * @param  Number      maxDepth
     * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
     * @return Boolean
     */
    _hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
      maxDepth = maxDepth || 3;
      tagName = tagName.toUpperCase();
      var depth = 0;
      while (node.parentNode) {
        if (maxDepth > 0 && depth > maxDepth)
          return false;
        if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
          return true;
        node = node.parentNode;
        depth++;
      }
      return false;
    },
  
    /**
     * Return an object indicating how many rows and columns this table has.
     */
    _getRowAndColumnCount: function(table) {
      var rows = 0;
      var columns = 0;
      var trs = table.getElementsByTagName("tr");
      for (var i = 0; i < trs.length; i++) {
        var rowspan = trs[i].getAttribute("rowspan") || 0;
        if (rowspan) {
          rowspan = parseInt(rowspan, 10);
        }
        rows += (rowspan || 1);
  
        // Now look for column-related info
        var columnsInThisRow = 0;
        var cells = trs[i].getElementsByTagName("td");
        for (var j = 0; j < cells.length; j++) {
          var colspan = cells[j].getAttribute("colspan") || 0;
          if (colspan) {
            colspan = parseInt(colspan, 10);
          }
          columnsInThisRow += (colspan || 1);
        }
        columns = Math.max(columns, columnsInThisRow);
      }
      return {rows: rows, columns: columns};
    },
  
    /**
     * Look for 'data' (as opposed to 'layout') tables, for which we use
     * similar checks as
     * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
     */
    _markDataTables: function(root) {
      var tables = root.getElementsByTagName("table");
      for (var i = 0; i < tables.length; i++) {
        var table = tables[i];
        var role = table.getAttribute("role");
        if (role == "presentation") {
          table._readabilityDataTable = false;
          continue;
        }
        var datatable = table.getAttribute("datatable");
        if (datatable == "0") {
          table._readabilityDataTable = false;
          continue;
        }
        var summary = table.getAttribute("summary");
        if (summary) {
          table._readabilityDataTable = true;
          continue;
        }
  
        var caption = table.getElementsByTagName("caption")[0];
        if (caption && caption.childNodes.length > 0) {
          table._readabilityDataTable = true;
          continue;
        }
  
        // If the table has a descendant with any of these tags, consider a data table:
        var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
        var descendantExists = function(tag) {
          return !!table.getElementsByTagName(tag)[0];
        };
        if (dataTableDescendants.some(descendantExists)) {
          this.log("Data table because found data-y descendant");
          table._readabilityDataTable = true;
          continue;
        }
  
        // Nested tables indicate a layout table:
        if (table.getElementsByTagName("table")[0]) {
          table._readabilityDataTable = false;
          continue;
        }
  
        var sizeInfo = this._getRowAndColumnCount(table);
        if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
          table._readabilityDataTable = true;
          continue;
        }
        // Now just go by size entirely:
        table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
      }
    },
  
    /* convert images and figures that have properties like data-src into images that can be loaded without JS */
    _fixLazyImages: function (root) {
      this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
        // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
        if ((!elem.src && (!elem.srcset || elem.srcset == "null")) || elem.className.toLowerCase().indexOf("lazy") !== -1) {
          for (var i = 0; i < elem.attributes.length; i++) {
            var attr = elem.attributes[i];
            if (attr.name === "src" || attr.name === "srcset") {
              continue;
            }
            var copyTo = null;
            if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
              copyTo = "srcset";
            } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
              copyTo = "src";
            }
            if (copyTo) {
              //if this is an img or picture, set the attribute directly
              if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
                elem.setAttribute(copyTo, attr.value);
              } else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
                //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
                //see the nytimes-3 testcase for an example
                var img = this._doc.createElement("img");
                img.setAttribute(copyTo, attr.value);
                elem.appendChild(img);
              }
            }
          }
        }
      });
    },
  
    /**
     * Clean an element of all tags of type "tag" if they look fishy.
     * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
     *
     * @return void
     **/
    _cleanConditionally: function(e, tag) {
      if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
        return;
  
      var isList = tag === "ul" || tag === "ol";
  
      // Gather counts for other typical elements embedded within.
      // Traverse backwards so we can remove nodes at the same time
      // without effecting the traversal.
      //
      // TODO: Consider taking into account original contentScore here.
      this._removeNodes(e.getElementsByTagName(tag), function(node) {
        // First check if this node IS data table, in which case don't remove it.
        var isDataTable = function(t) {
          return t._readabilityDataTable;
        };
  
        if (tag === "table" && isDataTable(node)) {
          return false;
        }
  
        // Next check if we're inside a data table, in which case don't remove it as well.
        if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
          return false;
        }
  
        var weight = this._getClassWeight(node);
        var contentScore = 0;
  
        this.log("Cleaning Conditionally", node);
  
        if (weight + contentScore < 0) {
          return true;
        }
  
        if (this._getCharCount(node, ",") < 10) {
          // If there are not very many commas, and the number of
          // non-paragraph elements is more than paragraphs or other
          // ominous signs, remove the element.
          var p = node.getElementsByTagName("p").length;
          var img = node.getElementsByTagName("img").length;
          var li = node.getElementsByTagName("li").length - 100;
          var input = node.getElementsByTagName("input").length;
  
          var embedCount = 0;
          var embeds = this._concatNodeLists(
            node.getElementsByTagName("object"),
            node.getElementsByTagName("embed"),
            node.getElementsByTagName("iframe"));
  
          for (var i = 0; i < embeds.length; i++) {
            // If this embed has attribute that matches video regex, don't delete it.
            for (var j = 0; j < embeds[i].attributes.length; j++) {
              if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) {
                return false;
              }
            }
  
            // For embed with <object> tag, check inner HTML as well.
            if (embeds[i].tagName === "object" && this.REGEXPS.videos.test(embeds[i].innerHTML)) {
              return false;
            }
  
            embedCount++;
          }
  
          var linkDensity = this._getLinkDensity(node);
          var contentLength = this._getInnerText(node).length;
  
          var haveToRemove =
            (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
            (!isList && li > p) ||
            (input > Math.floor(p/3)) ||
            (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
            (!isList && weight < 25 && linkDensity > 0.2) ||
            (weight >= 25 && linkDensity > 0.5) ||
            ((embedCount === 1 && contentLength < 75) || embedCount > 1);
          return haveToRemove;
        }
        return false;
      });
    },
  
    /**
     * Clean out elements that match the specified conditions
     *
     * @param Element
     * @param Function determines whether a node should be removed
     * @return void
     **/
    _cleanMatchedNodes: function(e, filter) {
      var endOfSearchMarkerNode = this._getNextNode(e, true);
      var next = this._getNextNode(e);
      while (next && next != endOfSearchMarkerNode) {
        if (filter.call(this, next, next.className + " " + next.id)) {
          next = this._removeAndGetNext(next);
        } else {
          next = this._getNextNode(next);
        }
      }
    },
  
    /**
     * Clean out spurious headers from an Element. Checks things like classnames and link density.
     *
     * @param Element
     * @return void
    **/
    _cleanHeaders: function(e) {
      for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
        this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) {
          return this._getClassWeight(header) < 0;
        });
      }
    },
  
    _flagIsActive: function(flag) {
      return (this._flags & flag) > 0;
    },
  
    _removeFlag: function(flag) {
      this._flags = this._flags & ~flag;
    },
  
    _isProbablyVisible: function(node) {
      return (!node.style || node.style.display != "none")
        && !node.hasAttribute("hidden")
        && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true");
    },
  
    /**
     * Runs readability.
     *
     * Workflow:
     *  1. Prep the document by removing script tags, css, etc.
     *  2. Build readability's DOM tree.
     *  3. Grab the article content from the current dom tree.
     *  4. Replace the current DOM tree with the new one.
     *  5. Read peacefully.
     *
     * @return void
     **/
    parse: function () {
      // Avoid parsing too large documents, as per configuration option
      if (this._maxElemsToParse > 0) {
        var numTags = this._doc.getElementsByTagName("*").length;
        if (numTags > this._maxElemsToParse) {
          throw new Error("Aborting parsing document; " + numTags + " elements found");
        }
      }
  
      // Remove script tags from the document.
      this._removeScripts(this._doc);
  
      this._prepDocument();
  
      var metadata = this._getArticleMetadata();
      this._articleTitle = metadata.title;
  
      var articleContent = this._grabArticle();
      if (!articleContent)
        return null;
  
      this.log("Grabbed: " + articleContent.innerHTML);
  
      this._postProcessContent(articleContent);
  
      // If we haven't found an excerpt in the article's metadata, use the article's
      // first paragraph as the excerpt. This is used for displaying a preview of
      // the article's content.
      if (!metadata.excerpt) {
        var paragraphs = articleContent.getElementsByTagName("p");
        if (paragraphs.length > 0) {
          metadata.excerpt = paragraphs[0].textContent.trim();
        }
      }
  
      var textContent = articleContent.textContent;
      return {
        title: this._articleTitle,
        byline: metadata.byline || this._articleByline,
        dir: this._articleDir,
        content: articleContent.innerHTML,
        textContent: textContent,
        length: textContent.length,
        excerpt: metadata.excerpt,
        siteName: metadata.siteName || this._articleSiteName
      };
    }
  };
  
  if (typeof module === "object") {
    module.exports = Readability;
  }