|
|
@@ -1,4 +1,3 @@
|
|
|
-/*eslint-env es6:false*/
|
|
|
/*
|
|
|
* Copyright (c) 2010 Arc90 Inc
|
|
|
*
|
|
|
@@ -54,6 +53,7 @@ function Readability(doc, options) {
|
|
|
return el.innerHTML;
|
|
|
};
|
|
|
this._disableJSONLD = !!options.disableJSONLD;
|
|
|
+ this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos;
|
|
|
|
|
|
// Start with all flags set
|
|
|
this._flags = this.FLAG_STRIP_UNLIKELYS |
|
|
|
@@ -73,12 +73,7 @@ function Readability(doc, options) {
|
|
|
return `<${node.localName} ${attrPairs}>`;
|
|
|
};
|
|
|
this.log = function () {
|
|
|
- if (typeof dump !== "undefined") {
|
|
|
- var msg = Array.prototype.map.call(arguments, function(x) {
|
|
|
- return (x && x.nodeName) ? logNode(x) : x;
|
|
|
- }).join(" ");
|
|
|
- dump("Reader: (Readability) " + msg + "\n");
|
|
|
- } else if (typeof console !== "undefined") {
|
|
|
+ if (typeof console !== "undefined") {
|
|
|
let args = Array.from(arguments, arg => {
|
|
|
if (arg && arg.nodeType == this.ELEMENT_NODE) {
|
|
|
return logNode(arg);
|
|
|
@@ -87,6 +82,12 @@ function Readability(doc, options) {
|
|
|
});
|
|
|
args.unshift("Reader: (Readability)");
|
|
|
console.log.apply(console, args);
|
|
|
+ } else if (typeof dump !== "undefined") {
|
|
|
+ /* global dump */
|
|
|
+ var msg = Array.prototype.map.call(arguments, function(x) {
|
|
|
+ return (x && x.nodeName) ? logNode(x) : x;
|
|
|
+ }).join(" ");
|
|
|
+ dump("Reader: (Readability) " + msg + "\n");
|
|
|
}
|
|
|
};
|
|
|
} else {
|
|
|
@@ -140,6 +141,9 @@ Readability.prototype = {
|
|
|
hashUrl: /^#.+/,
|
|
|
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
|
|
|
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
|
|
|
+ // Commas as used in Latin, Sindhi, Chinese and various other scripts.
|
|
|
+ // see: https://en.wikipedia.org/wiki/Comma#Comma_variants
|
|
|
+ commas: /\u002C|\u060C|\uFE50|\uFE10|\uFE11|\u2E41|\u2E34|\u2E32|\uFF0C/g,
|
|
|
// See: https://schema.org/Article
|
|
|
jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
|
|
|
},
|
|
|
@@ -909,6 +913,12 @@ Readability.prototype = {
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
+ // Users cannot see elements that have both aria-modal="true" and role="dialog".
|
|
|
+ if (node.getAttribute("aria-modal") == "true" && node.getAttribute("role") == "dialog") {
|
|
|
+ node = this._removeAndGetNext(node);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
// Check to see if this node is a byline, and remove it if it is.
|
|
|
if (this._checkByline(node, matchString)) {
|
|
|
node = this._removeAndGetNext(node);
|
|
|
@@ -1023,7 +1033,7 @@ Readability.prototype = {
|
|
|
contentScore += 1;
|
|
|
|
|
|
// Add points for any commas within this paragraph.
|
|
|
- contentScore += innerText.split(",").length;
|
|
|
+ contentScore += innerText.split(this.REGEXPS.commas).length;
|
|
|
|
|
|
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
|
|
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
|
|
|
@@ -1642,12 +1652,7 @@ Readability.prototype = {
|
|
|
* @param Element
|
|
|
**/
|
|
|
_removeScripts: function(doc) {
|
|
|
- this._removeNodes(this._getAllNodesWithTag(doc, ["script"]), function(scriptNode) {
|
|
|
- scriptNode.nodeValue = "";
|
|
|
- scriptNode.removeAttribute("src");
|
|
|
- return true;
|
|
|
- });
|
|
|
- this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"]));
|
|
|
+ this._removeNodes(this._getAllNodesWithTag(doc, ["script", "noscript"]));
|
|
|
},
|
|
|
|
|
|
/**
|
|
|
@@ -1837,13 +1842,13 @@ Readability.prototype = {
|
|
|
if (isEmbed) {
|
|
|
// First, check the elements attributes to see if any of them contain youtube or vimeo
|
|
|
for (var i = 0; i < element.attributes.length; i++) {
|
|
|
- if (this.REGEXPS.videos.test(element.attributes[i].value)) {
|
|
|
+ if (this._allowedVideoRegex.test(element.attributes[i].value)) {
|
|
|
return false;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
// For embed with <object> tag, check inner HTML as well.
|
|
|
- if (element.tagName === "object" && this.REGEXPS.videos.test(element.innerHTML)) {
|
|
|
+ if (element.tagName === "object" && this._allowedVideoRegex.test(element.innerHTML)) {
|
|
|
return false;
|
|
|
}
|
|
|
}
|
|
|
@@ -2112,13 +2117,13 @@ Readability.prototype = {
|
|
|
for (var i = 0; i < embeds.length; i++) {
|
|
|
// If this embed has attribute that matches video regex, don't delete it.
|
|
|
for (var j = 0; j < embeds[i].attributes.length; j++) {
|
|
|
- if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) {
|
|
|
+ if (this._allowedVideoRegex.test(embeds[i].attributes[j].value)) {
|
|
|
return false;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
// For embed with <object> tag, check inner HTML as well.
|
|
|
- if (embeds[i].tagName === "object" && this.REGEXPS.videos.test(embeds[i].innerHTML)) {
|
|
|
+ if (embeds[i].tagName === "object" && this._allowedVideoRegex.test(embeds[i].innerHTML)) {
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
@@ -2145,7 +2150,7 @@ Readability.prototype = {
|
|
|
return haveToRemove;
|
|
|
}
|
|
|
}
|
|
|
- li_count = node.getElementsByTagName("li").length;
|
|
|
+ let li_count = node.getElementsByTagName("li").length;
|
|
|
// Only allow the list to remain if every li contains an image
|
|
|
if (img == li_count) {
|
|
|
return false;
|
|
|
@@ -2294,5 +2299,6 @@ Readability.prototype = {
|
|
|
};
|
|
|
|
|
|
if (typeof module === "object") {
|
|
|
+ /* global module */
|
|
|
module.exports = Readability;
|
|
|
}
|