docprocessor.js 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
/*
 * Copyright 2011 Gildas Lormeau
 * contact : gildas.lormeau <at> gmail.com
 *
 * This file is part of SingleFile Core.
 *
 * SingleFile Core is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * SingleFile Core is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with SingleFile Core. If not, see <http://www.gnu.org/licenses/>.
 */
  20. (function() {
  // Matches one `url(...)` token or one `@import ...` rule and captures the
  // referenced address: group 2 for the `url(...)` form, group 4 for the
  // `@import` form.
  var IMPORT_URL_VALUE_EXP = /(url\s*\(\s*(?:'|")?\s*([^('|"|\))]*)\s*(?:'|")?\s*\))|(@import\s*\(?\s*(?:'|")?\s*([^('|"|\))]*)\s*(?:'|")?\s*(?:\)|;))/i;
  // Captures (group 1) the address inside a single `url(...)` token, without
  // the surrounding quotes.
  var URL_VALUE_EXP = /url\s*\(\s*(?:'|")?\s*([^('|"|\))]*)\s*(?:'|")?\s*\)/i;
  // Captures (group 1) the address of an `@import` rule written without `url(`.
  var IMPORT_VALUE_ALT_EXP = /@import\s*\(?\s*(?:'|")?\s*([^('|"|\))]*)\s*(?:'|")?\s*(?:\)|;)/i;
  // Finds every `url(...)` token in a piece of CSS (global, case-insensitive).
  var URL_EXP = /url\s*\(([^\)]*)\)/gi;
  // Finds every `@import` rule, in either the `url(...)` or the quoted form.
  var IMPORT_EXP = /(@import\s*url\s*\([^\)]*\)\s*;?)|(@import\s*('|")?\s*[^\(|;|'|"]*\s*('|")?\s*;)/gi;
  // Finds every `@import` rule written in the quoted (non-`url(`) form.
  var IMPORT_ALT_EXP = /@import\s*('|")?\s*[^\(|;|'|"]*\s*('|")?\s*;/gi;
  // Data URI used as a placeholder when a resource has no content
  // (presumably a 1x1 GIF, judging by the constant's name and media type).
  var EMPTY_PIXEL_DATA = "data:image/gif;base64,R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==";
  /**
   * Decodes the payload of a `data:` URI into a string.
   *
   * The part between `data:` and the first comma is treated as metadata. The
   * payload is always percent-decoded; when the metadata declares `base64` it
   * is then base64-decoded, and when it additionally declares a UTF-8 charset
   * the resulting byte string is decoded as UTF-8.
   *
   * @param {string} dataURI a URI starting with `data:`
   * @returns {string} the decoded payload, or "" when the URI has no comma
   */
  function decodeDataURI(dataURI) {
    var comma = dataURI.indexOf(","), meta, data;
    if (comma == -1)
      return "";
    // 'data:'.length == 5; the comma is kept so the tests below can anchor on
    // it. substring() takes an end index (substr() would take a length and
    // leak payload characters into the metadata).
    meta = dataURI.substring(5, comma + 1).toLowerCase();
    data = decodeURIComponent(dataURI.substring(comma + 1));
    if (/;\s*base64\s*[;,]/.test(meta)) {
      data = atob(data); // decode base64 into a byte string
      // Only a base64 payload still holds raw UTF-8 bytes at this point; a
      // percent-encoded payload was already decoded as UTF-8 above, and
      // decoding it a second time throws on non-ASCII characters.
      if (/;\s*charset=[uU][tT][fF]-?8\s*[;,]/.test(meta))
        data = decodeURIComponent(escape(data)); // decode UTF-8
    }
    return data;
  }
  /**
   * Resolves a possibly relative link against the address of the document
   * that contains it.
   *
   * - Links that start with a URI scheme (http:, data:, mailto:, ...) are
   *   returned trimmed and otherwise untouched.
   * - `//authority/path` links reuse the scheme of `host`.
   * - `/path` links are resolved against the scheme and authority of `host`.
   * - Other links are resolved against the directory of `host`; `.` and empty
   *   segments are dropped and `..` removes the previous segment.
   * - The query string and fragment of `link` are carried over verbatim; those
   *   of `host` are discarded.
   *
   * @param {string} link the address to resolve
   * @param {string} host the absolute address used as the base
   * @returns {string} the resolved address, or "" when `link` is empty
   */
  function formatURL(link, host) {
    var i, segment, cut, suffix, hparts, lparts, resolved = [];
    if (!link)
      return "";
    link = link.trim();
    // Anchored so that a scheme appearing later in the link (for example in a
    // query string) does not make a relative link look absolute.
    if (/^[a-z][a-z0-9+.\-]*:/i.test(link))
      return link;
    host = String(host || "").split("#")[0].split("?")[0];
    // Set the query/fragment aside so that '/' and '..' inside it are not
    // mistaken for path segments.
    cut = link.search(/[?#]/);
    suffix = cut == -1 ? "" : link.substring(cut);
    if (cut != -1)
      link = link.substring(0, cut);
    if (!link && suffix)
      return host + suffix; // query- or fragment-only reference: keep the base path
    hparts = host.split('/');
    if (hparts.length > 3)
      hparts.pop(); // drop the file name, keep the directory
    lparts = link.split('/');
    if (link.charAt(0) == '/') {
      if (link.charAt(1) == '/') {
        // protocol-relative link: scheme of the base, authority of the link
        hparts = (hparts[0] + '//' + lparts[2]).split('/');
        lparts = lparts.slice(3);
      } else {
        // absolute path: scheme and authority of the base
        hparts = (hparts[0] + '//' + hparts[2]).split('/');
        lparts = lparts.slice(1);
      }
    }
    for (i = 0; i < lparts.length; i++) {
      segment = lparts[i];
      if (segment == '..') {
        // Consume the link's own segments first, then climb the base
        // directory, never above scheme://authority.
        if (resolved.length)
          resolved.pop();
        else if (hparts.length > 3)
          hparts.pop();
      } else if (segment && segment != '.')
        resolved.push(segment);
    }
    return (hparts.join('/') + '/' + resolved.join('/') + suffix).trim();
  }
  /**
   * Rewrites every address found in a piece of CSS so that it is resolved
   * against `host`: first the `url(...)` tokens, then the `@import` rules
   * written without `url(`. `data:` addresses are left untouched.
   *
   * @param {string} content CSS text
   * @param {string} host address the relative addresses are resolved against
   * @returns {string} the CSS text with resolved addresses
   */
  function resolveURLs(content, host) {
    var withResolvedUrls = content.replace(URL_EXP, function(token) {
      var parsed = token.match(URL_VALUE_EXP), at;
      if (!parsed || parsed[1].indexOf("data:") == 0)
        return token;
      // Splice by position, searching after the opening parenthesis, rather
      // than with String.replace: replace() would hit the first textual
      // occurrence (possibly the "url" keyword itself) and would expand "$"
      // sequences contained in the resolved address.
      at = token.indexOf(parsed[1], token.indexOf("(") + 1);
      return token.substring(0, at) + formatURL(parsed[1], host) + token.substring(at + parsed[1].length);
    });
    return withResolvedUrls.replace(IMPORT_ALT_EXP, function(rule) {
      var parsed = rule.match(IMPORT_VALUE_ALT_EXP);
      if (parsed && parsed[1].indexOf("data:") != 0)
        return "@import \"" + formatURL(parsed[1], host) + "\";";
      return rule;
    });
  }
  /**
   * Builds the address used in place of a fetched resource.
   *
   * When `data.content` is set, a `data:` URI is assembled from
   * `data.mediaType`, `data.mediaTypeParam` and `data.content`; a missing
   * media type or parameter is simply left out. Otherwise `defaultURL` is used.
   *
   * @param {Object} data response with `content`, `mediaType`, `mediaTypeParam`
   * @param {string} defaultURL address used when there is no content
   * @param {boolean} woURL when truthy return the bare address, otherwise wrap
   *        it in `url(...)`
   * @returns {string} the address, optionally wrapped in `url(...)`
   */
  function getDataURI(data, defaultURL, woURL) {
    var address = defaultURL;
    if (data && data.content)
      address = "data:" + (data.mediaType || "") + (data.mediaTypeParam ? ";" + data.mediaTypeParam : "") + "," + data.content;
    return woURL ? address : "url(" + address + ")";
  }
  /**
   * Removes every complete comment delimited by the slash-star / star-slash
   * pair from a piece of text. An unterminated comment is left in place.
   *
   * The text is scanned once from left to right, so text that only comes to
   * look like a delimiter after a removal is not treated as one.
   *
   * @param {string} content text possibly containing comments
   * @returns {string} the text without its complete comments
   */
  function removeComments(content) {
    var kept = [], cursor = 0, start, end;
    for (;;) {
      start = content.indexOf("/*", cursor);
      if (start == -1)
        break;
      // Look for the terminator after the opening delimiter so that the
      // three-character sequence slash, star, slash is not read as a whole
      // comment.
      end = content.indexOf("*/", start + 2);
      if (end == -1)
        break;
      kept.push(content.substring(cursor, start));
      cursor = end + 2;
    }
    kept.push(content.substring(cursor));
    return kept.join("");
  }
  /**
   * Queues one request per non-`data:` `url(...)` address found in a piece of
   * CSS (comments excluded) and, as responses arrive, replaces every
   * occurrence of the address with the data URI built from the response.
   *
   * `callback` receives the rewritten CSS once the last queued response has
   * been handled. It is not called when no request was queued.
   *
   * @param {string} content CSS text
   * @param {string} host address the relative addresses are resolved against
   * @param {Object} requestManager object whose
   *        `send(url, handler, characterSet, mediaTypeParam)` schedules a request
   * @param {function(string)} callback receives the rewritten CSS
   */
  function replaceURLs(content, host, requestManager, callback) {
    var i, url, result, values = removeComments(content).match(URL_EXP), requestMax = 0, requestIndex = 0;

    // The resolved address is passed in explicitly instead of being read from
    // the loop variable of the enclosing function.
    function sendRequest(origUrl, resolvedUrl) {
      requestMax++;
      requestManager.send(resolvedUrl, function(data) {
        var pattern, dataURI;
        requestIndex++;
        if (content.indexOf(origUrl) != -1) {
          data.mediaType = data.mediaType ? data.mediaType.split(";")[0] : null;
          // escape the address so it can be used as a literal pattern
          pattern = new RegExp(origUrl.replace(/([{}\(\)\^$&.\*\?\/\+\|\[\\\\]|\]|\-)/g, "\\$1"), "gi");
          dataURI = getDataURI(data, EMPTY_PIXEL_DATA, true);
          // A function replacer keeps "$" sequences in the data URI literal.
          content = content.replace(pattern, function() {
            return dataURI;
          });
        }
        if (requestIndex == requestMax)
          callback(content);
      }, null, "base64");
    }

    if (values)
      for (i = 0; i < values.length; i++) {
        result = values[i].match(URL_VALUE_EXP);
        if (result && result[1]) {
          url = formatURL(result[1], host);
          if (url.indexOf("data:") != 0)
            sendRequest(result[1], url);
        }
      }
  }
  // ----------------------------------------------------------------------------------------------
  /**
   * Replaces every `<link href rel*="stylesheet">` element with a `<style>`
   * element holding the stylesheet text.
   *
   * The new element copies the non-empty attributes of the link (except
   * `href`), remembers the stylesheet address in `_baseURI`, and has its
   * addresses resolved against that address. A disabled link is replaced with
   * a comment holding the escaped markup instead. A link whose request reports
   * a status of 400 or more is removed.
   *
   * @param {Document} doc document used to create the new nodes
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   * @param {Object} requestManager object whose `send(url, handler)` schedules a request
   */
  function processStylesheets(doc, docElement, baseURI, requestManager) {
    Array.prototype.forEach.call(docElement.querySelectorAll('link[href][rel*="stylesheet"]'), function(node) {
      var href = node.getAttribute("href"), url = formatURL(href, baseURI);

      function createStyleNode(content) {
        var i, newNode, commentNode, parent = node.parentNode;
        if (!parent)
          return; // the link is no longer attached; nothing to replace
        newNode = doc.createElement("style");
        for (i = 0; i < node.attributes.length; i++)
          if (node.attributes[i].value)
            newNode.setAttribute(node.attributes[i].name, node.attributes[i].value);
        newNode._baseURI = url;
        newNode.removeAttribute("href");
        newNode.textContent = resolveURLs(content, url);
        if (node.disabled) {
          // createComment() requires its data argument, so the text is passed
          // at creation time.
          commentNode = doc.createComment(newNode.outerHTML.replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/--/g, "&minus;&minus;"));
          parent.replaceChild(commentNode, node);
        } else
          parent.replaceChild(newNode, node);
      }

      if (href.indexOf("data:") != 0)
        requestManager.send(url, function(data) {
          if (data.status >= 400) {
            if (node.parentNode)
              node.parentNode.removeChild(node);
          } else {
            createStyleNode(data.content || "");
          }
        });
      else
        createStyleNode(decodeDataURI(href));
    });
  }
  /**
   * Replaces the `@import` rules of every `<style>` element with the text of
   * the imported stylesheet, whose addresses are resolved against the address
   * of the imported stylesheet.
   *
   * Imports are resolved against the `_baseURI` of the style element when it
   * has one, otherwise against `baseURI`. `data:` imports are decoded
   * immediately; the others are queued on `requestManager`, and a response
   * with a status of 400 or more, or without content, replaces the rule with
   * an empty string.
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   * @param {string} characterSet character set forwarded to the request manager
   * @param {Object} requestManager object whose
   *        `send(url, handler, characterSet, mediaTypeParam)` schedules a request
   * @returns {boolean} true when no import rule was found
   */
  function processImports(docElement, baseURI, characterSet, requestManager) {
    var ret = true;
    Array.prototype.forEach.call(docElement.querySelectorAll("style"), function(styleSheet) {
      var imports = removeComments(styleSheet.textContent).match(IMPORT_EXP);
      if (!imports)
        return;
      imports.forEach(function(imp) {
        // `url` is local to each import so that a late response is resolved
        // against its own stylesheet address, not the address of the last
        // import visited.
        var url, result = imp.match(IMPORT_URL_VALUE_EXP);

        function insertStylesheet(content) {
          var inlined = resolveURLs(content, url);
          // A function replacer keeps "$" sequences of the CSS literal.
          styleSheet.textContent = styleSheet.textContent.replace(imp, function() {
            return inlined;
          });
        }

        if (!result || !(result[2] || result[4]))
          return;
        url = formatURL(result[2] || result[4], styleSheet._baseURI || baseURI);
        if (url.indexOf("data:") != 0) {
          // NOTE(review): characterSet is passed in the fourth (mediaTypeParam)
          // position here, whereas processScripts passes it in the third -
          // confirm which one is intended.
          requestManager.send(url, function(data) {
            insertStylesheet(data.status < 400 && data.content ? data.content : "");
          }, null, characterSet);
        } else {
          insertStylesheet(decodeDataURI(url));
        }
        ret = false;
      });
    });
    return ret;
  }
  /**
   * Runs replaceURLs over the `style` attribute of every element that has
   * one and stores the rewritten value back on the element.
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   * @param {Object} requestManager request manager handed to replaceURLs
   */
  function processStyleAttributes(docElement, baseURI, requestManager) {
    var styledNodes = docElement.querySelectorAll("*[style]"), index;

    function inlineStyle(element) {
      replaceURLs(element.getAttribute("style"), baseURI, requestManager, function(rewritten) {
        element.setAttribute("style", rewritten);
      });
    }

    for (index = 0; index < styledNodes.length; index++)
      inlineStyle(styledNodes[index]);
  }
  /**
   * Queues a request for the `background` attribute of every element that
   * has one and replaces the attribute with the resulting data URI.
   *
   * Values without a dot, and values resolving to a `data:` address, are
   * skipped.
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   * @param {Object} requestManager object whose
   *        `send(url, handler, characterSet, mediaTypeParam)` schedules a request
   */
  function processBgAttributes(docElement, baseURI, requestManager) {
    var nodes = docElement.querySelectorAll("*[background]"), index;

    function inlineBackground(element) {
      var attributeValue = element.getAttribute("background"), resolved;
      if (attributeValue.indexOf(".") == -1)
        return;
      resolved = formatURL(attributeValue, baseURI);
      if (resolved.indexOf("data:") == 0)
        return;
      requestManager.send(resolved, function(response) {
        element.setAttribute("background", getDataURI(response, EMPTY_PIXEL_DATA, true));
      }, null, "base64");
    }

    for (index = 0; index < nodes.length; index++)
      inlineBackground(nodes[index]);
  }
  /**
   * Appends a `<link rel="shortcut icon">` pointing at `/favicon.ico`
   * (resolved against `baseURI`) to the head, unless the tree already holds
   * an icon link or has no `html > head` element.
   *
   * @param {Document} doc document used to create the link
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   */
  function insertDefaultFavico(doc, docElement, baseURI) {
    var head = docElement.querySelector("html > head"), iconLink;
    if (!head)
      return;
    if (docElement.querySelector('link[href][rel="shortcut icon"], link[href][rel="apple-touch-icon"], link[href][rel="icon"]'))
      return;
    iconLink = doc.createElement("link");
    iconLink.setAttribute("type", "image/x-icon");
    iconLink.setAttribute("rel", "shortcut icon");
    iconLink.setAttribute("href", formatURL("/favicon.ico", baseURI));
    head.appendChild(iconLink);
  }
  /**
   * Queues a request for every icon link (`href`), image and image input
   * (`src`) and video poster (`poster`), and replaces the attribute with the
   * resulting data URI. Addresses resolving to `data:` are skipped.
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   * @param {Object} requestManager object whose
   *        `send(url, handler, characterSet, mediaTypeParam)` schedules a request
   */
  function processImages(docElement, baseURI, requestManager) {
    // [selector, attribute holding the address], handled in this order
    var targets = [
      [ 'link[href][rel="shortcut icon"], link[href][rel="apple-touch-icon"], link[href][rel="icon"]', "href" ],
      [ 'img[src], input[src][type="image"]', "src" ],
      [ 'video[poster]', "poster" ]
    ];
    targets.forEach(function(target) {
      var attributeName = target[1];
      Array.prototype.forEach.call(docElement.querySelectorAll(target[0]), function(element) {
        var resolved = formatURL(element.getAttribute(attributeName), baseURI);
        if (resolved.indexOf("data:") == 0)
          return;
        requestManager.send(resolved, function(response) {
          element.setAttribute(attributeName, getDataURI(response, EMPTY_PIXEL_DATA, true));
        }, null, "base64");
      });
    });
  }
  /**
   * Queues a request for every SVG `<object>` (address in `data`) and
   * `<embed>` (address in `src`) and replaces the attribute the address came
   * from with the resulting data URI. Addresses resolving to `data:` are
   * skipped.
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   * @param {Object} requestManager object whose
   *        `send(url, handler, characterSet, mediaTypeParam)` schedules a request
   */
  function processSVGs(docElement, baseURI, requestManager) {
    var images = docElement.querySelectorAll('object[type="image/svg+xml"], object[type="image/svg-xml"], embed[src*=".svg"]');
    Array.prototype.forEach.call(images, function(node) {
      var dataAttribute = node.getAttribute("data"), src = node.getAttribute("src"), url = formatURL(dataAttribute || src, baseURI);
      // Decided here, from the attribute value: inside the response handler
      // the response object must not be mistaken for the attribute value.
      var attributeName = dataAttribute ? "data" : "src";
      if (url.indexOf("data:") != 0)
        requestManager.send(url, function(response) {
          node.setAttribute(attributeName, getDataURI(response, "data:text/xml,<svg></svg>", true));
        }, null, null);
    });
  }
  /**
   * Runs replaceURLs over the text of every `<style>` element, resolving
   * addresses against the element's `_baseURI` when it has one and against
   * `baseURI` otherwise, and stores the rewritten text back on the element.
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   * @param {Object} requestManager request manager handed to replaceURLs
   */
  function processStyles(docElement, baseURI, requestManager) {
    var sheets = docElement.querySelectorAll("style"), index;

    function inlineSheet(sheet) {
      var sheetBase = sheet._baseURI || baseURI;
      replaceURLs(sheet.textContent, sheetBase, requestManager, function(rewritten) {
        sheet.textContent = rewritten;
      });
    }

    for (index = 0; index < sheets.length; index++)
      inlineSheet(sheets[index]);
  }
  /**
   * Queues a request for every `<script src>` whose address is not a `data:`
   * URI. When the response status is below 400 the script text is stored in
   * the element, after splitting the closing script tag found inside double-
   * or single-quoted strings; in every case the `src` attribute is removed.
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   * @param {string} characterSet character set forwarded to the request manager
   * @param {Object} requestManager object whose
   *        `send(url, handler, characterSet)` schedules a request
   */
  function processScripts(docElement, baseURI, characterSet, requestManager) {
    var scripts = docElement.querySelectorAll("script[src]"), index;

    function inlineScript(element) {
      var address = element.getAttribute("src");
      if (address.indexOf("data:") == 0)
        return;
      requestManager.send(formatURL(address, baseURI), function(response) {
        if (response.status < 400) {
          response.content = response.content.replace(/"([^"]*)<\/\s*script\s*>([^"]*)"/gi, '"$1<"+"/script>$2"');
          response.content = response.content.replace(/'([^']*)<\/\s*script\s*>([^']*)'/gi, "'$1<'+'/script>$2'");
          element.textContent = "\n" + response.content + "\n";
        }
        element.removeAttribute("src");
      }, characterSet);
    }

    for (index = 0; index < scripts.length; index++)
      inlineScript(scripts[index]);
  }
  /**
   * Replaces every `<canvas>` that has an entry in `canvasData` with an
   * `<img>` whose `src` is that entry.
   *
   * Entries are matched to canvases by position in document order. The image
   * copies the non-empty attributes of the canvas; when it ends up without a
   * width or height, the rendered size of the canvas is applied through the
   * standard `style.width` / `style.height` properties.
   *
   * @param {Document} doc document used to create the images
   * @param {Element} docElement root of the tree being processed
   * @param {Array} canvasData `src` values, one per canvas; may be omitted
   */
  function processCanvas(doc, docElement, canvasData) {
    var snapshots = canvasData || [];
    Array.prototype.forEach.call(docElement.querySelectorAll("canvas"), function(node, index) {
      var i, attribute, newNode, data = snapshots[index];
      if (!data)
        return;
      newNode = doc.createElement("img");
      newNode.setAttribute("src", data);
      for (i = 0; i < node.attributes.length; i++) {
        attribute = node.attributes[i];
        if (attribute.value)
          newNode.setAttribute(attribute.name, attribute.value);
      }
      // style.pixelWidth / style.pixelHeight are non-standard; the standard
      // properties take a length with a unit.
      if (!newNode.width)
        newNode.style.width = node.clientWidth + "px";
      if (!newNode.height)
        newNode.style.height = node.clientHeight + "px";
      if (node.parentNode)
        node.parentNode.replaceChild(newNode, node);
    });
  }
  /**
   * Removes every `<script>` element and every `onload` attribute from the
   * tree.
   *
   * @param {Element} docElement root of the tree being processed
   */
  function removeScripts(docElement) {
    Array.prototype.forEach.call(docElement.querySelectorAll("script"), function(node) {
      // parentNode also covers a parent that is not an Element
      if (node.parentNode)
        node.parentNode.removeChild(node);
    });
    Array.prototype.forEach.call(docElement.querySelectorAll("*[onload]"), function(node) {
      node.removeAttribute("onload");
    });
  }
  /**
   * Removes every `<applet>`, every `<object>` that is not declared as SVG
   * and every `<embed>` whose `src` does not contain ".svg", then removes the
   * `src` attribute of every `<audio>` and `<video>` that has one.
   *
   * @param {Element} docElement root of the tree being processed
   */
  function removeObjects(docElement) {
    var objects = docElement.querySelectorAll('applet, object:not([type="image/svg+xml"]):not([type="image/svg-xml"]), embed:not([src*=".svg"])');
    Array.prototype.forEach.call(objects, function(node) {
      // parentNode also covers a parent that is not an Element
      if (node.parentNode)
        node.parentNode.removeChild(node);
    });
    objects = docElement.querySelectorAll('audio[src], video[src]');
    Array.prototype.forEach.call(objects, function(node) {
      node.removeAttribute("src");
    });
  }
  /**
   * Removes the `cite` attribute of every `<blockquote>` that has one.
   *
   * @param {Element} docElement root of the tree being processed
   */
  function removeBlockquotesCite(docElement) {
    Array.prototype.forEach.call(docElement.querySelectorAll("blockquote[cite]"), function(node) {
      node.removeAttribute("cite");
    });
  }
  /**
   * Removes every `<iframe>` and `<frame>` element from the tree.
   *
   * @param {Element} docElement root of the tree being processed
   */
  function removeFrames(docElement) {
    Array.prototype.forEach.call(docElement.querySelectorAll("iframe, frame"), function(node) {
      // parentNode also covers a parent that is not an Element
      if (node.parentNode)
        node.parentNode.removeChild(node);
    });
  }
  /**
   * Removes every `<meta http-equiv=refresh>` element from the tree.
   *
   * @param {Element} docElement root of the tree being processed
   */
  function removeMetaRefresh(docElement) {
    Array.prototype.forEach.call(docElement.querySelectorAll("meta[http-equiv=refresh]"), function(node) {
      // parentNode also covers a parent that is not an Element
      if (node.parentNode)
        node.parentNode.removeChild(node);
    });
  }
  /**
   * Points every `<iframe>` and `<frame>` at `about:blank`, except those
   * whose resolved `src` is a `data:` URI.
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   */
  function resetFrames(docElement, baseURI) {
    var frames = docElement.querySelectorAll("iframe, frame"), index, frame, resolved;
    for (index = 0; index < frames.length; index++) {
      frame = frames[index];
      resolved = formatURL(frame.getAttribute("src"), baseURI);
      if (resolved.indexOf("data:") == 0)
        continue;
      frame.setAttribute("src", "about:blank");
    }
  }
  /**
   * Rewrites the `href` of the anchors whose `href` does not start with "#"
   * to its resolved form. An anchor is left untouched when its resolved
   * address is empty, or when it both starts with the address of the document
   * (fragment excluded) and contains a "#".
   *
   * @param {Element} docElement root of the tree being processed
   * @param {string} baseURI address of the document
   */
  function setAbsoluteLinks(docElement, baseURI) {
    var anchors = docElement.querySelectorAll("a:not([href^='#'])"), index;

    function makeAbsolute(anchor) {
      var resolved = formatURL(anchor.getAttribute("href"), baseURI), pointsIntoPage;
      if (!resolved)
        return;
      pointsIntoPage = resolved.indexOf(baseURI.split("#")[0]) == 0 && resolved.indexOf("#") != -1;
      if (!pointsIntoPage)
        anchor.setAttribute("href", resolved);
    }

    for (index = 0; index < anchors.length; index++)
      makeAbsolute(anchors[index]);
  }
  // ----------------------------------------------------------------------------------------------
  /**
   * Prepares a document for being saved as a single file.
   *
   * Initialisation runs immediately: linked stylesheets are inlined, then
   * `@import` rules are inlined repeatedly until a pass queues no request.
   * After that the optional clean-ups selected by `config` are applied and
   * the remaining resources (style attributes, backgrounds, images, SVGs,
   * styles, scripts) are queued on a second manager without being sent;
   * canvases are replaced right away. `onInit`, when given, is then called
   * asynchronously with the number of queued requests.
   *
   * @param {Document} doc document used to create new nodes
   * @param {Element} docElement root of the tree being processed
   * @param {boolean} addDefaultFavico whether to insert a default favicon link
   * @param {string} baseURI address of the document
   * @param {string} characterSet character set forwarded with text requests
   * @param {Object} config options read here: removeScripts, removeObjects,
   *        removeFrames, getRawDoc
   * @param {Array} canvasData `src` values replacing the canvases
   * @param {Object} requestManager transport whose
   *        `send(url, handler, characterSet, mediaTypeParam)` performs a request
   * @param {function(number)} onInit called with the number of queued requests
   * @param {function(number, number)} onProgress called with (completed, total)
   *        after each response of the second phase
   * @param {function()} onEnd called when the second phase has completed
   * @returns {function()} starts the second phase: sends the queued requests,
   *          and calls `onEnd` at once when there is none
   */
  singlefile.initProcess = function(doc, docElement, addDefaultFavico, baseURI, characterSet, config, canvasData, requestManager, onInit, onProgress, onEnd) {
    var initManager = new RequestManager(), manager = new RequestManager(onProgress);

    // Queues requests, sends them in one batch through `requestManager`, and
    // calls its own `onEnd` once every response of the batch was handled.
    function RequestManager(onProgress) {
      var owner = this, completedCount = 0, queue = [];
      this.requestCount = 0;
      this.send = function(url, responseHandler, characterSet, mediaTypeParam) {
        this.requestCount++;
        queue.push({
          url : url,
          responseHandler : responseHandler,
          characterSet : characterSet,
          mediaTypeParam : mediaTypeParam
        });
      };
      this.doSend = function() {
        queue.forEach(function(entry) {
          requestManager.send(entry.url, function(response) {
            entry.responseHandler(response);
            completedCount++;
            if (onProgress)
              onProgress(completedCount, owner.requestCount);
            if (completedCount == owner.requestCount) {
              // batch complete: reset the counters before notifying
              owner.requestCount = 0;
              completedCount = 0;
              if (owner.onEnd)
                owner.onEnd();
            }
          }, entry.characterSet, entry.mediaTypeParam);
        });
        queue = [];
      };
    }

    // Runs once no import is left: clean-ups, then queueing of the resources.
    function queueResources() {
      if (config.removeScripts)
        removeScripts(docElement);
      if (config.removeObjects)
        removeObjects(docElement);
      if (config.removeFrames || config.getRawDoc)
        removeFrames(docElement);
      resetFrames(docElement, baseURI);
      removeBlockquotesCite(docElement);
      removeMetaRefresh(docElement);
      setAbsoluteLinks(docElement, baseURI);
      if (addDefaultFavico)
        insertDefaultFavico(doc, docElement, baseURI);
      processStyleAttributes(docElement, baseURI, manager);
      processBgAttributes(docElement, baseURI, manager);
      processImages(docElement, baseURI, manager);
      processSVGs(docElement, baseURI, manager);
      processStyles(docElement, baseURI, manager);
      processScripts(docElement, baseURI, characterSet, manager);
      processCanvas(doc, docElement, canvasData);
      if (onInit)
        setTimeout(function() {
          onInit(manager.requestCount);
        }, 1);
    }

    // One pass over the imports. The init manager invokes its `onEnd` without
    // arguments, so a completed batch always starts another pass; the chain
    // stops on the first pass that queues no request.
    function inlineImports() {
      initManager.onEnd = inlineImports;
      processImports(docElement, baseURI, characterSet, initManager);
      initManager.doSend();
      if (initManager.requestCount == 0)
        queueResources();
    }

    manager.onEnd = onEnd;
    processStylesheets(doc, docElement, baseURI, initManager);
    initManager.onEnd = inlineImports;
    initManager.doSend();
    if (initManager.requestCount == 0)
      initManager.onEnd();
    return function() {
      manager.doSend();
      if (manager.onEnd && manager.requestCount == 0)
        manager.onEnd();
    };
  };
  422. })();