Преглед изворног кода

use hashes instead of URLs to detect duplicate images

Gildas пре 7 година
родитељ
комит
b91c2a7064
2 измењена фајла са 37 додатих и 28 уклоњених редова
  1. 13 10
      lib/single-file/single-file-browser.js
  2. 24 18
      lib/single-file/single-file-core.js

+ 13 - 10
lib/single-file/single-file-browser.js

@@ -65,17 +65,17 @@ this.SingleFile = this.SingleFile || (() => {
 			try {
 				resourceContent = await fetchResource(resourceURL);
 			} catch (error) {
-				return options && options.asDataURI ? "data:base64," : "";
+				return options && options.asDataURI ? { empty: true } : { content: "" };
 			}
 			if (resourceContent.status >= 400 && superFetch.hostFetch) {
 				try {
 					resourceContent = await superFetch.hostFetch(resourceURL);
 				} catch (error) {
-					return options && options.asDataURI ? "data:base64," : "";
+					return options && options.asDataURI ? { empty: true } : { content: "" };
 				}
 			}
 			if (resourceContent.status >= 400) {
-				return options && options.asDataURI ? "data:base64," : "";
+				return options && options.asDataURI ? { empty: true } : { content: "" }; // object result, like the fetch-error paths above, so callers can safely read `.content`
 			}
 			let contentType = resourceContent.headers && resourceContent.headers.get("content-type");
 			let charSet;
@@ -99,14 +99,16 @@ this.SingleFile = this.SingleFile || (() => {
 					if (DEBUG) {
 						log("  // ENDED   download url =", resourceURL, "delay =", Date.now() - startTime);
 					}
-					const dataURI = "data:" + (contentType || "") + ";" + "base64," + base64.fromByteArray(new Uint8Array(buffer));
+					const uInt8Array = new Uint8Array(buffer);
+					const dataURI = "data:" + (contentType || "") + ";" + "base64," + base64.fromByteArray(uInt8Array);
 					if (options.maxResourceSizeEnabled && buffer.byteLength > options.maxResourceSize * ONE_MB) {
-						return "data:base64,";
+						return { empty: true };
 					} else {
-						return dataURI;
+						const hash = Array.from(new Uint8Array(await crypto.subtle.digest("SHA-256", uInt8Array)), byte => byte.toString(16).padStart(2, "0")).join(""); // hex string: digest() yields an ArrayBuffer, which never compares equal by value in indexOf()
+						return { content: dataURI, hash };
 					}
 				} catch (error) {
-					return "data:base64,";
+					return { empty: true };
 				}
 			} else {
 				if (!charSet) {
@@ -125,12 +127,13 @@ this.SingleFile = this.SingleFile || (() => {
 					}
 					const textContent = (new TextDecoder(charSet)).decode(arrayBuffer);
 					if (options.maxResourceSizeEnabled && textContent.length > options.maxResourceSize * ONE_MB) {
-						return "";
+						return { content: "" };
 					} else {
-						return textContent;
+						const hash = Array.from(new Uint8Array(await crypto.subtle.digest("SHA-256", arrayBuffer)), byte => byte.toString(16).padStart(2, "0")).join(""); // hex string: digest() yields an ArrayBuffer, which never compares equal by value in indexOf()
+						return { content: textContent, hash };
 					}
 				} catch (error) {
-					return "";
+					return { content: "" };
 				}
 			}
 		}

+ 24 - 18
lib/single-file/single-file-core.js

@@ -230,6 +230,7 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 	class BatchRequest {
 		constructor() {
 			this.requests = new Map();
+			this.hashes = [];
 		}
 
 		async addURL(resourceURL, asDataURI = true) {
@@ -255,10 +256,14 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 				const [resourceURL, asDataURI] = JSON.parse(requestKey);
 				const resourceRequests = this.requests.get(requestKey);
 				try {
-					const resourceContent = await Download.getContent(resourceURL, { asDataURI, maxResourceSize: options.maxResourceSize, maxResourceSizeEnabled: options.maxResourceSizeEnabled });
-					indexResource = indexResource + 1;
+					const result = await Download.getContent(resourceURL, { asDataURI, maxResourceSize: options.maxResourceSize, maxResourceSizeEnabled: options.maxResourceSizeEnabled });
+					indexResource = this.hashes.indexOf(result.hash);
+					if (indexResource == -1) {
+						indexResource = this.hashes.length;
+						this.hashes.push(result.hash);
+					}
 					onloadListener({ index: indexResource, url: resourceURL });
-					resourceRequests.forEach(resourceRequest => resourceRequest.resolve({ content: resourceContent, indexResource, duplicate: Boolean(resourceRequests.length > 1) }));
+					resourceRequests.forEach(resourceRequest => resourceRequest.resolve({ content: result.content, empty: result.empty, indexResource, duplicate: Boolean(resourceRequests.length > 1) }));
 				} catch (error) {
 					indexResource = indexResource + 1;
 					onloadListener({ index: indexResource, url: resourceURL });
@@ -272,7 +277,6 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 	// ------------
 	// DOMProcessor
 	// ------------
-	const EMPTY_DATA_URI = "data:base64,";
 	const EMPTY_IMAGE = "";
 
 	class DOMProcessor {
@@ -291,11 +295,12 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 			this.stats.set("processed", "resources", this.maxResources);
 		}
 
-		async loadPage(pageContent) {
-			if (!pageContent || this.options.saveRawPage) {
-				pageContent = await Download.getContent(this.baseURI, { asDataURI: false, maxResourceSize: this.options.maxResourceSize, maxResourceSizeEnabled: this.options.maxResourceSizeEnabled });
+		async loadPage(content) {
+			if (!content || this.options.saveRawPage) {
+				const result = await Download.getContent(this.baseURI, { asDataURI: false, maxResourceSize: this.options.maxResourceSize, maxResourceSizeEnabled: this.options.maxResourceSizeEnabled });
+				content = result.content;
 			}
-			this.doc = DOM.createDoc(pageContent, this.baseURI);
+			this.doc = DOM.createDoc(content, this.baseURI);
 			this.onEventAttributeNames = DOM.getOnEventAttributeNames(this.doc);
 		}
 
@@ -696,7 +701,7 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 			if (this.options.removeAlternativeImages) {
 				const shortcutIcons = Array.from(this.doc.querySelectorAll("link[href][rel=\"icon\"], link[href][rel=\"shortcut icon\"]"));
 				shortcutIcons.sort((linkElement1, linkElement2) => (parseInt(linkElement2.sizes, 10) || 16) - (parseInt(linkElement1.sizes, 10) || 16));
-				const shortcutIcon = shortcutIcons.find(linkElement => linkElement.href && linkElement.href != EMPTY_DATA_URI);
+				const shortcutIcon = shortcutIcons[0];
 				if (shortcutIcon) {
 					this.doc.querySelectorAll("link[href][rel*=\"icon\"]").forEach(linkElement => {
 						if (linkElement != shortcutIcon) {
@@ -722,8 +727,8 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 			await Promise.all(Array.from(this.doc.querySelectorAll("script[src]")).map(async scriptElement => {
 				if (scriptElement.src) {
 					this.stats.add("processed", "scripts", 1);
-					const scriptContent = await Download.getContent(scriptElement.src, { asDataURI: false, maxResourceSize: this.options.maxResourceSize, maxResourceSizeEnabled: this.options.maxResourceSizeEnabled });
-					scriptElement.textContent = scriptContent.replace(/<\/script>/gi, "<\\/script>");
+					const result = await Download.getContent(scriptElement.src, { asDataURI: false, maxResourceSize: this.options.maxResourceSize, maxResourceSizeEnabled: this.options.maxResourceSizeEnabled });
+					scriptElement.textContent = result.content.replace(/<\/script>/gi, "<\\/script>");
 				}
 				scriptElement.removeAttribute("src");
 			}));
@@ -735,6 +740,7 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 				await Promise.all(frameElements.map(async frameElement => {
 					DomProcessorHelper.setFrameEmptySrc(frameElement);
 					frameElement.setAttribute("sandbox", "allow-scripts allow-same-origin");
+					frameElement.removeAttribute("src");
 					const frameWindowId = frameElement.getAttribute(DOM.windowIdAttributeName(this.options.sessionId));
 					if (frameWindowId) {
 						const frameData = this.options.framesData.find(frame => frame.windowId == frameWindowId);
@@ -797,7 +803,7 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 			}
 			await Promise.all(linkElements.map(async linkElement => {
 				const resourceURL = linkElement.href;
-				linkElement.setAttribute("href", EMPTY_DATA_URI);
+				linkElement.removeAttribute("href");
 				const options = Object.create(this.options);
 				options.insertSingleFileComment = false;
 				options.insertFaviconLink = false;
@@ -984,8 +990,8 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 					if (!DomUtil.testIgnoredPath(resourceURL) && DomUtil.testValidPath(resourceURL)) {
 						resourceURL = new URL(match.resourceURL, baseURI).href;
 						if (DomUtil.testValidURL(resourceURL, baseURI)) {
-							let importedStylesheetContent = await Download.getContent(resourceURL, { asDataURI: false, maxResourceSize: options.maxResourceSize, maxResourceSizeEnabled: options.maxResourceSizeEnabled });
-							importedStylesheetContent = DomUtil.wrapMediaQuery(importedStylesheetContent, match.media);
+							const result = await Download.getContent(resourceURL, { asDataURI: false, maxResourceSize: options.maxResourceSize, maxResourceSizeEnabled: options.maxResourceSizeEnabled });
+							let importedStylesheetContent = DomUtil.wrapMediaQuery(result.content, match.media);
 							if (stylesheetContent.includes(cssImport)) {
 								importedStylesheetContent = await DomProcessorHelper.resolveImportURLs(importedStylesheetContent, resourceURL, options);
 								stylesheetContent = stylesheetContent.replace(DomUtil.getRegExp(cssImport), importedStylesheetContent);
@@ -1024,8 +1030,8 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 		static async resolveLinkStylesheetURLs(resourceURL, baseURI, media, options) {
 			resourceURL = DomUtil.normalizeURL(resourceURL);
 			if (resourceURL && resourceURL != baseURI && resourceURL != ABOUT_BLANK_URI) {
-				let stylesheetContent = await Download.getContent(resourceURL, { asDataURI: false, maxResourceSize: options.maxResourceSize, maxResourceSizeEnabled: options.maxResourceSizeEnabled, charSet: options.charSet });
-				stylesheetContent = await DomProcessorHelper.resolveImportURLs(stylesheetContent, resourceURL, options);
+				const result = await Download.getContent(resourceURL, { asDataURI: false, maxResourceSize: options.maxResourceSize, maxResourceSizeEnabled: options.maxResourceSizeEnabled, charSet: options.charSet });
+				let stylesheetContent = await DomProcessorHelper.resolveImportURLs(result.content, resourceURL, options);
 				stylesheetContent = DomUtil.wrapMediaQuery(stylesheetContent, media);
 				return stylesheetContent;
 			}
@@ -1104,8 +1110,8 @@ this.SingleFileCore = this.SingleFileCore || (() => {
 						resourceURL = new URL(resourceURL, baseURI).href;
 						if (DomUtil.testValidURL(resourceURL, baseURI)) {
 							try {
-								const { content, indexResource, duplicate } = await batchRequest.addURL(resourceURL);
-								if (removeElementIfMissing && content == EMPTY_DATA_URI) {
+								const { content, indexResource, duplicate, empty } = await batchRequest.addURL(resourceURL);
+								if (removeElementIfMissing && empty) {
 									resourceElement.remove();
 								} else {
 									if (content.startsWith(prefixDataURI) || content.startsWith(PREFIX_DATA_URI_NO_MIMETYPE) || content.startsWith(PREFIX_DATA_URI_OCTET_STREAM)) {