Browse Source

implement mime sniffing (for images and fonts)

Gildas 5 năm trước cách đây
mục cha
commit
341dc827c2

+ 1 - 0
cli/back-ends/common/scripts.js

@@ -40,6 +40,7 @@ const SCRIPTS = [
 	"lib/single-file/vendor/css-font-property-parser.js",
 	"lib/single-file/vendor/css-unescape.js",
 	"lib/single-file/vendor/css-media-query-parser.js",
+	"lib/single-file/vendor/mime-type-parser.js",
 	"lib/single-file/modules/html-minifier.js",
 	"lib/single-file/modules/css-fonts-minifier.js",
 	"lib/single-file/modules/css-fonts-alt-minifier.js",

+ 1 - 0
extension/lib/single-file/core/bg/scripts.js

@@ -35,6 +35,7 @@ singlefile.extension.lib.core.bg.scripts = (() => {
 		"lib/single-file/vendor/css-tree.js",
 		"lib/single-file/vendor/html-srcset-parser.js",
 		"lib/single-file/vendor/css-minifier.js",
+		"lib/single-file/vendor/mime-type-parser.js",
 		"lib/single-file/modules/html-minifier.js",
 		"lib/single-file/modules/html-serializer.js",
 		"lib/single-file/modules/html-images-alt-minifier.js",

+ 10 - 8
lib/single-file/single-file-core.js

@@ -310,9 +310,9 @@ this.singlefile.lib.core = this.singlefile.lib.core || (() => {
 			this.duplicates = new Map();
 		}
 
-		addURL(resourceURL, asBinary, groupDuplicates) {
+		addURL(resourceURL, asBinary, expectedType, groupDuplicates) {
 			return new Promise((resolve, reject) => {
-				const requestKey = JSON.stringify([resourceURL, asBinary]);
+				const requestKey = JSON.stringify([resourceURL, asBinary, expectedType]);
 				let resourceRequests = this.requests.get(requestKey);
 				if (!resourceRequests) {
 					resourceRequests = [];
@@ -339,13 +339,14 @@ this.singlefile.lib.core = this.singlefile.lib.core || (() => {
 			const resourceURLs = [...this.requests.keys()];
 			let indexResource = 0;
 			return Promise.all(resourceURLs.map(async requestKey => {
-				const [resourceURL, asBinary] = JSON.parse(requestKey);
+				const [resourceURL, asBinary, expectedType] = JSON.parse(requestKey);
 				const resourceRequests = this.requests.get(requestKey);
 				try {
 					const currentIndexResource = indexResource;
 					indexResource = indexResource + 1;
 					const content = await util.getContent(resourceURL, {
 						asBinary,
+						expectedType,
 						maxResourceSize: options.maxResourceSize,
 						maxResourceSizeEnabled: options.maxResourceSizeEnabled,
 						frameId: options.windowId
@@ -407,7 +408,7 @@ this.singlefile.lib.core = this.singlefile.lib.core || (() => {
 			this.options.saveDate = new Date();
 			this.options.saveUrl = this.options.url;
 			if (this.options.enableMaff) {
-				this.maffMetaDataPromise = this.batchRequest.addURL(util.resolveURL("index.rdf", this.options.baseURI || this.options.url), false);
+				this.maffMetaDataPromise = this.batchRequest.addURL(util.resolveURL("index.rdf", this.options.baseURI || this.options.url));
 			}
 			this.maxResources = this.batchRequest.getMaxResources();
 			if (!this.options.saveRawPage && !this.options.removeFrames && this.options.frames) {
@@ -1677,7 +1678,7 @@ this.singlefile.lib.core = this.singlefile.lib.core || (() => {
 							const resourceURL = normalizeURL(originalResourceURL);
 							if (!testIgnoredPath(resourceURL)) {
 								if (testValidURL(resourceURL)) {
-									let { content } = await batchRequest.addURL(resourceURL, true);
+									let { content } = await batchRequest.addURL(resourceURL, true, "font");
 									let resourceURLs = fontURLs.get(declaration);
 									if (!resourceURLs) {
 										resourceURLs = [];
@@ -1710,7 +1711,7 @@ this.singlefile.lib.core = this.singlefile.lib.core || (() => {
 						const resourceURL = normalizeURL(originalResourceURL);
 						if (!testIgnoredPath(resourceURL)) {
 							if (testValidURL(resourceURL)) {
-								let { content, indexResource, duplicate } = await batchRequest.addURL(resourceURL, true, true);
+								let { content, indexResource, duplicate } = await batchRequest.addURL(resourceURL, true, "image", true);
 								let variableDefined;
 								const tokens = [];
 								findURLToken(originalResourceURL, declaration.value.children, (token, parent, rootFunction) => {
@@ -1762,7 +1763,7 @@ this.singlefile.lib.core = this.singlefile.lib.core || (() => {
 								// ignored
 							}
 							if (testValidURL(resourceURL)) {
-								let { content, indexResource, duplicate } = await batchRequest.addURL(resourceURL, true, resourceElement.tagName == "IMG" && attributeName == "src");
+								let { content, indexResource, duplicate } = await batchRequest.addURL(resourceURL, true, "image", resourceElement.tagName == "IMG" && attributeName == "src");
 								if (originURL) {
 									if (content == EMPTY_DATA_URI) {
 										try {
@@ -1774,6 +1775,7 @@ this.singlefile.lib.core = this.singlefile.lib.core || (() => {
 											resourceURL = originURL;
 											content = (await util.getContent(resourceURL, {
 												asBinary: true,
+												expectedType: "image",
 												maxResourceSize: options.maxResourceSize,
 												maxResourceSizeEnabled: options.maxResourceSizeEnabled,
 												frameId: options.windowId
@@ -1862,7 +1864,7 @@ this.singlefile.lib.core = this.singlefile.lib.core || (() => {
 								// ignored
 							}
 							if (testValidURL(resourceURL)) {
-								const { content } = await batchRequest.addURL(resourceURL, true);
+								const { content } = await batchRequest.addURL(resourceURL, true, "image");
 								const forbiddenPrefixFound = PREFIXES_FORBIDDEN_DATA_URI.filter(prefixDataURI => content.startsWith(prefixDataURI)).length;
 								if (forbiddenPrefixFound) {
 									return "";

+ 69 - 15
lib/single-file/single-file-util.js

@@ -236,21 +236,16 @@ this.singlefile.lib.util = this.singlefile.lib.util || (() => {
 					return { data: options.asBinary ? "data:null;base64," : "", resourceURL };
 				}
 				resourceURL = response.url || resourceURL;
-				let contentType = response.headers.get("content-type");
-				let charset;
-				if (contentType) {
-					const matchContentType = contentType.toLowerCase().split(";");
-					contentType = matchContentType[0].trim();
-					if (!contentType.includes("/")) {
-						contentType = null;
-					}
-					const charsetValue = matchContentType[1] && matchContentType[1].trim();
-					if (charsetValue) {
-						const matchCharset = charsetValue.match(/^charset=(.*)/);
-						if (matchCharset && matchCharset[1]) {
-							charset = helper.removeQuotes(matchCharset[1].trim());
-						}
-					}
+				let contentType = "", charset;
+				try {
+					const mimeType = new vendor.MIMEType(response.headers.get("content-type"));
+					contentType = mimeType.type + "/" + mimeType.subtype;
+					charset = mimeType.parameters.get("charset");
+				} catch (error) {
+					// ignored
+				}
+				if (!contentType) {
+					contentType = guessMIMEType(options.expectedType, buffer);
 				}
 				if (!charset && options.charset) {
 					charset = options.charset;
@@ -303,6 +298,65 @@ this.singlefile.lib.util = this.singlefile.lib.util || (() => {
 		}
 	};
 
+	/**
+	 * Guesses the MIME type of a resource from the first bytes of its content, using the image
+	 * and font signatures of the WHATWG MIME Sniffing standard.
+	 * Content that is missing or shorter than a signature simply does not match.
+	 *
+	 * @param {string} expectedType - "image" or "font"; any other value never matches
+	 * @param {ArrayBuffer|ArrayBufferView} buffer - content of the resource, may be empty or missing
+	 * @returns {string|undefined} the sniffed MIME type, or undefined when no signature matches
+	 */
+	function guessMIMEType(expectedType, buffer) {
+		// a null entry is a wildcard: the byte found at this offset is not compared
+		const ANY = null;
+		const IMAGE_SIGNATURES = [
+			["image/x-icon", [0, 0, 1, 0]], // Windows icon
+			["image/x-icon", [0, 0, 2, 0]], // Windows cursor
+			["image/bmp", [66, 77]], // "BM"
+			["image/gif", [71, 73, 70, 56, 55, 97]], // "GIF87a"
+			["image/gif", [71, 73, 70, 56, 57, 97]], // "GIF89a"
+			["image/webp", [82, 73, 70, 70, ANY, ANY, ANY, ANY, 87, 69, 66, 80, 86, 80]], // "RIFF????WEBPVP"
+			["image/png", [137, 80, 78, 71, 13, 10, 26, 10]], // 0x89 "PNG" CR LF 0x1A LF
+			["image/jpeg", [255, 216, 255]]
+		];
+		const FONT_SIGNATURES = [
+			["application/vnd.ms-fontobject", [...new Array(34).fill(ANY), 76, 80]], // 34 bytes, then "LP"
+			["font/ttf", [0, 1, 0, 0]],
+			["font/otf", [79, 84, 84, 79]], // "OTTO"
+			["font/collection", [116, 116, 99, 102]], // "ttcf"
+			["font/woff", [119, 79, 70, 70]], // "wOFF"
+			["font/woff2", [119, 79, 70, 50]] // "wOF2"
+		];
+		// only images and fonts are sniffed
+		let signatures = [];
+		if (expectedType == "image") {
+			signatures = IMAGE_SIGNATURES;
+		} else if (expectedType == "font") {
+			signatures = FONT_SIGNATURES;
+		}
+		if (!buffer || !signatures.length) {
+			return;
+		}
+		// view the content without copying it, whether it is an ArrayBuffer or a typed array
+		const bytes = ArrayBuffer.isView(buffer) ?
+			new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength) :
+			new Uint8Array(buffer);
+		for (const [mimeType, pattern] of signatures) {
+			if (matchesPattern(pattern)) {
+				return mimeType;
+			}
+		}
+
+		function matchesPattern(pattern) {
+			// content shorter than the signature cannot match (and must not throw)
+			if (bytes.length < pattern.length) {
+				return false;
+			}
+			return pattern.every((value, index) => value === ANY || bytes[index] == value);
+		}
+	}
+
 	// https://developer.mozilla.org/en-US/docs/Web/API/SubtleCrypto/digest
 	function hex(buffer) {
 		const hexCodes = [];

+ 450 - 0
lib/single-file/vendor/mime-type-parser.js

@@ -0,0 +1,450 @@
+/*
+ * The MIT License (MIT)
+ * 
+ * Author: Gildas Lormeau
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// derived from https://github.com/jsdom/whatwg-mimetype
+
+/* 
+ * Copyright © 2017–2018 Domenic Denicola <d@domenic.me>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+this.singlefile.lib.vendor.MIMEType = this.singlefile.lib.vendor.MIMEType || (() => {
+
+	"use strict";
+
+	let utils, parser, serializer, MIMEType;
+
+	// lib/utils.js
+	{
+		// Characters treated as HTTP whitespace: space, tab, line feed and carriage return.
+		const HTTP_WHITESPACE_CHARS = [" ", "\t", "\n", "\r"];
+
+		utils = {
+			// Returns the string without HTTP whitespace at its start and at its end.
+			removeLeadingAndTrailingHTTPWhitespace(string) {
+				const withoutLeading = string.replace(/^[ \t\n\r]+/, "");
+				return withoutLeading.replace(/[ \t\n\r]+$/, "");
+			},
+
+			// Returns the string without HTTP whitespace at its end.
+			removeTrailingHTTPWhitespace(string) {
+				return string.replace(/[ \t\n\r]+$/, "");
+			},
+
+			// Tells whether the given character is an HTTP whitespace character.
+			isHTTPWhitespaceChar(char) {
+				return HTTP_WHITESPACE_CHARS.includes(char);
+			},
+
+			// Tells whether the string is made only of HTTP token code points (true for "").
+			solelyContainsHTTPTokenCodePoints(string) {
+				return /^[-!#$%&'*+.^_`|~A-Za-z0-9]*$/.test(string);
+			},
+
+			// Tells whether the string is made only of HTTP quoted-string token code points (true for "").
+			soleyContainsHTTPQuotedStringTokenCodePoints(string) {
+				return /^[\t\u0020-\u007E\u0080-\u00FF]*$/.test(string);
+			},
+
+			// Lowercases the letters A-Z only, every other character is left untouched.
+			asciiLowercase(string) {
+				return string.replace(/[A-Z]/g, letter => letter.toLowerCase());
+			},
+
+			// Collects the content of the quoted string whose opening quote is at `position`.
+			// This variant only implements it with the extract-value flag set.
+			// Returns [value, position] where value is the unescaped content and position is the
+			// index following the closing quote (or the input length if the string is unterminated).
+			collectAnHTTPQuotedString(input, position) {
+				let collected = "";
+				// start right after the opening quote
+				let cursor = position + 1;
+
+				while (cursor < input.length) {
+					const current = input[cursor];
+					cursor++;
+
+					if (current === "\"") {
+						// closing quote: it is consumed but not collected
+						break;
+					}
+					if (current !== "\\") {
+						collected += current;
+						continue;
+					}
+					if (cursor >= input.length) {
+						// a trailing backslash is kept as is
+						collected += "\\";
+						break;
+					}
+					// the escaped character is collected without its backslash
+					collected += input[cursor];
+					cursor++;
+				}
+
+				return [collected, cursor];
+			}
+		};
+	}
+
+	// lib/serializer.js
+	{
+		const isHTTPToken = utils.solelyContainsHTTPTokenCodePoints;
+
+		// Wraps a parameter value in double quotes, escaping the quotes and backslashes it contains.
+		const quoteValue = value => "\"" + value.replace(/(["\\])/g, "\\$1") + "\"";
+
+		// Serializes a MIME type (a parsed record or a MIMEType instance) as "type/subtype"
+		// followed by one ";name=value" pair per parameter.
+		serializer = mimeType => {
+			const essence = `${mimeType.type}/${mimeType.subtype}`;
+			if (mimeType.parameters.size === 0) {
+				return essence;
+			}
+
+			let serialization = essence;
+			for (const [name, value] of mimeType.parameters) {
+				// empty values and values that are not plain tokens must be quoted
+				const mustBeQuoted = !isHTTPToken(value) || value.length === 0;
+				serialization += ";" + name + "=" + (mustBeQuoted ? quoteValue(value) : value);
+			}
+			return serialization;
+		};
+	}
+
+	// lib/parser.js
+	{
+		const {
+			removeLeadingAndTrailingHTTPWhitespace,
+			removeTrailingHTTPWhitespace,
+			isHTTPWhitespaceChar,
+			solelyContainsHTTPTokenCodePoints,
+			soleyContainsHTTPQuotedStringTokenCodePoints,
+			asciiLowercase,
+			collectAnHTTPQuotedString
+		} = utils;
+
+		// Parses a MIME type string into a record { type, subtype, parameters }, where type and
+		// subtype are ASCII-lowercased and parameters is a Map of lowercased names to values.
+		// Returns null when the "/" is missing, or when the type or the subtype is empty or
+		// contains characters that are not HTTP token code points.
+		parser = input => {
+			input = removeLeadingAndTrailingHTTPWhitespace(input);
+
+			// The type is everything found before the first "/"
+			let position = 0;
+			let type = "";
+			while (position < input.length && input[position] !== "/") {
+				type += input[position];
+				++position;
+			}
+
+			if (type.length === 0 || !solelyContainsHTTPTokenCodePoints(type)) {
+				return null;
+			}
+
+			// The end of the input was reached without finding any "/"
+			if (position >= input.length) {
+				return null;
+			}
+
+			// Skips past "/"
+			++position;
+
+			// The subtype is everything found before the first ";", minus trailing whitespace
+			let subtype = "";
+			while (position < input.length && input[position] !== ";") {
+				subtype += input[position];
+				++position;
+			}
+
+			subtype = removeTrailingHTTPWhitespace(subtype);
+
+			if (subtype.length === 0 || !solelyContainsHTTPTokenCodePoints(subtype)) {
+				return null;
+			}
+
+			const mimeType = {
+				type: asciiLowercase(type),
+				subtype: asciiLowercase(subtype),
+				parameters: new Map()
+			};
+
+			// Each iteration consumes one ";name=value" parameter; invalid ones are skipped
+			while (position < input.length) {
+				// Skip past ";"
+				++position;
+
+				// Whitespace found before the parameter name is ignored
+				while (isHTTPWhitespaceChar(input[position])) {
+					++position;
+				}
+
+				let parameterName = "";
+				while (position < input.length && input[position] !== ";" && input[position] !== "=") {
+					parameterName += input[position];
+					++position;
+				}
+				parameterName = asciiLowercase(parameterName);
+
+				if (position < input.length) {
+					// A name directly followed by ";" has no value: the parameter is dropped
+					if (input[position] === ";") {
+						continue;
+					}
+
+					// Skip past "="
+					++position;
+				}
+
+				let parameterValue = null;
+				if (input[position] === "\"") {
+					// Quoted value: the content is unescaped by collectAnHTTPQuotedString
+					[parameterValue, position] = collectAnHTTPQuotedString(input, position);
+
+					// Anything found between the closing quote and the next ";" is ignored
+					while (position < input.length && input[position] !== ";") {
+						++position;
+					}
+				} else {
+					// Unquoted value: everything up to the next ";", minus trailing whitespace
+					parameterValue = "";
+					while (position < input.length && input[position] !== ";") {
+						parameterValue += input[position];
+						++position;
+					}
+
+					parameterValue = removeTrailingHTTPWhitespace(parameterValue);
+
+					// Unquoted empty values are dropped (a quoted empty value "" is kept)
+					if (parameterValue === "") {
+						continue;
+					}
+				}
+
+				// Only valid parameters are stored, and the first occurrence of a name wins
+				if (parameterName.length > 0 &&
+					solelyContainsHTTPTokenCodePoints(parameterName) &&
+					soleyContainsHTTPQuotedStringTokenCodePoints(parameterValue) &&
+					!mimeType.parameters.has(parameterName)) {
+					mimeType.parameters.set(parameterName, parameterValue);
+				}
+			}
+
+			return mimeType;
+		};
+	}
+
+	// lib/mime-type.js
+	{		
+		const parse = parser;
+		const serialize = serializer;
+		const {
+			asciiLowercase,
+			solelyContainsHTTPTokenCodePoints,
+			soleyContainsHTTPQuotedStringTokenCodePoints
+		} = utils;
+
+		// Subtypes identifying JavaScript when the type is "text".
+		const TEXT_JAVASCRIPT_SUBTYPES = new Set([
+			"ecmascript",
+			"javascript",
+			"javascript1.0",
+			"javascript1.1",
+			"javascript1.2",
+			"javascript1.3",
+			"javascript1.4",
+			"javascript1.5",
+			"jscript",
+			"livescript",
+			"x-ecmascript",
+			"x-javascript"
+		]);
+
+		// Subtypes identifying JavaScript when the type is "application".
+		const APPLICATION_JAVASCRIPT_SUBTYPES = new Set([
+			"ecmascript",
+			"javascript",
+			"x-ecmascript",
+			"x-javascript"
+		]);
+
+		// MIME type built from a string: exposes the type, the subtype and the parameters, and
+		// serializes back to a string with toString().
+		MIMEType = class MIMEType {
+			// Parses the given value (converted to a string); throws an Error if it cannot be parsed.
+			constructor(string) {
+				const input = String(string);
+				const record = parse(input);
+				if (record === null) {
+					throw new Error(`Could not parse MIME type string "${input}"`);
+				}
+
+				const { type, subtype, parameters } = record;
+				this._type = type;
+				this._subtype = subtype;
+				this._parameters = new MIMETypeParameters(parameters);
+			}
+
+			// Same as the constructor, but returns null instead of throwing.
+			static parse(string) {
+				let instance = null;
+				try {
+					instance = new this(string);
+				} catch (e) {
+					// the value cannot be parsed: null is returned
+				}
+				return instance;
+			}
+
+			// "type/subtype", without the parameters.
+			get essence() {
+				return `${this.type}/${this.subtype}`;
+			}
+
+			get type() {
+				return this._type;
+			}
+
+			// Stores the lowercased value; throws if it is empty or is not an HTTP token.
+			set type(value) {
+				const normalizedType = asciiLowercase(String(value));
+
+				if (normalizedType.length === 0) {
+					throw new Error("Invalid type: must be a non-empty string");
+				}
+				if (!solelyContainsHTTPTokenCodePoints(normalizedType)) {
+					throw new Error(`Invalid type ${normalizedType}: must contain only HTTP token code points`);
+				}
+
+				this._type = normalizedType;
+			}
+
+			get subtype() {
+				return this._subtype;
+			}
+
+			// Stores the lowercased value; throws if it is empty or is not an HTTP token.
+			set subtype(value) {
+				const normalizedSubtype = asciiLowercase(String(value));
+
+				if (normalizedSubtype.length === 0) {
+					throw new Error("Invalid subtype: must be a non-empty string");
+				}
+				if (!solelyContainsHTTPTokenCodePoints(normalizedSubtype)) {
+					throw new Error(`Invalid subtype ${normalizedSubtype}: must contain only HTTP token code points`);
+				}
+
+				this._subtype = normalizedSubtype;
+			}
+
+			get parameters() {
+				return this._parameters;
+			}
+
+			toString() {
+				// serialize() accepts this instance because it exposes the same type, subtype and
+				// parameters members as the records returned by parse()
+				return serialize(this);
+			}
+
+			// Tells whether this is a JavaScript MIME type. Unless allowParameters is set, a type
+			// carrying parameters is not considered as JavaScript.
+			isJavaScript({ allowParameters = false } = {}) {
+				let knownSubtype = false;
+				if (this._type === "text") {
+					knownSubtype = TEXT_JAVASCRIPT_SUBTYPES.has(this._subtype);
+				} else if (this._type === "application") {
+					knownSubtype = APPLICATION_JAVASCRIPT_SUBTYPES.has(this._subtype);
+				}
+				if (!knownSubtype) {
+					return false;
+				}
+				return allowParameters || this._parameters.size === 0;
+			}
+
+			// True for text/xml, application/xml and any subtype ending with "+xml".
+			isXML() {
+				if (this._subtype.endsWith("+xml")) {
+					return true;
+				}
+				return this._subtype === "xml" && (this._type === "text" || this._type === "application");
+			}
+
+			// True for text/html only.
+			isHTML() {
+				return this._type === "text" && this._subtype === "html";
+			}
+		};
+
+		// Map-like view over the parameters of a MIME type. Names are converted to strings and
+		// ASCII-lowercased before any lookup, and set() validates both the name and the value.
+		class MIMETypeParameters {
+			// The given map is used as the backing store, it is not copied.
+			constructor(map) {
+				this._map = map;
+			}
+
+			get size() {
+				return this._map.size;
+			}
+
+			get(name) {
+				return this._map.get(asciiLowercase(String(name)));
+			}
+
+			has(name) {
+				return this._map.has(asciiLowercase(String(name)));
+			}
+
+			// Stores the parameter; throws if the name is not an HTTP token or if the value
+			// contains characters that are not allowed in an HTTP quoted string.
+			set(name, value) {
+				const parameterName = asciiLowercase(String(name));
+				const parameterValue = String(value);
+
+				if (!solelyContainsHTTPTokenCodePoints(parameterName)) {
+					throw new Error(`Invalid MIME type parameter name "${parameterName}": only HTTP token code points are valid.`);
+				}
+				if (!soleyContainsHTTPQuotedStringTokenCodePoints(parameterValue)) {
+					throw new Error(`Invalid MIME type parameter value "${parameterValue}": only HTTP quoted-string token code points are valid.`);
+				}
+
+				return this._map.set(parameterName, parameterValue);
+			}
+
+			clear() {
+				this._map.clear();
+			}
+
+			delete(name) {
+				return this._map.delete(asciiLowercase(String(name)));
+			}
+
+			// The callback is called by the backing map with (value, name, map).
+			forEach(callbackFn, thisArg) {
+				this._map.forEach(callbackFn, thisArg);
+			}
+
+			keys() {
+				return this._map.keys();
+			}
+
+			values() {
+				return this._map.values();
+			}
+
+			entries() {
+				return this._map.entries();
+			}
+
+			// Iterates over the [name, value] pairs of the backing map.
+			[Symbol.iterator]() {
+				return this._map[Symbol.iterator]();
+			}
+		}
+
+	}
+
+	return MIMEType;
+
+})();

+ 1 - 0
manifest.json

@@ -64,6 +64,7 @@
 			"lib/single-file/vendor/html-srcset-parser.js",
 			"lib/single-file/vendor/css-font-property-parser.js",
 			"lib/single-file/vendor/css-unescape.js",
+			"lib/single-file/vendor/mime-type-parser.js",
 			"lib/single-file/single-file-util.js",
 			"lib/single-file/single-file-helper.js",
 			"lib/single-file/modules/css-fonts-minifier.js",