Преглед изворни кода

refactored module to expose a JS API

Gildas пре 5 година
родитељ
комит
2a2c625b6f
3 измењених фајлова са 221 додато и 165 уклоњено
  1. 13 164
      cli/single-file
  2. 207 0
      cli/singlefile-cli-core.js
  3. 1 1
      package.json

+ 13 - 164
cli/single-file

@@ -23,177 +23,26 @@
  *   Source.
  */
 
-/* global require, module, URL */
-
-const VALID_URL_TEST = /^(https?|file):\/\//;
+/* global require */
 
 const fileUrl = require("file-url");
 const fs = require("fs");
-const options = require("./args");
-
-const backEnds = {
-	jsdom: "./back-ends/jsdom.js",
-	puppeteer: "./back-ends/puppeteer.js",
-	"puppeteer-firefox": "./back-ends/puppeteer-firefox.js",
-	"webdriver-chromium": "./back-ends/webdriver-chromium.js",
-	"webdriver-gecko": "./back-ends/webdriver-gecko.js"
-};
-if (options.url && !VALID_URL_TEST.test(options.url)) {
-	options.url = fileUrl(options.url);
-}
-options.retrieveLinks = true;
-options.browserScripts = options.browserScripts.map(path => require.resolve(path));
-const backend = require(backEnds[options.backEnd]);
-run(options);
-
-module.exports = run;
+run(require("./args"));
 
 async function run(options) {
-	await backend.initialize(options);
-	let tasks;
-	if (options.urlsFile) {
-		tasks = fs.readFileSync(options.urlsFile).toString().split("\n")
-			.map(url => createTask(url));
-	} else {
-		tasks = [createTask(options.url)];
-	}
-	tasks = tasks.filter(task => task);
-	await runTasks(tasks);
-	if (options.crawlReplaceURLs) {
-		tasks.forEach(task => {
-			try {
-				let pageContent = fs.readFileSync(task.filename).toString();
-				tasks.forEach(otherTask => {
-					pageContent = pageContent.replace(new RegExp(escapeRegExp("\"" + otherTask.originalUrl + "\""), "gi"), "\"" + otherTask.filename + "\"");
-					pageContent = pageContent.replace(new RegExp(escapeRegExp("'" + otherTask.originalUrl + "'"), "gi"), "'" + otherTask.filename + "'");
-					const filename = otherTask.filename.replace(/ /g, "%20");
-					pageContent = pageContent.replace(new RegExp(escapeRegExp("=" + otherTask.originalUrl + " "), "gi"), "=" + filename + " ");
-					pageContent = pageContent.replace(new RegExp(escapeRegExp("=" + otherTask.originalUrl + ">"), "gi"), "=" + filename + ">");
-				});
-				fs.writeFileSync(task.filename, pageContent);
-			} catch (error) {
-				// ignored
-			}
-		});
-	}
-	if (!options.browserDebug) {
-		return backend.closeBrowser();
-	}
-}
-
-async function runTasks(tasks) {
-	const availableTasks = tasks.filter(task => !task.status).length;
-	const processingTasks = tasks.filter(task => task.status == "processing").length;
-	const promisesTasks = [];
-	for (let workerIndex = 0; workerIndex < Math.min(availableTasks, options.maxParallelWorkers - processingTasks); workerIndex++) {
-		promisesTasks.push(runNextTask(tasks));
-	}
-	return Promise.all(promisesTasks);
-}
-
-async function runNextTask(tasks) {
-	const task = tasks.find(task => !task.status);
-	if (task) {
-		let taskOptions = JSON.parse(JSON.stringify(options));
-		taskOptions.url = task.url;
-		task.status = "processing";
-		const pageData = await capturePage(taskOptions);
-		task.status = "processed";
-		if (pageData) {
-			task.filename = pageData.filename;
-			if (options.crawlLinks && testMaxDepth(task)) {
-				let newTasks = pageData.links
-					.map(urlLink => createTask(urlLink, task, tasks[0]))
-					.filter(task => task &&
-						testMaxDepth(task) &&
-						!tasks.find(otherTask => otherTask.url == task.url) &&
-						(!options.crawlInnerLinksOnly || task.isInnerLink));
-				tasks.splice(tasks.length, 0, ...newTasks);
-			}
-		}
-		await runTasks(tasks);
+	const singlefile = await require("./singlefile-cli-core")(options);
+	let urls;
+	if (options.url && !singlefile.VALID_URL_TEST.test(options.url)) {
+		options.url = fileUrl(options.url);
 	}
-}
-
-function testMaxDepth(task) {
-	return (options.crawlMaxDepth == 0 || task.depth < options.crawlMaxDepth) &&
-		(options.crawlExternalLinksMaxDepth == 0 || task.externalLinkDepth < options.crawlExternalLinksMaxDepth);
-}
-
-function createTask(url, parentTask, rootTask) {
-	url = parentTask ? rewriteURL(url) : url;
-	if (VALID_URL_TEST.test(url)) {
-		const isInnerLink = rootTask && url.startsWith(getHostURL(rootTask.url));
-		return {
-			url,
-			isInnerLink,
-			originalUrl: url,
-			depth: parentTask ? parentTask.depth + 1 : 0,
-			externalLinkDepth: isInnerLink ? -1 : parentTask ? parentTask.externalLinkDepth + 1 : -1
-		};
-	}
-}
-
-function rewriteURL(url) {
-	url = url.trim();
-	if (options.crawlRemoveURLFragment) {
-		url = url.replace(/^(.*?)#.*$/, "$1");
-	}
-	options.crawlRewriteRules.forEach(rewriteRule => {
-		const parts = rewriteRule.trim().split(/ +/);
-		if (parts.length) {
-			url = url.replace(new RegExp(parts[0]), parts[1] || "").trim();
-		}
-	});
-	return url;
-}
-
-function getHostURL(url) {
-	url = new URL(url);
-	return url.protocol + "//" + (url.username ? url.username + (url.password || "") + "@" : "") + url.hostname;
-}
-
-async function capturePage(options) {
-	try {
-		const pageData = await backend.getPageData(options);
-		if (options.output) {
-			fs.writeFileSync(getFilename(options.output), pageData.content);
-		} else {
-			if (options.filenameTemplate && pageData.filename) {
-				fs.writeFileSync(getFilename(pageData.filename), pageData.content);
-			} else {
-				console.log(pageData.content); // eslint-disable-line no-console
-			}
-		}
-		return pageData;
-	} catch (error) {
-		const message = "URL: " + options.url + "\nStack: " + error.stack + "\n";
-		if (options.errorFile) {
-			fs.writeFileSync(options.errorFile, message, { flag: "a" });
-		} else {
-			console.error(message); // eslint-disable-line no-console
-		}
-	}
-}
-
-function getFilename(filename, index = 1) {
-	let newFilename = filename;
-	if (index > 1) {
-		const regExpMatchExtension = /(\.[^.]+)$/;
-		const matchExtension = newFilename.match(regExpMatchExtension);
-		if (matchExtension && matchExtension[1]) {
-			newFilename = newFilename.replace(regExpMatchExtension, " - " + index + matchExtension[1]);
-		} else {
-			newFilename += " - " + index;
-		}
-	}
-	if (fs.existsSync(newFilename)) {
-		return getFilename(filename, index + 1);
+	if (options.urlsFile) {
+		urls = fs.readFileSync(options.urlsFile).toString().split("\n");
 	} else {
-		return newFilename;
+		urls = [options.url];
 	}
-}
+	options.retrieveLinks = true;
+	options.browserScripts = options.browserScripts.map(path => require.resolve(path));
 
-function escapeRegExp(string) {
-	return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+	await singlefile.capture(urls);
+	await singlefile.finish();
 }

+ 207 - 0
cli/singlefile-cli-core.js

@@ -0,0 +1,207 @@
+#!/usr/bin/env node
+
+/*
+ * Copyright 2010-2020 Gildas Lormeau
+ * contact : gildas.lormeau <at> gmail.com
+ *
+ * This file is part of SingleFile.
+ *
+ *   The code in this file is free software: you can redistribute it and/or
+ *   modify it under the terms of the GNU Affero General Public License
+ *   (GNU AGPL) as published by the Free Software Foundation, either version 3
+ *   of the License, or (at your option) any later version.
+ *
+ *   The code in this file is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
+ *   General Public License for more details.
+ *
+ *   As additional permission under GNU AGPL version 3 section 7, you may
+ *   distribute UNMODIFIED VERSIONS OF THIS file without the copy of the GNU
+ *   AGPL normally required by section 4, provided you include this license
+ *   notice and a URL through which recipients can access the Corresponding
+ *   Source.
+ */
+
+/* global require, module, URL */
+
+const fs = require("fs");
+const VALID_URL_TEST = /^(https?|file):\/\//;
+
+const backEnds = { // maps an options.backEnd name to the module path that initialize() passes to require()
+	jsdom: "./back-ends/jsdom.js",
+	puppeteer: "./back-ends/puppeteer.js",
+	"puppeteer-firefox": "./back-ends/puppeteer-firefox.js",
+	"webdriver-chromium": "./back-ends/webdriver-chromium.js",
+	"webdriver-gecko": "./back-ends/webdriver-gecko.js"
+};
+
+let backend, tasks = [], maxParallelWorkers = 8; // module-level state shared by every call; initialize() overwrites backend and maxParallelWorkers, capture() appends to tasks
+module.exports = initialize; // the module's only export is the async initialize() factory
+
+async function initialize(options) { // module entry point: loads and initializes the selected back end, then resolves to an object exposing capture(), finish() and VALID_URL_TEST
+	maxParallelWorkers = options.maxParallelWorkers; // replaces the module-level default of 8. NOTE(review): if this option is undefined, runTasks() computes NaN for its loop bound and starts no workers
+	backend = require(backEnds[options.backEnd]); // a name missing from backEnds yields require(undefined), which throws
+	await backend.initialize(options);
+	return {
+		capture: urls => capture(urls, options), // the options given here are bound to every capture() call
+		finish: () => finish(options),
+		VALID_URL_TEST // exposed so callers can apply the same URL test this module uses
+	};
+}
+
+async function capture(urls, options) { // queues one task per valid URL in urls and waits for the task runner to finish
+	let newTasks;
+	newTasks = urls.map(url => createTask(url, options)); // createTask() returns undefined for a URL that fails VALID_URL_TEST
+	newTasks = newTasks.filter(task => task); // drop those undefined entries
+	if (newTasks.length) {
+		tasks = tasks.concat(newTasks); // appended to the module-level queue, so tasks from earlier calls are kept
+		await runTasks();
+	}
+}
+
+async function finish(options) { // waits for all started captures, optionally rewrites links between the saved pages, then closes the browser
+	const promiseTasks = tasks.map(task => task.promise); // task.promise is set in runNextTask(); tasks that never started contribute undefined
+	await Promise.all(promiseTasks);
+	if (options.crawlReplaceURLs) {
+		tasks.forEach(task => {
+			try {
+				let pageContent = fs.readFileSync(task.filename).toString(); // task.filename is only set when the capture returned page data; otherwise this throws and the catch below skips the task
+				tasks.forEach(otherTask => { // replace references to every queued URL with that task's saved filename
+					pageContent = pageContent.replace(new RegExp(escapeRegExp("\"" + otherTask.originalUrl + "\""), "gi"), "\"" + otherTask.filename + "\""); // double-quoted occurrences
+					pageContent = pageContent.replace(new RegExp(escapeRegExp("'" + otherTask.originalUrl + "'"), "gi"), "'" + otherTask.filename + "'"); // single-quoted occurrences
+					const filename = otherTask.filename.replace(/ /g, "%20"); // spaces are percent-encoded for the unquoted forms below. NOTE(review): throws when otherTask has no filename, so a single failed capture aborts the rewrite of every page
+					pageContent = pageContent.replace(new RegExp(escapeRegExp("=" + otherTask.originalUrl + " "), "gi"), "=" + filename + " "); // unquoted value followed by a space
+					pageContent = pageContent.replace(new RegExp(escapeRegExp("=" + otherTask.originalUrl + ">"), "gi"), "=" + filename + ">"); // unquoted value followed by ">"
+				});
+				fs.writeFileSync(task.filename, pageContent); // overwrite the saved page with the rewritten content
+			} catch (error) {
+				// ignored
+			}
+		});
+	}
+	if (!options.browserDebug) {
+		return backend.closeBrowser(); // the browser is left open when options.browserDebug is set
+	}
+}
+
+async function runTasks() { // starts as many runNextTask() workers as the pending tasks and the parallelism limit allow, and waits for them
+	const availableTasks = tasks.filter(task => !task.status).length; // tasks not started yet (no status assigned)
+	const processingTasks = tasks.filter(task => task.status == "processing").length; // captures currently in flight
+	const promisesTasks = [];
+	for (let workerIndex = 0; workerIndex < Math.min(availableTasks, maxParallelWorkers - processingTasks); workerIndex++) { // bound = free worker slots, capped by the number of pending tasks
+		promisesTasks.push(runNextTask());
+	}
+	return Promise.all(promisesTasks);
+}
+
+async function runNextTask() { // captures the first unstarted task, queues the links found on it when crawling is enabled, then re-enters runTasks()
+	const task = tasks.find(task => !task.status);
+	if (task) {
+		const options = task.options;
+		let taskOptions = JSON.parse(JSON.stringify(options)); // per-task copy made by a JSON round trip, so only JSON-serializable option values carry over
+		taskOptions.url = task.url;
+		task.status = "processing"; // set before the first await so no other worker picks this task
+		task.promise = capturePage(taskOptions); // stored so finish() can await it
+		const pageData = await task.promise;
+		task.status = "processed";
+		if (pageData) { // capturePage() resolves to undefined when the capture threw
+			task.filename = pageData.filename;
+			if (options.crawlLinks && testMaxDepth(task)) {
+				let newTasks = pageData.links
+					.map(urlLink => createTask(urlLink, options, task, tasks[0])) // tasks[0] is used as the root task for the inner-link test
+					.filter(task => task &&
+						testMaxDepth(task) &&
+						!tasks.find(otherTask => otherTask.url == task.url) && // skip URLs already queued; duplicates inside this same batch are not detected
+						(!options.crawlInnerLinksOnly || task.isInnerLink));
+				tasks.splice(tasks.length, 0, ...newTasks); // append to the shared queue in place
+			}
+		}
+		await runTasks(); // pick up newly queued or still-pending tasks
+	}
+}
+
+function testMaxDepth(task) { // true while the task is below both crawl depth limits of its own options; a limit equal to 0 disables that check
+	const options = task.options;
+	return (options.crawlMaxDepth == 0 || task.depth < options.crawlMaxDepth) &&
+		(options.crawlExternalLinksMaxDepth == 0 || task.externalLinkDepth < options.crawlExternalLinksMaxDepth);
+}
+
+function createTask(url, options, parentTask, rootTask) { // builds a task object for url, or returns undefined when url fails VALID_URL_TEST
+	url = parentTask ? rewriteURL(url, options.crawlRemoveURLFragment, options.crawlRewriteRules) : url; // only crawled links (those with a parent task) are rewritten
+	if (VALID_URL_TEST.test(url)) {
+		const isInnerLink = rootTask && url.startsWith(getHostURL(rootTask.url)); // undefined for tasks created without a root task
+		return {
+			url,
+			isInnerLink,
+			originalUrl: url, // NOTE(review): same value as url, i.e. the URL after rewriteURL(), not the link text as found in the page
+			depth: parentTask ? parentTask.depth + 1 : 0,
+			externalLinkDepth: isInnerLink ? -1 : parentTask ? parentTask.externalLinkDepth + 1 : -1, // -1 for inner links and for tasks without a parent
+			options
+		};
+	}
+}
+
+function rewriteURL(url, crawlRemoveURLFragment, crawlRewriteRules) { // trims url, optionally strips its fragment, then applies each rewrite rule in order and returns the result
+	url = url.trim();
+	if (crawlRemoveURLFragment) {
+		url = url.replace(/^(.*?)#.*$/, "$1"); // keep only what precedes the first "#"
+	}
+	crawlRewriteRules.forEach(rewriteRule => {
+		const parts = rewriteRule.trim().split(/ +/); // a rule is a pattern and an optional replacement separated by spaces
+		if (parts.length) { // always true: split() returns at least one element
+			url = url.replace(new RegExp(parts[0]), parts[1] || "").trim(); // first match only (no "g" flag); a missing replacement removes the match
+		}
+	});
+	return url;
+}
+
+function getHostURL(url) { // returns the protocol, optional credentials and hostname of the given URL string, used by createTask() as the inner-link prefix
+	url = new URL(url); // throws if the string cannot be parsed as a URL
+	return url.protocol + "//" + (url.username ? url.username + (url.password || "") + "@" : "") + url.hostname; // NOTE(review): username and password are joined without a ":" separator and the port is not included; confirm this is intended
+}
+
+async function capturePage(options) { // gets the page data from the back end and writes its content to a file or to stdout; errors are reported instead of thrown
+	try {
+		const pageData = await backend.getPageData(options);
+		if (options.output) {
+			fs.writeFileSync(getFilename(options.output), pageData.content); // getFilename() picks a name that does not exist yet, so nothing is overwritten
+		} else {
+			if (options.filenameTemplate && pageData.filename) {
+				fs.writeFileSync(getFilename(pageData.filename), pageData.content); // NOTE(review): the name actually written may carry a " - N" suffix, while runNextTask() records pageData.filename unchanged
+			} else {
+				console.log(pageData.content); // eslint-disable-line no-console
+			}
+		}
+		return pageData;
+	} catch (error) {
+		const message = "URL: " + options.url + "\nStack: " + error.stack + "\n";
+		if (options.errorFile) {
+			fs.writeFileSync(options.errorFile, message, { flag: "a" }); // flag "a" appends, so earlier error reports are kept
+		} else {
+			console.error(message); // eslint-disable-line no-console
+		}
+	} // after an error the function falls through and resolves to undefined
+}
+
+function getFilename(filename, index = 1) { // returns filename if no such file exists, otherwise the first variant with a " - N" counter (N starting at 2) that does not exist
+	let newFilename = filename;
+	if (index > 1) {
+		const regExpMatchExtension = /(\.[^.]+)$/; // captures the final dot-separated segment as the extension
+		const matchExtension = newFilename.match(regExpMatchExtension);
+		if (matchExtension && matchExtension[1]) {
+			newFilename = newFilename.replace(regExpMatchExtension, " - " + index + matchExtension[1]); // insert the counter before the extension
+		} else {
+			newFilename += " - " + index; // no extension: append the counter
+		}
+	}
+	if (fs.existsSync(newFilename)) {
+		return getFilename(filename, index + 1); // retry from the original name with the next counter
+	} else {
+		return newFilename;
+	}
+}
+
+function escapeRegExp(string) { // prefixes each regular-expression metacharacter in string with a backslash so the result matches literally inside new RegExp()
+	return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}

+ 1 - 1
package.json

@@ -4,7 +4,7 @@
 	"description": "SingleFile",
 	"author": "Gildas Lormeau",
 	"license": "AGPL-3.0-or-later",
-	"main": "cli/single-file",
+	"main": "cli/singlefile-cli-core.js",
 	"bin": {
 		"single-file": "./cli/single-file"
 	},