
added --crawl-load-session, --crawl-save-session and --crawl-sync-session

Gildas committed 5 years ago
commit 5972d92923
2 changed files with 38 additions and 3 deletions
  1. cli/args.js (+6 -0)
  2. cli/single-file-cli-api.js (+32 -3)
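
The commit wires three new session flags into the crawler. As a rough orientation before the diff, here is a hedged sketch of how the new options might reach the API modified in cli/single-file-cli-api.js; the backend name is taken from the keys visible below, while the URL, session file name and any omitted options (output settings, browser configuration, etc.) are illustrative:

```js
// Hedged sketch, not part of the commit: driving the API with the new session options.
const initialize = require("./cli/single-file-cli-api.js");

(async () => {
	const singlefile = await initialize({
		backEnd: "webdriver-gecko",       // one of the keys declared in backEnds
		maxParallelWorkers: 8,
		crawlLinks: true,
		crawlSyncSession: "session.json"  // load a previous session and keep it updated
	});
	await singlefile.capture(["https://example.com"]);
	await singlefile.finish();
})();
```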

cli/args.js (+6 -0)

@@ -110,8 +110,14 @@ const args = require("yargs")
 	.boolean("crawl-links")
 	.options("crawl-inner-links-only", { description: "Crawl pages found via inner links only if they are hosted on the same domain" })
 	.boolean("crawl-inner-links-only")
+	.options("crawl-load-session", { description: "Name of the file of the session to load (previously saved with --crawl-save-session or --crawl-sync-session)" })
+	.string("crawl-load-session")
 	.options("crawl-remove-url-fragment", { description: "Remove URL fragments found in links" })
 	.boolean("crawl-remove-url-fragment")
+	.options("crawl-save-session", { description: "Name of the file where to save the state of the session" })
+	.string("crawl-save-session")
+	.options("crawl-sync-session", { description: "Name of the file where to load and save the state of the session" })
+	.string("crawl-sync-session")
 	.options("crawl-max-depth", { description: "Max depth when crawling pages found in internal and external links (0: infinite)" })
 	.number("crawl-max-depth")
 	.options("crawl-external-links-max-depth", { description: "Max depth when crawling pages found in external links (0: infinite)" })

cli/single-file-cli-api.js (+32 -3)

@@ -34,13 +34,25 @@ const backEnds = {
 	"webdriver-gecko": "./back-ends/webdriver-gecko.js"
 };
 
-let backend, tasks = [], maxParallelWorkers = 8;
+let backend, tasks = [], maxParallelWorkers = 8, sessionFilename;
 module.exports = initialize;
 
 async function initialize(options) {
 	maxParallelWorkers = options.maxParallelWorkers;
 	backend = require(backEnds[options.backEnd]);
 	await backend.initialize(options);
+	if (options.crawlSyncSession || options.crawlLoadSession) {
+		try {
+			tasks = JSON.parse(fs.readFileSync(options.crawlSyncSession || options.crawlLoadSession).toString());
+		} catch (error) {
+			if (options.crawlLoadSession) {
+				throw error;
+			}
+		}
+	}
+	if (options.crawlSyncSession || options.crawlSaveSession) {
+		sessionFilename = options.crawlSyncSession || options.crawlSaveSession;
+	}
 	return {
 		capture: urls => capture(urls, options),
 		finish: () => finish(options),
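
In short, --crawl-sync-session acts as load-and-save and wins over the other two flags, while a missing session file aborts only when --crawl-load-session was given. A hedged restatement of that precedence (the helper name is made up for illustration and does not exist in the codebase):

```js
// Illustrative helper summarizing the precedence implemented in initialize() above.
function resolveSessionOptions(options) {
	return {
		loadFrom: options.crawlSyncSession || options.crawlLoadSession, // may be undefined
		saveTo: options.crawlSyncSession || options.crawlSaveSession,   // may be undefined
		// readFileSync errors are rethrown only when --crawl-load-session was set
		missingFileIsFatal: Boolean(options.crawlLoadSession)
	};
}
```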
@@ -50,12 +62,14 @@ async function initialize(options) {
 
 async function capture(urls, options) {
 	let newTasks;
+	const taskUrls = tasks.map(task => task.url);
 	newTasks = urls.map(url => createTask(url, options));
-	newTasks = newTasks.filter(task => task);
+	newTasks = newTasks.filter(task => task && !taskUrls.includes(task.url));
 	if (newTasks.length) {
 		tasks = tasks.concat(newTasks);
-		await runTasks();
+		saveTasks();
 	}
+	await runTasks();
 }
 
 async function finish(options) {
@@ -102,9 +116,11 @@ async function runNextTask() {
 		let taskOptions = JSON.parse(JSON.stringify(options));
 		taskOptions.url = task.url;
 		task.status = "processing";
+		saveTasks();
 		task.promise = capturePage(taskOptions);
 		const pageData = await task.promise;
 		task.status = "processed";
+		saveTasks();
 		if (pageData) {
 			task.filename = pageData.filename;
 			if (options.crawlLinks && testMaxDepth(task)) {
@@ -115,6 +131,7 @@ async function runNextTask() {
 						!tasks.find(otherTask => otherTask.url == task.url) &&
 						(!options.crawlInnerLinksOnly || task.isInnerLink));
 				tasks.splice(tasks.length, 0, ...newTasks);
+				saveTasks();
 			}
 		}
 		await runTasks();
@@ -142,6 +159,18 @@ function createTask(url, options, parentTask, rootTask) {
 	}
 }
 
+function saveTasks() {
+	if (sessionFilename) {
+		fs.writeFileSync(sessionFilename, JSON.stringify(
+			tasks.map(task => Object.assign({}, task, {
+				status: task.status == "processing" ? undefined : task.status,
+				promise: undefined,
+				options: task.status && task.status == "processed" ? undefined : task.options
+			}))
+		));
+	}
+}
+
 function rewriteURL(url, crawlRemoveURLFragment, crawlRewriteRules) {
 	url = url.trim();
 	if (crawlRemoveURLFragment) {
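
For reference, the session file written by saveTasks above is a JSON array of the task objects with their promise stripped, the transient "processing" status cleared, and options dropped once a task is processed. A hedged example of what such a file might contain; URLs, the filename and the exact set of fields are illustrative, and only fields visible in this diff are shown:

```js
// Illustrative session file contents (formatted for readability):
[
	{ "url": "https://example.com/", "status": "processed", "filename": "example.com.html" },
	{ "url": "https://example.com/page", "options": { /* per-task capture options */ } }
]
```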