
added --crawl-load-session, --crawl-save-session and --crawl-sync-session

Gildas committed 5 years ago
commit 5972d92923
2 changed files with 38 additions and 3 deletions
  1. cli/args.js (+6 -0)
  2. cli/single-file-cli-api.js (+32 -3)
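
The commit wires three new session flags into the crawler. As a rough orientation before the diff, here is a hedged sketch of how the new options might reach the API modified in cli/single-file-cli-api.js; the backend name is taken from the keys visible below, while the URL, session file name and any omitted options (output settings, browser configuration, etc.) are illustrative:

```js
// Hedged sketch, not part of the commit: driving the API with the new session options.
const initialize = require("./cli/single-file-cli-api.js");

(async () => {
	const singlefile = await initialize({
		backEnd: "webdriver-gecko",       // one of the keys declared in backEnds
		maxParallelWorkers: 8,
		crawlLinks: true,
		crawlSyncSession: "session.json"  // load a previous session and keep it updated
	});
	await singlefile.capture(["https://example.com"]);
	await singlefile.finish();
})();
```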

cli/args.js (+6 -0)

@@ -110,8 +110,14 @@ const args = require("yargs")
 	.boolean("crawl-links")
 	.options("crawl-inner-links-only", { description: "Crawl pages found via inner links only if they are hosted on the same domain" })
 	.boolean("crawl-inner-links-only")
+	.options("crawl-load-session", { description: "Name of the file of the session to load (previously saved with --crawl-save-session or --crawl-sync-session)" })
+	.string("crawl-load-session")
 	.options("crawl-remove-url-fragment", { description: "Remove URL fragments found in links" })
 	.boolean("crawl-remove-url-fragment")
+	.options("crawl-save-session", { description: "Name of the file where to save the state of the session" })
+	.string("crawl-save-session")
+	.options("crawl-sync-session", { description: "Name of the file where to load and save the state of the session" })
+	.string("crawl-sync-session")
 	.options("crawl-max-depth", { description: "Max depth when crawling pages found in internal and external links (0: infinite)" })
 	.number("crawl-max-depth")
 	.options("crawl-external-links-max-depth", { description: "Max depth when crawling pages found in external links (0: infinite)" })

cli/single-file-cli-api.js (+32 -3)

@@ -34,13 +34,25 @@ const backEnds = {
 	"webdriver-gecko": "./back-ends/webdriver-gecko.js"
 };
 
-let backend, tasks = [], maxParallelWorkers = 8;
+let backend, tasks = [], maxParallelWorkers = 8, sessionFilename;
 module.exports = initialize;
 
 async function initialize(options) {
 	maxParallelWorkers = options.maxParallelWorkers;
 	backend = require(backEnds[options.backEnd]);
 	await backend.initialize(options);
+	if (options.crawlSyncSession || options.crawlLoadSession) {
+		try {
+			tasks = JSON.parse(fs.readFileSync(options.crawlSyncSession || options.crawlLoadSession).toString());
+		} catch (error) {
+			if (options.crawlLoadSession) {
+				throw error;
+			}
+		}
+	}
+	if (options.crawlSyncSession || options.crawlSaveSession) {
+		sessionFilename = options.crawlSyncSession || options.crawlSaveSession;
+	}
 	return {
 		capture: urls => capture(urls, options),
 		finish: () => finish(options),
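
In short, --crawl-sync-session acts as load-and-save and wins over the other two flags, while a missing session file aborts only when --crawl-load-session was given. A hedged restatement of that precedence (the helper name is made up for illustration and does not exist in the codebase):

```js
// Illustrative helper summarizing the precedence implemented in initialize() above.
function resolveSessionOptions(options) {
	return {
		loadFrom: options.crawlSyncSession || options.crawlLoadSession, // may be undefined
		saveTo: options.crawlSyncSession || options.crawlSaveSession,   // may be undefined
		// readFileSync errors are rethrown only when --crawl-load-session was set
		missingFileIsFatal: Boolean(options.crawlLoadSession)
	};
}
```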
@@ -50,12 +62,14 @@ async function initialize(options) {
 
 async function capture(urls, options) {
 	let newTasks;
+	const taskUrls = tasks.map(task => task.url);
 	newTasks = urls.map(url => createTask(url, options));
-	newTasks = newTasks.filter(task => task);
+	newTasks = newTasks.filter(task => task && !taskUrls.includes(task.url));
 	if (newTasks.length) {
 		tasks = tasks.concat(newTasks);
-		await runTasks();
+		saveTasks();
 	}
+	await runTasks();
 }
 
 async function finish(options) {
@@ -102,9 +116,11 @@ async function runNextTask() {
 		let taskOptions = JSON.parse(JSON.stringify(options));
 		taskOptions.url = task.url;
 		task.status = "processing";
+		saveTasks();
 		task.promise = capturePage(taskOptions);
 		const pageData = await task.promise;
 		task.status = "processed";
+		saveTasks();
 		if (pageData) {
 			task.filename = pageData.filename;
 			if (options.crawlLinks && testMaxDepth(task)) {
@@ -115,6 +131,7 @@ async function runNextTask() {
 						!tasks.find(otherTask => otherTask.url == task.url) &&
 						(!options.crawlInnerLinksOnly || task.isInnerLink));
 				tasks.splice(tasks.length, 0, ...newTasks);
+				saveTasks();
 			}
 		}
 		await runTasks();
@@ -142,6 +159,18 @@ function createTask(url, options, parentTask, rootTask) {
 	}
 }
 
+function saveTasks() {
+	if (sessionFilename) {
+		fs.writeFileSync(sessionFilename, JSON.stringify(
+			tasks.map(task => Object.assign({}, task, {
+				status: task.status == "processing" ? undefined : task.status,
+				promise: undefined,
+				options: task.status && task.status == "processed" ? undefined : task.options
+			}))
+		));
+	}
+}
+
 function rewriteURL(url, crawlRemoveURLFragment, crawlRewriteRules) {
 	url = url.trim();
 	if (crawlRemoveURLFragment) {
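
For reference, the session file written by saveTasks above is a JSON array of the task objects with their promise stripped, the transient "processing" status cleared, and options dropped once a task is processed. A hedged example of what such a file might contain; URLs, the filename and the exact set of fields are illustrative, and only fields visible in this diff are shown:

```js
// Illustrative session file contents (formatted for readability):
[
	{ "url": "https://example.com/", "status": "processed", "filename": "example.com.html" },
	{ "url": "https://example.com/page", "options": { /* per-task capture options */ } }
]
```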