Просмотр исходного кода

add option "--crawl-no-parent"

Gildas 5 лет назад
Родитель
Commit
554ce67905
2 изменённых файла: 8 добавлений и 1 удаление
  1. 2 0
      cli/args.js
  2. 6 1
      cli/single-file-cli-api.js

+ 2 - 0
cli/args.js

@@ -114,6 +114,8 @@ const args = require("yargs")
 	.boolean("crawl-links")
 	.options("crawl-inner-links-only", { description: "Crawl pages found via inner links only if they are hosted on the same domain" })
 	.boolean("crawl-inner-links-only")
+	.options("crawl-no-parent", { description: "Crawl pages found via inner links only if their URLs are not parent of the URL to crawl" })
+	.boolean("crawl-no-parent")
 	.options("crawl-load-session", { description: "Name of the file of the session to load (previously saved with --crawl-save-session or --crawl-sync-session)" })
 	.string("crawl-load-session")
 	.options("crawl-remove-url-fragment", { description: "Remove URL fragments found in links" })

+ 6 - 1
cli/single-file-cli-api.js

@@ -133,7 +133,8 @@ async function runNextTask() {
 					.filter(task => task &&
 						testMaxDepth(task) &&
 						!tasks.find(otherTask => otherTask.url == task.url) &&
-						(!options.crawlInnerLinksOnly || task.isInnerLink));
+						(!options.crawlInnerLinksOnly || task.isInnerLink) &&
+						(!options.crawlNoParent || task.isChild));
 				tasks.splice(tasks.length, 0, ...newTasks);
 			}
 		}
@@ -152,10 +153,14 @@ function createTask(url, options, parentTask, rootTask) {
 	url = parentTask ? rewriteURL(url, options.crawlRemoveURLFragment, options.crawlRewriteRules) : url;
 	if (VALID_URL_TEST.test(url)) {
 		const isInnerLink = rootTask && url.startsWith(getHostURL(rootTask.url));
+		const rootBaseURIMatch = rootTask && rootTask.url.match(/(.*?)[^/]*$/);
+		const isChild = isInnerLink && rootBaseURIMatch && rootBaseURIMatch[1] && url.startsWith(rootBaseURIMatch[1]);
 		return {
 			url,
 			isInnerLink,
+			isChild,
 			originalUrl: url,
+			rootBaseURI: rootBaseURIMatch && rootBaseURIMatch[1],
 			depth: parentTask ? parentTask.depth + 1 : 0,
 			externalLinkDepth: isInnerLink ? -1 : parentTask ? parentTask.externalLinkDepth + 1 : -1,
 			options