Skip to content

Commit 2f9a61f

Browse files
ikreymer and SuaYoo authored
custom prefix additional fixes (#2746)
- follow-up to #2736: remove leading '^' from custom prefix URLs via a utility function, to avoid accumulating '^' - Show URL prefix list in settings for custom prefix scope. - Update user guide with correct custom prefix field. --------- Co-authored-by: sua yoo <sua@webrecorder.org>
1 parent 74aec5d commit 2f9a61f

File tree

5 files changed

+67
-18
lines changed

5 files changed

+67
-18
lines changed

frontend/docs/docs/user-guide/workflow-setup.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ _Site Crawl_
6060
: This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.
6161

6262
`Custom Page Prefix`
63-
: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the prefix of the _Crawl Start URL_ to the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to `URL Prefixes in Scope`. This prefix can then be removed or modified as needed.
63+
: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in [_URL Prefixes in Scope_](#url-prefixes-in-scope).
6464

6565
### Page URL(s)
6666

@@ -91,11 +91,13 @@ When enabled, the crawler will fail the entire crawl if any of the provided URLs
9191

9292
Instructs the crawler to stop visiting new links past a specified depth.
9393

94-
### Extra URL Prefixes in Scope
94+
### URL Prefixes in Scope
9595

96-
This field accepts additional URLs or domains that will be crawled if URLs that lead to them are found.
96+
When using a scope of `Custom Page Prefix`, this field accepts URLs or domains that will be crawled if URLs that lead to them are found.
9797

98-
This can be useful for crawling websites that span multiple domains such as `example.org` and `example.net`.
98+
By default, _URL Prefixes in Scope_ will be prefilled with the _Crawl Start URL_ up to the last slash (`/`). For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This URL prefix can then be removed or modified as needed.
99+
100+
This field can also be useful for crawling websites that span multiple domains such as `https://example.org` and `https://example.net`. To crawl websites outside of scope for scope types other than `Custom Page Prefix`, see [_Additional Pages_](#additional-pages).
99101

100102
### Include Any Linked Page ("one hop out")
101103

frontend/src/components/ui/config-details.ts

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@ import ISO6391 from "iso-639-1";
33
import { html, nothing, type TemplateResult } from "lit";
44
import { customElement, property, state } from "lit/decorators.js";
55
import { when } from "lit/directives/when.js";
6-
import { html as staticHtml, unsafeStatic } from "lit/static-html.js";
76
import capitalize from "lodash/fp/capitalize";
8-
import RegexColorize from "regex-colorize";
97

108
import { BtrixElement } from "@/classes/BtrixElement";
119
import { none, notSpecified } from "@/layouts/empty";
@@ -21,6 +19,7 @@ import sectionStrings from "@/strings/crawl-workflows/section";
2119
import type { Collection } from "@/types/collection";
2220
import { WorkflowScopeType } from "@/types/workflow";
2321
import { isApiError } from "@/utils/api";
22+
import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
2423
import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
2524
import { humanizeSchedule } from "@/utils/cron";
2625
import { pluralOf } from "@/utils/pluralize";
@@ -433,19 +432,18 @@ export class ConfigDetails extends BtrixElement {
433432
: undefined,
434433
true,
435434
)}
436-
${when(scopeType === WorkflowScopeType.Prefix, () =>
435+
${when(scopeType === WorkflowScopeType.Custom, () =>
437436
this.renderSetting(
438-
msg("Extra URL Prefixes in Scope"),
437+
msg("URL Prefixes in Scope"),
439438
includeUrlList.length
440439
? html`
441-
<ul>
442-
${includeUrlList.map(
443-
(url: string) =>
444-
staticHtml`<li class="regex">${unsafeStatic(
445-
new RegexColorize().colorizeText(url) as string,
446-
)}</li>`,
447-
)}
448-
</ul>
440+
<btrix-data-table
441+
.columns=${[msg("URL Prefix")]}
442+
.rows=${includeUrlList.map((url) => [
443+
unescapeCustomPrefix(url),
444+
])}
445+
>
446+
</btrix-data-table>
449447
`
450448
: none,
451449
true,
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import { expect } from "@open-wc/testing";
2+
3+
import { unescapeCustomPrefix } from "./unescapeCustomPrefix";
4+
5+
// Tests for `unescapeCustomPrefix`: plain URLs pass through unchanged,
// regex-escaped URLs are unescaped, and leading `^` anchors are removed.
describe("unescapeCustomPrefix", () => {
  it("doesn't modify a text URL", () => {
    expect(unescapeCustomPrefix("https://example.com/")).to.equal(
      "https://example.com/",
    );
  });

  it("doesn't modify a text URL with query params", () => {
    expect(
      unescapeCustomPrefix("https://example.com/page?query&foo=bar"),
    ).to.equal("https://example.com/page?query&foo=bar");
  });

  // The function under test removes escaping, so the title says "unescapes".
  it("unescapes a regex URL", () => {
    expect(unescapeCustomPrefix("https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });

  it("unescapes a regex URL with query params", () => {
    expect(
      unescapeCustomPrefix("https://example\\.com/page\\?query&foo=bar"),
    ).to.equal("https://example.com/page?query&foo=bar");
  });

  it("removes leading ^ from a regex URL", () => {
    expect(unescapeCustomPrefix("^https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });

  it("removes multiple leading ^ from a regex URL", () => {
    expect(unescapeCustomPrefix("^^^https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });
});
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import { regexUnescape } from "@/utils/string";
2+
3+
/**
 * Produce the user-facing form of a "custom" scope URL prefix.
 *
 * Every `^` at the start of the string is dropped, and the remainder is
 * passed through `regexUnescape`.
 *
 * @param urlPrefix - Stored URL prefix, possibly regex-escaped and anchored
 * @returns The value returned by `regexUnescape` for the de-anchored prefix
 */
export function unescapeCustomPrefix(urlPrefix: string) {
  const withoutLeadingCarets = urlPrefix.replace(/^\^+/, "");

  return regexUnescape(withoutLeadingCarets);
}

frontend/src/utils/workflow.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ import {
1818
WorkflowScopeType,
1919
type NewWorkflowOnlyScopeType,
2020
} from "@/types/workflow";
21+
import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
2122
import { DEFAULT_MAX_SCALE, isPageScopeType } from "@/utils/crawler";
2223
import { getNextDate, getScheduleInterval } from "@/utils/cron";
2324
import localize, { getDefaultLang } from "@/utils/localize";
24-
import { regexUnescape } from "@/utils/string";
2525

2626
export const BYTES_PER_GB = 1e9;
2727
export const DEFAULT_SELECT_LINKS = ["a[href]->href" as const];
@@ -218,7 +218,7 @@ export function getInitialFormState(params: {
218218
if (primarySeedConfig.include?.length) {
219219
formState.customIncludeUrlList = primarySeedConfig.include
220220
// Unescape regex
221-
.map(regexUnescape)
221+
.map(unescapeCustomPrefix)
222222
.join("\n");
223223
// if we have additional include URLs, set to "custom" scope here
224224
// to indicate 'Custom Page Prefix' option

0 commit comments

Comments
 (0)