
Commit ccab5ae

define crawler with backend as queue storage
1 parent 9a87303 commit ccab5ae

File tree

2 files changed: +272 -1 lines changed

.eslintrc
src/crawler/crawler.ts


.eslintrc

Lines changed: 11 additions & 1 deletion
```diff
@@ -22,5 +22,15 @@
     "no-console": [
       "off"
     ]
-  }
+  },
+  "overrides": [
+    {
+      "files": [
+        "src/crawler/**/*.ts"
+      ],
+      "rules": {
+        "react/no-is-mounted": "off"
+      }
+    }
+  ]
 }
```
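This override turns eslint-plugin-react's `no-is-mounted` rule off for the crawler sources only, rather than project-wide; presumably the rule false-positives on the non-React `Crawler` class added below.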

src/crawler/crawler.ts

Lines changed: 261 additions & 0 deletions
```ts
import { CommandTypes, sendCommandToContent } from '@/bus/Command';

interface CrawlerState {
	isActive: boolean;
	nextProcessTime: number;
	rateLimit: number;
}

interface QueueUrlsResponse {
	accepted: number;
	rejected: number;
	queueSize: number;
	crawledCount: number;
}

interface NextUrlResponse {
	url: string;
}

interface QueueUrlsRequest {
	urls: string[];
	sourceUrl: string;
}

class Crawler {
	private readonly state: CrawlerState;
	private process: ( html: string ) => Promise< void >;

	constructor() {
		this.state = {
			isActive: false,
			nextProcessTime: 0,
			rateLimit: 1.0, // pages per second; 1.0 means a 1000ms delay between requests
		};
		// Initialize with empty process function
		this.process = async () => {};
	}

	private log( level: 'log' | 'warn' | 'error', ...args: any[] ): void {
		console[ level ]( ...args );
	}

	// Allow setting the process function
	public setProcessFunction(
		processFn: ( html: string ) => Promise< void >
	): void {
		this.process = processFn;
	}

	public async start(): Promise< void > {
		if ( this.state.isActive ) {
			this.log( 'log', 'Crawler already running' );
			return;
		}

		this.state.isActive = true;
		this.log( 'log', 'Crawler started' );

		while ( this.state.isActive ) {
			const next = await this.getNextUrl();
			if ( next ) {
				await this.processUrl( next );
			} else {
				this.state.isActive = false;
				this.log( 'log', 'Crawler finished' );
			}
		}
	}

	private async processUrl( url: string ): Promise< void > {
		this.log( 'log', 'processing url', url );
		try {
			// Wait until we're allowed to process the next URL
			await this.waitForRateLimit();

			await this.navigateToUrl( url );

			// @TODO: Get the HTML content via bus?
			const html = document.documentElement.outerHTML;

			// Process the page content
			await this.process( html );

			// Extract and queue new URLs
			const links = this.extractLinks( html );
			await this.queueUrls( links, url );
		} catch ( error ) {
			this.log( 'error', 'Error processing URL', url, error );
			this.state.isActive = false;
		}
	}

	private async waitForRateLimit(): Promise< void > {
		const now = Date.now();
		const delayMs = 1000 / this.state.rateLimit; // Convert rate limit to milliseconds between requests

		if ( now < this.state.nextProcessTime ) {
			await new Promise( ( resolve ) =>
				setTimeout( resolve, this.state.nextProcessTime - now )
			);
		}

		// Calculate next allowed process time using the delay,
		// measured from after the wait completes
		this.state.nextProcessTime = Date.now() + delayMs;
	}

	private extractLinks( htmlString: string ): string[] {
		// Create a DOM parser instance
		const parser = new DOMParser();

		// Parse the HTML string into a document
		const doc = parser.parseFromString( htmlString, 'text/html' );

		// Find all anchor tags
		const linkElements = doc.querySelectorAll( 'a' );

		// Convert NodeList to Array and extract link data
		const links = Array.from( linkElements ).map( ( link ) => {
			// Get the href attribute
			const href = link.getAttribute( 'href' );

			// Skip if no href, or it's a javascript: link or anchor link
			if (
				! href ||
				href.startsWith( 'javascript:' ) ||
				href.startsWith( '#' )
			) {
				return null;
			}

			// Try to resolve relative URLs to absolute
			let absoluteUrl;
			try {
				absoluteUrl = new URL( href, window.location.origin ).href;
			} catch ( e ) {
				// If URL parsing fails, use the original href
				absoluteUrl = href;
			}

			const isExternal = link.hostname !== window.location.hostname;
			if ( isExternal ) {
				return null;
			}

			return absoluteUrl;
		} );

		// Filter out null values and return unique links
		return links
			.filter( ( link ) => link !== null )
			.filter(
				( link, index, self ) =>
					index === self.findIndex( ( l ) => l === link )
			);
	}

	private async queueUrls(
		urls: string[],
		sourceUrl: string,
		retryCount = 0,
		maxRetries = 5
	): Promise< QueueUrlsResponse > {
		const request: QueueUrlsRequest = {
			urls,
			sourceUrl,
		};

		const response = await fetch( '/crawl-api/queue-urls', {
			method: 'POST',
			headers: { 'Content-Type': 'application/json' },
			body: JSON.stringify( request ),
		} );

		if ( ! response.ok ) {
			this.log(
				'warn',
				`Attempt ${
					retryCount + 1
				}/${ maxRetries } failed: HTTP error! status: ${
					response.status
				}`
			);

			if ( retryCount >= maxRetries - 1 ) {
				return Promise.reject(
					new Error(
						`Failed to queue URLs after ${ maxRetries } attempts`
					)
				);
			}

			// Wait before retrying
			await this.sleep();

			// Recursive call with an incremented retry count
			return this.queueUrls( urls, sourceUrl, retryCount + 1, maxRetries );
		}

		return response.json();
	}

	private async sleep( ms: number = 1000 ): Promise< void > {
		return new Promise( ( resolve ) => setTimeout( resolve, ms ) );
	}

	private async getNextUrl(
		retryCount = 0,
		maxRetries = 5
	): Promise< string | null > {
		const response = await fetch( '/crawl-api/next-url' );

		// crawling queue is finished
		if ( response.status === 204 ) {
			return null;
		}

		if ( ! response.ok ) {
			this.log(
				'warn',
				`Attempt ${
					retryCount + 1
				}/${ maxRetries } failed: HTTP error! status: ${
					response.status
				}`
			);

			if ( retryCount >= maxRetries - 1 ) {
				return Promise.reject(
					new Error(
						`Failed to get next URL after ${ maxRetries } attempts`
					)
				);
			}

			// Wait before retrying
			await this.sleep();

			// Recursive call with an incremented retry count
			return this.getNextUrl( retryCount + 1, maxRetries );
		}

		const data: NextUrlResponse = await response.json();
		return data.url;
	}

	private async navigateToUrl( url: string ): Promise< void > {
		void sendCommandToContent( {
			type: CommandTypes.NavigateTo,
			payload: { url },
		} );
	}

	public stop(): void {
		this.state.isActive = false;
	}

	public updateRateLimit( newLimit: number ): void {
		// only allow between 0.1 and 10 pages per second - no reason for this limit; feel free to change
		this.state.rateLimit = Math.max( 0.1, Math.min( 10.0, newLimit ) );
	}
}
```
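For context, a minimal usage sketch. This is hypothetical and not part of the commit: it assumes the `Crawler` class is exported from this module (the diff does not show an export) and that the `/crawl-api` backend endpoints are available.

```ts
// Hypothetical wiring, assuming `Crawler` is exported from src/crawler/crawler.ts.
import { Crawler } from '@/crawler/crawler';

const crawler = new Crawler();

// Plug in a page-processing callback; this one just logs the page size.
crawler.setProcessFunction( async ( html: string ) => {
	console.log( `processed ${ html.length } bytes` );
} );

crawler.updateRateLimit( 2 ); // at most 2 pages per second
void crawler.start(); // loops until /crawl-api/next-url returns 204

// Some other handler (e.g. a UI control) can call crawler.stop() to end the loop.
```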
