|
| 1 | +import { CommandTypes, sendCommandToContent } from '@/bus/Command'; |
| 2 | + |
| 3 | +interface CrawlerState { |
| 4 | + isActive: boolean; |
| 5 | + nextProcessTime: number; |
| 6 | + rateLimit: number; |
| 7 | +} |
| 8 | + |
/**
 * Server response from POST /crawl-api/queue-urls.
 * Field semantics inferred from names — confirm against the API implementation.
 */
interface QueueUrlsResponse {
	// Number of submitted URLs accepted into the queue.
	accepted: number;
	// Number of submitted URLs rejected (e.g. duplicates) — TODO confirm.
	rejected: number;
	// Current size of the pending-URL queue.
	queueSize: number;
	// Total number of URLs crawled so far.
	crawledCount: number;
}
| 15 | + |
/**
 * Server response from GET /crawl-api/next-url (200 case; 204 means queue empty).
 */
interface NextUrlResponse {
	// The next URL to crawl.
	url: string;
}
| 19 | + |
/**
 * Request body for POST /crawl-api/queue-urls.
 */
interface QueueUrlsRequest {
	// Candidate URLs discovered on the source page.
	urls: string[];
	// The page on which the URLs were found.
	sourceUrl: string;
}
| 24 | + |
| 25 | +class Crawler { |
| 26 | + private readonly state: CrawlerState; |
| 27 | + private process: ( html: string ) => Promise< void >; |
| 28 | + |
| 29 | + constructor() { |
| 30 | + this.state = { |
| 31 | + isActive: false, |
| 32 | + nextProcessTime: 0, |
| 33 | + rateLimit: 1.0, // pages per sec; 1.0 means 1000ms delay between delays |
| 34 | + }; |
| 35 | + // Initialize with empty process function |
| 36 | + this.process = async () => {}; |
| 37 | + } |
| 38 | + |
| 39 | + private log( level: 'log' | 'warn' | 'error', ...args: any[] ): void { |
| 40 | + console[ level ]( ...args ); |
| 41 | + } |
| 42 | + |
| 43 | + // Allow setting the process function |
| 44 | + public setProcessFunction( |
| 45 | + processFn: ( html: string ) => Promise< void > |
| 46 | + ): void { |
| 47 | + this.process = processFn; |
| 48 | + } |
| 49 | + |
| 50 | + public async start(): Promise< void > { |
| 51 | + if ( this.state.isActive ) { |
| 52 | + this.log( 'log', 'Crawler already running' ); |
| 53 | + return; |
| 54 | + } |
| 55 | + |
| 56 | + this.state.isActive = true; |
| 57 | + this.log( 'log', 'Crawler started' ); |
| 58 | + |
| 59 | + while ( this.state.isActive ) { |
| 60 | + const next = await this.getNextUrl(); |
| 61 | + if ( next ) { |
| 62 | + await this.processUrl( next ); |
| 63 | + } else { |
| 64 | + this.state.isActive = false; |
| 65 | + this.log( 'log', 'Crawler finished' ); |
| 66 | + } |
| 67 | + } |
| 68 | + } |
| 69 | + |
| 70 | + private async processUrl( url: string ): Promise< void > { |
| 71 | + this.log( 'log', 'processing url', url ); |
| 72 | + try { |
| 73 | + // Wait until we're allowed to process the next URL |
| 74 | + await this.waitForRateLimit(); |
| 75 | + |
| 76 | + await this.navigateToUrl( url ); |
| 77 | + |
| 78 | + // @TODO: Get the HTML content via bus? |
| 79 | + const html = document.documentElement.outerHTML; |
| 80 | + |
| 81 | + // Process the page content |
| 82 | + await this.process( html ); |
| 83 | + |
| 84 | + // Extract and queue new URLs |
| 85 | + const links = this.extractLinks( html ); |
| 86 | + await this.queueUrls( links, url ); |
| 87 | + } catch ( error ) { |
| 88 | + this.log( 'error', 'Error processing URL', url, error ); |
| 89 | + this.state.isActive = false; |
| 90 | + } |
| 91 | + } |
| 92 | + |
| 93 | + private async waitForRateLimit(): Promise< void > { |
| 94 | + const now = Date.now(); |
| 95 | + const delayMs = 1000 / this.state.rateLimit; // Convert rate limit to milliseconds between requests |
| 96 | + |
| 97 | + if ( now < this.state.nextProcessTime ) { |
| 98 | + await new Promise( ( resolve ) => |
| 99 | + setTimeout( resolve, this.state.nextProcessTime - now ) |
| 100 | + ); |
| 101 | + } |
| 102 | + |
| 103 | + // Calculate next allowed process time using the delay |
| 104 | + this.state.nextProcessTime = now + delayMs; |
| 105 | + } |
| 106 | + |
| 107 | + private extractLinks( htmlString: string ): string[] { |
| 108 | + // Create a DOM parser instance |
| 109 | + const parser = new DOMParser(); |
| 110 | + |
| 111 | + // Parse the HTML string into a document |
| 112 | + const doc = parser.parseFromString( htmlString, 'text/html' ); |
| 113 | + |
| 114 | + // Find all anchor tags |
| 115 | + const linkElements = doc.querySelectorAll( 'a' ); |
| 116 | + |
| 117 | + // Convert NodeList to Array and extract link data |
| 118 | + const links = Array.from( linkElements ).map( ( link ) => { |
| 119 | + // Get the href attribute |
| 120 | + const href = link.getAttribute( 'href' ); |
| 121 | + |
| 122 | + // Skip if no href, or it's a javascript: link or anchor link |
| 123 | + if ( |
| 124 | + ! href || |
| 125 | + href.startsWith( 'javascript:' ) || |
| 126 | + href.startsWith( '#' ) |
| 127 | + ) { |
| 128 | + return null; |
| 129 | + } |
| 130 | + |
| 131 | + // Try to resolve relative URLs to absolute |
| 132 | + let absoluteUrl; |
| 133 | + try { |
| 134 | + absoluteUrl = new URL( href, window.location.origin ).href; |
| 135 | + } catch ( e ) { |
| 136 | + // If URL parsing fails, use the original href |
| 137 | + absoluteUrl = href; |
| 138 | + } |
| 139 | + |
| 140 | + const isExternal = link.hostname !== window.location.hostname; |
| 141 | + if ( isExternal ) { |
| 142 | + return null; |
| 143 | + } |
| 144 | + |
| 145 | + return absoluteUrl; |
| 146 | + } ); |
| 147 | + |
| 148 | + // Filter out null values and return unique links |
| 149 | + return links |
| 150 | + .filter( ( link ) => link !== null ) |
| 151 | + .filter( |
| 152 | + ( link, index, self ) => |
| 153 | + index === self.findIndex( ( l ) => l === link ) |
| 154 | + ); |
| 155 | + } |
| 156 | + |
| 157 | + private async queueUrls( |
| 158 | + urls: string[], |
| 159 | + sourceUrl: string, |
| 160 | + retryCount = 0, |
| 161 | + maxRetries = 5 |
| 162 | + ): Promise< QueueUrlsResponse > { |
| 163 | + const request: QueueUrlsRequest = { |
| 164 | + urls, |
| 165 | + sourceUrl, |
| 166 | + }; |
| 167 | + |
| 168 | + const response = await fetch( '/crawl-api/queue-urls', { |
| 169 | + method: 'POST', |
| 170 | + headers: { 'Content-Type': 'application/json' }, |
| 171 | + body: JSON.stringify( request ), |
| 172 | + } ); |
| 173 | + |
| 174 | + if ( ! response.ok ) { |
| 175 | + this.log( |
| 176 | + 'warn', |
| 177 | + `Attempt ${ |
| 178 | + retryCount + 1 |
| 179 | + }/${ maxRetries } failed: HTTP error! status: ${ |
| 180 | + response.status |
| 181 | + }` |
| 182 | + ); |
| 183 | + |
| 184 | + if ( retryCount >= maxRetries - 1 ) { |
| 185 | + return Promise.reject( |
| 186 | + new Error( |
| 187 | + `Failed to queue URLs after ${ maxRetries } attempts` |
| 188 | + ) |
| 189 | + ); |
| 190 | + } |
| 191 | + |
| 192 | + // Wait before retrying |
| 193 | + await this.sleep(); |
| 194 | + |
| 195 | + // Recursive call |
| 196 | + return this.queueUrls( urls, sourceUrl, retryCount++, maxRetries ); |
| 197 | + } |
| 198 | + |
| 199 | + return response.json(); |
| 200 | + } |
| 201 | + |
| 202 | + private async sleep( ms: number = 1000 ): Promise< void > { |
| 203 | + return new Promise( ( resolve ) => setTimeout( resolve, ms ) ); |
| 204 | + } |
| 205 | + |
| 206 | + private async getNextUrl( |
| 207 | + retryCount = 0, |
| 208 | + maxRetries = 5 |
| 209 | + ): Promise< string | null > { |
| 210 | + const response = await fetch( '/crawl-api/next-url' ); |
| 211 | + |
| 212 | + // crawling queue is finished |
| 213 | + if ( response.status === 204 ) { |
| 214 | + return null; |
| 215 | + } |
| 216 | + |
| 217 | + if ( ! response.ok ) { |
| 218 | + this.log( |
| 219 | + 'warn', |
| 220 | + `Attempt ${ |
| 221 | + retryCount + 1 |
| 222 | + }/${ maxRetries } failed: HTTP error! status: ${ |
| 223 | + response.status |
| 224 | + }` |
| 225 | + ); |
| 226 | + |
| 227 | + if ( retryCount >= maxRetries - 1 ) { |
| 228 | + return Promise.reject( |
| 229 | + new Error( |
| 230 | + `Failed to get next URL after ${ maxRetries } attempts` |
| 231 | + ) |
| 232 | + ); |
| 233 | + } |
| 234 | + |
| 235 | + // Wait before retrying |
| 236 | + await this.sleep(); |
| 237 | + |
| 238 | + // Recursive call |
| 239 | + return this.getNextUrl( retryCount++, maxRetries ); |
| 240 | + } |
| 241 | + |
| 242 | + const data: NextUrlResponse = await response.json(); |
| 243 | + return data.url; |
| 244 | + } |
| 245 | + |
| 246 | + private async navigateToUrl( url: string ): Promise< void > { |
| 247 | + void sendCommandToContent( { |
| 248 | + type: CommandTypes.NavigateTo, |
| 249 | + payload: { url }, |
| 250 | + } ); |
| 251 | + } |
| 252 | + |
| 253 | + public stop(): void { |
| 254 | + this.state.isActive = false; |
| 255 | + } |
| 256 | + |
| 257 | + public updateRateLimit( newLimit: number ): void { |
| 258 | + // only allow between 0.1 and 10 pages per second - no reason for this limit; feel free to change |
| 259 | + this.state.rateLimit = Math.max( 0.1, Math.min( 10.0, newLimit ) ); |
| 260 | + } |
| 261 | +} |
0 commit comments