Skip to content

Commit 29a9926

Browse files
Merge pull request #13 from scrapfly/extraction-scrape-config
add scrape extraction params and rename extraction template options
2 parents 923097b + 48de0b4 commit 29a9926

File tree

8 files changed

+178
-24
lines changed

8 files changed

+178
-24
lines changed

__tests__/client/extraction.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ Deno.test('extract: fails due to invalid config', async () => {
3939
new ExtractionConfig({
4040
body: html,
4141
content_type: 'text/html',
42-
ephemeral_template: { source: 'html' },
43-
template: 'template',
42+
extraction_ephemeral_template: { source: 'html' },
43+
extraction_template: 'template',
4444
}),
4545
);
4646
},

__tests__/config/extraction.test.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,11 @@ Deno.test('url param generation: sets charset', async () => {
5858
});
5959
});
6060

61-
Deno.test('url param generation: sets template', async () => {
61+
Deno.test('url param generation: sets extraction_template', async () => {
6262
const config = new ExtractionConfig({
6363
body: input_html,
6464
content_type: input_content_type,
65-
template: 'my_template',
65+
extraction_template: 'my_template',
6666
});
6767
const params = config.toApiParams({ key: '1234' });
6868
assertEquals(params, {
@@ -72,11 +72,11 @@ Deno.test('url param generation: sets template', async () => {
7272
});
7373
});
7474

75-
Deno.test('url param generation: sets ephemeral_template', async () => {
75+
Deno.test('url param generation: sets extraction_ephemeral_template', async () => {
7676
const config = new ExtractionConfig({
7777
body: input_html,
7878
content_type: input_content_type,
79-
ephemeral_template: { source: 'html', selectors: [] },
79+
extraction_ephemeral_template: { source: 'html', selectors: [] },
8080
});
8181
const params = config.toApiParams({ key: '1234' });
8282
assertEquals(params, {

__tests__/config/scrape.test.ts

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { HttpMethod } from '../../src/types.ts';
33
import { ScrapeConfigError } from '../../src/errors.ts';
44
import { assertEquals, assertThrows } from "https://deno.land/std@0.224.0/assert/mod.ts";
55

6+
const input_content_type = 'text/html';
67

78
Deno.test('scrapeconfig loads', () => {
89
const config = new ScrapeConfig({ url: 'http://httpbin.dev/get' });
@@ -15,8 +16,6 @@ Deno.test('scrapeconfig throws on unknown options', () => {
1516
}, ScrapeConfigError, "Invalid option provided: foobar");
1617
});
1718

18-
19-
2019
Deno.test('scrapeconfig allowed methods', () => {
2120
(['GET', 'POST', 'PUT', 'PATCH', 'HEAD'] as HttpMethod[]).forEach((method) => {
2221
const config = new ScrapeConfig({
@@ -360,6 +359,58 @@ Deno.test('url param generation: proxy_pool sets', () => {
360359
});
361360
});
362361

362+
Deno.test('url param generation: sets extraction_template', async () => {
363+
const config = new ScrapeConfig({
364+
url: 'http://httpbin.dev/get',
365+
extraction_template: 'my_template',
366+
});
367+
const params = config.toApiParams({ key: '1234' });
368+
assertEquals(params, {
369+
key: '1234',
370+
url: 'http://httpbin.dev/get',
371+
extraction_template: 'my_template',
372+
});
373+
});
374+
375+
Deno.test('url param generation: sets extraction_ephemeral_template', async () => {
376+
const config = new ScrapeConfig({
377+
url: 'http://httpbin.dev/get',
378+
extraction_ephemeral_template: { source: 'html', selectors: [] },
379+
});
380+
const params = config.toApiParams({ key: '1234' });
381+
assertEquals(params, {
382+
key: '1234',
383+
url: 'http://httpbin.dev/get',
384+
extraction_template: 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0',
385+
});
386+
});
387+
388+
Deno.test('url param generation: sets extraction_prompt', async () => {
389+
const config = new ScrapeConfig({
390+
url: 'http://httpbin.dev/get',
391+
extraction_prompt: 'summarize the document',
392+
});
393+
const params = config.toApiParams({ key: '1234' });
394+
assertEquals(params, {
395+
key: '1234',
396+
url: 'http://httpbin.dev/get',
397+
extraction_prompt: 'summarize the document',
398+
});
399+
});
400+
401+
Deno.test('url param generation: sets extraction_model', async () => {
402+
const config = new ScrapeConfig({
403+
url: 'http://httpbin.dev/get',
404+
extraction_model: 'review_list',
405+
});
406+
const params = config.toApiParams({ key: '1234' });
407+
assertEquals(params, {
408+
key: '1234',
409+
url: 'http://httpbin.dev/get',
410+
extraction_model: 'review_list',
411+
});
412+
});
413+
363414
Deno.test('url param generation: session sets', () => {
364415
const config = new ScrapeConfig({
365416
url: 'http://httpbin.dev/get',

examples/deno/deno_examples.ts

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,28 @@ export async function JSRender(apiKey: string) {
8181
console.log(scrape_result.result.browser_data);
8282
}
8383

84+
/* Use AI extraction capabilities with the web scraping API
85+
* all Extraction API methods are supported, see below examples for more
86+
*/
87+
export async function scrapeExtraction(apiKey: string) {
88+
const client = new ScrapflyClient({ key: apiKey});
89+
90+
let scrape_result = await client.scrape(
91+
new ScrapeConfig({
92+
url: 'https://web-scraping.dev/product/1',
93+
// enable browsers:
94+
render_js: true,
95+
// use LLM prompt for auto parsing
96+
extraction_prompt: "Extract the product specification in json format",
97+
})
98+
);
99+
100+
// access the extraction result
101+
console.log("extraction result:");
102+
console.log(scrape_result.result.extracted_data);
103+
}
104+
105+
84106
/* Scrapfly Extraction API offers LLM (Large Language Model) based extraction
85107
* This example demonstrates how to use LLM query HTML files
86108
* https://scrapfly.io/docs/extraction-api/llm-prompt
@@ -190,7 +212,7 @@ export async function extractionTemplates(apiKey: string){
190212
body: html,
191213
content_type: "text/html",
192214
// provide template:
193-
ephemeral_template: template,
215+
extraction_ephemeral_template: template,
194216
})
195217
);
196218
console.log('product extract');

examples/node_commonjs/commonjs_examples.cjs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,27 @@ async function JSRender(apiKey) {
8181
console.log(scrape_result.result.browser_data);
8282
}
8383

84+
/* Use AI extraction capabilities with the web scraping API
85+
* all Extraction API methods are supported, see below examples for more
86+
*/
87+
async function scrapeExtraction(apiKey) {
88+
const client = new ScrapflyClient({ key: apiKey});
89+
90+
let scrape_result = await client.scrape(
91+
new ScrapeConfig({
92+
url: 'https://web-scraping.dev/product/1',
93+
// enable browsers:
94+
render_js: true,
95+
// use LLM prompt for auto parsing
96+
extraction_prompt: "Extract the product specification in json format",
97+
})
98+
);
99+
100+
// access the extraction result
101+
console.log("extraction result:");
102+
console.log(scrape_result.result.extracted_data);
103+
}
104+
84105
/* Scrapfly Extraction API offers LLM (Large Language Model) based extraction
85106
* This example demonstrates how to use LLM query HTML files
86107
* https://scrapfly.io/docs/extraction-api/llm-prompt
@@ -190,7 +211,7 @@ async function extractionTemplates(apiKey){
190211
body: html,
191212
content_type: "text/html",
192213
// provide template:
193-
ephemeral_template: template,
214+
extraction_ephemeral_template: template,
194215
})
195216
);
196217
console.log('product extract');
@@ -233,6 +254,7 @@ module.exports = {
233254
getAccount,
234255
basicGet,
235256
JSRender,
257+
scrapeExtraction,
236258
extractionLLM,
237259
extractionAutoExtract,
238260
extractionTemplates,

src/extractionconfig.ts

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import * as errors from './errors.ts';
22
import { urlsafe_b64encode } from './utils.ts';
3+
import { ExtractionConfigError } from './errors.ts';
34

45
export enum CompressionFormat {
56
/**
@@ -21,30 +22,55 @@ type ExtractionConfigOptions = {
2122
content_type: string;
2223
url?: string;
2324
charset?: string;
24-
template?: string; // saved template name
25-
ephemeral_template?: object; // ephemeraly declared json template
25+
extraction_template?: string; // saved template name
26+
extraction_ephemeral_template?: object; // ephemerally declared json template
2627
extraction_prompt?: string;
2728
extraction_model?: string;
2829
is_document_compressed?: boolean;
2930
document_compression_format?: 'gzip' | 'zstd' | 'deflate' | CompressionFormat;
3031
webhook?: string;
32+
33+
// deprecated options
34+
template?: string;
35+
ephemeral_template?: object;
3136
};
3237

3338
export class ExtractionConfig {
3439
body: string | Uint8Array;
3540
content_type: string;
3641
url?: string;
3742
charset?: string;
38-
template?: string; // saved template name
39-
ephemeral_template?: object; // ephemeraly declared json template
43+
extraction_template?: string; // saved template name
44+
extraction_ephemeral_template?: object; // ephemerally declared json template
4045
extraction_prompt?: string;
4146
extraction_model?: string;
4247
is_document_compressed?: boolean;
4348
document_compression_format?: 'gzip' | 'zstd' | 'deflate' | CompressionFormat;
4449
webhook?: string;
4550

51+
// deprecated options
52+
template?: string;
53+
ephemeral_template?: object;
54+
4655
constructor(options: ExtractionConfigOptions) {
4756
this.validateOptions(options);
57+
if (options.template) {
58+
console.warn(
59+
`Deprecation warning: 'template' is deprecated. Use 'extraction_template' instead.`
60+
);
61+
this.extraction_template = options.template;
62+
} else {
63+
this.extraction_template = options.extraction_template;
64+
}
65+
if (options.ephemeral_template) {
66+
console.warn(
67+
`Deprecation warning: 'ephemeral_template' is deprecated. Use 'extraction_ephemeral_template' instead.`
68+
);
69+
this.extraction_ephemeral_template = options.ephemeral_template;
70+
} else {
71+
this.extraction_ephemeral_template = options.extraction_ephemeral_template;
72+
}
73+
4874
if (
4975
options.document_compression_format &&
5076
!Object.values(CompressionFormat).includes(options.document_compression_format as CompressionFormat)
@@ -57,8 +83,8 @@ export class ExtractionConfig {
5783
this.content_type = options.content_type;
5884
this.url = options.url ?? this.url;
5985
this.charset = options.charset ?? this.charset;
60-
this.template = options.template ?? this.template;
61-
this.ephemeral_template = options.ephemeral_template ?? this.ephemeral_template;
86+
this.extraction_template = options.extraction_template ?? this.extraction_template;
87+
this.extraction_ephemeral_template = options.extraction_ephemeral_template ?? this.extraction_ephemeral_template;
6288
this.extraction_prompt = options.extraction_prompt ?? this.extraction_prompt;
6389
this.extraction_model = options.extraction_model ?? this.extraction_model;
6490
this.is_document_compressed = options.is_document_compressed ?? this.is_document_compressed;
@@ -90,18 +116,18 @@ export class ExtractionConfig {
90116
params.charset = this.charset;
91117
}
92118

93-
if (this.template && this.ephemeral_template) {
94-
throw new errors.ExtractionConfigError(
95-
'You cannot pass both parameters template and ephemeral_template. You must choose',
119+
if (this.extraction_template && this.extraction_ephemeral_template) {
120+
throw new ExtractionConfigError(
121+
'You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose',
96122
);
97123
}
98124

99-
if (this.template) {
100-
params.extraction_template = this.template;
125+
if (this.extraction_template) {
126+
params.extraction_template = this.extraction_template;
101127
}
102128

103-
if (this.ephemeral_template) {
104-
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.ephemeral_template));
129+
if (this.extraction_ephemeral_template) {
130+
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.extraction_ephemeral_template));
105131
}
106132

107133
if (this.extraction_prompt) {

src/scrapeconfig.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ type ScrapeConfigOptions = {
6161
tags?: string[];
6262
format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' | Format;
6363
format_options?: ('no_links' | 'no_images' | 'only_content' | FormatOption)[];
64+
extraction_template?: string; // saved template name
65+
extraction_ephemeral_template?: object; // ephemerally declared json template
66+
extraction_prompt?: string;
67+
extraction_model?: string;
6468
correlation_id?: string;
6569
cookies?: Rec<string>;
6670
body?: string;
@@ -104,6 +108,10 @@ export class ScrapeConfig {
104108
tags: Set<string> = new Set<string>();
105109
format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' | Format;
106110
format_options?: ('no_links' | 'no_images' | 'only_content' | FormatOption)[];
111+
extraction_template?: string; // saved template name
112+
extraction_ephemeral_template?: object; // ephemerally declared json template
113+
extraction_prompt?: string;
114+
extraction_model?: string;
107115
correlation_id?: string;
108116
cookies?: Rec<string>;
109117
body?: string;
@@ -163,6 +171,10 @@ export class ScrapeConfig {
163171
this.tags = new Set(options.tags) ?? this.tags;
164172
this.format = options.format ?? this.format;
165173
this.format_options = options.format_options ?? this.format_options;
174+
this.extraction_template = options.extraction_template ?? this.extraction_template;
175+
this.extraction_ephemeral_template = options.extraction_ephemeral_template ?? this.extraction_ephemeral_template;
176+
this.extraction_prompt = options.extraction_prompt ?? this.extraction_prompt;
177+
this.extraction_model = options.extraction_model ?? this.extraction_model;
166178
this.correlation_id = options.correlation_id ?? this.correlation_id;
167179
this.cookies = options.cookies
168180
? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v]))
@@ -338,6 +350,27 @@ export class ScrapeConfig {
338350
params.format += ':' + this.format_options.join(',');
339351
}
340352
}
353+
if (this.extraction_template && this.extraction_ephemeral_template) {
354+
throw new ScrapeConfigError(
355+
'You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose',
356+
);
357+
}
358+
359+
if (this.extraction_template) {
360+
params.extraction_template = this.extraction_template;
361+
}
362+
363+
if (this.extraction_ephemeral_template) {
364+
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.extraction_ephemeral_template));
365+
}
366+
367+
if (this.extraction_prompt) {
368+
params.extraction_prompt = this.extraction_prompt;
369+
}
370+
371+
if (this.extraction_model) {
372+
params.extraction_model = this.extraction_model;
373+
}
341374
if (this.correlation_id) {
342375
params.correlation_id = this.correlation_id;
343376
}

tsconfig.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"types": ["node", "jest"],
66
"skipLibCheck": true,
77
"module": "ESNext",
8-
"lib": ["ES2022", "ESNext"],
8+
"lib": ["ES2022", "ESNext", "dom"],
99
"moduleResolution": "Node",
1010
"rootDir": ".",
1111
"outDir": "build",

0 commit comments

Comments
 (0)