@@ -6,9 +6,12 @@ import * as Core from '../core';
6
6
7
7
export class Parse extends APIResource {
8
8
/**
9
- * Parse a file into a structured Markdown representation. The file size must be
10
- * less than 100MB and the number of pages must be less than 400.
9
+ * Parse a file into structured Markdown and/or JSON. Files must be less than
10
+ * 100MB and 400 pages. We use LibreOffice to convert DOC(X) and PPT(X) files to
11
+ * PDF, which may affect page count.
11
12
*
13
+ * See our [blog post](https://contextual.ai/blog/document-parser-for-rag) and
14
+ * [code examples](https://github.com/ContextualAI/examples/blob/main/03-standalone-api/04-parse/parse.ipynb).
12
15
* Email [parse-feedback@contextual.ai](mailto:parse-feedback@contextual.ai) with
13
16
* any feedback or questions.
14
17
*/
@@ -92,6 +95,11 @@ export interface ParseJobResultsResponse {
92
95
*/
93
96
status : 'pending' | 'processing' | 'retrying' | 'completed' | 'failed' | 'cancelled' ;
94
97
98
+ /**
99
+ * Document-level metadata parsed from the document
100
+ */
101
+ document_metadata ?: ParseJobResultsResponse . DocumentMetadata ;
102
+
95
103
/**
96
104
* The parsed, structured Markdown of the input file. Only present if
97
105
* `markdown-document` was among the requested output types.
@@ -103,137 +111,143 @@ export interface ParseJobResultsResponse {
103
111
* requested) and/or per-page `ParsedBlock`s (if `blocks-per-page` was requested).
104
112
*/
105
113
pages ?: Array < ParseJobResultsResponse . Page > ;
106
-
107
- /**
108
- * The table of contents representing the document's heading hierarchy. Only
109
- * present if `enable_document_hierarchy` was set to true in the parse request.
110
- */
111
- table_of_contents ?: ParseJobResultsResponse . TableOfContents ;
112
114
}
113
115
114
116
export namespace ParseJobResultsResponse {
115
117
/**
116
- * Per-page parse results.
118
+ * Document-level metadata parsed from the document
117
119
*/
118
- export interface Page {
120
+ export interface DocumentMetadata {
119
121
/**
120
- * The index of the parsed page (zero-indexed)
122
+ * Hierarchy of the document, as both heading blocks and a markdown table of
123
+ * contents
121
124
*/
122
- index : number ;
123
-
124
- /**
125
- * The parsed, structured blocks of this page. Present if `blocks-per-page` was
126
- * among the requested output types.
127
- */
128
- blocks ?: Array < Page . Block > ;
129
-
130
- /**
131
- * The parsed, structured Markdown of this page. Present if `markdown-per-page` was
132
- * among the requested output types.
133
- */
134
- markdown ?: string ;
125
+ hierarchy ?: DocumentMetadata . Hierarchy ;
135
126
}
136
127
137
- export namespace Page {
128
+ export namespace DocumentMetadata {
138
129
/**
139
- * One logical block of content from a parsed page.
130
+ * Hierarchy of the document, as both heading blocks and a markdown table of
131
+ * contents
140
132
*/
141
- export interface Block {
133
+ export interface Hierarchy {
142
134
/**
143
- * Unique ID of the block
135
+ * Heading blocks which define the hierarchy of the document
144
136
*/
145
- id : string ;
137
+ blocks ?: Array < Hierarchy . Block > ;
146
138
147
139
/**
148
- * The normalized bounding box of the block, as relative percentages of the page
149
- * width and height
140
+ * Markdown representation of the table of contents for this document
150
141
*/
151
- bounding_box : Block . BoundingBox ;
152
-
153
- /**
154
- * The Markdown representation of the block
155
- */
156
- markdown : string ;
142
+ table_of_contents ?: string ;
143
+ }
157
144
145
+ export namespace Hierarchy {
158
146
/**
159
- * The type of the block
147
+ * One logical block of content from a parsed page.
160
148
*/
161
- type : 'heading' | 'text' | 'table' | 'figure' ;
149
+ export interface Block {
150
+ /**
151
+ * Unique ID of the block
152
+ */
153
+ id : string ;
162
154
163
- /**
164
- * The confidence level of this block categorized as 'low', 'medium', or 'high'.
165
- * Only available for blocks of type 'table' currently.
166
- */
167
- confidence_level ?: 'low' | 'medium' | 'high' ;
155
+ /**
156
+ * The normalized bounding box of the block, as relative percentages of the page
157
+ * width and height
158
+ */
159
+ bounding_box : Block . BoundingBox ;
168
160
169
- /**
170
- * The level of the block in the document hierarchy, starting at 0 for the
171
- * root-level title block. Only present if `enable_document_hierarchy` was set to
172
- * true in the request.
173
- */
174
- hierarchy_level ?: number ;
161
+ /**
162
+ * The Markdown representation of the block
163
+ */
164
+ markdown : string ;
175
165
176
- /**
177
- * The page (0-indexed) that this block belongs to. Only set for heading blocks
178
- * that are returned in the table of contents.
179
- */
180
- page_index ?: number ;
166
+ /**
167
+ * The type of the block
168
+ */
169
+ type : 'heading' | 'text' | 'table' | 'figure' ;
181
170
182
- /**
183
- * The IDs of the parent in the document hierarchy, sorted from root-level to
184
- * bottom. For root-level heading blocks, this will be an empty list. Only present
185
- * if `enable_document_hierarchy` was set to true in the request.
186
- */
187
- parent_ids ?: Array < string > ;
188
- }
171
+ /**
172
+ * The confidence level of this block categorized as 'low', 'medium', or 'high'.
173
+ * Only available for blocks of type 'table' currently.
174
+ */
175
+ confidence_level ?: 'low' | 'medium' | 'high' ;
189
176
190
- export namespace Block {
191
- /**
192
- * The normalized bounding box of the block, as relative percentages of the page
193
- * width and height
194
- */
195
- export interface BoundingBox {
196
177
/**
197
- * The x-coordinate of the top-left corner of the bounding box
178
+ * The level of the block in the document hierarchy, starting at 0 for the
179
+ * root-level title block. Only present if `enable_document_hierarchy` was set to
180
+ * true in the request.
198
181
*/
199
- x0 : number ;
182
+ hierarchy_level ? : number ;
200
183
201
184
/**
202
- * The x-coordinate of the bottom-right corner of the bounding box
185
+ * The page (0-indexed) that this block belongs to. Only set for heading blocks
186
+ * that are returned in the table of contents.
203
187
*/
204
- x1 : number ;
188
+ page_index ? : number ;
205
189
206
190
/**
207
- * The y-coordinate of the top-left corner of the bounding box
191
+ * The IDs of the parent in the document hierarchy, sorted from root-level to
192
+ * bottom. For root-level heading blocks, this will be an empty list. Only present
193
+ * if `enable_document_hierarchy` was set to true in the request.
208
194
*/
209
- y0 : number ;
195
+ parent_ids ?: Array < string > ;
196
+ }
210
197
198
+ export namespace Block {
211
199
/**
212
- * The y-coordinate of the bottom-right corner of the bounding box
200
+ * The normalized bounding box of the block, as relative percentages of the page
201
+ * width and height
213
202
*/
214
- y1 : number ;
203
+ export interface BoundingBox {
204
+ /**
205
+ * The x-coordinate of the top-left corner of the bounding box
206
+ */
207
+ x0 : number ;
208
+
209
+ /**
210
+ * The x-coordinate of the bottom-right corner of the bounding box
211
+ */
212
+ x1 : number ;
213
+
214
+ /**
215
+ * The y-coordinate of the top-left corner of the bounding box
216
+ */
217
+ y0 : number ;
218
+
219
+ /**
220
+ * The y-coordinate of the bottom-right corner of the bounding box
221
+ */
222
+ y1 : number ;
223
+ }
215
224
}
216
225
}
217
226
}
218
227
219
228
/**
220
- * The table of contents representing the document's heading hierarchy. Only
221
- * present if `enable_document_hierarchy` was set to true in the parse request.
229
+ * Per-page parse results.
222
230
*/
223
- export interface TableOfContents {
231
+ export interface Page {
232
+ /**
233
+ * The index of the parsed page (zero-indexed)
234
+ */
235
+ index : number ;
236
+
224
237
/**
225
- * Heading blocks that define the hierarchy of the document
238
+ * The parsed, structured blocks of this page. Present if `blocks-per-page` was
239
+ * among the requested output types.
226
240
*/
227
- blocks ?: Array < TableOfContents . Block > ;
241
+ blocks ?: Array < Page . Block > ;
228
242
229
243
/**
230
- * Markdown representation of the table of contents that can be pre-pended to the
231
- * markdown document .
244
+ * The parsed, structured Markdown of this page. Present if `markdown-per-page` was
245
+ * among the requested output types.
232
246
*/
233
247
markdown ?: string ;
234
248
}
235
249
236
- export namespace TableOfContents {
250
+ export namespace Page {
237
251
/**
238
252
* One logical block of content from a parsed page.
239
253
*/
@@ -372,43 +386,43 @@ export interface ParseCreateParams {
372
386
raw_file : Core . Uploadable ;
373
387
374
388
/**
375
- * Controls parsing heading levels (e.g. H1, H2, H3) at higher quality. Adds a
376
- * table of contents to the output with the structure of the entire parsed
377
- * document . Not permitted in ' basic' parsing_mode, or if page_range is not
378
- * continuous and/or does not start from page zero.
389
+ * Adds a table of contents to the output with the structure of the entire parsed
390
+ * document. This feature is in beta. Controls parsing heading levels (e.g. H1, H2,
391
+ * H3) at higher quality. Not permitted in `basic` parsing_mode, or if page_range
392
+ * is not continuous and/or does not start from page zero.
379
393
*/
380
394
enable_document_hierarchy ?: boolean ;
381
395
382
396
/**
383
397
* Controls whether tables are split into multiple tables by row with the headers
384
398
* propagated. Use for improving LLM comprehension of very large tables. Not
385
- * permitted in ' basic' parsing_mode.
399
+ * permitted in `basic` parsing_mode.
386
400
*/
387
401
enable_split_tables ?: boolean ;
388
402
389
403
/**
390
- * Controls how thorough figure captions are. ' concise' is short and minimizes
391
- * chances of hallucinations. ' detailed' is more thorough and can include
392
- * commentary. Not permitted in ' basic' parsing_mode.
404
+ * Controls how thorough figure captions are. `concise` is short and minimizes
405
+ * chances of hallucinations. `detailed` is more thorough and can include
406
+ * commentary; this mode is in beta. Not permitted in `basic` parsing_mode.
393
407
*/
394
408
figure_caption_mode ?: 'concise' | 'detailed' ;
395
409
396
410
/**
397
411
* Threshold number of table cells beyond which large tables are split if
398
- * `enable_split_tables` is True. Not permitted in ' basic' parsing_mode.
412
+ * `enable_split_tables` is True. Not permitted in `basic` parsing_mode.
399
413
*/
400
414
max_split_table_cells ?: number ;
401
415
402
416
/**
403
417
* Optional string representing page range to be parsed. Format: comma-separated
404
- * indexes (0-based) e.g. ' 0,1,2,5,6' or ranges ( inclusive of both ends) e.g.
405
- * ' 0-2,5,6'
418
+ * indexes (0-based, e.g. `0,1,2,5,6`), or ranges inclusive of both ends (e.g.
419
+ * `0-2,5,6`)
406
420
*/
407
421
page_range ?: string ;
408
422
409
423
/**
410
- * The settings to use for parsing. ' basic' is for simple, text-only documents.
411
- * ' standard' is for complex documents with images, complex hierarchy, and/or no
424
+ * The settings to use for parsing. `basic` is for simple, text-only documents.
425
+ * `standard` is for complex documents with images, complex hierarchy, and/or no
412
426
* natively encoded textual data (e.g. for scanned documents).
413
427
*/
414
428
parse_mode ?: 'basic' | 'standard' ;
@@ -417,11 +431,11 @@ export interface ParseCreateParams {
417
431
export interface ParseJobResultsParams {
418
432
/**
419
433
* The desired output format(s) of the parsed file. Must be `markdown-document`,
420
- * `markdown-per-page`, and/or `blocks-per-page`. `markdown-document` parses the
421
- * whole document into a single concatenated markdown output. `markdown-per-page`
422
- * provides markdown output per page. `blocks-per-page` provides a structured JSON
434
+ * `markdown-per-page`, and/or `blocks-per-page`. Specify multiple values to get
435
+ * multiple formats in the response. `markdown-document` parses the whole document
436
+ * into a single concatenated markdown output. `markdown-per-page` provides
437
+ * markdown output per page. `blocks-per-page` provides a structured JSON
423
438
* representation of the content blocks on each page, sorted by reading order.
424
- * Specify multiple values to get multiple formats in the response.
425
439
*/
426
440
output_types ?: Array < 'markdown-document' | 'markdown-per-page' | 'blocks-per-page' > ;
427
441
}
0 commit comments