Skip to content

Commit 2465e5d

Browse files
authored
Merge pull request #19 from DocumindHQ/multi-files
Support for more files and schema field types
2 parents bee8f5a + 24290b6 commit 2465e5d

File tree

6 files changed

+53
-6
lines changed

6 files changed

+53
-6
lines changed

core/dist/openAI.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ const getCompletion = async ({ apiKey, imagePath, llmParams, maintainFormat, mod
2424
}
2525
}
2626
const systemPrompt = `
27-
Convert the following document page to markdown.
27+
Convert the following image/document to markdown.
2828
Return only the markdown with no explanation text. Do not include deliminators like '''markdown.
2929
You must include all information on the page. Do not exclude headers, footers, or subtext.
3030
`;

core/src/openAI.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export const getCompletion = async ({
3333
}
3434

3535
const systemPrompt = `
36-
Convert the following document page to markdown.
36+
Convert the following image/document to markdown.
3737
Return only the markdown with no explanation text. Do not include deliminators like '''markdown.
3838
You must include all information on the page. Do not exclude headers, footers, or subtext.
3939
`;

extractor/src/converter.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ export const convertFile = async (filePath, model) => {
1313
const totalPages = pages.length;
1414

1515
const markdown = await generateMarkdownDocument(pages);
16+
console.log('Markdown generated', markdown);
1617

1718
return { markdown, totalPages, fileName };
1819
} catch (error) {

extractor/src/services/extract.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { extractData } from '../main-extractor.js';
2-
import { isPdfFile } from '../utils/pdfValidator.js';
2+
import { isValidFile } from '../utils/fileValidator.js';
33
import { validateSchema } from '../utils/schemaValidator.js';
44
import { getTemplate } from './templates.js';
55

@@ -19,8 +19,8 @@ export async function extract({ file, schema, template, model, autoSchema }) {
1919
throw new Error('File is required.');
2020
}
2121

22-
if (!isPdfFile(file)) {
23-
throw new Error('File must be a PDF.');
22+
if (!isValidFile(file)) {
23+
throw new Error('File must be a valid format: PDF, PNG, JPG, TXT, DOCX, or HTML.');
2424
}
2525

2626
let finalSchema = null;

extractor/src/utils/fileValidator.js

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import axios from 'axios';
2+
3+
/**
 * Check whether a remote file is one of the supported document types.
 *
 * Validation happens in two steps:
 *   1. The URL's path must end in an allowed extension (case-insensitive).
 *   2. A HEAD request is issued and the response's Content-Type header must
 *      start with one of the allowed MIME types.
 *
 * This function is async: callers must `await` the result. A bare Promise is
 * always truthy, so `if (!isValidFile(file))` without `await` never fails.
 *
 * @param {string} file - Absolute URL of the file to check.
 * @returns {Promise<boolean>} Resolves to true if the file is valid, false
 *   otherwise (including malformed URLs, failed requests, and responses
 *   without a Content-Type header). Never rejects.
 */
export async function isValidFile(file) {
  const allowedMimeTypes = {
    pdf: 'application/pdf',
    png: 'image/png',
    jpg: 'image/jpeg',
    jpeg: 'image/jpeg',
    txt: 'text/plain',
    docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    html: 'text/html',
  };
  // Single source of truth: the allowed extensions are the keys of the MIME map.
  const allowedExtensions = Object.keys(allowedMimeTypes);

  // `new URL` throws on malformed/relative/empty input; treat that as "invalid"
  // so the documented boolean contract holds instead of rejecting.
  let urlPath;
  try {
    urlPath = new URL(file).pathname;
  } catch {
    return false;
  }

  // Test only the pathname so query strings and fragments are ignored.
  const extensionRegex = new RegExp(`\\.(${allowedExtensions.join('|')})$`, 'i');
  if (!extensionRegex.test(urlPath)) {
    return false;
  }

  // Confirm the server-reported MIME type without downloading the body.
  try {
    const response = await axios.head(file);
    const contentType = response.headers['content-type'];

    // Servers may omit Content-Type on HEAD responses; that is not valid.
    if (typeof contentType !== 'string') {
      return false;
    }

    // Media types are case-insensitive; `startsWith` tolerates parameters
    // such as "; charset=utf-8".
    const normalizedType = contentType.trim().toLowerCase();
    return Object.values(allowedMimeTypes).some((mime) => normalizedType.startsWith(mime));
  } catch (error) {
    console.error('Error checking MIME type:', error);
    return false;
  }
}

extractor/src/utils/schemaValidator.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* @returns {Object} - { isValid: boolean, errors: Array<string> }
55
*/
66
export function validateSchema(schema) {
7-
const validTypes = ["string", "number", "array", "object"];
7+
const validTypes = ["string", "number", "array", "object", "boolean", "enum"];
88
let errors = [];
99

1010
if (!Array.isArray(schema)) {
@@ -36,7 +36,17 @@ export function validateSchema(schema) {
3636
field.children.forEach((child, index) => validateField(child, `${path}.children[${index}]`));
3737
}
3838
}
39+
40+
// Additional checks for enum
41+
if (field.type === "enum") {
42+
if (!field.hasOwnProperty("values") || !Array.isArray(field.values) || field.values.length === 0) {
43+
errors.push(`"values" is required and must be a non-empty array for enum at ${path}`);
44+
} else if (!field.values.every((value) => typeof value === "string")) {
45+
errors.push(`"values" for enum at ${path} must be an array of strings`);
46+
}
3947
}
48+
49+
}
4050

4151
schema.forEach((field, index) => validateField(field, `schema[${index}]`));
4252

0 commit comments

Comments
 (0)