Skip to content

Commit 070d4de

Browse files
authored
Merge pull request #18 from DocumindHQ/formatter
File converters added
2 parents d257972 + 2f2a6d0 commit 070d4de

File tree

3 files changed

+81
-0
lines changed

3 files changed

+81
-0
lines changed

extractor/src/index.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
export { extract } from './services/extract.js';
22
export { templates } from './services/templates.js';
3+
export { formatter } from './services/formatter.js';

extractor/src/services/formatter.js

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import { convertFile } from '../converter.js';
2+
import { isPdfFile } from '../utils/pdfValidator.js';
3+
import { convertToText } from '../utils/convertToText.js';
4+
5+
/**
 * Extracts markdown content from a PDF.
 *
 * Any failure (validation or conversion) is logged with console.error and
 * then rethrown unchanged, so callers still receive the original error.
 *
 * @param {object} options - Options for extracting the markdown.
 * @param {string} options.file - The PDF file.
 * @param {string} [options.model] - The LLM model to use.
 * @returns {Promise<string>} - The markdown content.
 * @throws {Error} If `file` is missing, fails the PDF check, or the
 *   conversion yields no markdown.
 */
const getMarkdown = async ({ file, model } = {}) => {
  // The `= {}` default makes a call with no argument fail with the explicit
  // "File is required." error instead of a destructuring TypeError.
  try {
    if (!file) {
      throw new Error('File is required.');
    }

    // `await` is a no-op for a synchronous validator and is required if the
    // validator returns a Promise (an un-awaited Promise is always truthy,
    // which would let every file through).
    const isPdf = await isPdfFile(file);
    if (!isPdf) {
      throw new Error('File must be a PDF.');
    }

    // Tolerate a nullish converter result so the failure surfaces as the
    // descriptive error below rather than a destructuring TypeError.
    const { markdown } = (await convertFile(file, model)) || {};

    if (!markdown) {
      throw new Error("Failed to extract markdown.");
    }

    return markdown;
  } catch (error) {
    console.error("Error extracting markdown:", error);
    throw error;
  }
};
34+
35+
/**
 * Extracts plain text from a PDF by converting markdown to text.
 *
 * Delegates to getMarkdown for validation and extraction, then passes the
 * result through convertToText. Errors are logged with console.error and
 * rethrown unchanged.
 *
 * @param {object} options - Options for extracting the plain text.
 * @param {string} options.file - The path to the PDF file.
 * @param {string} [options.model] - The LLM model to use.
 * @returns {Promise<string>} - The plain text content.
 * @throws {Error} Whatever getMarkdown or convertToText throws.
 */
const getPlainText = async ({ file, model } = {}) => {
  // The `= {}` default lets a call with no argument reach getMarkdown's own
  // validation instead of failing on parameter destructuring.
  try {
    const markdown = await getMarkdown({ file, model });
    return convertToText(markdown);
  } catch (error) {
    console.error("Error extracting plain text:", error);
    throw error;
  }
};
51+
52+
/**
 * Public formatter API: maps each supported output format name to the
 * async function that produces it.
 *
 * - `markdown`  -> getMarkdown
 * - `plaintext` -> getPlainText
 */
export const formatter = { markdown: getMarkdown, plaintext: getPlainText };

extractor/src/utils/convertToText.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
/**
 * Converts markdown to plain text by stripping common markdown syntax.
 *
 * Processing order matters and is as follows:
 *  1. Fenced code blocks and inline code spans are removed entirely
 *     (including their contents).
 *  2. Images, then links, are replaced by their alt/label text. Images go
 *     first because the link pattern also matches the `[alt](url)` tail of
 *     an image and would leave a stray `!` behind.
 *  3. Heading and blockquote markers are removed at the start of a line only,
 *     so `#` and `>` inside sentences are preserved.
 *  4. Lines made up solely of dashes, pipes, colons and spaces (table
 *     dividers, horizontal rules) are removed; hyphens inside ordinary text
 *     are preserved.
 *  5. Lines starting with `|` are treated as table rows: the cells are
 *     trimmed and joined with single spaces.
 *  6. Bold/italic markers are removed. Underscore emphasis is only
 *     recognised at word boundaries, so identifiers such as `snake_case_name`
 *     are left intact; asterisk emphasis must hug its content, so list
 *     bullets (`* item`) are not mistaken for an opening marker.
 *  7. Runs of blank lines are collapsed and the result is trimmed.
 *
 * @param {string} markdown - The markdown content to convert.
 * @returns {string} The plain text content.
 * @throws {Error} If `markdown` is empty or not a string.
 */
export const convertToText = (markdown) => {
  if (!markdown || typeof markdown !== "string") {
    throw new Error("Valid markdown content is required.");
  }

  // Turns the remainder of a table row (everything after the leading pipe)
  // into its trimmed cells joined by single spaces.
  const flattenRow = (_, cells) =>
    cells
      .replace(/\|[ \t]*$/, "") // drop the closing pipe, if any
      .split("|")
      .map((cell) => cell.trim())
      .join(" ");

  return markdown
    .replace(/^[ \t]*```[^\n]*\n[\s\S]*?^[ \t]*```[ \t]*$/gm, "") // Fenced code blocks
    .replace(/`[^`\n]*`/g, "") // Inline code
    .replace(/!\[([^\]]*)\]\([^)]*\)/g, "$1") // Images (must precede links)
    .replace(/\[([^\]]*)\]\([^)]*\)/g, "$1") // Links
    .replace(/^[ \t]*#+[ \t]+/gm, "") // Headings (line start only)
    .replace(/^[ \t]*(?:>[ \t]?)+/gm, "") // Blockquotes (line start only)
    .replace(/^(?=[^\n]*-)[ \t|:-]+$/gm, "") // Table dividers / horizontal rules
    .replace(/^[ \t]*\|(.*)$/gm, flattenRow) // Table rows
    .replace(/\*\*(?=\S)(.+?)\*\*/g, "$1") // Bold (**)
    .replace(/\b__(.+?)__\b/g, "$1") // Bold (__), word-boundary only
    .replace(/\*([^\s*](?:[^*\n]*[^\s*])?)\*/g, "$1") // Italic (*)
    .replace(/\b_(.+?)_\b/g, "$1") // Italic (_), word-boundary only
    .replace(/\n{2,}/g, "\n") // Excess newlines
    .trim();
};
22+

0 commit comments

Comments
 (0)