Merge pull request #18 from DocumindHQ/formatter

Tammilore · web-flow · commit 070d4dee3dd7 · 2024-12-14T15:29:00.000Z
File converters added
diff --git a/extractor/src/index.js b/extractor/src/index.js
@@ -1,2 +1,3 @@
 export { extract } from './services/extract.js';
 export { templates } from './services/templates.js';
+export { formatter } from './services/formatter.js';
diff --git a/extractor/src/services/formatter.js b/extractor/src/services/formatter.js
@@ -0,0 +1,58 @@
+import { convertFile } from '../converter.js';
+import { isPdfFile } from '../utils/pdfValidator.js';
+import { convertToText } from '../utils/convertToText.js';
+
+/**
+ * Converts a PDF into markdown.
+ * @param {object} options - Extraction options.
+ * @param {string} options.file - The PDF file to convert.
+ * @param {string} [options.model] - The LLM model forwarded to the converter.
+ * @returns {Promise<string>} - Resolves with the markdown content.
+ * @throws {Error} If the file is missing, is not a PDF, or yields no markdown.
+ */
+const getMarkdown = async ({ file, model }) => {
+  const extract = async () => {
+    if (!file) {
+      throw new Error('File is required.');
+    }
+    if (!isPdfFile(file)) {
+      throw new Error('File must be a PDF.');
+    }
+    const { markdown: content } = await convertFile(file, model);
+    if (!content) {
+      throw new Error("Failed to extract markdown.");
+    }
+    return content;
+  };
+
+  // Log every failure once here, then hand the same error back to the caller.
+  return extract().catch((error) => {
+    console.error("Error extracting markdown:", error);
+    throw error;
+  });
+};
+
+/**
+ * Produces plain text for a PDF by extracting its markdown and stripping the markup.
+ * @param {object} options - Extraction options.
+ * @param {string} options.file - The path to the PDF file.
+ * @param {string} [options.model] - The LLM model to use.
+ * @returns {Promise<string>} - Resolves with the plain text content.
+ */
+const getPlainText = async ({ file, model }) => {
+  const toPlainText = async () => convertToText(await getMarkdown({ file, model }));
+
+  // Failures from either step are logged here and then rethrown unchanged.
+  return toPlainText().catch((error) => {
+    console.error("Error extracting plain text:", error);
+    throw error;
+  });
+};
+
+/**
+ * Formatters keyed by output format name.
+ *
+ * - `markdown`: resolves with the markdown extracted from a PDF.
+ * - `plaintext`: resolves with that markdown converted to plain text.
+ */
+export const formatter = { markdown: getMarkdown, plaintext: getPlainText };
diff --git a/extractor/src/utils/convertToText.js b/extractor/src/utils/convertToText.js
@@ -0,0 +1,56 @@
+/**
+ * Converts markdown into plain text by stripping common markdown syntax.
+ *
+ * Inline and fenced code is removed together with its content; table rows are
+ * flattened into space-separated cells, one row per line.
+ *
+ * @param {string} markdown - The markdown content to convert.
+ * @returns {string} The plain text content.
+ * @throws {Error} If `markdown` is empty or not a string.
+ */
+export const convertToText = (markdown) => {
+  if (!markdown || typeof markdown !== "string") {
+    throw new Error("Valid markdown content is required.");
+  }
+
+  // A delimiter row (|---|:--:|) or horizontal rule: only pipes, colons,
+  // dashes and whitespace, with at least one dash.
+  const isDividerLine = (line) => /^[\s|:-]+$/.test(line) && line.includes("-");
+
+  // Turns "| a | b |" into "a b"; lines that are not table rows pass through.
+  const flattenTableRow = (line) => {
+    const trimmed = line.trim();
+    if (!trimmed.startsWith("|")) {
+      return line;
+    }
+    return trimmed
+      .split("|")
+      .map((cell) => cell.trim())
+      .filter((cell) => cell !== "")
+      .join(" ");
+  };
+
+  return markdown
+    .replace(/(```.*?\n[\s\S]*?\n```|`.*?`)/g, "") // Code blocks/inline
+    // Images must be handled before links, otherwise the link pattern eats
+    // "[alt](src)" and leaves the leading "!" behind.
+    .replace(/!\[(.*?)\]\(.*?\)/g, "$1") // Images
+    .replace(/\[(.*?)\]\(.*?\)/g, "$1") // Links
+    // Emphasis delimiters must hug non-space text, so "2 * 3 * 4" and "* item"
+    // bullets survive; underscore forms must not sit inside a word, so
+    // snake_case identifiers survive.
+    .replace(/\*\*(?=\S)(.+?)(?<=\S)\*\*/g, "$1") // Bold (**)
+    .replace(/(?<!\w)__(?=\S)(.+?)(?<=\S)__(?!\w)/g, "$1") // Bold (__)
+    .replace(/\*(?=\S)(.+?)(?<=\S)\*/g, "$1") // Italic (*)
+    .replace(/(?<!\w)_(?=\S)(.+?)(?<=\S)_(?!\w)/g, "$1") // Italic (_)
+    // Line-anchored so "#" and ">" inside running text are left alone.
+    .replace(/^[ \t]{0,3}#{1,6}[ \t]+/gm, "") // Headings
+    .replace(/^[ \t]*(?:>[ \t]?)+/gm, "") // Blockquotes
+    .split("\n")
+    .filter((line) => !isDividerLine(line))
+    .map(flattenTableRow)
+    .join("\n")
+    .replace(/\n{2,}/g, "\n") // Excess newlines
+    .trim();
+};
+  

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`export { extract } from './services/extract.js';`
`2`	`2`	`export { templates } from './services/templates.js';`
	`3`	`+export { formatter } from './services/formatter.js';`