Skip to content

增加doc、xlsx文件支持 #68

@leapar

Description

@leapar

希望增加 Word 文档、Excel 文档支持。我写了一个版本，不知道是否合适。

https://github.com/leapar/eino-ext

package indexing

import (
	"context"
	"fmt"

	"github.com/cloudwego/eino-ext/components/document/loader/file"
	"github.com/cloudwego/eino-ext/components/document/parser/csv"
	"github.com/cloudwego/eino-ext/components/document/parser/doc"
	"github.com/cloudwego/eino-ext/components/document/parser/pdf"
	"github.com/cloudwego/eino-ext/components/document/parser/xlsx"
	"github.com/cloudwego/eino/components/document"
	"github.com/cloudwego/eino/components/document/parser"
)

// newLoader is the component initialization function of node 'FileLoader' in
// graph 'KnowledgeIndexing'.
//
// It constructs one parser per supported document format (PDF, DOC/DOCX, CSV,
// XLSX and plain text), registers them by file extension in an extension
// parser, and returns a file loader configured with that parser.
//
// On any construction failure it returns a nil loader and the underlying
// error wrapped with the name of the step that failed, so callers can tell
// which component could not be created while still matching the cause with
// errors.Is / errors.As.
func newLoader(ctx context.Context) (ldr document.Loader, err error) {
	pdfParser, err := pdf.NewPDFParser(&pdf.Config{ToPages: true})
	if err != nil {
		return nil, fmt.Errorf("creating pdf parser: %w", err)
	}

	docParser, err := doc.NewDocParser()
	if err != nil {
		return nil, fmt.Errorf("creating doc parser: %w", err)
	}

	csvParser, err := csv.NewCsvParser()
	if err != nil {
		return nil, fmt.Errorf("creating csv parser: %w", err)
	}

	xlsxParser, err := xlsx.NewXlsxParser()
	if err != nil {
		return nil, fmt.Errorf("creating xlsx parser: %w", err)
	}

	// A single text parser instance serves .txt, .md and the fallback slot.
	txtParser := &parser.TextParser{}

	extParser, err := parser.NewExtParser(ctx, &parser.ExtParserConfig{
		// NOTE(review): presumably used for extensions not listed in Parsers —
		// confirm against the ExtParser documentation.
		FallbackParser: txtParser,
		Parsers: map[string]parser.Parser{
			".pdf":  pdfParser,
			".txt":  txtParser,
			".md":   txtParser,
			".doc":  docParser,
			".docx": docParser,
			".xlsx": xlsxParser,
			".csv":  csvParser,
		},
	})
	if err != nil {
		return nil, fmt.Errorf("creating extension parser: %w", err)
	}

	// TODO Modify component configuration here.
	config := &file.FileLoaderConfig{
		Parser: extParser,
	}
	ldr, err = file.NewFileLoader(ctx, config)
	if err != nil {
		// Return an untyped nil so the interface result is truly nil on failure.
		return nil, fmt.Errorf("creating file loader: %w", err)
	}

	return ldr, nil
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions