Skip to content

add lazyllm reader and paddleocr in engine #505

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 50 additions & 2 deletions lazyllm/engine/engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typing import List, Dict, Type, Optional, Union, Any, overload
from typing import List, Dict, Type, Optional, Union, Any, overload,Callable
import lazyllm
from lazyllm import graph, switch, pipeline, package
from lazyllm.tools import IntentClassifier, SqlManager
from lazyllm.tools.rag import SimpleDirectoryReader,DocNode
from lazyllm.common import compile_func
from .node import all_nodes, Node
from .node_meta_hook import NodeMetaHook
Expand All @@ -13,7 +14,9 @@
from datetime import datetime, timedelta
import requests
import json

from fsspec import AbstractFileSystem
import paddleocr
import string
# Each session will have a separate engine
class Engine(ABC):
__default_engine__ = None
Expand Down Expand Up @@ -772,3 +775,48 @@ def __call__(self, *args, **kw) -> Union[str, List[str]]:
@NodeConstructor.register('File')
def make_file(id: str):
    """Build the resource registered under the 'File' node kind.

    Args:
        id: Value forwarded unchanged to ``FileResource``.

    Returns:
        The ``FileResource`` constructed from ``id``.
    """
    resource = FileResource(id)
    return resource



class ReaderResource(object):
    """Callable that loads the given files through ``SimpleDirectoryReader``."""

    def __call__(self, input_files: Union[str, List[str]] = "",
                 exclude: Optional[List] = None, exclude_hidden: bool = True, recursive: bool = False,
                 encoding: str = "utf-8", filename_as_id: bool = False, required_exts: Optional[List[str]] = None,
                 file_extractor: Optional[Dict[str, Callable]] = None, fs: Optional[AbstractFileSystem] = None,
                 metadata_genf: Optional[Callable[[str], Dict]] = None, num_files_limit: Optional[int] = None,
                 return_trace: bool = False, metadatas: Optional[Dict] = None):
        """Load ``input_files`` and return the reader's result.

        Args:
            input_files: A single path or a list of paths. An empty string,
                an empty list or ``None`` short-circuits to an empty list.
            exclude, exclude_hidden, recursive, encoding, filename_as_id,
            required_exts, file_extractor, fs, metadata_genf, num_files_limit,
            return_trace, metadatas: Forwarded positionally, in this order,
                to ``SimpleDirectoryReader`` after the input directory and
                the file list.

        Returns:
            ``[]`` when there is nothing to read, otherwise whatever
            ``SimpleDirectoryReader(...)._load_data()`` returns.
        """
        # Truthiness (rather than len()) also covers None, which would
        # otherwise raise TypeError before any file is read.
        if not input_files:
            return []
        if isinstance(input_files, str):
            input_files = [input_files]
        # The first positional argument is passed as an empty string; the
        # files to read are given explicitly through the second argument.
        reader = SimpleDirectoryReader(
            "", input_files, exclude, exclude_hidden, recursive,
            encoding, filename_as_id, required_exts, file_extractor, fs,
            metadata_genf, num_files_limit, return_trace, metadatas)
        # NOTE(review): relies on the private ``_load_data`` method.
        return reader._load_data()

@NodeConstructor.register('Reader')
def make_simple_reader():
    """Build the resource registered under the 'Reader' node kind.

    Returns:
        A new ``ReaderResource`` instance.
    """
    reader = ReaderResource()
    return reader

# Explicit set of characters treated as punctuation: everything in
# ``string.punctuation`` (which includes ASCII symbols such as ``$`` and ``+``)
# plus a handful of common CJK marks.
punctuation = set(string.punctuation + ",。!?;:“”‘’()【】《》…—~、")


def is_all_punctuation(s: str) -> bool:
    """Return True when every character of *s* is punctuation.

    A character counts as punctuation if it is in the module-level
    ``punctuation`` set or if its Unicode general category starts with ``P``.
    The category check covers marks the hand-written set cannot enumerate,
    such as full-width forms and CJK corner brackets.

    Args:
        s: Text to inspect.

    Returns:
        True if all characters are punctuation. The empty string yields True
        because ``all()`` of an empty iterable is True.
    """
    # Imported locally so this block does not depend on a new file-level import.
    import unicodedata

    return all(
        c in punctuation or unicodedata.category(c).startswith('P')
        for c in s
    )
class OCR(lazyllm.Module):
    """Module that runs PaddleOCR on its input and returns the text as DocNodes."""

    def __init__(self):
        """Create the module and the ``paddleocr.PaddleOCR`` instance it uses."""
        super().__init__()
        self._m = paddleocr.PaddleOCR()

    def forward(self, input, metadatas: Optional[Dict] = None):
        """Recognize text in ``input`` and wrap each kept fragment in a DocNode.

        Args:
            input: Passed unchanged to ``PaddleOCR.predict``.
            metadatas: Used as ``global_metadata`` of every node; an empty
                dict is used when it is ``None`` or empty.

        Returns:
            A list of ``DocNode`` objects, one per entry of each result's
            ``'rec_texts'`` that is non-empty after stripping and is not
            made up solely of punctuation.
        """
        nodes = []
        for entry in self._m.predict(input):
            stripped = (raw.strip() for raw in entry['rec_texts'])
            nodes.extend(
                DocNode(text=piece, global_metadata=metadatas or {})
                for piece in stripped
                if len(piece) > 0 and not is_all_punctuation(piece)
            )
        return nodes



@NodeConstructor.register('OCR')
def make_ocr():
    """Build the module registered under the 'OCR' node kind.

    Returns:
        A new ``OCR`` instance.
    """
    ocr_module = OCR()
    return ocr_module
2 changes: 1 addition & 1 deletion requirements.full.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,4 @@ pymongo
pymysql
flagembedding
mcp>=1.5.0

paddleocr
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ pymilvus>=2.4.11, <2.5.0
async-timeout
httpx<0.28.0
rapidfuzz

paddleocr
21 changes: 18 additions & 3 deletions tests/basic_tests/test_engine.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from lazyllm.engine import LightEngine
from lazyllm.engine import LightEngine,engine
import pytest
import time
from gradio_client import Client
Expand All @@ -11,7 +11,8 @@
import socket
import threading
import requests

import paddleocr
import string
HOOK_PORT = 33733
HOOK_ROUTE = "mock_post"
fastapi_code = """
Expand Down Expand Up @@ -665,7 +666,21 @@ def test_engine_status(self):
engine.release_node(gid)
assert '__start__' in engine._nodes and '__end__' in engine._nodes


def test_engine_pdf_reader(self):
    """Run a 'Reader' node and then an 'OCR' node through ``LightEngine``.

    The Reader half writes its own temporary text file so it does not depend
    on paths that exist on only one machine. The OCR half needs a real
    document, so its path is taken from the ``LAZYLLM_TEST_OCR_FILE``
    environment variable and that half is skipped when no such file exists.
    """
    # Local imports keep the test independent of the file-level import list.
    import os
    import tempfile
    from lazyllm.tools.rag import SimpleDirectoryReader

    edges = [dict(iid='__start__', oid='1'), dict(iid='1', oid='__end__')]
    engine = LightEngine()

    with tempfile.TemporaryDirectory() as tmp_dir:
        p = os.path.join(tmp_dir, 'sample.txt')
        with open(p, 'w', encoding='utf-8') as f:
            f.write('道可道,非常道。\n名可名,非常名。\n')

        nodes = [dict(id='1', kind='Reader', name='m1', args=dict())]
        gid = engine.start(nodes, edges)
        data = engine.run(gid, p)
        # The node must yield as many entries as reading the file directly.
        entrys = SimpleDirectoryReader(input_files=[p])._load_data()
        assert len(data) == len(entrys)
        engine.stop(gid)
        engine.reset()

    ocr_sample = os.getenv('LAZYLLM_TEST_OCR_FILE')
    if not ocr_sample or not os.path.exists(ocr_sample):
        pytest.skip('set LAZYLLM_TEST_OCR_FILE to an existing document to run the OCR check')

    nodes = [dict(id='1', kind='OCR', name='m1', args=dict())]
    gid = engine.start(nodes, edges)
    data = engine.run(gid, ocr_sample)
    print([t.get_text() for t in data])
class TestEngineRAG(object):

def test_rag(self):
Expand Down
Loading