From eeea02af9d11a363ee9cbeac30f05fd3fff9a0a1 Mon Sep 17 00:00:00 2001
From: z-zeechung
Date: Sat, 16 Mar 2024 13:09:44 +0800
Subject: [PATCH] =?UTF-8?q?=E2=91=A0=E6=B7=BB=E5=8A=A0=E4=BA=86OCR?=
 =?UTF-8?q?=E7=BB=84=E4=BB=B6=EF=BC=8C=E9=9C=80=E8=A6=81=E5=85=88=E5=AE=89?=
 =?UTF-8?q?=E8=A3=85=E5=B9=B6=E9=85=8D=E7=BD=AEtesseract=EF=BC=9B=E2=91=A1?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=E5=9B=BE=E5=83=8FPDF=E6=80=BB?=
 =?UTF-8?q?=E7=BB=93=E6=8F=92=E4=BB=B6=EF=BC=9B=E2=91=A2=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E4=BA=86=E8=81=94=E7=BD=91=E7=9A=84ChatGPT-=E7=99=BE=E5=BA=A6?=
 =?UTF-8?q?=E7=89=88?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.py                                     |   3 +
 crazy_functional.py                           |  23 +++
 crazy_functions/ocr_fns/tesseract.py          |  64 +++++++
 ...345\203\217PDF\346\226\207\346\241\243.py" | 172 ++++++++++++++++++
 ...T_\347\231\276\345\272\246\347\211\210.py" |  93 ++++++++++
 5 files changed, 355 insertions(+)
 create mode 100644 crazy_functions/ocr_fns/tesseract.py
 create mode 100644 "crazy_functions/\346\211\271\351\207\217\346\200\273\347\273\223\345\233\276\345\203\217PDF\346\226\207\346\241\243.py"
 create mode 100644 "crazy_functions/\350\201\224\347\275\221\347\232\204ChatGPT_\347\231\276\345\272\246\347\211\210.py"

diff --git a/config.py b/config.py
index 1bdb29955..2ceada3c7 100644
--- a/config.py
+++ b/config.py
@@ -271,6 +271,9 @@ NUM_CUSTOM_BASIC_BTN = 4
 
 
 
+#tesseract路径
+TESSERACT_PATH = "path/to/your/tesseract.exe"
+
 """
 --------------- 配置关联关系说明 ---------------
 
diff --git a/crazy_functional.py b/crazy_functional.py
index 3e998e56f..efc55040f 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -35,6 +35,7 @@ def get_crazy_functions():
     from crazy_functions.批量Markdown翻译 import Markdown中译英
     from crazy_functions.虚空终端 import 虚空终端
     from crazy_functions.生成多种Mermaid图表 import 生成多种Mermaid图表
+    from crazy_functions.批量总结图像PDF文档 import 批量总结图像PDF文档
 
     function_plugins = {
         "虚空终端": {
@@ -224,6 +225,15 @@ def get_crazy_functions():
             "Info": "批量总结PDF文档的内容 | 输入参数为路径",
             "Function": HotReload(批量总结PDF文档),
         },
+        "批量总结PDF文档(图像PDF)": {
+            "Group": "学术",
+            "Color": "stop",
+            "AsButton": False, # 加入下拉菜单中
+            "Info": "批量总结图像PDF文档的内容 | 输入参数为路径",
+            "Function": HotReload(批量总结图像PDF文档),
+            "AdvancedArgs": True,
+            "ArgsReminder": "请输入要识别语言的代码,支持的语言代码详见此网页:https://gitee.com/dalaomai/tessdata_fast。示例:①简体中文:chi_sim;②简体中文和英文:chi_sim+eng;③竖版繁体中文:chi_tra_vert",
+        },
         "谷歌学术检索助手(输入谷歌学术搜索页url)": {
             "Group": "学术",
             "Color": "stop",
@@ -332,6 +342,19 @@ def get_crazy_functions():
                 }
             }
         )
+        from crazy_functions.联网的ChatGPT_百度版 import 连接百度搜索回答问题
+
+        function_plugins.update(
+            {
+                "连接网络回答问题(百度版,输入问题后点击该插件)": {
+                    "Group": "对话",
+                    "Color": "stop",
+                    "AsButton": False, # 加入下拉菜单中
+                    "Info": "连接网络回答问题| 输入参数是一个问题",
+                    "Function": HotReload(连接百度搜索回答问题),
+                },
+            }
+        )
     except:
         print(trimmed_format_exc())
         print("Load function plugin failed")
diff --git a/crazy_functions/ocr_fns/tesseract.py b/crazy_functions/ocr_fns/tesseract.py
new file mode 100644
index 000000000..755a5790d
--- /dev/null
+++ b/crazy_functions/ocr_fns/tesseract.py
@@ -0,0 +1,64 @@
+import subprocess, os, urllib.request
+from toolbox import get_conf
+
+TESSERACT_PATH = get_conf("TESSERACT_PATH")
+
+lang_list = ["afr","amh","ara","asm","aze","aze_cyrl","bel","ben","bod","bos","bre","bul","cat","ceb","ces","chi_sim","chi_sim_vert","chi_tra","chi_tra_vert","chr","cos","cym","dan",
+            "deu","div","dzo","ell","eng","enm","epo","equ","est","eus","fao","fas","fil","fin","fra","frk","frm","fry","gla","gle","glg","grc","guj","hat","heb","hin","hrv","hun",
"hye","iku","ind","isl","ita","ita_old","jav","jpn","jpn_vert","kan","kat","kat_old","kaz","khm","kir","kmr","kor","kor_vert","lao","lat","lav","lit","ltz","mal","mar", + "mkd","mlt","mon","mri","msa","mya","nep","nld","nor","oci","ori","pan","pol","por","pus","que","ron","rus","san","sin","slk","slv","snd","spa","spa_old","sqi","srp", + "srp_latn","sun","swa","swe","syr","tam","tat","tel","tgk","tha","tir","ton","tur","uig","ukr","urd","uzb","uzb_cyrl","vie","yid","yor"] + +def download_lang(lang): + #从码云的某个仓库下载,github太慢。要是哪天链接挂了就换一个 + url = f"https://gitee.com/dalaomai/tessdata_fast/raw/main/{lang}.traineddata" + + path = os.path.dirname(TESSERACT_PATH) + path = os.path.join(path, "tessdata") + path = os.path.join(path, f"{lang}.traineddata") + + response = urllib.request.urlopen(url) + if response.status == 200: + with open(path, 'wb') as file: + file.write(response.read()) + print(f'已将{lang}语言包下载至{path}') + else: + print('未能成功从{url}下载语言包') + +def lang_exists(lang): + path = os.path.dirname(TESSERACT_PATH) + path = os.path.join(path, "tessdata") + path = os.path.join(path, f"{lang}.traineddata") + return os.path.isfile(path) + +def normalize_lang(text): + langs = [] + for l in lang_list: + if l in text: + langs.append(l) + if langs.__len__() == 0: + langs = ["chi_sim", "eng"] + + invalid_langs = [] + for lang in langs: + if lang_exists(lang): + ... + else: + try: + download_lang(lang) + except Exception as e: + print(f"下载语言包失败: {e}") + invalid_langs.append(lang) + for lang in invalid_langs: + langs.remove(lang) + + if langs.__len__() == 0: + langs = ["osd"] + + return "+".join(langs) + +def tesseract_ocr(img_path, output_path, lang): + subprocess.run(f"\"{TESSERACT_PATH}\" \"{img_path}\" \"{output_path}\" -l {lang}") + if os.path.isfile(output_path): + os.remove(output_path) + os.rename(output_path+".txt", output_path) \ No newline at end of file diff --git "a/crazy_functions/\346\211\271\351\207\217\346\200\273\347\273\223\345\233\276\345\203\217PDF\346\226\207\346\241\243.py" "b/crazy_functions/\346\211\271\351\207\217\346\200\273\347\273\223\345\233\276\345\203\217PDF\346\226\207\346\241\243.py" new file mode 100644 index 000000000..14a58096a --- /dev/null +++ "b/crazy_functions/\346\211\271\351\207\217\346\200\273\347\273\223\345\233\276\345\203\217PDF\346\226\207\346\241\243.py" @@ -0,0 +1,172 @@ +from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str +from toolbox import CatchException, report_exception +from toolbox import write_history_to_file, promote_file_to_downloadzone +from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from .crazy_utils import read_and_clean_pdf_text +from .crazy_utils import input_clipping + + + +def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): + file_write_buffer = [] + for file_name in file_manifest: + + + + print('begin ocr on:', file_name) + from crazy_functions.ocr_fns.tesseract import normalize_lang, tesseract_ocr + import fitz, os + lang = normalize_lang(str(plugin_kwargs['advanced_arg'])) + img_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.png") + txt_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.txt") + pdf_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.pdf") + pages = [] + pdf = fitz.open(file_name) + for idx in range(0, pdf.page_count): + page = pdf[idx] + trans = fitz.Matrix(2, 2).prerotate(0) + pm = page.get_pixmap(matrix=trans, alpha=False) + pm.save(img_temp) + tesseract_ocr(img_temp, 
+            with open(txt_temp, "r", encoding="utf-8") as f:
+                pages.append(f.read())
+        pdf.close()
+
+
+
+        print('begin analysis on:', file_name)
+        ############################## <第 0 步,切割PDF> ##################################
+        # 递归地切割PDF文件,每一块(尽量是完整的一个section,比如introduction,experiment等,必要时再进行切割)
+        # 的长度必须小于 2500 个 Token
+        #file_content, page_one = read_and_clean_pdf_text(file_name) # (尝试)按照章节切割PDF
+        #file_content = file_content.encode('utf-8', 'ignore').decode()   # avoid reading non-utf8 chars
+        #page_one = str(page_one).encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
+        page_one = [pages[0]]
+        file_content = "\n".join(pages[1:])
+
+        TOKEN_LIMIT_PER_FRAGMENT = 1000
+
+        from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+        paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+        page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
+        # 为了更好的效果,我们剥离Introduction之后的部分(如果有)
+        paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
+
+        ############################## <第 1 步,从摘要中提取高价值信息,放到history中> ##################################
+        final_results = []
+        final_results.append(paper_meta)
+
+        ############################## <第 2 步,迭代地历遍整个文章,提取精炼信息> ##################################
+        i_say_show_user = '首先你在中文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"  # 用户提示
+        chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])  # 更新UI
+
+        iteration_results = []
+        last_iteration_result = paper_meta  # 初始值是摘要
+        MAX_WORD_TOTAL = 1000
+        n_fragment = len(paper_fragments)
+        if n_fragment >= 20: print('文章极长,不能达到预期效果')
+        for i in range(n_fragment):
+            NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
+            i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"
+            i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}"
+            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say=真正给chatgpt的提问, i_say_show_user=给用户看的提问
+                                                                               llm_kwargs, chatbot,
+                                                                               history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果
+                                                                               sys_prompt="Extract the main idea of this section in Chinese."  # 提示
+                                                                               )
+            iteration_results.append(gpt_say)
+            last_iteration_result = gpt_say
+
+        ############################## <第 3 步,整理history,提取总结> ##################################
+        final_results.extend(iteration_results)
+        final_results.append('Please conclude this paper discussed above.')
+        # This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
+        NUM_OF_WORD = 1000
+        i_say = """
+1. Mark the title of the paper (with Chinese translation)
+2. list all the authors' names (use English)
+3. mark the first author's affiliation (output Chinese translation only)
+4. mark the keywords of this article (use English)
+5. link to the paper, Github code link (if available, fill in Github:None if not)
+6. summarize according to the following four points. Be sure to use Chinese answers (proper nouns need to be marked in English)
+ - (1):What is the research background of this article?
+ - (2):What are the past methods? What are the problems with them? Is the approach well motivated?
+ - (3):What is the research methodology proposed in this paper?
+ - (4):On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
+Follow the format of the output that follows:
+1. Title: xxx\n\n
+2. Authors: xxx\n\n
+3. Affiliation: xxx\n\n
+4. Keywords: xxx\n\n
+5. Urls: xxx or xxx , xxx \n\n
+6. Summary: \n\n
+ - (1):xxx;\n
+ - (2):xxx;\n
+ - (3):xxx;\n
+ - (4):xxx.\n\n
+Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible,
+do not have too much repetitive information, numerical values using the original numbers.
+        """
+        # This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
+        file_write_buffer.extend(final_results)
+        i_say, final_results = input_clipping(i_say, final_results, max_token_limit=2000)
+        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
+            inputs=i_say, inputs_show_user='开始最终总结',
+            llm_kwargs=llm_kwargs, chatbot=chatbot, history=final_results,
+            sys_prompt=f"Extract the main idea of this paper with less than {NUM_OF_WORD} Chinese characters"
+        )
+        final_results.append(gpt_say)
+        file_write_buffer.extend([i_say, gpt_say])
+        ############################## <第 4 步,设置一个token上限> ##################################
+        _, final_results = input_clipping("", final_results, max_token_limit=3200)
+        yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了
+
+    res = write_history_to_file(file_write_buffer)
+    promote_file_to_downloadzone(res, chatbot=chatbot)
+    yield from update_ui(chatbot=chatbot, history=final_results) # 刷新界面
+
+
+@CatchException
+def 批量总结图像PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
+
+    import glob, os
+
+    # 基本信息:功能、贡献者
+    chatbot.append([
+        "函数插件功能?",
+        "批量总结图像PDF文档。函数插件贡献者: ValeriaWong,Eralien,ZeeChung"])
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+    # 尝试导入依赖,如果缺少依赖,则给出安装建议
+    try:
+        import fitz
+    except:
+        report_exception(chatbot, history,
+            a = f"解析项目: {txt}",
+            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # 清空历史,以免输入溢出
+    history = []
+
+    # 检测输入参数,如没有给定输入参数,直接退出
+    if os.path.exists(txt):
+        project_folder = txt
+    else:
+        if txt == "": txt = '空空如也的输入栏'
+        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # 搜索需要处理的文件清单
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]
+
+    # 如果没找到任何文件
+    if len(file_manifest) == 0:
+        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.pdf文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # 开始正式执行任务
+    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
diff --git "a/crazy_functions/\350\201\224\347\275\221\347\232\204ChatGPT_\347\231\276\345\272\246\347\211\210.py" "b/crazy_functions/\350\201\224\347\275\221\347\232\204ChatGPT_\347\231\276\345\272\246\347\211\210.py"
new file mode 100644
index 000000000..94c004f53
--- /dev/null
+++ "b/crazy_functions/\350\201\224\347\275\221\347\232\204ChatGPT_\347\231\276\345\272\246\347\211\210.py"
@@ -0,0 +1,93 @@
+from toolbox import CatchException, update_ui
+from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, input_clipping
+import requests
+from bs4 import BeautifulSoup
+from request_llms.bridge_all import model_info
+import jieba
+
+
+def baidu_search(query):
+    url = f"http://www.baidu.com/s?wd={query}&cl=3&pn=1&ie=utf-8&rn=20&tn=baidurt"
+    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}
+    response = requests.get(url, headers=headers)
+
+    urls = []
+    soup = BeautifulSoup(response.text, 'html.parser')
+    for paragraph in soup.find_all('a'):
+        if "href" in paragraph.attrs and "onmousedown" in paragraph.attrs and "\'fm\':\'baidurt\'" in paragraph["onmousedown"] and "http" in paragraph["href"] and "tab" not in paragraph["onmousedown"]:
+            urls.append(paragraph["href"])
+    return urls
+
+
+def scrape_text(key_words, url) -> str:
+    try:
+        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}
+        response = requests.get(url, headers=headers)
+        if response.encoding == "ISO-8859-1": response.encoding = response.apparent_encoding
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        text = soup.body.text
+        text = text.split("\n")
+
+        valid = []
+        for t in text:
+            for kw in key_words:
+                if kw in t:
+                    valid.append(t)
+                    break
+        valid = "\n".join(valid)
+        return valid
+    except:
+        return ""
+
+@CatchException
+def 连接百度搜索回答问题(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
+    """
+    txt             输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
+    llm_kwargs      gpt模型参数,如温度和top_p等,一般原样传递下去就行
+    plugin_kwargs   插件模型的参数,暂时没有用武之地
+    chatbot         聊天显示框的句柄,用于显示给用户
+    history         聊天历史,前情提要
+    system_prompt   给gpt的静默提醒
+    user_request    当前用户的请求信息(IP地址等)
+    """
+    history = []    # 清空历史,以免输入溢出
+    chatbot.append((f"请结合互联网信息回答以下问题:{txt}",
+                    "[Local Message] 请注意,您正在调用一个[函数插件]的模板,该模板可以实现ChatGPT联网信息综合。该函数面向希望实现更多有趣功能的开发者,它可以作为创建新功能函数的模板。您若希望分享新的功能模组,请不吝PR!"))
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+
+    # ------------- < 第1步:爬取搜索引擎的结果 > -------------
+    urls = baidu_search(txt)
+    history = []
+    if len(urls) == 0:
+        chatbot.append((f"结论:{txt}",
+                        "[Local Message] 受到百度限制,无法从百度获取信息!"))
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+        return
+    # ------------- < 第2步:依次访问网页 > -------------
+    max_search_result = 8  # 最多收纳多少个网页的结果
+    kw = jieba.lcut_for_search(txt)
+    for index, url in enumerate(urls[:max_search_result]):
+        res = scrape_text(kw, url)
+        history.extend([f"第{index+1}份搜索结果:", res])
+        #chatbot.append([f"第{index+1}份搜索结果:", res[:500]+"......"])
+        chatbot[-1] = [f"第{index+1}份搜索结果:", res[:500]+"......"]
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+
+    # ------------- < 第3步:ChatGPT综合 > -------------
+    i_say = f"从以上搜索结果中抽取信息,然后回答问题:{txt}"
+    i_say, history = input_clipping(    # 裁剪输入,从最长的条目开始裁剪,防止爆token
+        inputs=i_say,
+        history=history,
+        max_token_limit=model_info[llm_kwargs['llm_model']]['max_token']*3//4
+    )
+
+    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
+        inputs=i_say, inputs_show_user=i_say,
+        llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
+        sys_prompt="请从给定的若干条搜索结果中抽取信息,对最相关的两个搜索结果进行总结,然后回答问题。"
+    )
+    chatbot[-1] = (i_say, gpt_say)
+    history.append(i_say); history.append(gpt_say)
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
+
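Usage sketch: a minimal way to smoke-test the new OCR helper once TESSERACT_PATH is filled in config.py and the snippet is run from the repository root. The image and output paths and the language string below are illustrative assumptions, not values taken from this patch; the first call may download the required traineddata packs from the gitee mirror hard-coded in download_lang().

    from crazy_functions.ocr_fns.tesseract import normalize_lang, tesseract_ocr

    # same format as the plugin's AdvancedArgs field, e.g. "chi_sim+eng"
    lang = normalize_lang("chi_sim+eng")
    # any page of a scanned PDF exported as a PNG will do here (hypothetical path)
    tesseract_ocr("page.png", "page.txt", lang)
    print(open("page.txt", encoding="utf-8").read())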