From eeea02af9d11a363ee9cbeac30f05fd3fff9a0a1 Mon Sep 17 00:00:00 2001
From: z-zeechung
Date: Sat, 16 Mar 2024 13:09:44 +0800
Subject: [PATCH] =?UTF-8?q?=E2=91=A0=E6=B7=BB=E5=8A=A0=E4=BA=86OCR?=
 =?UTF-8?q?=E7=BB=84=E4=BB=B6=EF=BC=8C=E9=9C=80=E8=A6=81=E5=85=88=E5=AE=89?=
 =?UTF-8?q?=E8=A3=85=E5=B9=B6=E9=85=8D=E7=BD=AEtesseract=EF=BC=9B=E2=91=A1?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86=E5=9B=BE=E5=83=8FPDF=E6=80=BB?=
 =?UTF-8?q?=E7=BB=93=E6=8F=92=E4=BB=B6=EF=BC=9B=E2=91=A2=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E4=BA=86=E8=81=94=E7=BD=91=E7=9A=84ChatGPT-=E7=99=BE=E5=BA=A6?=
 =?UTF-8?q?=E7=89=88?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.py                                     |   3 +
 crazy_functional.py                           |  23 +++
 crazy_functions/ocr_fns/tesseract.py          |  64 +++++++
 ...345\203\217PDF\346\226\207\346\241\243.py" | 172 ++++++++++++++++++
 ...T_\347\231\276\345\272\246\347\211\210.py" |  93 ++++++++++
 5 files changed, 355 insertions(+)
 create mode 100644 crazy_functions/ocr_fns/tesseract.py
 create mode 100644 "crazy_functions/\346\211\271\351\207\217\346\200\273\347\273\223\345\233\276\345\203\217PDF\346\226\207\346\241\243.py"
 create mode 100644 "crazy_functions/\350\201\224\347\275\221\347\232\204ChatGPT_\347\231\276\345\272\246\347\211\210.py"

diff --git a/config.py b/config.py
index 1bdb29955..2ceada3c7 100644
--- a/config.py
+++ b/config.py
@@ -271,6 +271,9 @@ NUM_CUSTOM_BASIC_BTN = 4
 
 
 
+#tesseract路径
+TESSERACT_PATH = "path/to/your/tesseract.exe"
+
 """
 --------------- 配置关联关系说明 ---------------
 
diff --git a/crazy_functional.py b/crazy_functional.py
index 3e998e56f..efc55040f 100644
--- a/crazy_functional.py
+++ b/crazy_functional.py
@@ -35,6 +35,7 @@ def get_crazy_functions():
     from crazy_functions.批量Markdown翻译 import Markdown中译英
     from crazy_functions.虚空终端 import 虚空终端
     from crazy_functions.生成多种Mermaid图表 import 生成多种Mermaid图表
+    from crazy_functions.批量总结图像PDF文档 import 批量总结图像PDF文档
 
     function_plugins = {
         "虚空终端": {
@@ -224,6 +225,15 @@ def get_crazy_functions():
             "Info": "批量总结PDF文档的内容 | 输入参数为路径",
             "Function": HotReload(批量总结PDF文档),
         },
+        "批量总结PDF文档(图像PDF)": {
+            "Group": "学术",
+            "Color": "stop",
+            "AsButton": False, # 加入下拉菜单中
+            "Info": "批量总结图像PDF文档的内容 | 输入参数为路径",
+            "Function": HotReload(批量总结图像PDF文档),
+            "AdvancedArgs": True,
+            "ArgsReminder": "请输入要识别语言的代码,支持的语言代码详见此网页:https://gitee.com/dalaomai/tessdata_fast。示例:①简体中文:chi_sim;②简体中文和英文:chi_sim+eng;③竖版繁体中文:chi_tra_vert",
+        },
         "谷歌学术检索助手(输入谷歌学术搜索页url)": {
             "Group": "学术",
             "Color": "stop",
@@ -332,6 +342,19 @@ def get_crazy_functions():
                 }
             }
         )
+        from crazy_functions.联网的ChatGPT_百度版 import 连接百度搜索回答问题
+
+        function_plugins.update(
+            {
+                "连接网络回答问题(百度版,输入问题后点击该插件)": {
+                    "Group": "对话",
+                    "Color": "stop",
+                    "AsButton": False, # 加入下拉菜单中
+                    "Info": "连接网络回答问题| 输入参数是一个问题",
+                    "Function": HotReload(连接百度搜索回答问题),
+                },
+            }
+        )
     except:
         print(trimmed_format_exc())
         print("Load function plugin failed")
diff --git a/crazy_functions/ocr_fns/tesseract.py b/crazy_functions/ocr_fns/tesseract.py
new file mode 100644
index 000000000..755a5790d
--- /dev/null
+++ b/crazy_functions/ocr_fns/tesseract.py
@@ -0,0 +1,64 @@
+import subprocess, os, urllib.request
+from toolbox import get_conf
+
+TESSERACT_PATH = get_conf("TESSERACT_PATH")
+
+lang_list = ["afr","amh","ara","asm","aze","aze_cyrl","bel","ben","bod","bos","bre","bul","cat","ceb","ces","chi_sim","chi_sim_vert","chi_tra","chi_tra_vert","chr","cos","cym","dan",
+            "deu","div","dzo","ell","eng","enm","epo","equ","est","eus","fao","fas","fil","fin","fra","frk","frm","fry","gla","gle","glg","grc","guj","hat","heb","hin","hrv","hun",
"hye","iku","ind","isl","ita","ita_old","jav","jpn","jpn_vert","kan","kat","kat_old","kaz","khm","kir","kmr","kor","kor_vert","lao","lat","lav","lit","ltz","mal","mar", + "mkd","mlt","mon","mri","msa","mya","nep","nld","nor","oci","ori","pan","pol","por","pus","que","ron","rus","san","sin","slk","slv","snd","spa","spa_old","sqi","srp", + "srp_latn","sun","swa","swe","syr","tam","tat","tel","tgk","tha","tir","ton","tur","uig","ukr","urd","uzb","uzb_cyrl","vie","yid","yor"] + +def download_lang(lang): + #从码云的某个仓库下载,github太慢。要是哪天链接挂了就换一个 + url = f"https://gitee.com/dalaomai/tessdata_fast/raw/main/{lang}.traineddata" + + path = os.path.dirname(TESSERACT_PATH) + path = os.path.join(path, "tessdata") + path = os.path.join(path, f"{lang}.traineddata") + + response = urllib.request.urlopen(url) + if response.status == 200: + with open(path, 'wb') as file: + file.write(response.read()) + print(f'已将{lang}语言包下载至{path}') + else: + print('未能成功从{url}下载语言包') + +def lang_exists(lang): + path = os.path.dirname(TESSERACT_PATH) + path = os.path.join(path, "tessdata") + path = os.path.join(path, f"{lang}.traineddata") + return os.path.isfile(path) + +def normalize_lang(text): + langs = [] + for l in lang_list: + if l in text: + langs.append(l) + if langs.__len__() == 0: + langs = ["chi_sim", "eng"] + + invalid_langs = [] + for lang in langs: + if lang_exists(lang): + ... + else: + try: + download_lang(lang) + except Exception as e: + print(f"下载语言包失败: {e}") + invalid_langs.append(lang) + for lang in invalid_langs: + langs.remove(lang) + + if langs.__len__() == 0: + langs = ["osd"] + + return "+".join(langs) + +def tesseract_ocr(img_path, output_path, lang): + subprocess.run(f"\"{TESSERACT_PATH}\" \"{img_path}\" \"{output_path}\" -l {lang}") + if os.path.isfile(output_path): + os.remove(output_path) + os.rename(output_path+".txt", output_path) \ No newline at end of file diff --git "a/crazy_functions/\346\211\271\351\207\217\346\200\273\347\273\223\345\233\276\345\203\217PDF\346\226\207\346\241\243.py" "b/crazy_functions/\346\211\271\351\207\217\346\200\273\347\273\223\345\233\276\345\203\217PDF\346\226\207\346\241\243.py" new file mode 100644 index 000000000..14a58096a --- /dev/null +++ "b/crazy_functions/\346\211\271\351\207\217\346\200\273\347\273\223\345\233\276\345\203\217PDF\346\226\207\346\241\243.py" @@ -0,0 +1,172 @@ +from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str +from toolbox import CatchException, report_exception +from toolbox import write_history_to_file, promote_file_to_downloadzone +from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive +from .crazy_utils import read_and_clean_pdf_text +from .crazy_utils import input_clipping + + + +def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt): + file_write_buffer = [] + for file_name in file_manifest: + + + + print('begin ocr on:', file_name) + from crazy_functions.ocr_fns.tesseract import normalize_lang, tesseract_ocr + import fitz, os + lang = normalize_lang(str(plugin_kwargs['advanced_arg'])) + img_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.png") + txt_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.txt") + pdf_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.pdf") + pages = [] + pdf = fitz.open(file_name) + for idx in range(0, pdf.page_count): + page = pdf[idx] + trans = fitz.Matrix(2, 2).prerotate(0) + pm = page.get_pixmap(matrix=trans, alpha=False) + pm.save(img_temp) + tesseract_ocr(img_temp, 
+            with open(txt_temp, "r", encoding="utf-8") as f:
+                pages.append(f.read())
+        pdf.close()
+
+
+
+        print('begin analysis on:', file_name)
+        ############################## <第 0 步,切割PDF> ##################################
+        # 递归地切割PDF文件,每一块(尽量是完整的一个section,比如introduction,experiment等,必要时再进行切割)
+        # 的长度必须小于 2500 个 Token
+        #file_content, page_one = read_and_clean_pdf_text(file_name) # (尝试)按照章节切割PDF
+        #file_content = file_content.encode('utf-8', 'ignore').decode()   # avoid reading non-utf8 chars
+        #page_one = str(page_one).encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
+        page_one = [pages[0]]
+        file_content = "\n".join(pages[1:])
+
+        TOKEN_LIMIT_PER_FRAGMENT = 1000
+
+        from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
+        paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
+        page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
+        # 为了更好的效果,我们剥离Introduction之后的部分(如果有)
+        paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
+
+        ############################## <第 1 步,从摘要中提取高价值信息,放到history中> ##################################
+        final_results = []
+        final_results.append(paper_meta)
+
+        ############################## <第 2 步,迭代地历遍整个文章,提取精炼信息> ##################################
+        i_say_show_user = '首先你在中文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"  # 用户提示
+        chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])  # 更新UI
+
+        iteration_results = []
+        last_iteration_result = paper_meta  # 初始值是摘要
+        MAX_WORD_TOTAL = 1000
+        n_fragment = len(paper_fragments)
+        if n_fragment >= 20: print('文章极长,不能达到预期效果')
+        for i in range(n_fragment):
+            NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
+            i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"
+            i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}"
+            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say=真正给chatgpt的提问, i_say_show_user=给用户看的提问
+                                                                               llm_kwargs, chatbot,
+                                                                               history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果
+                                                                               sys_prompt="Extract the main idea of this section in Chinese."  # 提示
+                                                                               )
+            iteration_results.append(gpt_say)
+            last_iteration_result = gpt_say
+
+        ############################## <第 3 步,整理history,提取总结> ##################################
+        final_results.extend(iteration_results)
+        final_results.append('Please conclude this paper discussed above.')
+        # This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
+        NUM_OF_WORD = 1000
+        i_say = """
+1. Mark the title of the paper (with Chinese translation)
+2. list all the authors' names (use English)
+3. mark the first author's affiliation (output Chinese translation only)
+4. mark the keywords of this article (use English)
+5. link to the paper, Github code link (if available, fill in Github:None if not)
+6. summarize according to the following four points. Be sure to use Chinese answers (proper nouns need to be marked in English)
+ - (1):What is the research background of this article?
+ - (2):What are the past methods? What are the problems with them? Is the approach well motivated?
+ - (3):What is the research methodology proposed in this paper?
+ - (4):On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
+Follow the format of the output that follows:
+1. Title: xxx\n\n
+2. Authors: xxx\n\n
+3. Affiliation: xxx\n\n
+4. Keywords: xxx\n\n
+5. Urls: xxx or xxx , xxx \n\n
+6. Summary: \n\n
+ - (1):xxx;\n
+ - (2):xxx;\n
+ - (3):xxx;\n
+ - (4):xxx.\n\n
+Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible,
+do not have too much repetitive information, numerical values using the original numbers.
+        """
+        # This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
+        file_write_buffer.extend(final_results)
+        i_say, final_results = input_clipping(i_say, final_results, max_token_limit=2000)
+        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
+            inputs=i_say, inputs_show_user='开始最终总结',
+            llm_kwargs=llm_kwargs, chatbot=chatbot, history=final_results,
+            sys_prompt=f"Extract the main idea of this paper with less than {NUM_OF_WORD} Chinese characters"
+        )
+        final_results.append(gpt_say)
+        file_write_buffer.extend([i_say, gpt_say])
+        ############################## <第 4 步,设置一个token上限> ##################################
+        _, final_results = input_clipping("", final_results, max_token_limit=3200)
+        yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了
+
+    res = write_history_to_file(file_write_buffer)
+    promote_file_to_downloadzone(res, chatbot=chatbot)
+    yield from update_ui(chatbot=chatbot, history=final_results) # 刷新界面
+
+
+@CatchException
+def 批量总结图像PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
+
+    import glob, os
+
+    # 基本信息:功能、贡献者
+    chatbot.append([
+        "函数插件功能?",
+        "批量总结图像PDF文档。函数插件贡献者: ValeriaWong,Eralien,ZeeChung"])
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+
+    # 尝试导入依赖,如果缺少依赖,则给出安装建议
+    try:
+        import fitz
+    except:
+        report_exception(chatbot, history,
+            a = f"解析项目: {txt}",
+            b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # 清空历史,以免输入溢出
+    history = []
+
+    # 检测输入参数,如没有给定输入参数,直接退出
+    if os.path.exists(txt):
+        project_folder = txt
+    else:
+        if txt == "": txt = '空空如也的输入栏'
+        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # 搜索需要处理的文件清单
+    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]
+
+    # 如果没找到任何文件
+    if len(file_manifest) == 0:
+        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.pdf文件: {txt}")
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
+        return
+
+    # 开始正式执行任务
+    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
diff --git "a/crazy_functions/\350\201\224\347\275\221\347\232\204ChatGPT_\347\231\276\345\272\246\347\211\210.py" "b/crazy_functions/\350\201\224\347\275\221\347\232\204ChatGPT_\347\231\276\345\272\246\347\211\210.py"
new file mode 100644
index 000000000..94c004f53
--- /dev/null
+++ "b/crazy_functions/\350\201\224\347\275\221\347\232\204ChatGPT_\347\231\276\345\272\246\347\211\210.py"
@@ -0,0 +1,93 @@
+from toolbox import CatchException, update_ui
+from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, input_clipping
+import requests
+from bs4 import BeautifulSoup
+from request_llms.bridge_all import model_info
+import jieba
+
+
+def baidu_search(query):
+    url = f"http://www.baidu.com/s?wd={query}&cl=3&pn=1&ie=utf-8&rn=20&tn=baidurt"
+    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}
+    response = requests.get(url, headers=headers)
+
+    urls = []
+    soup = BeautifulSoup(response.text, 'html.parser')
+    for paragraph in soup.find_all('a'):
+        if "href" in paragraph.attrs and "onmousedown" in paragraph.attrs and "\'fm\':\'baidurt\'" in paragraph["onmousedown"] and "http" in paragraph["href"] and "tab" not in paragraph["onmousedown"]:
+            urls.append(paragraph["href"])
+    return urls
+
+
+def scrape_text(key_words, url) -> str:
+    try:
+        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}
+        response = requests.get(url, headers=headers)
+        if response.encoding == "ISO-8859-1": response.encoding = response.apparent_encoding
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+        text = soup.body.text
+        text = text.split("\n")
+
+        valid = []
+        for t in text:
+            for kw in key_words:
+                if kw in t:
+                    valid.append(t)
+                    break
+        valid = "\n".join(valid)
+        return valid
+    except:
+        return ""
+
+@CatchException
+def 连接百度搜索回答问题(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
+    """
+    txt             输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
+    llm_kwargs      gpt模型参数,如温度和top_p等,一般原样传递下去就行
+    plugin_kwargs   插件模型的参数,暂时没有用武之地
+    chatbot         聊天显示框的句柄,用于显示给用户
+    history         聊天历史,前情提要
+    system_prompt   给gpt的静默提醒
+    user_request    当前用户的请求信息(IP地址等)
+    """
+    history = []    # 清空历史,以免输入溢出
+    chatbot.append((f"请结合互联网信息回答以下问题:{txt}",
+                    "[Local Message] 请注意,您正在调用一个[函数插件]的模板,该模板可以实现ChatGPT联网信息综合。该函数面向希望实现更多有趣功能的开发者,它可以作为创建新功能函数的模板。您若希望分享新的功能模组,请不吝PR!"))
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+
+    # ------------- < 第1步:爬取搜索引擎的结果 > -------------
+    urls = baidu_search(txt)
+    history = []
+    if len(urls) == 0:
+        chatbot.append((f"结论:{txt}",
+                        "[Local Message] 受到百度限制,无法从百度获取信息!"))
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+        return
+    # ------------- < 第2步:依次访问网页 > -------------
+    max_search_result = 8  # 最多收纳多少个网页的结果
+    kw = jieba.lcut_for_search(txt)
+    for index, url in enumerate(urls[:max_search_result]):
+        res = scrape_text(kw, url)
+        history.extend([f"第{index+1}份搜索结果:", res])
+        #chatbot.append([f"第{index+1}份搜索结果:", res[:500]+"......"])
+        chatbot[-1] = [f"第{index+1}份搜索结果:", res[:500]+"......"]
+        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
+
+    # ------------- < 第3步:ChatGPT综合 > -------------
+    i_say = f"从以上搜索结果中抽取信息,然后回答问题:{txt}"
+    i_say, history = input_clipping(    # 裁剪输入,从最长的条目开始裁剪,防止爆token
+        inputs=i_say,
+        history=history,
+        max_token_limit=model_info[llm_kwargs['llm_model']]['max_token']*3//4
+    )
+
+    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
+        inputs=i_say, inputs_show_user=i_say,
+        llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
+        sys_prompt="请从给定的若干条搜索结果中抽取信息,对最相关的两个搜索结果进行总结,然后回答问题。"
+    )
+    chatbot[-1] = (i_say, gpt_say)
+    history.append(i_say); history.append(gpt_say)
+    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
+
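Usage sketch: a minimal way to smoke-test the new OCR helper once TESSERACT_PATH is filled in config.py and the snippet is run from the repository root. The image and output paths and the language string below are illustrative assumptions, not values taken from this patch; the first call may download the required traineddata packs from the gitee mirror hard-coded in download_lang().

    from crazy_functions.ocr_fns.tesseract import normalize_lang, tesseract_ocr

    # same format as the plugin's AdvancedArgs field, e.g. "chi_sim+eng"
    lang = normalize_lang("chi_sim+eng")
    # any page of a scanned PDF exported as a PNG will do here (hypothetical path)
    tesseract_ocr("page.png", "page.txt", lang)
    print(open("page.txt", encoding="utf-8").read())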