Spaces:

markqiu
/

prinvest_mate

Sleeping

App Files Files Community

Tuchuanhuhuhu committed on May 18, 2023

Commit

a592279

2 Parent(s): 66e5db6 8043b80

Merge branch 'main' into tooling

Browse files

Files changed (10) hide show

config_example.json +2 -1
modules/config.py +1 -1
modules/index_func.py +5 -3
modules/models/PaLM.py +10 -0
modules/models/base_model.py +18 -13
modules/models/models.py +3 -0
modules/overwrites.py +0 -10
modules/pdf_func.py +7 -7
modules/presets.py +1 -0
requirements.txt +9 -1

config_example.json CHANGED Viewed

@@ -2,9 +2,10 @@
     // 你的OpenAI API Key，一般必填，
     // 若缺省填为 "openai_api_key": "" 则必须再在图形界面中填入API Key
     "openai_api_key": "",
     "usage_limit": 120, // API Key的当月限额，单位：美元
     // 你的xmchat API Key，与OpenAI API Key不同
-    "xmchat_api_key": "",
     "language": "auto",
     // 如果使用代理，请取消注释下面的两行，并替换代理URL
     // "https_proxy": "http://127.0.0.1:1079",

     // 你的OpenAI API Key，一般必填，
     // 若缺省填为 "openai_api_key": "" 则必须再在图形界面中填入API Key
     "openai_api_key": "",
+    "google_palm_api_key": "",
+    "xmchat_api_key": "",
     "usage_limit": 120, // API Key的当月限额，单位：美元
     // 你的xmchat API Key，与OpenAI API Key不同
     "language": "auto",
     // 如果使用代理，请取消注释下面的两行，并替换代理URL
     // "https_proxy": "http://127.0.0.1:1079",

modules/config.py CHANGED Viewed

@@ -103,7 +103,7 @@ api_host = os.environ.get("api_host", config.get("api_host", ""))
 if api_host:
     shared.state.set_api_host(api_host)
-default_chuanhu_assistant_model = config.get("default_chuanhu_assistant_model", "gpt-4")
 os.environ["GOOGLE_CSE_ID"] = config.get("GOOGLE_CSE_ID", "")
 os.environ["GOOGLE_API_KEY"] = config.get("GOOGLE_API_KEY", "")
 os.environ["WOLFRAM_ALPHA_APPID"] = config.get("WOLFRAM_ALPHA_APPID", "")

 if api_host:
     shared.state.set_api_host(api_host)
+default_chuanhu_assistant_model = config.get("default_chuanhu_assistant_model", "gpt-3.5-turbo")
 os.environ["GOOGLE_CSE_ID"] = config.get("GOOGLE_CSE_ID", "")
 os.environ["GOOGLE_API_KEY"] = config.get("GOOGLE_API_KEY", "")
 os.environ["WOLFRAM_ALPHA_APPID"] = config.get("WOLFRAM_ALPHA_APPID", "")

modules/index_func.py CHANGED Viewed

@@ -42,7 +42,7 @@ def get_documents(file_src):
     for file in file_src:
         filepath = file.name
         filename = os.path.basename(filepath)
-        file_type = os.path.splitext(filepath)[1]
         logging.info(f"loading file: {filename}")
         try:
             if file_type == ".pdf":
@@ -87,8 +87,9 @@ def get_documents(file_src):
                 loader = TextLoader(filepath, "utf8")
                 texts = loader.load()
         except Exception as e:
             logging.error(f"Error loading file: {filename}")
-            pass
         texts = text_splitter.split_documents(texts)
         documents.extend(texts)
@@ -142,6 +143,7 @@ def construct_index(
             return index
         except Exception as e:
             logging.error("索引构建失败！", e)
-            print(e)
             return None

     for file in file_src:
         filepath = file.name
         filename = os.path.basename(filepath)
+        file_type = os.path.splitext(filename)[1]
         logging.info(f"loading file: {filename}")
         try:
             if file_type == ".pdf":
                 loader = TextLoader(filepath, "utf8")
                 texts = loader.load()
         except Exception as e:
+            import traceback
             logging.error(f"Error loading file: {filename}")
+            traceback.print_exc()
         texts = text_splitter.split_documents(texts)
         documents.extend(texts)
             return index
         except Exception as e:
+            import traceback
             logging.error("索引构建失败！", e)
+            traceback.print_exc()
             return None

modules/models/PaLM.py ADDED Viewed

@@ -0,0 +1,10 @@

+from .base_model import BaseLLMModel, CallbackToIterator, ChuanhuCallbackHandler
+from langchain.chat_models import ChatGooglePalm
+class PaLM_Client(BaseLLMModel):
+    def __init__(self, model_name, user="") -> None:
+        super().__init__(model_name, user)
+        self.llm = ChatGooglePalm(google_api_key="")
+    def get_answer_at_once(self):
+        self.llm.generate(self.history)

modules/models/base_model.py CHANGED Viewed

@@ -20,6 +20,7 @@ from enum import Enum
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.callbacks.manager import BaseCallbackManager
 from typing import Any, Dict, List, Optional, Union
@@ -108,6 +109,7 @@ class ModelType(Enum):
     MOSS = 5
     YuanAI = 6
     ChuanhuAgent = 7
     @classmethod
     def get_type(cls, model_name: str):
@@ -129,6 +131,8 @@ class ModelType(Enum):
             model_type = ModelType.YuanAI
         elif "川虎助理" in model_name_lower:
             model_type = ModelType.ChuanhuAgent
         else:
             model_type = ModelType.Unknown
         return model_type
@@ -262,19 +266,20 @@ class BaseLLMModel:
             status = i18n("索引构建完成")
             # Summarize the document
             logging.info(i18n("生成内容总结中……"))
-            os.environ["OPENAI_API_KEY"] = self.api_key
-            from langchain.chains.summarize import load_summarize_chain
-            from langchain.prompts import PromptTemplate
-            from langchain.chat_models import ChatOpenAI
-            from langchain.callbacks import StdOutCallbackHandler
-            prompt_template = "Write a concise summary of the following:\n\n{text}\n\nCONCISE SUMMARY IN " + language + ":"
-            PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
-            handler = StdOutCallbackHandler()
-            llm = ChatOpenAI(callbacks=[handler])
-            chain = load_summarize_chain(llm, chain_type="map_reduce", return_intermediate_steps=True, map_prompt=PROMPT, combine_prompt=PROMPT)
-            summary = chain({"input_documents": list(index.docstore.__dict__["_dict"].values())}, return_only_outputs=True)["output_text"]
-            print(i18n("总结") + f": {summary}")
-            chatbot.append([i18n("总结"), summary])
         return gr.Files.update(), chatbot, status
     def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot):

 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.callbacks.manager import BaseCallbackManager
+from langchain.callbacks import get_openai_callback
 from typing import Any, Dict, List, Optional, Union
     MOSS = 5
     YuanAI = 6
     ChuanhuAgent = 7
+    PaLM = 8
     @classmethod
     def get_type(cls, model_name: str):
             model_type = ModelType.YuanAI
         elif "川虎助理" in model_name_lower:
             model_type = ModelType.ChuanhuAgent
+        elif "palm" in model_name_lower:
+            model_type = ModelType.PaLM
         else:
             model_type = ModelType.Unknown
         return model_type
             status = i18n("索引构建完成")
             # Summarize the document
             logging.info(i18n("生成内容总结中……"))
+            with get_openai_callback() as cb:
+                os.environ["OPENAI_API_KEY"] = self.api_key
+                from langchain.chains.summarize import load_summarize_chain
+                from langchain.prompts import PromptTemplate
+                from langchain.chat_models import ChatOpenAI
+                from langchain.callbacks import StdOutCallbackHandler
+                prompt_template = "Write a concise summary of the following:\n\n{text}\n\nCONCISE SUMMARY IN " + language + ":"
+                PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
+                llm = ChatOpenAI()
+                chain = load_summarize_chain(llm, chain_type="map_reduce", return_intermediate_steps=True, map_prompt=PROMPT, combine_prompt=PROMPT)
+                summary = chain({"input_documents": list(index.docstore.__dict__["_dict"].values())}, return_only_outputs=True)["output_text"]
+                print(i18n("总结") + f": {summary}")
+                chatbot.append([i18n("上传了")+str(len(files))+"个文件", summary])
+            logging.info(cb)
         return gr.Files.update(), chatbot, status
     def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot):

modules/models/models.py CHANGED Viewed

@@ -606,6 +606,9 @@ def get_model(
         elif model_type == ModelType.ChuanhuAgent:
             from .ChuanhuAgent import ChuanhuAgent_Client
             model = ChuanhuAgent_Client(model_name, access_key, user_name=user_name)
         elif model_type == ModelType.Unknown:
             raise ValueError(f"未知模型: {model_name}")
         logging.info(msg)

         elif model_type == ModelType.ChuanhuAgent:
             from .ChuanhuAgent import ChuanhuAgent_Client
             model = ChuanhuAgent_Client(model_name, access_key, user_name=user_name)
+        elif model_type == ModelType.PaLM:
+            from .PaLM import PaLM_Client
+            model = PaLM_Client(model_name, user_name=user_name)
         elif model_type == ModelType.Unknown:
             raise ValueError(f"未知模型: {model_name}")
         logging.info(msg)

modules/overwrites.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from __future__ import annotations
 import logging
-from llama_index import Prompt
 from typing import List, Tuple
 import mdtex2html
 from gradio_client import utils as client_utils
@@ -10,15 +9,6 @@ from modules.presets import *
 from modules.index_func import *
 from modules.config import render_latex
-def compact_text_chunks(self, prompt: Prompt, text_chunks: List[str]) -> List[str]:
-    logging.debug("Compacting text chunks...🚀🚀🚀")
-    combined_str = [c.strip() for c in text_chunks if c.strip()]
-    combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)]
-    combined_str = "\n\n".join(combined_str)
-    # resplit based on self.max_chunk_overlap
-    text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
-    return text_splitter.split_text(combined_str)
 def postprocess(
         self,

 from __future__ import annotations
 import logging
 from typing import List, Tuple
 import mdtex2html
 from gradio_client import utils as client_utils
 from modules.index_func import *
 from modules.config import render_latex
 def postprocess(
         self,

modules/pdf_func.py CHANGED Viewed

@@ -1,11 +1,11 @@
 from types import SimpleNamespace
 import pdfplumber
 import logging
-from llama_index import Document
 def prepare_table_config(crop_page):
     """Prepare table查找边界, 要求page为原始page
     From https://github.com/jsvine/pdfplumber/issues/242
     """
     page = crop_page.root_page # root/parent
@@ -60,7 +60,7 @@ def get_title_with_cropped_page(first_page):
             title_bottom = word.bottom
         elif word.text == "Abstract": # 获取页面abstract
             top = word.top
     user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
     # 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
     return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
@@ -75,7 +75,7 @@ def get_column_cropped_pages(pages, two_column=True):
             new_pages.append(right)
         else:
             new_pages.append(page)
     return new_pages
 def parse_pdf(filename, two_column = True):
@@ -94,7 +94,7 @@ def parse_pdf(filename, two_column = True):
             name_top=name_top,
             name_bottom=name_bottom,
             record_chapter_name = True,
             page_start=page_start,
             page_stop=None,
@@ -114,7 +114,7 @@ def parse_pdf(filename, two_column = True):
                 if word.size >= 11: # 出现chapter name
                     if cur_chapter is None:
                         cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
-                    elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
                         # 不再继续写chapter name
                         cur_chapter.page_stop = page.page_number # stop id
                         chapters.append(cur_chapter)
@@ -143,7 +143,7 @@ def parse_pdf(filename, two_column = True):
         text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
     logging.getLogger().setLevel(level)
-    return Document(text=text, extra_info={"title": title})
 BASE_POINTS = """
 1. Who are the authors?

 from types import SimpleNamespace
 import pdfplumber
 import logging
+from langchain.docstore.document import Document
 def prepare_table_config(crop_page):
     """Prepare table查找边界, 要求page为原始page
     From https://github.com/jsvine/pdfplumber/issues/242
     """
     page = crop_page.root_page # root/parent
             title_bottom = word.bottom
         elif word.text == "Abstract": # 获取页面abstract
             top = word.top
     user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
     # 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
     return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
             new_pages.append(right)
         else:
             new_pages.append(page)
     return new_pages
 def parse_pdf(filename, two_column = True):
             name_top=name_top,
             name_bottom=name_bottom,
             record_chapter_name = True,
             page_start=page_start,
             page_stop=None,
                 if word.size >= 11: # 出现chapter name
                     if cur_chapter is None:
                         cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
+                    elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
                         # 不再继续写chapter name
                         cur_chapter.page_stop = page.page_number # stop id
                         chapters.append(cur_chapter)
         text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
     logging.getLogger().setLevel(level)
+    return Document(page_content=text, metadata={"title": title})
 BASE_POINTS = """
 1. Who are the authors?

modules/presets.py CHANGED Viewed

@@ -68,6 +68,7 @@ ONLINE_MODELS = [
     "gpt-4-32k",
     "gpt-4-32k-0314",
     "xmchat",
     "yuanai-1.0-base_10B",
     "yuanai-1.0-translate",
     "yuanai-1.0-dialog",

     "gpt-4-32k",
     "gpt-4-32k-0314",
     "xmchat",
+    "Google PaLM",
     "yuanai-1.0-base_10B",
     "yuanai-1.0-translate",
     "yuanai-1.0-dialog",

requirements.txt CHANGED Viewed

@@ -15,4 +15,12 @@ pdfplumber
 pandas
 commentjson
 openpyxl
-pandocs

 pandas
 commentjson
 openpyxl
+pandoc
+wolframalpha
+faiss-cpu
+google-search-results
+arxiv
+wikipedia
+google.generativeai
+openai
+unstructured