Spaces: Sleeping
Merge branch 'main' into tooling
Browse files
- config_example.json +2 -1
- modules/config.py +1 -1
- modules/index_func.py +5 -3
- modules/models/PaLM.py +10 -0
- modules/models/base_model.py +18 -13
- modules/models/models.py +3 -0
- modules/overwrites.py +0 -10
- modules/pdf_func.py +7 -7
- modules/presets.py +1 -0
- requirements.txt +9 -1
config_example.json
CHANGED
@@ -2,9 +2,10 @@
|
|
2 |
// 你的OpenAI API Key,一般必填,
|
3 |
// 若缺省填为 "openai_api_key": "" 则必须再在图形界面中填入API Key
|
4 |
"openai_api_key": "",
|
|
|
|
|
5 |
"usage_limit": 120, // API Key的当月限额,单位:美元
|
6 |
// 你的xmchat API Key,与OpenAI API Key不同
|
7 |
-
"xmchat_api_key": "",
|
8 |
"language": "auto",
|
9 |
// 如果使用代理,请取消注释下面的两行,并替换代理URL
|
10 |
// "https_proxy": "http://127.0.0.1:1079",
|
|
|
2 |
// 你的OpenAI API Key,一般必填,
|
3 |
// 若缺省填为 "openai_api_key": "" 则必须再在图形界面中填入API Key
|
4 |
"openai_api_key": "",
|
5 |
+
"google_palm_api_key": "",
|
6 |
+
"xmchat_api_key": "",
|
7 |
"usage_limit": 120, // API Key的当月限额,单位:美元
|
8 |
// 你的xmchat API Key,与OpenAI API Key不同
|
|
|
9 |
"language": "auto",
|
10 |
// 如果使用代理,请取消注释下面的两行,并替换代理URL
|
11 |
// "https_proxy": "http://127.0.0.1:1079",
|
modules/config.py
CHANGED
@@ -103,7 +103,7 @@ api_host = os.environ.get("api_host", config.get("api_host", ""))
|
|
103 |
if api_host:
|
104 |
shared.state.set_api_host(api_host)
|
105 |
|
106 |
-
default_chuanhu_assistant_model = config.get("default_chuanhu_assistant_model", "gpt-
|
107 |
os.environ["GOOGLE_CSE_ID"] = config.get("GOOGLE_CSE_ID", "")
|
108 |
os.environ["GOOGLE_API_KEY"] = config.get("GOOGLE_API_KEY", "")
|
109 |
os.environ["WOLFRAM_ALPHA_APPID"] = config.get("WOLFRAM_ALPHA_APPID", "")
|
|
|
103 |
if api_host:
|
104 |
shared.state.set_api_host(api_host)
|
105 |
|
106 |
+
default_chuanhu_assistant_model = config.get("default_chuanhu_assistant_model", "gpt-3.5-turbo")
|
107 |
os.environ["GOOGLE_CSE_ID"] = config.get("GOOGLE_CSE_ID", "")
|
108 |
os.environ["GOOGLE_API_KEY"] = config.get("GOOGLE_API_KEY", "")
|
109 |
os.environ["WOLFRAM_ALPHA_APPID"] = config.get("WOLFRAM_ALPHA_APPID", "")
|
modules/index_func.py
CHANGED
@@ -42,7 +42,7 @@ def get_documents(file_src):
|
|
42 |
for file in file_src:
|
43 |
filepath = file.name
|
44 |
filename = os.path.basename(filepath)
|
45 |
-
file_type = os.path.splitext(
|
46 |
logging.info(f"loading file: {filename}")
|
47 |
try:
|
48 |
if file_type == ".pdf":
|
@@ -87,8 +87,9 @@ def get_documents(file_src):
|
|
87 |
loader = TextLoader(filepath, "utf8")
|
88 |
texts = loader.load()
|
89 |
except Exception as e:
|
|
|
90 |
logging.error(f"Error loading file: {filename}")
|
91 |
-
|
92 |
|
93 |
texts = text_splitter.split_documents(texts)
|
94 |
documents.extend(texts)
|
@@ -142,6 +143,7 @@ def construct_index(
|
|
142 |
return index
|
143 |
|
144 |
except Exception as e:
|
|
|
145 |
logging.error("索引构建失败!", e)
|
146 |
-
|
147 |
return None
|
|
|
42 |
for file in file_src:
|
43 |
filepath = file.name
|
44 |
filename = os.path.basename(filepath)
|
45 |
+
file_type = os.path.splitext(filename)[1]
|
46 |
logging.info(f"loading file: {filename}")
|
47 |
try:
|
48 |
if file_type == ".pdf":
|
|
|
87 |
loader = TextLoader(filepath, "utf8")
|
88 |
texts = loader.load()
|
89 |
except Exception as e:
|
90 |
+
import traceback
|
91 |
logging.error(f"Error loading file: {filename}")
|
92 |
+
traceback.print_exc()
|
93 |
|
94 |
texts = text_splitter.split_documents(texts)
|
95 |
documents.extend(texts)
|
|
|
143 |
return index
|
144 |
|
145 |
except Exception as e:
|
146 |
+
import traceback
|
147 |
logging.error("索引构建失败!", e)
|
148 |
+
traceback.print_exc()
|
149 |
return None
|
modules/models/PaLM.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base_model import BaseLLMModel, CallbackToIterator, ChuanhuCallbackHandler
|
2 |
+
from langchain.chat_models import ChatGooglePalm
|
3 |
+
|
4 |
+
class PaLM_Client(BaseLLMModel):
|
5 |
+
def __init__(self, model_name, user="") -> None:
|
6 |
+
super().__init__(model_name, user)
|
7 |
+
self.llm = ChatGooglePalm(google_api_key="")
|
8 |
+
|
9 |
+
def get_answer_at_once(self):
|
10 |
+
self.llm.generate(self.history)
|
modules/models/base_model.py
CHANGED
@@ -20,6 +20,7 @@ from enum import Enum
|
|
20 |
|
21 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
22 |
from langchain.callbacks.manager import BaseCallbackManager
|
|
|
23 |
|
24 |
from typing import Any, Dict, List, Optional, Union
|
25 |
|
@@ -108,6 +109,7 @@ class ModelType(Enum):
|
|
108 |
MOSS = 5
|
109 |
YuanAI = 6
|
110 |
ChuanhuAgent = 7
|
|
|
111 |
|
112 |
@classmethod
|
113 |
def get_type(cls, model_name: str):
|
@@ -129,6 +131,8 @@ class ModelType(Enum):
|
|
129 |
model_type = ModelType.YuanAI
|
130 |
elif "川虎助理" in model_name_lower:
|
131 |
model_type = ModelType.ChuanhuAgent
|
|
|
|
|
132 |
else:
|
133 |
model_type = ModelType.Unknown
|
134 |
return model_type
|
@@ -262,19 +266,20 @@ class BaseLLMModel:
|
|
262 |
status = i18n("索引构建完成")
|
263 |
# Summarize the document
|
264 |
logging.info(i18n("生成内容总结中……"))
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
|
|
278 |
return gr.Files.update(), chatbot, status
|
279 |
|
280 |
def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot):
|
|
|
20 |
|
21 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
22 |
from langchain.callbacks.manager import BaseCallbackManager
|
23 |
+
from langchain.callbacks import get_openai_callback
|
24 |
|
25 |
from typing import Any, Dict, List, Optional, Union
|
26 |
|
|
|
109 |
MOSS = 5
|
110 |
YuanAI = 6
|
111 |
ChuanhuAgent = 7
|
112 |
+
PaLM = 8
|
113 |
|
114 |
@classmethod
|
115 |
def get_type(cls, model_name: str):
|
|
|
131 |
model_type = ModelType.YuanAI
|
132 |
elif "川虎助理" in model_name_lower:
|
133 |
model_type = ModelType.ChuanhuAgent
|
134 |
+
elif "palm" in model_name_lower:
|
135 |
+
model_type = ModelType.PaLM
|
136 |
else:
|
137 |
model_type = ModelType.Unknown
|
138 |
return model_type
|
|
|
266 |
status = i18n("索引构建完成")
|
267 |
# Summarize the document
|
268 |
logging.info(i18n("生成内容总结中……"))
|
269 |
+
with get_openai_callback() as cb:
|
270 |
+
os.environ["OPENAI_API_KEY"] = self.api_key
|
271 |
+
from langchain.chains.summarize import load_summarize_chain
|
272 |
+
from langchain.prompts import PromptTemplate
|
273 |
+
from langchain.chat_models import ChatOpenAI
|
274 |
+
from langchain.callbacks import StdOutCallbackHandler
|
275 |
+
prompt_template = "Write a concise summary of the following:\n\n{text}\n\nCONCISE SUMMARY IN " + language + ":"
|
276 |
+
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
|
277 |
+
llm = ChatOpenAI()
|
278 |
+
chain = load_summarize_chain(llm, chain_type="map_reduce", return_intermediate_steps=True, map_prompt=PROMPT, combine_prompt=PROMPT)
|
279 |
+
summary = chain({"input_documents": list(index.docstore.__dict__["_dict"].values())}, return_only_outputs=True)["output_text"]
|
280 |
+
print(i18n("总结") + f": {summary}")
|
281 |
+
chatbot.append([i18n("上传了")+str(len(files))+"个文件", summary])
|
282 |
+
logging.info(cb)
|
283 |
return gr.Files.update(), chatbot, status
|
284 |
|
285 |
def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot):
|
modules/models/models.py
CHANGED
@@ -606,6 +606,9 @@ def get_model(
|
|
606 |
elif model_type == ModelType.ChuanhuAgent:
|
607 |
from .ChuanhuAgent import ChuanhuAgent_Client
|
608 |
model = ChuanhuAgent_Client(model_name, access_key, user_name=user_name)
|
|
|
|
|
|
|
609 |
elif model_type == ModelType.Unknown:
|
610 |
raise ValueError(f"未知模型: {model_name}")
|
611 |
logging.info(msg)
|
|
|
606 |
elif model_type == ModelType.ChuanhuAgent:
|
607 |
from .ChuanhuAgent import ChuanhuAgent_Client
|
608 |
model = ChuanhuAgent_Client(model_name, access_key, user_name=user_name)
|
609 |
+
elif model_type == ModelType.PaLM:
|
610 |
+
from .PaLM import PaLM_Client
|
611 |
+
model = PaLM_Client(model_name, user_name=user_name)
|
612 |
elif model_type == ModelType.Unknown:
|
613 |
raise ValueError(f"未知模型: {model_name}")
|
614 |
logging.info(msg)
|
modules/overwrites.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
from __future__ import annotations
|
2 |
import logging
|
3 |
|
4 |
-
from llama_index import Prompt
|
5 |
from typing import List, Tuple
|
6 |
import mdtex2html
|
7 |
from gradio_client import utils as client_utils
|
@@ -10,15 +9,6 @@ from modules.presets import *
|
|
10 |
from modules.index_func import *
|
11 |
from modules.config import render_latex
|
12 |
|
13 |
-
def compact_text_chunks(self, prompt: Prompt, text_chunks: List[str]) -> List[str]:
|
14 |
-
logging.debug("Compacting text chunks...🚀🚀🚀")
|
15 |
-
combined_str = [c.strip() for c in text_chunks if c.strip()]
|
16 |
-
combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)]
|
17 |
-
combined_str = "\n\n".join(combined_str)
|
18 |
-
# resplit based on self.max_chunk_overlap
|
19 |
-
text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
|
20 |
-
return text_splitter.split_text(combined_str)
|
21 |
-
|
22 |
|
23 |
def postprocess(
|
24 |
self,
|
|
|
1 |
from __future__ import annotations
|
2 |
import logging
|
3 |
|
|
|
4 |
from typing import List, Tuple
|
5 |
import mdtex2html
|
6 |
from gradio_client import utils as client_utils
|
|
|
9 |
from modules.index_func import *
|
10 |
from modules.config import render_latex
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
def postprocess(
|
14 |
self,
|
modules/pdf_func.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
from types import SimpleNamespace
|
2 |
import pdfplumber
|
3 |
import logging
|
4 |
-
from
|
5 |
|
6 |
def prepare_table_config(crop_page):
|
7 |
"""Prepare table查找边界, 要求page为原始page
|
8 |
-
|
9 |
From https://github.com/jsvine/pdfplumber/issues/242
|
10 |
"""
|
11 |
page = crop_page.root_page # root/parent
|
@@ -60,7 +60,7 @@ def get_title_with_cropped_page(first_page):
|
|
60 |
title_bottom = word.bottom
|
61 |
elif word.text == "Abstract": # 获取页面abstract
|
62 |
top = word.top
|
63 |
-
|
64 |
user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
|
65 |
# 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
|
66 |
return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
|
@@ -75,7 +75,7 @@ def get_column_cropped_pages(pages, two_column=True):
|
|
75 |
new_pages.append(right)
|
76 |
else:
|
77 |
new_pages.append(page)
|
78 |
-
|
79 |
return new_pages
|
80 |
|
81 |
def parse_pdf(filename, two_column = True):
|
@@ -94,7 +94,7 @@ def parse_pdf(filename, two_column = True):
|
|
94 |
name_top=name_top,
|
95 |
name_bottom=name_bottom,
|
96 |
record_chapter_name = True,
|
97 |
-
|
98 |
page_start=page_start,
|
99 |
page_stop=None,
|
100 |
|
@@ -114,7 +114,7 @@ def parse_pdf(filename, two_column = True):
|
|
114 |
if word.size >= 11: # 出现chapter name
|
115 |
if cur_chapter is None:
|
116 |
cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
|
117 |
-
elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
|
118 |
# 不再继续写chapter name
|
119 |
cur_chapter.page_stop = page.page_number # stop id
|
120 |
chapters.append(cur_chapter)
|
@@ -143,7 +143,7 @@ def parse_pdf(filename, two_column = True):
|
|
143 |
text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
|
144 |
|
145 |
logging.getLogger().setLevel(level)
|
146 |
-
return Document(
|
147 |
|
148 |
BASE_POINTS = """
|
149 |
1. Who are the authors?
|
|
|
1 |
from types import SimpleNamespace
|
2 |
import pdfplumber
|
3 |
import logging
|
4 |
+
from langchain.docstore.document import Document
|
5 |
|
6 |
def prepare_table_config(crop_page):
|
7 |
"""Prepare table查找边界, 要求page为原始page
|
8 |
+
|
9 |
From https://github.com/jsvine/pdfplumber/issues/242
|
10 |
"""
|
11 |
page = crop_page.root_page # root/parent
|
|
|
60 |
title_bottom = word.bottom
|
61 |
elif word.text == "Abstract": # 获取页面abstract
|
62 |
top = word.top
|
63 |
+
|
64 |
user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
|
65 |
# 裁剪掉上半部分, within_bbox: full_included; crop: partial_included
|
66 |
return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
|
|
|
75 |
new_pages.append(right)
|
76 |
else:
|
77 |
new_pages.append(page)
|
78 |
+
|
79 |
return new_pages
|
80 |
|
81 |
def parse_pdf(filename, two_column = True):
|
|
|
94 |
name_top=name_top,
|
95 |
name_bottom=name_bottom,
|
96 |
record_chapter_name = True,
|
97 |
+
|
98 |
page_start=page_start,
|
99 |
page_stop=None,
|
100 |
|
|
|
114 |
if word.size >= 11: # 出现chapter name
|
115 |
if cur_chapter is None:
|
116 |
cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
|
117 |
+
elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
|
118 |
# 不再继续写chapter name
|
119 |
cur_chapter.page_stop = page.page_number # stop id
|
120 |
chapters.append(cur_chapter)
|
|
|
143 |
text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
|
144 |
|
145 |
logging.getLogger().setLevel(level)
|
146 |
+
return Document(page_content=text, metadata={"title": title})
|
147 |
|
148 |
BASE_POINTS = """
|
149 |
1. Who are the authors?
|
modules/presets.py
CHANGED
@@ -68,6 +68,7 @@ ONLINE_MODELS = [
|
|
68 |
"gpt-4-32k",
|
69 |
"gpt-4-32k-0314",
|
70 |
"xmchat",
|
|
|
71 |
"yuanai-1.0-base_10B",
|
72 |
"yuanai-1.0-translate",
|
73 |
"yuanai-1.0-dialog",
|
|
|
68 |
"gpt-4-32k",
|
69 |
"gpt-4-32k-0314",
|
70 |
"xmchat",
|
71 |
+
"Google PaLM",
|
72 |
"yuanai-1.0-base_10B",
|
73 |
"yuanai-1.0-translate",
|
74 |
"yuanai-1.0-dialog",
|
requirements.txt
CHANGED
@@ -15,4 +15,12 @@ pdfplumber
|
|
15 |
pandas
|
16 |
commentjson
|
17 |
openpyxl
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
pandas
|
16 |
commentjson
|
17 |
openpyxl
|
18 |
+
pandoc
|
19 |
+
wolframalpha
|
20 |
+
faiss-cpu
|
21 |
+
google-search-results
|
22 |
+
arxiv
|
23 |
+
wikipedia
|
24 |
+
google.generativeai
|
25 |
+
openai
|
26 |
+
unstructured
|