Tuchuanhuhuhu committed
Commit a592279 · Parents: 66e5db6, 8043b80

Merge branch 'main' into tooling

config_example.json CHANGED
@@ -2,9 +2,10 @@
     // Your OpenAI API Key; generally required.
     // If it is left as "openai_api_key": "", the API Key must instead be entered in the web UI.
     "openai_api_key": "",
+    "google_palm_api_key": "",
+    "xmchat_api_key": "",
     "usage_limit": 120, // Monthly spending limit for the API Key, in USD
     // Your xmchat API Key, which is different from the OpenAI API Key
-    "xmchat_api_key": "",
     "language": "auto",
     // To use a proxy, uncomment the two lines below and replace the proxy URL
     // "https_proxy": "http://127.0.0.1:1079",
modules/config.py CHANGED
@@ -103,7 +103,7 @@ api_host = os.environ.get("api_host", config.get("api_host", ""))
 if api_host:
     shared.state.set_api_host(api_host)
 
-default_chuanhu_assistant_model = config.get("default_chuanhu_assistant_model", "gpt-4")
+default_chuanhu_assistant_model = config.get("default_chuanhu_assistant_model", "gpt-3.5-turbo")
 os.environ["GOOGLE_CSE_ID"] = config.get("GOOGLE_CSE_ID", "")
 os.environ["GOOGLE_API_KEY"] = config.get("GOOGLE_API_KEY", "")
 os.environ["WOLFRAM_ALPHA_APPID"] = config.get("WOLFRAM_ALPHA_APPID", "")
modules/index_func.py CHANGED
@@ -42,7 +42,7 @@ def get_documents(file_src):
     for file in file_src:
         filepath = file.name
         filename = os.path.basename(filepath)
-        file_type = os.path.splitext(filepath)[1]
+        file_type = os.path.splitext(filename)[1]
         logging.info(f"loading file: {filename}")
         try:
             if file_type == ".pdf":
@@ -87,8 +87,9 @@ def get_documents(file_src):
                 loader = TextLoader(filepath, "utf8")
                 texts = loader.load()
         except Exception as e:
+            import traceback
             logging.error(f"Error loading file: {filename}")
-            pass
+            traceback.print_exc()
 
         texts = text_splitter.split_documents(texts)
         documents.extend(texts)
@@ -142,6 +143,7 @@ def construct_index(
             return index
 
     except Exception as e:
+        import traceback
        logging.error("索引构建失败!", e)
-        print(e)
+        traceback.print_exc()
        return None
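
Note: the committed code pairs `logging.error` with `traceback.print_exc()`, which writes the traceback to stderr rather than into the logging stream. An equivalent sketch that keeps everything in one log record (the ValueError is only a stand-in for the real file-loading work):

import logging

try:
    raise ValueError("demo failure")  # stand-in for loading a file / building the index
except Exception:
    # logging.exception appends the full traceback to the log record,
    # so no separate traceback.print_exc() call is needed.
    logging.exception("Error loading file")
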
modules/models/PaLM.py ADDED
@@ -0,0 +1,10 @@
+from .base_model import BaseLLMModel, CallbackToIterator, ChuanhuCallbackHandler
+from langchain.chat_models import ChatGooglePalm
+
+class PaLM_Client(BaseLLMModel):
+    def __init__(self, model_name, user="") -> None:
+        super().__init__(model_name, user)
+        self.llm = ChatGooglePalm(google_api_key="")
+
+    def get_answer_at_once(self):
+        self.llm.generate(self.history)
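
Note: as committed, `get_answer_at_once` discards the result of `self.llm.generate(...)` and returns nothing, and it passes Chuanhu's history dicts where LangChain chat models expect message objects, so this client is a stub. A hedged sketch of what a working version might look like (the role mapping and the (answer, token_count) return shape are assumptions based on the other clients in this repo):

from langchain.chat_models import ChatGooglePalm
from langchain.schema import AIMessage, HumanMessage, SystemMessage

ROLE_MAP = {"system": SystemMessage, "user": HumanMessage, "assistant": AIMessage}

def history_to_messages(history):
    # Convert {"role": ..., "content": ...} dicts into LangChain message objects.
    return [ROLE_MAP[m["role"]](content=m["content"]) for m in history]

def get_answer_at_once(self):
    # generate() takes a list of message lists and returns an LLMResult;
    # the first generation's text is the model's reply.
    result = self.llm.generate([history_to_messages(self.history)])
    answer = result.generations[0][0].text
    return answer, len(answer)  # token count crudely approximated by length
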
modules/models/base_model.py CHANGED
@@ -20,6 +20,7 @@ from enum import Enum
 
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain.callbacks.manager import BaseCallbackManager
+from langchain.callbacks import get_openai_callback
 
 from typing import Any, Dict, List, Optional, Union
 
@@ -108,6 +109,7 @@ class ModelType(Enum):
     MOSS = 5
     YuanAI = 6
     ChuanhuAgent = 7
+    PaLM = 8
 
     @classmethod
     def get_type(cls, model_name: str):
@@ -129,6 +131,8 @@ class ModelType(Enum):
             model_type = ModelType.YuanAI
         elif "川虎助理" in model_name_lower:
             model_type = ModelType.ChuanhuAgent
+        elif "palm" in model_name_lower:
+            model_type = ModelType.PaLM
         else:
             model_type = ModelType.Unknown
         return model_type
@@ -262,19 +266,20 @@ class BaseLLMModel:
         status = i18n("索引构建完成")
         # Summarize the document
         logging.info(i18n("生成内容总结中……"))
-        os.environ["OPENAI_API_KEY"] = self.api_key
-        from langchain.chains.summarize import load_summarize_chain
-        from langchain.prompts import PromptTemplate
-        from langchain.chat_models import ChatOpenAI
-        from langchain.callbacks import StdOutCallbackHandler
-        prompt_template = "Write a concise summary of the following:\n\n{text}\n\nCONCISE SUMMARY IN " + language + ":"
-        PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
-        handler = StdOutCallbackHandler()
-        llm = ChatOpenAI(callbacks=[handler])
-        chain = load_summarize_chain(llm, chain_type="map_reduce", return_intermediate_steps=True, map_prompt=PROMPT, combine_prompt=PROMPT)
-        summary = chain({"input_documents": list(index.docstore.__dict__["_dict"].values())}, return_only_outputs=True)["output_text"]
-        print(i18n("总结") + f": {summary}")
-        chatbot.append([i18n("总结"), summary])
+        with get_openai_callback() as cb:
+            os.environ["OPENAI_API_KEY"] = self.api_key
+            from langchain.chains.summarize import load_summarize_chain
+            from langchain.prompts import PromptTemplate
+            from langchain.chat_models import ChatOpenAI
+            from langchain.callbacks import StdOutCallbackHandler
+            prompt_template = "Write a concise summary of the following:\n\n{text}\n\nCONCISE SUMMARY IN " + language + ":"
+            PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
+            llm = ChatOpenAI()
+            chain = load_summarize_chain(llm, chain_type="map_reduce", return_intermediate_steps=True, map_prompt=PROMPT, combine_prompt=PROMPT)
+            summary = chain({"input_documents": list(index.docstore.__dict__["_dict"].values())}, return_only_outputs=True)["output_text"]
+            print(i18n("总结") + f": {summary}")
+            chatbot.append([i18n("上传了")+str(len(files))+"个文件", summary])
+            logging.info(cb)
         return gr.Files.update(), chatbot, status
 
     def prepare_inputs(self, real_inputs, use_websearch, files, reply_language, chatbot):
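
Note: `get_openai_callback` is a context manager that aggregates prompt/completion token counts and estimated cost across every OpenAI call made inside the `with` block, which is why `logging.info(cb)` at the end records the usage of the whole map-reduce summarization. A self-contained sketch of the pattern (model and prompt are illustrative; a valid OPENAI_API_KEY is assumed):

from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI()  # reads OPENAI_API_KEY from the environment
with get_openai_callback() as cb:
    llm.predict("Write a concise summary of LangChain callbacks.")
    # cb accumulates usage across all OpenAI calls in this block
print(cb.total_tokens, cb.prompt_tokens, cb.completion_tokens, cb.total_cost)
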
modules/models/models.py CHANGED
@@ -606,6 +606,9 @@ def get_model(
         elif model_type == ModelType.ChuanhuAgent:
             from .ChuanhuAgent import ChuanhuAgent_Client
             model = ChuanhuAgent_Client(model_name, access_key, user_name=user_name)
+        elif model_type == ModelType.PaLM:
+            from .PaLM import PaLM_Client
+            model = PaLM_Client(model_name, user_name=user_name)
         elif model_type == ModelType.Unknown:
             raise ValueError(f"未知模型: {model_name}")
         logging.info(msg)
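
Note: this branch is reached because `ModelType.get_type` (see base_model.py above) lower-cases the model name and matches on substrings, so the "Google PaLM" entry added to ONLINE_MODELS in presets.py resolves to ModelType.PaLM. A toy reduction of that dispatch (simplified sketch, not the full enum):

def get_type_simplified(model_name: str) -> str:
    # Substring match on the lowercased name, as in ModelType.get_type.
    name = model_name.lower()
    if "palm" in name:
        return "PaLM"  # "Google PaLM" lands here
    return "Unknown"

assert get_type_simplified("Google PaLM") == "PaLM"
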
modules/overwrites.py CHANGED
@@ -1,7 +1,6 @@
 from __future__ import annotations
 import logging
 
-from llama_index import Prompt
 from typing import List, Tuple
 import mdtex2html
 from gradio_client import utils as client_utils
@@ -10,15 +9,6 @@ from modules.presets import *
 from modules.index_func import *
 from modules.config import render_latex
 
-def compact_text_chunks(self, prompt: Prompt, text_chunks: List[str]) -> List[str]:
-    logging.debug("Compacting text chunks...🚀🚀🚀")
-    combined_str = [c.strip() for c in text_chunks if c.strip()]
-    combined_str = [f"[{index+1}] {c}" for index, c in enumerate(combined_str)]
-    combined_str = "\n\n".join(combined_str)
-    # resplit based on self.max_chunk_overlap
-    text_splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
-    return text_splitter.split_text(combined_str)
-
 
 def postprocess(
     self,
modules/pdf_func.py CHANGED
@@ -1,11 +1,11 @@
 from types import SimpleNamespace
 import pdfplumber
 import logging
-from llama_index import Document
+from langchain.docstore.document import Document
 
 def prepare_table_config(crop_page):
     """Prepare table detection boundaries; requires page to be the original page
-
+
     From https://github.com/jsvine/pdfplumber/issues/242
     """
     page = crop_page.root_page # root/parent
@@ -60,7 +60,7 @@ def get_title_with_cropped_page(first_page):
             title_bottom = word.bottom
         elif word.text == "Abstract": # locate the page abstract
             top = word.top
-
+
     user_info = [i["text"] for i in extract_words(first_page.within_bbox((x0,title_bottom,x1,top)))]
     # crop away the top half; within_bbox: fully included, crop: partially included
     return title, user_info, first_page.within_bbox((x0,top,x1,bottom))
@@ -75,7 +75,7 @@ def get_column_cropped_pages(pages, two_column=True):
             new_pages.append(right)
         else:
             new_pages.append(page)
-
+
     return new_pages
 
 def parse_pdf(filename, two_column = True):
@@ -94,7 +94,7 @@ def parse_pdf(filename, two_column = True):
             name_top=name_top,
             name_bottom=name_bottom,
             record_chapter_name = True,
-
+
             page_start=page_start,
             page_stop=None,
 
@@ -114,7 +114,7 @@ def parse_pdf(filename, two_column = True):
             if word.size >= 11: # a chapter name appears
                 if cur_chapter is None:
                     cur_chapter = create_chapter(page.page_number, word.top, word.bottom)
-                elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
+                elif not cur_chapter.record_chapter_name or (cur_chapter.name_bottom != cur_chapter.name_bottom and cur_chapter.name_top != cur_chapter.name_top):
                     # stop appending to the chapter name
                     cur_chapter.page_stop = page.page_number # stop id
                     chapters.append(cur_chapter)
@@ -143,7 +143,7 @@ def parse_pdf(filename, two_column = True):
         text += f"The {idx}th Chapter {chapter.name}: " + " ".join(chapter.text) + "\n"
 
     logging.getLogger().setLevel(level)
-    return Document(text=text, extra_info={"title": title})
+    return Document(page_content=text, metadata={"title": title})
 
 BASE_POINTS = """
 1. Who are the authors?
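
Note: most hunks in this file are whitespace-only, and the unchanged elif compares `cur_chapter.name_bottom != cur_chapter.name_bottom` (always False), so only `record_chapter_name` actually drives that branch. The substantive change is the migration from `llama_index.Document` to LangChain's `Document`, which is a field rename: `text` becomes `page_content` and `extra_info` becomes `metadata`. A minimal check of the new shape (values illustrative):

from langchain.docstore.document import Document

doc = Document(page_content="chapter text here", metadata={"title": "Some Paper"})
print(doc.page_content)        # "chapter text here"
print(doc.metadata["title"])   # "Some Paper"
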
modules/presets.py CHANGED
@@ -68,6 +68,7 @@ ONLINE_MODELS = [
     "gpt-4-32k",
     "gpt-4-32k-0314",
     "xmchat",
+    "Google PaLM",
     "yuanai-1.0-base_10B",
     "yuanai-1.0-translate",
     "yuanai-1.0-dialog",
requirements.txt CHANGED
@@ -15,4 +15,12 @@ pdfplumber
 pandas
 commentjson
 openpyxl
-pandocs
+pandoc
+wolframalpha
+faiss-cpu
+google-search-results
+arxiv
+wikipedia
+google.generativeai
+openai
+unstructured