import streamlit as st
#from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup  # possibly needed by WebBaseLoader?
from langchain import HuggingFaceHub
import requests
import sys
from huggingface_hub import InferenceClient
import os
from dotenv import load_dotenv

load_dotenv()
hf_token = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
repo_id = os.environ.get('repo_id')
#port = os.getenv('port')
#OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

llm = HuggingFaceHub(repo_id=repo_id,  # for StarChat
                     huggingfacehub_api_token=hf_token,  # the parameter name huggingfacehub_api_token seems to be fine!
                     model_kwargs={"min_length": 512,  # for StarChat
                                   "max_new_tokens": 1024,
                                   "do_sample": True,  # for StarChat
                                   "temperature": 0.01,
                                   "top_k": 50,
                                   "top_p": 0.95,
                                   "eos_token_id": 49155})

#chain = load_summarize_chain(llm, chain_type="stuff")  # "stuff" mode errors easily: probably exceeds the LLM's token limit
chain = load_summarize_chain(llm, chain_type="refine")

print("Defining the function that strips the redundant 'Context:' text")

def remove_context(text):
    # Check whether 'Context:' is present
    if 'Context:' in text:
        # Find the position of the first '\n\n'
        end_of_context = text.find('\n\n')
        if end_of_context != -1:
            # Remove everything from 'Context:' up to the first '\n\n'
            return text[end_of_context + 2:]  # '+2' skips the two newline characters
    # If 'Context:' is absent (or no blank line was found), return the original text
    return text

print("Finished defining the 'Context:'-stripping function")

#text_splitter_rcs = RecursiveCharacterTextSplitter(
#    #separator = "\n",  # TypeError: TextSplitter.__init__() got an unexpected keyword argument 'separator'
#    chunk_size = 500,
#    chunk_overlap = 100,  # striding over the text
#    length_function = len,
#    )

#llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k")

url = st.text_input("Enter website URL to summarize (format: https://www.usinoip.com):")

if url.strip():  # a single strip() check covers all the empty/whitespace-only cases
    try:
        #loader = WebBaseLoader("https://www.usinoip.com/")
        with st.spinner("AI Thinking...Please wait a while to Cheers!"):
            print("Website to Chat: " + url)
            loader = WebBaseLoader(url)
            docs = loader.load()
            print("Webpage contents loaded")
            #split_docs = text_splitter_rcs.split_documents(docs)
            #print(split_docs)
            result = chain.run(docs)  # the result's type is unusual: it prints fine on its own, but concatenating it with another string for print errors!
            #result = chain.run(split_docs)
            # Root cause of the earlier recurring POST errors: chain.run(docs) does not return a str, which broke the program
            print("Chain run results:")
            print(result)
            result = str(result)
            print("Chain run results in str format:")
            print(result)
            cleaned_initial_ai_response = remove_context(result)
            print("AI response result cleaned initially: " + cleaned_initial_ai_response)
            # Keep only the text before the first '<|end|>', then drop the remaining StarChat role markers
            final_ai_response = (cleaned_initial_ai_response.split('<|end|>')[0]
                                 .strip()
                                 .replace('\n\n', '\n')
                                 .replace('<|user|>', '')
                                 .replace('<|system|>', '')
                                 .replace('<|assistant|>', ''))
            new_final_ai_response = final_ai_response.split('Unhelpful Answer:')[0].strip()
            final_result = new_final_ai_response.split('Note:')[0].strip()
            #print("AI Summarization: " + result)  # this errors; see the reason above
            print("AI Summarization:")
            print(final_result)
            st.write("AI Summarization:")
            st.write(final_result)
    except Exception as e:
        st.write("Wrong URL or URL not parsable.")
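
# ---------------------------------------------------------------------------
# Usage sketch (assumptions, not part of the original app): the script expects
# a .env file alongside it. The values below are illustrative only; repo_id
# should point at a StarChat-style model, since eos_token_id=49155 and the
# '<|end|>'/'<|user|>'/'<|assistant|>' markers cleaned above are StarChat
# conventions. The token shown is a placeholder, not a real credential, and
# app.py is an assumed filename for this script.
#
#   HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
#   repo_id=HuggingFaceH4/starchat-beta
#
# Then launch the app with Streamlit:
#
#   streamlit run app.py
# ---------------------------------------------------------------------------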