from huggingface_hub import login, InferenceClient
import os, gc, time, random, datetime, json, re

# Credentials are read from the environment rather than hard-coded.
HF_TOKEN = os.getenv('HF_TOKEN')
SERP_API_KEY = os.getenv('SERP_KEY')

login(token=HF_TOKEN)

import gradio as gr
from transformers import CodeAgent, Tool, ToolCollection, load_tool, ReactCodeAgent, ReactJsonAgent
from transformers.agents import PythonInterpreterTool
from transformers.agents.prompts import DEFAULT_REACT_CODE_SYSTEM_PROMPT, DEFAULT_REACT_JSON_SYSTEM_PROMPT
from langchain.memory import ConversationBufferMemory
import bs4
import requests
from llm_engine import HfEngine
import datasets
import spaces
import tqdm

# Embeddings, vector store, and text splitting for the documentation RAG index.
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import VectorStore

# Browsing tools and helpers for the web-surfer agent.
from duckduckgo_search import DDGS
from web_surfer import (
    SearchInformationTool,
    NavigationalSearchTool,
    VisitTool,
    DownloadTool,
    PageUpTool,
    PageDownTool,
    FinderTool,
    FindNextTool,
    ArchiveSearchTool,
)
from mdconvert import MarkdownConverter
from visual_qa import VisualQATool, VisualQAGPT4Tool

def search_ducky(query):
    """Run a DuckDuckGo text search and return the concatenated result snippets."""
    with DDGS() as ddgs:
        results = list(ddgs.text(query, max_results=10))
        content = ''
        if results:
            for result in results:
                content += result['body']
        return content

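# Quick sanity check (hypothetical query, not part of the original script):
#   search_ducky("huggingface transformers agents")
# should return the concatenated snippet text of up to 10 search results.
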
# Build a small RAG knowledge base over the Hugging Face documentation dataset.
knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
source_docs = [
    Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]})
    for doc in knowledge_base
]

# Split into 500-character chunks and keep the first 1000 chunks to bound indexing time.
docs_processed = RecursiveCharacterTextSplitter(chunk_size=500).split_documents(source_docs)[:1000]
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
vectordb = FAISS.from_documents(documents=docs_processed, embedding=embedding_model)

all_sources = list({doc.metadata["source"] for doc in docs_processed})
print(all_sources)

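# Sanity check of the index (hypothetical query, not in the original script):
#   vectordb.similarity_search("How do I load a tokenizer?", k=1)
# should return the single closest documentation chunk.
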
class RetrieverTool(Tool):
    name = "retriever"
    description = "Retrieves some documents from the knowledge base that have the closest embeddings to the input query."
    inputs = {
        "query": {
            "type": "text",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        },
        "source": {
            "type": "text",
            "description": "",  # filled in at instantiation, once the list of sources is known
        },
    }
    output_type = "text"

    def __init__(self, vectordb: VectorStore, all_sources: list, **kwargs):
        super().__init__(**kwargs)
        self.vectordb = vectordb
        self.inputs["source"]["description"] = (
            f"The source of the documents to search, as a str representation of a list. "
            f"Possible values in the list are: {all_sources}. "
            f"If this argument is not provided, all sources will be searched."
        )

    def forward(self, query: str, source: str = None) -> str:
        assert isinstance(query, str), "Your search query must be a string"

        if source:
            # Accept either a bare source name or a str-encoded list of names.
            if isinstance(source, str) and "[" not in str(source):
                source = [source]
            source = json.loads(str(source).replace("'", '"'))

        docs = self.vectordb.similarity_search(
            query, filter=({"source": source} if source else None), k=3
        )

        if len(docs) == 0:
            return "No documents found with this filtering. Try removing the source filter."
        return "Retrieved documents:\n\n" + "\n===Document===\n".join([doc.page_content for doc in docs])

# Conversation memory (currently unused by the agents) and the shared LLM engine.
memory = ConversationBufferMemory(memory_key="chat_history")
llm_engine = HfEngine(model="Jopmt/JoPmt")

class SearchTool(Tool):
    name = "ask_search_agent"
    description = "A search agent that will browse the internet to answer a question. Use it to gather information, not for problem-solving."

    inputs = {
        "question": {
            "description": "Your question, as a natural language sentence. You are talking to an agent, so provide them with as much context as possible.",
            "type": "text",
        }
    }
    output_type = "text"

    def forward(self, question: str) -> str:
        # Delegate the question to the web-surfer agent defined below.
        return websurfer_agent.run(question)

# Tools for the main code agent: a Python interpreter, the search-agent proxy,
# and the documentation retriever.
tools = [PythonInterpreterTool(), SearchTool(), RetrieverTool(vectordb, all_sources)]
additional_authorized_imports = ['requests', 'bs4', 'os', 'time', 'datetime', 'json', 're']

# Browsing tools for the JSON-mode web-surfer agent.
WEB_TOOLS = [
    SearchInformationTool(),
    NavigationalSearchTool(),
    VisitTool(),
    DownloadTool(),
    PageUpTool(),
    PageDownTool(),
    FinderTool(),
    FindNextTool(),
    ArchiveSearchTool(),
]

websurfer_agent = ReactJsonAgent(tools=WEB_TOOLS, llm_engine=llm_engine, add_base_tools=True, max_iterations=1)
reagent = ReactCodeAgent(tools=tools, llm_engine=llm_engine, add_base_tools=True, max_iterations=1, additional_authorized_imports=additional_authorized_imports)

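# Direct (non-UI) invocation for local testing, with a hypothetical prompt:
#   print(reagent.run("Summarize what the transformers agents API does."))
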
def plix(inut, progress=gr.Progress(track_tqdm=True)):
    """Run the code agent on the user's prompt and return its answer."""
    goose = reagent.run(inut)
    return goose

# Gradio UI: a single prompt box wired to the agent, with a randomly chosen theme.
with gr.Blocks(
    theme=random.choice([
        gr.themes.Monochrome(),
        gr.themes.Base.from_hub("gradio/seafoam"),
        gr.themes.Base.from_hub("freddyaboulton/dracula_revamped"),
        gr.themes.Glass(),
        gr.themes.Base(),
    ]),
    analytics_enabled=False,
) as iface:
    out = gr.Textbox(label="🤗Output", lines=5, interactive=False)
    inut = gr.Textbox(label="Prompt")
    btn = gr.Button("GENERATE")
    btn.click(fn=plix, inputs=inut, outputs=out)

iface.queue(max_size=1, api_open=False)
iface.launch(max_threads=20, inline=False, show_api=False)