Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#import atexit
|
2 |
+
import gradio as gr
|
3 |
+
#from langchain.document_loaders import UnstructuredPDFLoader
|
4 |
+
from langchain.document_loaders import PyPDFLoader
|
5 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
+
#from langchain.text_splitter import CharacterTextSplitter
|
7 |
+
from langchain.vectorstores import Pinecone
|
8 |
+
import pinecone
|
9 |
+
import requests
|
10 |
+
import sys
|
11 |
+
#from langchain.prompts.chat import (ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate)
|
12 |
+
from langchain.chains.question_answering import load_qa_chain
|
13 |
+
#from langchain.chains import RetrievalQA
|
14 |
+
from langchain import PromptTemplate
|
15 |
+
from langchain import HuggingFaceHub
|
16 |
+
from PyPDF2 import PdfReader
|
17 |
+
#from langchain.document_loaders import TextLoader
|
18 |
+
#from sentence_transformers.util import semantic_search
|
19 |
+
from pathlib import Path
|
20 |
+
from time import sleep
|
21 |
+
#import pandas as pd
|
22 |
+
#import torch
|
23 |
+
import os
|
24 |
+
import random
|
25 |
+
import string
|
26 |
+
|
27 |
+
from dotenv import load_dotenv
|
28 |
+
load_dotenv()
|
29 |
+
|
30 |
+
# --- Load the source PDF and split it into overlapping text chunks. ---
# The PDF is expected to sit next to this script; every page's extractable
# text is concatenated and then chunked for embedding/indexing below.
file_path = os.path.join(os.getcwd(), "valuation.pdf")

data = PdfReader(file_path)

# Collect per-page text and join once (avoids quadratic string +=);
# pages with no extractable text (e.g. pure images) are skipped.
page_texts = []
for page in data.pages:
    text = page.extract_text()
    if text:
        page_texts.append(text)
raw_text = ''.join(page_texts)

# ~1000-char chunks with a 100-char overlap (striding over the text) keep
# each chunk within typical embedding-model limits while preserving context
# across chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
db_texts = text_splitter.split_text(raw_text)
|
51 |
+
|
52 |
+
class HFEmbeddings:
    """Minimal embeddings client backed by the Hugging Face Inference API.

    Duck-types the interface LangChain vector stores expect from an
    embeddings object: ``embed_documents`` for batches and ``embed_query``
    for a single query string.
    """

    def __init__(self, api_url, headers):
        # api_url: full feature-extraction pipeline endpoint for one model.
        # headers: HTTP headers, including the Bearer authorization token.
        self.api_url = api_url
        self.headers = headers

    def get_embeddings(self, texts):
        """POST *texts* to the inference endpoint and return the parsed JSON.

        ``wait_for_model`` makes the API block until the model is loaded
        instead of failing while it spins up.
        """
        response = requests.post(
            self.api_url,
            headers=self.headers,
            json={"inputs": texts, "options": {"wait_for_model": True}},
        )
        embeddings = response.json()
        return embeddings

    def embed_documents(self, texts):
        """Return one embedding vector per input text (LangChain interface)."""
        return self.get_embeddings(texts)

    def embed_query(self, text):
        """Return the embedding vector for a single query string.

        LangChain's ``similarity_search`` calls ``embed_query`` on the
        embeddings object; the original class lacked it, so query time
        failed with an AttributeError.
        """
        # NOTE(review): assumes the endpoint returns a list of vectors for a
        # list input — confirm against the feature-extraction pipeline.
        return self.get_embeddings([text])[0]

    def __call__(self, texts):
        return self.embed_documents(texts)
|
68 |
+
|
69 |
+
# --- Configuration from environment (.env loaded via load_dotenv above) ---
# Hugging Face credentials and model selection.
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
model_id = os.getenv('model_id')
hf_token = os.getenv('hf_token')
repo_id = os.getenv('repo_id')

# Pinecone account settings.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME')

# Embeddings client against the HF feature-extraction inference endpoint.
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}
hf_embeddings = HFEmbeddings(api_url, headers)

# Fixed namespace so repeated runs reuse the same vectors in the index.
index_name = PINECONE_INDEX_NAME
namespace = "HF-GRADIO-0909"

# Connect to Pinecone and upsert the PDF chunks as embeddings.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
vector_db = Pinecone.from_texts(db_texts, hf_embeddings, index_name=index_name, namespace=namespace)

print("***********************************")
print("Pinecone Vector/Embedding DB Ready.")
print()
|
112 |
+
|
113 |
+
# Generation settings for the hosted model: low temperature for focused
# answers, top-k / nucleus sampling, and a hard cap on new tokens.
generation_kwargs = {
    "min_length": 100,
    "max_new_tokens": 1024,
    "do_sample": True,
    "temperature": 0.1,
    "top_k": 50,
    "top_p": 0.95,
    "eos_token_id": 49155,
}

llm = HuggingFaceHub(repo_id=repo_id, model_kwargs=generation_kwargs)

# "stuff" chain: all retrieved documents are stuffed into a single prompt.
chain = load_qa_chain(llm=llm, chain_type="stuff")
|
121 |
+
|
122 |
+
def run_chain(user_query):
    """Answer *user_query* from the indexed PDF.

    Embeds the query, retrieves the 5 most similar chunks from the Pinecone
    namespace, and runs the "stuff" QA chain over them.

    Returns the cleaned model answer, or an error message for blank input
    (the original returned None, which rendered as an empty Gradio output).
    """
    # Guard clause: reject empty or whitespace-only queries. The original
    # condition also tested user_query.strip().isspace(), which can never
    # be True and was dropped.
    if not user_query or not user_query.strip():
        print("Invalid inputs.")
        return "Invalid inputs."

    print("Your query:\n" + user_query)
    vector_db_from_index = Pinecone.from_existing_index(index_name, hf_embeddings, namespace=namespace)
    ss_results = vector_db_from_index.similarity_search(query=user_query, namespace=namespace, k=5)
    initial_ai_response = chain.run(input_documents=ss_results, question=user_query)
    # The model may emit text after its end-of-turn marker; keep only the
    # part before '<|end|>' and flatten newlines for single-line display.
    final_ai_response = initial_ai_response.partition('<|end|>')[0].replace('\n', '')
    return final_ai_response
|
133 |
+
|
134 |
+
# Minimal Gradio UI: one text input (the question), one text output (answer).
iface = gr.Interface(fn=run_chain, inputs="text", outputs="text", title="AI Response")

if __name__ == "__main__":
    # Launch only when executed as a script, so importing this module
    # (e.g. from tests or tooling) does not start the web server.
    iface.launch()
|