iAIChat commited on
Commit
73b7e8f
1 Parent(s): 72c48f1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import atexit
2
+ import gradio as gr
3
+ #from langchain.document_loaders import UnstructuredPDFLoader
4
+ from langchain.document_loaders import PyPDFLoader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ #from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain.vectorstores import Pinecone
8
+ import pinecone
9
+ import requests
10
+ import sys
11
+ #from langchain.prompts.chat import (ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate)
12
+ from langchain.chains.question_answering import load_qa_chain
13
+ #from langchain.chains import RetrievalQA
14
+ from langchain import PromptTemplate
15
+ from langchain import HuggingFaceHub
16
+ from PyPDF2 import PdfReader
17
+ #from langchain.document_loaders import TextLoader
18
+ #from sentence_transformers.util import semantic_search
19
+ from pathlib import Path
20
+ from time import sleep
21
+ #import pandas as pd
22
+ #import torch
23
+ import os
24
+ import random
25
+ import string
26
+
27
+ from dotenv import load_dotenv
28
+ load_dotenv()
29
+
30
# Load the source PDF and split its text into overlapping chunks for embedding.
file_path = os.path.join(os.getcwd(), "valuation.pdf")

# Extract raw text from every page; some pages yield None/'' so filter them.
# Join once instead of quadratic `raw_text +=` in a loop.
reader = PdfReader(file_path)
raw_text = "".join(
    page_text for page_text in (page.extract_text() for page in reader.pages)
    if page_text
)

# ~1000-char chunks with a 100-char overlap (striding over the text) so
# adjacent chunks share context for retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
db_texts = text_splitter.split_text(raw_text)
51
+
52
class HFEmbeddings:
    """Minimal embeddings client backed by the Hugging Face Inference API.

    Mimics the LangChain embeddings interface (``embed_documents`` and
    ``__call__``) just enough to be passed to ``Pinecone.from_texts`` /
    ``Pinecone.from_existing_index``.
    """

    def __init__(self, api_url, headers):
        # api_url: feature-extraction pipeline endpoint for the chosen model.
        # headers: auth headers, e.g. {"Authorization": "Bearer <token>"}.
        self.api_url = api_url
        self.headers = headers

    def get_embeddings(self, texts):
        """POST *texts* to the inference endpoint and return the parsed JSON.

        ``wait_for_model`` makes the API block until the model is loaded
        instead of returning a 503 immediately. Raises ``requests.HTTPError``
        on a non-2xx response — previously an HF error payload (e.g. for a
        bad token) was silently returned as if it were embeddings.
        """
        response = requests.post(
            self.api_url,
            headers=self.headers,
            json={"inputs": texts, "options": {"wait_for_model": True}},
        )
        response.raise_for_status()  # fail fast on auth/model errors
        return response.json()

    def embed_documents(self, texts):
        """Embed a list of documents (delegates to the HTTP endpoint)."""
        return self.get_embeddings(texts)

    def __call__(self, texts):
        # Some LangChain code paths invoke the embeddings object directly.
        return self.embed_documents(texts)
68
+
69
# Hugging Face configuration — every value is read from the environment
# (populated by load_dotenv() above).
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
model_id = os.environ.get('model_id')    # embedding model id
hf_token = os.environ.get('hf_token')    # Inference API token
repo_id = os.environ.get('repo_id')      # LLM repo used by HuggingFaceHub below

# Feature-extraction endpoint for the embedding model, plus its auth header.
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

# Embeddings client used for both indexing and querying.
hf_embeddings = HFEmbeddings(api_url, headers)
78
+
79
# Pinecone account / index configuration (from the environment).
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME')

index_name = PINECONE_INDEX_NAME
# Fixed namespace for this app's vectors within the shared index.
namespace = "HF-GRADIO-0909"

# Connect to Pinecone and upsert the PDF chunks as embeddings.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
vector_db = Pinecone.from_texts(
    db_texts,
    hf_embeddings,
    index_name=index_name,
    namespace=namespace,
)

print("***********************************")
print("Pinecone Vector/Embedding DB Ready.")
print()
112
+
113
# LLM served via the Hugging Face Hub inference API.
# Low temperature keeps answers focused; eos_token_id 49155 terminates
# generation for the chosen chat model.
llm = HuggingFaceHub(
    repo_id=repo_id,
    model_kwargs={
        "min_length": 100,
        "max_new_tokens": 1024,
        "do_sample": True,
        "temperature": 0.1,
        "top_k": 50,
        "top_p": 0.95,
        "eos_token_id": 49155,
    },
)

# "stuff" chain: concatenates all retrieved documents into a single prompt.
chain = load_qa_chain(llm=llm, chain_type="stuff")
121
+
122
def run_chain(user_query):
    """Answer *user_query* against the indexed PDF via retrieval QA.

    Retrieves the 5 most similar chunks from the Pinecone index and runs
    the "stuff" QA chain over them.

    Returns the cleaned answer string, or "Invalid inputs." when the query
    is empty/whitespace-only. (The original guard was a redundant triple
    condition equivalent to ``bool(user_query.strip())``, and the invalid
    branch silently returned None, leaving the Gradio output blank.)
    """
    # Reject None, empty, or whitespace-only queries up front.
    if not user_query or not user_query.strip():
        print("Invalid inputs.")
        return "Invalid inputs."

    print("Your query:\n" + user_query)
    # Re-open the existing index rather than re-uploading vectors.
    vector_db_from_index = Pinecone.from_existing_index(
        index_name, hf_embeddings, namespace=namespace
    )
    ss_results = vector_db_from_index.similarity_search(
        query=user_query, namespace=namespace, k=5
    )
    initial_ai_response = chain.run(input_documents=ss_results, question=user_query)
    # The model emits an '<|end|>' marker; keep only the text before it.
    temp_ai_response = initial_ai_response.partition('<|end|>')[0]
    # Collapse newlines so the answer renders as a single paragraph.
    final_ai_response = temp_ai_response.replace('\n', '')
    return final_ai_response
133
+
134
# Wire the QA function into a single-textbox Gradio UI and start the server.
iface = gr.Interface(
    fn=run_chain,
    inputs="text",
    outputs="text",
    title="AI Response",
)
iface.launch()