samsonleegh committed
Commit a8fcfee · verified · 1 Parent(s): 29213bb

Create app.py

Files changed (1)
app.py +129 -0
app.py ADDED
from llama_index.llms.ollama import Ollama
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core import StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import load_index_from_storage
import os
import time
from dotenv import load_dotenv
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler, CBEventType
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.llms.openai import OpenAI
from llama_index.llms.groq import Groq
from llama_parse import LlamaParse
from llama_index.core.indices.query.query_transform.base import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.schema import MetadataMode
from llama_index.core.ingestion import IngestionPipeline

load_dotenv()
# OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
LLAMAINDEX_API_KEY = os.getenv('LLAMAINDEX_API_KEY')

llm = Groq(model="llama3-70b-8192")  # alternative: "llama3-8b-8192"
Settings.llm = llm

# set up callback manager
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
Settings.callback_manager = callback_manager
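# (print_trace_on_end=True makes LlamaDebugHandler dump a full event trace after each query)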

# embedding model used to convert document chunks into vectors for indexing
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model

# create splitter for chunking documents
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
Settings.transformations = [splitter]

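# load a previously persisted index if one exists; otherwise parse, enrich,
# and index the documents from scratch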
if os.path.exists("./vectordb"):
    storage_context = StorageContext.from_defaults(persist_dir="./vectordb")
    index = load_index_from_storage(storage_context)
else:
    parser = LlamaParse(
        api_key=LLAMAINDEX_API_KEY,
        result_type="markdown",  # "markdown" and "text" are available
        verbose=True,
    )
    filename_fn = lambda filename: {"file_name": filename}
    required_exts = [".pdf", ".docx"]
    file_extractor = {".pdf": parser}
    reader = SimpleDirectoryReader(
        "./data",
        file_extractor=file_extractor,
        required_exts=required_exts,
        recursive=True,
        file_metadata=filename_fn,
    )
    documents = reader.load_data()
    # prepend each document's metadata (file name) to its text so sources stay retrievable
    for doc in documents:
        doc.text = str(doc.metadata) + ' ' + doc.text
    print("creating index with %d documents" % len(documents))
    # index = VectorStoreIndex.from_documents(documents, embed_model=embed_model, text_splitter=splitter)
    extractor_llm = Groq(model="llama3-70b-8192", temperature=0.1, max_tokens=512)  # alternative: OpenAI(temperature=0.1, model="gpt-3.5-turbo", max_tokens=512)

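    # enrich each chunk with an LLM-generated summary of itself and its neighbours,
    # plus questions it can answer; this metadata is embedded alongside the text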
    node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
    extractors = [
        SummaryExtractor(summaries=["prev", "self", "next"], llm=extractor_llm),
        QuestionsAnsweredExtractor(
            questions=3, llm=extractor_llm, metadata_mode=MetadataMode.EMBED
        ),
    ]
    nodes = node_parser.get_nodes_from_documents(documents)
    nodes_extract_ls = []
    print('extracting from:', len(nodes), 'nodes.')
    batch_size = 5
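    # run the extraction pipeline in small batches so a single rate-limit error
    # does not lose the whole job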
    for i in range(0, len(nodes), batch_size):
        print(i)
        nodes_batch_raw = nodes[i:i + batch_size]
        pipeline = IngestionPipeline(transformations=[node_parser, *extractors])
        try:
            nodes_batch = pipeline.run(nodes=nodes_batch_raw, in_place=False, show_progress=True)
        except Exception:
            time.sleep(30)  # API call limit reached; sleep 30 seconds before retrying
            nodes_batch = pipeline.run(nodes=nodes_batch_raw, in_place=False, show_progress=True)
        nodes_extract_ls.append(nodes_batch)
    # flatten the per-batch lists into a single list of nodes
    nodes_extract = [x for xs in nodes_extract_ls for x in xs]
    index = VectorStoreIndex(nodes_extract)

    # persist so later runs skip parsing and metadata extraction entirely
    index.storage_context.persist(persist_dir="./vectordb")

query_engine = index.as_query_engine(
    similarity_top_k=5,
    # node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    verbose=True,
)
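# similarity_top_k=5 sends the five best-matching chunks to the LLM; the commented
# postprocessor would additionally drop chunks scoring below 0.7 similarity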
# HyDE: transform the query into a hypothetical document, then use doc-to-doc similarity matching
# hyde = HyDEQueryTransform(include_original=True)
# hyde_query_engine = TransformQueryEngine(query_engine, query_transform=hyde)

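# minimal Gradio UI: question in a textbox, answer plus source reference out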
import gradio as gr

def retrieve(question):
    qns_w_source = ("Answer the following question: " + question +
                    " Followed by providing the page and file name of the source document as well, thank you!")
    streaming_response = query_engine.query(qns_w_source)
    # sources = streaming_response.get_formatted_sources(length=5000)
    return str(streaming_response)  # + "\n" + str(sources)

demo = gr.Interface(fn=retrieve, inputs="textbox", outputs="textbox")

demo.launch(share=True)  # Share your demo with just 1 extra parameter 🚀