ragtest-sakimilo / evaluate_model.py
lingyit1108's picture
to create RAGAs result with triad of metrics
b580d80
raw
history blame
2.83 kB
import os, time
import pandas as pd
from tqdm import tqdm
import chromadb
import openai
from llama_index import (
SimpleDirectoryReader,
StorageContext,
Document,
VectorStoreIndex,
ServiceContext
)
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms import OpenAI
from llama_index.embeddings import HuggingFaceEmbedding
from trulens_eval import Tru
import utils
from utils import get_prebuilt_trulens_recorder
# Configure the OpenAI client at import time with the key returned by the
# project's `utils.get_openai_api_key()` helper, so it is set before main() runs.
openai.api_key = utils.get_openai_api_key()
def main():
    """Query the persisted vector index with every evaluation question and export the records.

    The function:

    1. Builds a query engine on top of the Chroma collection "quickstart"
       stored under ``./models/chroma_db``, using the ``gpt-3.5-turbo-1106``
       LLM and the embedding model at ``./models/fine-tuned-embeddings``.
    2. Collects evaluation questions from the ``question`` column of
       ``./database/mock_qna_source.csv`` (null rows dropped) and from
       ``./raw_documents/eval_questions.txt`` (questions separated by a
       blank line).
    3. Sends every question through the query engine inside the recorder
       returned by ``get_prebuilt_trulens_recorder``.
    4. Writes the collected records to ``./results/records.csv`` and the
       elapsed wall-clock time (whole minutes) to ``./results/time_cost.txt``.

    Raises:
        ValueError: If neither source yields at least one question.
    """
    start_time = time.time()

    llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.0)
    fine_tuned_path = "local:./models/fine-tuned-embeddings"

    db = chromadb.PersistentClient(path="./models/chroma_db")
    chroma_collection = db.get_or_create_collection("quickstart")

    # Use the Chroma collection as the vector store behind the storage context.
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    service_context = ServiceContext.from_defaults(llm=llm, embed_model=fine_tuned_path)

    print("Loading embeddings from vector store..")
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        storage_context=storage_context,
        service_context=service_context
    )
    query_engine = index.as_query_engine()

    mock_qna_source = pd.read_csv("./database/mock_qna_source.csv")
    mock_qna_source = mock_qna_source[mock_qna_source["question"].notnull()]
    print("mock_qna_source.shape", mock_qna_source.shape)

    # Explicit encoding keeps decoding independent of the machine's locale.
    with open("./raw_documents/eval_questions.txt", "r", encoding="utf-8") as fp:
        questions_content = fp.read()

    # Questions are separated by a blank line. A trailing newline or extra
    # blank lines would otherwise yield empty / whitespace-padded entries that
    # get sent to the LLM as real questions, so strip each chunk and drop blanks.
    questions_content_ls = [
        chunk.strip()
        for chunk in questions_content.split("\n\n")
        if chunk.strip()
    ]

    eval_questions = mock_qna_source["question"].tolist() + questions_content_ls
    if not eval_questions:
        # Fail with a clear message instead of an IndexError on eval_questions[0].
        raise ValueError(
            "No evaluation questions found in ./database/mock_qna_source.csv "
            "or ./raw_documents/eval_questions.txt"
        )

    # Smoke test: run the first question once, outside the recorder, and show
    # the answer so a broken setup is visible before the full loop starts.
    response = query_engine.query(eval_questions[0])
    print(str(response))

    tru = Tru(database_file="./models/trulens_eval.sqlite")
    tru_recorder = get_prebuilt_trulens_recorder(query_engine,
                                                 app_id="Direct Query Engine")

    print("Sending each question to llm..")
    # Only the side effect of querying inside the recorder context matters
    # here; the individual responses are not used.
    with tru_recorder:
        for question in tqdm(eval_questions):
            query_engine.query(question)

    # NOTE(review): presumably an empty app_ids list selects every app in the
    # database - confirm against the TruLens documentation.
    records, _feedback = tru.get_records_and_feedback(app_ids=[])

    os.makedirs("./results", exist_ok=True)
    records.to_csv("./results/records.csv", index=False)

    # Show where the records database lives (URL rendered without masking).
    print(tru.db.engine.url.render_as_string(hide_password=False))

    end_time = time.time()
    time_spent_mins = (end_time - start_time) / 60
    with open("./results/time_cost.txt", "w") as fp:
        # int() truncates, so runs shorter than a minute are reported as 0 mins.
        fp.write(f"Takes {int(time_spent_mins)} mins to create llm evaluation.")
# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()