import os
import time

import chromadb
import nest_asyncio
import numpy as np
import openai

from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
)
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore

from trulens_eval import Tru

import utils
from utils import get_prebuilt_trulens_recorder

nest_asyncio.apply()
openai.api_key = utils.get_openai_api_key()


def main():
    # only run if no evaluation database exists from a previous run
    if not os.path.exists("./default.sqlite"):
        start_time = time.time()

        llm = OpenAI(model="gpt-3.5-turbo-1106", temperature=0.0)
        fine_tuned_path = "local:./models/fine-tuned-embeddings"

        Settings.llm = llm
        Settings.embed_model = fine_tuned_path

        db = chromadb.PersistentClient(path="./models/chroma_db")
        chroma_collection = db.get_or_create_collection("quickstart")

        # assign chroma as the vector_store; from_vector_store builds its own
        # storage context internally, so none needs to be passed in
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

        # load the index that already lives in the chroma collection
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
        query_engine = index.as_query_engine()

        # read the evaluation questions, one blank-line-separated block each
        separator = "\n\n"
        eval_questions = []
        with open("./raw_documents/eval_questions.txt", "r") as file:
            content = file.read()
            for question in content.split(separator):
                print(question)
                print(separator)
                eval_questions.append(question.strip())

        # smoke-test the query engine on the first question
        response = query_engine.query(eval_questions[0])
        print(str(response))

        tru = Tru(database_file="./models/trulens_eval.sqlite")
        tru_recorder = get_prebuilt_trulens_recorder(
            query_engine, app_id="Direct Query Engine"
        )

        print("Sending each question to the LLM ...")
        with tru_recorder as recording:
            for question in eval_questions:
                response = query_engine.query(question)

        records, feedback = tru.get_records_and_feedback(app_ids=[])

        os.makedirs("./results", exist_ok=True)
        records.to_csv("./results/records.csv", index=False)

        print(tru.db.engine.url.render_as_string(hide_password=False))

        end_time = time.time()
        time_spent_mins = (end_time - start_time) / 60
        with open("./results/time_cost.txt", "w") as fp:
            fp.write(f"Took {int(time_spent_mins)} mins to run the LLM evaluation.")
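
def build_finetuned_embed_model():
    # Hedged sketch, not called anywhere in this script: the "local:<path>"
    # string used in main() is shorthand that llama-index resolves to a
    # HuggingFaceEmbedding pointed at the same directory. This hypothetical
    # helper shows the explicit equivalent, assuming the fine-tuned model
    # directory used above.
    from llama_index.embeddings.huggingface import HuggingFaceEmbedding

    return HuggingFaceEmbedding(model_name="./models/fine-tuned-embeddings")
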
if __name__ == "__main__":
    # main()

    # scratch run adapted from the trulens_eval quickstart: index the Q&A
    # document directly and define feedback functions for groundedness,
    # answer relevance and context relevance
    tru = Tru()

    documents = SimpleDirectoryReader(
        input_files=["./raw_documents/qna.txt"]
    ).load_data()
    index = VectorStoreIndex.from_documents(documents)
    query_engine = index.as_query_engine()

    response = query_engine.query("Which is not a government healthcare philosophy?")
    print(response)

    from trulens_eval import Feedback
    from trulens_eval.feedback import Groundedness
    from trulens_eval.feedback.provider.openai import OpenAI as OpenAIProvider

    # the alias avoids shadowing the openai module and the llama-index OpenAI class
    provider = OpenAIProvider()

    # select context to be used in feedback; the location of context is app specific
    from trulens_eval.app import App

    context = App.select_context(query_engine)

    # define a groundedness feedback function
    grounded = Groundedness(groundedness_provider=provider)
    f_groundedness = (
        Feedback(grounded.groundedness_measure_with_cot_reasons)
        .on(context.collect())  # collect context chunks into a list
        .on_output()
        .aggregate(grounded.grounded_statements_aggregator)
    )

    # question/answer relevance between the overall question and the answer
    f_qa_relevance = Feedback(provider.relevance).on_input_output()

    # question/statement relevance between the question and each context chunk
    f_qs_relevance = (
        Feedback(provider.qs_relevance)
        .on_input()
        .on(context)
        .aggregate(np.mean)
    )

    from trulens_eval import TruLlama

    tru_query_engine_recorder = TruLlama(
        query_engine,
        app_id="LlamaIndex_App1",
        feedbacks=[f_groundedness, f_qa_relevance, f_qs_relevance],
    )

    if False:
        # record a query with the recorder used as a context manager
        with tru_query_engine_recorder as recording:
            query_engine.query(
                "Which of the following is TRUE on the similarity of "
                "Means Testing and Casemix?"
            )
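
    if False:
        # Usage sketch, kept behind the same kind of toggle the script already
        # uses: an assumed follow-up mirroring the trulens_eval quickstart for
        # inspecting recorded results once some queries have been captured.
        records, feedback = tru.get_records_and_feedback(app_ids=["LlamaIndex_App1"])
        print(records.head())
        print(tru.get_leaderboard(app_ids=["LlamaIndex_App1"]))
        tru.run_dashboard()  # serves the local TruLens dashboard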