Spaces:
Sleeping
Sleeping
File size: 7,303 Bytes
4694efc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
import logging
import os
import sys
import time
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential
# Configure logging: INFO-level basic configuration plus a module-level logger
# that every function in this script logs through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Add src directory to Python path. `src_path` is the directory one level above
# the directory containing this file; it is appended before the project-local
# imports that follow, presumably so those packages resolve from there
# (TODO confirm the package layout).
src_path = Path(__file__).parent.parent
sys.path.append(str(src_path))
# Load environment variables from the `.env` file in `root_dir`, which is two
# levels above the directory containing this file. `root_dir` is also used by
# setup_rag() to locate the input PDF.
root_dir = Path(__file__).parent.parent.parent
env_path = root_dir / ".env"
load_dotenv(env_path)
from embedding.model import EmbeddingModel
from langchain.chat_models import init_chat_model
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from rag.chain import RAGChain
from rag.document_loader import GridCodeLoader
from rag.vectorstore import VectorStore
from ragas import EvaluationDataset, RunConfig, evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import AnswerRelevancy, ContextPrecision, ContextRecall, Faithfulness
from ragas.testset import TestsetGenerator
def setup_rag(embedding_model_type):
    """Build the RAG chain that the evaluation will query.

    Reads ``data/raw/grid_code.pdf`` below ``root_dir``, splits it into
    document chunks, indexes those chunks in a vector store backed by the
    requested embedding model and wraps the resulting store in a ``RAGChain``.

    Args:
        embedding_model_type: Forwarded unchanged as ``model_type`` to
            ``EmbeddingModel``.

    Returns:
        A ``(rag_chain, documents)`` tuple, where ``documents`` is whatever
        ``GridCodeLoader.load_and_split()`` returned.

    Raises:
        FileNotFoundError: If the PDF does not exist at the expected path.
    """
    logger.info("Setting up RAG system...")

    # Fail early with a clear message when the source PDF is missing.
    pdf_path = root_dir / "data" / "raw" / "grid_code.pdf"
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # `pages=17` is passed to the loader as-is; its exact meaning is defined
    # by GridCodeLoader (not visible here).
    documents = GridCodeLoader(str(pdf_path), pages=17).load_and_split()
    logger.info(f"Loaded {len(documents)} document chunks")

    # Index the chunks with the selected embedding model.
    store = VectorStore(EmbeddingModel(model_type=embedding_model_type))
    index = store.create_vectorstore(documents)
    return RAGChain(index), documents
def generate_test_dataset(documents, n_questions=30):
    """Generate synthetic test dataset using RAGAS or load it if it already exists.

    The dataset is cached as a CSV file at
    ``../data/processed/synthetic_test_dataset.csv`` (relative to the current
    working directory). When that file exists it is loaded and returned
    without calling any model.

    Args:
        documents: Documents handed to
            ``TestsetGenerator.generate_with_langchain_docs``.
        n_questions: Requested test set size (``testset_size``).

    Returns:
        A pandas DataFrame holding the test cases, either read from the cached
        CSV or freshly generated.
    """
    dataset_path = "../data/processed/synthetic_test_dataset.csv"

    # Reuse the cached dataset when present: generation is slow and costs
    # model calls.
    if os.path.exists(dataset_path):
        logger.info(f"Loading existing synthetic test dataset from {dataset_path}...")
        return pd.read_csv(dataset_path)

    # Make sure the output directory exists *before* the expensive generation
    # step; otherwise a missing directory would only surface when saving and
    # the generated dataset would be lost.
    os.makedirs(os.path.dirname(dataset_path), exist_ok=True)

    logger.info("Generating synthetic test dataset...")

    # Initialize the rate limiter
    rate_limiter = InMemoryRateLimiter(
        requests_per_second=1,  # Make a request once every 1 second
        check_every_n_seconds=0.1,  # Check every 100 ms to see if allowed to make a request
        max_bucket_size=10,  # Controls the maximum burst size
    )

    # Initialize the chat model with the rate limiter
    model = init_chat_model("gpt-4o", temperature=0, rate_limiter=rate_limiter)

    # Wrap the LangChain model and embeddings for use by the generator
    generator_llm = LangchainLLMWrapper(model)
    generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

    # Create test set generator
    generator = TestsetGenerator(
        llm=generator_llm, embedding_model=generator_embeddings
    )

    # Generate synthetic test dataset
    dataset = generator.generate_with_langchain_docs(
        documents, testset_size=n_questions
    )

    # Persist as CSV so later runs take the cached path above
    df = dataset.to_pandas()
    df.to_csv(dataset_path, index=False)
    logger.info(
        f"Generated synthetic dataset with {len(df)} test cases and saved to '{dataset_path}'."
    )
    return df
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def get_rag_response(rag_chain, question):
    """Query the RAG chain for a single question, with retry logic.

    The call is wrapped by tenacity's ``retry``: a raised exception triggers
    another attempt, up to five attempts in total, with an exponential
    back-off wait configured with ``min=4`` and ``max=60``.

    Args:
        rag_chain: Object exposing ``invoke(question)``.
        question: The question passed to ``rag_chain.invoke``.

    Returns:
        Whatever ``rag_chain.invoke(question)`` returns.
    """
    answer = rag_chain.invoke(question)
    return answer
def evaluate_rag_system(rag_chain, test_dataset):
    """Evaluate RAG system using RAGAS metrics.

    Each row of ``test_dataset`` is sent through the RAG chain (with a fixed
    3-second pause before every request), the responses are collected into a
    RAGAS ``EvaluationDataset`` and scored with Faithfulness, AnswerRelevancy,
    ContextRecall and ContextPrecision using ``gpt-4o`` as the judge model.

    Args:
        rag_chain: Chain passed to ``get_rag_response``; its response is
            indexed with ``"answer"`` and ``"context"``, and each context
            item must expose ``page_content``.
        test_dataset: pandas DataFrame with ``user_input`` and ``reference``
            columns.

    Returns:
        The object returned by ``ragas.evaluate``.

    Raises:
        ValueError: If ``test_dataset`` contains no rows.
    """
    logger.info("Starting RAGAS evaluation...")

    # Get RAG responses for each question
    eval_data = []
    for _, row in test_dataset.iterrows():
        # Throttle calls to the chain: wait 3 seconds before each request
        time.sleep(3)
        question = row["user_input"]
        response = get_rag_response(rag_chain, question)
        eval_data.append(
            {
                "user_input": question,
                "response": response["answer"],
                "retrieved_contexts": [doc.page_content for doc in response["context"]],
                # Same value as `reference`, kept under a second column name as
                # in the original code.
                # NOTE(review): confirm EvaluationDataset still needs this column.
                "ground_truth": row["reference"],
                "reference": row["reference"],  # Used by the reference-based metrics
            }
        )
        logger.info(f"Processed question: {question[:50]}...")

    # Without this guard an empty dataset fails below with an opaque
    # IndexError from `eval_df.iloc[0]`.
    if not eval_data:
        raise ValueError("Test dataset is empty; nothing to evaluate.")

    # Convert to pandas then to EvaluationDataset
    eval_df = pd.DataFrame(eval_data)
    logger.info("Sample evaluation data:")
    logger.info(eval_df.iloc[0].to_dict())
    eval_dataset = EvaluationDataset.from_pandas(eval_df)

    # Initialize RAGAS evaluator
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
    custom_run_config = RunConfig(timeout=360, max_workers=32)

    # Run evaluation
    results = evaluate(
        eval_dataset,
        metrics=[
            Faithfulness(),  # Is the response factually consistent with the retrieved contexts?
            AnswerRelevancy(),  # How relevant is the response to the user's question?
            ContextRecall(),  # How much of the reference is covered by the retrieved contexts?
            ContextPrecision(),  # Are the relevant retrieved contexts ranked ahead of irrelevant ones?
        ],
        llm=evaluator_llm,
        run_config=custom_run_config,
    )
    return results
def run_evaluation_with_model(rag_chain, test_dataset, embedding_model_type):
    """Evaluate one RAG chain and write its results to a CSV file.

    Runs ``evaluate_rag_system``, logs the result and saves it as
    ``evaluation_results_<embedding_model_type>.csv`` inside
    ``../data/processed/`` (relative to the current working directory),
    creating that directory when needed.

    Args:
        rag_chain: Chain to evaluate.
        test_dataset: Test cases forwarded to ``evaluate_rag_system``.
        embedding_model_type: Label used in log messages and in the output
            file name.
    """
    logger.info(f"Running evaluation with {embedding_model_type} embeddings...")

    scores = evaluate_rag_system(rag_chain, test_dataset)
    logger.info(f"Evaluation Results for {embedding_model_type}:")
    logger.info(scores)

    # Ensure the output directory exists before writing.
    output_dir = Path("../data/processed/")
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the result object is wrapped in a one-element list to form
    # the DataFrame; confirm this produces the intended columns for the type
    # returned by ragas.evaluate.
    frame = pd.DataFrame([scores])
    csv_file = output_dir / f"evaluation_results_{embedding_model_type}.csv"
    frame.to_csv(csv_file, index=False)
    logger.info(
        f"Saved evaluation results to evaluation_results_{embedding_model_type}.csv"
    )
def main():
    """Run the end-to-end RAG evaluation.

    Builds the RAG system with the ``"finetuned"`` embedding model, obtains
    the synthetic test dataset (generated or loaded from cache) and evaluates
    the chain against it. Any exception is logged and then re-raised.
    """
    logger.info("Starting RAG evaluation")
    model_type = "finetuned"
    try:
        # Build the chain on top of the fine-tuned embedding model
        chain, docs = setup_rag(model_type)

        # Synthetic questions / references to evaluate against
        test_dataset = generate_test_dataset(docs)

        # Only the fine-tuned model is evaluated at the moment
        run_evaluation_with_model(chain, test_dataset, model_type)

        # Comparison run with the OpenAI embedding model (currently disabled):
        # rag_chain_openai, _ = setup_rag("openai")
        # run_evaluation_with_model(rag_chain_openai, test_dataset, "openai")
    except Exception as exc:
        # Log for the record, then propagate so the process exits with an error
        logger.error(f"Evaluation failed: {str(exc)}")
        raise


if __name__ == "__main__":
    main()
|