vprzybylo committed
Commit 4694efc · 0 Parent(s)

first commit in new repo

.gitignore ADDED
@@ -0,0 +1,10 @@
+ .env
+ __pycache__/
+ *.pyc
+ .DS_Store
+ data/processed/
+ *.csv
+ # Ignore other PDFs and binary files except grid_code.pdf
+ *.pdf
+ !data/raw/grid_code.pdf
+ *.docx
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ # Copy only the midterm project files
+ COPY --chown=user requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Copy the midterm directory contents
+ COPY --chown=user midterm /app
+
+ # Set environment variable for Python path
+ ENV PYTHONPATH=/app/src
+
+ # Run streamlit on port 7860 for Hugging Face Spaces
+ CMD ["streamlit", "run", "src/ui/app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
app.py ADDED
@@ -0,0 +1,11 @@
+ import sys
+ from pathlib import Path
+
+ # Add src directory to Python path
+ src_path = Path(__file__).parent / "src"
+ sys.path.append(str(src_path))
+
+ from ui.app import main
+
+ if __name__ == "__main__":
+     main()
app/.gitattributes ADDED
@@ -0,0 +1,3 @@
+ data/raw/grid_code.pdf filter=lfs diff=lfs merge=lfs -text
+ *.docx filter=lfs diff=lfs merge=lfs -text
+ *.pdf filter=lfs diff=lfs merge=lfs -text
app/.hf/space ADDED
@@ -0,0 +1,6 @@
+ title: "GridGuide: Field Assistant"
+ emoji: 🌩️
+ colorFrom: blue
+ colorTo: indigo
+ sdk: docker
+ pinned: false
app/data/README.md ADDED
@@ -0,0 +1,28 @@
+ # Data Directory
+
+ This directory contains the Grid Code documentation and processed data.
+
+ ## Structure
+
+ - `raw/` - Contains the original Grid Code PDF
+ - `processed/` - Contains processed chunks and embeddings
+ - `test/` - Contains test data and evaluation sets
+
+ ## Grid Code PDF
+
+ Place the Grid Code PDF file in the `raw/` directory with the filename `grid_code.pdf`.
+
+ ## Processing
+
+ The data processing pipeline (sketched in code after this file):
+ 1. Loads the PDF from `raw/`
+ 2. Splits it into chunks
+ 3. Generates embeddings
+ 4. Stores the processed data
+
+ ## Test Data
+
+ The test directory contains:
+ - Sample questions and answers
+ - Evaluation datasets
+ - Test PDF segments
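The pipeline above maps onto classes added elsewhere in this commit. A minimal end-to-end sketch, assuming `PYTHONPATH` includes `app/src` (as the Dockerfile sets), that `OPENAI_API_KEY` is set, and that `data/raw/grid_code.pdf` exists; the in-memory Qdrant store comes from `rag/vectorstore.py`:

```python
# Sketch of the four README steps, wired from classes in this commit.
from data.pdf_loader import GridCodePDFLoader
from embedding.model import EmbeddingModel
from rag.vectorstore import VectorStore

chunks = GridCodePDFLoader("data/raw/grid_code.pdf").load_and_split()  # steps 1-2: load PDF, split into chunks
store = VectorStore(EmbeddingModel(model_type="openai"))               # step 3: embedding model
vectorstore = store.create_vectorstore(chunks)                         # step 4: store embeddings (in-memory Qdrant)
```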
app/data/processed/qdrant/.lock ADDED
@@ -0,0 +1 @@
+ tmp lock file
app/data/processed/qdrant/meta.json ADDED
@@ -0,0 +1 @@
+ {"collections": {"grid_code": {"vectors": {"size": 1536, "distance": "Cosine", "hnsw_config": null, "quantization_config": null, "on_disk": null, "datatype": null, "multivector_config": null}, "shard_number": null, "sharding_method": null, "replication_factor": null, "write_consistency_factor": null, "on_disk_payload": null, "hnsw_config": null, "wal_config": null, "optimizers_config": null, "init_from": null, "quantization_config": null, "sparse_vectors": null, "strict_mode_config": null}}, "aliases": {}}
app/requirements/dev.txt ADDED
@@ -0,0 +1,13 @@
+ langchain-community==0.3.14
+ langchain-openai==0.2.14
+ langchain-qdrant>=0.2.0
+ openai>=1.6.1
+ qdrant-client>=1.6.4
+ ragas==0.2.10
+ streamlit==1.29.0
+ python-dotenv==1.0.0
+ pypdf==3.17.1
+ rich>=13.7.0
+ rapidfuzz>=3.6.1
+ tenacity>=8.2.3
+ sentence-transformers==3.4.1
app/requirements/prod.txt ADDED
@@ -0,0 +1 @@
+ -r dev.txt
app/src/data/__init__.py ADDED
@@ -0,0 +1 @@
+ # Data handling utilities
app/src/data/pdf_loader.py ADDED
@@ -0,0 +1,22 @@
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ class GridCodePDFLoader:
+     def __init__(self, pdf_path):
+         self.pdf_path = pdf_path
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=500,
+             chunk_overlap=50,
+             separators=["\n\n", "\n", ".", " ", ""]
+         )
+
+     def load_and_split(self):
+         """Load the PDF and split it into chunks."""
+         loader = PyPDFLoader(self.pdf_path)
+         pages = loader.load()
+         return self.text_splitter.split_documents(pages)
+
+     def extract_metadata(self):
+         """Extract metadata from the PDF (sections, tables, etc.)."""
+         # TODO: Implement metadata extraction
+         pass
app/src/data/processed/qdrant/.lock ADDED
@@ -0,0 +1 @@
+ tmp lock file
app/src/data/processed/qdrant/meta.json ADDED
@@ -0,0 +1 @@
+ {"collections": {}, "aliases": {}}
app/src/embedding/fine_tune.py ADDED
@@ -0,0 +1,50 @@
+ # fine_tune.py
+
+ import pandas as pd
+ import logging
+ from sentence_transformers import InputExample, SentenceTransformer, losses
+ from torch.utils.data import DataLoader
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ # Load the embedding model
+ model_id = "Snowflake/snowflake-arctic-embed-l"
+ logging.info(f"Loading model: {model_id}")
+ model = SentenceTransformer(model_id)
+
+ def load_synthetic_dataset():
+     logging.info("Loading synthetic dataset...")
+     df = pd.read_csv("../data/processed/synthetic_test_dataset.csv")
+     # Convert to the format expected by the model
+     examples = []
+     for _, row in df.iterrows():
+         examples.append(
+             InputExample(texts=[row["user_input"], row["reference"]], label=1)
+         )  # Assuming label 1 for positive pairs
+     logging.info(f"Loaded {len(examples)} examples.")
+     return examples
+
+ train_examples = load_synthetic_dataset()
+ train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
+
+ # Define the loss function
+ inner_train_loss = losses.MultipleNegativesRankingLoss(model)
+ train_loss = losses.MatryoshkaLoss(
+     model, inner_train_loss, matryoshka_dims=[768, 512, 256, 128, 64]
+ )
+
+ EPOCHS = 1
+ warmup_steps = int(len(train_dataloader) * EPOCHS * 0.1)
+
+ # Fine-tune the model
+ logging.info("Starting model training...")
+ model.fit(
+     train_objectives=[(train_dataloader, train_loss)],
+     epochs=EPOCHS,
+     warmup_steps=warmup_steps,
+     output_path="data/processed/finetuned_arctic_ft",
+     show_progress_bar=True,
+ )
+
+ logging.info("Model training completed.")
app/src/embedding/model.py ADDED
@@ -0,0 +1,36 @@
+ from pathlib import Path
+
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_openai import OpenAIEmbeddings
+
+
+ class EmbeddingModel:
+     def __init__(self, model_type="openai"):
+         self.model_type = model_type
+         self.model = self._initialize_model()
+
+     def _initialize_model(self):
+         if self.model_type == "openai":
+             return OpenAIEmbeddings(model="text-embedding-3-small")
+         elif self.model_type == "finetuned":
+             model_path = (
+                 Path(__file__).parent.parent.parent
+                 / "data"
+                 / "processed"
+                 / "finetuned_arctic_ft_repo"
+             )
+             return HuggingFaceEmbeddings(
+                 model_name=str(model_path),
+                 model_kwargs={"device": "cpu"},
+                 encode_kwargs={"normalize_embeddings": True},
+             )
+         else:
+             raise ValueError(f"Unsupported model type: {self.model_type}")
+
+     def embed_documents(self, texts):
+         """Embed a list of texts."""
+         return self.model.embed_documents(texts)
+
+     def embed_query(self, text):
+         """Embed a single text."""
+         return self.model.embed_query(text)
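A short usage sketch for `EmbeddingModel`; it assumes `OPENAI_API_KEY` is set for the "openai" variant and that the fine-tuned model directory pushed by `save_to_hf.py` exists locally for the "finetuned" one:

```python
from embedding.model import EmbeddingModel

# OpenAI variant: text-embedding-3-small returns 1536-dimensional vectors,
# matching the collection size recorded in data/processed/qdrant/meta.json.
embedder = EmbeddingModel(model_type="openai")
query_vec = embedder.embed_query("What is the Grid Code?")
doc_vecs = embedder.embed_documents(["chunk one", "chunk two"])

# Local fine-tuned variant (requires data/processed/finetuned_arctic_ft_repo on disk)
local_embedder = EmbeddingModel(model_type="finetuned")
```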
app/src/embedding/save_to_hf.py ADDED
@@ -0,0 +1,51 @@
+ # save_to_hf.py
+
+ import logging
+ import os
+
+ from huggingface_hub import Repository
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+ )
+
+
+ def push_model_to_huggingface(model_dir, model_name, hf_username):
+     """Push the model to Hugging Face Hub using the Repository class."""
+     try:
+         # Create a new directory for the repository
+         repo_dir = f"./{model_name}_repo"  # Specify a new directory
+         os.makedirs(repo_dir, exist_ok=True)
+
+         # Initialize the repository
+         repo_id = f"{hf_username}/{model_name}"
+         repo = Repository(local_dir=repo_dir, clone_from=repo_id)
+
+         # Move model files into the new repository directory
+         for filename in os.listdir(model_dir):
+             full_file_name = os.path.join(model_dir, filename)
+             if os.path.isfile(full_file_name):
+                 os.rename(full_file_name, os.path.join(repo_dir, filename))
+
+         # Add model files to the repository
+         repo.git_add()
+         repo.git_commit("Add fine-tuned embedding model")
+         repo.git_push()
+
+         logging.info(f"Model pushed to Hugging Face Hub: {repo_id}")
+
+     except Exception as e:
+         logging.error(f"Failed to push model to Hugging Face Hub: {str(e)}")
+
+
+ if __name__ == "__main__":
+     # Define parameters
+     model_directory = (
+         "src/data/processed/finetuned_arctic_ft"  # Directory where the model is saved
+     )
+     model_name = "finetuned_arctic_ft"  # Name for the model on Hugging Face
+     hf_username = "vanessaprzybylo"  # Replace with your Hugging Face username
+
+     # Push the model to Hugging Face
+     push_model_to_huggingface(model_directory, model_name, hf_username)
app/src/evaluation/evaluate_rag.py ADDED
@@ -0,0 +1,208 @@
+ import logging
+ import os
+ import sys
+ import time
+ from pathlib import Path
+
+ import pandas as pd
+ from dotenv import load_dotenv
+ from tenacity import retry, stop_after_attempt, wait_exponential
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Add src directory to Python path
+ src_path = Path(__file__).parent.parent
+ sys.path.append(str(src_path))
+
+ # Load environment variables
+ root_dir = Path(__file__).parent.parent.parent
+ env_path = root_dir / ".env"
+ load_dotenv(env_path)
+
+ # These imports rely on the path and environment configured above
+ from embedding.model import EmbeddingModel
+ from langchain.chat_models import init_chat_model
+ from langchain_core.rate_limiters import InMemoryRateLimiter
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+ from rag.chain import RAGChain
+ from rag.document_loader import GridCodeLoader
+ from rag.vectorstore import VectorStore
+ from ragas import EvaluationDataset, RunConfig, evaluate
+ from ragas.embeddings import LangchainEmbeddingsWrapper
+ from ragas.llms import LangchainLLMWrapper
+ from ragas.metrics import AnswerRelevancy, ContextPrecision, ContextRecall, Faithfulness
+ from ragas.testset import TestsetGenerator
+
+
+ def setup_rag(embedding_model_type):
+     """Initialize RAG system for evaluation with the specified embedding model."""
+     logger.info("Setting up RAG system...")
+
+     # Load documents
+     data_path = root_dir / "data" / "raw" / "grid_code.pdf"
+     if not data_path.exists():
+         raise FileNotFoundError(f"PDF not found: {data_path}")
+
+     loader = GridCodeLoader(str(data_path), pages=17)
+     documents = loader.load_and_split()
+     logger.info(f"Loaded {len(documents)} document chunks")
+
+     # Initialize embedding model and vectorstore
+     embedding_model = EmbeddingModel(model_type=embedding_model_type)
+     vectorstore = VectorStore(embedding_model)
+     vectorstore = vectorstore.create_vectorstore(documents)
+
+     return RAGChain(vectorstore), documents
+
+
+ def generate_test_dataset(documents, n_questions=30):
+     """Generate a synthetic test dataset using RAGAS, or load it if it already exists."""
+     dataset_path = "../data/processed/synthetic_test_dataset.csv"
+
+     # Check if the dataset already exists
+     if os.path.exists(dataset_path):
+         logger.info(f"Loading existing synthetic test dataset from {dataset_path}...")
+         return pd.read_csv(dataset_path)
+
+     logger.info("Generating synthetic test dataset...")
+
+     # Initialize the rate limiter
+     rate_limiter = InMemoryRateLimiter(
+         requests_per_second=1,  # Make a request once every second
+         check_every_n_seconds=0.1,  # Check every 100 ms whether a request is allowed
+         max_bucket_size=10,  # Controls the maximum burst size
+     )
+
+     # Initialize the chat model with the rate limiter
+     model = init_chat_model("gpt-4o", temperature=0, rate_limiter=rate_limiter)
+
+     # Initialize generator models
+     generator_llm = LangchainLLMWrapper(model)
+     generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
+
+     # Create test set generator
+     generator = TestsetGenerator(
+         llm=generator_llm, embedding_model=generator_embeddings
+     )
+
+     # Generate synthetic test dataset
+     dataset = generator.generate_with_langchain_docs(
+         documents, testset_size=n_questions
+     )
+
+     df = dataset.to_pandas()
+     df.to_csv(dataset_path, index=False)  # Save as CSV
+     logger.info(
+         f"Generated synthetic dataset with {len(df)} test cases and saved to '{dataset_path}'."
+     )
+     return df
+
+
+ @retry(wait=wait_exponential(multiplier=1, min=4, max=60), stop=stop_after_attempt(5))
+ def get_rag_response(rag_chain, question):
+     """Get RAG response with retry logic."""
+     return rag_chain.invoke(question)
+
+
+ def evaluate_rag_system(rag_chain, test_dataset):
+     """Evaluate the RAG system using RAGAS metrics."""
+     logger.info("Starting RAGAS evaluation...")
+
+     # Get RAG responses for each question
+     eval_data = []
+
+     # Iterate through DataFrame rows
+     for _, row in test_dataset.iterrows():
+         # Add delay between requests
+         time.sleep(3)  # Wait 3 seconds between requests
+         response = get_rag_response(rag_chain, row["user_input"])
+         eval_data.append(
+             {
+                 "user_input": row["user_input"],
+                 "response": response["answer"],
+                 "retrieved_contexts": [doc.page_content for doc in response["context"]],
+                 "ground_truth": row["reference"],  # Keep for faithfulness
+                 "reference": row["reference"],  # Keep for context_recall
+             }
+         )
+         logger.info(f"Processed question: {row['user_input'][:50]}...")
+
+     # Convert to pandas then to EvaluationDataset
+     eval_df = pd.DataFrame(eval_data)
+     logger.info("Sample evaluation data:")
+     logger.info(eval_df.iloc[0].to_dict())
+     eval_dataset = EvaluationDataset.from_pandas(eval_df)
+
+     # Initialize RAGAS evaluator
+     evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
+
+     custom_run_config = RunConfig(timeout=360, max_workers=32)
+
+     # Run evaluation
+     results = evaluate(
+         eval_dataset,
+         metrics=[
+             Faithfulness(),  # Measures how well the answer is grounded in the retrieved contexts.
+             AnswerRelevancy(),  # Assesses the relevance of the answer to the user's question.
+             ContextRecall(),  # Measures how much of the reference answer is covered by the retrieved contexts.
+             ContextPrecision(),  # Measures whether the contexts relevant to the question are ranked highly.
+         ],
+         llm=evaluator_llm,
+         run_config=custom_run_config,
+     )
+
+     return results
+
+
+ def run_evaluation_with_model(rag_chain, test_dataset, embedding_model_type):
+     """Run evaluation with the specified embedding model type."""
+     logger.info(f"Running evaluation with {embedding_model_type} embeddings...")
+
+     # Run evaluation
+     results = evaluate_rag_system(rag_chain, test_dataset)
+
+     logger.info(f"Evaluation Results for {embedding_model_type}:")
+     logger.info(results)
+
+     # Save results to CSV
+     results_path = Path("../data/processed/")
+     results_path.mkdir(parents=True, exist_ok=True)
+
+     # Convert results to a DataFrame of per-sample scores
+     results_df = results.to_pandas()
+     results_df.to_csv(
+         results_path / f"evaluation_results_{embedding_model_type}.csv", index=False
+     )
+     logger.info(
+         f"Saved evaluation results to evaluation_results_{embedding_model_type}.csv"
+     )
+
+
+ def main():
+     """Main evaluation script."""
+     logger.info("Starting RAG evaluation")
+
+     try:
+         # Setup RAG system with the fine-tuned embedding model
+         rag_chain_finetuned, documents = setup_rag("finetuned")
+
+         # Generate synthetic test dataset
+         test_dataset = generate_test_dataset(documents)
+
+         # Run evaluation with the fine-tuned embeddings
+         run_evaluation_with_model(rag_chain_finetuned, test_dataset, "finetuned")
+
+         # # Setup RAG system with the OpenAI embedding model
+         # rag_chain_openai, _ = setup_rag("openai")
+
+         # # Run evaluation with OpenAI embeddings
+         # run_evaluation_with_model(rag_chain_openai, test_dataset, "openai")
+
+     except Exception as e:
+         logger.error(f"Evaluation failed: {str(e)}")
+         raise
+
+
+ if __name__ == "__main__":
+     main()
app/src/rag/chain.py ADDED
@@ -0,0 +1,30 @@
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import ChatOpenAI
+ from langchain.chains import create_retrieval_chain
+ from langchain.chains.combine_documents import create_stuff_documents_chain
+
+ class RAGChain:
+     def __init__(self, vectorstore):
+         self.vectorstore = vectorstore
+         self.llm = ChatOpenAI(model="gpt-4o")
+         self.chain = self._create_chain()
+
+     def _create_chain(self):
+         prompt = ChatPromptTemplate.from_template("""
+             You are a helpful assistant for field workers in the electricity transmission sector.
+             Answer questions about the Grid Code using the following context.
+             If you're unsure or the context doesn't contain the answer, say so.
+
+             Context: {context}
+             Question: {input}
+         """)
+
+         document_chain = create_stuff_documents_chain(self.llm, prompt)
+         retrieval_chain = create_retrieval_chain(
+             self.vectorstore.as_retriever(),
+             document_chain
+         )
+         return retrieval_chain
+
+     def invoke(self, question):
+         return self.chain.invoke({"input": question})
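A minimal invocation sketch, assuming `vectorstore` is the Qdrant instance returned by `VectorStore.create_vectorstore`; `create_retrieval_chain` returns a dict containing the generated `"answer"` and the retrieved `"context"` documents, which is how `evaluate_rag.py` and the UI consume it (the question string here is purely illustrative):

```python
from rag.chain import RAGChain

rag = RAGChain(vectorstore)
response = rag.invoke("What are the Grid Code's requirements for frequency limits?")
print(response["answer"])        # generated answer
print(len(response["context"]))  # number of retrieved chunks used as context
```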
app/src/rag/document_loader.py ADDED
@@ -0,0 +1,44 @@
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import pypdf
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ class GridCodeLoader:
+     def __init__(self, file_path, pages=None):
+         self.file_path = file_path
+         self.pages = pages
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=2000,
+             chunk_overlap=50,
+             separators=["\n\n", "\n", ".", " ", ""]
+         )
+
+     def load_and_split(self):
+         logger.info(f"Loading PDF from {self.file_path}")
+         # Open the PDF directly first to get the total page count
+         reader = pypdf.PdfReader(self.file_path)
+         total_pages = len(reader.pages)
+
+         if isinstance(self.pages, int):
+             # Load the first n pages
+             pages_to_load = list(range(min(self.pages, total_pages)))
+             logger.info(f"Selecting first {len(pages_to_load)} pages of the PDF")
+         elif isinstance(self.pages, (list, tuple)):
+             # Load specific pages
+             pages_to_load = [p for p in self.pages if p < total_pages]
+             logger.info(f"Selecting pages {self.pages} of the PDF")
+         else:
+             pages_to_load = list(range(total_pages))
+             logger.info(f"Selecting all {len(pages_to_load)} pages of the PDF")
+
+         # Now use PyPDFLoader and keep only the selected pages
+         loader = PyPDFLoader(self.file_path)
+         documents = loader.load()
+         documents = [doc for i, doc in enumerate(documents) if i in pages_to_load]
+
+         logger.info("Splitting documents into chunks...")
+         chunks = self.text_splitter.split_documents(documents)
+         logger.info(f"Created {len(chunks)} chunks")
+         return chunks
app/src/rag/vectorstore.py ADDED
@@ -0,0 +1,27 @@
+ import logging
+
+ from langchain_community.vectorstores import Qdrant
+
+ logger = logging.getLogger(__name__)
+
+
+ class VectorStore:
+     def __init__(self, embedding_model):
+         self.embedding_model = embedding_model
+         self.collection_name = "grid_code"
+
+     def create_vectorstore(self, documents):
+         """Create vector store."""
+         logger.info("Creating vector store...")
+         vectorstore = Qdrant.from_documents(
+             documents=documents,
+             embedding=self.embedding_model.model,
+             location=":memory:",  # Use in-memory storage
+             collection_name=self.collection_name,
+         )
+         logger.info(f"Created vector store with {len(documents)} chunks")
+         return vectorstore
+
+     def similarity_search(self, query, k=4):
+         raise NotImplementedError("Use the Qdrant vectorstore instance directly")
app/src/ui/app.py ADDED
@@ -0,0 +1,290 @@
+ import logging
+ import os
+ import sys
+ from pathlib import Path
+ from typing import Annotated, TypedDict
+
+ import requests
+ import streamlit as st
+ from langchain.agents import AgentExecutor, create_tool_calling_agent
+ from langchain_core.prompts import (
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+     MessagesPlaceholder,
+     SystemMessagePromptTemplate,
+ )
+ from langchain_core.tools import Tool
+ from langchain_openai import ChatOpenAI
+ from langgraph.graph.message import add_messages
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Add src directory to Python path
+ src_path = Path(__file__).parent.parent
+ sys.path.append(str(src_path))
+
+
+ # Get secrets from Hugging Face Space
+ def get_secrets():
+     """Get secrets from Hugging Face Space or local environment."""
+     if os.environ.get("SPACE_ID"):
+         # We're in a Hugging Face Space
+         return {
+             "OPENAI_API_KEY": st.secrets["OPENAI_API_KEY"],
+             "TAVILY_API_KEY": st.secrets.get("TAVILY_API_KEY"),
+             "LANGCHAIN_API_KEY": st.secrets.get("LANGCHAIN_API_KEY"),
+             "LANGCHAIN_PROJECT": st.secrets.get("LANGCHAIN_PROJECT", "GridGuide"),
+             "LANGCHAIN_TRACING_V2": st.secrets.get("LANGCHAIN_TRACING_V2", "true"),
+         }
+     else:
+         # We're running locally, use environment variables
+         return {
+             "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY"),
+             "TAVILY_API_KEY": os.environ.get("TAVILY_API_KEY"),
+             "LANGCHAIN_API_KEY": os.environ.get("LANGCHAIN_API_KEY"),
+             "LANGCHAIN_PROJECT": os.environ.get("LANGCHAIN_PROJECT", "GridGuide"),
+             "LANGCHAIN_TRACING_V2": os.environ.get("LANGCHAIN_TRACING_V2", "true"),
+         }
+
+
+ # Set up environment variables from secrets
+ secrets = get_secrets()
+ for key, value in secrets.items():
+     if value:
+         os.environ[key] = value
+
+ # Verify API keys
+ if not os.environ.get("OPENAI_API_KEY"):
+     st.error(
+         "OpenAI API key not found. Please set it in the Hugging Face Space secrets."
+     )
+     st.stop()
+
+ from embedding.model import EmbeddingModel
+ from rag.chain import RAGChain
+ from rag.document_loader import GridCodeLoader
+ from rag.vectorstore import VectorStore
+
+
+ class WeatherTool:
+     def __init__(self):
+         self.base_url = "https://api.weather.gov"
+         self.headers = {
+             "User-Agent": "(Grid Code Assistant, [email protected])",
+             "Accept": "application/json",
+         }
+
+     def get_coordinates_from_zip(self, zipcode):
+         response = requests.get(f"https://api.zippopotam.us/us/{zipcode}")
+         if response.status_code == 200:
+             data = response.json()
+             return {
+                 "lat": data["places"][0]["latitude"],
+                 "lon": data["places"][0]["longitude"],
+                 "place": data["places"][0]["place name"],
+                 "state": data["places"][0]["state"],
+             }
+         return None
+
+     def run(self, zipcode):
+         coords = self.get_coordinates_from_zip(zipcode)
+         if not coords:
+             return {"error": "Invalid ZIP code or unable to get coordinates."}
+
+         point_url = f"{self.base_url}/points/{coords['lat']},{coords['lon']}"
+         response = requests.get(point_url, headers=self.headers)
+
+         if response.status_code != 200:
+             return {"error": "Unable to fetch weather data."}
+
+         grid_data = response.json()
+         forecast_url = grid_data["properties"]["forecast"]
+
+         response = requests.get(forecast_url, headers=self.headers)
+         if response.status_code == 200:
+             forecast_data = response.json()["properties"]["periods"]
+             weather_data = {
+                 "type": "weather",
+                 "location": f"{coords['place']}, {coords['state']}",
+                 "current": forecast_data[0],
+                 "forecast": forecast_data[1:4],
+             }
+             # Save to session state
+             st.session_state.weather_data = weather_data
+             return weather_data
+         return {"error": "Unable to fetch forecast data."}
+
+
+ def initialize_rag():
+     """Initialize RAG system."""
+     if "rag_chain" in st.session_state:
+         logger.info("Using cached RAG chain from session state")
+         return st.session_state.rag_chain
+
+     # Use relative path from src directory
+     data_path = Path(__file__).parent.parent.parent / "data" / "raw" / "grid_code.pdf"
+     if not data_path.exists():
+         raise FileNotFoundError(f"PDF not found: {data_path}")
+
+     with st.spinner("Loading Grid Code documents..."):
+         loader = GridCodeLoader(str(data_path), pages=17)
+         documents = loader.load_and_split()
+         logger.info(f"Loaded {len(documents)} document chunks")
+
+     with st.spinner("Creating vector store..."):
+         embedding_model = EmbeddingModel()
+         vectorstore = VectorStore(embedding_model)
+         vectorstore = vectorstore.create_vectorstore(documents)
+         logger.info("Vector store created successfully")
+
+     # Cache the RAG chain in session state
+     rag_chain = RAGChain(vectorstore)
+     st.session_state.rag_chain = rag_chain
+     return rag_chain
+
+
+ class RAGTool:
+     def __init__(self, rag_chain):
+         self.rag_chain = rag_chain
+
+     def run(self, question: str) -> str:
+         """Answer questions using the Grid Code."""
+         response = self.rag_chain.invoke(question)
+         return response["answer"]
+
+
+ class AgentState(TypedDict):
+     """State definition for the agent."""
+
+     messages: Annotated[list, add_messages]
+
+
+ def create_agent_workflow(rag_chain, weather_tool):
+     """Create an agent that can use both RAG and weather tools."""
+
+     # Define the tools
+     tools = [
+         Tool(
+             name="grid_code_query",
+             description="Answer questions about the Grid Code and electrical regulations",
+             func=lambda q: rag_chain.invoke(q)["answer"],
+         ),
+         Tool(
+             name="get_weather",
+             description="Get weather forecast for a ZIP code. Input should be a 5-digit ZIP code.",
+             func=lambda z: weather_tool.run(z),
+         ),
+     ]
+
+     # Initialize the LLM
+     llm = ChatOpenAI(model="gpt-4o", temperature=0)
+
+     # Create the custom prompt
+     prompt = ChatPromptTemplate.from_messages(
+         [
+             SystemMessagePromptTemplate.from_template(
+                 """You are a helpful assistant that specializes in two areas:
+                 1. Answering questions about electrical Grid Code regulations
+                 2. Providing weather information for specific locations
+
+                 For weather queries:
+                 - Extract the ZIP code from the question
+                 - Use the get_weather tool to fetch the forecast
+
+                 For Grid Code questions:
+                 - Use the grid_code_query tool to find relevant information
+                 - If the information isn't in the Grid Code, clearly state that
+                 - Provide specific references when possible
+                 """
+             ),
+             MessagesPlaceholder(variable_name="chat_history", optional=True),
+             HumanMessagePromptTemplate.from_template("{input}"),
+             MessagesPlaceholder(variable_name="agent_scratchpad"),
+         ]
+     )
+
+     # Create the agent
+     agent = create_tool_calling_agent(llm, tools, prompt)
+
+     return AgentExecutor(
+         agent=agent,
+         tools=tools,
+         verbose=True,
+         handle_parsing_errors=True,
+     )
+
+
+ def display_weather(weather_data):
+     """Display weather information in a nice format."""
+     if "error" in weather_data:
+         st.error(weather_data["error"])
+         return
+
+     if weather_data.get("type") == "weather":
+         # Location header
+         st.header(f"Weather for {weather_data['location']}")
+
+         # Current conditions
+         current = weather_data["current"]
+         st.subheader("Current Conditions")
+
+         # Use columns for current weather layout
+         col1, col2 = st.columns(2)
+
+         with col1:
+             # Temperature display with metric
+             st.metric(
+                 "Temperature", f"{current['temperature']}°{current['temperatureUnit']}"
+             )
+             # Wind information
+             st.info(f"💨 Wind: {current['windSpeed']} {current['windDirection']}")
+
+         with col2:
+             # Current forecast
+             st.markdown(f"**🌤️ Conditions:** {current['shortForecast']}")
+             st.markdown(f"**📝 Details:** {current['detailedForecast']}")
+
+         # Extended forecast
+         st.subheader("Extended Forecast")
+         for period in weather_data["forecast"]:
+             with st.expander(f"📅 {period['name']}"):
+                 st.markdown(
+                     f"**🌡️ Temperature:** {period['temperature']}°{period['temperatureUnit']}"
+                 )
+                 st.markdown(
+                     f"**💨 Wind:** {period['windSpeed']} {period['windDirection']}"
+                 )
+                 st.markdown(f"**🌤️ Forecast:** {period['shortForecast']}")
+                 st.markdown(f"**📝 Details:** {period['detailedForecast']}")
+
+
+ def main():
+     st.title("GridGuide: Field Assistant")
+
+     # Initialize if not in session state
+     if "app" not in st.session_state:
+         rag_chain = initialize_rag()
+         weather_tool = WeatherTool()
+         st.session_state.app = create_agent_workflow(rag_chain, weather_tool)
+
+     # Create the input box
+     user_input = st.text_input("Ask about weather or the Grid Code:")
+
+     if user_input:
+         with st.spinner("Processing your request..."):
+             # Invoke the agent executor
+             result = st.session_state.app.invoke({"input": user_input})
+
+             # Check if we have weather data in session state
+             if "weather_data" in st.session_state:
+                 display_weather(st.session_state.weather_data)
+                 # Clear the weather data after displaying
+                 del st.session_state.weather_data
+             else:
+                 st.write(result["output"])
+
+
+ if __name__ == "__main__":
+     main()
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ streamlit>=1.32.0
+ langchain==0.3.19
+ langchain-core==0.3.37
+ langchain-openai==0.3.6
+ langchain-huggingface==0.1.2
+ langchain-community>=0.3.14
+ python-dotenv>=1.0.0
+ requests>=2.31.0
+ langgraph==0.2.74
+ qdrant-client>=1.7.3
+ pypdf>=4.0.1
+ openai==1.64.0
+ typing-extensions>=4.9.0
+ pydantic>=2.6.3
setup.py ADDED
@@ -0,0 +1,7 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name="grid-code-assistant",
+     version="0.1",
+     packages=find_packages(),
+ )