veerukhannan committed on
Commit 7540753 · verified · 1 Parent(s): 6e6a2b1

Update add_embeddings.py

Files changed (1)
  1. add_embeddings.py +74 -63
add_embeddings.py CHANGED
@@ -13,9 +13,11 @@ class LegalDocumentProcessor:
         print("Initializing Legal Document Processor...")
         self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
         self.model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+        self.max_chunk_size = 500  # Reduced chunk size
+        self.max_context_length = 4000  # Maximum context length for response

-        # Initialize ChromaDB with persistent storage for Hugging Face Spaces
-        self.pdf_dir = "/home/user/app"  # Default path in HF Spaces
+        # Initialize ChromaDB
+        self.pdf_dir = "/home/user/app"
         db_dir = os.path.join(self.pdf_dir, "chroma_db")
         os.makedirs(db_dir, exist_ok=True)

@@ -31,19 +33,45 @@ class LegalDocumentProcessor:
             name="indian_legal_docs",
             metadata={"description": "Indian Criminal Law Documents"}
         )
+
+    def _split_into_chunks(self, text: str) -> List[str]:
+        """Split text into smaller chunks while preserving context"""
+        # Split on meaningful boundaries
+        patterns = [
+            r'(?=Chapter \d+)',
+            r'(?=Section \d+)',
+            r'(?=\n\d+\.\s)',  # Numbered paragraphs
+            r'\n\n'
+        ]

-    def mean_pooling(self, model_output, attention_mask):
-        token_embeddings = model_output[0]
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        # Combine patterns
+        split_pattern = '|'.join(patterns)
+        sections = re.split(split_pattern, text)

-    def get_embedding(self, text: str) -> List[float]:
-        inputs = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
-        with torch.no_grad():
-            model_output = self.model(**inputs)
-        sentence_embeddings = self.mean_pooling(model_output, inputs['attention_mask'])
-        return sentence_embeddings[0].tolist()
+        chunks = []
+        current_chunk = ""

+        for section in sections:
+            section = section.strip()
+            if not section:
+                continue
+
+            # If section is small enough, add to current chunk
+            if len(current_chunk) + len(section) < self.max_chunk_size:
+                current_chunk += " " + section
+            else:
+                # If current chunk is not empty, add it to chunks
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+                # Start new chunk with current section
+                current_chunk = section
+
+        # Add the last chunk if not empty
+        if current_chunk:
+            chunks.append(current_chunk.strip())
+
+        return chunks
+
     def process_pdf(self, pdf_path: str) -> List[str]:
         """Extract text from PDF and split into chunks"""
         print(f"Processing PDF: {pdf_path}")
@@ -51,8 +79,8 @@ class LegalDocumentProcessor:
             reader = PdfReader(pdf_path)
             text = ""
             for page in reader.pages:
-                text += page.extract_text()
-
+                text += page.extract_text() + "\n\n"
+
             chunks = self._split_into_chunks(text)
             print(f"Created {len(chunks)} chunks from {pdf_path}")
             return chunks
@@ -60,45 +88,31 @@ class LegalDocumentProcessor:
             print(f"Error processing PDF {pdf_path}: {str(e)}")
             return []

-    def _split_into_chunks(self, text: str, max_chunk_size: int = 1000) -> List[str]:
-        """Split text into smaller chunks while preserving context"""
-        sections = re.split(r'(Chapter \d+|Section \d+|\n\n)', text)
-
-        chunks = []
-        current_chunk = ""
+    def get_embedding(self, text: str) -> List[float]:
+        """Generate embedding for text"""
+        inputs = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
+        with torch.no_grad():
+            model_output = self.model(**inputs)

-        for section in sections:
-            if len(current_chunk) + len(section) < max_chunk_size:
-                current_chunk += section
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk.strip())
-                current_chunk = section
-
-        if current_chunk:
-            chunks.append(current_chunk.strip())
-
-        return chunks
+        # Mean pooling
+        token_embeddings = model_output[0]
+        attention_mask = inputs['attention_mask']
+        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        sum_embeddings = torch.sum(token_embeddings * mask, 1)
+        sum_mask = torch.clamp(mask.sum(1), min=1e-9)
+        return (sum_embeddings / sum_mask).squeeze().tolist()

     def process_and_store_documents(self):
         """Process all legal documents and store in ChromaDB"""
         print("Starting document processing...")
-        print(f"Looking for PDFs in: {self.pdf_dir}")
-        print(f"Directory contents: {os.listdir(self.pdf_dir)}")

-        # Define the expected PDF paths for Hugging Face Spaces
+        # Define the expected PDF paths
         pdf_files = {
             'BNS': os.path.join(self.pdf_dir, 'BNS.pdf'),
             'BNSS': os.path.join(self.pdf_dir, 'BNSS.pdf'),
             'BSA': os.path.join(self.pdf_dir, 'BSA.pdf')
         }

-        # Verify files exist
-        for law_code, pdf_path in pdf_files.items():
-            if not os.path.exists(pdf_path):
-                print(f"Warning: {pdf_path} not found")
-
-        # Process each PDF
         for law_code, pdf_path in pdf_files.items():
             if os.path.exists(pdf_path):
                 print(f"Processing {law_code} from {pdf_path}")
@@ -124,13 +138,7 @@ class LegalDocumentProcessor:
                         )
                     except Exception as e:
                         print(f"Error processing chunk {i} from {law_code}: {str(e)}")
-
-                print(f"Completed processing {law_code}")
-            else:
-                print(f"Skipping {law_code} - PDF not found")
-
-        print("Document processing completed")
-
+
     def search_documents(self, query: str, n_results: int = 3) -> Dict:
         """Search for relevant legal information"""
         try:
@@ -140,25 +148,28 @@ class LegalDocumentProcessor:
                 n_results=n_results
             )

+            # Limit context size
+            documents = results["documents"][0]
+            total_length = 0
+            filtered_documents = []
+            filtered_metadatas = []
+
+            for doc, metadata in zip(documents, results["metadatas"][0]):
+                doc_length = len(doc)
+                if total_length + doc_length <= self.max_context_length:
+                    filtered_documents.append(doc)
+                    filtered_metadatas.append(metadata)
+                    total_length += doc_length
+                else:
+                    break
+
             return {
-                "documents": results["documents"][0],
-                "metadatas": results["metadatas"][0]
+                "documents": filtered_documents,
+                "metadatas": filtered_metadatas
             }
         except Exception as e:
             print(f"Error during search: {str(e)}")
             return {
                 "documents": ["Sorry, I couldn't search the documents effectively."],
                 "metadatas": [{"law_code": "ERROR", "source": "error"}]
-            }
-
-if __name__ == "__main__":
-    processor = LegalDocumentProcessor()
-    processor.process_and_store_documents()
-
-    test_query = "What are the provisions for digital evidence?"
-    results = processor.search_documents(test_query)
-    print(f"Query: {test_query}")
-    print("\nResults:")
-    for doc, metadata in zip(results["documents"], results["metadatas"]):
-        print(f"\nFrom {metadata['source']}:")
-        print(doc[:200] + "...")
+            }
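
Note: the commit also drops the if __name__ == "__main__" smoke test that previously sat at the end of add_embeddings.py. A minimal sketch of an equivalent manual check, assuming the module is importable as add_embeddings and the BNS/BNSS/BSA PDFs are present under /home/user/app, would be:

# Hypothetical smoke test mirroring the removed __main__ block; not part of this commit.
from add_embeddings import LegalDocumentProcessor

processor = LegalDocumentProcessor()
processor.process_and_store_documents()

test_query = "What are the provisions for digital evidence?"
results = processor.search_documents(test_query)
print(f"Query: {test_query}")
for doc, metadata in zip(results["documents"], results["metadatas"]):
    print(f"\nFrom {metadata['source']}:")
    print(doc[:200] + "...")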