import logging
import os
import re
from typing import Any, Dict, List, Optional

from tantivy import Index


class TantivySearch:
    def __init__(self, index_path: str):
        """Initialize the Tantivy search agent with the index path"""
        self.index_path = index_path
        self.logger = logging.getLogger(__name__)
        try:
            self.index = Index.open(index_path)
            self.logger.info(f"Successfully opened Tantivy index at {index_path}")
        except Exception as e:
            self.logger.error(f"Failed to open Tantivy index: {e}")
            raise

    def get_query_instructions(self) -> str:
        """Return instructions for the LLM on how to parse and construct Tantivy queries"""
        return """
Instructions for generating a query:

1. Boolean Operators:
   - AND: term1 AND term2 (both required)
   - OR: term1 OR term2 (either term)
   - Multiple words default to OR (cloud network = cloud OR network)
   - AND takes precedence over OR
   - Example: Shabath AND (walk OR go)

2. Field-specific Terms:
   - Syntax: field:term
   - Example: text:讗讚诐 AND reference:讘专讗砖讬转
   - Available fields: text, reference, topics
     - text contains the text of the document
     - reference contains the citation of the document, e.g. 讘专讗砖讬转, 驻专拽 讗
     - topics contains the topics of the document. Available topics include: 转谞讱, 讛诇讻讛, 诪讚专砖, etc.

3. Required/Excluded Terms:
   - Required (+): +term (must contain)
   - Excluded (-): -term (must not contain)
   - Example: +security cloud -deprecated
     - Equivalent to: security AND cloud AND NOT deprecated

4. Phrase Search:
   - Use quotes: "exact phrase"
   - Both single and double quotes work
   - Escape quotes with \\"
   - Slop operator: "term1 term2"~N
   - Example: "cloud security"~2
     - The above will also match "cloud framework and security"
   - Prefix matching: "start of phrase"*

5. Wildcards:
   - ? for a single character
   - * for any number of characters
   - Example: sec?rity cloud*

6. Special Features:
   - All docs: *
   - Boost terms: term^2.0 (positive numbers only)
   - Example: security^2.0 cloud
     - The above boosts security by 2.0

Query Examples:
1. Basic: +砖讘转 +讞讜诇讛 +讗住讜专
2. Field-specific: text:住讬谞讬 AND topics:转谞讱
3. Phrase with slop: "security framework"~2
4. Complex: +reference:讘专讗砖讬转 +text:"讛讘诇"^2.0 +(讚诪讬 OR 讚诪讬诐) -讛讘诇讬诐
5. Mixed: ((text:"专讘谞讜 诪砖讛"^2.0 OR reference:"诪砖谞讛 转讜专讛") AND topics:讛诇讻讛) AND text:"转讜专讛 讛诪诇讱"~3 AND NOT topics:诪讚专砖

Tips:
- Group complex expressions with parentheses
- Use quotes for exact phrases
- Add + for required terms, - for excluded terms
- Boost important terms with ^N
- Use field-specific terms for better results
"""

    def search(self, query: str, num_results: int = 10) -> List[Dict[str, Any]]:
        """Search the Tantivy index with the given query using Tantivy's query syntax"""
        try:
            # Create a searcher
            searcher = self.index.searcher()

            # Parse and execute the query
            try:
                # Lenient parsing returns the parsed query plus any parse errors
                parsed_query = self.index.parse_query_lenient(query)
                search_results = searcher.search(parsed_query[0], num_results).hits
            except Exception as query_error:
                self.logger.error(f"Lenient query parsing failed: {query_error}")
                return []

            # Process results
            results = []
            for score, doc_address in search_results:
                doc = searcher.doc(doc_address)
                text = doc.get_first("text") or ""

                # Extract highlighted snippets based on query terms
                # Remove special syntax for highlighting while preserving Hebrew
                highlight_terms = re.sub(
                    r'[:"()[\]{}^~*\\]|\b(AND|OR|NOT|TO|IN)\b|[-+]',
                    ' ',
                    query
                ).strip()
                highlight_terms = [term for term in highlight_terms.split() if len(term) > 1]

                # Create regex pattern for highlighting
                if highlight_terms:
                    # Escape regex special chars but preserve Hebrew
                    patterns = [re.escape(term) for term in highlight_terms]
                    pattern = '|'.join(patterns)

                    # Get surrounding context for matches
                    matches = list(re.finditer(pattern, text, re.IGNORECASE))
                    if matches:
                        highlights = []
                        for match in matches:
                            start = max(0, match.start() - 50)
                            end = min(len(text), match.end() + 50)
                            highlight = text[start:end]
                            if start > 0:
                                highlight = f"...{highlight}"
                            if end < len(text):
                                highlight = f"{highlight}..."
                            highlights.append(highlight)
                    else:
                        # No matches: fall back to the start of the document text
                        highlights = [text[:100] + "..." if len(text) > 100 else text]
                else:
                    highlights = [text[:100] + "..." if len(text) > 100 else text]

                result = {
                    "score": float(score),
                    "title": doc.get_first("title") or os.path.basename(doc.get_first("filePath") or ""),
                    "reference": doc.get_first("reference"),
                    "topics": doc.get_first("topics"),
                    "file_path": doc.get_first("filePath"),
                    "line_number": doc.get_first("segment"),
                    "is_pdf": doc.get_first("isPdf"),
                    "text": text,
                    "highlights": highlights
                }
                results.append(result)

            self.logger.info(f"Found {len(results)} results for query: {query}")
            return results

        except Exception as e:
            self.logger.error(f"Error during search: {str(e)}")
            return []

    def validate_index(self) -> bool:
        """Validate that the index exists and is accessible"""
        try:
            # Try to create a searcher and perform a simple search
            searcher = self.index.searcher()
            parsed_query = self.index.parse_query("*")
            searcher.search(parsed_query, 1)
            return True
        except Exception as e:
            self.logger.error(f"Index validation failed: {e}")
            return False
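

# A minimal usage sketch (not part of the original class): it assumes a Tantivy
# index already exists at the hypothetical path "./index" and uses the fields
# referenced above (text, reference, topics). The query string mirrors the
# syntax described in get_query_instructions().
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    agent = TantivySearch("./index")  # hypothetical index path
    if agent.validate_index():
        for hit in agent.search('text:"security framework"~2 AND topics:讛诇讻讛', num_results=5):
            print(f'{hit["score"]:.2f}  {hit["reference"]}: {hit["highlights"][0]}')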