Spaces:
Running
Running
from langchain_core.tools import tool | |
from sefaria import get_text as sefaria_get_text, get_commentaries as sefaria_get_commentaries | |
from tantivy_search import TantivySearch | |
from typing import Optional | |
from pydantic import BaseModel, Field | |
import os | |
import gdown | |
import zipfile | |
from app import INDEX_PATH | |
# Argument schema describing the single input needed to read a text by citation.
class ReadTextArgs(BaseModel):
    # Citation of the passage to fetch. The examples in the description are
    # Hebrew citations; they are stored as real Hebrew characters (not
    # mis-decoded bytes) so that whoever reads the schema sees usable examples.
    reference: str = Field(
        description="The reference to retrieve the text for. examples: בראשית א פרק א; שולחן ערוך חושן משפט סימן שמט סעיף א"
    )
# Argument schema for an index search: the query string plus a result limit.
class SearchArgs(BaseModel):
    # Query string. The description doubles as a query-syntax guide for whoever
    # (or whatever) composes the query, so every example in it must be a
    # well-formed query: parentheses are balanced and the Hebrew terms are real
    # Hebrew characters rather than mis-decoded bytes.
    query: str = Field(description="""the query for the search.
Instructions for generating a query:
1. Boolean Operators:
- AND: term1 AND term2 (both required)
- OR: term1 OR term2 (either term)
- Multiple words default to OR operation (cloud network = cloud OR network)
- AND takes precedence over OR
- Example: Shabath AND (walk OR go)
2. Field-specific Terms:
- Field-specific terms: field:term
- Example: text:אדם AND reference:בראשית
- available fields: text, reference, topics
- text contains the text of the document
- reference contains the citation of the document, e.g. בראשית, פרק א
- topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.
3. Required/Excluded Terms:
- Required (+): +term (must contain)
- Excluded (-): -term (must not contain)
- Example: +security cloud -deprecated
- Equivalent to: security AND cloud AND NOT deprecated
4. Phrase Search:
- Use quotes: "exact phrase"
- Both single/double quotes work
- Escape quotes with \\"
- Slop operator: "term1 term2"~N
- Example: "cloud security"~2
- the above will find "cloud framework and security "
- Prefix matching: "start of phrase"*
5. Wildcards:
- ? for single character
- * for any number of characters
- Example: sec?rity cloud*
6. Special Features:
- All docs: *
- Boost terms: term^2.0 (positive numbers only)
- Example: security^2.0 cloud
- the above will boost security by 2.0
Query Examples:
1. Basic: +שבת +חולה +אסור
2. Field-specific: text:סיני AND topics:תנך
3. Phrase with slop: "security framework"~2
4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
5. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש
Tips:
- Group complex expressions with parentheses
- Use quotes for exact phrases
- Add + for required terms, - for excluded terms
- Boost important terms with ^N
- use field-specific terms for better results.
- the corpus to search in is an ancient Hebrew corpus: Tora and Talmud. so Try to use ancient Hebrew terms and or Talmudic expressions and prevent modern words that are not common in talmudic texts
""")
    # Upper bound on the number of hits to return; defaults to 10.
    num_results: int = Field(description="the maximum number of results to return. Default: 10", default=10)
# Module-level bootstrap: make sure the search index exists on disk, then open it.
# This runs once, at import time.

# On-disk location of the index, taken from the app module.
index_path = INDEX_PATH
# Google Drive file id of the zipped index; GDRIVE_INDEX_ID overrides the default.
gdrive_index_id = os.getenv("GDRIVE_INDEX_ID", "1lpbBCPimwcNfC0VZOlQueA4SHNGIp5_t")

# Download and unpack the index only when it is not already present.
if not os.path.exists(index_path):
    zip_path = "index.zip"
    url = f"https://drive.google.com/uc?id={gdrive_index_id}"
    try:
        # NOTE(review): some gdown versions appear to report a failed download
        # by returning None instead of raising - confirm against the pinned
        # version. Checking here gives a clear error instead of a confusing
        # "file not found" from zipfile.
        if gdown.download(url, zip_path, quiet=False) is None:
            raise RuntimeError("gdown did not produce an output file")
        # The archive is unpacked into the current working directory.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")
    except Exception as e:
        # Keep the original message text; chain the cause for the traceback.
        raise RuntimeError(f"failed to download index: {e}") from e
    finally:
        # Remove the archive on success AND on failure, so a broken download
        # does not leave a stale index.zip behind.
        if os.path.exists(zip_path):
            os.remove(zip_path)

# Open the index and validate it before any search function can be called.
try:
    tantivy = TantivySearch(index_path)
    tantivy.validate_index()
except Exception as e:
    raise RuntimeError(f"failed to create index: {e}") from e
def search(query: str, num_results: int = 10):
    """Searches the index for the given query.

    Args:
        query: Query string handed unchanged to the module-level index object.
        num_results: Maximum number of hits requested from the index.

    Returns:
        A list with one dict per hit, each holding the hit's 'text' and
        'reference' values; a missing value is reported as 'N/A'.
    """
    hits = tantivy.search(query, num_results)
    # Keep only the two fields callers need, with a placeholder for gaps.
    return [
        {
            'text': hit.get('text', 'N/A'),
            'reference': hit.get('reference', 'N/A'),
        }
        for hit in hits
    ]
def read_text(reference: str) -> dict:
    """Retrieves the text for a given reference.

    Args:
        reference: Citation of the passage to look up; passed unchanged to the
            Sefaria helper.

    Returns:
        A dict with two keys: 'text', the helper's result converted with
        ``str()``, and 'reference', the citation that was requested.
    """
    text = sefaria_get_text(reference)
    return {
        # str() guarantees a string value whatever type the helper returns.
        'text': str(text),
        'reference': reference,
    }
def get_commentaries(reference: str, num_results: int = 10) -> dict:
    """Retrieves references to all available commentaries on the given verse.

    Args:
        reference: Citation of the verse whose commentaries are wanted; passed
            unchanged to the Sefaria helper.
        num_results: Accepted for signature compatibility but not applied; the
            full helper result is always returned.

    Returns:
        A dict with two keys: 'text', the commentaries joined one per line when
        the helper returns a list (otherwise the ``str()`` of its result), and
        'reference', a label of the form "Commentaries on <reference>".
    """
    # NOTE(review): num_results is never used. Confirm whether the result
    # should be capped at num_results or the parameter should be removed.
    commentaries = sefaria_get_commentaries(reference)
    if isinstance(commentaries, list):
        # presumably a list of strings - join() raises TypeError otherwise
        text = '\n'.join(commentaries)
    else:
        text = str(commentaries)
    return {
        'text': text,
        'reference': f"Commentaries on {reference}",
    }