Tora-Agent / tools.py
sivan22's picture
Update tools.py
9681aac verified
from langchain_core.tools import tool
from sefaria import get_text as sefaria_get_text, get_commentaries as sefaria_get_commentaries
from tantivy_search import TantivySearch
from typing import Optional
from pydantic import BaseModel, Field
import os
import gdown
import zipfile
from app import INDEX_PATH
# Argument schema for the `read_text` tool: a single citation string.
# Documented with comments rather than a class docstring so that the
# generated schema stays exactly as it was.
class ReadTextArgs(BaseModel):
    # The citation to look up; the description text (including its examples)
    # is runtime data and must not be edited casually.
    reference: str = Field(description="The reference to retrieve the text for. examples: 讘专讗砖讬转 讗 驻专拽 讗; 砖讜诇讞谉 注专讜讱 讞讜砖谉 诪砖驻讟 住讬诪谉 砖诪讟 住注讬祝 讗")
# Argument schema for the `search` tool.
# The `query` description doubles as the query-syntax guide shown to the
# caller of the tool, so its wording is runtime data. The only change made
# here is balancing the parentheses in the "Mixed" example, which previously
# had a stray closing parenthesis and was therefore not a valid query.
# Documented with comments rather than a class docstring so that nothing
# extra is added to the generated schema.
class SearchArgs(BaseModel):
    # Query string in the search engine's query language (see guide below).
    query: str = Field(description="""the query for the search.
Instructions for generating a query:
1. Boolean Operators:
- AND: term1 AND term2 (both required)
- OR: term1 OR term2 (either term)
- Multiple words default to OR operation (cloud network = cloud OR network)
- AND takes precedence over OR
- Example: Shabath AND (walk OR go)
2. Field-specific Terms:
- Field-specific terms: field:term
- Example: text:讗讚诐 AND reference:讘专讗砖讬转
- available fields: text, reference, topics
- text contains the text of the document
- reference contains the citation of the document, e.g. 讘专讗砖讬转, 驻专拽 讗
- topics contains the topics of the document. available topics includes: 转谞讱, 讛诇讻讛, 诪讚专砖, etc.
3. Required/Excluded Terms:
- Required (+): +term (must contain)
- Excluded (-): -term (must not contain)
- Example: +security cloud -deprecated
- Equivalent to: security AND cloud AND NOT deprecated
4. Phrase Search:
- Use quotes: "exact phrase"
- Both single/double quotes work
- Escape quotes with \\"
- Slop operator: "term1 term2"~N
- Example: "cloud security"~2
- the above will find "cloud framework and security "
- Prefix matching: "start of phrase"*
5. Wildcards:
- ? for single character
- * for any number of characters
- Example: sec?rity cloud*
6. Special Features:
- All docs: *
- Boost terms: term^2.0 (positive numbers only)
- Example: security^2.0 cloud
- the above will boost security by 2.0
Query Examples:
1. Basic: +砖讘转 +讞讜诇讛 +讗住讜专
2. Field-specific: text:住讬谞讬 AND topics:转谞讱
3. Phrase with slop: "security framework"~2
4. Complex: +reference:讘专讗砖讬转 +text:"讛讘诇"^2.0 +(讚诪讬 OR 讚诪讬诐) -讛讘诇讬诐
6. Mixed: ((text:"专讘谞讜 诪砖讛"^2.0 OR reference:"诪砖谞讛 转讜专讛") AND topics:讛诇讻讛) AND text:"转讜专讛 讛诪诇讱"~3 AND NOT topics:诪讚专砖
Tips:
- Group complex expressions with parentheses
- Use quotes for exact phrases
- Add + for required terms, - for excluded terms
- Boost important terms with ^N
- use field-specific terms for better results.
- the corpus to search in is an ancient Hebrew corpus: Tora and Talmud. so Try to use ancient Hebrew terms and or Talmudic expressions and prevent modern words that are not common in talmudic texts
""")
    # Upper bound on the number of hits to return; defaults to 10.
    num_results: int = Field(description="the maximum number of results to return. Default: 10", default=10)
# Location of the on-disk search index, as configured by the app module.
index_path = INDEX_PATH
# Google Drive file id of the zipped index; GDRIVE_INDEX_ID overrides the default.
gdrive_index_id = os.getenv("GDRIVE_INDEX_ID", "1lpbBCPimwcNfC0VZOlQueA4SHNGIp5_t")

# When the index is missing, download the archive and unpack it into the
# current working directory.
if not os.path.exists(index_path):
    zip_path = "index.zip"
    try:
        url = f"https://drive.google.com/uc?id={gdrive_index_id}"
        downloaded = gdown.download(url, zip_path, quiet=False)
        if downloaded is None:
            # Some gdown releases signal a failed download by returning None
            # instead of raising; surface that explicitly rather than letting
            # the zip step fail with a less helpful error.
            raise FileNotFoundError(f"download did not produce {zip_path}")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")
    except Exception as e:
        # Same exception type and message as before; `from e` keeps the
        # original traceback attached as the explicit cause.
        raise Exception(f"failed to download index: {e}") from e
    finally:
        # Remove the archive on both success and failure so a partial or
        # corrupt download is not left behind.
        if os.path.exists(zip_path):
            os.remove(zip_path)

# Open the index and validate it once at import time; the tools below share
# this module-level `tantivy` instance.
try:
    tantivy = TantivySearch(index_path)
    tantivy.validate_index()
except Exception as e:
    raise Exception(f"failed to create index: {e}") from e
@tool(args_schema=SearchArgs)
def search(query: str, num_results: int = 10):
    """Searches the index for the given query."""
    # The docstring above is left as-is on purpose: it is the text attached
    # to the tool. Each hit is reduced to its 'text' and 'reference' entries,
    # with 'N/A' standing in for a missing key.
    hits = tantivy.search(query, num_results)
    return [
        {
            'text': hit.get('text', 'N/A'),
            'reference': hit.get('reference', 'N/A'),
        }
        for hit in hits
    ]
@tool(args_schema=ReadTextArgs)
def read_text(reference: str) -> dict:
    """Retrieves the text for a given reference.
    """
    # The docstring above is left as-is on purpose: it is the text attached
    # to the tool. The return annotation was corrected from `str` to `dict`
    # to match what is actually returned.
    text = sefaria_get_text(reference)
    # The looked-up value is stringified so 'text' is always a str, and the
    # requested reference is echoed back alongside it.
    return {
        'text': str(text),
        'reference': reference
    }
@tool
def get_commentaries(reference: str, num_results: int = 10) -> dict:
    """Retrieves references to all available commentaries on the given verse."""
    # The docstring above is left as-is on purpose: it is the text attached
    # to the tool. The return annotation was corrected from `str` to `dict`
    # to match what is actually returned.
    # NOTE(review): `num_results` is accepted but not applied to the result;
    # it is kept in the signature so existing callers keep working. Confirm
    # whether a cap was intended before wiring it in.
    commentaries = sefaria_get_commentaries(reference)
    if isinstance(commentaries, list):
        # Coerce each entry so a non-string element cannot make join() raise.
        text = '\n'.join(map(str, commentaries))
    else:
        text = str(commentaries)
    return {
        'text': text,
        'reference': f"Commentaries on {reference}"
    }