Spaces:

sivan22
/

Tora-Agent

Running

App Files Files Community

Tora-Agent / tools.py

sivan22

Update tools.py

9681aac verified about 2 months ago

raw

history blame contribute delete

4.91 kB

	from langchain_core.tools import tool
	from sefaria import get_text as sefaria_get_text, get_commentaries as sefaria_get_commentaries
	from tantivy_search import TantivySearch
	from typing import Optional
	from pydantic import BaseModel, Field
	import os
	import gdown
	import zipfile

	from app import INDEX_PATH


	class ReadTextArgs(BaseModel):
	    # Argument schema passed to the `read_text` tool via `args_schema`.
	    # It carries a single field: the citation whose text should be fetched.
	    reference: str = Field(
	        description="The reference to retrieve the text for. examples: בראשית א פרק א; שולחן ערוך חושן משפט סימן שמט סעיף א",
	    )

	class SearchArgs(BaseModel):
	    # Argument schema passed to the `search` tool via `args_schema`.
	    # The long `query` description is prompt text: it explains the query
	    # syntax to whoever (or whatever model) fills in the arguments, so its
	    # wording is runtime behavior, not documentation.
	    # Changes from the previous revision: the "Mixed" example had an unmatched
	    # closing parenthesis and the example list skipped number 5.
	    query: str = Field(description="""the query for the search.
	Instructions for generating a query:

	1. Boolean Operators:

	- AND: term1 AND term2 (both required)
	- OR: term1 OR term2 (either term)
	- Multiple words default to OR operation (cloud network = cloud OR network)
	- AND takes precedence over OR
	- Example: Shabath AND (walk OR go)

	2. Field-specific Terms:
	- Field-specific terms: field:term
	- Example: text:אדם AND reference:בראשית
	- available fields: text, reference, topics
	- text contains the text of the document
	- reference contains the citation of the document, e.g. בראשית, פרק א
	- topics contains the topics of the document. available topics includes: תנך, הלכה, מדרש, etc.

	3. Required/Excluded Terms:
	- Required (+): +term (must contain)
	- Excluded (-): -term (must not contain)
	- Example: +security cloud -deprecated
	- Equivalent to: security AND cloud AND NOT deprecated

	4. Phrase Search:
	- Use quotes: "exact phrase"
	- Both single/double quotes work
	- Escape quotes with \\"
	- Slop operator: "term1 term2"~N
	- Example: "cloud security"~2
	- the above will find "cloud framework and security "
	- Prefix matching: "start of phrase"*

	5. Wildcards:
	- ? for single character
	- * for any number of characters
	- Example: sec?rity cloud*

	6. Special Features:
	- All docs: *
	- Boost terms: term^2.0 (positive numbers only)
	- Example: security^2.0 cloud
	- the above will boost security by 2.0

	Query Examples:
	1. Basic: +שבת +חולה +אסור
	2. Field-specific: text:סיני AND topics:תנך
	3. Phrase with slop: "security framework"~2
	4. Complex: +reference:בראשית +text:"הבל"^2.0 +(דמי OR דמים) -הבלים
	5. Mixed: ((text:"רבנו משה"^2.0 OR reference:"משנה תורה") AND topics:הלכה) AND text:"תורה המלך"~3 AND NOT topics:מדרש

	Tips:
	- Group complex expressions with parentheses
	- Use quotes for exact phrases
	- Add + for required terms, - for excluded terms
	- Boost important terms with ^N
	- use field-specific terms for better results.
	- the corpus to search in is an ancient Hebrew corpus: Tora and Talmud. so Try to use ancient Hebrew terms and or Talmudic expressions and prevent modern words that are not common in talmudic texts
	""")
	    # Upper bound on the number of hits; forwarded to the index search call.
	    num_results: int = Field(
	        default=10,
	        description="the maximum number of results to return. Default: 10",
	    )



	# Import-time bootstrap of the search index.
	# If the index directory is missing, an archive is downloaded from Google
	# Drive and unpacked into the current working directory; afterwards the
	# index is opened and validated. Any failure aborts the import with an
	# exception whose message says which stage failed.
	index_path = INDEX_PATH
	# The Drive file id can be overridden through the environment.
	gdrive_index_id = os.getenv("GDRIVE_INDEX_ID", "1lpbBCPimwcNfC0VZOlQueA4SHNGIp5_t")
	if not os.path.exists(index_path):
	    zip_path = "index.zip"
	    url = f"https://drive.google.com/uc?id={gdrive_index_id}"
	    try:
	        gdown.download(url, zip_path, quiet=False)
	        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
	            # NOTE(review): presumably the archive contains the directory
	            # named by INDEX_PATH at its top level - confirm.
	            zip_ref.extractall(".")
	    except Exception as e:
	        # Chain the original error so the real cause stays in the traceback.
	        raise RuntimeError(f"failed to download index: {e}") from e
	    finally:
	        # Remove the archive on failure too, so a partial or corrupt
	        # download is not left behind. The existence check keeps the
	        # cleanup from masking the original error when nothing was written.
	        if os.path.exists(zip_path):
	            os.remove(zip_path)
	try:
	    # Module-level handle used by the `search` tool.
	    tantivy = TantivySearch(index_path)
	    tantivy.validate_index()
	except Exception as e:
	    raise RuntimeError(f"failed to create index: {e}") from e



	@tool(args_schema=SearchArgs)
	def search(query: str, num_results: int = 10):
	    """Searches the index for the given query."""
	    # NOTE: LangChain's @tool uses the docstring as the tool description,
	    # so its wording is left untouched.
	    # Run the query against the module-level index, then project each hit
	    # down to its text and reference, using 'N/A' for any missing field.
	    hits = tantivy.search(query, num_results)
	    return [
	        {
	            'text': hit.get('text', 'N/A'),
	            'reference': hit.get('reference', 'N/A'),
	        }
	        for hit in hits
	    ]


	@tool(args_schema=ReadTextArgs)
	def read_text(reference: str) -> dict:
	    """Retrieves the text for a given reference.
	    """
	    # NOTE: LangChain's @tool uses the docstring as the tool description,
	    # so its wording is left untouched.
	    # The return annotation previously said `str`, but the function has
	    # always returned a dict with 'text' and 'reference' keys.
	    text = sefaria_get_text(reference)
	    return {
	        # Whatever the Sefaria helper returns is stringified as-is.
	        'text': str(text),
	        # Echo the requested citation back alongside the text.
	        'reference': reference,
	    }

	@tool
	def get_commentaries(reference: str, num_results: int = 10) -> dict:
	    """Retrieves references to all available commentaries on the given verse."""
	    # NOTE: LangChain's @tool uses the docstring as the tool description,
	    # so its wording is left untouched.
	    # The return annotation previously said `str`, but the function has
	    # always returned a dict with 'text' and 'reference' keys.
	    # NOTE(review): `num_results` is accepted but never applied. Truncating
	    # here would contradict the "all available commentaries" contract, so
	    # the behavior is left as-is - confirm whether a limit was intended.
	    commentaries = sefaria_get_commentaries(reference)
	    if isinstance(commentaries, list):
	        # A list of commentary references is flattened to one per line.
	        text = '\n'.join(commentaries)
	    else:
	        # Any other return shape is stringified unchanged.
	        text = str(commentaries)
	    return {
	        'text': text,
	        'reference': f"Commentaries on {reference}",
	    }