|
import json |
|
from dataclasses import dataclass |
|
from datetime import datetime |
|
from enum import Enum |
|
from typing import Dict, List, Optional |
|
|
|
from claudette import Chat, models |
|
|
|
|
|
@dataclass
class Context:
    """One usage context for a Tibetan term.

    Bundles a Tibetan source passage with its English translation, the
    commentaries discussing the passage, and (optionally) the Sanskrit
    original.
    """

    tibetan: str  # Tibetan source passage containing the term
    english: str  # English translation of the passage
    commentaries: List[str]  # commentary excerpts on the passage
    sanskrit: Optional[str] = None  # Sanskrit original, when available
|
|
|
|
|
class AnalysisType(Enum):
    """Stages of the term-analysis pipeline; each stage gets its own
    system prompt and chat in ``BuddhistTermAnalyzer``."""

    SEMANTIC = "semantic"  # philological analysis of the term
    TERM_GENERATION = "term_generation"  # candidate English renderings
    EVALUATION = "evaluation"  # scoring of the candidates
|
|
|
|
|
class BuddhistTermAnalyzer:
    """Three-stage LLM pipeline for standardizing English renderings of a
    Tibetan Buddhist term.

    Stages (see :class:`AnalysisType`): semantic/philological analysis of the
    term in context, generation of audience-specific candidate translations,
    and evaluation of those candidates. Each stage uses its own persistent
    ``claudette.Chat`` primed with a JSON-only system prompt, and every model
    response is parsed with ``json.loads``.
    """

    def __init__(self):
        # NOTE(review): hard-coded index into claudette's model list — which
        # model this selects depends on the installed claudette version;
        # confirm against the claudette docs.
        self.model = models[1]
        self.total_api_calls_cost = 0  # running total accumulated by _track_usage
        self.token_usage = {}  # per-stage usage info, keyed by str(AnalysisType)

        # JSON-only system prompts, one per pipeline stage.
        self.system_prompts = {
            AnalysisType.SEMANTIC: """You are an expert in Buddhist terminology analysis with deep knowledge of Sanskrit and Tibetan.
Analyze the given term through a systematic philological approach.
You must ONLY respond with a valid JSON object, no other text.
Never include any explanatory text before or after the JSON.

Required JSON structure:
{
    "sanskrit_analysis": {
        "term": "string", # Sanskrit equivalent
        "morphology": "string", # Morphological breakdown
        "literal_meaning": "string", # Literal meaning in Sanskrit
        "technical_usage": "string" # Technical usage in Sanskrit Buddhist literature
    },
    "tibetan_mapping": {
        "term": "string", # Tibetan term
        "morphology": "string", # Morphological breakdown of Tibetan
        "translation_strategy": "string", # How Tibetan translates the Sanskrit
        "semantic_extension": "string" # Any semantic changes or extensions in Tibetan
    },
    "commentary_insights": [
        {
            "source": "string", # Which commentary
            "explanation": "string", # Key explanation
            "technical_points": ["string"] # Technical clarifications
        }
    ],
    "english_renderings": [
        {
            "translation": "string",
            "accuracy_score": number, # 1-10
            "captures_sanskrit": boolean,
            "captures_tibetan": boolean,
            "notes": "string"
        }
    ],
    "semantic_synthesis": {
        "core_meaning": "string", # Core meaning synthesized from all sources
        "technical_usage": ["string"], # List of technical usages found in context
        "connotative_aspects": ["string"] # Important connotations and implications
    },
    "usage_examples": [
        {
            "source_text": "string", # Original context
            "usage_type": "string", # How term is used here
            "commentary_explanation": "string" # What commentary says about this usage
        }
    ]
}""",
            AnalysisType.TERM_GENERATION: """You are an expert Buddhist translator.
You must ONLY respond with a valid JSON object, no other text.
Never include any explanatory text before or after the JSON.

Required JSON structure:
{
    "academic": {
        "terms": ["term1", "term2"],
        "reasoning": "string"
    },
    "practitioner": {
        "terms": ["term1", "term2"],
        "reasoning": "string"
    },
    "general": {
        "terms": ["term1", "term2"],
        "reasoning": "string"
    }
}""",
            AnalysisType.EVALUATION: """You are an expert evaluator of Buddhist translations.
You must ONLY respond with a valid JSON object, no other text.
Never include any explanatory text before or after the JSON.

Required JSON structure:
{
    "evaluations": {
        "term": {
            "technical_score": 0.0,
            "cultural_score": 0.0,
            "audience_score": 0.0,
            "reasoning": "string"
        }
    }
}""",
        }

        # One persistent chat per stage so each keeps its own system prompt
        # (and conversation history) across calls.
        self.chats = {
            analysis_type: Chat(self.model, sp=system_prompt)
            for analysis_type, system_prompt in self.system_prompts.items()
        }

    def create_semantic_prompt(self, tibetan_term: str, contexts: List[Dict]) -> str:
        """Build the user prompt for the SEMANTIC stage from the term and
        its usage contexts (serialized as JSON into the prompt)."""
        return f"""
Analyze this Buddhist term following these steps:

Target Term: {tibetan_term}

Analysis Process:
1. First analyze the Sanskrit source:
   - Identify the Sanskrit equivalent
   - Break down its morphology
   - Understand its literal and technical meanings

2. Map to Tibetan:
   - Analyze how Tibetan translates the Sanskrit
   - Note any semantic extensions or modifications
   - Understand the translation strategy

3. Study the commentaries:
   - Extract key explanations
   - Note technical clarifications
   - Identify special usages explained

4. Evaluate English translations:
   - Compare against Sanskrit and Tibetan meanings
   - Assess accuracy and completeness
   - Note which aspects are captured/missed

5. Synthesize understanding:
   - Combine insights from all sources
   - Document technical usage patterns
   - Note important connotations

Contexts:
{json.dumps(contexts, indent=2, ensure_ascii=False)}

Important:
- Base analysis strictly on provided contexts
- Use commentaries to resolve ambiguities
- Pay special attention to technical terms in commentaries
- Note when English translations diverge from Sanskrit/Tibetan
- Document specific usage examples from the context

Remember: Return ONLY the JSON object with no other text."""

    def create_generation_prompt(
        self, tibetan_term: str, semantic_analysis: Dict
    ) -> str:
        """Build the user prompt for the TERM_GENERATION stage from the term
        and the parsed output of the SEMANTIC stage."""
        return f"""
Respond ONLY with a JSON object containing translation candidates:

Term: {tibetan_term}

Semantic Analysis:
{json.dumps(semantic_analysis, indent=2, ensure_ascii=False)}

Remember: Return ONLY the JSON object with no other text."""

    def create_evaluation_prompt(
        self, tibetan_term: str, candidates: Dict, semantic_analysis: Dict
    ) -> str:
        """Build the user prompt for the EVALUATION stage from the term, the
        generated candidates, and the semantic analysis."""
        return f"""
Respond ONLY with a JSON object evaluating these candidates:

Term: {tibetan_term}

Candidates:
{json.dumps(candidates, indent=2, ensure_ascii=False)}

Semantic Analysis:
{json.dumps(semantic_analysis, indent=2, ensure_ascii=False)}

Remember: Return ONLY the JSON object with no other text."""

    def _track_usage(self, analysis_type: AnalysisType, response):
        """Record cost and token usage for the most recent call of a stage.

        NOTE(review): claudette's ``Chat.cost`` looks like a running total for
        the chat; adding it on every call may double-count across repeated
        ``analyze_term`` invocations on the same instance — confirm against
        the claudette documentation.
        """
        cost = self.chats[analysis_type].cost
        self.total_api_calls_cost += cost
        # Only the latest call per stage is retained (the dict key is
        # overwritten on each call).
        self.token_usage[str(analysis_type)] = {
            "token_usage": repr(response.usage),
            "api_call_cost": cost,
        }

    def analyze_term(self, tibetan_term: str, contexts: List[Dict]) -> Dict:
        """Main analysis pipeline using cached prompts.

        Runs the three stages in order (semantic -> generation -> evaluation),
        feeding each stage's parsed JSON output into the next, and returns the
        formatted result dict from :meth:`format_results`.

        Raises:
            json.JSONDecodeError: if a model response is not valid JSON
                despite the JSON-only system prompts.
        """
        # Stage 1: philological/semantic analysis of the term in context.
        semantic_prompt = self.create_semantic_prompt(tibetan_term, contexts)
        semantic_response = self.chats[AnalysisType.SEMANTIC](semantic_prompt)
        self._track_usage(AnalysisType.SEMANTIC, semantic_response)
        semantic_analysis = json.loads(semantic_response.content[0].text)

        # Stage 2: generate audience-specific candidate translations.
        generation_prompt = self.create_generation_prompt(
            tibetan_term, semantic_analysis
        )
        generation_response = self.chats[AnalysisType.TERM_GENERATION](
            generation_prompt
        )
        self._track_usage(AnalysisType.TERM_GENERATION, generation_response)
        # (Fix: a redundant re-parse of the semantic response was removed
        # here; semantic_analysis is already parsed above.)
        candidates = json.loads(generation_response.content[0].text)

        # Stage 3: score the candidates against the semantic analysis.
        evaluation_prompt = self.create_evaluation_prompt(
            tibetan_term, candidates, semantic_analysis
        )
        evaluation_response = self.chats[AnalysisType.EVALUATION](evaluation_prompt)
        self._track_usage(AnalysisType.EVALUATION, evaluation_response)
        evaluations = json.loads(evaluation_response.content[0].text)

        return self.format_results(
            tibetan_term,
            semantic_analysis,
            candidates,
            evaluations,
        )

    def format_results(
        self,
        tibetan_term: str,
        semantic_analysis: Dict,
        candidates: Dict,
        evaluations: Dict,
    ) -> Dict:
        """Format the final results.

        Expects *candidates* to follow the TERM_GENERATION prompt schema
        (top-level "academic"/"practitioner"/"general" keys, each with
        non-empty "terms" and a "reasoning") and *evaluations* to contain an
        "evaluations" key per the EVALUATION prompt schema.
        """
        return {
            "tibetan_term": tibetan_term,
            "recommendations": {
                # The first candidate in each list is taken as the
                # recommended term for that audience.
                "Academic": {
                    "term": candidates["academic"]["terms"][0],
                    "reasoning": candidates["academic"]["reasoning"],
                },
                "Practitioner": {
                    "term": candidates["practitioner"]["terms"][0],
                    "reasoning": candidates["practitioner"]["reasoning"],
                },
                "General": {
                    "term": candidates["general"]["terms"][0],
                    "reasoning": candidates["general"]["reasoning"],
                },
            },
            "analysis": semantic_analysis,
            "evaluations": evaluations["evaluations"],
            "total_api_calls_cost": self.total_api_calls_cost,
            "token_usage": self.token_usage,
        }
|
|
|
|
|
class TermStandardizationAgent:
    """Thin facade over :class:`BuddhistTermAnalyzer`.

    Exposes a single entry point for producing standardized English
    translation recommendations for a Tibetan term.
    """

    def __init__(self):
        # All heavy lifting is delegated to the analyzer pipeline.
        self.analyzer = BuddhistTermAnalyzer()

    def select_best_terms(self, tibetan_term: str, contexts: List[Dict]) -> Dict:
        """Main entry point for term standardization"""
        return self.analyzer.analyze_term(tibetan_term, contexts)
|
|
|
|
|
|
|
def main():
    """Run a sample standardization for one hard-coded Tibetan term.

    Reads usage contexts from ``data/<term>.json`` next to this file and
    writes the analysis results to ``results/<term>_<timestamp>.json``.
    """
    from pathlib import Path

    agent = TermStandardizationAgent()

    tibetan_term = "བྱང་ཆུབ་སེམས་"
    contexts_fn = Path(__file__).parent / "data" / f"{tibetan_term}.json"
    # Fix: read via Path.read_text so the file handle is closed, with explicit
    # UTF-8 (the data contains Tibetan script; the platform default may not
    # decode it).
    contexts = json.loads(contexts_fn.read_text(encoding="utf-8"))

    results = agent.select_best_terms(tibetan_term, contexts)
    date_time = datetime.now().strftime("%Y%m%d%H%M%S")
    results_path = Path(__file__).parent / "results"
    results_path.mkdir(exist_ok=True, parents=True)
    result_fn = results_path / f"{tibetan_term}_{date_time}.json"
    # Fix: write via Path.write_text so the handle is closed/flushed, again
    # with explicit UTF-8.
    result_fn.write_text(
        json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print(f"Results saved to: {result_fn}")
|
|
|
|
|
# Script entry point: run the sample standardization when executed directly.
if __name__ == "__main__":
    main()
|
|