import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

# Load the model and tokenizer
model_name = 'abinayam/gpt-2-tamil'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# System prompt
system_prompt = """You are an expert Tamil language model specializing in spelling and grammar correction. Your task is to:
1. Correct any spelling errors in the given text.
2. Fix grammatical mistakes, including proper application of sandhi rules.
3. Ensure the corrected text maintains the original meaning and context.
4. Provide the corrected version of the entire input text.
Remember to preserve the structure and intent of the original text while making necessary corrections."""
# Common error corrections
common_errors = {
    'பழங்கல்': 'பழங்கள்',
    # Add more common spelling errors here
}
def apply_sandhi_rules(text):
    # Sandhi rule: after a dative ending in கு/க்கு, insert the doubled hard consonant
    # (க், ச், த், ப்) matching the first letter of the next word.
    text = re.sub(r'(கு|க்கு)\s+([கசதப])', r'\1\2் \2', text)
    # Add more sandhi rules as needed
    return text
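# Example behaviour of the rule above (a quick sanity check; the outputs shown assume
# only this single regex is applied to the input):
#   apply_sandhi_rules("நான் பள்ளிக்கு போனேன்")    -> "நான் பள்ளிக்குப் போனேன்"
#   apply_sandhi_rules("அவன் வீட்டுக்கு சென்றான்")  -> "அவன் வீட்டுக்குச் சென்றான்"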
def preprocess_text(text):
    # Apply common error corrections
    for error, correction in common_errors.items():
        text = text.replace(error, correction)
    return text
def postprocess_text(text):
    # Apply sandhi rules
    text = apply_sandhi_rules(text)
    return text
def correct_text(input_text):
    # Preprocess the input text
    preprocessed_text = preprocess_text(input_text)
    # Prepare the full prompt with the system prompt and input text
    full_prompt = f"{system_prompt}\n\nInput: {preprocessed_text}\n\nCorrected:"
    # Tokenize the full prompt
    input_ids = tokenizer.encode(full_prompt, return_tensors='pt')
    # Generate corrected text with sampling (temperature/top-k/top-p)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=len(input_ids[0]) + 100,  # Adjust based on expected output length
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token; avoids a warning
        )
    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract the corrected text (everything after "Corrected:")
    corrected_text = generated_text.split("Corrected:")[-1].strip()
    # Postprocess the corrected text
    final_text = postprocess_text(corrected_text)
    return final_text
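# Quick local check (illustrative only; generation is sampled, so the exact output
# will vary and depends on the underlying gpt-2-tamil checkpoint):
#   print(correct_text("நான் பள்ளிகு செல்கிறேன்."))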
# Create the Gradio interface
iface = gr.Interface(
    fn=correct_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter Tamil text here..."),
    outputs=gr.Textbox(label="Corrected Text"),
    title="Tamil Spell Corrector and Grammar Checker",
    description="This app uses the 'abinayam/gpt-2-tamil' model along with custom rules to correct spelling and grammar in Tamil text.",
    examples=[
        ["நான் நேற்று கடைக்கு போனேன். அங்கே நிறைய பழங்கல் வாங்கினேன்."],
        ["நான் பள்ளிகு செல்கிறேன்."],
        ["அவன் வீட்டுகு வந்தான்."]
    ]
)
# Launch the app
iface.launch()
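# To try the app outside Hugging Face Spaces (assuming gradio, transformers and torch
# are installed), run `python app.py`; Gradio serves the interface on
# http://127.0.0.1:7860 by default.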