Spaces:
Sleeping
Sleeping
import gradio as gr | |
import os | |
import shutil | |
from pypdf import PdfReader | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
import torch | |
import fitz | |
# Hugging Face repository id; the same id is used for the tokenizer and the model.
TOKENIZER_REPO = "MediaTek-Research/Breeze-7B-Instruct-v1_0"

tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_REPO,
    local_files_only=False,
    use_fast=True,
)

# Instruction prepended to every chunk of text sent to the model
# ("Please convert the following text to Traditional Chinese:").
tran_hints = "请将以下的文字转为繁体:"

# Sequence begin/end markers as they appear in decoded model output.
start_flag = "<s>"
end_flag = "</s>"

# Loaded once at import time; device_map="auto" lets the library decide
# where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    TOKENIZER_REPO,
    device_map="auto",
    local_files_only=False,
    torch_dtype=torch.bfloat16,
)
def generate(text):
    """Run one single-turn chat completion and return only the model's reply.

    Args:
        text: The user message. Leading/trailing whitespace is ignored.

    Returns:
        The generated reply as plain text, without the echoed prompt and
        without special tokens. An empty string is returned for empty or
        whitespace-only input, and the model is not called in that case.
    """
    prompt = (text or "").strip()
    if not prompt:
        # Nothing to send: avoid templating an empty conversation.
        return ""

    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
    # The model is loaded with device_map="auto", so it may live on an
    # accelerator; the prompt tensor has to be on the same device.
    input_ids = input_ids.to(model.device)

    # Greedy decoding is requested explicitly. temperature/top_p/top_k only
    # apply when sampling is enabled, so they are not passed here.
    output_ids = model.generate(
        input_ids,
        max_new_tokens=2048,
        do_sample=False,
        repetition_penalty=1.1,
    )

    # The returned sequence starts with the prompt tokens; keep only the
    # continuation so callers receive the reply, not the prompt plus markers.
    reply_ids = output_ids[0][input_ids.shape[-1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)
def tran_txt(input_txt):
    """Ask the model to convert ``input_txt`` and return its answer.

    The module-level instruction ``tran_hints`` is placed on the first line
    of the prompt, followed by the stripped input text on the next line.
    The raw result of ``generate`` is logged to stdout and returned as is.
    """
    prompt = "\n".join((tran_hints, input_txt.strip()))
    translated = generate(prompt)
    print("tran_result=" + translated)
    return translated
def exec_tran(file):
    """Translate an uploaded PDF and return the path of the result text file.

    The PDF is copied locally via ``upload_file``, split into chunks with
    ``read_paragraphs``, and every chunk is passed through ``tran_txt``.
    Each translated chunk is written on its own line.

    Args:
        file: Path of the uploaded PDF (``str`` or ``os.PathLike``), or an
            object exposing the path as ``.name``.

    Returns:
        Path of the written file: the upload path with a trailing ``.pdf``
        extension (any letter case) replaced by ``_result.txt``; for any
        other name ``_result.txt`` is simply appended.
    """
    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
    else:
        source_path = file.name

    local_copy = upload_file(source_path)
    paragraphs = read_paragraphs(local_copy)

    # splitext only looks at the final extension, so a ".pdf" occurring
    # earlier in the path is left alone and a missing one does not raise.
    stem, extension = os.path.splitext(source_path)
    if extension.lower() != ".pdf":
        stem = source_path
    result_path = stem + "_result.txt"

    # Explicit UTF-8: the output is Chinese text and must not depend on the
    # platform's default encoding.
    with open(result_path, "w", encoding="utf-8") as result_file:
        for paragraph in paragraphs:
            result_file.write(tran_txt(paragraph) + "\n")
    return result_path
def upload_file(file):
    """Copy an uploaded file into the local ``./data`` folder.

    Args:
        file: Path of the file to copy (``str`` or ``os.PathLike``), or an
            object exposing the path as ``.name``.

    Returns:
        The path of the copy, as returned by ``shutil.copy``.
    """
    UPLOAD_FOLDER = "./data"
    source = file if isinstance(file, (str, os.PathLike)) else file.name
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    return shutil.copy(source, UPLOAD_FOLDER)
def read_paragraphs(pdf_path):
    """Extract the text of a PDF and split it into chunks at ``。``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of non-blank text chunks in document order. The ``。``
        separator itself is not part of the returned chunks, and chunks
        never span a page boundary.
    """
    paragraphs = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as document:
        for page in document:
            # Plain-text extraction, requested by its documented option name.
            page_text = page.get_text("text")
            paragraphs.extend(
                chunk for chunk in page_text.split("。") if chunk.strip()
            )
    return paragraphs
def load_pdf_pages(filename):
    """Return the extracted text of every page of ``filename``, in page order."""
    return [page.extract_text() for page in PdfReader(filename).pages]
def exec_translate(file):
    """Copy the upload into the data folder and read its pages.

    NOTE(review): the extracted page texts are discarded and nothing is
    returned, so this currently has no visible result beyond the file copy.
    """
    upload_file(file)
    load_pdf_pages(file.name)
# Minimal UI: one upload button for a single PDF; the path returned by
# exec_tran is offered for download through the File component.
with gr.Blocks() as app:
    file_output = gr.File()
    upload_button = gr.UploadButton(
        "上传pdf文件",
        # Extension filters need the leading dot; a bare word is treated as
        # a file category such as "image" or "audio".
        file_types=[".pdf"],
        file_count="single",
    )
    upload_button.upload(exec_tran, upload_button, file_output)

app.launch()