"""Gradio app for annotating the crime-fact section of fraud verdicts.

At import time the module logs in to the Hugging Face Hub, downloads the
``jslin09/Fraud_Case_Verdicts`` dataset, strips word-segmentation spaces from
its ``Corpus-Delicti`` column and builds the ``demo`` Blocks UI.
"""
import os
import random

import gradio as gr
import pandas as pd
from datasets import ClassLabel
from datasets import load_dataset
from huggingface_hub import login


def remove_space(example):
    """Clean the ``Corpus-Delicti`` (crime facts) field of one dataset row.

    All space characters (used in the dataset as word-segmentation marks) are
    removed, and the text before the leading ``一、`` enumeration marker is
    dropped together with the marker itself.

    Args:
        example: A dataset row (mapping) containing a ``'Corpus-Delicti'``
            string.

    Returns:
        A dict with the single key ``'Corpus-Delicti'`` holding the cleaned
        text, in the shape expected by ``Dataset.map``.
    """
    text = example['Corpus-Delicti'].replace(" ", "")
    segments = text.split('一、')
    # Rows without the marker used to raise IndexError and abort the whole
    # ``dataset.map`` call; keep their (space-stripped) text instead.
    cleaned = segments[1] if len(segments) > 1 else text
    return {'Corpus-Delicti': cleaned}


def download_file(content, filename):
    """Write ``content`` to ``filename`` as UTF-8 text, overwriting the file."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)


def random_elements(dataset, num_examples=5):
    """Return ``num_examples`` distinct random rows of ``dataset``.

    Args:
        dataset: A dataset split supporting ``len()``, list indexing and a
            ``features`` mapping.
        num_examples: Number of distinct rows to draw.

    Returns:
        A ``pandas.DataFrame`` with the drawn rows; ``ClassLabel`` columns are
        converted from integer ids to their label names.

    Raises:
        AssertionError: If ``num_examples`` exceeds the dataset size.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so no retry loop is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    return df


def random_next(num_examples=5):
    """Pick a random verdict from the training split.

    Args:
        num_examples: Number of rows sampled; only the first one is used.

    Returns:
        A tuple ``(court_name, case_no, crime_descrip, title)`` where
        ``title`` is ``court_name + "_" + case_no``.
    """
    random_selected = random_elements(dataset["train"], num_examples=num_examples)
    court_name = random_selected['Court'][0]
    case_no = random_selected['CaseNo'][0]
    crime_descrip = random_selected['Corpus-Delicti'][0]
    title = court_name + "_" + case_no
    return (court_name, case_no, crime_descrip, title)


def _parse_tag_name(tag):
    """Extract the tag label from the dropdown value.

    Returns ``None`` when nothing was selected (``None`` or an empty value).
    A value containing a comma is treated as a stringified ``(name, value)``
    pair and its second element is returned, as the original parsing did;
    any other value is used as the label itself.
    """
    if not tag:
        return None
    text = str(tag)
    pieces = text.split(",")
    if len(pieces) < 2:
        return text.strip()
    return pieces[1].strip(")").strip().strip("'")


def gen_template(crime_descrip, element, tag):
    """Render one Alpaca-style annotation record.

    Args:
        crime_descrip: The crime-fact text being annotated.
        element: The element of crime typed by the annotator; a placeholder
            is emitted when empty.
        tag: The selected tag; a placeholder is emitted when nothing is
            selected.

    Returns:
        The record text: intro blurb, description, element, tag and end
        marker sections separated by blank lines.
    """
    INTRO_BLURB = "The following is a description of the crime in the verdict. Write a response for the element of crime and its tag that appropriately completes the request."
    DESCRIPT_KEY = "### Description:"
    ELEMENT_KEY = "### Element:"
    TAG_KEY = "### Tag:"
    END_KEY = "### End"

    tag_name = _parse_tag_name(tag)
    element_text = element if element else "<未填寫構成要件要素>"
    tag_text = tag_name if tag_name is not None else "<未選取構成要件要素標籤>"

    sections = [
        f"{INTRO_BLURB}\n",
        f"{DESCRIPT_KEY}\n{crime_descrip}\n",
        f"{ELEMENT_KEY}\n{element_text}\n",
        f"{TAG_KEY}\n{tag_text}\n",
        f"{END_KEY}",
    ]
    return "\n".join(sections)


def _next_case_for_ui(num_examples=5):
    """Adapt ``random_next`` to the two widgets the UI actually shows.

    Returns:
        ``(crime_descrip, title)`` for the crime-fact and case-number boxes.
    """
    _court_name, _case_no, crime_descrip, title = random_next(num_examples=num_examples)
    return crime_descrip, title


# Download the verdict dataset; HUB_TOKEN is the access token it requires.
use_auth_token = os.environ['HUB_TOKEN']
login(token=use_auth_token)
dataset = load_dataset("jslin09/Fraud_Case_Verdicts", use_auth_token=use_auth_token, revision="main")
dataset = dataset.map(remove_space)

# Initial case shown when the page loads.
random_selected = random_next()
court_name = random_selected[0]
case_no = random_selected[1]
crime_descrip = random_selected[2]
title = random_selected[3]

# NOTE(review): the widget nesting below was reconstructed from the order of
# the ``with`` statements — confirm against the intended page layout.
with gr.Blocks() as demo:
    gr.Markdown("\n\nLegal Document Annotation\n\n")
    with gr.Row():
        with gr.Column():  # crime-fact panel
            with gr.Row():  # heading row
                # Rebinds ``title`` from the initial string to its Textbox.
                title = gr.components.Textbox(label='案號', value=title)
            prompt = gr.components.Textbox(lines=5, label='犯罪事實', value=crime_descrip)
            with gr.Row():
                with gr.Column():
                    btn = gr.Button("隨機選擇")
        with gr.Column():
            with gr.Row():
                element = gr.components.Textbox(lines=2, label="構成要件要素")
                # Plain labels: with (label, "") pairs every choice carried
                # the same empty value, so the selection never reached
                # gen_template.
                tag = gr.Dropdown(
                    choices=[
                        "被告(犯罪主體)",
                        "主觀犯意",
                        "不法行為",
                        "因果關係",
                        "被害人/告訴人",
                        "危害結果",
                        "未遂",
                        "既遂",
                        "中止",
                        "預備",
                    ],
                    label="標籤",
                    info="構成要件要素的標籤",
                )
            with gr.Row():
                with gr.Column():
                    btn2 = gr.Button("產生標註語料內容")
                    result = gr.components.Textbox(lines=5, label="語料內容", show_copy_button=True)
    # Only the crime-fact and case-number boxes exist, so only those two
    # values are routed back to the page.
    btn.click(_next_case_for_ui, inputs=[], outputs=[prompt, title])
    btn2.click(gen_template, inputs=[prompt, element, tag], outputs=[result])
    # TODO: a download button wired to download_file(result, title) is not
    # enabled yet.

if __name__ == "__main__":
    demo.launch()  # When launching on a remote host, share=True is required.