"""Gradio app for annotating legal elements in fraud-case verdicts.

Loads the "jslin09/Fraud_Case_Verdicts" dataset from the Hugging Face Hub,
shows a randomly chosen crime description, and lets the user build and
download an annotation record for it.
"""
import os
import random

import gradio as gr
import pandas as pd
from datasets import ClassLabel, load_dataset
from huggingface_hub import login


def remove_space(example):
    """Clean the 'Corpus-Delicti' (crime facts) field of one dataset record.

    Removes the space characters used as word-segmentation delimiters, keeps
    the text that follows the leading '一、' enumerator, and drops the
    '犯罪事實:' heading.

    Args:
        example: A dataset record containing a 'Corpus-Delicti' string.

    Returns:
        A dict with the cleaned 'Corpus-Delicti' value, in the shape expected
        by ``Dataset.map``.
    """
    text = example['Corpus-Delicti'].replace(" ", "")
    parts = text.split('一、')
    # Records without the '一、' marker used to raise IndexError and abort the
    # whole ``map`` call; fall back to the full text for those records.
    body = parts[1] if len(parts) > 1 else parts[0]
    return {'Corpus-Delicti': body.replace('犯罪事實:', '')}


def download_file(content, filename):
    """Write ``content`` to ``filename`` as UTF-8 text."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)


def random_elements(dataset, num_examples=5):
    """Return ``num_examples`` distinct random rows of ``dataset`` as a DataFrame.

    Columns whose feature type is ``ClassLabel`` are converted from integer
    ids to their label names.

    Raises:
        ValueError: If ``num_examples`` exceeds the number of rows.
    """
    if num_examples > len(dataset):
        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        raise ValueError("Can't pick more elements than there are in the dataset.")
    # Sampling without replacement yields distinct row indices directly.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    return df


def random_next(num_examples=5):
    """Pick a random case from the module-level ``dataset['train']`` split.

    ``num_examples`` rows are sampled and the first sampled row is used.

    Returns:
        A tuple ``(court_name, case_no, crime_descrip, filename)`` where
        ``filename`` is ``"<court_name>_<case_no>.txt"``.
    """
    random_selected = random_elements(dataset["train"], num_examples=num_examples)
    court_name = random_selected['Court'][0]
    case_no = random_selected['CaseNo'][0]
    crime_descrip = random_selected['Corpus-Delicti'][0]
    filename = court_name + "_" + case_no + '.txt'
    return (court_name, case_no, crime_descrip, filename)


def gen_template(crime_descrip, element, tag):
    """Build the annotation record text in an Alpaca-style layout.

    Args:
        crime_descrip: Crime description taken from the verdict.
        element: Legal element entered by the user; a placeholder is emitted
            when it is empty.
        tag: Tag chosen for the element; a placeholder is emitted when it is
            empty or not selected.

    Returns:
        The record: intro blurb, description, element and tag sections, each
        separated by a blank line, terminated by the end marker.
    """
    INTRO_BLURB = "The following is a description of the crime in the verdict. Write a response for the legal element of crime and its tag that appropriately completes the request."
    DESCRIPT_KEY = "### Description:"
    ELEMENT_KEY = "### Element:"
    TAG_KEY = "### Tag:"
    END_KEY = "### End"

    # Placeholders keep the record well-formed even when the button is pressed
    # before anything has been filled in.
    element_text = element if element else "<未填寫構成要件要素>"
    tag_text = tag if tag else "<未選取構成要件要素標籤>"

    sections = [
        f"{INTRO_BLURB}\n",
        f"{DESCRIPT_KEY}\n{crime_descrip}\n",
        f"{ELEMENT_KEY}\n{element_text}\n",
        f"{TAG_KEY}\n{tag_text}\n",
        f"{END_KEY}",
    ]
    return "\n".join(sections)


# JavaScript run by the download button: it wraps the text in a Blob and
# clicks a temporary link. The link is removed and the object URL revoked
# afterwards so repeated downloads do not accumulate DOM nodes or blob URLs.
js_download = '''function downloadFile(result, filename) {
    //藉型別陣列建構的 blob 來建立 URL
    let fileName = filename;
    const data = result;
    let blob = new Blob([data], {
        type: "application/octet-stream",
    });
    var href = URL.createObjectURL(blob); // 從 Blob 取出資料
    var link = document.createElement("a");
    document.body.appendChild(link);
    link.href = href;
    link.download = fileName;
    link.click();
    document.body.removeChild(link);
    setTimeout(function () { URL.revokeObjectURL(href); }, 1000);
}
'''

# Download the verdict dataset; HUB_TOKEN is the access token it requires.
use_auth_token = os.environ['HUB_TOKEN']
login(token=use_auth_token, add_to_git_credential=True)
dataset = load_dataset("jslin09/Fraud_Case_Verdicts", token=use_auth_token, revision="main")
dataset = dataset.map(remove_space)

# Randomly select the case shown when the page first loads.
random_selected = random_next()
court_name = random_selected[0]
case_no = random_selected[1]
crime_descrip = random_selected[2]
filename = random_selected[3]

with gr.Blocks() as demo:
    gr.Markdown(
        """

Legal Document Annotation

""")
    with gr.Row():
        with gr.Column():  # Crime-facts section
            with gr.Row():  # Header section
                courtName = gr.Label(label='法院名稱', value=court_name, visible=False)
                caseNo = gr.Label(label='案號', value=case_no, visible=False)
                # NOTE: from here on ``filename`` refers to the Textbox
                # component, not the initial file-name string.
                filename = gr.components.Textbox(label='案號', value=filename, show_copy_button=True)
            prompt = gr.components.Textbox(lines=5, label='犯罪事實', value=crime_descrip)
            with gr.Row():
                with gr.Column():
                    btn = gr.Button("🎲 隨機選擇")
            with gr.Row():
                element = gr.components.Textbox(lines=2, label="構成要件要素")
                # Plain string choices make each label its own value. The
                # previous (label, "") tuples gave every option the same empty
                # value, so the selected tag never reached gen_template.
                # NOTE(review): confirm the labels are the intended tag values.
                tag = gr.Dropdown(
                    choices=[
                        "被告(犯罪主體)",
                        "主觀犯意",
                        "不法行為",
                        "因果關係",
                        "被害人/告訴人",
                        "危害結果",
                        "未遂",
                        "既遂",
                        "中止",
                        "預備",
                    ],
                    label="標籤",
                    info="構成要件要素的標籤",
                    type='value',
                )
        with gr.Column():
            result = gr.components.Textbox(lines=5, label="語料內容", show_copy_button=True)
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        btn2 = gr.Button("📖 產生標註語料內容")
                    with gr.Row():
                        btn3 = gr.Button("💾 下載")
    btn.click(random_next, inputs=[], outputs=[courtName, caseNo, prompt, filename])
    btn2.click(gen_template, inputs=[prompt, element, tag], outputs=[result])
    btn3.click(None, inputs=[result, filename], js=js_download)

if __name__ == "__main__":
    demo.launch()  # share=True is required when launching on a remote host.