File size: 6,413 Bytes
03e2e18 73e9623 01d0f93 73e9623 ac375bc 73e9623 5004c25 73e9623 5004c25 73e9623 a4b8d89 93e9b34 1ce7271 93e9b34 1ce7271 8cd3669 39d12cc c8a0015 8cd3669 01d0f93 73e9623 01d0f93 73e9623 506c1b8 01d0f93 73e9623 5509fb4 613c848 73e9623 aadf949 73e9623 aadf949 a4b8d89 73e9623 aadf949 5509fb4 aadf949 5509fb4 01d0f93 73e9623 b79e549 613c848 73e9623 65fddbf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import os
import gradio as gr
from datasets import ClassLabel
from datasets import load_dataset
import random
import pandas as pd
from huggingface_hub import login
def remove_space(example):
    """Strip segmentation spaces and the leading "一、" marker from one record.

    Every ASCII space in the ``'Corpus-Delicti'`` (crime facts) field is
    removed, the text is split on the enumeration marker "一、", and the
    segment right after the first marker is kept.  Text following a second
    marker is dropped, exactly as before.

    A record that does not contain the marker previously raised
    ``IndexError``; it now keeps its space-stripped text instead.
    No BOS/EOS tokens are added by this function.

    Args:
        example: Mapping with a string ``'Corpus-Delicti'`` entry.

    Returns:
        A one-key dict holding the cleaned ``'Corpus-Delicti'`` text.
    """
    text = example['Corpus-Delicti'].replace(" ", "")
    segments = text.split('一、')
    # A single segment means the marker never occurred: keep the whole text.
    cleaned = segments[1] if len(segments) > 1 else text
    return {'Corpus-Delicti': cleaned}
def download_file(content, filename):
    """Write ``content`` to ``filename`` as UTF-8 text, replacing any existing file.

    Args:
        content: Text to store.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode="w", encoding="utf-8") as text_file:
        text_file.write(content)
def random_elements(dataset, num_examples=5):
    """Return ``num_examples`` distinct, randomly chosen rows as a DataFrame.

    Row indices are drawn without replacement, the rows are fetched with a
    single list-index lookup (``dataset[picks]``), and every column whose
    feature type is a ``ClassLabel`` is converted from integer ids to the
    corresponding label names.

    Args:
        dataset: Object supporting ``len()``, list indexing and a
            ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to draw.

    Returns:
        A ``pandas.DataFrame`` built from the selected rows.

    Raises:
        ValueError: If ``num_examples`` exceeds the dataset size (or is
            negative).  This used to be an ``assert``, which is stripped
            under ``python -O`` and then left the sampling loop spinning
            forever.
    """
    size = len(dataset)
    if num_examples > size:
        raise ValueError("Can't pick more elements than there are in the dataset.")

    # random.sample draws unique indices directly, no rejection loop needed.
    picks = random.sample(range(size), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            names = typ.names
            # Bind ``names`` explicitly so the lambda does not rely on the loop variable.
            df[column] = df[column].transform(lambda i, names=names: names[i])
    return df
def random_next(num_examples=5):
    """Sample random cases from the training split and describe the first one.

    Args:
        num_examples: Number of rows requested from ``random_elements``;
            only the first sampled row is used for the result.

    Returns:
        Tuple ``(court_name, case_no, crime_descrip, filename)`` where
        ``filename`` is ``court_name + "_" + case_no + ".txt"``.
    """
    sample = random_elements(dataset["train"], num_examples=num_examples)
    court_name, case_no, crime_descrip = (
        sample[column][0] for column in ('Court', 'CaseNo', 'Corpus-Delicti')
    )
    filename = court_name + "_" + case_no + '.txt'
    return court_name, case_no, crime_descrip, filename
def gen_template(crime_descrip, element, tag):
    """Build the annotation record for one legal element of a crime description.

    The record follows an Alpaca-like layout: an instruction blurb, then the
    description, element and tag sections, then an end marker, each separated
    by a blank line.  A falsy ``element`` or ``tag`` is replaced by a
    placeholder so the record always contains every section.

    Args:
        crime_descrip: Crime description text taken from the verdict.
        element: Annotated legal element; may be empty.
        tag: Tag chosen for the element; may be empty or ``None``.

    Returns:
        The assembled record as a single string.
    """
    INTRO_BLURB = "The following is a description of the crime in the verdict. Write a response for the legal element of crime and its tag that appropriately completes the request."
    DESCRIPT_KEY = "### Description:"
    ELEMENT_KEY = "### Element:"
    TAG_KEY = "### Tag:"
    END_KEY = "### End"

    # Fall back to placeholders when the annotator left a field blank.
    element_text = element if element else "<未填寫構成要件要素>"
    tag_text = tag if tag else "<未選取構成要件要素標籤>"

    sections = (
        INTRO_BLURB,
        f"{DESCRIPT_KEY}\n{crime_descrip}",
        f"{ELEMENT_KEY}\n{element_text}",
        f"{TAG_KEY}\n{tag_text}",
        END_KEY,
    )
    return "\n\n".join(sections)
# JavaScript used by the download button.
# The function receives the record text and a file name, wraps the text in a
# Blob, and clicks a temporary <a download> link so the browser saves it.
# The string is passed to the browser as-is, so its contents (including the
# JavaScript comments inside it) are runtime text and are left untouched.
js_download = '''function downloadFile(result, filename) {
//藉型別陣列建構的 blob 來建立 URL
let fileName = filename;
const data = result;
let blob = new Blob([data], {
type: "application/octet-stream",
});
var href = URL.createObjectURL(blob);
// 從 Blob 取出資料
var link = document.createElement("a");
document.body.appendChild(link);
link.href = href;
link.download = fileName;
link.click();
}
'''
# Download the verdict dataset.
use_auth_token = os.environ['HUB_TOKEN']  # token needed to download the verdict dataset
login(token=os.environ['AUTH_TOKEN'], add_to_git_credential=True)
dataset = load_dataset(
    "jslin09/Fraud_Case_Verdicts",
    token=use_auth_token,
    revision="main",
).map(remove_space)

# Pick a random case to pre-fill the interface.
random_selected = random_next()
court_name, case_no, crime_descrip, filename = random_selected
# Build the annotation interface.
# NOTE(review): this file was recovered from a copy that had lost its
# indentation, so the nesting of the `with` blocks below is a reconstruction —
# confirm the layout against the running app.
with gr.Blocks() as demo:
    gr.Markdown(
        """
<h1 style="text-align: center;">Legal Document Annotation</h1>
""")
    with gr.Row():
        with gr.Column():  # crime-facts section
            with gr.Row():  # header section
                # Hidden labels holding the court name and case number of the current case.
                courtName = gr.Label(label='法院名稱', value=court_name, visible=False)
                caseNo = gr.Label(label='案號', value=case_no, visible=False)
                # NOTE: this rebinds the module-level `filename` string to the Textbox component.
                filename = gr.components.Textbox(label='案號',value=filename, show_copy_button=True)
            prompt = gr.components.Textbox(lines=5, label='犯罪事實',value=crime_descrip)
            with gr.Row():
                with gr.Column():
                    btn = gr.Button("🎲 隨機選擇")
                    # Disabled clear button, kept for reference:
                    # btn4 = gr.ClearButton(value="清除標註內容",components=[result, element, tag])
            with gr.Row():
                # Free-text legal element plus a dropdown that yields the tag token
                # (second item of each pair) because type='value'.
                element = gr.components.Textbox(lines=2, label="構成要件要素")
                tag = gr.Dropdown(choices = [("被告(犯罪主體)","<LEO_SOC>"), ("主觀犯意", "<LEO_SLE>"), ("不法行為","<LEO_ACT>"), ("因果關係","<LEO_CAU>"),
                                             ("被害人/告訴人","<LEO_VIC>"), ("危害結果","<LEO_ROH>"), ("未遂","<LEO_ATP>"), ("既遂","<LEO_ACC>"),
                                             ("中止","<LEO_ABA>"), ("預備","<LEO_PRP>")],
                                  label="標籤", info="構成要件要素的標籤", type='value')
        with gr.Column():
            # Generated record text, shown with a copy button.
            result = gr.components.Textbox(lines=5, label="語料內容", show_copy_button=True)
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        btn2 = gr.Button("📖 產生標註語料內容")
                    with gr.Row():
                        btn3 = gr.Button("💾 下載")
    # Event wiring.
    # Random pick: random_next() returns (court, case no, description, file name),
    # mapped onto the four output components in that order.
    btn.click(random_next, inputs=[], outputs=[courtName, caseNo, prompt, filename])
    # Build the record from the description, the element text and the selected tag.
    btn2.click(gen_template, inputs=[prompt, element, tag], outputs=[result])
    # No Python callback: the js_download snippet receives the record and file name.
    # NOTE(review): `_js` looks like the Gradio 3.x keyword (later releases call it
    # `js`) — confirm against the installed Gradio version.
    btn3.click(None, inputs=[result, filename], _js=js_download)
    # Disabled wiring for the clear button above:
    # btn4.click(None, components=[result, element, tag])
# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch() # When launching on a remote host, share=True is required.