|
import os |
|
import gradio as gr |
|
from datasets import ClassLabel |
|
from datasets import load_dataset |
|
import random |
|
import pandas as pd |
|
from huggingface_hub import login |
|
|
|
def remove_space(example):
    """Strip segmentation spaces and the leading "一、" marker from the crime facts.

    Reads ``example['Corpus-Delicti']``, deletes every ASCII space (used in
    the corpus as word-segmentation separators) and keeps the text that
    follows the first "一、" marker, up to the next occurrence of that marker
    or the end of the text.  No BOS/EOS tokens are added by this function.

    Args:
        example: Mapping holding the crime-facts string under the
            ``'Corpus-Delicti'`` key.

    Returns:
        A one-entry dict ``{'Corpus-Delicti': cleaned_text}``.
    """
    text = example['Corpus-Delicti'].replace(" ", "")
    segments = text.split('一、')
    # A record without the marker yields a single segment; indexing [1]
    # would raise IndexError, so fall back to the de-spaced text instead.
    cleaned = segments[1] if len(segments) > 1 else text
    return {'Corpus-Delicti': cleaned}
|
|
|
def download_file(content, filename):
    """Write ``content`` to ``filename`` as UTF-8 text.

    The file is opened in write mode, so an existing file at that path is
    replaced.

    Args:
        content: Text to store.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "w", encoding="utf-8") as handle:
        handle.write(content)
|
|
|
def random_elements(dataset, num_examples=5):
    """Return ``num_examples`` distinct, randomly chosen rows as a DataFrame.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to
            feature type.
        num_examples: Number of distinct rows to draw.

    Returns:
        A ``pandas.DataFrame`` built from the selected rows, in which every
        ``ClassLabel`` column has its integer ids replaced by label names.

    Raises:
        ValueError: If ``num_examples`` is negative or exceeds the number
            of rows in ``dataset``.
    """
    total = len(dataset)
    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``, which would leave oversized requests unguarded.
    if num_examples > total:
        raise ValueError("Can't pick more elements than there are in the dataset.")

    # Sampling without replacement yields distinct row positions directly,
    # replacing the retry-until-unique loop.
    picks = random.sample(range(total), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Show human-readable label names rather than integer class ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    return df
|
|
|
def random_next(num_examples=5):
    """Sample random training rows and return the first row's display fields.

    Args:
        num_examples: Number of rows to draw from ``dataset["train"]``;
            only the first drawn row is used for the result.

    Returns:
        Tuple ``(court_name, case_no, crime_descrip, filename)`` where
        ``filename`` is ``"<court_name>_<case_no>.txt"``.
    """
    sampled = random_elements(dataset["train"], num_examples=num_examples)
    court_name = sampled['Court'][0]
    case_no = sampled['CaseNo'][0]
    crime_descrip = sampled['Corpus-Delicti'][0]
    # f-string formatting also copes with identifiers that are not str.
    filename = f"{court_name}_{case_no}.txt"
    return court_name, case_no, crime_descrip, filename
|
|
|
def gen_template(crime_descrip, element, tag):
    """Build the annotation text for one crime description, element and tag.

    The result consists of five sections separated by blank lines: the
    instruction blurb, ``### Description:``, ``### Element:``, ``### Tag:``
    and the closing ``### End`` marker.

    Args:
        crime_descrip: Crime description placed under ``### Description:``.
        element: Element text; a falsy value (empty string or ``None``) is
            replaced by the placeholder ``<未填寫構成要件要素>``.
        tag: Tag value; a falsy value is replaced by the placeholder
            ``<未選取構成要件要素標籤>``.

    Returns:
        The assembled template as a single string.
    """
    INTRO_BLURB = "The following is a description of the crime in the verdict. Write a response for the element of crime and its tag that appropriately completes the request."
    DESCRIPT_KEY = "### Description:"
    ELEMENT_KEY = "### Element:"
    TAG_KEY = "### Tag:"
    END_KEY = "### End"

    # Fall back to visible placeholders so that an unfilled form field is
    # obvious in the generated text.
    element_text = element if element else "<未填寫構成要件要素>"
    tag_text = tag if tag else "<未選取構成要件要素標籤>"

    sections = [
        INTRO_BLURB,
        f"{DESCRIPT_KEY}\n{crime_descrip}",
        f"{ELEMENT_KEY}\n{element_text}",
        f"{TAG_KEY}\n{tag_text}",
        END_KEY,
    ]
    return "\n\n".join(sections)
|
|
|
|
|
# Authenticate against the Hugging Face Hub with the token taken from the
# HUB_TOKEN environment variable (a missing variable raises KeyError).
use_auth_token = os.environ['HUB_TOKEN']
login(token=use_auth_token)

# Load the verdict corpus and run remove_space over every example.
# NOTE(review): newer `datasets` releases prefer `token=` over
# `use_auth_token=` — confirm against the installed version before changing.
dataset = load_dataset("jslin09/Fraud_Case_Verdicts", use_auth_token=use_auth_token, revision="main")
dataset = dataset.map(remove_space)
|
|
|
|
|
# Draw the case whose fields pre-populate the form components below.
random_selected = random_next()
court_name, case_no, crime_descrip, filename = random_selected
|
|
|
# Annotation UI: the left side shows a randomly drawn case, the right side
# takes an element/tag pair and renders the resulting corpus text.
# NOTE(review): indentation was lost in the copy this block was restored
# from; the Row/Column nesting below is the most plausible reading of the
# statement order — confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <h1 style="text-align: center;">Legal Document Annotation</h1>
        """)
    with gr.Row():
        with gr.Column():
            with gr.Row():
                # Court name and case number are kept as hidden components;
                # only the derived file name is displayed.
                courtName = gr.Label(label='法院名稱', value=court_name, visible=False)
                caseNo = gr.Label(label='案號', value=case_no, visible=False)
                # From here on `filename` refers to the Textbox component,
                # no longer to the initial file-name string.
                filename = gr.components.Textbox(label='案號', value=filename, show_copy_button=True)
            prompt = gr.components.Textbox(lines=5, label='犯罪事實', value=crime_descrip)
            with gr.Row():
                with gr.Column():
                    btn = gr.Button("隨機選擇")
        with gr.Column():
            with gr.Row():
                element = gr.components.Textbox(lines=2, label="構成要件要素")
                # Each choice is a (display label, tag value) pair; with
                # type='value' the tag value is what the callback receives.
                tag = gr.Dropdown(
                    choices=[("被告(犯罪主體)", "<LEO_SOC>"), ("主觀犯意", "<LEO_SLE>"), ("不法行為", "<LEO_ACT>"), ("因果關係", "<LEO_CAU>"),
                             ("被害人/告訴人", "<LEO_VIC>"), ("危害結果", "<LEO_ROH>"), ("未遂", "<LEO_ATP>"), ("既遂", "<LEO_ACC>"),
                             ("中止", "<LEO_ABA>"), ("預備", "<LEO_PRP>")],
                    label="標籤", info="構成要件要素的標籤", type='value')
            with gr.Row():
                with gr.Column():
                    btn2 = gr.Button("產生標註語料內容")
            result = gr.components.Textbox(lines=5, label="語料內容", show_copy_button=True)

    # random_next returns (court_name, case_no, crime_descrip, filename),
    # which map positionally onto the four output components.
    btn.click(random_next, inputs=[], outputs=[courtName, caseNo, prompt, filename])
    btn2.click(gen_template, inputs=[prompt, element, tag], outputs=[result])
|
|
|
|
|
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()