File size: 5,648 Bytes
03e2e18
73e9623
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01d0f93
 
73e9623
 
 
 
 
 
 
 
 
5004c25
 
 
 
 
 
 
 
 
 
 
73e9623
 
 
 
5004c25
73e9623
 
 
 
8cd3669
 
 
 
 
01d0f93
 
73e9623
 
 
 
01d0f93
73e9623
 
 
 
 
 
 
 
 
506c1b8
 
01d0f93
73e9623
 
 
 
 
 
 
 
 
 
 
5004c25
73e9623
 
 
 
 
01d0f93
73e9623
 
 
 
65fddbf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import gradio as gr
from datasets import ClassLabel
from datasets import load_dataset
import random
import pandas as pd
from huggingface_hub import login

def remove_space(example):
    """Clean the 'Corpus-Delicti' (criminal facts) field of one record.

    All ASCII spaces, which the dataset uses as word-segmentation
    delimiters, are removed. The text is then split on the enumeration
    marker '一、' and the segment right after its first occurrence is kept
    (ending at the next occurrence, if there is one). No BOS/EOS tokens
    are added here.

    Args:
        example: Mapping holding a string under the 'Corpus-Delicti' key.

    Returns:
        A dict with the single key 'Corpus-Delicti' mapped to the cleaned
        text. When the marker is absent, the space-stripped text is
        returned as-is instead of raising IndexError.
    """
    text = example['Corpus-Delicti'].replace(" ", "")
    segments = text.split('一、')
    # Without the marker split() yields a single element, so indexing [1]
    # unconditionally would abort a whole dataset.map() run.
    cleaned = segments[1] if len(segments) > 1 else text
    return {'Corpus-Delicti': cleaned}

def download_file(content, filename):
    """Write ``content`` to ``filename`` as UTF-8 text.

    The file is opened in "w" mode, so an existing file is overwritten.
    Returns None.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        out.write(content)

def random_elements(dataset, num_examples=5):
    """Return ``num_examples`` distinct, randomly chosen rows as a DataFrame.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            positions, and exposing a ``features`` mapping of column name
            to feature type.
            NOTE(review): presumably a Hugging Face ``Dataset`` whose list
            indexing returns a column-oriented dict -- confirm.
        num_examples: Number of distinct rows to draw.

    Returns:
        A pandas DataFrame built from the selected rows. Columns whose
        feature type is a ``ClassLabel`` hold the label names looked up in
        ``typ.names`` instead of the integer ids.

    Raises:
        ValueError: If ``num_examples`` exceeds ``len(dataset)``.
    """
    # Explicit check instead of `assert`: asserts are stripped under -O,
    # which would turn an oversized request into an endless retry loop.
    if num_examples > len(dataset):
        raise ValueError("Can't pick more elements than there are in the dataset.")

    # random.sample draws without replacement, replacing the manual
    # "retry until unseen" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Bind the names list now so the lambda does not depend on the
            # loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    return df

def random_next(num_examples=5):
    """Draw random rows from the train split and describe the first one.

    ``num_examples`` rows are sampled via ``random_elements`` from
    ``dataset["train"]``; only the row at index 0 of the result is used.

    Returns:
        A 4-tuple ``(court_name, case_no, crime_descrip, filename)`` where
        ``filename`` is ``court_name + "_" + case_no + ".txt"``.
    """
    sampled = random_elements(dataset["train"], num_examples=num_examples)
    court, number, facts = (
        sampled[column][0] for column in ('Court', 'CaseNo', 'Corpus-Delicti')
    )
    return (court, number, facts, court + "_" + number + '.txt')

def gen_template(crime_descrip, element, tag):
    """Build an Alpaca-style annotation record as a single string.

    The record consists of five sections separated by blank lines: an
    instruction blurb, the crime description, the element of crime, its
    tag, and an end marker.

    Args:
        crime_descrip: Text placed under the "### Description:" header.
        element: Text placed under "### Element:"; a falsy value is
            replaced by a placeholder string.
        tag: Value placed under "### Tag:"; a falsy value is replaced by
            a placeholder string.

    Returns:
        The assembled template string.
    """
    # Fall back to placeholders so pressing the button with nothing filled
    # in still yields a well-formed template.
    element_text = element or "<未填寫構成要件要素>"
    tag_text = tag or "<未選取構成要件要素標籤>"
    sections = (
        "The following is a description of the crime in the verdict. Write a response for the element of crime and its tag that appropriately completes the request.",
        f"### Description:\n{crime_descrip}",
        f"### Element:\n{element_text}",
        f"### Tag:\n{tag_text}",
        "### End",
    )
    return "\n\n".join(sections)

# Download the verdict dataset.
use_auth_token = os.environ['HUB_TOKEN']  # Token required to download the verdict dataset.
login(token=use_auth_token)
dataset = load_dataset(
    "jslin09/Fraud_Case_Verdicts",
    use_auth_token=use_auth_token,
    revision="main",
).map(remove_space)

# Randomly select the case shown when the UI first loads.
random_selected = random_next()
court_name, case_no, crime_descrip, filename = random_selected

# Build the annotation UI: the left column shows a randomly selected case,
# the right column collects an element/tag pair and renders the template.
with gr.Blocks() as demo:
    gr.Markdown(
    """
    <h1 style="text-align: center;">Legal Document Annotation</h1>
    """)
    with gr.Row():
        with gr.Column(): # Criminal-facts section
            with gr.Row(): # Header section
                # Court name and case number are kept in hidden labels
                # (visible=False); only the derived file name is displayed.
                courtName = gr.Label(label='法院名稱', value=court_name, visible=False)
                caseNo = gr.Label(label='案號', value=case_no, visible=False)
                # NOTE: rebinds the module-level `filename` string to the
                # Textbox component; the string is consumed as `value=` first.
                filename = gr.components.Textbox(label='案號',value=filename, show_copy_button=True)
            prompt = gr.components.Textbox(lines=5, label='犯罪事實',value=crime_descrip)
            with gr.Row():
                with gr.Column():
                    # Button that loads another random case.
                    btn = gr.Button("隨機選擇")
        with gr.Column():
            with gr.Row():
                # Free-text element of crime entered by the annotator.
                element = gr.components.Textbox(lines=2, label="構成要件要素")
            # Each choice is a (displayed label, tag value) pair; with
            # type='value' the tag value is what gets passed to gen_template.
            tag = gr.Dropdown(
                              choices = [("被告(犯罪主體)","<LEO_SOC>"), ("主觀犯意", "<LEO_SLE>"), ("不法行為","<LEO_ACT>"), ("因果關係","<LEO_CAU>"),
                                         ("被害人/告訴人","<LEO_VIC>"), ("危害結果","<LEO_ROH>"), ("未遂","<LEO_ATP>"), ("既遂","<LEO_ACC>"),
                                         ("中止","<LEO_ABA>"), ("預備","<LEO_PRP>")], 
                              label="標籤", info="構成要件要素的標籤", type='value')
            with gr.Row():
                with gr.Column():
                    # Button that generates the annotation template text.
                    btn2 = gr.Button("產生標註語料內容")
            result = gr.components.Textbox(lines=5, label="語料內容", show_copy_button=True)
# NOTE: a download button (btn3) was previously sketched here and is disabled.
        # random_next is called with no inputs (so num_examples defaults to 5);
        # its 4-tuple fills courtName, caseNo, prompt and filename in order.
        btn.click(random_next, inputs=[], outputs=[courtName, caseNo, prompt, filename])
        # gen_template receives (prompt, element, tag) and writes to result.
        btn2.click(gen_template, inputs=[prompt, element, tag], outputs=[result])
# NOTE: the disabled btn3 wiring called download_file with inputs
# [result, title]; `title` is not defined anywhere in the visible code.

if __name__ == "__main__":
    demo.launch() # When launching on a remote host, share=True is needed.