# File size: 5,893 Bytes
# 73e9623
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import random

import gradio as gr
import pandas as pd
from datasets import ClassLabel
from datasets import load_dataset
from huggingface_hub import login

def remove_space(example):
    """Clean the 'Corpus-Delicti' (crime facts) field of one dataset record.

    All ASCII spaces (used in the corpus as word-segmentation separators) are
    removed, and everything up to and including the first '一、' paragraph
    marker is dropped.

    Args:
        example: A mapping with a string under the key 'Corpus-Delicti'.

    Returns:
        A dict holding only the cleaned 'Corpus-Delicti' value.
    """
    text = example['Corpus-Delicti'].replace(" ", "")
    # Split on the FIRST marker only: later occurrences (e.g. inside "十一、")
    # belong to the text and must not truncate it.
    _, marker, remainder = text.partition('一、')
    # Records without the marker are kept whole instead of raising IndexError.
    return {'Corpus-Delicti': remainder if marker else text}

def download_file(content, filename):
    """Write ``content`` to ``filename`` as UTF-8 text, overwriting any existing file."""
    with open(filename, mode="w", encoding="utf-8") as output:
        output.write(content)

def random_elements(dataset, num_examples=5):
    """Return a DataFrame holding ``num_examples`` distinct random rows of ``dataset``.

    Args:
        dataset: An object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature type.
        num_examples: How many distinct rows to draw.

    Returns:
        A pandas DataFrame built from the selected rows. Columns whose feature
        type is a ``ClassLabel`` are converted from integer ids to label names.

    Raises:
        ValueError: If ``num_examples`` exceeds the dataset size (or is negative).
    """
    # Explicit check instead of `assert`: assertions are stripped under -O.
    if num_examples > len(dataset):
        raise ValueError("Can't pick more elements than there are in the dataset.")
    # random.sample draws without replacement, so the positions are unique.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            names = typ.names
            df[column] = df[column].transform(lambda i: names[i])
    return df

def random_next(num_examples=5):
    """Draw random rows from the training split and describe the first one.

    Reads the module-level ``dataset`` and samples ``num_examples`` rows from
    its "train" split via ``random_elements``; only the first sampled row is
    used.

    Returns:
        A tuple ``(court_name, case_no, crime_descrip, title)`` where ``title``
        is the court name and case number joined by an underscore.
    """
    sample = random_elements(dataset["train"], num_examples=num_examples)
    court_name, case_no, crime_descrip = (
        sample[column][0] for column in ('Court', 'CaseNo', 'Corpus-Delicti')
    )
    return (court_name, case_no, crime_descrip, court_name + "_" + case_no)

def _extract_tag_name(tag):
    """Return the bare tag value for any form the tag may arrive in, or '' if unset."""
    if not tag:
        return ""
    if isinstance(tag, (list, tuple)):
        # A (label, value) pair: the value is the last item.
        return str(tag[-1])
    text = str(tag)
    if "," in text:
        # String form of a (label, value) pair, e.g. "('主觀犯意', '<LEO_SLE>')".
        return text.split(",")[1].strip(")").strip().strip("'")
    # Already the bare tag value, e.g. "<LEO_SLE>".
    return text


def gen_template(crime_descrip, element, tag):
    """Build an Alpaca-style annotation sample for one element of crime.

    Args:
        crime_descrip: Crime description text taken from the verdict.
        element: The element of crime entered by the annotator. When empty, a
            placeholder is written instead.
        tag: The selected tag. May be the bare tag value ("<LEO_SOC>"), a
            (label, value) pair, or the string form of such a pair. When
            nothing is selected (None or empty), a placeholder is written.

    Returns:
        The sample text: an intro blurb followed by "### Description:",
        "### Element:" and "### Tag:" sections and a closing "### End" line,
        separated by blank lines.
    """
    INTRO_BLURB = "The following is a description of the crime in the verdict. Write a response for the element of crime and its tag that appropriately completes the request."
    DESCRIPT_KEY = "### Description:"
    ELEMENT_KEY = "### Element:"
    TAG_KEY = "### Tag:"
    END_KEY = "### End"

    # Foolproofing: pressing the button with nothing filled in must still
    # return a template (with placeholders) rather than raise.
    element_text = element if element else "<未填寫構成要件要素>"
    tag_text = _extract_tag_name(tag) or "<未選取構成要件要素標籤>"

    sections = [
        f"{INTRO_BLURB}\n",
        f"{DESCRIPT_KEY}\n{crime_descrip}\n",
        f"{ELEMENT_KEY}\n{element_text}\n",
        f"{TAG_KEY}\n{tag_text}\n",
        END_KEY,
    ]
    return "\n".join(sections)

def _pick_random_case():
    """Draw a random verdict and return the values for the two visible textboxes.

    random_next() yields (court name, case number, crime description, title);
    the layout only has components for the crime description and the title.
    """
    _court_name, _case_no, description, case_title = random_next()
    return description, case_title


# The layout is built at import time, before the dataset has been loaded by the
# script entry point, so no sample is drawn here: the fields start empty and are
# filled by the event handlers registered at the bottom of the layout.
with gr.Blocks() as demo:
    gr.Markdown(
    """
    <h1 style="text-align: center;">Legal Document Annotation</h1>
    """)
    with gr.Row():
        with gr.Column():  # crime-facts side
            with gr.Row():  # heading row
                title = gr.components.Textbox(label='案號')
            prompt = gr.components.Textbox(lines=5, label='犯罪事實')
            with gr.Row():
                with gr.Column():
                    btn = gr.Button("隨機選擇")
        with gr.Column():  # annotation side
            with gr.Row():
                element = gr.components.Textbox(lines=2, label="構成要件要素")
            tag = gr.Dropdown(
                              choices = [("被告(犯罪主體)","<LEO_SOC>"), ("主觀犯意", "<LEO_SLE>"), ("不法行為","<LEO_ACT>"), ("因果關係","<LEO_CAU>"),
                                         ("被害人/告訴人","<LEO_VIC>"), ("危害結果","<LEO_ROH>"), ("未遂","<LEO_ATP>"), ("既遂","<LEO_ACC>"),
                                         ("中止","<LEO_ABA>"), ("預備","<LEO_PRP>")], 
                              label="標籤", info="構成要件要素的標籤")
            with gr.Row():
                with gr.Column():
                    btn2 = gr.Button("產生標註語料內容")
            result = gr.components.Textbox(lines=5, label="語料內容", show_copy_button=True)
        # Only components that exist in the layout may be listed as outputs.
        btn.click(_pick_random_case, inputs=[], outputs=[prompt, title])
        btn2.click(gen_template, inputs=[prompt, element, tag], outputs=[result])
    # Show a first case as soon as the page opens in the browser.
    demo.load(_pick_random_case, inputs=[], outputs=[prompt, title])

if __name__ == "__main__":
    # The verdict dataset needs a Hugging Face access token, taken from the
    # HUB_TOKEN environment variable. Read it once and fail with a clear message.
    use_auth_token = os.environ.get('HUB_TOKEN')
    if not use_auth_token:
        raise SystemExit("The HUB_TOKEN environment variable must be set to download the verdict dataset.")
    login(token=use_auth_token)
    # Download the verdict dataset and clean its crime-facts column.
    # NOTE(review): newer `datasets` releases appear to deprecate
    # `use_auth_token` in favour of `token` - confirm against the installed version.
    dataset = load_dataset("jslin09/Fraud_Case_Verdicts", use_auth_token=use_auth_token, revision="main")
    dataset = dataset.map(remove_space)
    demo.launch(share=True)  # share=True is needed when launching on a remote host.