File size: 6,442 Bytes
03e2e18
73e9623
 
 
 
 
 
 
 
 
 
 
 
a1c3187
73e9623
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01d0f93
 
73e9623
 
 
ac375bc
73e9623
 
 
 
 
5004c25
 
 
 
 
 
 
 
 
 
 
73e9623
 
 
 
5004c25
73e9623
 
 
 
a4b8d89
93e9b34
1ce7271
 
 
 
 
 
 
 
 
 
 
 
 
 
93e9b34
1ce7271
8cd3669
 
7e5814d
c8a0015
8cd3669
01d0f93
 
73e9623
 
 
 
01d0f93
73e9623
 
 
 
 
 
 
 
 
506c1b8
 
01d0f93
73e9623
 
 
5509fb4
613c848
73e9623
 
aadf949
73e9623
 
aadf949
a4b8d89
 
73e9623
 
aadf949
5509fb4
aadf949
5509fb4
01d0f93
73e9623
1a376b3
613c848
73e9623
 
65fddbf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import gradio as gr
from datasets import ClassLabel
from datasets import load_dataset
import random
import pandas as pd
from huggingface_hub import login

def remove_space(example):
    """Clean the 'Corpus-Delicti' (crime facts) field of one dataset record.

    Every ASCII space (used in the corpus as a word-segmentation delimiter) is
    removed. The text following the first '一、' item marker is kept, up to the
    next '一、' if there is one, and the '犯罪事實:' heading is stripped from it.
    No BOS/EOS tokens are added here.

    A record that contains no '一、' marker is kept whole instead of raising
    ``IndexError``, so one irregular record cannot abort a whole ``map`` pass.

    Args:
        example: Mapping with a string stored under the 'Corpus-Delicti' key.

    Returns:
        A dict holding the cleaned text under the 'Corpus-Delicti' key.
    """
    text = example['Corpus-Delicti'].replace(" ", "")
    segments = text.split('一、')
    # segments[0] is whatever precedes the first marker; when the marker is
    # absent the split yields a single segment, i.e. the full text.
    body = segments[1] if len(segments) > 1 else text
    return {'Corpus-Delicti': body.replace('犯罪事實:', '')}

def download_file(content, filename):
    """Write ``content`` to the file at ``filename`` as UTF-8 text.

    The file is opened in write mode, so existing contents are replaced.

    Args:
        content: Text to store.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        out.write(content)

def random_elements(dataset, num_examples=5):
    """Return ``num_examples`` distinct, randomly chosen rows as a DataFrame.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            numbers, and a ``features`` mapping of column name to feature type.
        num_examples: How many distinct rows to draw.

    Returns:
        A pandas DataFrame built from the selected rows. Columns whose feature
        type is a ``ClassLabel`` have their integer ids replaced by the
        matching entries of that feature's ``names``.

    Raises:
        ValueError: If ``num_examples`` exceeds the number of rows.
    """
    total = len(dataset)
    # Explicit check rather than ``assert``, which is stripped under
    # ``python -O`` and would leave oversized requests unguarded.
    if num_examples > total:
        raise ValueError("Can't pick more elements than there are in the dataset.")
    # One call yields distinct indices; no retry-on-duplicate loop is needed.
    # A non-positive request selects no rows.
    picks = random.sample(range(total), max(num_examples, 0))

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Bind the names list as a default so the lambda does not depend
            # on the loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    return df

def random_next(num_examples=5):
    """Sample random training cases and return the first one's display fields.

    Args:
        num_examples: Number of rows sampled from ``dataset["train"]``; only
            the first sampled row is used for the returned values.

    Returns:
        Tuple ``(court_name, case_no, crime_descrip, filename)``, where
        ``filename`` is the court name and case number joined by ``_`` with a
        ``.txt`` suffix.
    """
    sample = random_elements(dataset["train"], num_examples=num_examples)
    court_name, case_no, crime_descrip = (
        sample[column][0] for column in ('Court', 'CaseNo', 'Corpus-Delicti')
    )
    filename = "_".join((court_name, case_no)) + '.txt'
    return (court_name, case_no, crime_descrip, filename)

def gen_template(crime_descrip, element, tag):
    """Build an Alpaca-style annotation sample from the form fields.

    The result consists of an instruction blurb, a description section, an
    element section, a tag section and an end marker, separated by blank lines.

    Args:
        crime_descrip: Crime description text placed under "### Description:".
        element: Legal element text; a placeholder is used when it is falsy.
        tag: Element tag; a placeholder is used when it is falsy.

    Returns:
        The assembled template as a single string.
    """
    # Empty or missing fields fall back to visible placeholders.
    element_text = element or "<未填寫構成要件要素>"
    tag_text = tag or "<未選取構成要件要素標籤>"
    sections = (
        "The following is a description of the crime in the verdict. Write a response for the legal element of crime and its tag that appropriately completes the request.",
        f"### Description:\n{crime_descrip}",
        f"### Element:\n{element_text}",
        f"### Tag:\n{tag_text}",
        "### End",
    )
    return "\n\n".join(sections)

# JavaScript executed in the browser by the download button: it wraps `result`
# in a Blob and saves it under `filename` through a temporary <a> element.
# The anchor is removed and the object URL revoked afterwards so repeated
# downloads do not accumulate DOM nodes or blob URLs.
js_download = '''function downloadFile(result, filename) {
  //藉型別陣列建構的 blob 來建立 URL
  let fileName = filename;
  const data = result;
  let blob = new Blob([data], {
    type: "application/octet-stream",
  });
  var href = URL.createObjectURL(blob);
  // 從 Blob 取出資料
  var link = document.createElement("a");
  document.body.appendChild(link);
  link.href = href;
  link.download = fileName;
  link.click();
  // Clean up: drop the temporary anchor, then release the blob URL once the
  // browser has had time to start the download.
  document.body.removeChild(link);
  setTimeout(function () { URL.revokeObjectURL(href); }, 1000);
}
'''

# Download the verdict dataset. The Hugging Face Hub token needed for it is
# read from the HUB_TOKEN environment variable.
use_auth_token = os.environ['HUB_TOKEN']
login(token=use_auth_token, add_to_git_credential=True)
dataset = load_dataset(
    "jslin09/Fraud_Case_Verdicts",
    token=use_auth_token,
    revision="main",
).map(remove_space)

# Pick a case at random; its fields provide the form's initial values.
random_selected = random_next()
court_name, case_no, crime_descrip, filename = random_selected

# Annotation UI: the left column shows a randomly selected case and the
# element/tag inputs; the right column shows the generated corpus text and the
# generate/download buttons.
with gr.Blocks() as demo:
    gr.Markdown(
    """
    <h1 style="text-align: center;">Legal Document Annotation</h1>
    """)
    with gr.Row():
        with gr.Column(): # crime-facts section
            with gr.Row(): # header section
                # Court name and case number are created with visible=False;
                # they still serve as outputs of the random-pick button below.
                courtName = gr.Label(label='法院名稱', value=court_name, visible=False)
                caseNo = gr.Label(label='案號', value=case_no, visible=False)
                # NOTE: `filename` is rebound here from the initial string to the
                # Textbox component that displays it.
                filename = gr.components.Textbox(label='案號',value=filename, show_copy_button=True)
            prompt = gr.components.Textbox(lines=5, label='犯罪事實',value=crime_descrip)
            with gr.Row():
                with gr.Column():
                    btn = gr.Button("🎲 隨機選擇")
#                    btn4 = gr.ClearButton(value="清除標註內容",components=[result, element, tag])
            with gr.Row():
                element = gr.components.Textbox(lines=2, label="構成要件要素")
                # Each choice is a (display name, value) pair; with type='value'
                # the handler receives the "<LEO_...>" tag string.
                tag = gr.Dropdown(choices = [("被告(犯罪主體)","<LEO_SOC>"), ("主觀犯意", "<LEO_SLE>"), ("不法行為","<LEO_ACT>"), ("因果關係","<LEO_CAU>"),
                                         ("被害人/告訴人","<LEO_VIC>"), ("危害結果","<LEO_ROH>"), ("未遂","<LEO_ATP>"), ("既遂","<LEO_ACC>"),
                                         ("中止","<LEO_ABA>"), ("預備","<LEO_PRP>")], 
                                  label="標籤", info="構成要件要素的標籤", type='value')
        with gr.Column():
            result = gr.components.Textbox(lines=5, label="語料內容", show_copy_button=True)
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        btn2 = gr.Button("📖 產生標註語料內容")
                    with gr.Row():
                        btn3 = gr.Button("💾 下載")
        # random_next returns (court_name, case_no, crime_descrip, filename),
        # which is mapped positionally onto the four output components.
        btn.click(random_next, inputs=[], outputs=[courtName, caseNo, prompt, filename])
        # Build the annotation text from the description, element and tag.
        btn2.click(gen_template, inputs=[prompt, element, tag], outputs=[result])
        # No Python callback: the js_download snippet receives the two inputs.
        btn3.click(None, inputs=[result, filename], js=js_download)
#        btn4.click(None, components=[result, element, tag])

# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch() # When launching on a remote host, share=True is required.