import random

import gradio as gr
from transformers import AutoModelForCausalLM, BloomTokenizerFast, set_seed

model_name = "bloom-560m"
model = AutoModelForCausalLM.from_pretrained(f'jslin09/{model_name}-finetuned-fraud')
tokenizer = BloomTokenizerFast.from_pretrained(f'bigscience/{model_name}', bos_token='<s>', eos_token='</s>', pad_token='<pad>')
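# Both checkpoints are fetched from the Hugging Face Hub on first run. The tokenizer
# comes from the bigscience base model: fine-tuning reuses BLOOM's vocabulary, so the
# base tokenizer (with explicit BOS/EOS/PAD tokens) still applies to the tuned weights.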

def rnd_generate(prompt):
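    """Draft a full document from the prompt using a randomized seed.

    Re-seeding on every call means repeated clicks on "Random Drafting"
    produce different drafts for the same opening sentence.
    """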
    rnd_seed = random.randint(10, 500)
    set_seed(rnd_seed)  # Vary the sampling seed so each call produces a different draft.
    inputs = tokenizer(prompt, return_tensors="pt")  # Return PyTorch tensors; pass "tf" for TensorFlow tensors instead.
    results = model.generate(inputs["input_ids"],
                             attention_mask=inputs["attention_mask"],
                             max_length=500,
                             num_return_sequences=1,  # Return a single generated sequence.
                             do_sample=True,
                             temperature=0.75,  # Moderate randomness; lower values make output more deterministic.
                             top_k=50,
                             top_p=0.9)
    return tokenizer.decode(results[0], skip_special_tokens=True)  # Drop <s>, </s>, and <pad> from the output.

def generate(prompt):
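    """Extend the prompt by a short span (a few tokens): the "Next sentence" step."""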
    inputs = tokenizer(prompt, return_tensors="pt")  # Return PyTorch tensors; pass "tf" for TensorFlow tensors instead.
    result_length = inputs["input_ids"].shape[1] + 4  # max_length counts tokens, so extend the tokenized prompt by a few tokens.
    results = model.generate(inputs["input_ids"],
                             attention_mask=inputs["attention_mask"],
                             num_return_sequences=1,  # Only the first sequence is decoded, so one is enough.
                             max_length=result_length,
                             do_sample=True,
                             top_k=50,
                             top_p=0.9)
    return tokenizer.decode(results[0], skip_special_tokens=True)  # Drop <s>, </s>, and <pad> from the output.

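# Example prompts (kept in Chinese, since the model drafts Chinese legal text):
# opening sentences in the style of Taiwanese criminal judgments for fraud and
# theft cases, matching the corpus the model was fine-tuned on.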
examples = [
    ["闕很大明知金融帳戶之存摺、提款卡及密碼係供自己使用之重要理財工具,"],
    ["梅友乾明知其無資力支付酒店消費,亦無付款意願,竟意圖為自己不法之所有,"],
    ["瓊道帕意圖為自己不法所有,基於竊盜之犯意,"]
]

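# Placeholder hints for the two tabs, roughly: "Enter an opening sentence of a legal
# document and let the model generate the next sentence, or use one of the examples
# below." and "Enter an opening sentence and let the model draft the whole document."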
prompts = [
    ["輸入寫書類的句子,讓電腦生成下一句。或是按以下的範例句子。"],
    ["輸入寫書類的開頭句子,讓電腦隨機生成整篇草稿。"]
]

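# Layout: a two-column page. The left column shows the bilingual abstract; the right
# column holds two tabs that share the same example prompts but call different
# generation functions.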
with gr.Blocks() as demo:
    gr.Markdown(
    """
    <h1 style="text-align: center;">Legal Document Drafting</h1>
    """)
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            <h3>Abstract</h3>
            <p>
            With the development of large-scale language model technology, fine-tuning pre-trained large-scale language models to solve downstream natural language processing tasks has become a mainstream paradigm. However, training a language model in the legal domain requires a large number of legal documents so that the model can learn legal terminology and the particular format of legal documents, so training usually relies on large manually annotated data sets. In the legal domain, obtaining large manually annotated data sets is difficult in practice, which limits the application of traditional NLP methods to drafting legal documents. The experimental results of this paper show that it is feasible to fine-tune a large pre-trained language model on a local computer with a large number of annotation-free legal documents. Doing so not only significantly improves the performance of the fine-tuned model on the legal document drafting task but also provides a basis for automatic legal document drafting, offering new ideas and approaches while protecting information privacy and reducing information security risks.
            </p>
            <h3>摘要</h3>
            <p>
            隨著大型語言模型技術的發展,藉由微調預訓練的大型語言模型來解決自然語言處理的下游任務,已經是主流的範式。然而,訓練法律專業領域的語言模型,需要有大量的法律文件,以便讓語言模型能學得法律術語以及法律文書格式的特殊性,因此,通常需要依賴大量人工標註的資料集進行訓練,而在法律領域的應用,取得大量人工標註的資料集是有實際上的困難,這使得傳統的NLP方法應用在法律文件起草中的任務就受到了限制。本文實驗結果表明,以大量無標記的法律文件,在本地端電腦中微調大型預訓練語言模型來達成文件草稿生成任務的可行性。此外,除了顯著提高微調後所得之模型在法律文件起草任務上的性能之外,並為實現自動化法律文件起草提供了新的思路和方法,同時保障了資訊隱私以及降低資訊安全等問題。
            </p>
            """)
        with gr.Column(scale=1, min_width=600):
            with gr.Tab("Writing Assist"):
                result = gr.components.Textbox(lines=7, label="Writing Assist", show_label=True, placeholder=prompts[0][0])
                prompt = gr.components.Textbox(lines=2, label="Prompt", placeholder=examples[0][0], visible=False)
                gr.Examples(examples, label='Examples', inputs=[prompt])
                prompt.change(generate, inputs=[prompt], outputs=[result])  # Clicking an example fills the hidden prompt box, which triggers generation.
                btn = gr.Button("Next sentence")
                btn.click(generate, inputs=[result], outputs=[result])  # Feed the result back in so each click extends the draft.
            with gr.Tab("Random Generative"):
                result2 = gr.components.Textbox(lines=7, label="Random Generative", show_label=True, placeholder=prompts[1][0])
                gr.Examples(examples, label='Examples', inputs=[result2])
                rnd_btn = gr.Button("Random Drafting")
                rnd_btn.click(rnd_generate, inputs=[result2], outputs=[result2])  # Draft a full document from whatever is in the box.
    
if __name__ == "__main__":
    demo.launch()
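
# Run with `python app.py`; Gradio serves the app locally (by default at http://127.0.0.1:7860).
# For a temporary public link, launch() also accepts share=True.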