import gradio as gr
from transformers import AutoModelForCausalLM
from transformers import BloomTokenizerFast
from transformers import set_seed
import random

# Load BLOOM-560m fine-tuned on fraud-related legal documents, together with
# the original BLOOM tokenizer and its special tokens.
model_name = "bloom-560m"
model = AutoModelForCausalLM.from_pretrained(f'jslin09/{model_name}-finetuned-fraud')
tokenizer = BloomTokenizerFast.from_pretrained(f'bigscience/{model_name}', bos_token='<s>', eos_token='</s>', pad_token='<pad>')


def rnd_generate(prompt):
    # Draft a whole document from the given opening sentence. A random seed is
    # drawn each time so repeated clicks produce different drafts.
    rnd_seed = random.randint(10, 500)
    set_seed(rnd_seed)
    inputs = tokenizer(prompt, return_tensors="pt")  # Return PyTorch tensors; pass "tf" instead for TensorFlow tensors.
    results = model.generate(inputs["input_ids"],
                             max_length=500,
                             num_return_sequences=1,  # Return a single generated sequence.
                             do_sample=True,
                             temperature=0.75,
                             top_k=50,
                             top_p=0.9)
    return tokenizer.decode(results[0])


def generate(prompt):
    # Suggest only a short continuation: cap the output at the prompt length plus a few tokens.
    result_length = len(prompt) + 4
    inputs = tokenizer(prompt, return_tensors="pt")  # Return PyTorch tensors; pass "tf" instead for TensorFlow tensors.
    results = model.generate(inputs["input_ids"],
                             num_return_sequences=2,  # Generate two candidate sequences; only the first is decoded below.
                             max_length=result_length,
                             early_stopping=True,
                             do_sample=True,
                             top_k=50,
                             top_p=0.9)
    return tokenizer.decode(results[0])


# Example opening sentences (in Chinese) from the factual part of criminal
# judgments; they seed the two demo tabs below.
examples = [
    ["闕很大明知金融帳戶之存摺、提款卡及密碼係供自己使用之重要理財工具,"],
    ["梅友乾明知其無資力支付酒店消費,亦無付款意願,竟意圖為自己不法之所有,"],
    ["瓊道帕意圖為自己不法所有,基於竊盜之犯意,"],
    ["周大膽在參與詐欺集團犯罪組織期間,與該詐欺集團不詳成員,意圖為自己不法之所有,共同基於行使偽造私文書、行使偽造特種文書、詐欺取財、洗錢之犯意聯絡,"],
    ["趙甲王基於行使偽造特種文書及詐欺取財之犯意,於"],
    ["范不停前因詐欺案件,經"],
    ["通訊王明知近來盛行以虛設、租賃、借用或買賣行動電話人頭門號之方式,供詐騙集團作為詐欺他人交付財物等不法用途,亦知行動電話門號在現代社會係個人對外聯繫之重要溝通工具,"]
]

# Placeholder hints (in Chinese) for the two tabs: enter a sentence of a legal
# document and let the model suggest the next one, or enter an opening sentence
# and let the model draft the whole document.
prompts = [
    ["輸入寫書類的句子,讓電腦生成下一句。或是按以下的範例句子按鈕。"],
    ["輸入寫書類的開頭句子,讓電腦隨機生成整篇草稿。"]
]


with gr.Blocks() as demo:
    gr.Markdown(
        """
<h1 style="text-align: center;">Legal Document Drafting</h1>
""")
    with gr.Row() as row:
        with gr.Column():
            gr.Markdown("""
<h3>Abstract</h3>
<p>
With the development of large language model technology, fine-tuning pre-trained large language models has become the mainstream paradigm for solving downstream natural language processing tasks. However, training a language model for the legal domain requires a large number of legal documents so that the model can learn legal terminology and the particular format of legal documents. Typical NLP approaches usually rely on large manually annotated data sets for training, yet such data sets are difficult to obtain in the legal domain, which limits the application of these approaches to the task of drafting legal documents. The experimental results of this paper show that a large pre-trained language model can be fine-tuned on a local computer, using a large number of unlabeled legal documents that require no Chinese word segmentation, to accomplish the legal document drafting task while protecting information privacy and improving information security.
</p>
<h3>摘要</h3>
<p>
隨著大型語言模型技術的發展,藉由微調預訓練的大型語言模型來解決自然語言處理的下游任務,已經是主流的範式。然而,訓練法律專業領域的語言模型,需要有大量的法律文件,以便讓語言模型能學得法律術語以及法律文書格式的特殊性。傳統NLP的做法,通常需要依賴大量人工標註的資料集進行訓練,而在法律領域的應用,取得大量人工標註的資料集是有實際上的困難,這使得傳統方法應用在法律文件起草的任務就受到了限制。本文實驗結果呈現,不僅能以大量無標記且無需中文斷詞的法律文件,更重要是能在本地端電腦中微調大型預訓練語言模型來達成法律文件草稿生成任務,並同時達到保障資訊隱私以及提高資訊安全等目的。
</p>
""")
        with gr.Column(scale=1, min_width=600):
            with gr.Tab("Writing Assist"):
                result = gr.components.Textbox(lines=7, label="Writing Assist", show_label=True, placeholder=prompts[0][0])
                prompt = gr.components.Textbox(lines=2, label="Prompt", placeholder=examples[0][0], visible=False)
                gr.Examples(examples, label='Examples', inputs=[prompt])
                prompt.change(generate, inputs=[prompt], outputs=[result])
                btn = gr.Button("Next sentence")
                btn.click(generate, inputs=[result], outputs=[result])
            with gr.Tab("Random Generative"):
                # result2 = gr.components.Textbox(lines=7, label="Random Generative", show_label=True, placeholder=prompts[1][0])
                result2 = gr.components.Textbox(lines=7, label="Random Generative", show_label=True, value=examples[0][0])
                gr.Examples(examples, label='Examples', inputs=[result2])
                rnd_btn = gr.Button("Random Drafting")
                rnd_btn.click(rnd_generate, inputs=[result2], outputs=[result2])

if __name__ == "__main__":
    demo.launch()
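

# ---------------------------------------------------------------------------
# The abstract above describes fine-tuning bloom-560m on a large collection of
# unlabeled legal documents on a local machine. The function below is only a
# minimal, hedged sketch of how such a causal-language-model fine-tuning run
# might look with the Hugging Face Trainer API; the corpus file name
# ("verdicts.txt"), the hyperparameters, and the output directory are
# illustrative assumptions, not the authors' actual training setup. It is
# never called by the app.
# ---------------------------------------------------------------------------
def finetune_sketch(train_file="verdicts.txt", output_dir="bloom-560m-finetuned-fraud"):
    from datasets import load_dataset
    from transformers import (AutoTokenizer, DataCollatorForLanguageModeling,
                              Trainer, TrainingArguments)

    base = "bigscience/bloom-560m"
    tok = AutoTokenizer.from_pretrained(base)
    lm = AutoModelForCausalLM.from_pretrained(base)

    # Plain-text corpus: one document (or paragraph) per line, with no labels
    # and no Chinese word segmentation required.
    raw = load_dataset("text", data_files={"train": train_file})["train"]
    tokenized = raw.map(lambda batch: tok(batch["text"], truncation=True, max_length=512),
                        batched=True, remove_columns=["text"])

    collator = DataCollatorForLanguageModeling(tok, mlm=False)  # causal-LM objective
    args = TrainingArguments(output_dir=output_dir,
                             per_device_train_batch_size=2,
                             num_train_epochs=3,
                             save_total_limit=1)
    Trainer(model=lm, args=args, train_dataset=tokenized, data_collator=collator).train()

    lm.save_pretrained(output_dir)
    tok.save_pretrained(output_dir)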