๊น์ ํ
committed on
Commit · 6cc5f35
1 Parent(s): acd3b97
feat: upload
Browse files
- README.md +6 -5
- app.py +120 -0
- flagged/log.csv +5 -0
- requirements.txt +6 -0
README.md
CHANGED
@@ -1,13 +1,14 @@
 ---
-title: Onedoit
-emoji:
-colorFrom:
-colorTo:
+title: Onedoit V0.1
+emoji: ๐จ
+colorFrom: yellow
+colorTo: yellow
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.22.0
 app_file: app.py
 pinned: false
 license: apache-2.0
+python-version: 3.12.2
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,120 @@
+import gradio
+from peft import PeftModel, PeftConfig
+import re
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+import re
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from peft import PeftModel, PeftConfig
+
+# peft_model_id = "/home/afsd721/komt/output_total/checkpoint-1000"
+
+peft_model_id = "afsd721/onedoit"
+
+peft_config = PeftConfig.from_pretrained(peft_model_id)
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(peft_model_id, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, quantization_config=bnb_config)
+model = PeftModel.from_pretrained(model, peft_model_id)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device=device)
+model.eval()
+
+def preprocessing(text):
+    # Remove characters that can cause problems
+    bad_chars = {"\u200b": "", "…": " ... ", "\ufeff": ""}
+    for bad_char in bad_chars:
+        text = text.replace(bad_char, bad_chars[bad_char])
+
+    error_chars = {"\u3000": " ", "\u2009": " ", "\u2002": " ", "\xa0":" "}
+    for error_char in error_chars:
+        text = text.replace(error_char, error_chars[error_char])
+
+    # Remove email addresses
+    text = re.sub(r"[a-zA-Z0-9+-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", "[이메일]", text).strip()
+
+    # Remove "#word"-style tokens (hashtags)
+    text = re.sub(r"#\S+", "", text).strip()
+
+    # Remove "@word"-style tokens (mentions)
+    text = re.sub(r"@\w+", "", text).strip()
+
+    # Remove URLs
+    text = re.sub(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*", "[웹주소]", text).strip()
+    text = re.sub(r"pic\.(\w+\.)+\S*", "[웹주소]", text).strip()
+
+    # Remove news copyright notices
+    re_patterns = [
+        r"\<저작권자(\(c\)|ⓒ|©|\(Copyright\)|(\(c\))|(\(C\))).+?\>",
+        r"저작권자\(c\)|ⓒ|©|(Copyright)|(\(c\))|(\(C\))"
+    ]
+
+    for re_pattern in re_patterns:
+        text = re.sub(re_pattern, "", text).strip()
+
+    # Remove labels for images embedded in news articles
+    text = re.sub(r"\(출처 ?= ?.+\) |\(사진 ?= ?.+\) |\(자료 ?= ?.+\)| \(자료사진\) |사진=.+기자 ", "", text).strip()
+
+
+    # Replace punctuation that can cause problems
+    punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }
+    for p in punct_mapping:
+        text = text.replace(p, punct_mapping[p])
+
+    # Collapse consecutive whitespace
+    text = re.sub(r"\s+", " ", text).strip()
+
+    # Remove newline characters
+    text = text.replace('\n', '')
+
+    # Remove other tags
+    text = re.sub('<.+?>', '', text, 0, re.I|re.S)
+    return text
+
+
+def my_inference_function(input_text):
+    input_text = preprocessing(input_text)
+
+    generation_config = GenerationConfig(
+        temperature=0.8,
+        top_p=0.8,
+        top_k=100,
+        max_new_tokens=512,
+        early_stopping=True,
+        do_sample=True,
+    )
+    q = f"### instruction: {input_text}\n\n### Response: "
+    gened = model.generate(
+        **tokenizer(
+            q,
+            return_tensors='pt',
+            return_token_type_ids=False
+        ).to(device),
+        generation_config=generation_config,
+        pad_token_id=tokenizer.eos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        # streamer=streamer,
+    )
+    result_str = tokenizer.decode(gened[0])
+
+    start_tag = f"\n\n### Response: "
+    start_index = result_str.find(start_tag)
+
+    if start_index != -1:
+        result_str = result_str[start_index + len(start_tag):].strip()
+    result_str = preprocessing(result_str)
+
+    return result_str
+
+gradio_interface = gradio.Interface(
+    fn = my_inference_function,
+    inputs = "text",
+    outputs = "text"
+)
+gradio_interface.launch()
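
Note on the preprocessing step above: it is the densest part of app.py, so the short sketch below (not part of the commit) exercises a reduced set of its rules on a made-up sample string. The helper name clean, the sample sentence, and the addresses user@example.com / https://example.com are illustrative assumptions; the regexes and the [이메일] / [웹주소] placeholders are copied from preprocessing() as committed.

# Sketch only: a trimmed-down version of preprocessing() from app.py, applied to a made-up sample.
import re

def clean(text):
    # Replace email addresses with the placeholder used in app.py
    text = re.sub(r"[a-zA-Z0-9+-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", "[이메일]", text).strip()
    # Drop "#word" hashtags and "@word" mentions
    text = re.sub(r"#\S+", "", text).strip()
    text = re.sub(r"@\w+", "", text).strip()
    # Replace URLs with the web-address placeholder
    text = re.sub(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*", "[웹주소]", text).strip()
    # Collapse consecutive whitespace
    return re.sub(r"\s+", " ", text).strip()

sample = "문의는 user@example.com 으로, 상세 내용은 https://example.com 참고 #공지 @admin"
print(clean(sample))
# expected output: 문의는 [이메일] 으로, 상세 내용은 [웹주소] 참고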
flagged/log.csv
ADDED
@@ -0,0 +1,5 @@
+input_text,output,flag,username,timestamp
+"오늘 너무 힘들었어...
+운동도 갔다왔고 일도 힘들었고 오늘 너무 벅찬 하루다",왜 그렇게 힘드셨어요?,,,2024-03-26 21:33:30.619223
+"오늘 너무 힘들었어...
+운동도 갔다왔고 일도 힘들었고 오늘 너무 벅찬 하루다",앞으로 어떻게 하실 건가요?,,,2024-03-26 21:33:32.851444
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+torch
+transformers
+peft
+sentencepiece
+accelerate
+bitsandbytes
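
One caveat the dependency list surfaces: bitsandbytes 4-bit loading generally assumes a CUDA GPU, while app.py builds the BitsAndBytesConfig unconditionally and only afterwards falls back to CPU as the device. The sketch below shows a guarded load; the function name and the plain CPU fallback branch are assumptions for illustration, not the author's code.

# Sketch only: choose between the 4-bit quantized load used in app.py (GPU available)
# and a plain load (CPU only). The fallback branch is an assumption, not part of the commit.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_base_model(base_model_name: str):
    if torch.cuda.is_available():
        # Same 4-bit NF4 configuration as app.py
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        return AutoModelForCausalLM.from_pretrained(base_model_name, quantization_config=bnb_config)
    # CPU-only fallback: no bitsandbytes quantization (slower, uses more memory)
    return AutoModelForCausalLM.from_pretrained(base_model_name)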