김정현 committed on
Commit
6cc5f35
·
1 Parent(s): acd3b97

feat: upload

Browse files
Files changed (4) hide show
  1. README.md +6 -5
  2. app.py +120 -0
  3. flagged/log.csv +5 -0
  4. requirements.txt +6 -0
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
- title: Onedoit
3
- emoji: 🏠
4
- colorFrom: purple
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.23.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Onedoit V0.1
3
+ emoji: 🐨
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 4.22.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ python-version: 3.12.2
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio
2
+ from peft import PeftModel, PeftConfig
3
+ import re
4
+ import torch
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
6
+ import re
7
+ import torch
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
9
+ from peft import PeftModel, PeftConfig
10
+
11
+ # peft_model_id = "/home/afsd721/komt/output_total/checkpoint-1000"
12
+
13
# Hub repository that holds the fine-tuned PEFT adapter; the tokenizer is
# loaded from the same repository.
peft_model_id = "afsd721/onedoit"

# The adapter config names the base checkpoint the adapter is applied to.
peft_config = PeftConfig.from_pretrained(peft_model_id)

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(peft_model_id, trust_remote_code=True)

# Load the quantized base model first, then wrap it with the adapter weights.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        peft_config.base_model_name_or_path,
        quantization_config=bnb_config,
    ),
    peft_model_id,
)

# Inputs are moved to this device before generation.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# NOTE(review): moving a bitsandbytes-quantized model with .to() may be
# redundant or unsupported depending on library versions — confirm.
model.to(device=device)
model.eval()
28
+
29
def preprocessing(text):
    """Clean raw text before it is sent to, or after it comes from, the model.

    Steps, in order: drop zero-width characters and normalize unusual spaces,
    mask e-mail addresses and URLs with placeholders, remove hashtags and
    @-mentions, strip news copyright notices and image captions, map special
    punctuation to ASCII, collapse whitespace, and remove ``<...>`` tags.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Remove characters that are known to cause problems.
    bad_chars = {"\u200b": "", "…": " ... ", "\ufeff": ""}
    for bad_char, replacement in bad_chars.items():
        text = text.replace(bad_char, replacement)

    # Replace exotic space characters with a plain space.
    error_chars = {"\u3000": " ", "\u2009": " ", "\u2002": " ", "\xa0": " "}
    for error_char, replacement in error_chars.items():
        text = text.replace(error_char, replacement)

    # Mask e-mail addresses. The hyphen is escaped so that "+-_" is not read
    # as a character range (which would also match ",", "/", ":", "<", "@"...).
    text = re.sub(
        r"[a-zA-Z0-9+\-_.]+@[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-.]+", "[이메일]", text
    ).strip()

    # Remove "#word" tokens (hashtags).
    text = re.sub(r"#\S+", "", text).strip()

    # Remove "@word" tokens (mentions).
    text = re.sub(r"@\w+", "", text).strip()

    # Mask URLs.
    text = re.sub(
        r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*", "[웹주소]", text
    ).strip()
    text = re.sub(r"pic\.(\w+\.)+\S*", "[웹주소]", text).strip()

    # Remove news copyright notices.
    re_patterns = [
        r"\<저작권자(\(c\)|ⓒ|©|\(Copyright\)|(\(c\))|(\(C\))).+?\>",
        r"저작권자\(c\)|ⓒ|©|(Copyright)|(\(c\))|(\(C\))",
    ]
    for re_pattern in re_patterns:
        text = re.sub(re_pattern, "", text).strip()

    # Remove labels attached to images embedded in news articles
    # (source / photo / material captions and "photo=... reporter" bylines).
    text = re.sub(
        r"\(출처 ?= ?.+\) |\(사진 ?= ?.+\) |\(자료 ?= ?.+\)| \(자료사진\) |사진=.+기자 ",
        "",
        text,
    ).strip()

    # Map punctuation and symbols that tend to cause problems to ASCII.
    punct_mapping = {
        "‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm",
        "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'",
        "_": "-", "`": "'", "“": '"', "”": '"', "£": "e", "∞": "infinity",
        "θ": "theta", "÷": "/", "α": "alpha", "•": ".", "à": "a", "−": "-",
        "β": "beta", "∅": "", "³": "3", "π": "pi",
    }
    for symbol, replacement in punct_mapping.items():
        text = text.replace(symbol, replacement)

    # Collapse every run of whitespace (newlines included) into one space.
    text = re.sub(r"\s+", " ", text).strip()

    # Remove any remaining <...> tags.
    text = re.sub(r"<.+?>", "", text, flags=re.S)
    return text
79
+
80
+
81
def my_inference_function(input_text):
    """Generate a model reply for a single user message.

    The message is cleaned with ``preprocessing``, wrapped in the
    ``### instruction`` / ``### Response`` prompt template and passed to
    ``model.generate`` with sampling enabled. Only the newly generated tokens
    are decoded, and the decoded reply is cleaned with ``preprocessing`` again.

    Args:
        input_text: Raw text entered by the user.

    Returns:
        The cleaned reply text.
    """
    input_text = preprocessing(input_text)

    # early_stopping is a beam-search option; it is omitted because this
    # configuration samples and does not set num_beams.
    generation_config = GenerationConfig(
        temperature=0.8,
        top_p=0.8,
        top_k=100,
        max_new_tokens=512,
        do_sample=True,
    )
    prompt = f"### instruction: {input_text}\n\n### Response: "
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    gened = model.generate(
        **encoded,
        generation_config=generation_config,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # For a causal LM the output sequence starts with the prompt tokens, so
    # slicing them off isolates the reply without searching the decoded text
    # for the response tag (which fails silently if the tag does not survive
    # the tokenizer round trip).
    prompt_length = encoded["input_ids"].shape[1]
    reply = tokenizer.decode(gened[0][prompt_length:], skip_special_tokens=True)

    return preprocessing(reply.strip())
114
+
115
# Minimal text-in / text-out web UI around the inference function; launch()
# starts serving as soon as this module is executed.
gradio_interface = gradio.Interface(fn=my_inference_function, inputs="text", outputs="text")
gradio_interface.launch()
flagged/log.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ input_text,output,flag,username,timestamp
2
+ "오늘 너무 힘들었어...
3
+ 운동도 갔다왔고 일도 힘들었고 오늘 너무 벅찬 하루네",왜 그렇게 힘드셨어요?,,,2024-03-26 21:33:30.619223
4
+ "오늘 너무 힘들었어...
5
+ 운동도 갔다왔고 일도 힘들었고 오늘 너무 벅찬 하루네",앞으로 어떻게 하실 건가요?,,,2024-03-26 21:33:32.851444
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ peft
4
+ sentencepiece
5
+ accelerate
6
+ bitsandbytes