Spaces:

afsd721
/

onedoit

Build error

App Files Files Community

김정현 commited on Mar 26, 2024

Commit

6cc5f35

1 Parent(s): acd3b97

feat: upload

Browse files

Files changed (4) hide show

README.md +6 -5
app.py +120 -0
flagged/log.csv +5 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,13 +1,14 @@
 ---
-title: Onedoit
-emoji: 🐠
-colorFrom: purple
-colorTo: red
 sdk: gradio
-sdk_version: 4.23.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Onedoit V0.1
+emoji: 🐨
+colorFrom: yellow
+colorTo: yellow
 sdk: gradio
+sdk_version: 4.22.0
 app_file: app.py
 pinned: false
 license: apache-2.0
+python-version: 3.12.2
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import gradio
+from peft import PeftModel, PeftConfig
+import re
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
+import re
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from peft import PeftModel, PeftConfig
+# peft_model_id =  "/home/afsd721/komt/output_total/checkpoint-1000"
+peft_model_id = "afsd721/onedoit"
+peft_config = PeftConfig.from_pretrained(peft_model_id)
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(peft_model_id, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, quantization_config=bnb_config)
+model = PeftModel.from_pretrained(model, peft_model_id)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device=device)
+model.eval()
+def preprocessing(text):
+    # 문제를 일으킬 수 있는 문자 제거
+    bad_chars = {"\u200b": "", "…": " ... ", "\ufeff": ""}
+    for bad_char in bad_chars:
+        text = text.replace(bad_char, bad_chars[bad_char])
+    error_chars = {"\u3000": " ", "\u2009": " ", "\u2002": " ", "\xa0":" "}
+    for error_char in error_chars:
+        text = text.replace(error_char, error_chars[error_char])
+    # 이메일 제거
+    text = re.sub(r"[a-zA-Z0-9+-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", "[이메일]", text).strip()
+    # "#문자" 형식 어절 제거
+    text = re.sub(r"#\S+", "", text).strip()
+    # "@문자" 형식 어절 제거
+    text = re.sub(r"@\w+", "", text).strip()
+    # URL 제거
+    text = re.sub(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*", "[웹주소]", text).strip()
+    text = re.sub(r"pic\.(\w+\.)+\S*", "[웹주소]", text).strip()
+    # 뉴스 저작권 관련 텍스트 제거
+    re_patterns = [
+        r"\<저작권자(\(c\)|ⓒ|©|\(Copyright\)|(\(c\))|(\(C\))).+?\>",
+        r"저작권자\(c\)|ⓒ|©|(Copyright)|(\(c\))|(\(C\))"
+    ]
+    for re_pattern in re_patterns:
+        text = re.sub(re_pattern, "", text).strip()
+    # 뉴스 내 포함된 이미지에 대한 레이블 제거
+    text = re.sub(r"\(출처 ?= ?.+\) |\(사진 ?= ?.+\) |\(자료 ?= ?.+\)| \(자료사진\) |사진=.+기자 ", "", text).strip()
+    # 문제를 일으킬 수 있는 구두점 치환
+    punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }
+    for p in punct_mapping:
+        text = text.replace(p, punct_mapping[p])
+    # 연속된 공백 치환
+    text = re.sub(r"\s+", " ", text).strip()
+    # 개행 문자 "\n" 제거
+    text = text.replace('\n', '')
+    # 기타 태그 제거
+    text = re.sub('<.+?>', '', text, 0, re.I|re.S)
+    return text
+def my_inference_function(input_text):
+  input_text = preprocessing(input_text)
+  generation_config = GenerationConfig(
+      temperature=0.8,
+      top_p=0.8,
+      top_k=100,
+      max_new_tokens=512,
+      early_stopping=True,
+      do_sample=True,
+  )
+  q = f"### instruction: {input_text}\n\n### Response: "
+  gened = model.generate(
+      **tokenizer(
+          q,
+          return_tensors='pt',
+          return_token_type_ids=False
+      ).to(device),
+      generation_config=generation_config,
+      pad_token_id=tokenizer.eos_token_id,
+      eos_token_id=tokenizer.eos_token_id,
+      # streamer=streamer,
+  )
+  result_str = tokenizer.decode(gened[0])
+  start_tag = f"\n\n### Response: "
+  start_index = result_str.find(start_tag)
+  if start_index != -1:
+      result_str = result_str[start_index + len(start_tag):].strip()
+  result_str = preprocessing(result_str)
+  return result_str
+gradio_interface = gradio.Interface(
+  fn = my_inference_function,
+  inputs = "text",
+  outputs = "text"
+)
+gradio_interface.launch()

flagged/log.csv ADDED Viewed

	@@ -0,0 +1,5 @@

+input_text,output,flag,username,timestamp
+"오늘 너무 힘들었어...
+운동도 갔다왔고 일도 힘들었고 오늘 너무 벅찬 하루네",왜 그렇게 힘드셨어요?,,,2024-03-26 21:33:30.619223
+"오늘 너무 힘들었어...
+운동도 갔다왔고 일도 힘들었고 오늘 너무 벅찬 하루네",앞으로 어떻게 하실 건가요?,,,2024-03-26 21:33:32.851444

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+transformers
+peft
+sentencepiece
+accelerate
+bitsandbytes