kinit-tomassako
committed on
Commit
•
eaef024
1
Parent(s):
dcfb42c
Add application file
Browse files- app.py +135 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from datetime import datetime
|
3 |
+
from pathlib import Path
|
4 |
+
from uuid import uuid4
|
5 |
+
from huggingface_hub import CommitScheduler
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import requests as rq
|
9 |
+
import os
|
10 |
+
import pandas as pd
|
11 |
+
import re
|
12 |
+
|
# NOTE(review): an empty CA bundle disables TLS certificate verification
# for curl-backed HTTP clients — security risk; presumably needed behind
# a corporate proxy, confirm before keeping.
os.environ['CURL_CA_BUNDLE'] = ''
# Text shown in the Gradio interface header.
description = "Vera - Claim detection"
# NOTE(review): `title` is defined but not passed to gr.Interface below.
title = "Vera - Claim detection & Summary"
#HF_TOKEN = os.getenv('ver_claimdetection_demo')
#hf_writer = gr.HuggingFaceDatasetSaver(HF_TOKEN, "ver_claimdetection_demo")

# Disabled: periodic upload of flagged examples to a HF dataset repo via
# CommitScheduler (kept for future re-enabling).
# JSON_DATASET_DIR = Path("json_dataset")
# JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
# JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"
#
# scheduler = CommitScheduler(
#     repo_id="ver_claimdetection_demo",
#     repo_type="dataset",
#     folder_path=JSON_DATASET_DIR,
#     path_in_repo="data",
# )
#
# def save_json(input: str, output: str) -> None:
#     with scheduler.lock:
#         with JSON_DATASET_PATH.open("a") as f:
#             json.dump({"input": input, "output": output, "datetime": datetime.now().isoformat()}, f)
#             f.write("\n")
def split_into_sentences(text: str) -> list[str]:
    """Split *text* into sentences using a rule-based regex approach.

    Protects common abbreviations (Mr., Dr., ...), acronyms, decimal
    numbers, web domains and "Ph.D." from being treated as sentence
    boundaries, then splits on ``.``, ``!`` and ``?``.

    Args:
        text: Raw input text; newlines are treated as spaces.

    Returns:
        List of sentences, each stripped of surrounding whitespace.
        A trailing empty fragment is dropped, so empty input yields [].
    """
    # All patterns are raw strings: "\s" in a plain string literal is an
    # invalid escape sequence (SyntaxWarning on Python 3.12+), even
    # though it happens to evaluate to the same text at runtime.
    alphabets = r"([A-Za-z])"
    prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
    starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = r"[.](com|net|org|io|gov|edu|me)"
    digits = r"([0-9])"
    multiple_dots = r"\.{2,}"

    # Pad with spaces so boundary patterns that anchor on " " also match
    # at the very start/end of the input.
    text = " " + text + " "
    text = text.replace("\n", " ")
    # Replace "protected" periods with the <prd> placeholder so they
    # survive the split; real boundaries are marked with <stop>.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)  # decimals: 3.14
    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)  # ellipses
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)  # single-letter initials
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    # Move terminal punctuation outside closing quotes so the <stop>
    # marker lands after the quotation mark.
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")
    # Mark every remaining terminator as a boundary, then restore the
    # protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences]
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences
|
71 |
+
|
72 |
+
def sentence_builder(input_text, claimdet_drp):
|
73 |
+
url = None
|
74 |
+
match claimdet_drp:
|
75 |
+
case "Mdeberta":
|
76 |
+
url = "https://gvyre1ron7.execute-api.eu-central-1.amazonaws.com/prod"
|
77 |
+
case "Xlm-Roberta":
|
78 |
+
url = "https://b3vbb5xexg.execute-api.eu-central-1.amazonaws.com/prod"
|
79 |
+
payload = "['"+input_text+"']"
|
80 |
+
headers = {
|
81 |
+
'Content-Type': 'text/plain'
|
82 |
+
}
|
83 |
+
response = rq.request("POST", url, headers=headers, data=payload)
|
84 |
+
# return f"""{claimdet_drp} response: """ + str(response.content)
|
85 |
+
res_text = f"""{claimdet_drp} response: """ + str(response.content)
|
86 |
+
try:
|
87 |
+
print(res_text)
|
88 |
+
print('Response type before: ' + str(type(response.content)))
|
89 |
+
ls = response.text.replace("[[", "")
|
90 |
+
ls = ls.replace("]]", "")
|
91 |
+
print('Response text: ' + ls)
|
92 |
+
df_claim = pd.DataFrame(ls.split(","), columns=['is_claim'])
|
93 |
+
print('Response type after: ' + str(type(df_claim)))
|
94 |
+
print('From list: ')
|
95 |
+
print(df_claim)
|
96 |
+
print('Sentences before split:' + input_text)
|
97 |
+
try:
|
98 |
+
sen = split_into_sentences(input_text)
|
99 |
+
print('Sentences after split:' + '#'.join(sen))
|
100 |
+
# print('Sen type after split: ' + str(type(sen)))
|
101 |
+
df_sen = pd.DataFrame(sen, columns=['sentence'])
|
102 |
+
print('Sentences dataframe:')
|
103 |
+
print(df_sen)
|
104 |
+
except Exception as e:
|
105 |
+
print(e)
|
106 |
+
df_sen = pd.DataFrame(columns=['sentence'])
|
107 |
+
except:
|
108 |
+
print('No dataframe of predictions in the model output')
|
109 |
+
df_claim = pd.DataFrame(columns=['is_claim'])
|
110 |
+
df_sen = pd.DataFrame(columns=['sentence'])
|
111 |
+
df = pd.concat([df_claim.reset_index(drop=True), df_sen], axis=1)
|
112 |
+
return res_text, df
|
113 |
+
|
114 |
+
# Demo UI: free-text input plus a model selector, producing the raw
# endpoint response and a per-sentence claim table.
_demo_inputs = [
    gr.Textbox(label="Input Text", max_lines=200),
    gr.Dropdown(["Mdeberta", "Xlm-Roberta"], label="Choose ClaimDetection model", info="Choose ClaimDetection model"),
]
_demo_outputs = [
    gr.Textbox(label="Output Text"),
    gr.Dataframe(label="Output Dataframe"),
]
# Each example pairs an input text with a model name.
_demo_examples = [
    ["The novel SARS-CoV-2 coronavirus that emerged in the city of Wuhan, China, last year and has since caused a large scale COVID-19 epidemic and spread to more than 70 other countries is the product of natural evolution, according to findings published today in the journal Nature Medicine. The analysis of public genome sequence data from SARS-CoV-2 and related viruses found no evidence that the virus was made in a laboratory or otherwise engineered. Coronaviruses are a large family of viruses that can cause illnesses ranging widely in severity. The first known severe illness caused by a coronavirus emerged with the 2003 Severe Acute Respiratory Syndrome (SARS) epidemic in China. A second outbreak of severe illness began in 2012 in Saudi Arabia with the Middle East Respiratory Syndrome (MERS). On December 31 of last year, Chinese authorities alerted the World Health Organization of an outbreak of a novel strain of coronavirus causing severe illness, which was subsequently named SARS-CoV-2. As of February 20, 2020, nearly 167,500 COVID-19 cases have been documented, although many more mild cases have likely gone undiagnosed. The virus has killed over 6,600 people. Shortly after the epidemic began, Chinese scientists sequenced the genome of SARS-CoV-2 and made the data available to researchers worldwide. The resulting genomic sequence data has shown that Chinese authorities rapidly detected the epidemic and that the number of COVID-19 cases have been increasing because of human to human transmission after a single introduction into the human population. Andersen and collaborators at several other research institutions used this sequencing data to explore the origins and evolution of SARS-CoV-2 by focusing in on several tell-tale features of the virus. The scientists analyzed the genetic template for spike proteins, armatures on the outside of the virus that it uses to grab and penetrate the outer walls of human and animal cells.", "Mdeberta"],
    ["I do not know. Of course we can. The novel SARS-CoV-2 coronavirus emerged in the city of Wuhan, China, last year.", "Xlm-Roberta"],
]

ver_claimdetection_demo = gr.Interface(
    sentence_builder,
    _demo_inputs,
    _demo_outputs,
    description=description,
    allow_flagging="manual",
    flagging_options=["Correct", "Incorrect", "Ambiguous"],
    examples=_demo_examples,
)

ver_claimdetection_demo.launch(share=True)
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
gradio==4.2.0
requests==2.31.0
pandas
huggingface_hub
|