# Source: app.py by PirateXX (commit 52a6e1f, "Update app.py"), 3.06 kB.
from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import gradio as gr
import os
import re
import pdfplumber
# Flask application object. It is not referenced again in the visible code.
app = Flask(__name__)
# Hugging Face access token read from the environment; a missing ACCESS_TOKEN
# variable raises KeyError as soon as this module is imported.
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]
# Load the configuration and the sequence-classification weights from the
# "PirateXX/ChatGPT-Text-Detector" repository, passing the token for access.
config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN, config = config)
# The tokenizer is loaded from the stock "roberta-base" checkpoint rather than
# from the detector repository.
model_name = "roberta-base"
# NOTE(review): map_location looks like a torch.load option, not a tokenizer
# argument — presumably it has no effect here; confirm and consider removing.
tokenizer = RobertaTokenizer.from_pretrained(model_name, map_location=torch.device('cpu'))
# function to break text into an array of sentences
def text_to_sentences(text):
    """Split *text* into sentence fragments at '.', '!' and '?'.

    A space is first inserted after any sentence-ending punctuation mark that
    is immediately followed by a non-whitespace character, so that fragments
    which were glued together ("end.Start") still begin with a separating
    space once the punctuation is removed by the split.

    Args:
        text: The input string.

    Returns:
        A list of fragments with the punctuation marks removed. Fragments
        keep their leading whitespace, and a trailing empty string is present
        when *text* ends with a punctuation mark.
    """
    # re.sub returns a new string (str is immutable); the result has to be
    # kept, otherwise the normalisation silently does nothing.
    spaced = re.sub(r'(?<=[.!?])(?=[^\s])', r' ', text)
    return re.split(r'[.!?]', spaced)
# function to concatenate sentences into chunks of size 600 or less
def chunks_of_600(text, chunk_size=600):
    """Group the sentences of *text* into chunks of at most *chunk_size* chars.

    Sentences (as produced by ``text_to_sentences``) are appended to the
    current chunk while the combined length stays within *chunk_size*; once a
    sentence would overflow it, the current chunk is emitted and a new one is
    started with that sentence.

    A single sentence longer than *chunk_size* is not split: it becomes a
    chunk of its own that exceeds the limit.

    Args:
        text: The input string.
        chunk_size: Maximum chunk length in characters (default 600).

    Returns:
        A list of non-empty chunk strings, in input order. The list is empty
        when *text* yields no characters at all.
    """
    chunks = []
    current_chunk = ""
    for sentence in text_to_sentences(text):
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence
            continue
        # The sentence does not fit: flush what has been collected so far.
        # Skip the flush when nothing was collected (e.g. the very first
        # sentence is already too long) so no empty chunk is emitted.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = sentence
    # Emit the trailing chunk, again avoiding an empty entry.
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def predict(query, device="cpu"):
    """Run the classifier on *query* and return its "real" probability.

    The text is tokenized, truncated so that it fits the tokenizer's maximum
    length once BOS/EOS are added, and passed through the module-level
    ``model`` without gradient tracking. The logits are soft-maxed into two
    probabilities, unpacked as ``(fake, real)``.

    Args:
        query: The text to classify.
        device: Device the input tensors are moved to (default "cpu"). The
            model itself is not moved by this function.

    Returns:
        The second of the two class probabilities (``real``) as a float.
    """
    # NOTE(review): recent transformers versions add special tokens in
    # encode() by default, which together with the explicit BOS/EOS below
    # would duplicate them — confirm against the pinned transformers version
    # before changing, since it affects the model's output.
    tokens = tokenizer.encode(query)
    # Reserve two positions for the BOS and EOS tokens added below.
    tokens = tokens[:tokenizer.model_max_length - 2]
    tokens = torch.tensor(
        [tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]
    ).unsqueeze(0)
    # Single un-padded sequence: every position is attended to.
    mask = torch.ones_like(tokens)
    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)
    _fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real
def findRealProb(text):
    """Estimate how likely *text* is "real" by averaging per-chunk predictions.

    The text is split with ``chunks_of_600``; each chunk is scored with
    ``predict`` and the scores are combined into an average weighted by chunk
    length.

    Args:
        text: The full text to analyse.

    Returns:
        A dict with keys "Real" (weighted probability), "Fake"
        (``1 - Real``), "results" (list of ``[probability, chunk_length]``
        pairs) and "text" (the input). When there is no text to score, a
        dict with a single "error" key is returned instead.
    """
    results = []
    for chunk in chunks_of_600(text):
        output = predict(chunk)
        print(chunk)
        print("-----------------------------------")
        results.append([output, len(chunk)])

    # Normalise by the characters that were actually scored. The chunks do
    # not contain the punctuation removed while splitting, so dividing by
    # len(text) would make the weights sum to less than 1 and bias "Real"
    # downwards.
    total_length = sum(length for _, length in results)
    if total_length == 0:
        # Nothing to score (empty input); avoid a division by zero.
        return {"error": 'No text found to analyze'}

    weighted_sum = sum(prob * length for prob, length in results)
    realProb = weighted_sum / total_length
    return {"Real": realProb, "Fake": 1-realProb, "results": results, "text": text}
def upload_file(file):
    """Extract text from an uploaded PDF and score it with ``findRealProb``.

    Args:
        file: The uploaded file object; its ``name`` attribute is used as the
            path handed to pdfplumber. A falsy value means nothing was
            uploaded.

    Returns:
        The dict produced by ``findRealProb`` for the extracted text, or
        ``{"error": ...}`` when no file was supplied.
    """
    if not file:
        return {"error":'No PDF file found in request'}

    pdf_file = file.name
    print(file, pdf_file)

    # Only the first pages are analysed; the limit matches the original
    # counter loop, which processed six pages before stopping.
    max_pages = 6
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() can yield None for a page without extractable text,
        # so fall back to an empty string instead of failing on concatenation.
        page_texts = [
            page.extract_text(x_tolerance=1) or ""
            for page in pdf.pages[:max_pages]
        ]

    # Join pages with a line break so the last word of one page is not glued
    # to the first word of the next, then flatten all line breaks to spaces.
    text = "\n".join(page_texts).replace('\n', ' ')
    return findRealProb(text)
# Gradio interface: a single file-upload input is passed to upload_file and
# the dict it returns is shown through a JSON output component. The `article`
# HTML snippet links to an external site.
# NOTE(review): gr.outputs.JSON() and interpretation= appear to be legacy
# Gradio APIs — confirm they exist in the pinned gradio version.
demo = gr.Interface(
fn=upload_file,
inputs=gr.File(),
article = "Visit <a href = \"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
outputs=gr.outputs.JSON(),
interpretation="default",)
# Launch the interface as soon as the module is executed (there is no
# __main__ guard); show_api=False is passed through to launch().
demo.launch(show_api=False)