File size: 8,597 Bytes
278155c c7fe332 278155c 6b44742 278155c 6b44742 2a3ba09 278155c c7fe332 278155c 2a3ba09 278155c 2a3ba09 278155c 2a3ba09 c7fe332 2a3ba09 c7fe332 2a3ba09 278155c 2a3ba09 278155c 2a3ba09 278155c c046ef2 278155c 42b6795 278155c e243e90 c046ef2 278155c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
class TextDetectionApp:
    """Classify text as generated or human-written with several models.

    Four sequence-classification checkpoints (DeBERTa, RoBERTa, BERT and
    DistilBERT) can be queried on their own. The 'DAIGT-Model' choice feeds
    the RoBERTa and DeBERTa class scores into a TorchScript feedforward
    model. For every classifier, class index 0 is reported as the
    human-written score and class index 1 as the generated score.
    """

    # Attribute prefix -> Hugging Face checkpoint, in load order.
    _CHECKPOINTS = {
        "deberta": "zeyadusf/deberta-DAIGT-MODELS",
        "roberta": "zeyadusf/roberta-DAIGT-kaggle",
        "bert": "zeyadusf/bert-DAIGT-MODELS",
        "distilbert": "zeyadusf/distilbert-DAIGT-MODELS",
    }
    # UI model choice -> attribute prefix of the matching tokenizer/model.
    _TRANSFORMER_CHOICES = {
        "DeBERTa": "deberta",
        "RoBERTa": "roberta",
        "BERT": "bert",
        "DistilBERT": "distilbert",
    }
    _ENSEMBLE_CHOICE = "DAIGT-Model"
    _FF_MODEL_PATH = "model_scripted.pt"

    def __init__(self):
        """Load every tokenizer/model pair and the feedforward model.

        Creates ``<prefix>_tokenizer`` and ``<prefix>_model`` attributes for
        ``deberta``, ``roberta``, ``bert`` and ``distilbert``, plus
        ``ff_model`` loaded from ``model_scripted.pt``.
        """
        for prefix, checkpoint in self._CHECKPOINTS.items():
            setattr(self, f"{prefix}_tokenizer", AutoTokenizer.from_pretrained(checkpoint))
            setattr(
                self,
                f"{prefix}_model",
                AutoModelForSequenceClassification.from_pretrained(checkpoint),
            )
        # Load Feedforward model
        self.ff_model = torch.jit.load(self._FF_MODEL_PATH)

    @staticmethod
    def _class_probabilities(tokenizer, model, text):
        """Tokenize *text*, run *model* and return the softmax scores of the first sequence."""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        # Inference only: no autograd graph is needed for the forward pass.
        with torch.no_grad():
            logits = model(**inputs).logits
        return torch.softmax(logits, dim=1)[0]

    @staticmethod
    def _as_label_scores(scores):
        """Convert a 1-D score tensor into ``{"label": "LABEL_<i>", "score": float}`` dicts."""
        return [
            {"label": f"LABEL_{i}", "score": score}
            for i, score in enumerate(scores.tolist())
        ]

    @staticmethod
    def _label0_first(predictions):
        """Return ``[LABEL_0 score, other score]`` from the first two prediction dicts."""
        head, tail = predictions[0], predictions[1]
        if head["label"] != "LABEL_0":
            head, tail = tail, head
        return [head["score"], tail["score"]]

    def api_huggingface(self, text):
        """
        Generate predictions using the DeBERTa and RoBERTa models.

        Args:
            text (str): The input text to classify.

        Returns:
            tuple: ``(roberta_predictions, deberta_predictions)``. Each item
            is a list of ``{"label": "LABEL_<i>", "score": float}`` dicts,
            one per output class, in class-index order.
        """
        # DeBERTa predictions
        deberta_scores = self._class_probabilities(
            self.deberta_tokenizer, self.deberta_model, text
        )
        # RoBERTa predictions
        roberta_scores = self._class_probabilities(
            self.roberta_tokenizer, self.roberta_model, text
        )
        return self._as_label_scores(roberta_scores), self._as_label_scores(deberta_scores)

    def generate_ff_input(self, models_results):
        """
        Generate input features for the Feedforward model.

        Args:
            models_results (tuple): ``(roberta_predictions,
                deberta_predictions)`` as returned by ``api_huggingface``.

        Returns:
            torch.Tensor: float32 tensor of shape ``(1, 4)`` laid out as
            ``[roberta LABEL_0, roberta LABEL_1, deberta LABEL_0,
            deberta LABEL_1]``.

        Raises:
            ValueError: If either prediction list lacks two entries with
                ``'label'`` and ``'score'`` keys.
        """
        roberta, deberta = models_results
        try:
            features = self._label0_first(roberta) + self._label0_first(deberta)
        except (IndexError, KeyError, TypeError) as err:
            # An incomplete feature vector must not reach the feedforward
            # model, so fail loudly instead of continuing with partial input.
            raise ValueError(f"Malformed model predictions: {err!r}") from err
        return torch.tensor(features, dtype=torch.float32).view(1, -1)

    def detect_text(self, text):
        """
        Score the input text with the Feedforward model.

        Args:
            text (str): The input text to classify.

        Returns:
            float: Element ``[0][0]`` of the Feedforward model output;
            ``classify_text`` treats values above 0.5 as generated text.
        """
        features = self.generate_ff_input(self.api_huggingface(text))
        with torch.no_grad():
            return self.ff_model(features)[0][0].item()

    def classify_text(self, text, model_choice):
        """
        Classify the input text using the selected model.

        Args:
            text (str): The input text to classify.
            model_choice (str): The model to use ('DeBERTa', 'RoBERTa',
                'BERT', 'DistilBERT', or 'DAIGT-Model').

        Returns:
            str: The predicted label followed by both scores as percentages,
            ``"Invalid model selection."`` for an unknown choice, or a prompt
            to enter text when the input is missing or blank.
        """
        prefix = (
            self._TRANSFORMER_CHOICES.get(model_choice)
            if isinstance(model_choice, str)
            else None
        )
        if prefix is None and model_choice != self._ENSEMBLE_CHOICE:
            return "Invalid model selection."

        # A missing or blank input carries nothing to classify.
        if text is None or (isinstance(text, str) and not text.strip()):
            return "Please enter some text to classify."

        if prefix is None:
            # Run feedforward detection
            generated_score = self.detect_text(text)
            human_written_score = 1 - generated_score
        else:
            scores = self._class_probabilities(
                getattr(self, f"{prefix}_tokenizer"),
                getattr(self, f"{prefix}_model"),
                text,
            )
            generated_score = scores[1].item()
            human_written_score = scores[0].item()

        label = "Generated Text" if generated_score > 0.5 else "Human-Written"
        return (
            f"{label} ({generated_score * 100:.2f}% Generated, "
            f"{human_written_score * 100:.2f}% Human-Written)"
        )
# Description shown in the interface; it contains a Markdown link to the
# project repository.
dec = """Classify text as generated or human-written using DeBERTa, RoBERTa, BERT, DistilBERT, or ensemble (RoBERTa and DeBERTa) with custom Feedforward model 'DAIGT-Model'.
\n\nYou can see more details at [DAIGT-Catch-the-AI GitHub Repository](https://github.com/zeyadusf/DAIGT-Catch-the-AI/tree/main)
"""

# Initialize the app (loads every model once, before the UI starts).
app = TextDetectionApp()

# Gradio Interface: a text box plus a model selector, returning the
# classification string produced by ``app.classify_text``.
iface = gr.Interface(
    fn=app.classify_text,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your text here..."),
        gr.Radio(
            choices=["DeBERTa", "RoBERTa", "BERT", "DistilBERT", "DAIGT-Model"],
            label="Model Choice",
        ),
    ],
    outputs="text",
    title="Detection of AI Generated Text with Multiple Models",
    description=dec,
)
iface.launch()
|