File size: 5,854 Bytes
e8af4f9
2cf8c8c
 
 
 
 
52775c0
257baa7
52775c0
2cf8c8c
 
 
 
 
 
 
 
 
9123c31
e8af4f9
 
 
 
2cf8c8c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8af4f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d0defb
 
e8af4f9
5d0defb
 
 
 
 
 
 
 
e8af4f9
 
2cf8c8c
e8af4f9
2cf8c8c
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import gradio as gr
from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import os
import re
from tqdm import tqdm
from typing import List
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
app = Flask(__name__)

# Hugging Face access token for the detector checkpoint; the process
# fails fast (KeyError) at startup if ACCESS_TOKEN is not set.
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]
# ChatGPT-detector: a RoBERTa sequence classifier (2 labels: fake/real,
# see predict() which unpacks the probabilities in that order).
config = RobertaConfig.from_pretrained("PirateXX/ChatGPT_Detector", use_auth_token= ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT_Detector", use_auth_token= ACCESS_TOKEN, config = config)

# Tokenizer for the detector model (base roberta vocabulary).
# NOTE(review): `map_location` is a torch.load() argument, not a
# tokenizer option — it is most likely ignored here; confirm and remove.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name, map_location=torch.device('cpu'))

# Plain GPT-2 on CPU, used only by the /getPerplexities endpoint.
device = 'cpu'
model_id = "gpt2"
modelgpt2 = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizergpt2 = GPT2TokenizerFast.from_pretrained(model_id)

def text_to_sentences(text):
    """Split *text* into sentences.

    A boundary is one or more spaces preceded by '.'/'?' (not directly
    after a capital letter, to spare abbreviations like "U.S.") and
    followed by a capital letter. Newlines are flattened to spaces first.
    """
    flattened = text.replace('\n', ' ')
    boundary = r'(?<=[^A-Z].[.?]) +(?=[A-Z])'
    return re.split(boundary, flattened)

# function to concatenate sentences into chunks of size 900 or less
def chunks_of_900(text, chunk_size=900):
    """Greedily pack the sentences of *text* into chunks of at most
    *chunk_size* characters (sentences joined by a single space).

    A single sentence longer than *chunk_size* becomes its own oversized
    chunk. Returns a list of non-empty chunks; empty input yields [].

    Fixes over the previous version: no empty "" chunk is emitted when
    the first sentence already exceeds the limit (or the text is empty),
    and the joining space is counted so chunks never exceed chunk_size
    by one character.
    """
    # Sentence splitting inlined (same pattern as text_to_sentences) so
    # the function is self-contained.
    sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', text.replace('\n', ' '))
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Candidate includes the joining space, so the length check is exact.
        candidate = current_chunk + " " + sentence if current_chunk else sentence
        if len(candidate) <= chunk_size:
            current_chunk = candidate
        else:
            if current_chunk:  # never emit an empty chunk
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
    
def predict(query, device="cpu"):
    """Score *query* with the detector model.

    Returns the probability (float in [0, 1]) that the text is
    human-written — the second of the classifier's two softmax outputs.
    """
    # Truncate so the BOS/EOS special tokens still fit in the context.
    token_ids = tokenizer.encode(query)[:tokenizer.model_max_length - 2]
    input_ids = torch.tensor(
        [[tokenizer.bos_token_id, *token_ids, tokenizer.eos_token_id]]
    )
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        logits = model(input_ids.to(device), attention_mask=attention_mask.to(device))[0]
        probabilities = logits.softmax(dim=-1)

    fake_prob, real_prob = probabilities.cpu().flatten().tolist()
    return real_prob

def findRealProb(text):
    """Score *text* by averaging per-chunk detector probabilities,
    weighted by chunk length.

    Returns a pair: ({"Real": p, "Fake": 1 - p}, results) where each
    entry of `results` is [chunk_probability, chunk_length].

    Fix: previously an input that produced no non-empty chunks (e.g. an
    empty string) crashed with ZeroDivisionError; such input now gets a
    neutral 50/50 verdict.
    """
    results = []
    for chunk in chunks_of_900(text):
        if not chunk:  # defensive: never score (or weight by) an empty chunk
            continue
        results.append([predict(chunk), len(chunk)])

    total_length = sum(length for _, length in results)
    if total_length == 0:
        # Nothing scorable — report a neutral verdict instead of raising.
        return {"Real": 0.5, "Fake": 0.5}, results

    weighted_sum = sum(prob * length for prob, length in results)
    realProb = weighted_sum / total_length
    return {"Real": realProb, "Fake": 1 - realProb}, results


def text_to_sentences(text):
    """Split *text* into sentences on spaces that follow '.' or '?' and
    precede a capital letter (regex heuristic; misses '!'-terminated
    sentences).

    NOTE(review): exact duplicate of the `text_to_sentences` defined
    earlier in this file — this redefinition shadows it with identical
    behavior. Consider removing one copy.
    """
    # Flatten newlines so the boundary regex can match across lines.
    clean_text = text.replace('\n', ' ')
    return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', clean_text)

def calculatePerplexity(text):
    """Compute the GPT-2 perplexity of *text* with a sliding window.

    Windows of up to the model's context size advance by `stride`
    tokens; overlapping tokens are masked out of the loss so each token
    is scored at most once. Returns perplexity as a Python float
    (lower = more predictable to GPT-2).
    """
    encodings = tokenizergpt2("\n\n".join([text]), return_tensors="pt")
    max_length = modelgpt2.config.n_positions
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []  # per-window negative log-likelihoods (scaled to a sum over new tokens)
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        # Only the tokens not already scored by the previous window
        # contribute to this window's loss.
        trg_len = end_loc - prev_end_loc
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # -100 targets are ignored by the LM loss: mask the overlap.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = modelgpt2(input_ids, labels=target_ids)
            # outputs.loss is the mean NLL over unmasked targets; scale
            # by trg_len to approximate their summed NLL.
            neg_log_likelihood = outputs.loss * trg_len

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    # Perplexity = exp(total NLL / total tokens scored). `end_loc` here
    # equals seq_len after the loop.
    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)

    return ppl.item()
    
@app.post("/getPerplexities")
async def calculate_perplexities():
    """Flask route: score every sentence of each posted text by GPT-2
    perplexity.

    Expects a JSON array of strings as the request body. Returns a list
    of {"sentence", "perplexity", "label"} dicts, labelling a sentence
    "AI" when its perplexity is below 25 and "Human" otherwise.
    """
    # Bug fix: Flask does not inject the request body as a view argument
    # (that is FastAPI behaviour), so the previous `data: List[str]`
    # parameter was never supplied and every request raised a TypeError.
    # Read the JSON body explicitly instead.
    data = request.get_json()
    perplexities = []
    for text in data:
        for sentence in text_to_sentences(text):
            perplexity = calculatePerplexity(sentence)
            # Low perplexity (highly predictable to GPT-2) is treated as
            # a sign of machine-generated text.
            label = "AI" if perplexity < 25 else "Human"
            perplexities.append(
                {"sentence": sentence, "perplexity": perplexity, "label": label}
            )
    return perplexities

# Gradio UI for the chunked detector.
# Fix: `gr.outputs.JSON()` is the deprecated pre-Blocks output API; use
# the `gr.JSON` component, consistent with the `gr.Textbox` input above.
demo = gr.Interface(
        fn=findRealProb,
        inputs=gr.Textbox(placeholder="Copy and paste here..."),
        article = "Visit <a href = \"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
        outputs=gr.JSON(),
        interpretation="default",
        examples=["Cristiano Ronaldo is a Portuguese professional soccer player who currently plays as a forward for Manchester United and the Portugal national team. He is widely considered one of the greatest soccer players of all time, having won numerous awards and accolades throughout his career. Ronaldo began his professional career with Sporting CP in Portugal before moving to Manchester United in 2003. He spent six seasons with the club, winning three Premier League titles and one UEFA Champions League title. In 2009, he transferred to Real Madrid for a then-world record transfer fee of $131 million. He spent nine seasons with the club, winning four UEFA Champions League titles, two La Liga titles, and two Copa del Rey titles. In 2018, he transferred to Juventus, where he spent three seasons before returning to Manchester United in 2021. He has also had a successful international career with the Portugal national team, having won the UEFA European Championship in 2016 and the UEFA Nations League in 2019.", "One rule of thumb which applies to everything that we do - professionally and personally : Know what the customer want and deliver. In this case, it is important to know what the organisation what from employee. Connect the same to the KRA. Are you part of a delivery which directly ties to the larger organisational objective. If yes, then the next question is success rate of one’s delivery. If the KRAs are achieved or exceeded, then the employee is entitled for a decent hike."])

demo.launch(show_api=False)