NoaiGPT committed on
Commit 580ef8a
1 Parent(s): 29d8d01
Files changed (1)
  1. app.py +246 -82
app.py CHANGED
@@ -1,9 +1,192 @@
 # # import os
 # # import json
 # # import gradio as gr
 # # import spaces
 # # import torch
- # # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 # # from sentence_splitter import SentenceSplitter
 # # from itertools import product

@@ -11,13 +194,14 @@
 # # hf_token = os.getenv('HF_TOKEN')

 # # cuda_available = torch.cuda.is_available()
- # # device = torch.device("cpu" if cuda_available else "cpu")
 # # print(f"Using device: {device}")

 # # # Initialize paraphraser model and tokenizer
- # # paraphraser_model_name = "NoaiGPT/777"
- # # paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token)
- # # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device)

 # # # Initialize classifier model and tokenizer
 # # classifier_model_name = "andreas122001/roberta-mixed-detector"
@@ -37,7 +221,7 @@
 # # main_score = probabilities[0][predicted_class].item()
 # # return main_label, main_score

- # # # @spaces.GPU
 # # def generate_paraphrases(text, setting, output_format):
 # # sentences = splitter.split(text)
 # # all_sentence_paraphrases = []
@@ -46,31 +230,31 @@
 # # num_return_sequences = 5
 # # repetition_penalty = 1.1
 # # no_repeat_ngram_size = 2
- # # temperature = 1.0
 # # max_length = 128
 # # elif setting == 2:
- # # num_return_sequences = 10
 # # repetition_penalty = 1.2
 # # no_repeat_ngram_size = 3
- # # temperature = 1.2
 # # max_length = 192
 # # elif setting == 3:
- # # num_return_sequences = 15
 # # repetition_penalty = 1.3
 # # no_repeat_ngram_size = 4
- # # temperature = 1.4
 # # max_length = 256
 # # elif setting == 4:
- # # num_return_sequences = 20
 # # repetition_penalty = 1.4
 # # no_repeat_ngram_size = 5
- # # temperature = 1.6
 # # max_length = 320
 # # else:
- # # num_return_sequences = 25
 # # repetition_penalty = 1.5
 # # no_repeat_ngram_size = 6
- # # temperature = 1.8
 # # max_length = 384

 # # top_k = 50
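# Note: the if/elif ladder above maps the 1-5 readability setting to sampling
# parameters. The same mapping as a lookup table, a hypothetical refactor
# sketched here for clarity, not code from this commit:
GENERATION_SETTINGS = {
    # setting: (num_return_sequences, repetition_penalty, no_repeat_ngram_size, temperature, max_length)
    1: (5, 1.1, 2, 1.0, 128),
    2: (10, 1.2, 3, 1.2, 192),
    3: (15, 1.3, 4, 1.4, 256),
    4: (20, 1.4, 5, 1.6, 320),
    5: (25, 1.5, 6, 1.8, 384),
}
setting = 3  # example value from the Gradio slider
num_return_sequences, repetition_penalty, no_repeat_ngram_size, temperature, max_length = \
    GENERATION_SETTINGS.get(setting, GENERATION_SETTINGS[5])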
@@ -88,36 +272,30 @@
 # # }

 # # for i, sentence in enumerate(sentences):
- # # inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
-
- # # # Generate paraphrases using the specified parameters
- # # outputs = paraphraser_model.generate(
- # # inputs.input_ids,
- # # attention_mask=inputs.attention_mask,
 # # num_return_sequences=num_return_sequences,
- # # repetition_penalty=repetition_penalty,
- # # no_repeat_ngram_size=no_repeat_ngram_size,
- # # temperature=temperature,
- # # max_length=max_length,
 # # top_k=top_k,
 # # top_p=top_p,
- # # do_sample=True,
- # # early_stopping=False,
- # # length_penalty=length_penalty
 # # )

- # # paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)

 # # formatted_output += f"Original sentence {i+1}: {sentence}\n"
- # # for j, paraphrase in enumerate(paraphrases, 1):
 # # formatted_output += f" Paraphrase {j}: {paraphrase}\n"

 # # json_output["paraphrased_versions"].append({
 # # f"original_sentence_{i+1}": sentence,
- # # "paraphrases": paraphrases
 # # })

- # # all_sentence_paraphrases.append(paraphrases)
 # # formatted_output += "\n"

 # # all_combinations = list(product(*all_sentence_paraphrases))
@@ -186,10 +364,18 @@
 # import gradio as gr
 # import spaces
 # import torch
- # from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 # from sentence_splitter import SentenceSplitter
 # from itertools import product

 # # Get the Hugging Face token from environment variable
 # hf_token = os.getenv('HF_TOKEN')

@@ -198,10 +384,9 @@
 # print(f"Using device: {device}")

 # # Initialize paraphraser model and tokenizer
- # paraphraser_model_name = "sharad/ParaphraseGPT"
- # paraphraser_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
 # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device)
- # paraphrase_pipeline = pipeline("text2text-generation", model=paraphraser_model, tokenizer=paraphraser_tokenizer, device=0 if cuda_available else -1)

 # # Initialize classifier model and tokenizer
 # classifier_model_name = "andreas122001/roberta-mixed-detector"
@@ -227,40 +412,26 @@
 # all_sentence_paraphrases = []

 # if setting == 1:
- # num_return_sequences = 5
- # repetition_penalty = 1.1
- # no_repeat_ngram_size = 2
- # temperature = 0.9
 # max_length = 128
 # elif setting == 2:
- # num_return_sequences = 5
- # repetition_penalty = 1.2
- # no_repeat_ngram_size = 3
- # temperature = 0.95
 # max_length = 192
 # elif setting == 3:
- # num_return_sequences = 5
- # repetition_penalty = 1.3
- # no_repeat_ngram_size = 4
- # temperature = 1.0
 # max_length = 256
 # elif setting == 4:
- # num_return_sequences = 5
- # repetition_penalty = 1.4
- # no_repeat_ngram_size = 5
- # temperature = 1.05
 # max_length = 320
 # else:
- # num_return_sequences = 5
- # repetition_penalty = 1.5
- # no_repeat_ngram_size = 6
- # temperature = 1.1
 # max_length = 384

- # top_k = 50
- # top_p = 0.95
- # length_penalty = 1.0
-
 # formatted_output = "Original text:\n" + text + "\n\n"
 # formatted_output += "Paraphrased versions:\n"

@@ -272,19 +443,21 @@
 # }

 # for i, sentence in enumerate(sentences):
- # paraphrases = paraphrase_pipeline(
- # sentence,
- # num_return_sequences=num_return_sequences,
- # do_sample=True,
- # top_k=top_k,
- # top_p=top_p,
- # temperature=temperature,
- # no_repeat_ngram_size=no_repeat_ngram_size,
- # repetition_penalty=repetition_penalty,
- # max_length=max_length
 # )

- # paraphrases_texts = [p['generated_text'] for p in paraphrases]

 # formatted_output += f"Original sentence {i+1}: {sentence}\n"
 # for j, paraphrase in enumerate(paraphrases_texts, 1):
@@ -314,7 +487,7 @@
 # label, score = classify_text(version)
 # formatted_output += f"Version {i}:\n{version}\n"
 # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
- # if label == "human-produced" or (label == "machine-generated" and score < 0.98):
 # human_versions.append((version, label, score))

 # formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
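# Note: the deleted line above kept a combined version when the detector either
# called it human-produced or was less than 98% confident it was machine-generated.
# A standalone sketch of that gate, assuming a classify_text(text) -> (label, score)
# helper like the one defined earlier in this file:
def keep_version(label, score, threshold=0.98):
    # Keep human-labelled text, plus machine-labelled text the detector is unsure about.
    return label == "human-produced" or (label == "machine-generated" and score < threshold)

assert keep_version("machine-generated", 0.95)      # low-confidence verdict passes
assert not keep_version("machine-generated", 0.99)  # high-confidence verdict is filtered out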
@@ -358,24 +531,15 @@

 # # Launch the interface
 # iface.launch()
-
 import os
 import json
 import gradio as gr
 import spaces
 import torch
- import sys
- import subprocess
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 from sentence_splitter import SentenceSplitter
 from itertools import product

- # Ensure sentencepiece is installed
- try:
- import sentencepiece
- except ImportError:
- subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
-
 # Get the Hugging Face token from environment variable
 hf_token = os.getenv('HF_TOKEN')

@@ -385,7 +549,7 @@ print(f"Using device: {device}")

 # Initialize paraphraser model and tokenizer
 paraphraser_model_name = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"
- paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_fast=False)
 paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device)

 # Initialize classifier model and tokenizer

+ # # # import os
+ # # # import json
+ # # # import gradio as gr
+ # # # import spaces
+ # # # import torch
+ # # # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
+ # # # from sentence_splitter import SentenceSplitter
+ # # # from itertools import product
+
+ # # # # Get the Hugging Face token from environment variable
+ # # # hf_token = os.getenv('HF_TOKEN')
+
+ # # # cuda_available = torch.cuda.is_available()
+ # # # device = torch.device("cpu" if cuda_available else "cpu")
+ # # # print(f"Using device: {device}")
+
+ # # # # Initialize paraphraser model and tokenizer
+ # # # paraphraser_model_name = "NoaiGPT/777"
+ # # # paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token)
+ # # # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device)
+
+ # # # # Initialize classifier model and tokenizer
+ # # # classifier_model_name = "andreas122001/roberta-mixed-detector"
+ # # # classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
+ # # # classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
+
+ # # # # Initialize sentence splitter
+ # # # splitter = SentenceSplitter(language='en')
+
+ # # # def classify_text(text):
+ # # # inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
+ # # # with torch.no_grad():
+ # # # outputs = classifier_model(**inputs)
+ # # # probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+ # # # predicted_class = torch.argmax(probabilities, dim=-1).item()
+ # # # main_label = classifier_model.config.id2label[predicted_class]
+ # # # main_score = probabilities[0][predicted_class].item()
+ # # # return main_label, main_score
+
+ # # # # @spaces.GPU
+ # # # def generate_paraphrases(text, setting, output_format):
+ # # # sentences = splitter.split(text)
+ # # # all_sentence_paraphrases = []
+
+ # # # if setting == 1:
+ # # # num_return_sequences = 5
+ # # # repetition_penalty = 1.1
+ # # # no_repeat_ngram_size = 2
+ # # # temperature = 1.0
+ # # # max_length = 128
+ # # # elif setting == 2:
+ # # # num_return_sequences = 10
+ # # # repetition_penalty = 1.2
+ # # # no_repeat_ngram_size = 3
+ # # # temperature = 1.2
+ # # # max_length = 192
+ # # # elif setting == 3:
+ # # # num_return_sequences = 15
+ # # # repetition_penalty = 1.3
+ # # # no_repeat_ngram_size = 4
+ # # # temperature = 1.4
+ # # # max_length = 256
+ # # # elif setting == 4:
+ # # # num_return_sequences = 20
+ # # # repetition_penalty = 1.4
+ # # # no_repeat_ngram_size = 5
+ # # # temperature = 1.6
+ # # # max_length = 320
+ # # # else:
+ # # # num_return_sequences = 25
+ # # # repetition_penalty = 1.5
+ # # # no_repeat_ngram_size = 6
+ # # # temperature = 1.8
+ # # # max_length = 384
+
+ # # # top_k = 50
+ # # # top_p = 0.95
+ # # # length_penalty = 1.0
+
+ # # # formatted_output = "Original text:\n" + text + "\n\n"
+ # # # formatted_output += "Paraphrased versions:\n"
+
+ # # # json_output = {
+ # # # "original_text": text,
+ # # # "paraphrased_versions": [],
+ # # # "combined_versions": [],
+ # # # "human_like_versions": []
+ # # # }
+
+ # # # for i, sentence in enumerate(sentences):
+ # # # inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
+
+ # # # # Generate paraphrases using the specified parameters
+ # # # outputs = paraphraser_model.generate(
+ # # # inputs.input_ids,
+ # # # attention_mask=inputs.attention_mask,
+ # # # num_return_sequences=num_return_sequences,
+ # # # repetition_penalty=repetition_penalty,
+ # # # no_repeat_ngram_size=no_repeat_ngram_size,
+ # # # temperature=temperature,
+ # # # max_length=max_length,
+ # # # top_k=top_k,
+ # # # top_p=top_p,
+ # # # do_sample=True,
+ # # # early_stopping=False,
+ # # # length_penalty=length_penalty
+ # # # )
+
+ # # # paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+ # # # formatted_output += f"Original sentence {i+1}: {sentence}\n"
+ # # # for j, paraphrase in enumerate(paraphrases, 1):
+ # # # formatted_output += f" Paraphrase {j}: {paraphrase}\n"
+
+ # # # json_output["paraphrased_versions"].append({
+ # # # f"original_sentence_{i+1}": sentence,
+ # # # "paraphrases": paraphrases
+ # # # })
+
+ # # # all_sentence_paraphrases.append(paraphrases)
+ # # # formatted_output += "\n"
+
+ # # # all_combinations = list(product(*all_sentence_paraphrases))
+
+ # # # formatted_output += "\nCombined paraphrased versions:\n"
+ # # # combined_versions = []
+ # # # for i, combination in enumerate(all_combinations[:50], 1): # Limit to 50 combinations
+ # # # combined_paraphrase = " ".join(combination)
+ # # # combined_versions.append(combined_paraphrase)
+
+ # # # json_output["combined_versions"] = combined_versions
+
+ # # # # Classify combined versions
+ # # # human_versions = []
+ # # # for i, version in enumerate(combined_versions, 1):
+ # # # label, score = classify_text(version)
+ # # # formatted_output += f"Version {i}:\n{version}\n"
+ # # # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
+ # # # if label == "human-produced" or (label == "machine-generated" and score < 0.98):
+ # # # human_versions.append((version, label, score))
+
+ # # # formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
+ # # # for i, (version, label, score) in enumerate(human_versions, 1):
+ # # # formatted_output += f"Version {i}:\n{version}\n"
+ # # # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
+
+ # # # json_output["human_like_versions"] = [
+ # # # {"version": version, "label": label, "confidence_score": score}
+ # # # for version, label, score in human_versions
+ # # # ]
+
+ # # # # If no human-like versions, include the top 5 least confident machine-generated versions
+ # # # if not human_versions:
+ # # # human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5]
+ # # # formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
+ # # # for i, (version, label, score) in enumerate(human_versions, 1):
+ # # # formatted_output += f"Version {i}:\n{version}\n"
+ # # # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
+
+ # # # if output_format == "text":
+ # # # return formatted_output, "\n\n".join([v[0] for v in human_versions])
+ # # # else:
+ # # # return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions])
+
+ # # # # Define the Gradio interface
+ # # # iface = gr.Interface(
+ # # # fn=generate_paraphrases,
+ # # # inputs=[
+ # # # gr.Textbox(lines=5, label="Input Text"),
+ # # # gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
+ # # # gr.Radio(["text", "json"], label="Output Format")
+ # # # ],
+ # # # outputs=[
+ # # # gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"),
+ # # # gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
+ # # # ],
+ # # # title="Advanced Diverse Paraphraser with Human-like Filter",
+ # # # description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
+ # # # )
+
+ # # # # Launch the interface
+ # # # iface.launch()
+
 # # import os
 # # import json
 # # import gradio as gr
 # # import spaces
 # # import torch
+ # # from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 # # from sentence_splitter import SentenceSplitter
 # # from itertools import product

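# Note: classify_text, kept through every revision above, turns the detector's
# logits into a (label, confidence) pair via softmax and argmax. A standalone
# sketch of that pattern for any Hugging Face sequence classifier:
import torch

def classify(model, tokenizer, text, device="cpu"):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)       # logits -> probabilities
    idx = int(torch.argmax(probs, dim=-1))                    # most likely class
    return model.config.id2label[idx], float(probs[0, idx])   # (label, confidence)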
 
 # # hf_token = os.getenv('HF_TOKEN')

 # # cuda_available = torch.cuda.is_available()
+ # # device = torch.device("cuda" if cuda_available else "cpu")
 # # print(f"Using device: {device}")

 # # # Initialize paraphraser model and tokenizer
+ # # paraphraser_model_name = "sharad/ParaphraseGPT"
+ # # paraphraser_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+ # # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device)
+ # # paraphrase_pipeline = pipeline("text2text-generation", model=paraphraser_model, tokenizer=paraphraser_tokenizer, device=0 if cuda_available else -1)

 # # # Initialize classifier model and tokenizer
 # # classifier_model_name = "andreas122001/roberta-mixed-detector"
 
 # # main_score = probabilities[0][predicted_class].item()
 # # return main_label, main_score

+ # # @spaces.GPU
 # # def generate_paraphrases(text, setting, output_format):
 # # sentences = splitter.split(text)
 # # all_sentence_paraphrases = []
 
 # # num_return_sequences = 5
 # # repetition_penalty = 1.1
 # # no_repeat_ngram_size = 2
+ # # temperature = 0.9
 # # max_length = 128
 # # elif setting == 2:
+ # # num_return_sequences = 5
 # # repetition_penalty = 1.2
 # # no_repeat_ngram_size = 3
+ # # temperature = 0.95
 # # max_length = 192
 # # elif setting == 3:
+ # # num_return_sequences = 5
 # # repetition_penalty = 1.3
 # # no_repeat_ngram_size = 4
+ # # temperature = 1.0
 # # max_length = 256
 # # elif setting == 4:
+ # # num_return_sequences = 5
 # # repetition_penalty = 1.4
 # # no_repeat_ngram_size = 5
+ # # temperature = 1.05
 # # max_length = 320
 # # else:
+ # # num_return_sequences = 5
 # # repetition_penalty = 1.5
 # # no_repeat_ngram_size = 6
+ # # temperature = 1.1
 # # max_length = 384

 # # top_k = 50
 
 # # }

 # # for i, sentence in enumerate(sentences):
+ # # paraphrases = paraphrase_pipeline(
+ # # sentence,
 # # num_return_sequences=num_return_sequences,
+ # # do_sample=True,
 # # top_k=top_k,
 # # top_p=top_p,
+ # # temperature=temperature,
+ # # no_repeat_ngram_size=no_repeat_ngram_size,
+ # # repetition_penalty=repetition_penalty,
+ # # max_length=max_length
 # # )

+ # # paraphrases_texts = [p['generated_text'] for p in paraphrases]

 # # formatted_output += f"Original sentence {i+1}: {sentence}\n"
+ # # for j, paraphrase in enumerate(paraphrases_texts, 1):
 # # formatted_output += f" Paraphrase {j}: {paraphrase}\n"

 # # json_output["paraphrased_versions"].append({
 # # f"original_sentence_{i+1}": sentence,
+ # # "paraphrases": paraphrases_texts
 # # })

+ # # all_sentence_paraphrases.append(paraphrases_texts)
 # # formatted_output += "\n"

 # # all_combinations = list(product(*all_sentence_paraphrases))
 
 # import gradio as gr
 # import spaces
 # import torch
+ # import sys
+ # import subprocess
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 # from sentence_splitter import SentenceSplitter
 # from itertools import product

+ # # Ensure sentencepiece is installed
+ # try:
+ # import sentencepiece
+ # except ImportError:
+ # subprocess.check_call([sys.executable, "-m", "pip", "install", "sentencepiece"])
+
 # # Get the Hugging Face token from environment variable
 # hf_token = os.getenv('HF_TOKEN')

 
 # print(f"Using device: {device}")

 # # Initialize paraphraser model and tokenizer
+ # paraphraser_model_name = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"
+ # paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_fast=False)
 # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device)

 # # Initialize classifier model and tokenizer
 # classifier_model_name = "andreas122001/roberta-mixed-detector"
 
 # all_sentence_paraphrases = []

 # if setting == 1:
+ # num_return_sequences = 3
+ # num_beams = 5
 # max_length = 128
 # elif setting == 2:
+ # num_return_sequences = 3
+ # num_beams = 7
 # max_length = 192
 # elif setting == 3:
+ # num_return_sequences = 3
+ # num_beams = 9
 # max_length = 256
 # elif setting == 4:
+ # num_return_sequences = 3
+ # num_beams = 11
 # max_length = 320
 # else:
+ # num_return_sequences = 3
+ # num_beams = 15
 # max_length = 384
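# Note: the rewritten ladder above fixes num_return_sequences at 3 and varies only
# num_beams and max_length. That respects the beam-search constraint in transformers:
# num_return_sequences must not exceed num_beams, which every branch satisfies. The
# same mapping as a table, a hypothetical refactor for illustration only:
BEAM_SETTINGS = {1: (5, 128), 2: (7, 192), 3: (9, 256), 4: (11, 320), 5: (15, 384)}
setting = 3  # example value from the Gradio slider
num_return_sequences = 3
num_beams, max_length = BEAM_SETTINGS.get(setting, BEAM_SETTINGS[5])
assert num_return_sequences <= num_beams  # required when beam search returns several sequences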

 # formatted_output = "Original text:\n" + text + "\n\n"
 # formatted_output += "Paraphrased versions:\n"

 
 # }

 # for i, sentence in enumerate(sentences):
+ # text = "paraphrase: " + sentence + " </s>"
+ # encoding = paraphraser_tokenizer.encode_plus(text, max_length=max_length, padding=True, return_tensors="pt")
+ # input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
+
+ # paraphraser_model.eval()
+ # beam_outputs = paraphraser_model.generate(
+ # input_ids=input_ids,
+ # attention_mask=attention_mask,
+ # max_length=max_length,
+ # early_stopping=True,
+ # num_beams=num_beams,
+ # num_return_sequences=num_return_sequences
 # )

+ # paraphrases_texts = [paraphraser_tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for beam_output in beam_outputs]

 # formatted_output += f"Original sentence {i+1}: {sentence}\n"
 # for j, paraphrase in enumerate(paraphrases_texts, 1):
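# Note: this hunk swaps the sampling pipeline for deterministic beam search:
# generate() is called with num_beams and without do_sample, so it returns the
# num_return_sequences highest-scoring beams instead of random samples. A condensed,
# self-contained sketch of the same call pattern, assuming the paraphraser model,
# tokenizer, and device loaded earlier in this file:
def paraphrase_beam(sentence, num_beams=5, num_return_sequences=3, max_length=128):
    enc = paraphraser_tokenizer("paraphrase: " + sentence, return_tensors="pt",
                                truncation=True, max_length=max_length).to(device)
    outputs = paraphraser_model.generate(
        **enc,                                      # input_ids and attention_mask
        max_length=max_length,
        num_beams=num_beams,                        # beam search, not sampling
        num_return_sequences=num_return_sequences,  # must be <= num_beams
        early_stopping=True,                        # stop once all beams are finished
    )
    return paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)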
 
 # label, score = classify_text(version)
 # formatted_output += f"Version {i}:\n{version}\n"
 # formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
+ # if label == "human-produced" or (label == "machine-generated" and score < 0.90): # Adjusted threshold
 # human_versions.append((version, label, score))

 # formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
 

 # # Launch the interface
 # iface.launch()

 import os
 import json
 import gradio as gr
 import spaces
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 from sentence_splitter import SentenceSplitter
 from itertools import product

 # Get the Hugging Face token from environment variable
 hf_token = os.getenv('HF_TOKEN')


 # Initialize paraphraser model and tokenizer
 paraphraser_model_name = "ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"
+ paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name)
 paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device)

 # Initialize classifier model and tokenizer
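# Note: the commit view cuts off here; the rest of app.py is not shown. For
# orientation, a minimal end-to-end sketch of the flow this file implements,
# assembled from the hunks above (Gradio wiring omitted; splitter, classify_text,
# and device are as defined elsewhere in the file, and the parameter defaults
# below are illustrative assumptions):
from itertools import product

def humanlike_paraphrases(text, num_beams=9, num_return_sequences=3, max_length=256, threshold=0.90):
    sentences = splitter.split(text)
    per_sentence = []
    for sentence in sentences:
        enc = paraphraser_tokenizer("paraphrase: " + sentence, return_tensors="pt",
                                    truncation=True, max_length=max_length).to(device)
        outputs = paraphraser_model.generate(**enc, max_length=max_length, num_beams=num_beams,
                                             num_return_sequences=num_return_sequences,
                                             early_stopping=True)
        per_sentence.append(paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True))

    # Cross every sentence's paraphrase list, then keep detector-approved combinations.
    combined = [" ".join(combo) for combo in list(product(*per_sentence))[:50]]
    kept = []
    for version in combined:
        label, score = classify_text(version)
        if label == "human-produced" or (label == "machine-generated" and score < threshold):
            kept.append((version, label, score))
    return kept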