NoaiGPT committed
Commit 51a7969
1 Parent(s): e0b30b2
Files changed (1)
  1. app.py +201 -24
app.py CHANGED
@@ -1,9 +1,192 @@
+ # import os
+ # import json
+ # import gradio as gr
+ # import spaces
+ # import torch
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
+ # from sentence_splitter import SentenceSplitter
+ # from itertools import product
+
+ # # Get the Hugging Face token from environment variable
+ # hf_token = os.getenv('HF_TOKEN')
+
+ # cuda_available = torch.cuda.is_available()
+ # device = torch.device("cpu" if cuda_available else "cpu")
+ # print(f"Using device: {device}")
+
+ # # Initialize paraphraser model and tokenizer
+ # paraphraser_model_name = "NoaiGPT/777"
+ # paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token)
+ # paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device)
+
+ # # Initialize classifier model and tokenizer
+ # classifier_model_name = "andreas122001/roberta-mixed-detector"
+ # classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
+ # classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
+
+ # # Initialize sentence splitter
+ # splitter = SentenceSplitter(language='en')
+
+ # def classify_text(text):
+ #     inputs = classifier_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
+ #     with torch.no_grad():
+ #         outputs = classifier_model(**inputs)
+ #     probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+ #     predicted_class = torch.argmax(probabilities, dim=-1).item()
+ #     main_label = classifier_model.config.id2label[predicted_class]
+ #     main_score = probabilities[0][predicted_class].item()
+ #     return main_label, main_score
+
+ # # @spaces.GPU
+ # def generate_paraphrases(text, setting, output_format):
+ #     sentences = splitter.split(text)
+ #     all_sentence_paraphrases = []
+
+ #     if setting == 1:
+ #         num_return_sequences = 5
+ #         repetition_penalty = 1.1
+ #         no_repeat_ngram_size = 2
+ #         temperature = 1.0
+ #         max_length = 128
+ #     elif setting == 2:
+ #         num_return_sequences = 10
+ #         repetition_penalty = 1.2
+ #         no_repeat_ngram_size = 3
+ #         temperature = 1.2
+ #         max_length = 192
+ #     elif setting == 3:
+ #         num_return_sequences = 15
+ #         repetition_penalty = 1.3
+ #         no_repeat_ngram_size = 4
+ #         temperature = 1.4
+ #         max_length = 256
+ #     elif setting == 4:
+ #         num_return_sequences = 20
+ #         repetition_penalty = 1.4
+ #         no_repeat_ngram_size = 5
+ #         temperature = 1.6
+ #         max_length = 320
+ #     else:
+ #         num_return_sequences = 25
+ #         repetition_penalty = 1.5
+ #         no_repeat_ngram_size = 6
+ #         temperature = 1.8
+ #         max_length = 384
+
+ #     top_k = 50
+ #     top_p = 0.95
+ #     length_penalty = 1.0
+
+ #     formatted_output = "Original text:\n" + text + "\n\n"
+ #     formatted_output += "Paraphrased versions:\n"
+
+ #     json_output = {
+ #         "original_text": text,
+ #         "paraphrased_versions": [],
+ #         "combined_versions": [],
+ #         "human_like_versions": []
+ #     }
+
+ #     for i, sentence in enumerate(sentences):
+ #         inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
+
+ #         # Generate paraphrases using the specified parameters
+ #         outputs = paraphraser_model.generate(
+ #             inputs.input_ids,
+ #             attention_mask=inputs.attention_mask,
+ #             num_return_sequences=num_return_sequences,
+ #             repetition_penalty=repetition_penalty,
+ #             no_repeat_ngram_size=no_repeat_ngram_size,
+ #             temperature=temperature,
+ #             max_length=max_length,
+ #             top_k=top_k,
+ #             top_p=top_p,
+ #             do_sample=True,
+ #             early_stopping=False,
+ #             length_penalty=length_penalty
+ #         )
+
+ #         paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+ #         formatted_output += f"Original sentence {i+1}: {sentence}\n"
+ #         for j, paraphrase in enumerate(paraphrases, 1):
+ #             formatted_output += f" Paraphrase {j}: {paraphrase}\n"
+
+ #         json_output["paraphrased_versions"].append({
+ #             f"original_sentence_{i+1}": sentence,
+ #             "paraphrases": paraphrases
+ #         })
+
+ #         all_sentence_paraphrases.append(paraphrases)
+ #         formatted_output += "\n"
+
+ #     all_combinations = list(product(*all_sentence_paraphrases))
+
+ #     formatted_output += "\nCombined paraphrased versions:\n"
+ #     combined_versions = []
+ #     for i, combination in enumerate(all_combinations[:50], 1):  # Limit to 50 combinations
+ #         combined_paraphrase = " ".join(combination)
+ #         combined_versions.append(combined_paraphrase)
+
+ #     json_output["combined_versions"] = combined_versions
+
+ #     # Classify combined versions
+ #     human_versions = []
+ #     for i, version in enumerate(combined_versions, 1):
+ #         label, score = classify_text(version)
+ #         formatted_output += f"Version {i}:\n{version}\n"
+ #         formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
+ #         if label == "human-produced" or (label == "machine-generated" and score < 0.98):
+ #             human_versions.append((version, label, score))
+
+ #     formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
+ #     for i, (version, label, score) in enumerate(human_versions, 1):
+ #         formatted_output += f"Version {i}:\n{version}\n"
+ #         formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
+
+ #     json_output["human_like_versions"] = [
+ #         {"version": version, "label": label, "confidence_score": score}
+ #         for version, label, score in human_versions
+ #     ]
+
+ #     # If no human-like versions, include the top 5 least confident machine-generated versions
+ #     if not human_versions:
+ #         human_versions = sorted([(v, l, s) for v, l, s in zip(combined_versions, [classify_text(v)[0] for v in combined_versions], [classify_text(v)[1] for v in combined_versions])], key=lambda x: x[2])[:5]
+ #         formatted_output += "\nNo human-like versions found. Showing top 5 least confident machine-generated versions:\n"
+ #         for i, (version, label, score) in enumerate(human_versions, 1):
+ #             formatted_output += f"Version {i}:\n{version}\n"
+ #             formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
+
+ #     if output_format == "text":
+ #         return formatted_output, "\n\n".join([v[0] for v in human_versions])
+ #     else:
+ #         return json.dumps(json_output, indent=2), "\n\n".join([v[0] for v in human_versions])
+
+ # # Define the Gradio interface
+ # iface = gr.Interface(
+ #     fn=generate_paraphrases,
+ #     inputs=[
+ #         gr.Textbox(lines=5, label="Input Text"),
+ #         gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
+ #         gr.Radio(["text", "json"], label="Output Format")
+ #     ],
+ #     outputs=[
+ #         gr.Textbox(lines=20, label="Detailed Paraphrases and Classifications"),
+ #         gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
+ #     ],
+ #     title="Advanced Diverse Paraphraser with Human-like Filter",
+ #     description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
+ # )
+
+ # # Launch the interface
+ # iface.launch()
+
  import os
  import json
  import gradio as gr
  import spaces
  import torch
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
  from sentence_splitter import SentenceSplitter
  from itertools import product
 
@@ -11,13 +194,14 @@ from itertools import product
  hf_token = os.getenv('HF_TOKEN')
 
  cuda_available = torch.cuda.is_available()
- device = torch.device("cpu" if cuda_available else "cpu")
+ device = torch.device("cuda" if cuda_available else "cpu")
  print(f"Using device: {device}")
 
  # Initialize paraphraser model and tokenizer
- paraphraser_model_name = "NoaiGPT/777"
- paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token)
- paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device)
+ paraphraser_model_name = "sharad/ParaphraseGPT"
+ paraphraser_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
+ paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name).to(device)
+ paraphrase_pipeline = pipeline("text2text-generation", model=paraphraser_model, tokenizer=paraphraser_tokenizer)
 
  # Initialize classifier model and tokenizer
  classifier_model_name = "andreas122001/roberta-mixed-detector"
@@ -37,7 +221,7 @@ def classify_text(text):
      main_score = probabilities[0][predicted_class].item()
      return main_label, main_score
 
- # @spaces.GPU
+ @spaces.GPU
  def generate_paraphrases(text, setting, output_format):
      sentences = splitter.split(text)
      all_sentence_paraphrases = []
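
Note: the @spaces.GPU decorator enabled by this hunk is the Hugging Face ZeroGPU mechanism from the spaces package: a GPU is attached only for the duration of each call to the decorated function. A minimal sketch of the pattern, assuming a ZeroGPU Space (the duration value is illustrative, not part of this commit):

import spaces
import torch

@spaces.GPU(duration=60)  # request a GPU slot for up to 60 s per call
def describe_device() -> str:
    # Inside the decorated call, CUDA is available on ZeroGPU hardware
    return torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"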
@@ -88,36 +272,29 @@ def generate_paraphrases(text, setting, output_format):
      }
 
      for i, sentence in enumerate(sentences):
-         inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
- 
-         # Generate paraphrases using the specified parameters
-         outputs = paraphraser_model.generate(
-             inputs.input_ids,
-             attention_mask=inputs.attention_mask,
+         paraphrases = paraphrase_pipeline(
+             sentence,
+             num_beams=3,
+             num_beam_groups=3,
              num_return_sequences=num_return_sequences,
-             repetition_penalty=repetition_penalty,
+             diversity_penalty=2.0,
              no_repeat_ngram_size=no_repeat_ngram_size,
-             temperature=temperature,
-             max_length=max_length,
-             top_k=top_k,
-             top_p=top_p,
-             do_sample=True,
-             early_stopping=False,
-             length_penalty=length_penalty
+             repetition_penalty=repetition_penalty,
+             max_length=max_length
          )
 
-         paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+         paraphrases_texts = [p['generated_text'] for p in paraphrases]
 
          formatted_output += f"Original sentence {i+1}: {sentence}\n"
-         for j, paraphrase in enumerate(paraphrases, 1):
+         for j, paraphrase in enumerate(paraphrases_texts, 1):
              formatted_output += f" Paraphrase {j}: {paraphrase}\n"
 
          json_output["paraphrased_versions"].append({
              f"original_sentence_{i+1}": sentence,
-             "paraphrases": paraphrases
+             "paraphrases": paraphrases_texts
          })
 
-         all_sentence_paraphrases.append(paraphrases)
+         all_sentence_paraphrases.append(paraphrases_texts)
          formatted_output += "\n"
 
      all_combinations = list(product(*all_sentence_paraphrases))
 
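
For reference, a minimal standalone sketch of the pipeline-based generation path this commit switches to, using the model and tokenizer names from the diff. Note that transformers validates diverse beam search arguments: num_beams must be divisible by num_beam_groups, and num_return_sequences may not exceed num_beams, so as committed the settings with num_return_sequences > 3 would raise a ValueError in recent transformers releases. The values below are adjusted to satisfy those constraints rather than copied from the commit:

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

device = 0 if torch.cuda.is_available() else -1
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("sharad/ParaphraseGPT")
paraphrase = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device)

outputs = paraphrase(
    "The quick brown fox jumps over the lazy dog.",
    num_beams=6,             # must be divisible by num_beam_groups
    num_beam_groups=3,       # enables diverse (grouped) beam search
    num_return_sequences=6,  # must not exceed num_beams
    diversity_penalty=2.0,   # higher values push beam groups apart
    no_repeat_ngram_size=2,
    repetition_penalty=1.2,
    max_length=128,
)
print([o["generated_text"] for o in outputs])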