zmbfeng committed on
Commit
95e6c0d
1 Parent(s): 78ef37b

paraphrasing implemented

Browse files
Files changed (2) hide show
  1. app.py +41 -0
  2. requirements.txt +3 -1
app.py CHANGED
@@ -40,6 +40,35 @@ def combined_similarity(similarity, sentence, query):
40
  combined_score = similarity + (common_words / max(len(query_words), 1)) # Normalize by the length of the query to keep the score between -1 and 1
41
  return combined_score,similarity,(common_words / max(len(query_words), 1))
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  big_text = """
44
  <div style='text-align: center;'>
45
  <h1 style='font-size: 30x;'>Knowledge Extraction A</h1>
@@ -92,6 +121,8 @@ if 'is_initialized' not in st.session_state:
92
  st.session_state.stop_words = set(stopwords.words('english'))
93
  st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
94
  st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
 
 
95
 
96
  if 'list_count' in st.session_state:
97
  st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
@@ -173,6 +204,9 @@ if 'paragraph_sentence_encodings' in st.session_state:
173
  original_paragraph = ' '.join([s[0] for s in paragraph_sentence_encoding[1] if s])
174
  modified_paragraph = ' '.join(reordered_paragraph)
175
 
 
 
 
176
  paragraph_scores.append(
177
  (top_three_avg_similarity, top_three_avg_commonality,
178
  {'modified_text': modified_paragraph, 'original_text': original_paragraph})
@@ -184,5 +218,12 @@ if 'paragraph_sentence_encodings' in st.session_state:
184
  st.write("Top scored paragraphs and their scores:")
185
  for similarity_score, commonality_score, paragraph in paragraph_scores[:5]:
186
  st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")
 
 
 
 
 
 
 
187
  st.write("Modified Paragraph: ", paragraph['modified_text'])
188
  st.write("Original Paragraph: ", paragraph['original_text'])
 
40
  combined_score = similarity + (common_words / max(len(query_words), 1)) # Normalize by the length of the query to keep the score between -1 and 1
41
  return combined_score,similarity,(common_words / max(len(query_words), 1))
42
 
43
+
44
def paraphrase(sentence):
    """Return one sampled paraphrase of ``sentence``.

    The tokenizer and sequence-to-sequence model are read from
    ``st.session_state.paraphrase_tokenizer`` and
    ``st.session_state.paraphrase_model``.

    Args:
        sentence: Text to paraphrase.

    Returns:
        The decoded text of the single generated sequence, with special
        tokens removed.
    """
    tokenizer = st.session_state.paraphrase_tokenizer
    model = st.session_state.paraphrase_model

    # Prompt is the task prefix, the input text, and an explicit
    # end-of-sequence marker.
    text = "paraphrase: " + sentence + " </s>"

    # `padding="longest"` replaces the deprecated `pad_to_max_length=True`.
    # NOTE(review): with no `max_length` given, the deprecated flag is
    # understood to fall back to longest-sequence padding as well, so the
    # encoded input should be unchanged -- confirm against the installed
    # transformers version.
    encoding = tokenizer(text, padding="longest", return_tensors="pt")

    # Follow the model's device rather than hard-coding "cuda", so the inputs
    # always live on the same device as the weights.
    input_ids = encoding["input_ids"].to(model.device)
    attention_mask = encoding["attention_mask"].to(model.device)

    # Sampling is enabled, so repeated calls may return different paraphrases.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=256,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=False,
        num_return_sequences=1,
        repetition_penalty=1.5,
    )

    # Exactly one sequence is requested above, so decode the first (only) row.
    return tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
71
+
72
  big_text = """
73
  <div style='text-align: center;'>
74
  <h1 style='font-size: 30x;'>Knowledge Extraction A</h1>
 
121
  st.session_state.stop_words = set(stopwords.words('english'))
122
  st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
123
  st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
124
+ st.session_state.paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
125
+ st.session_state.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to('cuda')
126
 
127
  if 'list_count' in st.session_state:
128
  st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
 
204
  original_paragraph = ' '.join([s[0] for s in paragraph_sentence_encoding[1] if s])
205
  modified_paragraph = ' '.join(reordered_paragraph)
206
 
207
+
208
+
209
+
210
  paragraph_scores.append(
211
  (top_three_avg_similarity, top_three_avg_commonality,
212
  {'modified_text': modified_paragraph, 'original_text': original_paragraph})
 
218
  st.write("Top scored paragraphs and their scores:")
219
  for similarity_score, commonality_score, paragraph in paragraph_scores[:5]:
220
  st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")
221
+
222
+ output_1 = paraphrase(paragraph['modified_text'])
223
+ print(output_1)
224
+
225
+ output_2 = paraphrase(output_1)
226
+ print(output_2)
227
+ st.write("Paraphrased Paragraph: ", output_2)
228
  st.write("Modified Paragraph: ", paragraph['modified_text'])
229
  st.write("Original Paragraph: ", paragraph['original_text'])
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  transformers
2
  torch
3
  scikit-learn
4
- nltk
 
 
 
1
  transformers
2
  torch
3
  scikit-learn
4
+ nltk
5
+ sentencepiece
6
+ protobuf==3.20.3