paraphrasing implemented
- app.py: +41 -0
- requirements.txt: +3 -1
app.py
CHANGED
@@ -40,6 +40,35 @@ def combined_similarity(similarity, sentence, query):
     combined_score = similarity + (common_words / max(len(query_words), 1))  # Normalize by the length of the query to keep the score between -1 and 1
     return combined_score,similarity,(common_words / max(len(query_words), 1))
 
+
+def paraphrase(sentence):
+    text = "paraphrase: " + sentence + " </s>"
+
+    encoding = st.session_state.paraphrase_tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
+    input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
+
+
+    outputs = st.session_state.paraphrase_model.generate(
+        input_ids=input_ids, attention_mask=attention_masks,
+        max_length=256,
+        do_sample=True,
+        top_k=120,
+        top_p=0.95,
+        #early_stopping=True,
+        early_stopping=False,
+        #num_return_sequences=5,
+        num_return_sequences=1,
+        repetition_penalty=1.5
+
+    )
+    # print(f"outputs = {outputs}")
+    results=[]
+    for output in outputs:
+        print("*")
+        line = st.session_state.paraphrase_tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        #results.append(line)
+    return line
+
 big_text = """
 <div style='text-align: center;'>
     <h1 style='font-size: 30x;'>Knowledge Extraction A</h1>
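Note on the scoring lines at the top of this hunk: the overlap term is divided by the query length, so it always falls in [0, 1]; with a cosine similarity in [-1, 1] the combined score therefore spans [-1, 2] rather than the [-1, 1] the inline comment claims. A minimal worked example (the word sets are hypothetical, for illustration only):

    query_words = {"knowledge", "extraction", "pipeline"}
    sentence_words = {"the", "pipeline", "extracts", "knowledge"}
    common_words = len(query_words & sentence_words)      # 2 shared words
    overlap = common_words / max(len(query_words), 1)     # 2/3 ~= 0.67, always in [0, 1]
    combined_score = 0.85 + overlap                       # with an assumed cosine of 0.85 -> ~1.52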
@@ -92,6 +121,8 @@ if 'is_initialized' not in st.session_state:
     st.session_state.stop_words = set(stopwords.words('english'))
     st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
     st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
+    st.session_state.paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
+    st.session_state.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to('cuda')
 
 if 'list_count' in st.session_state:
     st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
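Both models are pinned to 'cuda' here, so initialization fails on a CPU-only host. A guarded variant (my suggestion, not what the commit does) would look like:

    import torch
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    device = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when no GPU is present
    paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
    paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to(device)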
@@ -173,6 +204,9 @@ if 'paragraph_sentence_encodings' in st.session_state:
         original_paragraph = ' '.join([s[0] for s in paragraph_sentence_encoding[1] if s])
         modified_paragraph = ' '.join(reordered_paragraph)
 
+
+
+
         paragraph_scores.append(
             (top_three_avg_similarity, top_three_avg_commonality,
              {'modified_text': modified_paragraph, 'original_text': original_paragraph})
@@ -184,5 +218,12 @@ if 'paragraph_sentence_encodings' in st.session_state:
     st.write("Top scored paragraphs and their scores:")
     for similarity_score, commonality_score, paragraph in paragraph_scores[:5]:
         st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")
+
+        output_1 = paraphrase(paragraph['modified_text'])
+        print(output_1)
+
+        output_2 = paraphrase(output_1)
+        print(output_2)
+        st.write("Paraphrased Paragraph: ", output_2)
         st.write("Modified Paragraph: ", paragraph['modified_text'])
         st.write("Original Paragraph: ", paragraph['original_text'])
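This hunk applies paraphrase() twice, feeding the first rewrite back in for a second pass before display. A self-contained sketch of the same two-pass pattern outside Streamlit (model name and sampling parameters are taken from the diff; the device fallback and input truncation are my additions):

    import torch
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
    model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to(device)

    def paraphrase(sentence):
        # T5 paraphrase checkpoints expect a task prefix; the commit also appends " </s>"
        encoding = tokenizer("paraphrase: " + sentence + " </s>",
                             truncation=True, max_length=256, return_tensors="pt").to(device)
        outputs = model.generate(**encoding, max_length=256, do_sample=True,
                                 top_k=120, top_p=0.95, repetition_penalty=1.5,
                                 num_return_sequences=1)
        return tokenizer.decode(outputs[0], skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)

    first = paraphrase("The pipeline extracts knowledge from the scored paragraphs.")
    second = paraphrase(first)  # second pass, as in the display loop above
    print(second)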
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
 transformers
 torch
 scikit-learn
-nltk
+nltk
+sentencepiece
+protobuf==3.20.3