hellopahe committed on
Commit
77129d5
·
1 Parent(s): 46020c0
Files changed (1) hide show
  1. app.py +34 -13
app.py CHANGED
@@ -66,7 +66,7 @@ class LexRank(object):
66
  self.ht = HarvestText()
67
  def find_central(self, content: str):
68
  sentences = self.ht.cut_sentences(content)
69
- embeddings = self.model.encode(sentences, convert_to_tensor=True)
70
 
71
  # Compute the pair-wise cosine similarities
72
  cos_scores = util.cos_sim(embeddings, embeddings).numpy()
@@ -78,19 +78,20 @@ class LexRank(object):
78
  most_central_sentence_indices = numpy.argsort(-centrality_scores)
79
 
80
  num = 100
81
- ptr = 0
82
  for index in most_central_sentence_indices:
83
- num -= len(sentences[index])
84
- if num < 0 and index > 0:
85
- ptr = index + 1
86
  break
87
- return list(sentences[index] for index in most_central_sentence_indices[0: ptr])
 
 
88
 
89
  # ---===--- worker instances ---===---
90
- t_randeng = SummaryExtractor()
91
  # t_tuoling = Tuoling_6B_extractor()
92
 
93
- embedder = Embed()
 
94
  lex = LexRank()
95
 
96
 
@@ -99,9 +100,9 @@ def randeng_extract(content):
99
  output = "εŽŸζ–‡: \n"
100
  for index, sentence in enumerate(sentences):
101
  output += f"{index}: {sentence}\n"
102
- output += "ζ‘˜θ¦:\n"
103
- for index, sentence in enumerate(sentences):
104
- output += f"{index}: {t_randeng.extract(sentence)}\n"
105
  return output
106
 
107
  # def tuoling_extract(content):
@@ -117,10 +118,30 @@ def similarity_check(query, doc):
117
  # scores = list(util.cos_sim(embedding_list[-1], doc_embedding) for doc_embedding in embedding_list[:-1])
118
  return str(scores)
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  with gr.Blocks() as app:
121
  gr.Markdown("δ»ŽδΈ‹ι’ηš„ζ ‡η­Ύι€‰ζ‹©ζ΅‹θ―•ζ¨‘ε— [ζ‘˜θ¦η”Ÿζˆ,η›ΈδΌΌεΊ¦ζ£€ζ΅‹]")
122
  with gr.Tab("LexRank->Randeng-Pegasus-523M"):
123
- text_input_1 = gr.Textbox(label="θ―·θΎ“ε…₯ι•Ώζ–‡ζœ¬:", max_lines=1000)
124
  text_output_1 = gr.Textbox(label="ζ‘˜θ¦ζ–‡ζœ¬", lines=10)
125
  text_button_1 = gr.Button("η”Ÿζˆζ‘˜θ¦")
126
  # with gr.Tab("LexRank->Tuoling-6B-chatGLM"):
@@ -136,7 +157,7 @@ with gr.Blocks() as app:
136
 
137
  # text_button.click(tuoling_extract, inputs=text_input, outputs=text_output)
138
  text_button_1.click(randeng_extract, inputs=text_input_1, outputs=text_output_1)
139
- text_button_similarity.click(similarity_check, inputs=[text_input_query, text_input_doc], outputs=text_output_similarity)
140
 
141
  app.launch(
142
  share=True,
 
66
  self.ht = HarvestText()
67
  def find_central(self, content: str):
68
  sentences = self.ht.cut_sentences(content)
69
+ embeddings = self.model.encode(sentences, convert_to_tensor=True).cpu()
70
 
71
  # Compute the pair-wise cosine similarities
72
  cos_scores = util.cos_sim(embeddings, embeddings).numpy()
 
78
  most_central_sentence_indices = numpy.argsort(-centrality_scores)
79
 
80
  num = 100
81
+ res = []
82
  for index in most_central_sentence_indices:
83
+ if num < 0:
 
 
84
  break
85
+ res.append(sentences[index])
86
+ num -= len(sentences[index])
87
+ return res
88
 
89
  # ---===--- worker instances ---===---
90
+ # t_randeng = SummaryExtractor()
91
  # t_tuoling = Tuoling_6B_extractor()
92
 
93
+ # embedder = Embed()
94
+ embedder = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
95
  lex = LexRank()
96
 
97
 
 
100
  output = "εŽŸζ–‡: \n"
101
  for index, sentence in enumerate(sentences):
102
  output += f"{index}: {sentence}\n"
103
+ # output += "ζ‘˜θ¦:\n"
104
+ # for index, sentence in enumerate(sentences):
105
+ # output += f"{index}: {t_randeng.extract(sentence)}\n"
106
  return output
107
 
108
  # def tuoling_extract(content):
 
118
  # scores = list(util.cos_sim(embedding_list[-1], doc_embedding) for doc_embedding in embedding_list[:-1])
119
  return str(scores)
120
 
121
+ def similarity_search(queries, doc):
122
+ doc_list = doc.split('\n')
123
+ query_list = queries.split('\n')
124
+
125
+ corpus_embeddings = embedder.encode(doc_list, convert_to_tensor=True)
126
+ top_k = min(5, len(doc_list))
127
+ output = ""
128
+ for query in query_list:
129
+ query_embedding = embedder.encode(query, convert_to_tensor=True)
130
+ # We use cosine-similarity and torch.topk to find the highest 5 scores
131
+ cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
132
+ top_results = torch.topk(cos_scores, k=top_k)
133
+ output += "\n\n======================\n\n"
134
+ output += f"Query: {query}"
135
+ output += "\nTop 5 most similar sentences in corpus:"
136
+ for score, idx in zip(top_results[0], top_results[1]):
137
+ output += f"{doc_list[idx]}(Score: {score})"
138
+ return output
139
+
140
+
141
  with gr.Blocks() as app:
142
  gr.Markdown("δ»ŽδΈ‹ι’ηš„ζ ‡η­Ύι€‰ζ‹©ζ΅‹θ―•ζ¨‘ε— [ζ‘˜θ¦η”Ÿζˆ,η›ΈδΌΌεΊ¦ζ£€ζ΅‹]")
143
  with gr.Tab("LexRank->Randeng-Pegasus-523M"):
144
+ text_input_1 = gr.Textbox(label="θ―·θΎ“ε…₯ι•Ώζ–‡ζœ¬:", lines=10, max_lines=1000)
145
  text_output_1 = gr.Textbox(label="ζ‘˜θ¦ζ–‡ζœ¬", lines=10)
146
  text_button_1 = gr.Button("η”Ÿζˆζ‘˜θ¦")
147
  # with gr.Tab("LexRank->Tuoling-6B-chatGLM"):
 
157
 
158
  # text_button.click(tuoling_extract, inputs=text_input, outputs=text_output)
159
  text_button_1.click(randeng_extract, inputs=text_input_1, outputs=text_output_1)
160
+ text_button_similarity.click(similarity_search, inputs=[text_input_query, text_input_doc], outputs=text_output_similarity)
161
 
162
  app.launch(
163
  share=True,