Spaces:
Build error
Build error
hellopahe
committed on
Commit
·
77129d5
1
Parent(s):
46020c0
fix
Browse files
app.py
CHANGED
@@ -66,7 +66,7 @@ class LexRank(object):
|
|
66 |
self.ht = HarvestText()
|
67 |
def find_central(self, content: str):
|
68 |
sentences = self.ht.cut_sentences(content)
|
69 |
-
embeddings = self.model.encode(sentences, convert_to_tensor=True)
|
70 |
|
71 |
# Compute the pair-wise cosine similarities
|
72 |
cos_scores = util.cos_sim(embeddings, embeddings).numpy()
|
@@ -78,19 +78,20 @@ class LexRank(object):
|
|
78 |
most_central_sentence_indices = numpy.argsort(-centrality_scores)
|
79 |
|
80 |
num = 100
|
81 |
-
|
82 |
for index in most_central_sentence_indices:
|
83 |
-
num
|
84 |
-
if num < 0 and index > 0:
|
85 |
-
ptr = index + 1
|
86 |
break
|
87 |
-
|
|
|
|
|
88 |
|
89 |
# ---===--- worker instances ---===---
|
90 |
-
t_randeng = SummaryExtractor()
|
91 |
# t_tuoling = Tuoling_6B_extractor()
|
92 |
|
93 |
-
embedder = Embed()
|
|
|
94 |
lex = LexRank()
|
95 |
|
96 |
|
@@ -99,9 +100,9 @@ def randeng_extract(content):
|
|
99 |
output = "εζ: \n"
|
100 |
for index, sentence in enumerate(sentences):
|
101 |
output += f"{index}: {sentence}\n"
|
102 |
-
output += "ζθ¦:\n"
|
103 |
-
for index, sentence in enumerate(sentences):
|
104 |
-
|
105 |
return output
|
106 |
|
107 |
# def tuoling_extract(content):
|
@@ -117,10 +118,30 @@ def similarity_check(query, doc):
|
|
117 |
# scores = list(util.cos_sim(embedding_list[-1], doc_embedding) for doc_embedding in embedding_list[:-1])
|
118 |
return str(scores)
|
119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
with gr.Blocks() as app:
|
121 |
gr.Markdown("δ»δΈι’ηζ ηΎιζ©ζ΅θ―樑ε [ζθ¦ηζ,ηΈδΌΌεΊ¦ζ£ζ΅]")
|
122 |
with gr.Tab("LexRank->Randeng-Pegasus-523M"):
|
123 |
-
text_input_1 = gr.Textbox(label="θ―·θΎε
₯ιΏζζ¬:", max_lines=1000)
|
124 |
text_output_1 = gr.Textbox(label="ζθ¦ζζ¬", lines=10)
|
125 |
text_button_1 = gr.Button("ηζζθ¦")
|
126 |
# with gr.Tab("LexRank->Tuoling-6B-chatGLM"):
|
@@ -136,7 +157,7 @@ with gr.Blocks() as app:
|
|
136 |
|
137 |
# text_button.click(tuoling_extract, inputs=text_input, outputs=text_output)
|
138 |
text_button_1.click(randeng_extract, inputs=text_input_1, outputs=text_output_1)
|
139 |
-
text_button_similarity.click(
|
140 |
|
141 |
app.launch(
|
142 |
share=True,
|
|
|
66 |
self.ht = HarvestText()
|
67 |
def find_central(self, content: str):
|
68 |
sentences = self.ht.cut_sentences(content)
|
69 |
+
embeddings = self.model.encode(sentences, convert_to_tensor=True).cpu()
|
70 |
|
71 |
# Compute the pair-wise cosine similarities
|
72 |
cos_scores = util.cos_sim(embeddings, embeddings).numpy()
|
|
|
78 |
most_central_sentence_indices = numpy.argsort(-centrality_scores)
|
79 |
|
80 |
num = 100
|
81 |
+
res = []
|
82 |
for index in most_central_sentence_indices:
|
83 |
+
if num < 0:
|
|
|
|
|
84 |
break
|
85 |
+
res.append(sentences[index])
|
86 |
+
num -= len(sentences[index])
|
87 |
+
return res
|
88 |
|
89 |
# ---===--- worker instances ---===---
|
90 |
+
# t_randeng = SummaryExtractor()
|
91 |
# t_tuoling = Tuoling_6B_extractor()
|
92 |
|
93 |
+
# embedder = Embed()
|
94 |
+
embedder = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
|
95 |
lex = LexRank()
|
96 |
|
97 |
|
|
|
100 |
output = "εζ: \n"
|
101 |
for index, sentence in enumerate(sentences):
|
102 |
output += f"{index}: {sentence}\n"
|
103 |
+
# output += "ζθ¦:\n"
|
104 |
+
# for index, sentence in enumerate(sentences):
|
105 |
+
# output += f"{index}: {t_randeng.extract(sentence)}\n"
|
106 |
return output
|
107 |
|
108 |
# def tuoling_extract(content):
|
|
|
118 |
# scores = list(util.cos_sim(embedding_list[-1], doc_embedding) for doc_embedding in embedding_list[:-1])
|
119 |
return str(scores)
|
120 |
|
121 |
+
def similarity_search(queries, doc):
    """Rank the lines of ``doc`` by cosine similarity to each line of ``queries``.

    Both inputs are newline-separated text. Lines that are empty or contain
    only whitespace are skipped on both sides so they are neither embedded
    nor reported as matches.

    Args:
        queries: Newline-separated query sentences.
        doc: Newline-separated corpus sentences to search.

    Returns:
        A report string with one section per query, each listing up to five
        corpus lines with their similarity score, one hit per line. Returns
        an empty string when there are no usable queries or corpus lines.
    """
    doc_list = [line for line in doc.split('\n') if line.strip()]
    query_list = [line for line in queries.split('\n') if line.strip()]
    if not doc_list or not query_list:
        return ""

    # The corpus is embedded once and reused for every query.
    corpus_embeddings = embedder.encode(doc_list, convert_to_tensor=True)
    top_k = min(5, len(doc_list))

    parts = []
    for query in query_list:
        query_embedding = embedder.encode(query, convert_to_tensor=True)
        # We use cosine-similarity and torch.topk to find the highest scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)

        parts.append("\n\n======================\n\n")
        parts.append(f"Query: {query}")
        # Report the real number of hits; the corpus may have fewer than 5 lines.
        parts.append(f"\nTop {top_k} most similar sentences in corpus:")
        for score, idx in zip(top_results[0], top_results[1]):
            # Each hit goes on its own line; tensors are converted to plain
            # Python numbers for indexing and for fixed-precision formatting.
            parts.append(f"\n{doc_list[int(idx)]} (Score: {float(score):.4f})")
    return "".join(parts)
|
139 |
+
|
140 |
+
|
141 |
with gr.Blocks() as app:
|
142 |
gr.Markdown("δ»δΈι’ηζ ηΎιζ©ζ΅θ―樑ε [ζθ¦ηζ,ηΈδΌΌεΊ¦ζ£ζ΅]")
|
143 |
with gr.Tab("LexRank->Randeng-Pegasus-523M"):
|
144 |
+
text_input_1 = gr.Textbox(label="θ―·θΎε
₯ιΏζζ¬:", lines=10, max_lines=1000)
|
145 |
text_output_1 = gr.Textbox(label="ζθ¦ζζ¬", lines=10)
|
146 |
text_button_1 = gr.Button("ηζζθ¦")
|
147 |
# with gr.Tab("LexRank->Tuoling-6B-chatGLM"):
|
|
|
157 |
|
158 |
# text_button.click(tuoling_extract, inputs=text_input, outputs=text_output)
|
159 |
text_button_1.click(randeng_extract, inputs=text_input_1, outputs=text_output_1)
|
160 |
+
text_button_similarity.click(similarity_search, inputs=[text_input_query, text_input_doc], outputs=text_output_similarity)
|
161 |
|
162 |
app.launch(
|
163 |
share=True,
|