import nltk
import numpy

from harvesttext import HarvestText
from sentence_transformers import SentenceTransformer, util

# Local helpers (not shown in this file). degree_centrality_scores computes a
# LexRank-style centrality score for each sentence from a similarity matrix;
# find_siblings is assumed to return a tuple whose second element is the
# sentence at the given index joined with up to `siblings` neighbours.
from lex_rank_util import degree_centrality_scores, find_siblings

# Fetch the Punkt models needed by nltk.sent_tokenize (no-op if already present).
nltk.download('punkt', quiet=True)


class LexRankL12:
    """Extractive summarization via LexRank over multilingual MiniLM-L12 embeddings."""

    def __init__(self):
        # Multilingual sentence encoder, so Chinese and English text share one model.
        self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        # HarvestText handles Chinese sentence segmentation.
        self.ht = HarvestText()

    def find_central(self, content: str, num=10, siblings=0):
        # Pick a language-appropriate sentence tokenizer.
        if self.contains_chinese(content):
            sentences = self.ht.cut_sentences(content)
        else:
            sentences = nltk.sent_tokenize(content)
        embeddings = self.model.encode(sentences, convert_to_tensor=True).cpu()

        # Compute the pair-wise cosine similarities.
        cos_scores = util.cos_sim(embeddings, embeddings).numpy()

        # Compute the centrality for each sentence.
        centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

        # Argsort the negated scores so the first index is the most central sentence.
        most_central_sentence_indices = numpy.argsort(-centrality_scores)

        # Keep exactly the top `num` sentences, each expanded with its siblings.
        res = []
        for index in most_central_sentence_indices[:num]:
            res.append(find_siblings(sentences, index, siblings)[1])
        return res

    def contains_chinese(self, content: str) -> bool:
        # True if the text contains at least one CJK Unified Ideograph (U+4E00 - U+9FA5).
        return any('\u4e00' <= char <= '\u9fa5' for char in content)
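

# Minimal usage sketch. The sample text and parameter values are illustrative
# only, and assume lex_rank_util and the model download are available locally.
if __name__ == '__main__':
    summarizer = LexRankL12()
    text = (
        "LexRank is an unsupervised, graph-based method for extractive "
        "summarization. It builds a similarity graph over the sentences of a "
        "document and scores each sentence by its centrality in that graph. "
        "The most central sentences are returned as the summary."
    )
    # Ask for the top 2 sentences, each with one neighbouring sentence attached.
    for sentence in summarizer.find_central(text, num=2, siblings=1):
        print(sentence)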