File size: 1,684 Bytes
81d5bd9
 
7b26957
85854a6
5636d63
81d5bd9
d341f84
 
85854a6
d341f84
85854a6
 
d341f84
81d5bd9
399c0dd
81d5bd9
d341f84
81d5bd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a501843
81d5bd9
 
 
a8e0066
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import gradio as gr
from sentence_transformers import SentenceTransformer, util
import string, re
from cleanco import basename

# Module-level cache for the SentenceTransformer instance; stays None until
# semantic() loads the model on its first call.
model = None
def prepare(text):
    """Normalise a company name before it is embedded.

    Steps, in order:
      1. strip ASCII punctuation and curly double quotes,
      2. delete stand-alone upper-case Roman numerals (e.g. a trailing "III"),
      3. pass the result through ``cleanco.basename`` (which removes
         legal-entity terms from a business name),
      4. upper-case what is left.

    Args:
        text: Raw company name. ``None`` is treated as an empty name.

    Returns:
        The normalised, upper-cased name (``''`` for ``None``).
    """
    if text is None:
        # An unset input box must not crash on ``None.translate``.
        return ''
    # Curly double quotes are not part of string.punctuation. They are written
    # as escapes so the literal cannot be corrupted by a wrong file encoding.
    text = text.translate(str.maketrans('', '', string.punctuation + '\u201c\u201d'))
    # Whole-word Roman numeral. The look-ahead forces at least one numeral
    # letter so the otherwise all-optional pattern cannot match emptily.
    # NOTE(review): the match is case-sensitive and also removes ordinary
    # upper-case words that happen to be valid numerals (e.g. "MIX", "CD").
    pattern = r"\b(?=[MDCLXVII])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})([II]X|[II]V|V?[II]{0,3})\b\.?"
    text = re.sub(pattern, '', text)
    text = basename(text).upper()
    return text
    
def semantic(company_1, company_2):
    """Return the cosine similarity of two company names as a string.

    Both names are normalised with ``prepare`` and embedded with the
    'all-mpnet-base-v2' SentenceTransformer. The model is loaded on the
    first call and kept in the module-level ``model`` variable so later
    calls reuse it.

    Args:
        company_1: First company name.
        company_2: Second company name.

    Returns:
        The cosine similarity of the two embeddings, formatted with four
        decimal places (e.g. ``"0.9731"``).
    """
    global model
    sentences = [prepare(company_1), prepare(company_2)]
    if model is None:
        model = SentenceTransformer('all-mpnet-base-v2')
    # Compute embeddings for both names in a single batch.
    embeddings = model.encode(sentences, convert_to_tensor=True)
    # There are exactly two sentences, so the only pair of interest is
    # (0, 1); cos_sim on two vectors yields a 1x1 tensor.
    score = util.cos_sim(embeddings[0], embeddings[1]).item()
    return "{:.4f}".format(score)
    
# Values pre-filled in the two input boxes. The second box starts empty;
# the commented string is an alternative default for comparison.
company_1 = "Growth Capital Acquisition Corp"
company_2 = None # "Growth Capital Acquisition Corp III"

title = 'sentences_semantic'

# One single-line text box per company name, and one text box for the score.
first_input = gr.inputs.Textbox(lines=1, default=company_1, label="Company_1")
second_input = gr.inputs.Textbox(lines=1, default=company_2, label="Company_2")
score_output = gr.outputs.Textbox(type="auto", label="Score")

# Wire the boxes to semantic() and start the web UI.
interface = gr.Interface(
    semantic,
    inputs=[first_input, second_input],
    outputs=[score_output],
    title=title,
)
interface.launch()