File size: 3,090 Bytes
3240876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6be899e
3240876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993b51c
3240876
 
 
 
 
 
 
 
 
 
 
 
 
993b51c
3240876
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
"""Task22.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yBvg6i_GsMk--P2nuSG-mfqCDbuIcEpx

# Task 2
- Raghad Al-Rasheed
- Fawwaz Alsheikhi

using the E5 model as the embedding model and translated dataset from huggingface
"""


"""## Downloading the Embedding model"""

from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')  # tokenizer data for nltk; downloaded once per runtime
from nltk.tokenize import word_tokenize
import math
from scipy import spatial


# Multilingual E5 (large) maps sentences from many languages (incl. Arabic
# and English) into a single shared embedding space.
model = SentenceTransformer("intfloat/multilingual-e5-large")

"""## Downloading Translated data from english to arabic"""

from datasets import load_dataset


# Arabic-English parallel news commentary; only the "train" split is loaded.
ds = load_dataset("Helsinki-NLP/news_commentary", "ar-en",split="train")

import pandas as pd

# Each element of ds['translation'] is a dict with 'ar' and 'en' keys, so the
# resulting DataFrame has an 'ar' column and an 'en' column.
df = pd.DataFrame(ds['translation'])

df['ar']  # notebook preview cell: the Arabic column

df['ar'][0]  # notebook preview cell: first Arabic sentence

"""### Extracting the first 10000 rows out of the data"""

df=df.head(10000)

df['ar'].shape  # notebook preview cell: confirm 10000 rows kept

documents =[doc for doc in df['ar']]

documents[9999]  # notebook preview cell: last kept Arabic sentence

"""## Embedding the sentences by rows"""

# One embedding vector per Arabic sentence.
embeddings = model.encode(documents)

# NOTE(review): the imports below duplicate the block at the top of the file
# (a separate notebook cell); harmless, kept as-is.
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import math
from scipy import spatial
import scipy
def semantic_search(query, embeddings, documents):
    """Rank ``documents`` against ``query`` by embedding cosine distance.

    Encodes the query with the module-level E5 ``model``, scores it against
    the precomputed ``embeddings`` (one vector per document), prints the
    index and text of the closest document, and returns a list of
    ``[document, distance]`` pairs in the original corpus order
    (smaller distance = more similar).
    """
    query_embedding = model.encode(query)

    # Cosine *distance* (1 - cosine similarity): lower is better.
    scores = [scipy.spatial.distance.cosine(query_embedding, doc)
              for doc in embeddings]
    ls1 = [[documents[i], score] for i, score in enumerate(scores)]

    # Find the best match with a single scan instead of the original
    # double pass via scores.index(min(scores)); ties resolve to the
    # first (same as list.index).
    best_idx = min(range(len(scores)), key=scores.__getitem__)
    print(best_idx)
    print("Most similar document", documents[best_idx])
    return ls1

# Query with an Arabic sentence drawn from the corpus; printing the closest
# match is a side effect of semantic_search.
output = semantic_search("ـ لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.",embeddings, documents)

documents[999]  # notebook preview cell: an arbitrary document

"""### Extracting top three related sentences"""

# Sort ascending by cosine distance; the three smallest are the best matches.
ranked = sorted(output, key=lambda x: x[1])
ranked[:3]

df  # notebook preview cell: the DataFrame

"""## using english with arabic to see the semantic search of multilangual model"""

df['ar']  # notebook preview cell: Arabic column

df['en']  # notebook preview cell: English column

# Build a bilingual corpus: first 5000 Arabic + first 5000 English sentences.
df_ar = df['ar'].tolist()[:5000]

df_en = df['en'].tolist()[:5000]

combined_list = df_ar + df_en

print(len(combined_list))

# Embed the mixed-language corpus into the shared E5 embedding space.
embeddings1 = model.encode(combined_list)

def semantic_search(query):
    """Search the bilingual (Arabic + English) corpus for ``query``.

    Shadows the earlier three-argument ``semantic_search``; this version is
    bound to the module-level ``embeddings1`` / ``combined_list`` built from
    the mixed-language corpus. Prints the index and text of the closest
    sentence and returns ``[[sentence, distance], ...]`` in corpus order
    (smaller distance = more similar).
    """
    query_embedding = model.encode(query)

    # Cosine *distance* (1 - cosine similarity): lower is better.
    scores = [scipy.spatial.distance.cosine(query_embedding, doc)
              for doc in embeddings1]
    ls1 = [[combined_list[i], score] for i, score in enumerate(scores)]

    # Single scan for the best index instead of the original double pass
    # via scores.index(min(scores)).
    best_idx = min(range(len(scores)), key=scores.__getitem__)
    print(best_idx)
    print("Most similar document", combined_list[best_idx])
    return ls1

output = semantic_search("لذهب بعشرة آلاف دولار؟")

# Sort ascending by cosine distance and keep the three closest sentences.
ranked = sorted(output, key=lambda x: x[1])
ranked[:3]

import gradio as gr


def _top3_matches(query):
    """Gradio callback: return the three sentences closest to ``query``.

    The interface declares three text outputs, so the callback must return
    three values — passing ``semantic_search`` directly (which returns one
    large [[sentence, distance], ...] list) made every submission fail.
    """
    best = sorted(semantic_search(query), key=lambda x: x[1])[:3]
    # Pad defensively in case the corpus yields fewer than three results.
    texts = [doc for doc, _ in best] + [""] * (3 - len(best))
    return texts[0], texts[1], texts[2]


demo = gr.Interface(fn=_top3_matches, inputs=["text"], outputs=["text", "text", "text"])
if __name__ == "__main__":
    demo.launch()