File size: 3,395 Bytes
3240876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
"""Task22.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yBvg6i_GsMk--P2nuSG-mfqCDbuIcEpx

# Task 2
- Raghad Al-Rasheed
- Fawwaz Alsheikhi

Uses the E5 model as the embedding model with an English-to-Arabic translated dataset from Hugging Face.
"""

!pip install sentence_transformers

"""## Downloading the Embedding model"""

# NOTE(review): word_tokenize, math and spatial are imported here but never
# used later in the notebook.
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import math
from scipy import spatial


# Multilingual E5 embedding model moved to the GPU — assumes a CUDA runtime
# is available (Colab GPU instance); TODO confirm.
model = SentenceTransformer("intfloat/multilingual-e5-large").to('cuda')

"""## Downloading Translated data from english to arabic"""

!pip3 install datasets
from datasets import load_dataset


ds = load_dataset("Helsinki-NLP/news_commentary", "ar-en",split="train")

import pandas as pd

df = pd.DataFrame(ds['translation'])

df['ar']

df['ar'][0]

"""### Extracting the first 10000 rows out of the data"""

df=df.head(10000)

df['ar'].shape

documents =[doc for doc in df['ar']]

documents[9999]

"""## Embedding the sentences by rows"""

embeddings = model.encode(documents)

# NOTE(review): duplicate of the earlier import cell — redundant but harmless
# when Colab cells are re-run out of order.
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import math
from scipy import spatial
import scipy

def semantic_search(query, embeddings, documents, encoder=None):
    """Rank `documents` by cosine distance of their `embeddings` to `query`.

    Args:
        query: Query text; embedded with the same model as the documents.
        embeddings: Precomputed document embedding vectors, aligned
            index-for-index with `documents`.
        documents: The document texts being searched.
        encoder: Optional object with an `.encode(text)` method; defaults to
            the module-level `model` (backward compatible with the original
            global-only behavior).

    Returns:
        One `[document, distance]` pair per document, in the original
        document order (lower distance = more similar). Empty input yields
        an empty list instead of the original `min()` ValueError.
    """
    # Robustness fix: the original crashed on an empty corpus.
    if len(documents) == 0:
        return []

    enc = model if encoder is None else encoder
    query_embedding = enc.encode(query)

    # Cosine *distance* (1 - similarity): smaller means more similar.
    scores = [scipy.spatial.distance.cosine(query_embedding, doc_emb)
              for doc_emb in embeddings]

    # Single argmin pass instead of the original double scan
    # `scores.index(min(scores))`.
    best = min(range(len(scores)), key=scores.__getitem__)
    print(best)
    print("Most similar document", documents[best])

    return [[doc, score] for doc, score in zip(documents, scores)]

# Query with an Arabic sentence; prints the index and text of the closest
# document and returns all [document, distance] pairs.
output = semantic_search("ـ لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.",embeddings, documents)

documents[999]

"""### Extracting top three related sentences"""

# Sort by cosine distance ascending (closest first) and take the top three.
ranked = sorted(output, key=lambda x: x[1])
ranked[:3]

df

"""## using english with arabic to see the semantic search of multilangual model"""

# Notebook-cell inspection expressions.
df['ar']

df['en']

# First 5,000 Arabic and 5,000 English sentences concatenated into one
# bilingual corpus: Arabic entries occupy indices 0-4999, English 5000-9999.
df_ar = df['ar'].tolist()[:5000]

df_en = df['en'].tolist()[:5000]

combined_list = df_ar + df_en

print(len(combined_list))

# Embed the bilingual corpus with the same multilingual E5 model.
embeddings1 = model.encode(combined_list)

# NOTE(review): third copy of the same import cell — redundant.
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import math
from scipy import spatial
import scipy

def semantic_search(query, embeddings1, combined_list, encoder=None):
    """Rank the bilingual corpus by cosine distance to `query`.

    Redefinition of `semantic_search` for the combined Arabic+English corpus;
    logic is identical to the earlier version.

    Args:
        query: Query text; embedded with the same model as the corpus.
        embeddings1: Precomputed embedding vectors, aligned index-for-index
            with `combined_list`.
        combined_list: The bilingual document texts being searched.
        encoder: Optional object with an `.encode(text)` method; defaults to
            the module-level `model` (backward compatible).

    Returns:
        One `[document, distance]` pair per document, in corpus order
        (lower distance = more similar). Empty input yields an empty list
        instead of the original `min()` ValueError.
    """
    # Robustness fix: the original crashed on an empty corpus.
    if len(combined_list) == 0:
        return []

    enc = model if encoder is None else encoder
    query_embedding = enc.encode(query)

    # Cosine *distance* (1 - similarity): smaller means more similar.
    scores = [scipy.spatial.distance.cosine(query_embedding, doc_emb)
              for doc_emb in embeddings1]

    # Single argmin pass instead of the original double scan
    # `scores.index(min(scores))`.
    best = min(range(len(scores)), key=scores.__getitem__)
    print(best)
    print("Most similar document", combined_list[best])

    return [[doc, score] for doc, score in zip(combined_list, scores)]

# Arabic query against the bilingual corpus; the multilingual model should
# retrieve semantically similar sentences in either language.
output = semantic_search("لذهب بعشرة آلاف دولار؟",embeddings1, combined_list)

# Top three closest documents by cosine distance (closest first).
ranked = sorted(output, key=lambda x: x[1])
ranked[:3]

import gradio as gr

def _search_top3(query):
    """Gradio adapter: run semantic_search on the bilingual corpus and
    return the three closest documents as three text outputs.

    Fixes the original wiring, where `fn=semantic_search` received a single
    text input but the function requires three arguments, and where its full
    [document, distance] list did not match the three text outputs.
    """
    pairs = semantic_search(query, embeddings1, combined_list)
    ranked_docs = [doc for doc, _ in sorted(pairs, key=lambda p: p[1])[:3]]
    # Pad so all three output fields always receive a value.
    ranked_docs += [""] * (3 - len(ranked_docs))
    return tuple(ranked_docs)

demo = gr.Interface(fn=_search_top3, inputs=["text"], outputs=["text", "text", "text"])
if __name__ == "__main__":
    demo.launch()