# -*- coding: utf-8 -*-
"""Task22.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yBvg6i_GsMk--P2nuSG-mfqCDbuIcEpx

# Task 2 - Raghad Al-Rasheed - Fawwaz Alsheikhi

Using the E5 model as the embedding model, with an English-Arabic parallel dataset from Hugging Face.
"""

!pip install sentence_transformers

"""## Downloading the embedding model"""

from sentence_transformers import SentenceTransformer

# Load the multilingual E5 model and place it on the GPU.
model = SentenceTransformer("intfloat/multilingual-e5-large", device="cuda")

"""## Downloading the English-Arabic parallel data"""

!pip3 install datasets

from datasets import load_dataset

ds = load_dataset("Helsinki-NLP/news_commentary", "ar-en", split="train")

import pandas as pd

# Each row of ds['translation'] is a dict with 'ar' and 'en' keys.
df = pd.DataFrame(ds['translation'])
df['ar'][0]

"""### Extracting the first 10000 rows of the data"""

df = df.head(10000)
df['ar'].shape

documents = [doc for doc in df['ar']]
documents[9999]

"""## Embedding the sentences by rows"""

embeddings = model.encode(documents)

from scipy.spatial import distance

def semantic_search(query, embeddings, documents):
    """Score every document against the query by cosine distance
    (lower = more similar) and return [document, score] pairs."""
    query_embedding = model.encode(query)
    scores = [distance.cosine(query_embedding, doc_embedding)
              for doc_embedding in embeddings]
    results = [[documents[i], score] for i, score in enumerate(scores)]
    best = scores.index(min(scores))
    print(best)
    print("Most similar document:", documents[best])
    return results

# Query: "It has never been easy to have a rational conversation about the value of gold."
output = semantic_search("ـ لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.", embeddings, documents)

documents[999]

"""### Extracting the top three related sentences"""

ranked = sorted(output, key=lambda x: x[1])
ranked[:3]

"""## Using English together with Arabic to test cross-lingual semantic search with the multilingual model"""

df_ar = df['ar'].tolist()[:5000]
df_en = df['en'].tolist()[:5000]
combined_list = df_ar + df_en
print(len(combined_list))

embeddings1 = model.encode(combined_list)

# Query: "gold at ten thousand dollars?"
output = semantic_search("لذهب بعشرة آلاف دولار؟", embeddings1, combined_list)

ranked = sorted(output, key=lambda x: x[1])
ranked[:3]

"""## Gradio demo"""

import gradio as gr

def search_top3(query):
    """Wrap semantic_search for Gradio: take a single text query and
    return the three most similar sentences from the combined corpus."""
    results = semantic_search(query, embeddings1, combined_list)
    top3 = sorted(results, key=lambda x: x[1])[:3]
    return [doc for doc, _ in top3]

demo = gr.Interface(fn=search_top3, inputs=["text"], outputs=["text", "text", "text"])

if __name__ == "__main__":
    demo.launch()
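"""## Note: E5 query/passage prefixes

The intfloat E5 model cards recommend prefixing each input with "query: " or
"passage: ", since the model was trained with those markers; the cells above
encode raw text, which works but may cost some retrieval accuracy. A minimal
sketch of the prefixed variant, reusing `model` and `documents` from above:
"""

# Sketch (assumes the prefix convention from the E5 model card):
prefixed_docs = ["passage: " + doc for doc in documents]
prefixed_embeddings = model.encode(prefixed_docs, normalize_embeddings=True)

# Queries get the "query: " prefix instead of "passage: ".
query_embedding = model.encode(
    "query: " + "لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.",
    normalize_embeddings=True,
)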
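"""## Note: vectorized top-k search

The loop over `scipy.spatial.distance.cosine` makes one Python call per
document per query. `sentence_transformers.util.semantic_search` computes the
same cosine ranking as a batched matrix operation and returns the top-k hits
directly. A minimal sketch, assuming the `embeddings` and `documents` built
above; note the scores here are cosine *similarities* (higher is better), the
opposite orientation of the distances used earlier:
"""

from sentence_transformers import util

def fast_search(query, doc_embeddings, docs, top_k=3):
    # util.semantic_search returns one hit list per query; each hit is a
    # dict with "corpus_id" (index into docs) and "score" (cosine similarity).
    query_embedding = model.encode(query)
    hits = util.semantic_search(query_embedding, doc_embeddings, top_k=top_k)[0]
    return [(docs[hit["corpus_id"]], hit["score"]) for hit in hits]

fast_search("لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.", embeddings, documents)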