# -*- coding: utf-8 -*-
"""Task22.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1yBvg6i_GsMk--P2nuSG-mfqCDbuIcEpx

# Task 2 - Raghad Al-Rasheed - Fawwaz Alsheikhi

Using the E5 model as the embedding model, with an English-Arabic parallel dataset from Hugging Face.
"""

!pip install sentence_transformers

"""## Downloading the embedding model"""

from sentence_transformers import SentenceTransformer

# Load the multilingual E5 model and place it on the GPU.
model = SentenceTransformer("intfloat/multilingual-e5-large", device="cuda")

"""## Downloading the English-Arabic parallel data"""

!pip3 install datasets

from datasets import load_dataset

ds = load_dataset("Helsinki-NLP/news_commentary", "ar-en", split="train")

import pandas as pd

# Each row of ds['translation'] is a dict with 'ar' and 'en' keys.
df = pd.DataFrame(ds['translation'])
df['ar'][0]

"""### Extracting the first 10000 rows of the data"""

df = df.head(10000)
df['ar'].shape

documents = [doc for doc in df['ar']]
documents[9999]

"""## Embedding the sentences by rows"""

embeddings = model.encode(documents)

from scipy.spatial import distance

def semantic_search(query, embeddings, documents):
    """Score every document against the query by cosine distance
    (lower = more similar) and return [document, score] pairs."""
    query_embedding = model.encode(query)
    scores = [distance.cosine(query_embedding, doc_embedding)
              for doc_embedding in embeddings]
    results = [[documents[i], score] for i, score in enumerate(scores)]
    best = scores.index(min(scores))
    print(best)
    print("Most similar document:", documents[best])
    return results

# Query: "It has never been easy to have a rational conversation about the value of gold."
output = semantic_search("ـ لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.", embeddings, documents)

documents[999]

"""### Extracting the top three related sentences"""

ranked = sorted(output, key=lambda x: x[1])
ranked[:3]

"""## Using English together with Arabic to test cross-lingual semantic search with the multilingual model"""

df_ar = df['ar'].tolist()[:5000]
df_en = df['en'].tolist()[:5000]
combined_list = df_ar + df_en
print(len(combined_list))

embeddings1 = model.encode(combined_list)

# Query: "gold at ten thousand dollars?"
output = semantic_search("لذهب بعشرة آلاف دولار؟", embeddings1, combined_list)

ranked = sorted(output, key=lambda x: x[1])
ranked[:3]

"""## Gradio demo"""

import gradio as gr

def search_top3(query):
    """Wrap semantic_search for Gradio: take a single text query and
    return the three most similar sentences from the combined corpus."""
    results = semantic_search(query, embeddings1, combined_list)
    top3 = sorted(results, key=lambda x: x[1])[:3]
    return [doc for doc, _ in top3]

demo = gr.Interface(fn=search_top3, inputs=["text"], outputs=["text", "text", "text"])

if __name__ == "__main__":
    demo.launch()
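"""## Note: E5 query/passage prefixes

The intfloat E5 model cards recommend prefixing each input with "query: " or
"passage: ", since the model was trained with those markers; the cells above
encode raw text, which works but may cost some retrieval accuracy. A minimal
sketch of the prefixed variant, reusing `model` and `documents` from above:
"""

# Sketch (assumes the prefix convention from the E5 model card):
prefixed_docs = ["passage: " + doc for doc in documents]
prefixed_embeddings = model.encode(prefixed_docs, normalize_embeddings=True)

# Queries get the "query: " prefix instead of "passage: ".
query_embedding = model.encode(
    "query: " + "لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.",
    normalize_embeddings=True,
)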
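"""## Note: vectorized top-k search

The loop over `scipy.spatial.distance.cosine` makes one Python call per
document per query. `sentence_transformers.util.semantic_search` computes the
same cosine ranking as a batched matrix operation and returns the top-k hits
directly. A minimal sketch, assuming the `embeddings` and `documents` built
above; note the scores here are cosine *similarities* (higher is better), the
opposite orientation of the distances used earlier:
"""

from sentence_transformers import util

def fast_search(query, doc_embeddings, docs, top_k=3):
    # util.semantic_search returns one hit list per query; each hit is a
    # dict with "corpus_id" (index into docs) and "score" (cosine similarity).
    query_embedding = model.encode(query)
    hits = util.semantic_search(query_embedding, doc_embeddings, top_k=top_k)[0]
    return [(docs[hit["corpus_id"]], hit["score"]) for hit in hits]

fast_search("لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.", embeddings, documents)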