# NOTE(review): scraped page chrome (runtime-error banner, file size, commit
# hashes, line-number gutter) removed — it was never part of the notebook source.
# -*- coding: utf-8 -*-
"""Task22.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1yBvg6i_GsMk--P2nuSG-mfqCDbuIcEpx

# Task 2
- Raghad Al-Rasheed
- Fawwaz Alsheikhi

using the E5 model as the embedding model and translated dataset from huggingface
"""

"""## Downloading the Embedding model"""
# Consolidated imports — the notebook originally repeated this whole import
# block a second time further down; one copy is enough.
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import math
import scipy            # the search functions call scipy.spatial.distance.cosine
from scipy import spatial

nltk.download('punkt')

# Multilingual E5: embeds Arabic and English into one shared vector space.
model = SentenceTransformer("intfloat/multilingual-e5-large")

"""## Downloading Translated data from english to arabic"""
ds = load_dataset("Helsinki-NLP/news_commentary", "ar-en", split="train")
df = pd.DataFrame(ds['translation'])

"""### Extracting the first 10000 rows out of the data"""
df = df.head(10000)
# Arabic side only; index-aligned with the embeddings computed below.
documents = [doc for doc in df['ar']]

"""## Embedding the sentences by rows"""
embeddings = model.encode(documents)
def semantic_search(query, embeddings, documents, encoder=None):
    """Rank ``documents`` by cosine distance of their ``embeddings`` to ``query``.

    Args:
        query: Text to embed and search for.
        embeddings: Pre-computed embedding vectors, index-aligned with
            ``documents``.
        documents: Corpus sentences.
        encoder: Optional object with an ``encode(text)`` method; defaults to
            the module-level ``model`` (multilingual E5), preserving the
            original behaviour.

    Returns:
        List of ``[document, cosine_distance]`` pairs, one per document
        (lower distance = more similar). Also prints the index and text of
        the single closest document, as the notebook version did.
    """
    enc = model if encoder is None else encoder
    query_embedding = enc.encode(query)
    # Cosine *distance* (1 - similarity): smaller means closer.
    scores = [scipy.spatial.distance.cosine(query_embedding, emb) for emb in embeddings]
    # First argmin — same element scores.index(min(scores)) found, in one pass.
    best = min(range(len(scores)), key=scores.__getitem__)
    print(best)
    print("Most similar document", documents[best])
    return [[doc, score] for doc, score in zip(documents, scores)]
# Demo query: find the Arabic corpus sentence closest to a question about gold.
output = semantic_search("ـ لم يكن من السهل قط أن ينخرط المرء في محادثة عقلانية حول قيمة الذهب.", embeddings, documents)

"""### Extracting top three related sentences"""
# Sort ascending by cosine distance; ranked[:3] are the best matches.
ranked = sorted(output, key=lambda pair: pair[1])

"""## using english with arabic to see the semantic search of multilangual model"""
# Mixed corpus: first 5000 Arabic sentences plus their 5000 English sources,
# to exercise the cross-lingual behaviour of the multilingual model.
df_ar = df['ar'].tolist()[:5000]
df_en = df['en'].tolist()[:5000]
combined_list = df_ar + df_en
print(len(combined_list))
embeddings1 = model.encode(combined_list)
def semantic_search(query, encoder=None, corpus=None, corpus_embeddings=None):
    """Search the combined Arabic+English corpus for ``query``.

    NOTE: intentionally shadows the earlier three-argument ``semantic_search``
    so it can be used directly as a single-input Gradio callback.

    Args:
        query: Text to embed and search for.
        encoder: Optional object with ``encode(text)``; defaults to the
            module-level ``model``.
        corpus: Optional document list; defaults to the module-level
            ``combined_list``.
        corpus_embeddings: Optional embedding vectors aligned with ``corpus``;
            defaults to the module-level ``embeddings1``.

    Returns:
        List of ``[document, cosine_distance]`` pairs (lower = more similar).
        Also prints the index and text of the closest document.
    """
    enc = model if encoder is None else encoder
    docs = combined_list if corpus is None else corpus
    embs = embeddings1 if corpus_embeddings is None else corpus_embeddings
    query_embedding = enc.encode(query)
    # Cosine distance per corpus sentence; smaller means closer.
    scores = [scipy.spatial.distance.cosine(query_embedding, emb) for emb in embs]
    best = min(range(len(scores)), key=scores.__getitem__)  # first argmin
    print(best)
    print("Most similar document", docs[best])
    return [[doc, score] for doc, score in zip(docs, scores)]
# Demo: query the mixed AR+EN corpus and rank by cosine distance.
output = semantic_search("لذهب بعشرة آلاف دولار؟")
ranked = sorted(output, key=lambda pair: pair[1])
# ranked[:3] are the top three cross-lingual matches.

import gradio as gr


def _top_three(query):
    """Gradio callback: return the three most similar corpus sentences.

    The raw ``semantic_search`` returns the whole scored corpus; the
    Interface below declares exactly three text outputs, so we sort and
    truncate here (the original wired ``semantic_search`` in directly,
    which did not match the declared outputs).
    """
    hits = sorted(semantic_search(query), key=lambda pair: pair[1])[:3]
    return tuple(doc for doc, _score in hits)


demo = gr.Interface(fn=_top_three, inputs=["text"], outputs=["text", "text", "text"])

if __name__ == "__main__":
    demo.launch()
# NOTE(review): trailing scraper artifact removed.