import io
import pickle

import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer, util

def load_model():
    """Load the sentence-transformers model used to encode queries."""
    return SentenceTransformer('minilm_sbert')
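# Note: 'minilm_sbert' is assumed to be a local copy of a MiniLM
# sentence-transformers model stored alongside this app; an equivalent model
# could also be pulled from the Hub, e.g.
# SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').
# Wrapping load_model() in st.cache_resource would avoid reloading the model
# on every query, at the cost of requiring a recent Streamlit version.
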
def find_top_similar(sentence, corpus_sentences, corpus_embeddings):
    # Encode the query and normalize it; with normalized query and corpus
    # embeddings, util.dot_score is equivalent to cosine similarity.
    model = load_model()
    query_embeddings = model.encode(sentence, convert_to_tensor=True)
    # query_embeddings = query_embeddings.to('cuda')  # move to GPU if available
    query_embeddings = util.normalize_embeddings(query_embeddings)
    # Rank every corpus sentence against the query.
    hits = util.semantic_search(query_embeddings,
                                corpus_embeddings,
                                top_k=len(corpus_embeddings),
                                score_function=util.dot_score)
    hits = hits[0]  # hits for the first (and only) query
    # Collect the corpus sentences in descending order of similarity.
    records = []
    for hit in hits:
        records.append(corpus_sentences[hit['corpus_id']])
    return records

def top_k_similarity(df, query, corpus_sentences, corpus_embeddings):
    # Gather the dataframe rows whose 'Last job role' matches each ranked hit.
    hits = find_top_similar([query], corpus_sentences, corpus_embeddings)
    res = pd.DataFrame()
    for h in hits:
        s = df[df['Last job role'] == h]
        res = pd.concat([res, s])
    return res

def get_result(df, query, corpus_sentences, corpus_embeddings):
    result = top_k_similarity(df, query, corpus_sentences, corpus_embeddings)
    # Duplicate rows can appear when several hits map to the same record.
    result.drop_duplicates(inplace=True)
    return result

class CpuUnpickler(pickle.Unpickler):
    """
    Overrides the default behavior of the `Unpickler` class so that
    `torch.storage` objects are loaded from a byte string onto the CPU,
    even if the pickle was created on a GPU machine.
    """
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)

def load_embedding():
    """Load the corpus sentences and their embeddings from the pickle file."""
    with open('corpus_embeddings.pkl', 'rb') as file:
        cache_data = CpuUnpickler(file).load()
    corpus_sentences = cache_data['sentences']
    corpus_embeddings = cache_data['embeddings']
    return corpus_sentences, corpus_embeddings

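# A minimal sketch (an assumption, not part of the original Space) of how
# 'corpus_embeddings.pkl' could be produced offline: encode the 'Last job role'
# column, normalize the embeddings so that util.dot_score behaves like cosine
# similarity, and pickle the sentences and embeddings under the keys that
# load_embedding() expects.
def build_corpus_embeddings(df):
    model = load_model()
    corpus_sentences = df['Last job role'].dropna().unique().tolist()
    corpus_embeddings = model.encode(corpus_sentences, convert_to_tensor=True)
    corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
    with open('corpus_embeddings.pkl', 'wb') as f:
        pickle.dump({'sentences': corpus_sentences,
                     'embeddings': corpus_embeddings}, f)
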
def main():
    # Load the form responses from the published Google Sheet as CSV.
    sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
    sheet_name = 'Form Response 3'.replace(' ', '%20')
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    df = pd.read_csv(url)
    df = df.iloc[:, :7]  # keep only the first 7 columns
    # Load the cached corpus sentences and embeddings.
    corpus_sentences, corpus_embeddings = load_embedding()
    # Streamlit form.
    st.title('Job Posting Similarity')
    job_title = st.text_input('Insert the job title below:', '')
    submitted = st.button('Submit')
    if submitted:
        result = get_result(df, job_title, corpus_sentences, corpus_embeddings)
        result.reset_index(drop=True, inplace=True)
        result.index += 1  # display row numbers starting from 1
        st.table(result)


if __name__ == '__main__':
    main()
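# To run this locally (assuming the file is saved as app.py and the model
# directory plus corpus_embeddings.pkl sit next to it):
#   pip install streamlit pandas torch sentence-transformers
#   streamlit run app.py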