import io
import pickle

import pandas as pd
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer, util


@st.cache(allow_output_mutation=True)
def load_model():
    return SentenceTransformer('all-MiniLM-L6-v2')


def find_top_similar(queries, corpus_sentences, corpus_embeddings):
    model = load_model()
    # Encode the query to a tensor and normalize it so that dot-product
    # scores are equivalent to cosine similarity.
    query_embeddings = model.encode(queries, convert_to_tensor=True)
    # query_embeddings = query_embeddings.to('cuda')  # uncomment to use a GPU
    query_embeddings = util.normalize_embeddings(query_embeddings)

    # Rank every corpus sentence against the query by similarity.
    hits = util.semantic_search(query_embeddings, corpus_embeddings,
                                top_k=len(corpus_embeddings),
                                score_function=util.dot_score)
    hits = hits[0]  # hits for the first (and only) query

    # Return the corpus sentences in descending order of similarity.
    return [corpus_sentences[hit['corpus_id']] for hit in hits]


def top_k_similarity(df, query, corpus_sentences, corpus_embeddings):
    hits = find_top_similar([query], corpus_sentences, corpus_embeddings)
    res = pd.DataFrame()
    for h in hits:
        s = df[df['Last job role'] == h]
        res = pd.concat([res, s])
    return res


def get_result(df, query, corpus_sentences, corpus_embeddings):
    result = top_k_similarity(df, query, corpus_sentences, corpus_embeddings)
    result.drop_duplicates(inplace=True)
    return result


class CpuUnpickler(pickle.Unpickler):
    """Overrides the default behavior of the `Unpickler` class so that a
    `torch.storage` object pickled on a GPU machine can be loaded from a
    byte string onto the CPU."""

    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)


@st.cache(allow_output_mutation=True)
def load_embedding():
    """Loads the corpus sentences and their embeddings from the pickle file."""
    with open('corpus_embeddings.pkl', 'rb') as file:
        cache_data = CpuUnpickler(file).load()
    corpus_sentences = cache_data['sentences']
    corpus_embeddings = cache_data['embeddings']
    return corpus_sentences, corpus_embeddings


def main():
    # Load the dataset as CSV from a public Google Sheet.
    sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
    sheet_name = 'Form Response 3'.replace(' ', '%20')
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    df = pd.read_csv(url)
    df = df.iloc[:, :7]  # keep only the first seven columns

    # Load the precomputed corpus embeddings.
    corpus_sentences, corpus_embeddings = load_embedding()

    # Streamlit form.
    st.title('Job Posting Similarity')
    job_title = st.text_input('Insert the job title below:', '')
    submitted = st.button('Submit')
    if submitted:
        result = get_result(df, job_title, corpus_sentences, corpus_embeddings)
        result.reset_index(drop=True, inplace=True)
        result.index += 1  # display a 1-based index
        st.table(result)


if __name__ == '__main__':
    main()
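
# --- Offline helper (hedged sketch) ----------------------------------------
# The app above assumes 'corpus_embeddings.pkl' already exists. The function
# below sketches one way such a file could be produced, assuming the corpus
# comes from the sheet's 'Last job role' column (the column that
# top_k_similarity() matches against). The preprocessing behind the original
# pickle is not shown in this script, so treat the details as assumptions.
def build_corpus_embeddings(df, out_path='corpus_embeddings.pkl'):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # Deduplicate so each job role is embedded exactly once.
    corpus_sentences = df['Last job role'].dropna().unique().tolist()
    corpus_embeddings = model.encode(corpus_sentences, convert_to_tensor=True)
    # Normalize so dot-product search equals cosine similarity, matching the
    # score_function used in find_top_similar().
    corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
    with open(out_path, 'wb') as f:
        pickle.dump({'sentences': corpus_sentences,
                     'embeddings': corpus_embeddings}, f)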