Spaces:
Runtime error
Elvan Selvano
committed on
Commit · 688b98f
1 Parent(s): 98d0028
Upload app.py
app.py
ADDED
@@ -0,0 +1,96 @@
+import pandas as pd
+import os
+import pickle
+from sentence_transformers import SentenceTransformer, util
+import streamlit as st
+import io
+import torch
+
+@st.cache(allow_output_mutation=True)
+def load_model():
+    return SentenceTransformer('all-MiniLM-L6-v2')
+
+def find_top_similar(sentence, corpus_sentences, corpus_embeddings):
+
+    # preprocess query
+    model = load_model()
+    query_embeddings = model.encode(sentence, convert_to_tensor=True)  # encode to tensor
+    # query_embeddings = query_embeddings.to('cuda')  # put into gpu
+    query_embeddings = util.normalize_embeddings(query_embeddings)  # normalize
+
+    # rank every corpus sentence against the query (dot product on normalized embeddings = cosine similarity)
+    hits = util.semantic_search(query_embeddings,
+                                corpus_embeddings,
+                                top_k=len(corpus_embeddings),
+                                score_function=util.dot_score)
+    hits = hits[0]  # get the hits for the first query
+
+    # collect the matching corpus sentences, ordered by similarity score
+    records = []
+
+    for hit in hits:
+        records.append(corpus_sentences[hit['corpus_id']])
+
+    return records
+
+def top_k_similarity(df, query, corpus_sentences, corpus_embeddings):
+    hits = find_top_similar([query], corpus_sentences, corpus_embeddings)
+
+    res = pd.DataFrame()
+
+    for h in hits:
+        s = df[df['Last job role'] == h]
+        res = pd.concat([res, s])
+
+    return res
+
+def get_result(df, query, corpus_sentences, corpus_embeddings):
+    result = top_k_similarity(df, query, corpus_sentences, corpus_embeddings)
+    result.drop_duplicates(inplace=True)
+    return result
+
+class cpu_unpickler(pickle.Unpickler):
+    """
+    Overrides the default behavior of the `Unpickler` class to load
+    a `torch.storage` object from a byte string
+    """
+    def find_class(self, module, name):
+        if module == 'torch.storage' and name == '_load_from_bytes':
+            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
+        return super().find_class(module, name)
+
+@st.cache(allow_output_mutation=True)
+def load_embedding():
+    """Loads the embeddings from the pickle file"""
+    with open('corpus_embeddings.pkl', 'rb') as file:
+        cache_data = cpu_unpickler(file).load()
+        corpus_sentences = cache_data['sentences']
+        corpus_embeddings = cache_data['embeddings']
+
+    return corpus_sentences, corpus_embeddings
+
+def main():
+    # get dataset
+    sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
+    sheet_name = 'Form Response 3'.replace(' ', '%20')
+    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
+    print(url)
+    df = pd.read_csv(url)
+    df = df.iloc[:, :7]  # keep the first seven columns
+
+    # get embeddings
+    corpus_sentences, corpus_embeddings = load_embedding()
+
+    # streamlit form
+    st.title('Job Posting Similarity')
+    job_title = st.text_input('Insert the job title below:', '')
+    submitted = st.button('Submit')
+
+    if submitted:
+        result = get_result(df, job_title, corpus_sentences, corpus_embeddings)
+        result.reset_index(drop=True, inplace=True)
+        result.index += 1
+        st.table(result)
+
+if __name__ == '__main__':
+    main()
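
The app expects a precomputed corpus_embeddings.pkl next to app.py, holding the 'sentences' and 'embeddings' keys that load_embedding() reads; that file is not part of this commit. A minimal sketch of how it could be produced, assuming the corpus is the list of job-role strings and the same all-MiniLM-L6-v2 model (the helper name build_corpus_pickle and the normalization step are assumptions, not taken from the commit):

import pickle
from sentence_transformers import SentenceTransformer, util

def build_corpus_pickle(corpus_sentences, path='corpus_embeddings.pkl'):
    # Hypothetical offline step, not part of this commit.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(corpus_sentences, convert_to_tensor=True)
    # Unit-normalize so util.dot_score in the app behaves like cosine similarity.
    embeddings = util.normalize_embeddings(embeddings)
    with open(path, 'wb') as f:
        pickle.dump({'sentences': corpus_sentences, 'embeddings': embeddings}, f)

Normalizing here matters because find_top_similar() scores with util.dot_score: on unit-length vectors the dot product equals cosine similarity.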
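
For a quick local check of the same search flow without Streamlit, the pickle, or the Google Sheet, something along these lines could be run before launching the Space with streamlit run app.py (the sample roles and dataframe below are made up):

import pandas as pd
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_sentences = ['Data Scientist', 'Backend Engineer', 'Product Manager']
corpus_embeddings = util.normalize_embeddings(
    model.encode(corpus_sentences, convert_to_tensor=True))

df = pd.DataFrame({'Name': ['A', 'B', 'C'],
                   'Last job role': corpus_sentences})

query_embeddings = util.normalize_embeddings(
    model.encode(['machine learning engineer'], convert_to_tensor=True))
hits = util.semantic_search(query_embeddings, corpus_embeddings,
                            top_k=len(corpus_embeddings),
                            score_function=util.dot_score)[0]

# every corpus sentence comes back ranked by similarity to the query
ranked_roles = [corpus_sentences[h['corpus_id']] for h in hits]
print(df[df['Last job role'].isin(ranked_roles)])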