Spaces:
Runtime error
Runtime error
Elvan Selvano
committed on
Commit
•
fdb6b1e
1
Parent(s):
c06b14d
Restructure inference steps
Browse files
app.py
CHANGED
@@ -1,116 +1,103 @@
|
|
|
|
1 |
import pandas as pd
|
2 |
from sentence_transformers import SentenceTransformer, util
|
3 |
import streamlit as st
|
4 |
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
|
5 |
-
from cpu_unpickler import cpu_unpickler
|
6 |
st.set_page_config(layout='wide')
|
7 |
|
8 |
@st.cache(allow_output_mutation=True)
|
9 |
def load_model():
|
10 |
return SentenceTransformer('all-MiniLM-L6-v2')
|
11 |
|
12 |
-
def
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
# query_embeddings = query_embeddings.to('cuda') # put into gpu
|
17 |
-
query_embeddings = util.normalize_embeddings(query_embeddings) # normalize
|
18 |
|
19 |
-
# find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
|
20 |
hits = util.semantic_search(query_embeddings,
|
21 |
corpus_embeddings,
|
22 |
top_k=len(corpus_embeddings),
|
23 |
score_function=util.dot_score)
|
24 |
-
hits = hits[0] # get the hits for the first query
|
25 |
-
|
26 |
-
# Create dataframe to store top searches
|
27 |
-
records = []
|
28 |
-
|
29 |
-
for hit in hits[0:len(corpus_embeddings)]:
|
30 |
-
records.append(corpus_sentences[hit['corpus_id']])
|
31 |
-
|
32 |
-
return records
|
33 |
|
34 |
-
|
35 |
-
hits = find_top_similar([query], corpus_sentences, corpus_embeddings)
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
res = pd.concat([res, s])
|
42 |
-
|
43 |
-
return res
|
44 |
-
|
45 |
-
def get_result(df, query, corpus_sentences, corpus_embeddings):
|
46 |
-
result = top_k_similarity(df, query, corpus_sentences, corpus_embeddings)
|
47 |
-
result.drop_duplicates(inplace=True)
|
48 |
return result
|
49 |
|
50 |
@st.cache(allow_output_mutation=True)
|
51 |
-
def
|
52 |
-
"""
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
return
|
59 |
-
|
60 |
-
def
|
61 |
-
|
62 |
sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
|
63 |
sheet_name = 'Form Response 3'.replace(' ', '%20')
|
64 |
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
-
|
69 |
-
corpus_sentences, corpus_embeddings = load_embedding()
|
70 |
-
|
71 |
-
# streamlit form
|
72 |
st.title('Job Posting Similarity')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
job_title = st.text_input('Insert the job title below:', '')
|
74 |
submitted = st.button('Submit')
|
75 |
|
76 |
if submitted:
|
77 |
-
st.info(f'Showing results for {
|
78 |
-
result =
|
79 |
-
result
|
80 |
-
result.index += 1
|
81 |
|
82 |
st.download_button(
|
83 |
-
"
|
84 |
result.to_csv().encode('utf-8'),
|
85 |
"result.csv",
|
86 |
"text/csv",
|
87 |
key='download-csv'
|
88 |
)
|
89 |
|
90 |
-
|
91 |
-
gb.configure_pagination(paginationAutoPageSize=True) # Add pagination
|
92 |
-
# gb.configure_side_bar() #Add a sidebar
|
93 |
-
# gb.configure_selection('multiple', use_checkbox=True, groupSelectsChildren="Group checkbox select children") #Enable multi-row selection
|
94 |
-
|
95 |
-
gb.configure_column("LinkedIn Link",
|
96 |
-
headerName="LinkedIn Link",
|
97 |
-
# cellRenderer=JsCode('''function(params) {return '<a href=params.value + '" target="_blank">'+ params.value+'</a>'}'''),
|
98 |
-
cellRenderer=JsCode('''function(params) {return `<a href=${params.value} target="_blank">${params.value}</a>`}'''),
|
99 |
-
width=300)
|
100 |
-
|
101 |
-
gridOptions = gb.build()
|
102 |
-
|
103 |
-
grid_response = AgGrid(
|
104 |
-
dataframe=result,
|
105 |
-
gridOptions=gridOptions,
|
106 |
-
height=1100,
|
107 |
-
fit_columns_on_grid_load=True,
|
108 |
-
data_return_mode='AS_INPUT',
|
109 |
-
update_mode='VALUE_CHANGED',
|
110 |
-
theme='light',
|
111 |
-
enable_enterprise_modules=True,
|
112 |
-
allow_unsafe_jscode=True,
|
113 |
-
)
|
114 |
|
115 |
if __name__ == '__main__':
|
116 |
main()
|
|
|
1 |
+
from typing import List, Tuple
|
2 |
import pandas as pd
|
3 |
from sentence_transformers import SentenceTransformer, util
|
4 |
import streamlit as st
|
5 |
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
|
|
|
6 |
# Configure the Streamlit page to use the wide layout.
st.set_page_config(layout='wide')
|
7 |
|
8 |
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the 'all-MiniLM-L6-v2' sentence-transformer used for encoding.

    The function is wrapped in ``st.cache`` so Streamlit can reuse the
    returned model instead of constructing it again.

    Returns:
        The ``SentenceTransformer`` instance.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder
|
11 |
|
12 |
+
def semantic_search(model, sentence, corpus_embeddings):
    """Score every corpus embedding against the encoded query.

    Args:
        model: Object exposing ``encode`` (presumably a SentenceTransformer;
            verify against caller).
        sentence: Query text handed straight to ``model.encode``. Only the
            hits of the first query are returned.
        corpus_embeddings: Embeddings to search. Its length is used as
            ``top_k`` so that no corpus entry is cut from the ranking.

    Returns:
        A ``pd.DataFrame`` built from the hit list of the first query.
        ``top_k_similarity`` in this file relies on its ``corpus_id`` and
        ``score`` columns.
    """
    # The query is normalized at encode time; the dot product then equals
    # cosine similarity provided the corpus embeddings are normalized too
    # (assumption -- confirm where they are created).
    encode_options = {'convert_to_tensor': True, 'normalize_embeddings': True}
    query_embeddings = model.encode(sentence, **encode_options)

    corpus_size = len(corpus_embeddings)
    ranked = util.semantic_search(query_embeddings,
                                  corpus_embeddings,
                                  top_k=corpus_size,
                                  score_function=util.dot_score)

    first_query_hits = ranked[0]
    return pd.DataFrame(first_query_hits)
|
|
|
23 |
|
24 |
+
def top_k_similarity(model, df, query, corpus_embeddings):
    """Attach similarity scores to ``df`` and order rows best match first.

    Args:
        model: Encoder forwarded to ``semantic_search``.
        df: DataFrame with an ``ID`` column that is matched against the
            ``corpus_id`` of each hit.
        query: Single query string; it is wrapped in a list before encoding.
        corpus_embeddings: Embeddings forwarded to ``semantic_search``.

    Returns:
        The merged DataFrame (rows of ``df`` joined with their hit),
        sorted by ``score`` in descending order.
    """
    scored_hits = semantic_search(model, [query], corpus_embeddings)
    joined = df.merge(scored_hits, left_on='ID', right_on='corpus_id')
    return joined.sort_values(by='score', ascending=False)
|
29 |
|
30 |
@st.cache(allow_output_mutation=True)
def create_embedding(model: SentenceTransformer, data: pd.DataFrame, key: str):
    """Create vector embeddings from one column of the dataset.

    Args:
        model: Encoder whose ``encode`` method produces the embeddings.
        data: Dataset holding the text to embed.
        key: Name of the column in ``data`` to embed. Its values are cast
            to ``str`` first, so non-string cells (e.g. NaN) are encoded
            as their string form.

    Returns:
        The single object returned by ``model.encode`` called with
        ``convert_to_tensor=True`` and ``normalize_embeddings=True``
        (a ``torch.Tensor`` according to the sentence-transformers docs).
        The previous ``Tuple[list, list]`` annotation did not match this
        and was removed; no annotation is given because ``torch`` is not
        imported in this file.
    """
    corpus_sentences = data[key].astype(str).tolist()
    corpus_embeddings = model.encode(sentences=corpus_sentences,
                                     show_progress_bar=True,
                                     convert_to_tensor=True,
                                     normalize_embeddings=True)
    return corpus_embeddings
|
39 |
+
|
40 |
+
def load_dataset(columns: List[str]) -> pd.DataFrame:
    """Load real-time dataset from google sheets.

    Args:
        columns: Names to assign to the leading columns of the sheet.
            Only the first ``len(columns)`` sheet columns are kept.

    Returns:
        DataFrame with a sequential ``ID`` column (0..n-1) inserted in
        front, followed by the renamed sheet columns.

    Raises:
        ValueError: Raised by pandas when the sheet has fewer columns
            than ``columns`` has names.
    """
    sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
    sheet_name = 'Form Response 3'.replace(' ', '%20')
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    data = pd.read_csv(url)
    # Keep exactly as many columns as there are names, instead of a
    # hard-coded count that must be kept in sync with the caller's list.
    data = data.iloc[:, :len(columns)]
    data.columns = columns
    # Positional row ID, used elsewhere in this file as the merge key
    # against the search hits' ``corpus_id``.
    data.insert(0, 'ID', range(len(data)))
    return data
|
50 |
+
|
51 |
+
def show_aggrid_table(result: pd.DataFrame):
    """Render ``result`` as an AgGrid table with clickable profile links.

    The grid is configured with pagination, a side bar and multi-row
    checkbox selection. The "LinkedIn Profile" column is drawn by a custom
    JavaScript renderer that turns each value into a link opening in a
    new tab.

    Args:
        result: DataFrame to display; the custom renderer is configured
            for its "LinkedIn Profile" column.

    Returns:
        None. The grid is rendered as a side effect.
    """
    gb = GridOptionsBuilder.from_dataframe(result)
    gb.configure_pagination(paginationAutoPageSize=True)
    gb.configure_side_bar()
    gb.configure_selection('multiple', use_checkbox=True, groupSelectsChildren="Group checkbox select children")

    # The href value is quoted so that a value containing whitespace or
    # '>' cannot terminate the attribute early, and rel="noopener
    # noreferrer" keeps the opened tab from reaching back via
    # window.opener.
    # NOTE(review): the cell value is still interpolated without HTML
    # escaping; if the sheet contents are untrusted this needs sanitizing.
    link_renderer = JsCode('''function(params) {return `<a href="${params.value}" target="_blank" rel="noopener noreferrer">${params.value}</a>`}''')
    gb.configure_column(field="LinkedIn Profile",
                        headerName="LinkedIn Profile",
                        cellRenderer=link_renderer)

    gridOptions = gb.build()

    # The grid response is not used, so it is not bound to a name.
    AgGrid(
        dataframe=result,
        gridOptions=gridOptions,
        height=1100,
        fit_columns_on_grid_load=True,
        data_return_mode='AS_INPUT',
        update_mode='VALUE_CHANGED',
        theme='light',
        enable_enterprise_modules=True,
        allow_unsafe_jscode=True,  # required for the JsCode renderer above
    )
|
73 |
|
74 |
+
def main():
    """Draw the page: load data and embeddings, then show search results.

    The dataset, the model and the corpus embeddings of the
    'Previous Role' column are prepared first. After the user presses
    'Submit', rows are ranked against the entered job title, offered as a
    CSV download and displayed in the AgGrid table.
    """
    st.title('Job Posting Similarity')
    st.write('This app will help you find similar job titles real-time from ecommurz google sheets.')

    role_column = 'Previous Role'
    columns = [
        'Timestamp', 'Full Name', 'Company', role_column,
        'Experience', 'Last Day', 'LinkedIn Profile',
    ]
    dataset = load_dataset(columns)
    encoder = load_model()
    embeddings = create_embedding(encoder, dataset, role_column)

    job_title = st.text_input('Insert the job title below:', '')
    if not st.button('Submit'):
        # Nothing to search for until the button is pressed.
        return

    st.info(f'Showing results for {job_title}')
    ranked = top_k_similarity(encoder, dataset, job_title, embeddings)
    # Keep only the sheet columns; this drops the ID and hit columns.
    visible = ranked[columns]

    csv_bytes = visible.to_csv().encode('utf-8')
    st.download_button(
        "Download Table",
        csv_bytes,
        "result.csv",
        "text/csv",
        key='download-csv'
    )

    show_aggrid_table(visible)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
# Start the app only when this file is run as the main module.
if __name__ == '__main__':
    main()
|