Elvan Selvano committed on
Commit
fdb6b1e
•
1 Parent(s): c06b14d

Restructure inference steps

Browse files
Files changed (1) hide show
  1. app.py +63 -76
app.py CHANGED
@@ -1,116 +1,103 @@
 
1
  import pandas as pd
2
  from sentence_transformers import SentenceTransformer, util
3
  import streamlit as st
4
  from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
5
- from cpu_unpickler import cpu_unpickler
6
  st.set_page_config(layout='wide')
7
 
8
  @st.cache(allow_output_mutation=True)
9
  def load_model():
10
  return SentenceTransformer('all-MiniLM-L6-v2')
11
 
12
- def find_top_similar(sentence, corpus_sentences, corpus_embeddings):
13
- # preprocess query
14
- model = load_model()
15
- query_embeddings = model.encode(sentence, convert_to_tensor=True) # encode to tensor
16
- # query_embeddings = query_embeddings.to('cuda') # put into gpu
17
- query_embeddings = util.normalize_embeddings(query_embeddings) # normalize
18
 
19
- # find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
20
  hits = util.semantic_search(query_embeddings,
21
  corpus_embeddings,
22
  top_k=len(corpus_embeddings),
23
  score_function=util.dot_score)
24
- hits = hits[0] # get the hits for the first query
25
-
26
- # Create dataframe to store top searches
27
- records = []
28
-
29
- for hit in hits[0:len(corpus_embeddings)]:
30
- records.append(corpus_sentences[hit['corpus_id']])
31
-
32
- return records
33
 
34
- def top_k_similarity(df, query, corpus_sentences, corpus_embeddings):
35
- hits = find_top_similar([query], corpus_sentences, corpus_embeddings)
36
 
37
- res = pd.DataFrame()
38
-
39
- for h in hits:
40
- s = df[df['Last job role'] == h]
41
- res = pd.concat([res, s])
42
-
43
- return res
44
-
45
- def get_result(df, query, corpus_sentences, corpus_embeddings):
46
- result = top_k_similarity(df, query, corpus_sentences, corpus_embeddings)
47
- result.drop_duplicates(inplace=True)
48
  return result
49
 
50
  @st.cache(allow_output_mutation=True)
51
- def load_embedding():
52
- """Loads the embeddings from the pickle file"""
53
- with open('corpus_embeddings.pkl', 'rb') as file:
54
- cache_data = cpu_unpickler(file).load()
55
- corpus_sentences = cache_data['sentences']
56
- corpus_embeddings = cache_data['embeddings']
57
-
58
- return corpus_sentences, corpus_embeddings
59
-
60
- def main():
61
- # get dataset
62
  sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
63
  sheet_name = 'Form Response 3'.replace(' ', '%20')
64
  url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
65
- df = pd.read_csv(url)
66
- df = df.iloc[: , :7]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # get embeddings
69
- corpus_sentences, corpus_embeddings = load_embedding()
70
-
71
- # streamlit form
72
  st.title('Job Posting Similarity')
 
 
 
 
 
 
 
 
73
  job_title = st.text_input('Insert the job title below:', '')
74
  submitted = st.button('Submit')
75
 
76
  if submitted:
77
- st.info(f'Showing results for { job_title}')
78
- result = get_result(df, job_title, corpus_sentences, corpus_embeddings)
79
- result.reset_index(drop=True, inplace=True)
80
- result.index += 1
81
 
82
  st.download_button(
83
- "Press to Download",
84
  result.to_csv().encode('utf-8'),
85
  "result.csv",
86
  "text/csv",
87
  key='download-csv'
88
  )
89
 
90
- gb = GridOptionsBuilder.from_dataframe(result)
91
- gb.configure_pagination(paginationAutoPageSize=True) # Add pagination
92
- # gb.configure_side_bar() #Add a sidebar
93
- # gb.configure_selection('multiple', use_checkbox=True, groupSelectsChildren="Group checkbox select children") #Enable multi-row selection
94
-
95
- gb.configure_column("LinkedIn Link",
96
- headerName="LinkedIn Link",
97
- # cellRenderer=JsCode('''function(params) {return '<a href=params.value + '" target="_blank">'+ params.value+'</a>'}'''),
98
- cellRenderer=JsCode('''function(params) {return `<a href=${params.value} target="_blank">${params.value}</a>`}'''),
99
- width=300)
100
-
101
- gridOptions = gb.build()
102
-
103
- grid_response = AgGrid(
104
- dataframe=result,
105
- gridOptions=gridOptions,
106
- height=1100,
107
- fit_columns_on_grid_load=True,
108
- data_return_mode='AS_INPUT',
109
- update_mode='VALUE_CHANGED',
110
- theme='light',
111
- enable_enterprise_modules=True,
112
- allow_unsafe_jscode=True,
113
- )
114
 
115
  if __name__ == '__main__':
116
  main()
 
1
+ from typing import List, Tuple
2
  import pandas as pd
3
  from sentence_transformers import SentenceTransformer, util
4
  import streamlit as st
5
  from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
 
6
# Use the full browser width for the results table. Streamlit's documentation
# asks for the page configuration to be set before other Streamlit commands.
st.set_page_config(layout='wide')


@st.cache(allow_output_mutation=True)
def load_model():
    """Build the sentence-embedding model used for queries and corpus.

    The function is wrapped in ``st.cache`` so the model is constructed once
    and reused across Streamlit reruns instead of being reloaded each time.

    Returns:
        A ``SentenceTransformer`` instance for ``all-MiniLM-L6-v2``.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder
11
 
12
def semantic_search(model, sentence, corpus_embeddings):
    """Score every corpus entry against a query.

    Args:
        model: Object exposing a ``SentenceTransformer``-style ``encode``.
        sentence: Query text (or a list of query texts). Only the hits of the
            first query are returned.
        corpus_embeddings: Pre-computed corpus embeddings; its length is used
            as ``top_k`` so that no entry is dropped from the ranking.

    Returns:
        A DataFrame with exactly two columns, ``corpus_id`` (int64) and
        ``score`` (float64), one row per hit. The columns are present even
        when there are no hits, so callers can always merge on ``corpus_id``.
    """
    query_embeddings = model.encode(sentence,
                                    convert_to_tensor=True,
                                    normalize_embeddings=True)

    # The query is unit-normalised above, so the dot product equals cosine
    # similarity provided the corpus embeddings were normalised as well.
    hits = util.semantic_search(query_embeddings,
                                corpus_embeddings,
                                top_k=len(corpus_embeddings),
                                score_function=util.dot_score)

    # Pin the schema: ``pd.DataFrame([])`` would otherwise have no columns at
    # all and a later merge on 'corpus_id' would raise KeyError.
    ranked = pd.DataFrame(hits[0], columns=['corpus_id', 'score'])
    return ranked.astype({'corpus_id': 'int64', 'score': 'float64'})
 
23
 
24
def top_k_similarity(model, df, query, corpus_embeddings):
    """Attach similarity scores to ``df`` and order it best match first.

    Args:
        model: Embedding model forwarded to ``semantic_search``.
        df: Dataset whose ``ID`` column is matched against the ``corpus_id``
            values of the search hits.
        query: Single query string; it is wrapped in a list before searching.
        corpus_embeddings: Embeddings forwarded to ``semantic_search``.

    Returns:
        The inner join of ``df`` with the hits, sorted by ``score`` in
        descending order.
    """
    scores = semantic_search(model, [query], corpus_embeddings)
    matched = df.merge(scores, left_on='ID', right_on='corpus_id')
    return matched.sort_values(by='score', ascending=False)
29
 
30
@st.cache(allow_output_mutation=True)
def create_embedding(model: SentenceTransformer, data: pd.DataFrame, key: str) -> "torch.Tensor":
    """Create vector embeddings for one column of the dataset.

    Args:
        model: Sentence-embedding model used to encode the text.
        data: Dataset holding the text to embed.
        key: Name of the column in ``data`` to embed. Values are converted to
            ``str`` first, so non-string cells are embedded as their string
            form.

    Returns:
        The embeddings only (not a tuple): ``encode`` is called with
        ``convert_to_tensor=True`` and ``normalize_embeddings=True``, with
        sentences passed in the row order of ``data``.
    """
    corpus_sentences = data[key].astype(str).tolist()
    corpus_embeddings = model.encode(sentences=corpus_sentences,
                                     show_progress_bar=True,
                                     convert_to_tensor=True,
                                     normalize_embeddings=True)
    return corpus_embeddings
39
+
40
def load_dataset(columns: List) -> pd.DataFrame:
    """Load the live dataset from Google Sheets.

    The sheet is fetched as CSV on every call, so the result reflects the
    current sheet content.

    Args:
        columns: Names to assign to the leading columns of the sheet. Only
            the first ``len(columns)`` sheet columns are kept.

    Returns:
        A DataFrame whose first column is ``ID`` (0-based row position)
        followed by the renamed sheet columns.

    Raises:
        ValueError: If the sheet has fewer columns than ``columns`` has names
            (raised by pandas when the names are assigned).
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import quote

    sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
    sheet_name = quote('Form Response 3')
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'

    raw = pd.read_csv(url)
    # Slice by the number of requested names rather than a hard-coded 7 so the
    # slice and the rename below cannot drift apart; copy so the rename and
    # insert act on an independent frame rather than a slice of ``raw``.
    data = raw.iloc[:, :len(columns)].copy()
    data.columns = columns
    data.insert(0, 'ID', range(len(data)))
    return data
50
+
51
def show_aggrid_table(result: pd.DataFrame):
    """Render ``result`` as a paginated, selectable AgGrid table.

    The ``LinkedIn Profile`` column is drawn as a clickable link that opens
    in a new tab.

    Args:
        result: Table to display; expected to contain a ``LinkedIn Profile``
            column.
    """
    # The href attribute is quoted so values containing spaces stay inside
    # the attribute, and rel="noopener noreferrer" stops the opened page from
    # reaching back into this one.
    # NOTE(review): cell values are still interpolated into HTML unescaped and
    # the grid runs with allow_unsafe_jscode=True; if the sheet accepts
    # arbitrary user input, this renderer should escape or validate the URL.
    link_renderer = JsCode(
        '''function(params) {return `<a href="${params.value}" target="_blank" rel="noopener noreferrer">${params.value}</a>`}''')

    gb = GridOptionsBuilder.from_dataframe(result)
    gb.configure_pagination(paginationAutoPageSize=True)
    gb.configure_side_bar()
    gb.configure_selection('multiple', use_checkbox=True, groupSelectsChildren="Group checkbox select children")
    gb.configure_column(field="LinkedIn Profile",
                        headerName="LinkedIn Profile",
                        cellRenderer=link_renderer)

    # The grid's return value was never used, so it is no longer bound.
    AgGrid(
        dataframe=result,
        gridOptions=gb.build(),
        height=1100,
        fit_columns_on_grid_load=True,
        data_return_mode='AS_INPUT',
        update_mode='VALUE_CHANGED',
        theme='light',
        enable_enterprise_modules=True,
        allow_unsafe_jscode=True,
    )
73
 
74
def main():
    """Run the Streamlit page: load data, embed it, and answer one query.

    On each run the dataset is loaded, the model is obtained, and embeddings
    for the ``Previous Role`` column are created. When the user submits a
    non-empty job title, the dataset is ranked by similarity to it and shown
    both as a CSV download and as an interactive table.
    """
    st.title('Job Posting Similarity')
    st.write('This app will help you find similar job titles real-time from ecommurz google sheets.')

    columns = ['Timestamp', 'Full Name', 'Company', 'Previous Role',
               'Experience', 'Last Day', 'LinkedIn Profile']
    data = load_dataset(columns)
    model = load_model()
    corpus_embeddings = create_embedding(model, data, 'Previous Role')

    job_title = st.text_input('Insert the job title below:', '')
    submitted = st.button('Submit')

    if not submitted:
        return

    # An empty or whitespace-only query carries no meaning to rank against,
    # so ask for input instead of showing an arbitrary ordering.
    if not job_title.strip():
        st.warning('Please enter a job title before submitting.')
        return

    st.info(f'Showing results for {job_title}')
    result = top_k_similarity(model, data, job_title, corpus_embeddings)
    # Drop the helper columns (ID, corpus_id, score) before display/export.
    result = result[columns]

    st.download_button(
        "Download Table",
        result.to_csv().encode('utf-8'),
        "result.csv",
        "text/csv",
        key='download-csv'
    )

    show_aggrid_table(result)


# Run the app only when executed as a script, not when imported.
if __name__ == '__main__':
    main()