Michelangiolo committed on
Commit
9fa587a
1 Parent(s): 942976d
Files changed (3) hide show
  1. app.py +59 -17
  2. data_manipulation.ipynb +416 -0
  3. df_encoded.parquet +2 -2
app.py CHANGED
@@ -8,35 +8,77 @@ from sentence_transformers import SentenceTransformer
8
  model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2
9
 
10
  df = pd.read_parquet('df_encoded.parquet')
11
- df.columns = [['name', 'description', 'year', 'target', 'size', 'stage', 'raised', 'tags', 'text_vector_']]
12
- #if parsing from a parquet, I have a list of array that does not want to get changed
13
- df_knn = [x[0].tolist() for x in df['text_vector_'].values.tolist()]
 
 
 
 
 
 
 
 
 
14
  df = df.reset_index(drop=True)
15
 
16
  from sklearn.neighbors import NearestNeighbors
17
- import numpy as np
18
  import pandas as pd
19
  from sentence_transformers import SentenceTransformer
20
 
21
- #prepare model
22
- nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(df_knn)
 
 
 
 
 
 
 
 
23
 
24
- def search(query):
25
  product = model.encode(query).tolist()
26
  # product = df.iloc[0]['text_vector_'] #use one of the products as sample
27
 
 
 
 
28
  distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object
29
 
30
  #print out the description of every recommended product
31
- return df.iloc[list(indices)[0]][['name', 'description', 'year', 'target', 'size', 'stage', 'raised', 'tags']]
32
 
33
  #the first module becomes text1, the second module file1
34
- def greet(text1):
35
- return search(text1)
36
-
37
- iface = gr.Interface(
38
- fn=greet,
39
- inputs=['text'],
40
- outputs=["dataframe"]
41
- )
42
- iface.launch(share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2
9
 
10
  df = pd.read_parquet('df_encoded.parquet')
11
+ df['tags'] = df['tags'].apply(lambda x : str(x))
12
def parse_raised(x):
    """Convert a funding label such as ``'$16.1M'`` into millions of dollars.

    Args:
        x: Raw label. Either the literal ``'Undisclosed'`` or a string made
            of one leading currency character, a number and a one-letter
            magnitude suffix (``K``, ``M`` or ``B``), e.g. ``'$500K'``.

    Returns:
        ``0`` for ``'Undisclosed'``; otherwise the amount expressed in
        millions as a float. ``None`` when the suffix is not recognised.

    Raises:
        ValueError: If the text between the first and last character is not
            a valid number.
    """
    if x == 'Undisclosed':
        return 0

    suffix = x[-1]
    # Drop the leading currency character and the trailing magnitude suffix.
    amount = float(x[1:-1])

    if suffix == 'K':
        return amount / 1000
    if suffix == 'M':
        return amount
    if suffix == 'B':
        # Billions were previously dropped (the function fell through and
        # returned None); express them in millions like the other branches.
        return amount * 1000
    # Unknown magnitude: keep the historical "no value" result, but explicit.
    return None
22
+ df['raised'] = df['raised'].apply(lambda x : parse_raised(x))
23
  df = df.reset_index(drop=True)
24
 
25
  from sklearn.neighbors import NearestNeighbors
 
26
  import pandas as pd
27
  from sentence_transformers import SentenceTransformer
28
 
29
def filter_df(df, column_name, filter_type, filter_value):
    """Return the rows of ``df`` whose ``column_name`` passes a simple filter.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: Name of the column the filter is applied to.
        filter_type: One of ``'=='``, ``'>='``, ``'<='`` or ``'contains'``.
        filter_value: Value compared against the column. For ``'contains'``
            it is the pattern handed to ``Series.str.contains``.

    Returns:
        A DataFrame holding only the matching rows.

    Raises:
        ValueError: If ``filter_type`` is not one of the supported operators.
    """
    column = df[column_name]
    if filter_type == '==':
        mask = column == filter_value
    elif filter_type == '>=':
        mask = column >= filter_value
    elif filter_type == '<=':
        mask = column <= filter_value
    elif filter_type == 'contains':
        # Use the requested column (it used to be hard-coded to 'target') and
        # treat missing values as "no match" so the mask stays boolean.
        mask = column.str.contains(filter_value, na=False)
    else:
        raise ValueError(f"Unsupported filter_type: {filter_type!r}")
    return df[mask]
39
 
40
def search(df, query, n_neighbors=20):
    """Return the rows of ``df`` whose embeddings are closest to ``query``.

    The query is embedded with the module-level ``model`` and compared with
    the vectors stored in the ``text_vector_`` column using a freshly fitted
    ``NearestNeighbors`` index.

    Args:
        df: Candidate rows; must contain ``text_vector_`` and the display
            columns listed below.
        query: Free-text description to search for.
        n_neighbors: Maximum number of rows to return. Defaults to 20, the
            value that used to be hard-coded.

    Returns:
        A DataFrame with the display columns, in the order the neighbours are
        returned by ``kneighbors``. Empty when ``df`` has no rows.
    """
    columns = ['name', 'description', 'raised', 'year', 'target', 'size', 'stage', 'tags']

    # Nothing to index: fitting on zero samples would raise.
    if df.empty:
        return df[columns]

    product = model.encode(query).tolist()

    # Callers pre-filter df, so it may hold fewer rows than requested;
    # kneighbors raises when n_neighbors exceeds the number of fitted samples.
    k = min(n_neighbors, len(df))
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(df['text_vector_'].tolist())

    _, indices = nbrs.kneighbors([product])  # one query row -> one row of indices

    return df.iloc[indices[0]][columns]
51
 
52
  #the first module becomes text1, the second module file1
53
def greet(size, target, raised, query):
    """Filter the startup table by the UI inputs and rank it against ``query``.

    Args:
        size: Company-size bucket that must match the ``size`` column exactly.
        target: Audience label (e.g. ``'B2B'``) that must be one of the
            comma-separated entries of the ``target`` column.
        raised: Minimum funding in millions; rows whose ``raised`` is 0
            (shown as "Undisclosed") are always kept.
        query: Free-text description passed to ``search``.

    Returns:
        The rows returned by ``search``, sorted by ``raised`` in descending
        order, with a ``raised`` of 0 displayed as ``'Undisclosed'``.
    """
    df_size = filter_df(df, 'size', '==', size)

    # Compare whole comma-separated entries: a plain substring test would make
    # a 'B2B' selection also match rows that are only 'B2B2C'.
    has_target = pd.Series(
        [target in {part.strip() for part in str(value).split(',')}
         for value in df_size['target']],
        index=df_size.index,
        dtype=bool,
    )
    df_target = df_size[has_target]

    funding = df_target['raised']
    df_raised = df_target[(funding >= raised) | (funding == 0)]

    df_knn = search(df_raised, query)

    # We leave the sorting for last, while 'raised' is still numeric.
    df_knn = df_knn.sort_values('raised', ascending=False)
    df_knn['raised'] = df_knn['raised'].map(lambda value: 'Undisclosed' if value == 0 else value)

    return df_knn
69
+
70
# Build the search UI: three filter widgets and a free-text query feed
# `greet`, whose DataFrame result is rendered in the output table.
with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
    gr.Markdown(
        """
        # Startup Search Engine
        """
    )
    # gr.Radio is single-choice already; the former `multiselect=False`
    # keyword was ignored by Gradio with a warning, so it is dropped.
    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+'], value='11-50', label='size')
    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', label='target')
    # The slider increment keyword is `step`; `step_size` was ignored with a
    # warning, so the intended step of 1 was never applied.
    raised = gr.Slider(0, 20, value=5, step=1, label="Minimum raising (in Millions)")
    query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')
    btn = gr.Button(value="Search for a Startup")
    output1 = gr.DataFrame(label='value')
    btn.click(greet, [size, target, raised, query], [output1])
demo.launch(share=False)
data_manipulation.ipynb ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 78,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/html": [
11
+ "<div>\n",
12
+ "<style scoped>\n",
13
+ " .dataframe tbody tr th:only-of-type {\n",
14
+ " vertical-align: middle;\n",
15
+ " }\n",
16
+ "\n",
17
+ " .dataframe tbody tr th {\n",
18
+ " vertical-align: top;\n",
19
+ " }\n",
20
+ "\n",
21
+ " .dataframe thead th {\n",
22
+ " text-align: right;\n",
23
+ " }\n",
24
+ "</style>\n",
25
+ "<table border=\"1\" class=\"dataframe\">\n",
26
+ " <thead>\n",
27
+ " <tr style=\"text-align: right;\">\n",
28
+ " <th></th>\n",
29
+ " <th>name</th>\n",
30
+ " <th>description</th>\n",
31
+ " <th>year</th>\n",
32
+ " <th>target</th>\n",
33
+ " <th>size</th>\n",
34
+ " <th>stage</th>\n",
35
+ " <th>raised</th>\n",
36
+ " <th>tags</th>\n",
37
+ " <th>text_vector_</th>\n",
38
+ " </tr>\n",
39
+ " </thead>\n",
40
+ " <tbody>\n",
41
+ " <tr>\n",
42
+ " <th>0</th>\n",
43
+ " <td>0.10 of a Second</td>\n",
44
+ " <td>Smart Indicators for Connected Vehicles</td>\n",
45
+ " <td>2019.0</td>\n",
46
+ " <td>B2B</td>\n",
47
+ " <td>1-10</td>\n",
48
+ " <td>Pre-Funding</td>\n",
49
+ " <td>Undisclosed</td>\n",
50
+ " <td>[connected-vehicles, adas, autonomous-vehicles...</td>\n",
51
+ " <td>[-0.031224824488162994, -0.06342269480228424, ...</td>\n",
52
+ " </tr>\n",
53
+ " <tr>\n",
54
+ " <th>1</th>\n",
55
+ " <td>12trix</td>\n",
56
+ " <td>Math Lessons for Young Kids</td>\n",
57
+ " <td>2012.0</td>\n",
58
+ " <td>B2B, B2C</td>\n",
59
+ " <td>1-10</td>\n",
60
+ " <td>Pre-Funding</td>\n",
61
+ " <td>Undisclosed</td>\n",
62
+ " <td>[sdg, schools, pre-k, serious-games, games, mo...</td>\n",
63
+ " <td>[-0.038649097084999084, 0.028091922402381897, ...</td>\n",
64
+ " </tr>\n",
65
+ " <tr>\n",
66
+ " <th>2</th>\n",
67
+ " <td>1E Therapeutics</td>\n",
68
+ " <td>Novel RNA-targeting Drugs</td>\n",
69
+ " <td>2021.0</td>\n",
70
+ " <td>B2B</td>\n",
71
+ " <td>51-200</td>\n",
72
+ " <td>Seed</td>\n",
73
+ " <td>$120M</td>\n",
74
+ " <td>[pharmaceuticals, chronic-disease, immunology,...</td>\n",
75
+ " <td>[0.04561534896492958, -0.017776092514395714, 0...</td>\n",
76
+ " </tr>\n",
77
+ " <tr>\n",
78
+ " <th>3</th>\n",
79
+ " <td>1MRobotics</td>\n",
80
+ " <td>Retail Automation Solutions with Nano Fulfillment</td>\n",
81
+ " <td>2021.0</td>\n",
82
+ " <td>B2B</td>\n",
83
+ " <td>11-50</td>\n",
84
+ " <td>A</td>\n",
85
+ " <td>$25M</td>\n",
86
+ " <td>[omni-channel, ecommerce, climate-tech, artifi...</td>\n",
87
+ " <td>[0.0024080690927803516, -0.03042100928723812, ...</td>\n",
88
+ " </tr>\n",
89
+ " <tr>\n",
90
+ " <th>4</th>\n",
91
+ " <td>1touch.io</td>\n",
92
+ " <td>Personal Data Flow Tracking and Data Cataloging</td>\n",
93
+ " <td>2017.0</td>\n",
94
+ " <td>B2B</td>\n",
95
+ " <td>51-200</td>\n",
96
+ " <td>A</td>\n",
97
+ " <td>$16.1M</td>\n",
98
+ " <td>[enterprise-solutions, data-protection, cyber-...</td>\n",
99
+ " <td>[-0.01007091999053955, 0.10431888699531555, -0...</td>\n",
100
+ " </tr>\n",
101
+ " <tr>\n",
102
+ " <th>...</th>\n",
103
+ " <td>...</td>\n",
104
+ " <td>...</td>\n",
105
+ " <td>...</td>\n",
106
+ " <td>...</td>\n",
107
+ " <td>...</td>\n",
108
+ " <td>...</td>\n",
109
+ " <td>...</td>\n",
110
+ " <td>...</td>\n",
111
+ " <td>...</td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>4981</th>\n",
115
+ " <td>YOW HR</td>\n",
116
+ " <td>Human Resources Engagement and Optimization Pl...</td>\n",
117
+ " <td>2020.0</td>\n",
118
+ " <td>B2B, B2B2C</td>\n",
119
+ " <td>1-10</td>\n",
120
+ " <td>Pre-Funding</td>\n",
121
+ " <td>Undisclosed</td>\n",
122
+ " <td>[content-creators, e-learning, software-applic...</td>\n",
123
+ " <td>[0.026961881667375565, 0.002459645736962557, -...</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>4982</th>\n",
127
+ " <td>Yummi Home Food</td>\n",
128
+ " <td>Marketplace for Homemade Food</td>\n",
129
+ " <td>2012.0</td>\n",
130
+ " <td>B2C</td>\n",
131
+ " <td>11-50</td>\n",
132
+ " <td>Pre-Funding</td>\n",
133
+ " <td>Undisclosed</td>\n",
134
+ " <td>[ecommerce, p2p, delivery, online-shopping, ma...</td>\n",
135
+ " <td>[0.0036857957020401955, 0.03582162782549858, -...</td>\n",
136
+ " </tr>\n",
137
+ " <tr>\n",
138
+ " <th>4983</th>\n",
139
+ " <td>Yung-Etgar</td>\n",
140
+ " <td>Custom Mechanized Harvesting Systems</td>\n",
141
+ " <td>1982.0</td>\n",
142
+ " <td>B2B</td>\n",
143
+ " <td>51-200</td>\n",
144
+ " <td>Mature</td>\n",
145
+ " <td>Undisclosed</td>\n",
146
+ " <td>[crops, agtech, harvesting, machinery, sdg, cl...</td>\n",
147
+ " <td>[0.027293115854263306, 0.010461761616170406, 0...</td>\n",
148
+ " </tr>\n",
149
+ " <tr>\n",
150
+ " <th>4984</th>\n",
151
+ " <td>YuviTal</td>\n",
152
+ " <td>Digital Health and Fitness Solutions for Organ...</td>\n",
153
+ " <td>2017.0</td>\n",
154
+ " <td>B2B, B2C, B2G</td>\n",
155
+ " <td>11-50</td>\n",
156
+ " <td>Pre-Funding</td>\n",
157
+ " <td>Undisclosed</td>\n",
158
+ " <td>[fitness, digital-wallet, discount, mobile-app...</td>\n",
159
+ " <td>[0.02851911261677742, 0.05474231392145157, -0....</td>\n",
160
+ " </tr>\n",
161
+ " <tr>\n",
162
+ " <th>4985</th>\n",
163
+ " <td>Z-square</td>\n",
164
+ " <td>Microendoscope for Minimally Invasive Imaging ...</td>\n",
165
+ " <td>2013.0</td>\n",
166
+ " <td>B2B</td>\n",
167
+ " <td>11-50</td>\n",
168
+ " <td>Seed</td>\n",
169
+ " <td>$10M</td>\n",
170
+ " <td>[endoscopy, medical-devices, minimally-invasiv...</td>\n",
171
+ " <td>[0.012587728910148144, -0.07959864288568497, -...</td>\n",
172
+ " </tr>\n",
173
+ " </tbody>\n",
174
+ "</table>\n",
175
+ "<p>4986 rows × 9 columns</p>\n",
176
+ "</div>"
177
+ ],
178
+ "text/plain": [
179
+ " name description \\\n",
180
+ "0 0.10 of a Second Smart Indicators for Connected Vehicles \n",
181
+ "1 12trix Math Lessons for Young Kids \n",
182
+ "2 1E Therapeutics Novel RNA-targeting Drugs \n",
183
+ "3 1MRobotics Retail Automation Solutions with Nano Fulfillment \n",
184
+ "4 1touch.io Personal Data Flow Tracking and Data Cataloging \n",
185
+ "... ... ... \n",
186
+ "4981 YOW HR Human Resources Engagement and Optimization Pl... \n",
187
+ "4982 Yummi Home Food Marketplace for Homemade Food \n",
188
+ "4983 Yung-Etgar Custom Mechanized Harvesting Systems \n",
189
+ "4984 YuviTal Digital Health and Fitness Solutions for Organ... \n",
190
+ "4985 Z-square Microendoscope for Minimally Invasive Imaging ... \n",
191
+ "\n",
192
+ " year target size stage raised \\\n",
193
+ "0 2019.0 B2B 1-10 Pre-Funding Undisclosed \n",
194
+ "1 2012.0 B2B, B2C 1-10 Pre-Funding Undisclosed \n",
195
+ "2 2021.0 B2B 51-200 Seed $120M \n",
196
+ "3 2021.0 B2B 11-50 A $25M \n",
197
+ "4 2017.0 B2B 51-200 A $16.1M \n",
198
+ "... ... ... ... ... ... \n",
199
+ "4981 2020.0 B2B, B2B2C 1-10 Pre-Funding Undisclosed \n",
200
+ "4982 2012.0 B2C 11-50 Pre-Funding Undisclosed \n",
201
+ "4983 1982.0 B2B 51-200 Mature Undisclosed \n",
202
+ "4984 2017.0 B2B, B2C, B2G 11-50 Pre-Funding Undisclosed \n",
203
+ "4985 2013.0 B2B 11-50 Seed $10M \n",
204
+ "\n",
205
+ " tags \\\n",
206
+ "0 [connected-vehicles, adas, autonomous-vehicles... \n",
207
+ "1 [sdg, schools, pre-k, serious-games, games, mo... \n",
208
+ "2 [pharmaceuticals, chronic-disease, immunology,... \n",
209
+ "3 [omni-channel, ecommerce, climate-tech, artifi... \n",
210
+ "4 [enterprise-solutions, data-protection, cyber-... \n",
211
+ "... ... \n",
212
+ "4981 [content-creators, e-learning, software-applic... \n",
213
+ "4982 [ecommerce, p2p, delivery, online-shopping, ma... \n",
214
+ "4983 [crops, agtech, harvesting, machinery, sdg, cl... \n",
215
+ "4984 [fitness, digital-wallet, discount, mobile-app... \n",
216
+ "4985 [endoscopy, medical-devices, minimally-invasiv... \n",
217
+ "\n",
218
+ " text_vector_ \n",
219
+ "0 [-0.031224824488162994, -0.06342269480228424, ... \n",
220
+ "1 [-0.038649097084999084, 0.028091922402381897, ... \n",
221
+ "2 [0.04561534896492958, -0.017776092514395714, 0... \n",
222
+ "3 [0.0024080690927803516, -0.03042100928723812, ... \n",
223
+ "4 [-0.01007091999053955, 0.10431888699531555, -0... \n",
224
+ "... ... \n",
225
+ "4981 [0.026961881667375565, 0.002459645736962557, -... \n",
226
+ "4982 [0.0036857957020401955, 0.03582162782549858, -... \n",
227
+ "4983 [0.027293115854263306, 0.010461761616170406, 0... \n",
228
+ "4984 [0.02851911261677742, 0.05474231392145157, -0.... \n",
229
+ "4985 [0.012587728910148144, -0.07959864288568497, -... \n",
230
+ "\n",
231
+ "[4986 rows x 9 columns]"
232
+ ]
233
+ },
234
+ "execution_count": 78,
235
+ "metadata": {},
236
+ "output_type": "execute_result"
237
+ }
238
+ ],
239
+ "source": [
240
+ "import pandas as pd\n",
241
+ "\n",
242
+ "df = pd.read_parquet('df_encoded.parquet')\n",
243
+ "df"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 89,
249
+ "metadata": {},
250
+ "outputs": [],
251
+ "source": [
252
+ "import os\n",
253
+ "os.system('pip install openpyxl')\n",
254
+ "os.system('pip install sentence-transformers')\n",
255
+ "import pandas as pd\n",
256
+ "import gradio as gr\n",
257
+ "from sentence_transformers import SentenceTransformer\n",
258
+ "\n",
259
+ "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
260
+ "\n",
261
+ "df = pd.read_parquet('df_encoded.parquet')\n",
262
+ "df['tags'] = df['tags'].apply(lambda x : str(x))\n",
263
+ "def parse_raised(x):\n",
264
+ " if x == 'Undisclosed':\n",
265
+ " return 0\n",
266
+ " else: \n",
267
+ " quantifier = x[-1]\n",
268
+ " x = float(x[1:-1])\n",
269
+ " if quantifier == 'K':\n",
270
+ " return x/1000\n",
271
+ " elif quantifier == 'M':\n",
272
+ " return x\n",
273
+ "df['raised'] = df['raised'].apply(lambda x : parse_raised(x))\n",
274
+ "df = df.reset_index(drop=True)\n",
275
+ "\n",
276
+ "from sklearn.neighbors import NearestNeighbors\n",
277
+ "import numpy as np\n",
278
+ "import pandas as pd\n",
279
+ "from sentence_transformers import SentenceTransformer"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": 92,
285
+ "metadata": {},
286
+ "outputs": [
287
+ {
288
+ "name": "stderr",
289
+ "output_type": "stream",
290
+ "text": [
291
+ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n",
292
+ " warnings.warn(\n",
293
+ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Slider, please remove them: {'step_size': 1}\n",
294
+ " warnings.warn(\n"
295
+ ]
296
+ },
297
+ {
298
+ "name": "stdout",
299
+ "output_type": "stream",
300
+ "text": [
301
+ "Running on local URL: http://127.0.0.1:7896\n",
302
+ "\n",
303
+ "To create a public link, set `share=True` in `launch()`.\n"
304
+ ]
305
+ },
306
+ {
307
+ "data": {
308
+ "text/html": [
309
+ "<div><iframe src=\"http://127.0.0.1:7896/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
310
+ ],
311
+ "text/plain": [
312
+ "<IPython.core.display.HTML object>"
313
+ ]
314
+ },
315
+ "metadata": {},
316
+ "output_type": "display_data"
317
+ },
318
+ {
319
+ "data": {
320
+ "text/plain": []
321
+ },
322
+ "execution_count": 92,
323
+ "metadata": {},
324
+ "output_type": "execute_result"
325
+ },
326
+ {
327
+ "name": "stdout",
328
+ "output_type": "stream",
329
+ "text": [
330
+ "a\n"
331
+ ]
332
+ }
333
+ ],
334
+ "source": [
335
+ "def filter_df(df, column_name, filter_type, filter_value):\n",
336
+ " if filter_type == '==':\n",
337
+ " df_filtered = df[df[column_name]==filter_value]\n",
338
+ " elif filter_type == '>=':\n",
339
+ " df_filtered = df[df[column_name]>=filter_value]\n",
340
+ " elif filter_type == '<=':\n",
341
+ " df_filtered = df[df[column_name]<=filter_value]\n",
342
+ " elif filter_type == 'contains':\n",
343
+ " df_filtered = df[df['target'].str.contains(filter_value)]\n",
344
+ " return df_filtered\n",
345
+ "\n",
346
+ "def search(df, query):\n",
347
+ " product = model.encode(query).tolist()\n",
348
+ " # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
349
+ "\n",
350
+ " #prepare model\n",
351
+ " nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
352
+ "\n",
353
+ " distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
354
+ "\n",
355
+ " #print out the description of every recommended product\n",
356
+ " return df.iloc[list(indices)[0]][['name', 'description', 'raised', 'year', 'target', 'size', 'stage', 'tags']]\n",
357
+ "\n",
358
+ "#the first module becomes text1, the second module file1\n",
359
+ "def greet(size, target, raised, query): \n",
360
+ " df_size = filter_df(df, 'size', '==', size)\n",
361
+ " df_target = filter_df(df_size, 'target', 'contains', target)\n",
362
+ " def raised_zero(x):\n",
363
+ " if x == 0:\n",
364
+ " return 'Undisclosed'\n",
365
+ " else:\n",
366
+ " return x\n",
367
+ " print('a')\n",
368
+ " df_raised = df_target[(df_target['raised'] >= raised) | (df_target['raised'] == 0)]\n",
369
+ " df_knn = search(df_raised, query)\n",
370
+ " #we live the sorting for last\n",
371
+ " df_knn = df_knn.sort_values('raised', ascending=False)\n",
372
+ " df_knn['raised'] = df_knn['raised'].apply(lambda x : raised_zero(x))\n",
373
+ "\n",
374
+ " return df_knn\n",
375
+ "\n",
376
+ "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
377
+ " gr.Markdown(\n",
378
+ " \"\"\"\n",
379
+ " # Gradio with History\n",
380
+ " \"\"\"\n",
381
+ " )\n",
382
+ " size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+'], multiselect=False, value='11-50', label='size')\n",
383
+ " target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', multiselect=False, label='target')\n",
384
+ " raised = gr.Slider(0, 20, value=5, step_size=1, label=\"Minimum raising (in Millions)\")\n",
385
+ " query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')\n",
386
+ " btn = gr.Button(value=\"Search for a Startup\")\n",
387
+ " output1 = gr.DataFrame(label='value')\n",
388
+ " # btn.click(greet, inputs='text', outputs=['dataframe'])\n",
389
+ " btn.click(greet, [size, target, raised, query], [output1])\n",
390
+ "demo.launch(share=False)"
391
+ ]
392
+ }
393
+ ],
394
+ "metadata": {
395
+ "kernelspec": {
396
+ "display_name": "Python 3",
397
+ "language": "python",
398
+ "name": "python3"
399
+ },
400
+ "language_info": {
401
+ "codemirror_mode": {
402
+ "name": "ipython",
403
+ "version": 3
404
+ },
405
+ "file_extension": ".py",
406
+ "mimetype": "text/x-python",
407
+ "name": "python",
408
+ "nbconvert_exporter": "python",
409
+ "pygments_lexer": "ipython3",
410
+ "version": "3.9.13"
411
+ },
412
+ "orig_nbformat": 4
413
+ },
414
+ "nbformat": 4,
415
+ "nbformat_minor": 2
416
+ }
df_encoded.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f3ea0cc9455a2bb2a2ee793b9588e85c5df7ad4e9e04e363d45747092e93422
3
- size 24419137
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7f7bb66d15c188968839a3464fb2e5352cd0f4cd6bee7306ed5d9b3ecff4fbe
3
+ size 24191629