import os

# Install runtime dependencies at startup (a common pattern on Hugging Face Spaces).
os.system('pip install openpyxl')
os.system('pip install sentence-transformers')

import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer

# OpenAI API key used for the competitor-analysis write-up.
gpt3_api_key = os.environ['GPT3_API_KEY_CIVILIENCE']
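# Note: os.environ[...] raises KeyError at startup if the variable is missing; os.environ.get(...) would fail later instead.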

# Embedding model used both to index the dataset and to encode user queries.
model = SentenceTransformer('all-mpnet-base-v2')

# Pre-encoded startup dataset; the 'text_vector_' column holds the sentence embeddings.
df = pd.read_parquet('df_encoded3.parquet')
df['tags'] = df['tags'].astype(str)
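# Other columns used below: name, raised, target, size, stage, country, source, description.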


def parse_raised(x):
    """Convert a raised-amount string such as '$500K' or '$2.5M' into a float in millions.

    'Undisclosed' and unrecognised suffixes map to 0.
    """
    if x == 'Undisclosed':
        return 0
    quantifier = x[-1]
    x = float(x[1:-1])
    if quantifier == 'K':
        return x / 1000
    elif quantifier == 'M':
        return x
    return 0  # fall back instead of implicitly returning None on unexpected formats


df['raised'] = df['raised'].apply(parse_raised)
df['stage'] = df['stage'].str.lower()
df = df.reset_index(drop=True)

from sklearn.neighbors import NearestNeighbors

# Exact nearest-neighbour index over the precomputed embeddings.
# n_neighbors=5000 retrieves a wide candidate pool that the filters below narrow down.
nbrs = NearestNeighbors(n_neighbors=5000, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())
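# Candidates from this index are later re-scored with cosine similarity in score_filter before the GPT-3 write-up.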


def search(df, query):
    """Return the rows nearest to a free-text query, ordered by embedding distance."""
    product = model.encode(query).tolist()
    distances, indices = nbrs.kneighbors([product])
    return df.iloc[indices[0]][['name', 'raised', 'target', 'size', 'stage', 'country', 'source', 'description', 'tags', 'text_vector_']]
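# Example: search(df, 'age reversing') returns the 5,000 nearest startups (assuming the dataset has at least that many rows).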


def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):
    """Filter one column; fall back to the unfiltered frame when the result is too small."""
    if filter_type == '==':
        df_filtered = df[df[column_name] == filter_value]
    elif filter_type == '>=':
        df_filtered = df[df[column_name] >= filter_value]
    elif filter_type == '<=':
        df_filtered = df[df[column_name] <= filter_value]
    elif filter_type == 'contains':
        # filter on the requested column rather than the hard-coded 'target'
        df_filtered = df[df[column_name].str.contains(filter_value)]
    else:
        df_filtered = df

    if df_filtered.size >= minimum_acceptable_size:
        return df_filtered
    else:
        return df
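# Example: filter_df(df, 'stage', '==', 'seed', 1) keeps seed-stage rows, or returns df unchanged if none match.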


import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def score_filter(df, query, min_score):
    """Keep only rows whose embedding has cosine similarity >= min_score with the query."""

    def cosine_sim(vector, query_vector):
        return cosine_similarity([vector], [query_vector])[0][0]

    vector_col = np.array(df['text_vector_'].tolist())
    query_vector = model.encode([query])[0]

    df = df.copy()  # work on a copy to avoid pandas' SettingWithCopyWarning on sliced frames
    df['similarity'] = np.apply_along_axis(cosine_sim, 1, vector_col, query_vector)
    df = df[df['similarity'] >= min_score]
    return df
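# Example: score_filter(results, 'age reversing', 0.35) drops candidates scoring below 0.35 against the query.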


import requests


def gpt3_question(api_key, prompt):
    """Send a prompt to the OpenAI completions endpoint and return the generated text."""
    # The per-engine URL (/v1/engines/<engine>/completions) was retired by OpenAI;
    # the model is now named in the request body instead.
    api_endpoint = "https://api.openai.com/v1/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "text-davinci-003",
        "prompt": prompt,
        "max_tokens": 500,
        "temperature": 0.7
    }
    print('sending request')
    response = requests.post(api_endpoint, headers=headers, json=data)
    print(response.text)
    generated_text = response.json()["choices"][0]["text"]
    return generated_text
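# Note: this relies on the legacy text-completions API; moving to chat completions would mean posting a
# "messages" list to /v1/chat/completions and reading choices[0]["message"]["content"] instead.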


def competitor_analysis_foo(startup_array, max_paragraphs):
    """Ask GPT-3 for a short competitor analysis of the retrieved startups."""
    prompt = f"""
{str(startup_array)}
This is a list of startups in the following format: [name, stage, description]:

Write a {max_paragraphs} paragraph competitor analysis based on this data. Do not name the paragraphs.
"""
    response = gpt3_question(gpt3_api_key, prompt)

    # Strip any "Paragraph N" headers the model adds despite the instruction, then collapse blank lines.
    for x in range(10):
        response = response.replace(f'Paragraph {x}:', '')
        response = response.replace(f'Paragraph {x}', '')
    response = response.replace('\n\n', '\n').strip()

    return response
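# Example: competitor_analysis_foo([['Acme Bio', 'seed', 'cellular reprogramming platform']], 1) returns a one-paragraph analysis (illustrative data).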


def vector_search(size, target, stage, query, var_metadata, var_fresh):
    """Semantic search plus metadata filters; feeds the results table and caches results for the analysis step."""
    def raised_zero(x):
        # map the 0 placeholder back to 'Undisclosed' for display
        if x == 0:
            return 'Undisclosed'
        else:
            return x

    df_knn = search(df, query)
    df_knn['raised'] = df_knn['raised'].apply(raised_zero)

    df_size = filter_df(df_knn, 'size', '==', size, 1)

    if stage != 'ALL':
        df_stage = filter_df(df_size, 'stage', '==', stage.lower(), 1)
    else:
        df_stage = df_size

    df_target = filter_df(df_stage, 'target', 'contains', target, 1)

    return df_target.drop('text_vector_', axis=1)[0:100], df_target[0:100], True
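# The three return values feed btn2's outputs: the visible results table, the cached metadata state
# (embeddings included), and the 'fresh results' flag consumed by write_competitor_analysis.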


def write_competitor_analysis(var_metadata, query, var_fresh):
    """Generate the GPT-3 competitor analysis from the cached search results."""
    if var_fresh:
        df_final = score_filter(var_metadata, query, 0.35)
        df_final = df_final[['name', 'stage', 'description']][0:10].values.tolist()

        # scale the requested length with the number of sufficiently similar candidates
        if len(df_final) == 0:
            response = 'score too low to output valid results'
        elif len(df_final) <= 3:
            response = competitor_analysis_foo(startup_array=df_final, max_paragraphs=1)
        elif len(df_final) <= 5:
            response = competitor_analysis_foo(startup_array=df_final, max_paragraphs=2)
        else:
            response = competitor_analysis_foo(startup_array=df_final, max_paragraphs=3)

        return response, False

    else:
        return 'Perform a new Startup Search first', False


with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
    gr.Markdown(
        """
        # Startup Search Engine
        """
    )
    # Session state: whether the last search is still fresh, the cached result set, and the last query.
    var_fresh = gr.State(value=False)
    var_metadata = gr.State(value=0)
    var_query = gr.State(value=0)

    size = gr.Radio(['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'], value='11-500+', label='size')
    target = gr.Radio(['B2B', 'B2C', 'B2G', 'B2B2C'], value='B2B', label='target')
    stage = gr.Radio(['pre-seed', 'A', 'B', 'C', 'ALL'], value='ALL', label='stage')

    query = gr.Textbox(label='Describe the Startup you are searching for', value='age reversing')

    btn2 = gr.Button(value="Search for a Startup")
    btn1 = gr.Button(value="Write a competitor analysis")

    output1 = gr.Textbox(label='competitor analysis')
    output2 = gr.DataFrame(label='search results')

    btn1.click(write_competitor_analysis, [var_metadata, query, var_fresh], [output1, var_fresh])
    btn2.click(vector_search, [size, target, stage, query, var_metadata, var_fresh], [output2, var_metadata, var_fresh])

demo.launch(share=False)