# Michelangiolo's picture
# hid keys
# 493eace
import os
import subprocess
import sys

# Install the packages that are imported further down before those imports run.
# Using "<this interpreter> -m pip" with an argument list (instead of a shell
# string handed to os.system) guarantees the packages land in the environment
# that is executing this script and avoids going through a shell.
# check=False keeps the original best-effort behaviour: a failed install does
# not abort here; the subsequent import will fail if the package is missing.
for _package in ('openpyxl', 'sentence-transformers'):
    subprocess.run([sys.executable, '-m', 'pip', 'install', _package], check=False)
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer
# The OpenAI key is read from the environment; a missing variable raises
# KeyError as soon as the module is loaded.
gpt3_api_key = os.environ['GPT3_API_KEY_CIVILIENCE']
# gpt3_api_key = os.environ['GPT3_API_KEY_ROBERT']

# Sentence encoder used to embed free-text queries.
# Alternative checkpoint that was tried: all-MiniLM-L6-v2
model = SentenceTransformer('all-mpnet-base-v2')

# Startup table; the code below reads its 'text_vector_' column as the
# embedding of each row.
df = pd.read_parquet('df_encoded3.parquet')

# Store tags as plain strings, whatever type the parquet file held.
df['tags'] = df['tags'].apply(str)
def parse_raised(x):
    """Convert a funding label into a number expressed in millions.

    The label is expected to look like ``'$500K'``, ``'$1.5M'`` or ``'$2B'``:
    one leading currency character, a number, and a one-letter magnitude
    suffix. The literal ``'Undisclosed'`` maps to ``0``.

    Args:
        x: The label string.

    Returns:
        ``0`` for ``'Undisclosed'``; otherwise the amount in millions as a
        float. ``None`` is returned when the suffix is not one of K, M or B.

    Raises:
        ValueError: If the text between the first and last character is not
            a valid float.
    """
    if x == 'Undisclosed':
        return 0

    suffix = x[-1]
    # Drop the leading currency symbol and the trailing magnitude letter.
    amount = float(x[1:-1])

    if suffix == 'K':
        return amount / 1000
    if suffix == 'M':
        return amount
    if suffix == 'B':
        # Billions were previously unhandled and silently became None.
        return amount * 1000
    # Unknown magnitude: keep the historical "no value" result, but explicitly.
    return None
# Normalise the columns that the search filters compare against:
# 'raised' becomes a number in millions and 'stage' is lower-cased.
df['raised'] = df['raised'].map(parse_raised)
df['stage'] = df['stage'].map(lambda stage_name: stage_name.lower())

# Make row labels 0..n-1 so they coincide with row positions.
df.reset_index(drop=True, inplace=True)
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from sentence_transformers import SentenceTransformer
# Nearest-neighbour index over the stored embeddings. kneighbors() refuses a
# request for more neighbours than there are fitted rows, so the candidate
# pool is capped at the dataset size instead of a fixed 5000.
nbrs = NearestNeighbors(
    n_neighbors=min(5000, len(df)),
    algorithm='ball_tree',
).fit(df['text_vector_'].values.tolist())
def search(df, query):
    """Return the rows of ``df`` that are nearest to ``query`` in embedding space.

    The query is encoded with the module-level ``model`` and looked up in the
    module-level ``nbrs`` index. The index returns row *positions* in the data
    it was fitted on, so ``df`` must be that same frame for the positions to
    line up.

    Args:
        df: Frame to take the result rows from.
        query: Free-text description to search for.

    Returns:
        A DataFrame with the display columns plus ``text_vector_``, in the
        order produced by ``nbrs.kneighbors``.
    """
    result_columns = [
        'name', 'raised', 'target', 'size', 'stage', 'country',
        'source', 'description', 'tags', 'text_vector_',
    ]

    # For debugging, a stored vector such as df.iloc[0]['text_vector_'] can be
    # used here instead of encoding a query.
    query_vector = model.encode(query).tolist()

    # kneighbors works on a batch of vectors; we send a batch of one and read
    # back the first (only) row of positions. Distances are not needed.
    _, neighbour_positions = nbrs.kneighbors([query_vector])
    nearest_rows = df.iloc[list(neighbour_positions)[0]]
    return nearest_rows[result_columns]
def filter_df(df, column_name, filter_type, filter_value, minimum_acceptable_size=0):
    """Filter ``df`` on one column, falling back to ``df`` if too little is left.

    Args:
        df: Frame to filter.
        column_name: Column the comparison is applied to.
        filter_type: One of ``'=='``, ``'>='``, ``'<='`` or ``'contains'``.
            ``'contains'`` uses ``Series.str.contains``, so ``filter_value`` is
            interpreted as a regular expression.
        filter_value: Value (or pattern) to compare against.
        minimum_acceptable_size: Threshold compared with ``DataFrame.size`` of
            the filtered frame, i.e. its number of cells (rows x columns).

    Returns:
        The filtered frame when its ``size`` reaches the threshold, otherwise
        the unfiltered ``df``.

    Raises:
        ValueError: If ``filter_type`` is not one of the supported operators.
    """
    column = df[column_name]

    if filter_type == '==':
        mask = column == filter_value
    elif filter_type == '>=':
        mask = column >= filter_value
    elif filter_type == '<=':
        mask = column <= filter_value
    elif filter_type == 'contains':
        # Use the requested column (this branch used to hard-code 'target').
        # na=False treats missing cells as "no match" so the mask stays
        # boolean and can be used for indexing.
        mask = column.str.contains(filter_value, na=False)
    else:
        raise ValueError(f'unsupported filter_type: {filter_type!r}')

    df_filtered = df[mask]
    if df_filtered.size >= minimum_acceptable_size:
        return df_filtered
    return df
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
def score_filter(df, query, min_score):
    """Keep the rows whose embedding is similar enough to ``query``.

    The query is encoded with the module-level ``model`` and compared, by
    cosine similarity, with every vector in the ``text_vector_`` column.

    Args:
        df: Frame with a ``text_vector_`` column holding one vector per row.
        query: Free-text query to score the rows against.
        min_score: Rows with ``similarity >= min_score`` are kept.

    Returns:
        A new frame with an added ``similarity`` column, restricted to the rows
        that reach ``min_score``. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    scored = df.copy()

    if scored.empty:
        # Nothing to score; stacking zero vectors would give a 1-D array that
        # cannot be compared row by row.
        scored['similarity'] = np.zeros(0)
        return scored

    sample_vectors = np.array(scored['text_vector_'].tolist())
    query_vector = model.encode([query])[0]

    # One call scores every row: the result has shape (n_rows, 1).
    scored['similarity'] = cosine_similarity(sample_vectors, [query_vector])[:, 0]
    return scored[scored['similarity'] >= min_score]
import requests
def gpt3_question(api_key, prompt):
    """Send ``prompt`` to the OpenAI completions endpoint and return the text.

    Args:
        api_key: Bearer token placed in the ``Authorization`` header.
        prompt: Prompt text to complete.

    Returns:
        The ``text`` field of the first entry in the response's ``choices``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # NOTE(review): confirm this engine-specific endpoint is still served.
    api_endpoint = "https://api.openai.com/v1/engines/text-davinci-003/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "prompt": prompt,
        "max_tokens": 500,
        "temperature": 0.7
    }
    print('sending request')
    # Without a timeout requests waits indefinitely, which would hang the UI
    # callback if the API stalls.
    response = requests.post(api_endpoint, headers=headers, json=data, timeout=120)
    # Log the raw body first so that an error payload is visible in the logs.
    print(response.text)
    # Fail with a clear HTTP error instead of a KeyError on a missing "choices".
    response.raise_for_status()
    generated_text = response.json()["choices"][0]["text"]
    return generated_text
def competitor_analysis_foo(startup_array, max_paragraphs):
    """Ask the completion model for a competitor analysis and tidy the answer.

    Args:
        startup_array: List of ``[name, stage, description]`` entries; it is
            embedded in the prompt via ``str()``.
        max_paragraphs: Number of paragraphs requested in the prompt.

    Returns:
        The generated text with ``Paragraph N`` / ``Paragraph N:`` labels
        removed, blank lines collapsed and surrounding whitespace stripped.
    """
    import re  # local import keeps this block self-contained

    prompt = (
        f"\n{str(startup_array)}\n"
        "This is a list of startups in the following format: [name, stage, description]:\n"
        f"Write a {max_paragraphs} paragraph competitors analysis based on this data. Do not name the paragraphs.\n"
    )
    response = gpt3_question(gpt3_api_key, prompt)

    # The model sometimes labels paragraphs despite the instruction; remove
    # the labels whatever their number, with or without a trailing colon.
    response = re.sub(r'Paragraph \d+:?', '', response)
    # Collapse every run of consecutive newlines to one, so no blank lines
    # survive regardless of how many the model emitted.
    response = re.sub(r'\n{2,}', '\n', response).strip()
    return response
#the first module becomes text1, the second module file1
def vector_search(size, target, stage, query, var_metadata, var_fresh):  # e.g. ('11-500+', 'B2B', 'pre-seed', 'age-reversing')
    """Run a semantic search and narrow it by size, stage and target.

    Each filter is applied with a minimum size of 1, so a filter that would
    remove every row is skipped and the previous result is kept.

    Args:
        size: Company-size label to match exactly.
        target: Pattern that the ``target`` column must contain.
        stage: Funding stage to match (case-insensitive); ``'ALL'`` disables
            the stage filter.
        query: Free-text description passed to ``search``.
        var_metadata: Unused; accepted because the UI passes it as an input.
        var_fresh: Unused; accepted because the UI passes it as an input.

    Returns:
        A 3-tuple: the first 100 rows without ``text_vector_`` (for display),
        the same rows including ``text_vector_``, and ``True``.
    """
    # search() hands back a selection of another frame; copy it before adding
    # to it so the column write below targets an independent object.
    df_knn = search(df, query).copy()

    # Show amounts stored as 0 under their original 'Undisclosed' label.
    df_knn['raised'] = df_knn['raised'].apply(
        lambda amount: 'Undisclosed' if amount == 0 else amount
    )

    df_size = filter_df(df_knn, 'size', '==', size, 1)
    if stage != 'ALL':
        df_stage = filter_df(df_size, 'stage', '==', stage.lower(), 1)
    else:
        # 'ALL' bypasses the stage filter.
        df_stage = df_size
    df_target = filter_df(df_stage, 'target', 'contains', target, 1)

    top_hits = df_target[0:100]
    return top_hits.drop('text_vector_', axis=1), top_hits, True
def write_competitor_analysis(var_metadata, query, var_fresh):
    """Write a competitor analysis for the most recent search results.

    Args:
        var_metadata: Result frame of the last search, including the
            ``text_vector_`` column that ``score_filter`` reads.
        query: The query text the results are scored against.
        var_fresh: Truthy when a search has run since the last analysis.

    Returns:
        A 2-tuple ``(text, False)``. ``text`` is the analysis, or a notice when
        no search is pending or no result scores at least 0.35. The second
        element resets the "fresh" flag.
    """
    if not var_fresh:
        return 'Perform a new Startup Search first', False  # reset fresh state

    df_final = score_filter(var_metadata, query, 0.35)
    startups = df_final[['name', 'stage', 'description']][0:10].values.tolist()
    n_startups = len(startups)

    if n_startups == 0:
        response = 'score too low to output valid results'
    else:
        # 1-3 results -> 1 paragraph, 4-5 -> 2, 6 or more -> 3. The previous
        # bounds ("<= 5" and "> 6") left exactly 6 results without a branch.
        if n_startups <= 3:
            max_paragraphs = 1
        elif n_startups <= 5:
            max_paragraphs = 2
        else:
            max_paragraphs = 3
        response = competitor_analysis_foo(
            startup_array=startups, max_paragraphs=max_paragraphs
        )
    return response, False  # reset fresh state
# User interface: filter controls, a query box, two buttons and two outputs.
# Components appear in the order they are created here.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber'),
) as demo:
    gr.Markdown("\n# Startup Search Engine\n")

    # State holders handed to and from the two callbacks below.
    # var_query is created but not wired to any event.
    var_fresh = gr.Variable(value=False)
    var_metadata = gr.Variable(value=0)
    var_query = gr.Variable(value=0)

    size = gr.Radio(
        ['1-10', '11-50', '51-200', '201-500', '500+', '11-500+'],
        multiselect=False,
        value='11-500+',
        label='size',
    )
    target = gr.Radio(
        ['B2B', 'B2C', 'B2G', 'B2B2C'],
        multiselect=False,
        value='B2B',
        label='target',
    )
    stage = gr.Radio(
        ['pre-seed', 'A', 'B', 'C', 'ALL'],
        multiselect=False,
        value='ALL',
        label='stage',
    )
    # raised = gr.Slider(0, 20, value=5, step_size=1, label="Minimum raising (in Millions)")
    query = gr.Textbox(
        label='Describe the Startup you are searching for',
        value='age reversing',
    )
    # competitor_analysis = gr.Radio(['write', 'do not write'], multiselect=False, value='do not write', label='write a competitor analysis')

    btn2 = gr.Button(value="Search for a Startup")
    btn1 = gr.Button(value="Write a competitor analysis")
    output1 = gr.Textbox(label='competitor analysis')
    output2 = gr.DataFrame(label='value')

    # Competitor analysis: reads the stored results and resets the fresh flag.
    btn1.click(
        write_competitor_analysis,
        [var_metadata, query, var_fresh],
        [output1, var_fresh],
    )
    # Startup search: fills the table, stores the results and sets the fresh flag.
    btn2.click(
        vector_search,
        [size, target, stage, query, var_metadata, var_fresh],
        [output2, var_metadata, var_fresh],
    )

demo.launch(share=False)