Spaces:

tomascufaro
/

keyword_classification

Sleeping

App Files Files Community

tomascufaro committed on Jan 17, 2024

Commit

ad1fbe3

verified ·

1 Parent(s): 2f15652

Create app.py

Browse files

Files changed (1) hide show

app.py +104 -0

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+#Parent directory
+import sys
+import os
+import time
+import pandas as pd    # for data manipulation (pip install pandas)
+import matplotlib.pyplot as plt
+from random import randint
+from urllib.parse import urlparse
+import numpy as np
+from langchain.chat_models import ChatOpenAI
+from langchain.chains import create_extraction_chain
+from langchain.llms import OpenAI
+from langchain.chat_models import ChatOpenAI
+from typing import Optional
+from langchain.chains.openai_functions import (
+    create_structured_output_chain, create_tagging_chain_pydantic
+)
+from langchain.prompts import ChatPromptTemplate
+import gradio as gr
+from collections import defaultdict
+# Schema
+schema = {
+    "properties": {
+        "keyword": {"type": "string"},
+        "category": {"type": "string"},
+    },
+    "required": ["keyword", "category"],
+}
+# Input
+prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", "You are an expert marketing researcher specialized in the finance industry"),
+        ("human", """{prompt_input}.
+         Here you have the categories splitted by coma: {categories}.
+         and Here you have the keywords splitted by coma: {keywords}."""),
+        ("human", "Tip: Make sure to answer in the correct format and DO NOT leave keywords without category and DO NOT skip keywords. Please categorize all the keywords that I give you, each keyword must have just one and only one category."),
+    ]
+)
+llm = ChatOpenAI(temperature=0, openai_api_key=os.environ['OpenAI_APIKEY'], model="gpt-3.5-turbo")
+chain = create_extraction_chain(schema, llm, prompt, verbose=1)
+def run_chain(input_prompt, keywords_file, categories_file, batch_size=50):
+    results = []
+    batch_size = batch_size
+    index = 0
+    try:
+        keywords = pd.read_csv(keywords_file.name)
+    except:
+        keywords = pd.read_excel(keywords_file.name)
+    try:
+        categories = pd.read_csv(categories_file.name)
+    except:
+        categories = pd.read_excel(categories_file.name)
+    keywords = list(keywords[keywords.columns[0]].values)
+    categories = list(categories[categories.columns[0]].values)
+    while index < len(keywords):
+        try:
+            batch = keywords[index:index+batch_size]
+        except:
+            batch = keywords[index:]
+        try:
+            result = chain.run({'prompt_input':input_prompt, 'categories':','.join(categories), 'keywords':','.join(batch)})
+        except Exception as E:
+            print('this batch did not worked from {} to {}'.format(index, index + batch_size))
+            print(E)
+            result = []
+        results += result
+        index += batch_size
+        results_to_csv(results)
+        #print((index, batch_size, len(keywords)))
+    return results, 'themes_results.csv'
+def results_to_csv(results):
+    super_dict = defaultdict(list)
+    for d in results:
+        for k, v in d.items():  # d.items() in Python 3+
+            super_dict[k].append(v)
+    pd.DataFrame(super_dict).to_csv('themes_results.csv', index=False)
+with gr.Blocks() as demo:
+    prompt_input = gr.Text("""I need your help to analyze and categorize the provided list of keywords
+into the appropriate categories.
+The goal is to understand information demand on search engines within this industry. Each keyword represents a search and it should have a relation with the category.
+Extract each keyword and assign the best category among the given categories. Return every keyword with the relative category in pairs.""")
+    gr.Markdown("Upload CSV or xlsx with keywords: Just a csv  with all the keywords in one column. Should have a header")
+    keywords_file = gr.File(file_types=['csv', 'xlsx'], label='keywords')
+    gr.Markdown("Upload CSV or xlsx with categories: Just a csv with all the keywords in one column. Should have a header")
+    categories_file = gr.File(file_types=['.csv', '.xlsx'], label='categories')
+    with gr.Accordion("Open for More!"):
+        gr.Markdown("Look at me...")
+    btn = gr.Button(value="run")
+    txt_3 = gr.Textbox(value="", label="Output")
+    output_file = gr.File(label="Output File",
+                file_count="single",
+                file_types=["", ".", ".csv",".xls",".xlsx"])
+    btn.click(run_chain, inputs=[prompt_input, keywords_file, categories_file], outputs=[txt_3, output_file])
+demo.launch()