hf-dataset-chat-to-sql

Sleeping

App Files Files Community

augray committed on Sep 13

Commit

ca78baa

•

1 Parent(s): e30a182

Correct table name

Browse files

Files changed (2) hide show

.gitignore +2 -1
app.py +104 -23

.gitignore CHANGED Viewed

	@@ -1 +1,2 @@
1	- .env


1	+ .env
2	+ .venv

app.py CHANGED Viewed

@@ -1,18 +1,17 @@
 import json
 import os
 import urllib.parse
 import gradio as gr
 import requests
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
-from huggingface_hub import InferenceClient
-example = HuggingfaceHubSearch().example_value()
-client = InferenceClient(
-    "meta-llama/Meta-Llama-3.1-70B-Instruct",
-    token=os.environ["HF_TOKEN"],
-)
 def get_iframe(hub_repo_id, sql_query=None):
@@ -34,20 +33,53 @@ def get_iframe(hub_repo_id, sql_query=None):
     return iframe
-def get_column_info(hub_repo_id):
     url: str = f"https://datasets-server.huggingface.co/info?dataset={hub_repo_id}"
     response = requests.get(url)
     try:
         data = response.json()
         data = data.get("dataset_info")
-        key = list(data.keys())[0]
-        features: str = json.dumps(data.get(key).get("features"))
     except Exception as e:
         gr.Error(f"Error getting column info: {e}")
-    return features
-def query_dataset(hub_repo_id, features, query):
     messages = [
         {
             "role": "system",
@@ -55,22 +87,71 @@ def query_dataset(hub_repo_id, features, query):
         },
         {
             "role": "user",
-            "content": f"""table train
 # Features
 {features}
 # Query
-{query}
 """,
         },
     ]
-    response = client.chat_completion(
-        messages=messages,
-        max_tokens=1000,
-        stream=False,
     )
-    query = response.choices[0].message.content
-    return query, get_iframe(hub_repo_id, query)
 with gr.Blocks() as demo:
@@ -107,21 +188,21 @@ with gr.Blocks() as demo:
     with gr.Row():
         search_out = gr.HTML(label="Search Results")
     with gr.Row():
-        features = gr.Code(label="Features", language="json", visible=False)
     gr.on(
         [btn.click, search_in.submit],
         fn=get_iframe,
         inputs=[search_in],
         outputs=[search_out],
     ).then(
-        fn=get_column_info,
         inputs=[search_in],
-        outputs=[features],
     )
     gr.on(
         [btn2.click, query.submit],
         fn=query_dataset,
-        inputs=[search_in, features, query],
         outputs=[sql_out, search_out],
     )

 import json
+import logging
 import os
 import urllib.parse
+from typing import Any
 import gradio as gr
 import requests
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from huggingface_hub.repocard import CardData, RepoCard
+logger = logging.getLogger(__name__)
+example = HuggingfaceHubSearch().example_value()
 def get_iframe(hub_repo_id, sql_query=None):
     return iframe
def get_table_info(hub_repo_id):
    """Fetch dataset metadata from the datasets-server and return it as JSON text.

    Requests the ``/info`` endpoint for ``hub_repo_id`` and serialises the
    ``dataset_info`` entry of the response with ``json.dumps``. If the
    response has no ``dataset_info`` key the result is the string ``"null"``.

    Args:
        hub_repo_id: Dataset repository id as entered in the search box.

    Returns:
        A JSON string holding the ``dataset_info`` mapping.

    Raises:
        gr.Error: If the request fails or the response cannot be parsed.
    """
    url: str = "https://datasets-server.huggingface.co/info"
    try:
        # Pass the repo id via ``params`` so requests URL-encodes it, and set a
        # timeout so an unresponsive server cannot hang the UI callback.
        response = requests.get(url, params={"dataset": hub_repo_id}, timeout=30)
        data = response.json()
        data = data.get("dataset_info")
    except Exception as e:
        # gr.Error only reaches the user when it is raised; merely
        # constructing it (as before) silently discarded the failure.
        raise gr.Error(f"Error getting column info: {e}") from e
    return json.dumps(data)
+def get_table_name(config: str | None, split: str | None, config_choices: list[str], split_choices: list[str]):
+    if len(config_choices) > 0 and config is None:
+        config = config_choices[0]
+    if len(split_choices) > 0 and split is None:
+        split = split_choices[0]
+    if len(config_choices) > 1 and len(split_choices) > 1:
+        base_name = f"{config}_{split}"
+    elif len(config_choices) >= 1 and len(split_choices) <= 1:
+        base_name = config
+    else:
+        base_name = split
+    def replace_char(c):
+        if c.isalnum():
+            return c
+        if c in ["-", "_", "/"]:
+            return "_"
+        return ""
+    table_name = "".join(
+        replace_char(c) for c in base_name
+    )
+    if table_name[0].isdigit():
+        table_name = f"_{table_name}"
+    return table_name.lower()
+def get_prompt_messages(card_data: dict[str, Any], natural_language_query: str):
+    config_choices = get_config_choices(card_data)
+    split_choices = get_split_choices(card_data)
+    chosen_config = config_choices[0] if len(config_choices) > 0 else None
+    chosen_split = split_choices[0] if len(split_choices) > 0 else None
+    table_name = get_table_name(chosen_config, chosen_split, config_choices, split_choices)
+    features = card_data[chosen_config]["features"]
     messages = [
         {
             "role": "system",
         },
         {
             "role": "user",
+            "content": f"""table {table_name}
 # Features
 {features}
 # Query
+{natural_language_query}
 """,
         },
     ]
+    return messages
def get_config_choices(card_data: dict[str, Any]) -> list[str]:
    """Return the config names, i.e. the top-level keys of ``card_data``, in order."""
    return [config_name for config_name in card_data]
def get_split_choices(card_data: dict[str, Any]) -> list[str]:
    """Return the distinct split names found across all configs.

    Names are returned in first-seen order (config order, then split order
    within each config). Callers use the first element as the default split,
    so the order must be stable between runs; collecting through a ``set``
    made it depend on string hash randomisation.

    Args:
        card_data: Mapping of config name to its info mapping; each value may
            carry a ``"splits"`` mapping keyed by split name.

    Returns:
        De-duplicated split names. Configs whose ``"splits"`` entry is
        missing or ``None`` contribute nothing.
    """
    # dict keys act as an insertion-ordered set.
    ordered_splits: dict[str, None] = {}
    for config in card_data.values():
        ordered_splits.update(dict.fromkeys(config.get("splits") or {}))
    return list(ordered_splits)
def query_dataset(hub_repo_id, card_data, query):
    """Ask the LLM to turn a natural-language request into a DuckDB SQL query.

    Builds the chat prompt from the dataset's card data, posts it to the
    Together AI chat-completions endpoint, strips any Markdown code fence
    from the answer and returns the query with an iframe that runs it.

    Args:
        hub_repo_id: Dataset repository id the query targets.
        card_data: JSON string with the dataset info (as stored in the hidden
            ``card_data`` component).
        query: The user's natural-language request.

    Returns:
        A ``(sql_query, iframe_html)`` tuple.

    Raises:
        KeyError: If the ``API_KEY_TOGETHER_AI`` environment variable is unset.
        gr.Error: If the LLM request fails or its response has an unexpected
            shape.
    """
    card_data = json.loads(card_data)
    messages = get_prompt_messages(card_data, query)
    api_key = os.environ["API_KEY_TOGETHER_AI"].strip()

    try:
        response = requests.post(
            "https://api.together.xyz/v1/chat/completions",
            json={
                "model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
                "messages": messages,
                "max_tokens": 1000,
            },
            headers={"Authorization": f"Bearer {api_key}"},
            # Without a timeout a stalled connection would block the UI forever.
            timeout=60,
        )
        if response.status_code != 200:
            # Keep the provider's error body in the logs for debugging.
            logger.warning(response.text)
        response.raise_for_status()
        response_dict = response.json()
        duck_query = response_dict["choices"][0]["message"]["content"]
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
        # gr.Error must be raised to be shown; previously it was only
        # constructed, so execution continued and failed later while parsing
        # the error response.
        raise gr.Error(f"Could not query LLM for suggestion: {e}") from e

    duck_query = _sanitize_duck_query(duck_query)
    return duck_query, get_iframe(hub_repo_id, duck_query)
+def _sanitize_duck_query(duck_query: str) -> str:
+    # Sometimes the LLM wraps the query like this:
+    # ```sql
+    # select * from x;
+    # ```
+    # This removes that wrapping if present.
+    if "```" not in duck_query:
+        return duck_query
+    start_idx = duck_query.index("```") + len("```")
+    end_idx = duck_query.rindex("```")
+    duck_query = duck_query[start_idx:end_idx]
+    if duck_query.startswith("sql\n"):
+        duck_query = duck_query.replace("sql\n", "", 1)
+    return duck_query
 with gr.Blocks() as demo:
     with gr.Row():
         search_out = gr.HTML(label="Search Results")
     with gr.Row():
+        card_data = gr.Code(label="Card data", language="json", visible=False)
     gr.on(
         [btn.click, search_in.submit],
         fn=get_iframe,
         inputs=[search_in],
         outputs=[search_out],
     ).then(
+        fn=get_table_info,
         inputs=[search_in],
+        outputs=[card_data],
     )
     gr.on(
         [btn2.click, query.submit],
         fn=query_dataset,
+        inputs=[search_in, card_data, query],
         outputs=[sql_out, search_out],
     )