Spaces:
Running
Running
File size: 8,380 Bytes
2996fd9 66dcc60 2996fd9 bfef1ae 2996fd9 bfef1ae 2996fd9 0d7e823 2996fd9 e5e6a27 b16a321 2996fd9 e36771a 2996fd9 0d7e823 7324d20 f1224f3 0d7e823 2996fd9 f1224f3 2996fd9 f1224f3 218bcb5 f1224f3 d73ef8b f1224f3 d73ef8b f1224f3 d73ef8b 2cde806 f1224f3 d8a66e4 2cde806 f1224f3 26376e0 2996fd9 1a0d424 2996fd9 31825c6 2996fd9 218bcb5 2996fd9 b16a321 2996fd9 b16a321 2996fd9 7324d20 218bcb5 2996fd9 2cde806 2996fd9 31825c6 2996fd9 2d17c93 2996fd9 bfef1ae 2996fd9 |
|
"""The UI file for the SynthGenAI package."""
import os
import asyncio
import gradio as gr
from synthgenai import DatasetConfig, DatasetGeneratorConfig, LLMConfig, InstructionDatasetGenerator, PreferenceDatasetGenerator,RawDatasetGenerator,SentimentAnalysisDatasetGenerator, SummarizationDatasetGenerator, TextClassificationDatasetGenerator
def _parse_env_assignments(raw_env_vars):
    """Parse a comma-separated ``KEY=VALUE`` string into a dict of env vars."""
    parsed = {}
    for position, entry in enumerate((raw_env_vars or "").split(","), start=1):
        entry = entry.strip()
        if not entry:
            # Tolerate an empty field, trailing commas and doubled commas.
            continue
        # Split on the first "=" only: values (e.g. padded API keys) may
        # legitimately contain "=" themselves.
        key, separator, value = entry.partition("=")
        key = key.strip()
        if not separator or not key:
            # The entry text is deliberately not echoed: it may be a secret.
            raise ValueError(
                f"Invalid LLM environment variable entry #{position}: "
                "expected KEY=VALUE pairs separated by commas."
            )
        parsed[key] = value.strip()
    return parsed


def _select_generator_class(dataset_type):
    """Return the generator class for a UI dataset type, or None if unknown."""
    if dataset_type == "Raw":
        return RawDatasetGenerator
    if dataset_type == "Instruction":
        return InstructionDatasetGenerator
    if dataset_type == "Preference":
        return PreferenceDatasetGenerator
    if dataset_type == "Sentiment Analysis":
        return SentimentAnalysisDatasetGenerator
    if dataset_type == "Summarization":
        return SummarizationDatasetGenerator
    if dataset_type == "Text Classification":
        return TextClassificationDatasetGenerator
    return None


def generate_synthetic_dataset(
    llm_model,
    temperature,
    top_p,
    max_tokens,
    dataset_type,
    topic,
    domains,
    language,
    additional_description,
    num_entries,
    hf_token,
    hf_repo_name,
    llm_env_vars,
):
    """
    Generate a dataset based on the provided parameters.

    The Hugging Face token and the LLM environment variables are exported to
    ``os.environ`` first; the dataset type is then validated before any
    configuration object is built.

    Args:
        llm_model (str): The LLM model to use.
        temperature (float): The temperature for the LLM.
        top_p (float): The top_p value for the LLM.
        max_tokens (int): The maximum number of tokens for the LLM.
        dataset_type (str): The type of dataset to generate.
        topic (str): The topic of the dataset.
        domains (str): Comma-separated domains for the dataset. Whitespace
            around each domain is stripped and empty items are dropped.
        language (str): The language of the dataset.
        additional_description (str): Additional description for the dataset.
        num_entries (int): The number of entries in the dataset.
        hf_token (str): The Hugging Face token.
        hf_repo_name (str): The Hugging Face repository name.
        llm_env_vars (str): Comma-separated ``KEY=VALUE`` environment
            variables for the LLM. Values may contain ``=``; blank entries
            are ignored.

    Returns:
        str: A message indicating the result of the dataset generation.

    Raises:
        ValueError: If an entry of ``llm_env_vars`` is not of the form
            ``KEY=VALUE``.
    """
    os.environ["HF_TOKEN"] = hf_token
    os.environ.update(_parse_env_assignments(llm_env_vars))

    generator_class = _select_generator_class(dataset_type)
    if generator_class is None:
        return "Invalid dataset type"

    llm_config = LLMConfig(
        model=llm_model,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    stripped_domains = (domain.strip() for domain in domains.split(","))
    dataset_config = DatasetConfig(
        topic=topic,
        domains=[domain for domain in stripped_domains if domain],
        language=language,
        additional_description=additional_description,
        num_entries=num_entries,
    )
    dataset_generator_config = DatasetGeneratorConfig(
        llm_config=llm_config,
        dataset_config=dataset_config,
    )
    generator = generator_class(dataset_generator_config)

    async def generate():
        dataset = await generator.agenerate_dataset()
        dataset.save_dataset(hf_repo_name=hf_repo_name)
        return "Dataset generated and saved successfully."

    try:
        return asyncio.run(generate())
    except RuntimeError as e:
        # Only the "Event loop is closed" failure is retried, on a freshly
        # created loop that is also installed as the current event loop.
        if str(e) == "Event loop is closed":
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(generate())
        else:
            raise
def ui_main():
    """
    Launch the Gradio UI for the SynthGenAI dataset generator.

    Builds a Blocks page with an HTML header, three rows of inputs (LLM
    settings, dataset settings, Hugging Face destination), a "Generate
    Dataset" button wired to ``generate_synthetic_dataset`` and a textbox that
    shows the returned message, then launches the app.
    """
    with gr.Blocks(
        title="SynthGenAI Dataset Generator",
        css="""
        .gradio-container .gr-block {
            margin-bottom: 10px;
            margin-left: 5px;
            margin-right: 5px;
            text-align: center;
        }
        """,
        theme="ParityError/Interstellar",
    ) as demo:
        gr.HTML(
            """
            <div style="text-align: center;">
                <img src="https://raw.githubusercontent.com/Shekswess/synthgenai/refs/heads/main/docs/assets/logo_header.png" alt="Header Image" style="width: 50%; margin: 0 auto;" />
                <h1>SynthGenAI Dataset Generator</h1>
                <h2>Overview 🧐</h2>
                <p>SynthGenAI is designed to be modular and can be easily extended to include different API providers for LLMs and new features.</p>
                <h2>Why SynthGenAI? 🤔</h2>
                <p>Interest in synthetic data generation has surged recently, driven by the growing recognition of data as a critical asset in AI development. Synthetic data generation addresses challenges by allowing us to create diverse and useful datasets using current pre-trained Large Language Models (LLMs).</p>
                <h2>LLM Providers 🤖</h2>
                <p>For more information on which LLMs are allowed and how they can be used, please refer to the <a href="https://shekswess.github.io/synthgenai/llm_providers/">documentation</a>.</p>
                <a href="https://github.com/Shekswess/synthgenai/tree/main">GitHub Repository</a> | <a href="https://shekswess.github.io/synthgenai/">Documentation</a>
            </div>
            """
        )

        # Row 1: which LLM to call and how to sample from it.
        with gr.Row():
            llm_model = gr.Textbox(
                label="LLM Model",
                placeholder="model_provider/model_name",
                value="huggingface/mistralai/Mistral-7B-Instruct-v0.3",
            )
            llm_env_vars = gr.Textbox(
                label="LLM Environment Variables",
                placeholder="Comma-separated environment variables (e.g., KEY1=VALUE1, KEY2=VALUE2)",
                value="HUGGINGFACE_API_KEY=hf_1234566789912345677889, OPENAI_API_KEY=sk-1234566789912345677889",
            )
            temperature = gr.Slider(
                label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.5
            )
            top_p = gr.Slider(
                label="Top P", minimum=0.0, maximum=1.0, step=0.1, value=0.9
            )
            # precision=0 makes the component hand an int (not a float) to the
            # callback, matching the integer token limit it represents.
            max_tokens = gr.Number(label="Max Tokens", value=2048, precision=0)

        # Row 2: what the dataset should contain.
        with gr.Row():
            dataset_type = gr.Dropdown(
                label="Dataset Type",
                choices=[
                    "Raw",
                    "Instruction",
                    "Preference",
                    "Sentiment Analysis",
                    "Summarization",
                    "Text Classification",
                ],
            )
            topic = gr.Textbox(
                label="Topic",
                placeholder="Dataset topic",
                value="Artificial Intelligence",
            )
            domains = gr.Textbox(
                label="Domains",
                placeholder="Comma-separated domains",
                value="Machine Learning, Deep Learning",
            )
            language = gr.Textbox(
                label="Language", placeholder="Language", value="English"
            )
            additional_description = gr.Textbox(
                label="Additional Description",
                placeholder="Additional description",
                value="This dataset must be more focused on healthcare implementations of AI, Machine Learning, and Deep Learning.",
            )
            # precision=0: the entry count is a whole number.
            num_entries = gr.Number(
                label="Number of Entries to Generate", value=1000, precision=0
            )

        # Row 3: where the generated dataset is saved.
        with gr.Row():
            hf_token = gr.Textbox(
                label="Hugging Face Token to Save Dataset",
                placeholder="Your HF Token",
                type="password",
            )
            hf_repo_name = gr.Textbox(
                label="Hugging Face Repo Name",
                placeholder="organization_or_user_name/dataset_name",
                value="Shekswess/synthgenai-dataset",
            )

        generate_button = gr.Button("Generate Dataset")
        output = gr.Textbox(label="Operation Result", value="")

        # The input order must match the positional parameters of
        # generate_synthetic_dataset.
        generate_button.click(
            generate_synthetic_dataset,
            inputs=[
                llm_model,
                temperature,
                top_p,
                max_tokens,
                dataset_type,
                topic,
                domains,
                language,
                additional_description,
                num_entries,
                hf_token,
                hf_repo_name,
                llm_env_vars,
            ],
            outputs=output,
        )

    demo.launch(inbrowser=True, favicon_path=None)
# Script entry point: start the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    ui_main()