Oscar Wang committed: Update app.py
app.py CHANGED
@@ -1,28 +1,32 @@
 import pandas as pd
-from groq import Groq
 import os
 import gradio as gr
 import threading
 import time
+from groq import Groq

+# Initialize Groq client
 client = Groq()
-
+
+# Constants
+MAX_SIZE = 1.1 * 1024 * 1024 * 1024  # 1.1GB in bytes
+DATA_DIRECTORY = 'data'
+UPDATE_INTERVAL = 1  # Update interval in seconds
+
+# Ensure the data directory exists
+os.makedirs(DATA_DIRECTORY, exist_ok=True)
+
+# Initialize variables
 file_index = 1
-
-current_file = os.path.join(data_directory, f'data{file_index}.csv')
+current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
 file_paths = [current_file]
 combined_tokens = 0
-update_interval = 1  # Update interval in seconds
-
-# Ensure the data directory exists
-if not os.path.exists(data_directory):
-    os.makedirs(data_directory)

+# Helper function to get file size
 def get_file_size(filename):
-    if os.path.isfile(filename)
-        return os.path.getsize(filename)
-    return 0
+    return os.path.getsize(filename) if os.path.isfile(filename) else 0

+# Data generation and saving function
 def generate_and_save_data():
     global file_index, current_file, file_paths, combined_tokens
     while True:
@@ -47,9 +51,9 @@ def generate_and_save_data():
             prompt_tokens = 0
             for chunk in completion:
                 content = chunk.choices[0].delta.content
-                if content
+                if content:
                     prompt += content
-                prompt_tokens += len(content.split())
+                    prompt_tokens += len(content.split())

             # Use the generated prompt to query the model again
             second_completion = client.chat.completions.create(
@@ -61,7 +65,7 @@ def generate_and_save_data():
                     }
                 ],
                 temperature=1,
-                max_tokens=
+                max_tokens=5000,
                 top_p=1,
                 stream=True,
                 stop=None,
@@ -71,9 +75,9 @@ def generate_and_save_data():
             response_tokens = 0
             for chunk in second_completion:
                 content = chunk.choices[0].delta.content
-                if content
+                if content:
                     response += content
-                response_tokens += len(content.split())
+                    response_tokens += len(content.split())

             # Update the combined token count
             combined_tokens += (prompt_tokens + response_tokens)
@@ -86,33 +90,32 @@ def generate_and_save_data():
             data = pd.DataFrame({"prompt": [prompt], "response": [response]})

             # Check the size of the current file
-            if get_file_size(current_file) >=
+            if get_file_size(current_file) >= MAX_SIZE:
                 file_index += 1
-                current_file = os.path.join(data_directory, f'data{file_index}.csv')
+                current_file = os.path.join(DATA_DIRECTORY, f'data{file_index}.csv')
                 file_paths.append(current_file)

-            #
-
-
-
-            if file_exists:
-                data.to_csv(current_file, mode='a', header=False, index=False)
-            else:
-                data.to_csv(current_file, mode='w', header=True, index=False)
+            # Append data to the current file
+            mode = 'a' if os.path.isfile(current_file) else 'w'
+            header = not os.path.isfile(current_file)
+            data.to_csv(current_file, mode=mode, header=header, index=False)

             # Wait for the next update interval
-            time.sleep(update_interval)
+            time.sleep(UPDATE_INTERVAL)

         except Exception as e:
             print(f"An error occurred: {e}. Retrying in 5 seconds...")
             time.sleep(5)

+# Get available files
 def get_available_files():
     return [f for f in file_paths if os.path.isfile(f)]

+# Update file list
 def update_file_list():
     return gr.update(choices=get_available_files())

+# Update token count
 def update_token_count():
     return combined_tokens
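The two streaming calls in generate_and_save_data appear only in fragments above, because the unchanged model and messages arguments fall outside the hunks. As a point of reference, here is a minimal, self-contained sketch of the streaming pattern the app relies on; the model name and the example prompt are placeholders I am assuming, not values taken from this commit.

import os
from groq import Groq

# The client reads GROQ_API_KEY from the environment.
client = Groq()

def stream_completion(user_prompt, model="llama3-8b-8192"):
    """Stream a chat completion and return (text, rough_token_count),
    mirroring the counting logic in generate_and_save_data()."""
    completion = client.chat.completions.create(
        model=model,  # placeholder model name, not taken from the diff
        messages=[{"role": "user", "content": user_prompt}],
        temperature=1,
        max_tokens=5000,
        top_p=1,
        stream=True,
        stop=None,
    )
    text, tokens = "", 0
    for chunk in completion:
        content = chunk.choices[0].delta.content
        if content:
            text += content
            tokens += len(content.split())  # whitespace split, not real tokens
    return text, tokens

if __name__ == "__main__":
    reply, count = stream_completion("Write a short prompt about space travel.")
    print(count, reply[:80])

Note that len(content.split()) counts whitespace-separated words, so combined_tokens in the app is a rough proxy for usage rather than the provider's actual token count.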
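The code that starts the generator thread and builds the Gradio interface is not part of this diff, so the wiring below is only an assumption about how get_available_files, update_file_list, and update_token_count might be hooked up; the component choices and the one-second polling interval are illustrative.

import threading
import gradio as gr

# Assumed wiring; the actual UI code in app.py is outside the hunks shown above.
threading.Thread(target=generate_and_save_data, daemon=True).start()

with gr.Blocks() as demo:
    token_box = gr.Number(value=update_token_count, label="Combined tokens", every=1)
    file_dropdown = gr.Dropdown(choices=get_available_files(), label="Generated CSV files")
    refresh_btn = gr.Button("Refresh file list")
    refresh_btn.click(fn=update_file_list, outputs=file_dropdown)

demo.launch()

Running the generator as a daemon thread leaves demo.launch() as the blocking call that keeps the Space alive, while the UI callbacks only read file_paths and combined_tokens.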