awoo

Browse files

Signed-off-by: Balazs Horvath <[email protected]>

Files changed (2) hide show

dataset_tools/Insert Multiple Tags.ipynb +66 -0
dataset_tools/Remove Tags 2.ipynb +71 -0

dataset_tools/Insert Multiple Tags.ipynb ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Directory where caption files are located\n",
+    "directory = r'C:\\Users\\kade\\Desktop\\training_dir_staging'\n",
+    "\n",
+    "# Return the caption with every tag from tags_to_add that is missing appended (case-insensitive check)\n",
+    "def inject_tags(content):\n",
+    "    tags_to_add = ['realistic', 'photo', 'photo \\\\(medium\\\\)', 'photography \\\\(artwork\\\\)']\n",
+    "    # Strip first: a trailing newline would otherwise push the new tags onto a second line\n",
+    "    caption = content.strip()\n",
+    "    # Split on bare commas and trim each tag so spacing differences cannot hide an existing tag\n",
+    "    existing_tags = {tag.strip().lower() for tag in caption.split(',')}\n",
+    "    missing_tags = [tag for tag in tags_to_add if tag.lower() not in existing_tags]\n",
+    "    return ', '.join(([caption] if caption else []) + missing_tags)  # no leading ', ' for an empty caption\n",
+    "\n",
+    "# Recursively rewrite every .txt caption found below the given directory\n",
+    "def process_directory(directory):\n",
+    "    for entry in os.listdir(directory):\n",
+    "        entry_path = os.path.join(directory, entry)\n",
+    "        if os.path.isdir(entry_path):\n",
+    "            process_directory(entry_path)  # descend into the sub-folder\n",
+    "            continue\n",
+    "        if not entry.endswith('.txt'):\n",
+    "            continue  # anything that is not a .txt file is left untouched\n",
+    "        # Build the new caption before the file is reopened (and truncated) for writing\n",
+    "        with open(entry_path, 'r', encoding='utf-8') as source:\n",
+    "            updated_caption = inject_tags(source.read())\n",
+    "        # Overwrite the caption file in place\n",
+    "        with open(entry_path, 'w', encoding='utf-8') as target:\n",
+    "            target.write(updated_caption)\n",
+    "\n",
+    "# Start processing from the main directory\n",
+    "process_directory(directory)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

dataset_tools/Remove Tags 2.ipynb ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Directory where caption files are located\n",
+    "directory = r'C:\\Users\\kade\\Desktop\\training_dir_staging'\n",
+    "\n",
+    "# Tags to remove\n",
+    "tags_to_remove = ['grandfathered content', '3d \\\\(artwork\\\\)', 'digital media \\\\(artwork\\\\)']\n",
+    "\n",
+    "# Return the caption with every tag listed in tags_to_remove dropped (case-insensitive)\n",
+    "def remove_tags(content):\n",
+    "    # Normalise the configured tags too, so an entry typed with capitals or\n",
+    "    # stray spaces still matches the lower-cased caption tags compared below\n",
+    "    unwanted = {entry.strip().lower() for entry in tags_to_remove}\n",
+    "    # Split on commas and trim each tag; trimming also drops a trailing newline\n",
+    "    tags = [tag.strip() for tag in content.split(',')]\n",
+    "    kept_tags = [tag for tag in tags if tag.lower() not in unwanted]\n",
+    "    # Re-join the surviving tags with ', ' as the separator\n",
+    "    cleaned_content = ', '.join(kept_tags)\n",
+    "    return cleaned_content\n",
+    "\n",
+    "# Recursively rewrite every .txt caption found below the given directory\n",
+    "def process_directory(directory):\n",
+    "    for entry in os.listdir(directory):\n",
+    "        entry_path = os.path.join(directory, entry)\n",
+    "        if os.path.isdir(entry_path):\n",
+    "            process_directory(entry_path)  # descend into the sub-folder\n",
+    "            continue\n",
+    "        if not entry.endswith('.txt'):\n",
+    "            continue  # anything that is not a .txt file is left untouched\n",
+    "        # Build the cleaned caption before the file is reopened (and truncated) for writing\n",
+    "        with open(entry_path, 'r', encoding='utf-8') as source:\n",
+    "            cleaned_caption = remove_tags(source.read())\n",
+    "        # Overwrite the caption file in place\n",
+    "        with open(entry_path, 'w', encoding='utf-8') as target:\n",
+    "            target.write(cleaned_caption)\n",
+    "\n",
+    "# Start processing from the main directory\n",
+    "process_directory(directory)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}