k4d3 committed on
Commit
3fccb67
1 Parent(s): f12337d

Signed-off-by: Balazs Horvath <[email protected]>

dataset_tools/Insert Multiple Tags.ipynb ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "\n",
11
+ "# Directory where caption files are located\n",
12
+ "directory = r'C:\\Users\\kade\\Desktop\\training_dir_staging'\n",
13
+ "\n",
14
+ "# Function to inject tags into the caption content\n",
15
+ "def inject_tags(content):\n",
16
+ " tags_to_add = ['realistic', 'photo', 'photo \\\\(medium\\\\)', 'photography \\\\(artwork\\\\)']\n",
17
+ " existing_tags = set(content.lower().split(', '))\n",
18
+ " # Add tags to the content if they don't already exist\n",
19
+ " for tag in tags_to_add:\n",
20
+ " if tag.lower() not in existing_tags:\n",
21
+ " content += f', {tag}'\n",
22
+ " return content\n",
23
+ "\n",
24
+ "# Function to recursively process files in a directory\n",
25
+ "def process_directory(directory):\n",
26
+ " for filename in os.listdir(directory):\n",
27
+ " filepath = os.path.join(directory, filename)\n",
28
+ " if os.path.isdir(filepath):\n",
29
+ " process_directory(filepath) # Recursively process subdirectories\n",
30
+ " elif filename.endswith('.txt'):\n",
31
+ " # Read content from the file\n",
32
+ " with open(filepath, 'r', encoding='utf-8') as file:\n",
33
+ " content = file.read()\n",
34
+ " # Inject tags into the content\n",
35
+ " modified_content = inject_tags(content)\n",
36
+ " # Write modified content back to the file\n",
37
+ " with open(filepath, 'w', encoding='utf-8') as file:\n",
38
+ " file.write(modified_content)\n",
39
+ "\n",
40
+ "# Start processing from the main directory\n",
41
+ "process_directory(directory)"
42
+ ]
43
+ }
44
+ ],
45
+ "metadata": {
46
+ "kernelspec": {
47
+ "display_name": "Python 3",
48
+ "language": "python",
49
+ "name": "python3"
50
+ },
51
+ "language_info": {
52
+ "codemirror_mode": {
53
+ "name": "ipython",
54
+ "version": 3
55
+ },
56
+ "file_extension": ".py",
57
+ "mimetype": "text/x-python",
58
+ "name": "python",
59
+ "nbconvert_exporter": "python",
60
+ "pygments_lexer": "ipython3",
61
+ "version": "3.12.3"
62
+ }
63
+ },
64
+ "nbformat": 4,
65
+ "nbformat_minor": 2
66
+ }
dataset_tools/Remove Tags 2.ipynb ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "\n",
11
+ "# Directory where caption files are located\n",
12
+ "directory = r'C:\\Users\\kade\\Desktop\\training_dir_staging'\n",
13
+ "\n",
14
+ "# Tags to remove\n",
15
+ "tags_to_remove = ['grandfathered content', '3d \\\\(artwork\\\\)', 'digital media \\\\(artwork\\\\)']\n",
16
+ "\n",
17
+ "# Function to remove tags from the caption content\n",
18
+ "def remove_tags(content):\n",
19
+ " # Split the content into tags\n",
20
+ " tags = [tag.strip() for tag in content.split(',')]\n",
21
+ "\n",
22
+ " # Remove specified tags\n",
23
+ " cleaned_tags = [tag for tag in tags if tag.lower() not in tags_to_remove]\n",
24
+ "\n",
25
+ " # Reconstruct the content without removed tags\n",
26
+ " cleaned_content = ', '.join(cleaned_tags)\n",
27
+ " return cleaned_content\n",
28
+ "\n",
29
+ "# Function to recursively process files in a directory\n",
30
+ "def process_directory(directory):\n",
31
+ " for filename in os.listdir(directory):\n",
32
+ " filepath = os.path.join(directory, filename)\n",
33
+ " if os.path.isdir(filepath):\n",
34
+ " process_directory(filepath) # Recursively process subdirectories\n",
35
+ " elif filename.endswith('.txt'):\n",
36
+ " # Read content from the file\n",
37
+ " with open(filepath, 'r', encoding='utf-8') as file:\n",
38
+ " content = file.read()\n",
39
+ " # Remove specified tags from the content\n",
40
+ " modified_content = remove_tags(content)\n",
41
+ " # Write modified content back to the file\n",
42
+ " with open(filepath, 'w', encoding='utf-8') as file:\n",
43
+ " file.write(modified_content)\n",
44
+ "\n",
45
+ "# Start processing from the main directory\n",
46
+ "process_directory(directory)\n"
47
+ ]
48
+ }
49
+ ],
50
+ "metadata": {
51
+ "kernelspec": {
52
+ "display_name": "Python 3",
53
+ "language": "python",
54
+ "name": "python3"
55
+ },
56
+ "language_info": {
57
+ "codemirror_mode": {
58
+ "name": "ipython",
59
+ "version": 3
60
+ },
61
+ "file_extension": ".py",
62
+ "mimetype": "text/x-python",
63
+ "name": "python",
64
+ "nbconvert_exporter": "python",
65
+ "pygments_lexer": "ipython3",
66
+ "version": "3.12.3"
67
+ }
68
+ },
69
+ "nbformat": 4,
70
+ "nbformat_minor": 2
71
+ }