toolkit / remove_extra_whitespace
k4d3's picture
joy updates (we haven't tested yet) and remove_extra_whitespace
ee41534
raw
history blame
2.01 kB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This script collapses every run of extra spaces and newline characters down to a single character
in all *.caption and *.txt files found recursively under a target directory. If no target directory
is given as an argument, the current working directory is processed.
Usage:
python script_name.py [target_directory]
Args:
target_directory (str, optional): The path to the target directory. If not provided, the current directory is used.
"""
import os
import sys
import glob
def remove_extra_spaces_and_newlines(file_path):
    """
    Collapse redundant whitespace in a text file, rewriting it in place.

    Within each line, every run of whitespace is reduced to a single space and
    leading/trailing whitespace is stripped. Lines that are empty after that
    are dropped, so a run of newline characters becomes a single newline. Line
    breaks between non-empty lines are preserved and no non-whitespace text is
    removed. The result has no trailing newline.

    Args:
        file_path (str): The path to the file to be processed. The file is
            read and written as UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Normalise one line at a time. Calling split() on the whole content would
    # treat newlines as ordinary whitespace and flatten the file into a single
    # line, leaving nothing for the newline handling to do.
    cleaned_lines = (' '.join(line.split()) for line in content.split('\n'))

    # Empty lines are what runs of consecutive newlines look like after the
    # split; skipping them leaves exactly one newline between text lines.
    content = '\n'.join(line for line in cleaned_lines if line)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)
def process_files_in_directory(directory):
    """
    Clean up every *.caption and *.txt file under a directory, recursively.

    All *.caption files are processed first, then all *.txt files. Each
    matching regular file is passed to remove_extra_spaces_and_newlines,
    which rewrites it in place.

    Args:
        directory (str): The path to the directory to be processed. It is
            matched literally, so glob metacharacters in the path itself
            ('[', ']', '*', '?') are safe.
    """
    # Without escaping, a directory such as "set [v2]" would be interpreted as
    # a glob character class and its files would silently be skipped.
    root = glob.escape(directory)

    for extension_pattern in ('*.caption', '*.txt'):
        search_pattern = os.path.join(root, '**', extension_pattern)
        for file_path in glob.glob(search_pattern, recursive=True):
            # The pattern also matches directories whose names end in
            # .caption/.txt; those cannot be opened as files, so skip them.
            if os.path.isfile(file_path):
                remove_extra_spaces_and_newlines(file_path)
if __name__ == "__main__":
    # The first command-line argument, when present, names the directory to
    # process; otherwise fall back to the current working directory.
    cli_arguments = sys.argv[1:]
    target_directory = cli_arguments[0] if cli_arguments else os.getcwd()
    process_files_in_directory(target_directory)