import os
import json
from pathlib import Path
from typing import Any, Dict, List

from tqdm.auto import tqdm

MAX_WORDS = 250


def folder_to_json(folder_in: Path, json_path: Path) -> List[Any]:
    """
    Process JSON lines from files in a given folder and write the processed data to a new JSON file.

    Parameters:
        folder_in (Path): Path to the input folder containing the JSON files to process.
        json_path (Path): Path to the output JSON file where the processed data will be written.

    Returns:
        List[Any]: List containing processed JSON data from all files in the input folder.

    Example:
        folder_to_json(Path("/path/to/input/folder"), Path("/path/to/output.json"))
    """
    folder_in = Path(folder_in)
    json_out = []  # Holds processed JSON data from all files

    # Count the files under the input folder so the progress bar has a total
    total_files = sum(len(files) for _, _, files in os.walk(folder_in))

    # Initialize the progress bar with the total file count, a description, and a unit of progress
    with tqdm(total=total_files, desc='Processing', unit='file') as pbar:
        # Walk every file in the input folder
        for subdir, _, files in os.walk(folder_in):
            # Show the current directory in the progress bar postfix
            pbar.set_postfix_str(f"Directory: {subdir}", refresh=False)
            for file in files:
                # Show the current file and directory in the progress bar postfix
                pbar.set_postfix_str(f"Dir: {subdir} | File: {file}", refresh=True)

                # Build the full path of the current file
                file_path = Path(subdir) / file

                # Read the current file line by line (JSON Lines format)
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        # Parse the JSON record on this line and preprocess it
                        article = json.loads(line)
                        # Ensure the preprocess function is defined and accessible
                        processed_article = preprocess(article)
                        # Add the processed chunks to the output list
                        json_out.extend(processed_article)

                # Advance the progress bar once per processed file
                pbar.update(1)

        # Notify that the writing step is starting
        pbar.write("Writing file!")

        # Write the processed data to the output file as JSON
        with open(json_path, "w", encoding='utf-8') as outfile:
            json.dump(json_out, outfile)

        # Notify that the writing step is complete
        pbar.write("File written!")

    # Return the list of processed data
    return json_out


def preprocess(article: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Preprocess an article dictionary, extracting and chunking its 'text' field.
    Because of the `break` below, only the first chunk is kept.

    Parameters:
        article (Dict[str, Any]): Input dictionary containing an article.
            Expected to have a 'text' field.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries, each representing a
        preprocessed chunk of the original article's text. Each dictionary also
        carries the original article's fields (excluding 'text'), plus a
        'chunk_number' field indicating the order of the chunk.

    Example:
        article = {"text": "Example text", "title": "Example Title", "author": "John Doe"}
        processed = preprocess(article)
        print(processed)
    """
    # Copy the article's fields, excluding 'text'
    article_out = {k: v for k, v in article.items() if k != 'text'}

    # Prefix each chunk with the article's title ('عنوان' means 'title').
    # Adjust this line as needed based on the actual structure of 'article'.
    prefix = f'عنوان: {article["title"]}. \n'

    out = []  # Holds the preprocessed chunks

    # Split the article's text into word-bounded chunks using group_arabic_paragraphs
    # (defined below) and build one record per chunk
    for i, chunk in enumerate(group_arabic_paragraphs(article['text'], MAX_WORDS)):
        # Prepend the title prefix to the current chunk
        chunk = prefix + chunk
        # Combine the chunk, the article's other fields (excluding 'text'), and the chunk number
        out.append({'chunk': chunk, **article_out, 'chunk_number': i})
        # Only take the first chunk
        break

    # Return the list of preprocessed chunks
    return out


def group_arabic_paragraphs(arabic_text: str, max_words: int) -> List[str]:
    """
    Group contiguous paragraphs of Arabic text without exceeding the max_words limit per group.

    Parameters:
        arabic_text (str): The input Arabic text, with paragraphs separated by newlines.
        max_words (int): The maximum number of words allowed per group of paragraphs.

    Returns:
        List[str]: A list of strings, each a group of contiguous paragraphs.

    Example:
        arabic_text = "Paragraph1.\nParagraph2.\nParagraph3."
        max_words = 5
        result = group_arabic_paragraphs(arabic_text, max_words)
        print(result)  # Output depends on the word count of each paragraph and max_words.
    """
    # Split the input text into paragraphs on newlines
    paragraphs = arabic_text.split('\n')

    # State for the groups built so far, the group in progress, and its word count
    grouped_paragraphs = []
    current_group = []
    current_word_count = 0

    # Walk through each paragraph in the input text
    for paragraph in paragraphs:
        # Count the words in this paragraph
        word_count = len(paragraph.split())

        if current_word_count + word_count <= max_words:
            # The paragraph fits within the word budget: add it to the current group
            current_group.append(paragraph)
            current_word_count += word_count
        else:
            # The paragraph would exceed the budget: close the current group and start a new one
            if current_group:
                grouped_paragraphs.append('\n'.join(current_group))
            current_group = [paragraph]
            current_word_count = word_count

    # Add the last group if it is not empty
    if current_group:
        grouped_paragraphs.append('\n'.join(current_group))

    # Return the grouped paragraphs as a list of strings
    return grouped_paragraphs


if __name__ == '__main__':
    folder = Path('output')
    file_out = Path('arwiki.json')
    folder_to_json(folder, file_out)
    print('Done!')
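

# --- A minimal sketch (not part of the original pipeline) showing how
# group_arabic_paragraphs packs contiguous paragraphs under a word budget.
# The English sample text and the max_words value are illustrative assumptions.
def _demo_group_arabic_paragraphs() -> None:
    sample = "one two three\nfour five six\nseven eight nine"
    chunks = group_arabic_paragraphs(sample, max_words=7)
    # The first two 3-word paragraphs fit the 7-word budget together; adding the
    # third would overflow it, so it starts a new group.
    assert chunks == ["one two three\nfour five six", "seven eight nine"]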