import json
import re
import time

import wikipediaapi
from tqdm import tqdm


def get_wiki_pages(categories=("Azərbaycan tarixi", "Azərbaycan mədəniyyəti",
                               "Azərbaycan ədəbiyyatı", "Azərbaycan coğrafiyası"),
                   min_length=500, max_pages=1000):
    """
    Recursively collect substantial Azerbaijani Wikipedia pages
    from multiple categories.
    """
    wiki = wikipediaapi.Wikipedia(
        user_agent='AzGPTDataCollector/1.0',
        language='az',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
    )

    collected_pages = {}
    visited_pages = set()
    visited_categories = set()

    def collect_pages(category_title):
        if len(collected_pages) >= max_pages:
            return
        # Guard against category cycles, which would otherwise recurse forever
        if category_title in visited_categories:
            return
        visited_categories.add(category_title)

        category = wiki.page(f"Kateqoriya:{category_title}")
        if not category.exists():
            print(f"Category not found: {category_title}")
            return

        # First, process all articles in this category
        for member in category.categorymembers.values():
            if len(collected_pages) >= max_pages:
                return
            if member.title in visited_pages:
                continue
            visited_pages.add(member.title)

            # Skip category and template pages
            if member.title.startswith('Kateqoriya:') or member.title.startswith('Şablon:'):
                continue

            # Skip articles whose text is too short
            if len(member.text) < min_length:
                continue

            collected_pages[member.title] = {
                'title': member.title,
                'text': member.text,
                'url': member.fullurl,
                'length': len(member.text),
            }
            print(f"Collected: {member.title} ({len(member.text)} chars)")

            # Small delay to avoid hitting API rate limits
            time.sleep(0.1)

        # Then recurse into subcategories
        for subcategory in category.categorymembers.values():
            if len(collected_pages) >= max_pages:
                return
            if subcategory.title.startswith('Kateqoriya:'):
                collect_pages(subcategory.title.replace('Kateqoriya:', '', 1))

    # Start collection from each top-level category
    for category in categories:
        print(f"\nStarting collection from category: {category}")
        collect_pages(category)

    return collected_pages


def preprocess_text(text):
    """
    Light text preprocessing for Azerbaijani Wikipedia text.
    """
    # Collapse all runs of whitespace (including newlines) into single spaces
    text = ' '.join(text.split())

    # Fix common OCR/encoding artifacts. Note that dotted 'i' and dotless 'ı'
    # are distinct letters in Azerbaijani, so they must not be swapped wholesale.
    replacements = {
        'І': 'I',    # Cyrillic capital І mistaken for Latin I
        '...': '…',
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Ensure a single space after sentence punctuation where it is missing
    # (digits are excluded so decimal numbers stay intact); this is a heuristic
    text = re.sub(r'([.!?;:,])(?=[^\s\d])', r'\1 ', text)

    return text


def save_dataset(pages, output_file='az_wiki_data.json'):
    """
    Save collected pages to a JSON file.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(pages, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(pages)} pages to {output_file}")


def main():
    # Collect pages that are at least 500 characters long
    print("Starting data collection...")
    pages = get_wiki_pages(min_length=500, max_pages=100)

    # Preprocess and save
    print("\nPreprocessing and saving data...")
    for title in tqdm(pages, desc="Preprocessing"):
        pages[title]['text'] = preprocess_text(pages[title]['text'])
        # Keep the stored length in sync with the cleaned text
        pages[title]['length'] = len(pages[title]['text'])

    save_dataset(pages)

    # Print statistics
    total_chars = sum(page['length'] for page in pages.values())
    if pages:
        print("\nCollection complete!")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")
        print(f"Average page length: {total_chars / len(pages):.2f} characters")

        # Print a few titles as examples
        print("\nSample of collected articles:")
        for title in list(pages.keys())[:5]:
            print(f"- {title} ({pages[title]['length']} chars)")


if __name__ == "__main__":
    main()
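

# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the collector above): once
# az_wiki_data.json has been written, the texts could be split into simple
# train/validation files for tokenizer or language-model training. The file
# names 'train.txt' and 'valid.txt' and the 90/10 split are hypothetical.
#
#   import json, random
#
#   with open('az_wiki_data.json', encoding='utf-8') as f:
#       pages = json.load(f)
#
#   texts = [page['text'] for page in pages.values()]
#   random.seed(42)
#   random.shuffle(texts)
#
#   split = int(len(texts) * 0.9)
#   with open('train.txt', 'w', encoding='utf-8') as f:
#       f.write('\n\n'.join(texts[:split]))
#   with open('valid.txt', 'w', encoding='utf-8') as f:
#       f.write('\n\n'.join(texts[split:]))
# ---------------------------------------------------------------------------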