# Source: HuggingFace "toolkit / txt2emoji" by k4d3, commit 3f04019
# (page chrome from the original scrape removed so the file parses).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This script converts text to related emojis, injecting context-relevant visual
cues into datasets. It enables the enrichment of text data with corresponding
emojis, enhancing its expressiveness and engagement.
Before running this script, you need to download the required resources. Just open
up `python` and type:
import nltk
nltk.download('punkt_tab')
"""
import nltk
from nltk.tokenize import word_tokenize
from emoji import EMOJI_DATA
import argparse
from pathlib import Path
import re
# Download required NLTK data (only needed once)
# NOTE(review): the module docstring tells users to fetch 'punkt_tab'
# (required by newer NLTK releases), but this call fetches 'punkt' —
# confirm which resource the installed NLTK version actually needs.
nltk.download('punkt', quiet=True)
def get_emoji_mapping():
    """Build word->emoji lookup tables from the ``emoji`` package database.

    Returns:
        tuple: ``(emoji_map, emoji_variations)`` where

        * ``emoji_map`` maps each lowercase word taken from an emoji's
          English description to the list of emoji characters whose
          description contains that word.
        * ``emoji_variations`` maps a base emoji to the set of its
          variants.  NOTE(review): current ``emoji`` releases expose no
          ``'base'`` field in ``EMOJI_DATA``, so each group is expected
          to be a singleton {emoji: {emoji}} — confirm against the
          installed package version.
    """
    emoji_map = {}
    emoji_variations = {}
    for emoji_char, data in EMOJI_DATA.items():
        # Fall back to the emoji itself when no 'base' key exists.
        base_emoji = data.get('base', emoji_char)
        emoji_variations.setdefault(base_emoji, set()).add(emoji_char)
        if 'en' in data:  # If emoji has an English description
            # Descriptions look like ':grinning_face:'.  Strip the
            # surrounding colons BEFORE splitting; otherwise the first and
            # last words keep a ':' (e.g. ':grinning', 'face:') and can
            # never match a tokenized input word (bug fix).
            words = data['en'].strip(':').lower().replace('_', ' ').split()
            for word in words:
                emoji_map.setdefault(word, []).append(emoji_char)
    return emoji_map, emoji_variations
def text_to_emojis(text):
    """Convert *text* into a string of related emojis.

    Each word token is matched first against a table of manual overrides
    (``custom_mappings``), then against the word->emoji index built from
    the ``emoji`` package.  Every emoji (together with its variations) is
    emitted at most once per call; stop-words and any token containing a
    digit are skipped.

    Args:
        text (str): Arbitrary input text.

    Returns:
        tuple: ``(emojis, explanations)`` — a space-separated emoji string
        (``''`` when nothing matched) and a list of per-token explanation
        strings describing each mapping decision.
    """
    # Create emoji mapping and variations.  Rebuilt on every call; cheap
    # enough for a per-file batch tool.
    emoji_map, emoji_variations = get_emoji_mapping()
    # Regex pattern to match any token containing numbers
    number_pattern = re.compile(r'.*\d+.*')
    # Emojis to exclude (visually noisy geometric shapes and similar)
    excluded_emojis = {
        'πŸ”Ά', 'β­•', 'πŸ”·', 'πŸ”Ή', 'πŸ”Έ', 'πŸ”Ί', 'πŸ”»', 'πŸ”΄',
        'πŸ”΅', 'πŸ”Ό', 'πŸ”Ύ', 'πŸ‡΅πŸ‡¬', 'πŸ€„', 'πŸ”²', 'βœ…',
    }
    # Words to exclude from emoji conversion (stop words, punctuation and
    # domain-specific tags that should never produce an emoji).
    # Bug fix: the original had `'pussy' 'penetrated'` with a missing
    # comma, which Python concatenates to the single (useless) entry
    # 'pussypenetrated', so NEITHER word was actually excluded.
    excluded_words = {
        '(', ')', 'purple', 'abdominal', 'penetration', 'feral', 'body',
        'nude', 'anthro', 'big', 'small', 'the', 'a', 'an', 'and', 'or',
        'but', 'if', 'then', 'because', 'as', 'until', 'while', ',',
        'hi', 'res', 'pussy', 'penetrated', 'equine', 'felid', 'feline',
        'equid', 'genital', 'genitals', 'penetrating', 'medial', 'ring',
        'inside', 'duo', 'solo', 'in', 'hair', 'andromorph', 'from',
        'behind', 'position', 'pantherine', 'animal', 'brown', 'sub',
        'dom', 'explicit', 'black', 'bulge', 'dominant', 'kousen',
        'rendan', 'genitalia', 'tan', 'simple', 'media', 'vaginal',
        'red', 'pecs', 'navel', 'background', 'pubes',
    }
    # Track used emojis so each (and its variations) appears only once.
    used_emojis = set()
    # Additional manual mappings for common words.  These take precedence
    # over the emoji-database index.  (A duplicate 'dog' entry with the
    # identical value was removed — dict literals keep only the last one
    # anyway.)
    # NOTE(review): 'bigger'/'larger' map to 'πŸ”Ό', which is also listed in
    # excluded_emojis, so those two mappings can never fire — confirm
    # whether that is intended.
    custom_mappings = {
        'markings': '🏷️',
        'sweat': 'πŸ’§',
        'toes': 'πŸ‘£',
        'teeth': '🦷',
        'fingering': 'πŸ‘‰',
        'blush': '😊',
        'male': '♂️',
        'tiger': '🐯',
        'fluids': 'πŸ’§',
        'wolf': '🐺',
        'dog': '🐢',
        'female': '♀️',
        'intersex': '⚧️',
        'muscular': 'πŸ’ͺ',
        'wheelbarrow': '🚜',
        'sex': 'πŸ’‘',
        'size': 'πŸ“',
        'difference': 'πŸ”’',
        'penis': 'πŸ”±',
        'paws': '🐾',
        'pawpads': '🐾',
        'hindpaw': '🐾',
        'fur': 'πŸ§₯',
        'horse': '🐴',
        'ejaculation': 'πŸ’¦',
        'cum': 'πŸ’¦',
        'love': '❀️',
        'smaller': 'πŸ”½',
        'bigger': 'πŸ”Ό',
        'larger': 'πŸ”Ό',
        'cat': '😺',
        'cats': '😺',
        'dogs': '🐢',
        'sun': 'β˜€οΈ',
        'moon': 'πŸŒ™',
        'star': '⭐',
        'happy': '😊',
        'sad': '😒',
        'angry': '😠',
        'food': 'πŸ”',
        'heart': '❀️',
        'fire': 'πŸ”₯',
        'hot': 'πŸ”₯',
        'cold': '❄️',
        'snow': '❄️',
        'rain': '🌧️',
        'smile': '😊',
        'laugh': 'πŸ˜‚',
        'cry': '😒',
    }
    # Tokenize the (lowercased) input text
    tokens = word_tokenize(text.lower())
    # Store found emojis with their explanations
    found_emojis = []
    explanations = []

    def is_emoji_usable(emoji):
        """Return True if neither the emoji nor any variation was used yet."""
        # Find the base emoji for this character, if it belongs to a group.
        base_emoji = None
        for base, variations in emoji_variations.items():
            if emoji in variations:
                base_emoji = base
                break
        if base_emoji:
            # Reject if ANY variation of the base has already been emitted.
            return not any(variation in used_emojis for variation in emoji_variations[base_emoji])
        return emoji not in used_emojis

    # Process each token
    for token in tokens:
        # Skip excluded words and anything containing numbers
        if token in excluded_words or number_pattern.match(token):
            continue
        # Manual overrides take precedence over the database index.
        if token in custom_mappings:
            emoji = custom_mappings[token]
            if emoji not in excluded_emojis and is_emoji_usable(emoji):
                found_emojis.append(emoji)
                used_emojis.add(emoji)
                explanations.append(f"'{token}' β†’ {emoji} (custom mapping)")
            continue
        # Then check the emoji-database mapping: take the first candidate
        # that is neither excluded nor already used.
        if token in emoji_map:
            for emoji in emoji_map[token]:
                if emoji not in excluded_emojis and is_emoji_usable(emoji):
                    found_emojis.append(emoji)
                    used_emojis.add(emoji)
                    explanations.append(f"'{token}' β†’ {emoji} (from emoji database)")
                    break
            # Tokens whose every candidate is excluded/used are silently
            # skipped (the original's debug output here was dead code).
        else:
            explanations.append(f"'{token}' β†’ (no matching emoji found)")
    # Return emojis and explanations
    return ' '.join(found_emojis) if found_emojis else '', explanations
def process_file(file_path):
    """Convert one text file into sibling ``.emoji`` / ``.emoji.explain`` files.

    Reads *file_path* (a ``pathlib.Path``) as UTF-8, runs it through
    ``text_to_emojis`` and writes the emoji string plus the per-token
    explanations next to the original file.  Errors are reported to
    stdout rather than raised, so a batch run continues past bad files.
    """
    try:
        contents = file_path.read_text(encoding='utf-8')
        emojis, explanations = text_to_emojis(contents)
        # Derive the two output paths from the input filename.
        emoji_file = file_path.with_suffix('.emoji')
        explanation_file = file_path.with_suffix('.emoji.explain')
        emoji_file.write_text(emojis, encoding='utf-8')
        explanation_file.write_text('\n'.join(explanations), encoding='utf-8')
        print(f"Processed: {file_path} β†’ {emoji_file} and {explanation_file}")
    except Exception as e:
        # Best-effort batch processing: report and move on.
        print(f"Error processing {file_path}: {str(e)}")
def main():
    """CLI entry point: emoji-convert every ``.txt`` file under a directory."""
    parser = argparse.ArgumentParser(description='Convert text files to emoji representations')
    parser.add_argument('directory', nargs='?', default='.',
                        help='Directory to process (default: current directory)')
    args = parser.parse_args()

    # Resolve the requested directory to an absolute path.
    root = Path(args.directory).resolve()
    if not root.exists():
        print(f"Error: Directory '{root}' does not exist")
        return

    # Collect every .txt file in the tree (recursive).
    targets = list(root.rglob('*.txt'))
    if not targets:
        print(f"No .txt files found in {root}")
        return

    print(f"Found {len(targets)} .txt files to process")
    for target in targets:
        process_file(target)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()