# Source: HuggingFace "toolkit / txt2emoji" by k4d3, commit 3f04019
# (page chrome from the original scrape removed so the file parses).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This script converts text to related emojis, injecting context-relevant visual
cues into datasets. It enables the enrichment of text data with corresponding
emojis, enhancing its expressiveness and engagement.
Before running this script, you need to download the required resources. Just open
up `python` and type:
import nltk
nltk.download('punkt_tab')
"""
import nltk
from nltk.tokenize import word_tokenize
from emoji import EMOJI_DATA
import argparse
from pathlib import Path
import re
# Download required NLTK data (only needed once)
# NOTE(review): the module docstring tells users to fetch 'punkt_tab'
# (required by newer NLTK releases), but this call fetches 'punkt' —
# confirm which resource the installed NLTK version actually needs.
nltk.download('punkt', quiet=True)
def get_emoji_mapping():
    """Build word->emoji lookup tables from the ``emoji`` package database.

    Returns:
        tuple: ``(emoji_map, emoji_variations)`` where

        * ``emoji_map`` maps each lowercase word taken from an emoji's
          English description to the list of emoji characters whose
          description contains that word.
        * ``emoji_variations`` maps a base emoji to the set of its
          variants.  NOTE(review): current ``emoji`` releases expose no
          ``'base'`` field in ``EMOJI_DATA``, so each group is expected
          to be a singleton {emoji: {emoji}} — confirm against the
          installed package version.
    """
    emoji_map = {}
    emoji_variations = {}
    for emoji_char, data in EMOJI_DATA.items():
        # Fall back to the emoji itself when no 'base' key exists.
        base_emoji = data.get('base', emoji_char)
        emoji_variations.setdefault(base_emoji, set()).add(emoji_char)
        if 'en' in data:  # If emoji has an English description
            # Descriptions look like ':grinning_face:'.  Strip the
            # surrounding colons BEFORE splitting; otherwise the first and
            # last words keep a ':' (e.g. ':grinning', 'face:') and can
            # never match a tokenized input word (bug fix).
            words = data['en'].strip(':').lower().replace('_', ' ').split()
            for word in words:
                emoji_map.setdefault(word, []).append(emoji_char)
    return emoji_map, emoji_variations
def text_to_emojis(text):
    """Convert *text* into a string of related emojis.

    Each word token is matched first against a table of manual overrides
    (``custom_mappings``), then against the word->emoji index built from
    the ``emoji`` package.  Every emoji (together with its variations) is
    emitted at most once per call; stop-words and any token containing a
    digit are skipped.

    Args:
        text (str): Arbitrary input text.

    Returns:
        tuple: ``(emojis, explanations)`` — a space-separated emoji string
        (``''`` when nothing matched) and a list of per-token explanation
        strings describing each mapping decision.
    """
    # Create emoji mapping and variations.  Rebuilt on every call; cheap
    # enough for a per-file batch tool.
    emoji_map, emoji_variations = get_emoji_mapping()
    # Regex pattern to match any token containing numbers
    number_pattern = re.compile(r'.*\d+.*')
    # Emojis to exclude (visually noisy geometric shapes and similar)
    excluded_emojis = {
        'πŸ”Ά', 'β­•', 'πŸ”·', 'πŸ”Ή', 'πŸ”Έ', 'πŸ”Ί', 'πŸ”»', 'πŸ”΄',
        'πŸ”΅', 'πŸ”Ό', 'πŸ”Ύ', 'πŸ‡΅πŸ‡¬', 'πŸ€„', 'πŸ”²', 'βœ…',
    }
    # Words to exclude from emoji conversion (stop words, punctuation and
    # domain-specific tags that should never produce an emoji).
    # Bug fix: the original had `'pussy' 'penetrated'` with a missing
    # comma, which Python concatenates to the single (useless) entry
    # 'pussypenetrated', so NEITHER word was actually excluded.
    excluded_words = {
        '(', ')', 'purple', 'abdominal', 'penetration', 'feral', 'body',
        'nude', 'anthro', 'big', 'small', 'the', 'a', 'an', 'and', 'or',
        'but', 'if', 'then', 'because', 'as', 'until', 'while', ',',
        'hi', 'res', 'pussy', 'penetrated', 'equine', 'felid', 'feline',
        'equid', 'genital', 'genitals', 'penetrating', 'medial', 'ring',
        'inside', 'duo', 'solo', 'in', 'hair', 'andromorph', 'from',
        'behind', 'position', 'pantherine', 'animal', 'brown', 'sub',
        'dom', 'explicit', 'black', 'bulge', 'dominant', 'kousen',
        'rendan', 'genitalia', 'tan', 'simple', 'media', 'vaginal',
        'red', 'pecs', 'navel', 'background', 'pubes',
    }
    # Track used emojis so each (and its variations) appears only once.
    used_emojis = set()
    # Additional manual mappings for common words.  These take precedence
    # over the emoji-database index.  (A duplicate 'dog' entry with the
    # identical value was removed — dict literals keep only the last one
    # anyway.)
    # NOTE(review): 'bigger'/'larger' map to 'πŸ”Ό', which is also listed in
    # excluded_emojis, so those two mappings can never fire — confirm
    # whether that is intended.
    custom_mappings = {
        'markings': '🏷️',
        'sweat': 'πŸ’§',
        'toes': 'πŸ‘£',
        'teeth': '🦷',
        'fingering': 'πŸ‘‰',
        'blush': '😊',
        'male': '♂️',
        'tiger': '🐯',
        'fluids': 'πŸ’§',
        'wolf': '🐺',
        'dog': '🐢',
        'female': '♀️',
        'intersex': '⚧️',
        'muscular': 'πŸ’ͺ',
        'wheelbarrow': '🚜',
        'sex': 'πŸ’‘',
        'size': 'πŸ“',
        'difference': 'πŸ”’',
        'penis': 'πŸ”±',
        'paws': '🐾',
        'pawpads': '🐾',
        'hindpaw': '🐾',
        'fur': 'πŸ§₯',
        'horse': '🐴',
        'ejaculation': 'πŸ’¦',
        'cum': 'πŸ’¦',
        'love': '❀️',
        'smaller': 'πŸ”½',
        'bigger': 'πŸ”Ό',
        'larger': 'πŸ”Ό',
        'cat': '😺',
        'cats': '😺',
        'dogs': '🐢',
        'sun': 'β˜€οΈ',
        'moon': 'πŸŒ™',
        'star': '⭐',
        'happy': '😊',
        'sad': '😒',
        'angry': '😠',
        'food': 'πŸ”',
        'heart': '❀️',
        'fire': 'πŸ”₯',
        'hot': 'πŸ”₯',
        'cold': '❄️',
        'snow': '❄️',
        'rain': '🌧️',
        'smile': '😊',
        'laugh': 'πŸ˜‚',
        'cry': '😒',
    }
    # Tokenize the (lowercased) input text
    tokens = word_tokenize(text.lower())
    # Store found emojis with their explanations
    found_emojis = []
    explanations = []

    def is_emoji_usable(emoji):
        """Return True if neither the emoji nor any variation was used yet."""
        # Find the base emoji for this character, if it belongs to a group.
        base_emoji = None
        for base, variations in emoji_variations.items():
            if emoji in variations:
                base_emoji = base
                break
        if base_emoji:
            # Reject if ANY variation of the base has already been emitted.
            return not any(variation in used_emojis for variation in emoji_variations[base_emoji])
        return emoji not in used_emojis

    # Process each token
    for token in tokens:
        # Skip excluded words and anything containing numbers
        if token in excluded_words or number_pattern.match(token):
            continue
        # Manual overrides take precedence over the database index.
        if token in custom_mappings:
            emoji = custom_mappings[token]
            if emoji not in excluded_emojis and is_emoji_usable(emoji):
                found_emojis.append(emoji)
                used_emojis.add(emoji)
                explanations.append(f"'{token}' β†’ {emoji} (custom mapping)")
            continue
        # Then check the emoji-database mapping: take the first candidate
        # that is neither excluded nor already used.
        if token in emoji_map:
            for emoji in emoji_map[token]:
                if emoji not in excluded_emojis and is_emoji_usable(emoji):
                    found_emojis.append(emoji)
                    used_emojis.add(emoji)
                    explanations.append(f"'{token}' β†’ {emoji} (from emoji database)")
                    break
            # Tokens whose every candidate is excluded/used are silently
            # skipped (the original's debug output here was dead code).
        else:
            explanations.append(f"'{token}' β†’ (no matching emoji found)")
    # Return emojis and explanations
    return ' '.join(found_emojis) if found_emojis else '', explanations
def process_file(file_path):
    """Convert one text file into sibling ``.emoji`` / ``.emoji.explain`` files.

    Reads *file_path* (a ``pathlib.Path``) as UTF-8, runs it through
    ``text_to_emojis`` and writes the emoji string plus the per-token
    explanations next to the original file.  Errors are reported to
    stdout rather than raised, so a batch run continues past bad files.
    """
    try:
        contents = file_path.read_text(encoding='utf-8')
        emojis, explanations = text_to_emojis(contents)
        # Derive the two output paths from the input filename.
        emoji_file = file_path.with_suffix('.emoji')
        explanation_file = file_path.with_suffix('.emoji.explain')
        emoji_file.write_text(emojis, encoding='utf-8')
        explanation_file.write_text('\n'.join(explanations), encoding='utf-8')
        print(f"Processed: {file_path} β†’ {emoji_file} and {explanation_file}")
    except Exception as e:
        # Best-effort batch processing: report and move on.
        print(f"Error processing {file_path}: {str(e)}")
def main():
    """CLI entry point: emoji-convert every ``.txt`` file under a directory."""
    parser = argparse.ArgumentParser(description='Convert text files to emoji representations')
    parser.add_argument('directory', nargs='?', default='.',
                        help='Directory to process (default: current directory)')
    args = parser.parse_args()

    # Resolve the requested directory to an absolute path.
    root = Path(args.directory).resolve()
    if not root.exists():
        print(f"Error: Directory '{root}' does not exist")
        return

    # Collect every .txt file in the tree (recursive).
    targets = list(root.rglob('*.txt'))
    if not targets:
        print(f"No .txt files found in {root}")
        return

    print(f"Found {len(targets)} .txt files to process")
    for target in targets:
        process_file(target)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()