|
|
|
|
|
|
|
""" |
|
This script converts text to related emojis, injecting context-relevant visual |
|
cues into datasets. It enables the enrichment of text data with corresponding |
|
emojis, enhancing its expressiveness and engagement. |
|
|
|
Before running this script, you need to download the required resources. Just open |
|
up `python` and type: |
|
|
|
import nltk |
|
nltk.download('punkt_tab') |
|
""" |
|
|
|
import nltk |
|
from nltk.tokenize import word_tokenize |
|
from emoji import EMOJI_DATA |
|
import argparse |
|
from pathlib import Path |
|
import re |
|
|
|
|
|
# Tokenizer models needed by nltk.word_tokenize. The module docstring asks
# users to fetch 'punkt_tab' because newer NLTK releases look it up instead
# of 'punkt'; download both quietly so either NLTK version works offline-safe.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)
|
|
|
def get_emoji_mapping():
    """Create a mapping of words to emojis.

    Builds two lookup tables from the emoji package's EMOJI_DATA:

    Returns:
        tuple: ``(emoji_map, emoji_variations)`` where ``emoji_map`` maps
        each lowercase word from an emoji's English name to the list of
        emoji characters whose name contains that word, and
        ``emoji_variations`` maps each base emoji to the set of all its
        variant characters (skin tones, styles, the base itself).
    """
    word_to_emojis = {}
    variations_by_base = {}

    for symbol, info in EMOJI_DATA.items():
        # Group every character under its base emoji; characters without
        # an explicit base are their own base.
        variations_by_base.setdefault(info.get('base', symbol), set()).add(symbol)

        # Index the emoji under every word of its English name.
        if 'en' in info:
            for piece in info['en'].lower().replace('_', ' ').split():
                word_to_emojis.setdefault(piece, []).append(symbol)

    return word_to_emojis, variations_by_base
|
|
|
def text_to_emojis(text): |
|
"""Convert text to related emojis.""" |
|
|
|
emoji_map, emoji_variations = get_emoji_mapping() |
|
|
|
|
|
number_pattern = re.compile(r'.*\d+.*') |
|
|
|
|
|
excluded_emojis = { |
|
'πΆ', |
|
'β', |
|
'π·', |
|
'πΉ', |
|
'πΈ', |
|
'πΊ', |
|
'π»', |
|
'π΄', |
|
'π΅', |
|
'πΌ', |
|
'πΎ', |
|
'π΅π¬', |
|
'π', |
|
'π²', |
|
'β
' |
|
} |
|
|
|
|
|
excluded_words = { |
|
'(', |
|
')', |
|
'purple', |
|
'abdominal', |
|
'penetration', |
|
'feral', |
|
'body', |
|
'nude', |
|
'anthro', |
|
'big', |
|
'small', |
|
'the', |
|
'a', |
|
'an', |
|
'and', |
|
'or', |
|
'but', |
|
'if', |
|
'then', |
|
'because', |
|
'as', |
|
'until', |
|
'while', |
|
',', |
|
'hi', |
|
'res', |
|
'pussy' |
|
'penetrated', |
|
'equine', |
|
'felid', |
|
'feline', |
|
'equid', |
|
'genital', |
|
'genitals', |
|
'penetrating', |
|
'medial', |
|
'ring', |
|
'inside', |
|
'duo', |
|
'solo', |
|
'in', |
|
'hair', |
|
'andromorph', |
|
'from', |
|
'behind', |
|
'position', |
|
'pantherine', |
|
'animal', |
|
'brown', |
|
'sub', |
|
'dom', |
|
'explicit', |
|
'black', |
|
'bulge', |
|
'dominant', |
|
'kousen', |
|
'rendan', |
|
'genitalia', |
|
'tan', |
|
'simple', |
|
'media', |
|
'vaginal', |
|
'red', |
|
'pecs', |
|
'navel', |
|
'background', |
|
'pubes', |
|
} |
|
|
|
|
|
used_emojis = set() |
|
|
|
|
|
custom_mappings = { |
|
'markings': 'π·οΈ', |
|
'sweat': 'π§', |
|
'toes': 'π£', |
|
'teeth': 'π¦·', |
|
'fingering': 'π', |
|
'blush': 'π', |
|
'male': 'βοΈ', |
|
'tiger': 'π―', |
|
'fluids': 'π§', |
|
'wolf': 'πΊ', |
|
'dog': 'πΆ', |
|
'female': 'βοΈ', |
|
'intersex': 'β§οΈ', |
|
'muscular': 'πͺ', |
|
'wheelbarrow': 'π', |
|
'sex': 'π', |
|
'size': 'π', |
|
'difference': 'π’', |
|
'penis': 'π±', |
|
'paws': 'πΎ', |
|
'pawpads': 'πΎ', |
|
'hindpaw': 'πΎ', |
|
'fur': 'π§₯', |
|
'horse': 'π΄', |
|
|
|
'ejaculation': 'π¦', |
|
'cum': 'π¦', |
|
'love': 'β€οΈ', |
|
'smaller': 'π½', |
|
'bigger': 'πΌ', |
|
'larger': 'πΌ', |
|
'cat': 'πΊ', |
|
'cats': 'πΊ', |
|
'dog': 'πΆ', |
|
'dogs': 'πΆ', |
|
'sun': 'βοΈ', |
|
'moon': 'π', |
|
'star': 'β', |
|
'happy': 'π', |
|
'sad': 'π’', |
|
'angry': 'π ', |
|
'food': 'π', |
|
'heart': 'β€οΈ', |
|
'fire': 'π₯', |
|
'hot': 'π₯', |
|
'cold': 'βοΈ', |
|
'snow': 'βοΈ', |
|
'rain': 'π§οΈ', |
|
'smile': 'π', |
|
'laugh': 'π', |
|
'cry': 'π’', |
|
} |
|
|
|
|
|
tokens = word_tokenize(text.lower()) |
|
|
|
|
|
found_emojis = [] |
|
explanations = [] |
|
|
|
def is_emoji_usable(emoji): |
|
"""Check if emoji or any of its variations have been used.""" |
|
|
|
base_emoji = None |
|
for base, variations in emoji_variations.items(): |
|
if emoji in variations: |
|
base_emoji = base |
|
break |
|
|
|
if base_emoji: |
|
|
|
return not any(variation in used_emojis for variation in emoji_variations[base_emoji]) |
|
return emoji not in used_emojis |
|
|
|
|
|
for token in tokens: |
|
|
|
if token in excluded_words or number_pattern.match(token): |
|
continue |
|
|
|
|
|
if token in custom_mappings: |
|
emoji = custom_mappings[token] |
|
if emoji not in excluded_emojis and is_emoji_usable(emoji): |
|
found_emojis.append(emoji) |
|
used_emojis.add(emoji) |
|
explanations.append(f"'{token}' β {emoji} (custom mapping)") |
|
|
|
|
|
continue |
|
|
|
|
|
if token in emoji_map: |
|
found_match = False |
|
for emoji in emoji_map[token]: |
|
if emoji not in excluded_emojis and is_emoji_usable(emoji): |
|
found_emojis.append(emoji) |
|
used_emojis.add(emoji) |
|
explanations.append(f"'{token}' β {emoji} (from emoji database)") |
|
found_match = True |
|
break |
|
if not found_match: |
|
available_emojis = [e for e in emoji_map[token] if e not in excluded_emojis] |
|
|
|
|
|
|
|
|
|
else: |
|
explanations.append(f"'{token}' β (no matching emoji found)") |
|
|
|
|
|
return ' '.join(found_emojis) if found_emojis else '', explanations |
|
|
|
def process_file(file_path):
    """Process a single text file and create corresponding emoji and explanation files.

    Reads ``file_path``, converts its contents with :func:`text_to_emojis`,
    and writes ``<name>.emoji`` (the emoji string) and
    ``<name>.emoji.explain`` (one explanation per line) beside it.
    Errors are reported on stdout rather than raised, so a bad file does
    not abort a batch run.
    """
    try:
        source_text = file_path.read_text(encoding='utf-8')

        emojis, explanations = text_to_emojis(source_text)

        emoji_file = file_path.with_suffix('.emoji')
        explanation_file = file_path.with_suffix('.emoji.explain')

        emoji_file.write_text(emojis, encoding='utf-8')
        explanation_file.write_text('\n'.join(explanations), encoding='utf-8')

        print(f"Processed: {file_path} β {emoji_file} and {explanation_file}")
    except Exception as e:
        # Best-effort batch processing: report and move on.
        print(f"Error processing {file_path}: {str(e)}")
|
|
|
def main():
    """Entry point: find every .txt file under a directory and emoji-fy it."""
    arg_parser = argparse.ArgumentParser(description='Convert text files to emoji representations')
    arg_parser.add_argument('directory', nargs='?', default='.',
                            help='Directory to process (default: current directory)')
    options = arg_parser.parse_args()

    root = Path(options.directory).resolve()

    # Guard clause: bail out early on a nonexistent directory.
    if not root.exists():
        print(f"Error: Directory '{root}' does not exist")
        return

    # Recursive search for candidate files.
    targets = list(root.rglob('*.txt'))
    if not targets:
        print(f"No .txt files found in {root}")
        return

    print(f"Found {len(targets)} .txt files to process")

    for txt_path in targets:
        process_file(txt_path)
|
|
|
# Run only when executed as a script, so the module can be imported
# (e.g. to reuse text_to_emojis) without touching the filesystem.
if __name__ == "__main__":
    main()
|
|