In [None]:
# Install the third-party libraries this notebook imports below
# (transformers, pandas, datasets, accelerate). Versions are not pinned here.
!pip install transformers pandas datasets accelerate

In [None]:
# Additional installations for PyTorch and CUDA
!pip install torch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia

In [None]:
import re
import pandas as pd
import torch
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
from accelerate import Accelerator

In [None]:
# Text cleaning functions
def fix_text(text):
    """Decode the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in ``text``.

    Args:
        text: String that may contain the three escaped entities.

    Returns:
        The string with exactly one level of escaping removed.
    """
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    # '&amp;' is decoded last: decoding it first would turn an escaped
    # entity such as '&amp;lt;' into '&lt;' and then wrongly into '<'.
    text = text.replace('&amp;', '&')
    return text

def clean_tweet(tweet, allow_new_lines=False):
    """Strip ``http:``/``https:`` links from a tweet and normalise its spacing.

    Args:
        tweet: Raw tweet text.
        allow_new_lines: When False (default) every run of whitespace,
            including line breaks, is collapsed to a single space. When True
            only runs of spaces are collapsed and line breaks are kept.

    Returns:
        The cleaned tweet with leading/trailing whitespace removed.
    """
    for scheme in ('http:', 'https:'):
        # Order matters: the more specific patterns (with the surrounding
        # space or newline) run before the bare catch-all pattern.
        url_patterns = (
            f" {scheme}\\S+",    # link preceded by a space
            f"{scheme}\\S+ ",    # link followed by a space (e.g. at the start)
            f"\n{scheme}\\S+ ",  # link at the start of a new line
            f"\n{scheme}\\S+",   # link alone on a new line
            f"{scheme}\\S+",     # anything left over
        )
        for pattern in url_patterns:
            tweet = re.sub(pattern, "", tweet)

    # Collapse runs of spaces left behind by the removals.
    tweet = re.sub(' +', ' ', tweet)
    if allow_new_lines:
        return tweet.strip()
    return ' '.join(tweet.split()).strip()

def boring_tweet(tweet):
    """Tell whether a tweet has fewer than three "interesting" words.

    A word counts as interesting when its lower-cased form contains none of
    the substrings ``'http'``, ``'@'`` or ``'#'``.

    Returns:
        True if fewer than three such words are found, otherwise False.
    """
    markers = ('http', '@', '#')
    interesting = 0
    for word in tweet.split():
        lowered = word.lower()
        if not any(marker in lowered for marker in markers):
            interesting += 1
    return interesting < 3

In [None]:
# Load and filter the dataset for a specified party
def load_and_filter_data(party, csv_path='/kaggle/input/curated-tweets/curated_tweets.csv'):
    """Load the curated tweets CSV and keep the tweet text of one party.

    Args:
        party: Value to match exactly against the ``Partei`` column.
        csv_path: Location of the CSV file; it must provide the columns
            ``Partei`` and ``text``. Defaults to the Kaggle input path.

    Returns:
        A DataFrame with a single string column ``text`` holding the tweets
        of ``party``. It is empty when no row matches.
    """
    curated_tweets = pd.read_csv(csv_path)
    party_rows = curated_tweets.loc[curated_tweets['Partei'] == party, ['text']]
    # Drop rows without text before casting: astype(str) would otherwise turn
    # missing values into the literal string 'nan' and feed it to training.
    return party_rows.dropna(subset=['text']).astype(str)

In [None]:
# Initialize tokenizer
def initialize_tokenizer():
    """Load the pretrained GPT-2 tokenizer with explicit special tokens.

    Empty-string special tokens leave the tokenizer without a usable padding
    token or sequence delimiters, so distinct markers are registered for
    beginning-of-sequence, end-of-sequence and padding.

    Returns:
        The configured ``GPT2Tokenizer``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(
        'gpt2',
        bos_token='<|startoftext|>',
        eos_token='<|endoftext|>',
        pad_token='<|pad|>',
    )
    return tokenizer

In [None]:
# Prepare dataset for training
def prepare_dataset(data, tokenizer, train_size=0.8, seed=38):
    """Wrap, tokenize and split the tweets into train and test sets.

    Args:
        data: DataFrame with a string column ``text``.
        tokenizer: Tokenizer used to encode each example; its ``bos_token``
            and ``eos_token`` (when set) delimit every training example.
        train_size: Fraction of examples placed in the train split.
        seed: Seed for the shuffle performed by the split, so repeated runs
            produce the same partition. The default mirrors the training seed
            used elsewhere in this notebook.

    Returns:
        A ``DatasetDict`` with ``train`` and ``test`` splits of tokenized
        examples.
    """
    # Take the delimiters from the tokenizer rather than hard-coding them so
    # the text always matches the special tokens the tokenizer knows about.
    bos = tokenizer.bos_token or ''
    eos = tokenizer.eos_token or ''
    training_examples = (bos + ' ') + data['text'] + eos
    task_df = pd.DataFrame({'text': training_examples})
    tweet_data = Dataset.from_pandas(task_df)

    def preprocess(example):
        # Truncate to the tokenizer's maximum length; padding is left to the
        # data collator at batch time.
        return tokenizer(example['text'], truncation=True)

    tweet_data = tweet_data.map(preprocess, batched=False)
    return tweet_data.train_test_split(train_size=train_size, seed=seed)

In [None]:
# Initialize model and related components
def initialize_model_and_components(tokenizer):
    """Create the GPT-2 language model and the batch collator for training.

    The pretrained ``gpt2`` weights are loaded and the token embedding matrix
    is resized to ``len(tokenizer)`` so that any tokens added to the
    tokenizer have an embedding row.

    Args:
        tokenizer: Tokenizer whose vocabulary size the model must match.

    Returns:
        A ``(model, data_collator)`` tuple; the collator is configured with
        ``mlm=False``.
    """
    gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
    vocab_size = len(tokenizer)
    gpt2.resize_token_embeddings(vocab_size)
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    return gpt2, collator

In [None]:
# Set training arguments
def set_training_arguments(output_dir="/kaggle/working/tweets"):
    """Build the ``TrainingArguments`` used for fine-tuning.

    Args:
        output_dir: Directory where checkpoints are written. Defaults to the
            Kaggle working directory used by this notebook.

    Returns:
        A ``TrainingArguments`` instance configured for 3 epochs with
        per-epoch evaluation and checkpointing.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=6,
        per_device_eval_batch_size=6,
        # Requires the evaluation and save strategies below to match.
        load_best_model_at_end=True,
        log_level='info',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-4,
        # A step count is an integer; the float literal 1e2 was used before.
        warmup_steps=100,
        seed=38,
        report_to="none",
    )
    return training_args

In [None]:
# Train and evaluate the model
def train_and_evaluate_model(model, training_args, tweet_data, data_collator):
    """Fine-tune ``model`` on the train split and evaluate it on the test split.

    The optimizer and learning-rate schedule are created by ``Trainer`` from
    ``training_args`` (learning rate, warmup steps, epochs, batch size). A
    hand-built scheduler that counts training *examples* instead of optimizer
    steps stretches the linear decay by roughly the batch size, so the
    learning rate never decays as configured; letting ``Trainer`` derive the
    step count avoids that and keeps a single source of truth for the
    hyper-parameters.

    Args:
        model: Model to fine-tune; it is updated in place.
        training_args: ``TrainingArguments`` controlling the run.
        tweet_data: Mapping with ``"train"`` and ``"test"`` datasets.
        data_collator: Collator that turns examples into batches.

    Returns:
        The metrics dictionary produced by ``trainer.evaluate()``.
    """
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tweet_data["train"],
        eval_dataset=tweet_data["test"],
        data_collator=data_collator,
    )
    trainer.train()
    return trainer.evaluate()

In [None]:
# Generate text using the fine-tuned model
def generate_text(model, tokenizer, prompt):
    """Sample ten continuations of ``prompt`` and print them.

    Args:
        model: Model exposing ``parameters()``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``encode()`` and ``decode()``.
        prompt: Text the generated samples start from.

    Returns:
        The list of decoded samples, in the order they were printed.
    """
    # Put the prompt on whatever device the model lives on instead of
    # assuming CUDA, so this also works on CPU-only hosts.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    model.eval()
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

    sample_outputs = model.generate(
        generated,
        do_sample=True,
        top_k=20,
        max_length=70,
        top_p=0.98,
        num_return_sequences=10,
        temperature=0.95
    )

    texts = []
    for i, sample_output in enumerate(sample_outputs):
        text = tokenizer.decode(sample_output, skip_special_tokens=True)
        print("{}: {}\n\n".format(i, text))
        texts.append(text)
    return texts

In [None]:
# Save the fine-tuned model
def save_model(model, tokenizer, output_dir):
    """Write the model and then the tokenizer to ``output_dir``.

    Both objects are persisted through their own ``save_pretrained`` method,
    so they end up side by side in the same directory.
    """
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)

In [None]:
# Main function to execute the workflow
def main(party, prompt="Die Deutsche Kultur"):
    """Run the full workflow for one party: load, train, sample, save.

    Args:
        party: Party name used to filter the tweets and to name the output
            directory.
        prompt: Text used to generate example tweets after training.
    """
    data = load_and_filter_data(party)
    tokenizer = initialize_tokenizer()
    tweet_data = prepare_dataset(data, tokenizer)
    model, data_collator = initialize_model_and_components(tokenizer)
    training_args = set_training_arguments()
    train_and_evaluate_model(model, training_args, tweet_data, data_collator)

    # Generate some example text
    generate_text(model, tokenizer, prompt)

    # Save the model. Party names may contain a path separator (e.g.
    # "Bündnis 90/Die Grünen"), which would silently create a nested
    # directory; replace separators so each party maps to one folder.
    safe_party = party.replace('/', '_').replace('\\', '_')
    save_model(model, tokenizer, "/kaggle/working/{}_gpt2-finetuned".format(safe_party))

In [None]:
# Pick the party whose tweets the model is fine-tuned on, then run the whole
# workflow defined in main(): load data, train, print samples, save the model.
party = "Die Linke"  # Parties available for training: AfD, FDP, Fraktionslos, SPD, Bündnis 90/Die Grünen, CDU, CSU, Die Linke
main(party)