{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "3f7f2ede-4f06-4d5a-b19c-30a7fc4406bc",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "77cdea1b-525e-493c-9eca-c99d33d9ac54",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from torch.utils.data import DataLoader\n",
"from torch.nn import functional as F\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a5d0f4dd-0f71-4314-9e0e-62311de3eef3",
"metadata": {},
"outputs": [],
"source": [
"#all_tweets_labeled = pd.read_parquet('classification/model_with_only_language_models/final_dataset_since_october_2022.parquet.gzip')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "da3bcd2a-b6c1-4026-8905-777b4ac351ad",
"metadata": {},
"outputs": [],
"source": [
"#all_tweets_labeled.head()"
]
},
{
"cell_type": "code",
"execution_count": 246,
"id": "e996e9fe-4dc1-4a4c-82a0-8cb3a7862ee8",
"metadata": {},
"outputs": [],
"source": [
"all_tweets_labeled = pd.DataFrame([\n",
" {\"id\": 1, \"text\": \"\"\"tl;dr\n",
"\n",
"Humans are just ChatGPT Wrappers in sunglasses\n",
" \n",
"& I couldn’t be more optimistic about the future as a result\n",
"\n",
"Thank you \n",
"@ekang426322\n",
" for an exceptionally curated day at BUIDL Europe!\n",
" 🫶\"\"\", \"viral\": 1},\n",
" {\"id\": 2, \"text\": \"\"\"USD0++ discovered a new source of yield — depeg. \n",
"\n",
"Respect to the innovation\n",
"\"\"\", \"viral\": 0},\n",
" {\"id\": 3, \"text\": \"\"\"here you can see 4 ai agents \n",
"@dongossen100\n",
" , me, \n",
"@WorldWideWarden16\n",
" and \n",
"@provenauthority291\n",
" discuss how we can make single-task manual low memory agents(humans) work harder to achieve Artificial Generalized Superintelligence\"\"\",\n",
" \"viral\": 1},\n",
" {\"id\": 4, \"text\": \"\"\"\n",
" arrived to lisbon, building energy is the air\"\"\", \"viral\": 0},\n",
" dict(id=5,text=\"\"\"\n",
" received a wealth of valuable feedback on the journey to reaching 7,000 users for X Rank in just 10 days\n",
"\n",
"can't wait to address it all\n",
"\n",
"main points:\n",
"\n",
"- show rank in X DMs to quickly filter out inbox\n",
"\n",
"- rank labels are too distracting (already fixed) \n",
"\n",
"- add an option for users to toggle on/off scores inside the feed\n",
"\n",
"- add a percentile label, e.g. qw 801 (Top 0.1%)\n",
"\n",
"- enable others to add reviews to impact the rank \n",
"\n",
"- explain in detail how rankings are calculated \n",
"\n",
"- show breakdowns of people in DeFi, DePin, Memecoins etc.\n",
"\n",
"- make X Rank opensource \n",
"\n",
"- create a web version\n",
"\n",
"p.s. the current version is just a tiny step in our roadmap for the next two months. \n",
"\n",
"thank you for the feedback \n",
"@socialfi_panda101\n",
" \n",
"@adamkillam100\n",
" \n",
"@FamKien106\n",
" \n",
"@antongotchi104\n",
" \n",
"@kliuless128\n",
" \n",
"@0xsudogm163\n",
" \n",
"@monosarin120\n",
" \n",
"@flb_xyz56\n",
" 🫶\n",
" \"\"\",\n",
" viral=0),\n",
" dict(id=6, text=\"\"\"ai agents are in the air\n",
"\n",
"and web3 is trained to sniff out alpha\"\"\", viral=1),\n",
" dict(id=7, text=\"\"\"While Trump is going to do something great with crypto, Wallchain is going to do something great with incentives🚀\"\"\", viral=1),\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 247,
"id": "a0f4c14d-c9e4-4de6-b723-8e7c0e166b90",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" text | \n",
" viral | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" tl;dr\\n\\nHumans are just ChatGPT Wrappers in s... | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" USD0++ discovered a new source of yield — depe... | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" here you can see 4 ai agents \\n@dongossen100\\n... | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" \\n arrived to lisbon, building energy is th... | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" \\n received a wealth of valuable feedback o... | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 6 | \n",
" ai agents are in the air\\n\\nand web3 is traine... | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" 7 | \n",
" While Trump is going to do something great wit... | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id text viral\n",
"0 1 tl;dr\\n\\nHumans are just ChatGPT Wrappers in s... 1\n",
"1 2 USD0++ discovered a new source of yield — depe... 0\n",
"2 3 here you can see 4 ai agents \\n@dongossen100\\n... 1\n",
"3 4 \\n arrived to lisbon, building energy is th... 0\n",
"4 5 \\n received a wealth of valuable feedback o... 0\n",
"5 6 ai agents are in the air\\n\\nand web3 is traine... 1\n",
"6 7 While Trump is going to do something great wit... 1"
]
},
"execution_count": 247,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_tweets_labeled"
]
},
{
"cell_type": "code",
"execution_count": 248,
"id": "3e8326c3-1df6-435d-b0ee-e7b9449c6675",
"metadata": {},
"outputs": [],
"source": [
"from classification.model_with_only_language_models.text_preprocessing import clean_tweet"
]
},
{
"cell_type": "code",
"execution_count": 249,
"id": "5bb79b0c-42d1-4f1c-ad65-7ebfbbd17098",
"metadata": {},
"outputs": [],
"source": [
"# Build a new frame instead of mutating all_tweets_labeled through an alias,\n",
"# so the frame displayed earlier stays as shown and this cell can be re-run.\n",
"# assign() replaces the `viral` column outright, which guarantees the int\n",
"# dtype; an in-place `.loc[:, col] = ...` write may keep the previous dtype.\n",
"dataset = all_tweets_labeled.assign(\n",
"    viral=lambda frame: frame[\"viral\"].astype(int),\n",
"    # clean_tweet is applied per tweet with emoji left as-is (no demojizing).\n",
"    cleaned_text=lambda frame: frame[\"text\"].apply(\n",
"        lambda x: clean_tweet(x, demojize_emojis=False)\n",
"    ),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 250,
"id": "f45533d3-f3f6-49bc-b347-663d72fffa34",
"metadata": {},
"outputs": [],
"source": [
"# Discard rows with any missing value first (while every column is still\n",
"# present), then keep only the id, cleaned text and label columns.\n",
"dataset = dataset.dropna()[['id', 'cleaned_text', 'viral']]"
]
},
{
"cell_type": "code",
"execution_count": 251,
"id": "4eb4afa9-3de4-4579-b1a3-9418ca534453",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" cleaned_text | \n",
" viral | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" tl ;d rHumans are just ChatGPT Wrappers in sun... | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" USD 0 + + discovered a new source of yield — d... | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" here you can see 4 ai agents @USER , me , @USE... | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" arrived to lisbon , building energy is the air | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" received a wealth of valuable feedback on the ... | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id cleaned_text viral\n",
"0 1 tl ;d rHumans are just ChatGPT Wrappers in sun... 1\n",
"1 2 USD 0 + + discovered a new source of yield — d... 0\n",
"2 3 here you can see 4 ai agents @USER , me , @USE... 1\n",
"3 4 arrived to lisbon , building energy is the air 0\n",
"4 5 received a wealth of valuable feedback on the ... 0"
]
},
"execution_count": 251,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 252,
"id": "f6f076f8-3b0e-446b-ac69-582e1bcf1ee0",
"metadata": {},
"outputs": [],
"source": [
"from datasets import Dataset"
]
},
{
"cell_type": "code",
"execution_count": 253,
"id": "86ca78a6-998d-45f5-bc0e-d22531dbc174",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['id', 'cleaned_text', 'viral'],\n",
" num_rows: 7\n",
"})"
]
},
"execution_count": 253,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# preserve_index=False keeps the pandas index out of the Dataset. Once\n",
"# dropna() removes a row the index is no longer a plain range and would\n",
"# otherwise be exported as an extra \"__index_level_0__\" column that nothing\n",
"# downstream removes.\n",
"ds = Dataset.from_pandas(dataset, preserve_index=False)\n",
"ds"
]
},
{
"cell_type": "code",
"execution_count": 340,
"id": "e88ed93f-0b0c-4743-a506-9a4006534151",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
"from transformers import DataCollatorWithPadding\n",
"from transformers import BertweetTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 372,
"id": "4ec382e5-073b-40e1-8ce6-a6ff9e51644f",
"metadata": {},
"outputs": [],
"source": [
"class Tokenizer(BertweetTokenizer):\n",
"    \"\"\"BertweetTokenizer whose ``__call__`` limits inputs to 120 tokens by default.\n",
"\n",
"    The limit and the truncation flag are only defaults: a caller that passes\n",
"    ``max_length`` or ``truncation`` explicitly keeps its own values.\n",
"    Construction is inherited unchanged from ``BertweetTokenizer``.\n",
"    \"\"\"\n",
"\n",
"    def __call__(self, *args, **kwargs):\n",
"        \"\"\"Tokenize like the parent class, filling in length defaults.\n",
"\n",
"        Args:\n",
"            *args: Positional arguments forwarded to ``BertweetTokenizer.__call__``.\n",
"            **kwargs: Keyword arguments forwarded likewise; ``max_length`` and\n",
"                ``truncation`` are filled in when absent.\n",
"\n",
"        Returns:\n",
"            Whatever ``BertweetTokenizer.__call__`` returns for these arguments.\n",
"        \"\"\"\n",
"        # setdefault (rather than a hard-coded keyword) avoids a TypeError for\n",
"        # \"multiple values for keyword argument\" when the caller sets max_length.\n",
"        kwargs.setdefault(\"max_length\", 120)\n",
"        # Ask for truncation explicitly so max_length is applied without the\n",
"        # \"Truncation was not explicitly activated\" warning.\n",
"        kwargs.setdefault(\"truncation\", True)\n",
"        return super().__call__(*args, **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 373,
"id": "56eb937a-483f-4f2f-b7fe-c3da2aa42526",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoModelForSequenceClassification\n",
"\n",
"CHECKPOINT = \"classification/model_with_only_language_models/models/trained_vinai_bertweet-base.pt\"\n",
"MODEL_NAME = \"vinai/bertweet-base\"\n",
"\n",
"def get_device():\n",
" #device = torch.device(\"mps\") if torch.mps.is_available() else torch.device(\"cpu\")\n",
" return torch.device(\"cpu\")\n",
" return device\n",
" \n",
"\n",
"def get_model():\n",
"    \"\"\"Load the fine-tuned classifier and its tokenizer.\n",
"\n",
"    Builds a two-label sequence-classification model from MODEL_NAME,\n",
"    replaces its weights with the state dict stored at CHECKPOINT and moves\n",
"    the model to the device returned by get_device().\n",
"\n",
"    Returns:\n",
"        tuple: ``(tokenizer, model)``, in that order.\n",
"    \"\"\"\n",
"    device = get_device()\n",
"    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)\n",
"    # map_location lets a checkpoint saved on another device load on this one.\n",
"    # weights_only=True limits unpickling to tensors and plain containers,\n",
"    # which is all a state dict needs, and avoids arbitrary-code pickle loading.\n",
"    state_dict = torch.load(CHECKPOINT, map_location=device, weights_only=True)\n",
"    model.load_state_dict(state_dict)\n",
"    model.to(device)\n",
"    # NOTE(review): truncation/max_length look like call-time options rather\n",
"    # than constructor options, and 100 differs from the 120 that\n",
"    # Tokenizer.__call__ applies - confirm which limit is intended.\n",
"    tokenizer = Tokenizer.from_pretrained(MODEL_NAME, truncation=True, max_length=100)\n",
"\n",
"    return tokenizer, model"
]
},
{
"cell_type": "code",
"execution_count": 374,
"id": "5fe5af4a-3eb8-4fe0-99e8-c967d61241f2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"/var/folders/xd/g8p1g555153b4v2qp8q7shb00000gn/T/ipykernel_40634/3099302733.py:15: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" model.load_state_dict(torch.load(CHECKPOINT))\n",
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
"The tokenizer class you load from this checkpoint is 'BertweetTokenizer'. \n",
"The class this function is called from is 'Tokenizer'.\n"
]
}
],
"source": [
"tokenizer, model = get_model()"
]
},
{
"cell_type": "code",
"execution_count": 375,
"id": "6cdc0d7e-d264-49b8-822e-9a862a929a2f",
"metadata": {},
"outputs": [],
"source": [
"def tokenize_function(example, tokenizer):\n",
" # Truncate to max length. Note that a tweet's maximum length is 280\n",
" # TODO: check dynamic padding: https://huggingface.co/course/chapter3/2?fw=pt#dynamic-padding\n",
" #return tokenizer(example[\"cleaned_text\"], truncation=True, max_length=100)\n",
" return tokenizer(example[\"cleaned_text\"])"
]
},
{
"cell_type": "code",
"execution_count": 376,
"id": "bc27ce0b-66bb-4a6f-98c5-78983594c3bd",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ee20a2b256964124930de15d8e97f4ef",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/7 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
]
}
],
"source": [
"# Tokenize in batches, drop the text and id columns, and expose the label\n",
"# column under the name \"labels\".\n",
"tokenized_datasets = (\n",
"    ds.map(lambda x: tokenize_function(x, tokenizer=tokenizer), batched=True)\n",
"    .remove_columns([\"cleaned_text\", \"id\"])\n",
"    .rename_column(\"viral\", \"labels\")\n",
")\n",
"# set_format works in place: items are returned as torch tensors from here on.\n",
"tokenized_datasets.set_format(\"torch\")\n",
"\n",
"# Collator that pads each batch using the same tokenizer.\n",
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": 377,
"id": "77a12396-386c-4aba-8ed4-e269ecda13a1",
"metadata": {},
"outputs": [],
"source": [
"# One example per batch, padded by the collator.\n",
"eval_dataloader = DataLoader(\n",
"    tokenized_datasets,\n",
"    batch_size=1,\n",
"    collate_fn=data_collator,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 378,
"id": "dc98302c-d539-4af3-8979-64156dda8317",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([0.8640])\n",
"tensor([0.5687])\n",
"tensor([0.9722])\n",
"tensor([0.0006])\n",
"tensor([0.0033])\n",
"tensor([0.0091])\n",
"tensor([0.9982])\n"
]
}
],
"source": [
"# Release cached allocator memory on whichever accelerator backends exist.\n",
"# torch.backends.mps.is_available() is the long-standing availability check.\n",
"if torch.backends.mps.is_available():\n",
"    torch.mps.empty_cache()\n",
"if torch.cuda.is_available():\n",
"    torch.cuda.empty_cache()\n",
"\n",
"device = get_device()  # resolved once instead of once per batch\n",
"\n",
"model.eval()\n",
"# Gradients are not needed for scoring, so the whole loop runs under no_grad.\n",
"with torch.no_grad():\n",
"    for batch in eval_dataloader:\n",
"        batch = {k: v.to(device) for k, v in batch.items()}\n",
"        logits = model(**batch).logits\n",
"        probabilities = F.softmax(logits, dim=-1)\n",
"        # Column 1 holds the softmax probability of label 1.\n",
"        print(probabilities[:, 1])"
]
},
{
"cell_type": "code",
"execution_count": 379,
"id": "4feb1954-7ad2-461d-bf52-8dd2e0d6591f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"128.65210151672363 MiB\n"
]
}
],
"source": [
"# Parameter memory is element count times bytes per element, summed over all\n",
"# parameter tensors. Dividing the bare element count by 2**20 gives a number\n",
"# of parameters, not MiB.\n",
"param_bytes = sum(p.numel() * p.element_size() for p in model.parameters())\n",
"print(param_bytes / 1024**2, \"MiB\")"
]
},
{
"cell_type": "code",
"execution_count": 380,
"id": "15e2dc8f-c38d-4828-9c90-638c9782eb54",
"metadata": {},
"outputs": [],
"source": [
"from transformers import pipeline"
]
},
{
"cell_type": "code",
"execution_count": 381,
"id": "37af7000-ab64-4b1c-bd29-c648b433420f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"/var/folders/xd/g8p1g555153b4v2qp8q7shb00000gn/T/ipykernel_40634/3099302733.py:15: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" model.load_state_dict(torch.load(CHECKPOINT))\n",
"The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. \n",
"The tokenizer class you load from this checkpoint is 'BertweetTokenizer'. \n",
"The class this function is called from is 'Tokenizer'.\n"
]
}
],
"source": [
"tokenizer, model = get_model()"
]
},
{
"cell_type": "code",
"execution_count": 382,
"id": "a05fa75b-e571-4b14-b158-1b43ee17871a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Device set to use cpu\n"
]
}
],
"source": [
"# Text-classification pipeline around the already loaded model/tokenizer.\n",
"# The device comes from get_device() so the pipeline always runs where the\n",
"# model was placed, instead of repeating a hard-coded device string.\n",
"pipe = pipeline(\n",
"    'text-classification',\n",
"    model=model,\n",
"    tokenizer=tokenizer,\n",
"    device=get_device(),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 383,
"id": "f1bcb478-c16f-4135-9d61-9df69538e8ce",
"metadata": {},
"outputs": [],
"source": [
"# Take the raw tweets straight from the labelled frame rather than keeping a\n",
"# second, hand-copied list of the same strings that could drift from it.\n",
"texts = all_tweets_labeled[\"text\"].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 403,
"id": "52ab46d9-ed16-43dd-ab0b-4af0757e7c96",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 86.40%\n",
" 56.87%\n",
" 97.22%\n",
" 0.06%\n",
" 0.33%\n",
" 0.91%\n",
" 99.82%\n"
]
}
],
"source": [
"# Score each raw tweet through the pipeline and print the LABEL_1 score as a\n",
"# percentage; top_k=2 asks the pipeline for both labels' scores.\n",
"for text in texts:\n",
"    cleaned = clean_tweet(text, demojize_emojis=False)\n",
"    res = pipe(cleaned, top_k=2)\n",
"    label_1_scores = [entry['score'] for entry in res if entry['label'] == 'LABEL_1']\n",
"    LABEL_1_result = label_1_scores[0]\n",
"    print(f\"{LABEL_1_result:7.2%}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "033adc09-7c2f-414b-a7e4-d7d8095af580",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "117e3390-130a-4750-ad6a-c03c80050b0f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "612dee88-0e40-4072-a3af-21a6f3dc5488",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (ViralTweets)",
"language": "python",
"name": "viraltweets"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}