{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset\n", "import pandas as pd\n", "from utils import *\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rste = load_dataset(\"razhan/rste\", split=\"train\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_rste = rste.to_pandas()\n", "df = df_rste\n", "pd.set_option('display.max_colwidth', None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['clean_text'] = df['text'].apply(process_text)\n", "\n", "# df['contains_non_kurdish'] = df[\"text\"].apply(contains_non_kurdish_characters)\n", "# print(df['contains_non_kurdish'].sum())\n", "# df[df['contains_non_kurdish'] == True]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['clean_text'] = df['clean_text'].apply(keep_kurdish_characters)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# df[df['contains_non_kurdish'] == False]['clean_text'].to_csv(\"data/data.ckb.txt\", index=False, header=False)\n", "df['clean_text'].to_csv(\"data/data.ckb.txt\", index=False, header=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['text'].str.contains('ھ')]\n", "indices_with_substring = df[df['text'].str.contains('ھ')].index\n", "# print(indices_with_substring)\n", "df.loc[indices_with_substring]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_text = ''.join(df[\"text\"])\n", "\n", "unique_characters = set(all_text)\n", "\n", "print(\"Unique characters:\", unique_characters)\n", "print(\"Number of unique characters:\", len(unique_characters))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_text = ''.join(df[\"clean_text\"])\n", "# all_text = ''.join(df[df['contains_non_kurdish'] == False]['clean_text'])\n", "unique_characters = set(all_text)\n", "unique_punctuations = extract_punctuation(all_text)\n", "print(\"Unique characters:\", unique_characters)\n", "print(\"Number of unique characters:\", len(unique_characters))\n", "print(\"Unique punctuations:\", unique_punctuations)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['contains_non_kurdish'] = df[\"text\"].apply(contains_non_kurdish_characters)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(unique_punctuations)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# df = pd.read_csv(\"asosoft_test_punc.csv\")\n", "# df['summary'] = df['summary'].apply(process_text)\n", "# df.to_csv(\"asosoft_test_clean.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "oscar_dataset = load_dataset(\"oscar-corpus/OSCAR-2301\", language=\"ckb\", split='train', token=True)\n", 
"wiki_dataset = load_dataset(\"wikipedia\", language=\"ckb\", date=\"20231120\", split='train', beam_runner='DirectRunner')\n", "\n", "df_oscar = oscar_dataset.to_pandas()\n", "df_wiki = wiki_dataset.to_pandas()\n", "df = pd.concat([df_oscar, df_wiki], ignore_index=True)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"text\"] = df[\"text\"].apply(process_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# text = df[\"text\"].str.cat(sep=\"\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['clean_text'] = df['text'].apply(keep_kurdish_characters)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"clean_text\"] = df[\"clean_text\"].apply(process_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['contains_non_kurdish'] = df[\"clean_text\"].apply(contains_non_kurdish_characters)\n", "print(df['contains_non_kurdish'].sum())\n", "# df[df['contains_non_kurdish'] == True].iloc[0]['clean_text']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df['repeated_ngram'] = df['clean_text'].apply(lambda x: contains_repeated_ngram(x, 10))\n", "print(df['repeated_ngram'].sum())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# drop rows where repeated_ngram are True\n", "df = df[df['repeated_ngram'] == False]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[df['repeated_ngram'] == True].iloc[0]['clean_text']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_text = \"\".join(df[\"clean_text\"])\n", "# all_text = ''.join(df[df['contains_non_kurdish'] == False]['clean_text'])\n", "unique_characters = set(all_text)\n", "unique_punctuations = extract_punctuation(all_text)\n", "print(\"Unique characters:\", unique_characters)\n", "print(\"Number of unique characters:\", len(unique_characters))\n", "print(\"Unique punctuations:\", unique_punctuations)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import sent_tokenize\n", "data = []\n", "for i, row in df.iterrows():\n", " sentences = tokenizer_ckb.sent_tokenize(row['clean_text'])\n", " # sentences = row['clean_text'].split('\\n')\n", " # sentences = sent_tokenize(row['clean_text'])\n", " sentences = [sent_tokenize(s) for s in sentences]\n", " # flatten list of lists\n", " sentences = [item for sublist in sentences for item in sublist]\n", " # split on period and keep the period\n", " sentences = [s.split('.') for s in sentences]\n", " sentences = [item for sublist in sentences for item in sublist]\n", "\n", " sentences = [s + '.' 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import sent_tokenize\n", "data = []\n", "for i, row in df.iterrows():\n", "    # tokenizer_ckb is expected to come from the star import of utils\n", "    sentences = tokenizer_ckb.sent_tokenize(row['clean_text'])\n", "    # sentences = row['clean_text'].split('\\n')\n", "    # sentences = sent_tokenize(row['clean_text'])\n", "    sentences = [sent_tokenize(s) for s in sentences]\n", "    # flatten list of lists\n", "    sentences = [item for sublist in sentences for item in sublist]\n", "    # split on period, drop empty fragments and re-append the period\n", "    sentences = [s.split('.') for s in sentences]\n", "    sentences = [item for sublist in sentences for item in sublist]\n", "\n", "    sentences = [s.strip() + '.' for s in sentences if s.strip()]\n", "    data.extend(sentences)\n", "    # if i == 5:\n", "    #     break" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(len(data))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# longest line in data\n", "max_line = max(data, key=len)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(max_line.split())" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "max_line" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calculate the length of each line in the data and take the average\n", "lengths = [len(line.split()) for line in data]\n", "avg_length = sum(lengths) / len(lengths)\n", "print(avg_length)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# collect all the lines longer than 25 words\n", "long_lines = [line for line in data if len(line.split()) > 25]\n", "print(len(long_lines))" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Write sentences to file\n", "with open(\"data/oscar_wiki.ckb.txt\", \"w\", encoding=\"utf-8\") as f:\n", "    for sentence in data:\n", "        f.write(sentence + \"\\n\")" ] }
], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }