{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import psycopg2\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pandas\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_21040\\1041354989.py:12: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", " df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n" ] } ], "source": [ "# db_connection -----------------------------------------------------------\n", "con_details = {\n", " \"host\" : \"localhost\",\n", " \"database\" : \"next\",\n", " \"user\" : \"postgres\",\n", " \"password\" : \"postgres\",\n", " \"port\" : \"5432\"\n", "}\n", "con = psycopg2.connect(**con_details)\n", "\n", "# get data tables ---------------------------------------------------------\n", "df = pd.read_sql_query(\"\"\"SELECT s.id,s.speech_content,s.date,f.abbreviation AS party\n", " FROM open_discourse.speeches AS s\n", " INNER JOIN open_discourse.factions AS f ON\n", " s.faction_id = f.id;\"\"\", con)\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Cleaning" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idspeech_contentdateparty
00Meine Damen und Herren! Ich eröffne die 2. Sit...1949-09-12not found
11Der Bundesrat ist versammelt, Herr Präsident.\\n1949-09-12not found
22Ich danke für diese Erklärung. Ich stelle dami...1949-09-12not found
33Ja, ich habe den Wunsch.\\n1949-09-12not found
44Ich erteile dem Herrn Bundespräsidenten das Wo...1949-09-12not found
...............
9309551084268\\n\\nWir sind zwar Kollegen.2022-12-16not found
9309561084269\\n\\nLiebe, sehr geehrte Frau Präsidentin!2022-12-16CDU/CSU
9309571084270\\n\\nVielen Dank.2022-12-16not found
9309581084272\\n\\nDen Abschluss dieser Aktuellen Stunde bild...2022-12-16not found
9309591084273\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...2022-12-16SPD
\n", "

930960 rows × 4 columns

\n", "
" ], "text/plain": [ " id speech_content \\\n", "0 0 Meine Damen und Herren! Ich eröffne die 2. Sit... \n", "1 1 Der Bundesrat ist versammelt, Herr Präsident.\\n \n", "2 2 Ich danke für diese Erklärung. Ich stelle dami... \n", "3 3 Ja, ich habe den Wunsch.\\n \n", "4 4 Ich erteile dem Herrn Bundespräsidenten das Wo... \n", "... ... ... \n", "930955 1084268 \\n\\nWir sind zwar Kollegen. \n", "930956 1084269 \\n\\nLiebe, sehr geehrte Frau Präsidentin! \n", "930957 1084270 \\n\\nVielen Dank. \n", "930958 1084272 \\n\\nDen Abschluss dieser Aktuellen Stunde bild... \n", "930959 1084273 \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... \n", "\n", " date party \n", "0 1949-09-12 not found \n", "1 1949-09-12 not found \n", "2 1949-09-12 not found \n", "3 1949-09-12 not found \n", "4 1949-09-12 not found \n", "... ... ... \n", "930955 2022-12-16 not found \n", "930956 2022-12-16 CDU/CSU \n", "930957 2022-12-16 not found \n", "930958 2022-12-16 not found \n", "930959 2022-12-16 SPD \n", "\n", "[930960 rows x 4 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"speech_content\"].replace(\"\\({\\d+}\\)\", \"\", inplace=True, regex=True) #removing keys from interruptions\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.to_pickle(\"speeches_1949_09_12\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }