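# NOTE(review): the next three lines look like web file-viewer residue
# (uploader caption, commit message, commit hash); kept as comments so the
# module parses as Python.
# gArthur98's picture
# uopdate
# 38468af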
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as pe
from wordcloud import WordCloud, STOPWORDS
from nltk import FreqDist
# Silence Streamlit's deprecation warning for st.pyplot() calls that do not
# pass an explicit figure.
# NOTE(review): newer Streamlit releases reportedly dropped this option --
# confirm against the pinned Streamlit version before upgrading.
st.set_option('deprecation.showPyplotGlobalUse', False)

# Page heading followed by a one-line description of the page contents.
st.title("Welcome To The Explore Page: ")
st.markdown("On this page you will be able to see some EDA Visuals")
# Load the two CSV inputs used by every chart on this page.
# `data` is the training file exactly as read from disk; `clean_data` is the
# cleaned copy with every row that contains a missing value removed.
data = pd.read_csv("datasets/Train.csv")
clean_data = pd.read_csv("datasets/clean_copy.csv").dropna()
# Word cloud of the raw (uncleaned) tweets.
# Missing tweets are skipped and every value is coerced to str so that
# str.join cannot fail on NaN or other non-string cells.
unclean_words = " ".join(data["safe_text"].dropna().astype(str))
wc = WordCloud(stopwords=STOPWORDS).generate(unclean_words)

# Draw on an explicit figure and hand that figure to Streamlit; calling
# st.pyplot() with no argument relies on matplotlib's global figure, which
# Streamlit deprecates.
wc_fig, wc_ax = plt.subplots(figsize=(5, 10))
wc_ax.set_title("Most common Words in unclean Dataset")
wc_ax.imshow(wc)
st.pyplot(wc_fig)
# Release the figure so repeated script runs do not accumulate open figures.
plt.close(wc_fig)
# Treemap of the 20 most frequent words in the cleaned tweets.
# Every cleaned tweet is joined into one string and split on whitespace,
# giving a flat list of words to count.
clean_words = " ".join(clean_data["clean_tweet"]).split()
top_twenty = FreqDist(clean_words).most_common(20)
freq_words = pd.DataFrame(top_twenty, columns=["word", "count"])

# One tile per word, sized by how often the word occurs.
fig = pe.treemap(
    freq_words,
    path=["word"],
    values="count",
    title="Top 20 Most Frequent Words After Cleaning",
)
st.plotly_chart(fig)
# Distribution of tweet lengths, measured in single-space-separated tokens.
# The vectorised .str accessor gives the same counts as a Python-level
# split(" ") for string cells, but yields NaN for missing tweets instead of
# raising.
data["tweet_length"] = data["safe_text"].str.split(" ").str.len()

# Name both columns explicitly: a bare value_counts().reset_index() labels
# them "index"/"tweet_length" before pandas 2.0 and "tweet_length"/"count"
# from 2.0 on, so the chart below would only work on one of the two.
words = (
    data["tweet_length"]
    .value_counts()
    .rename_axis("tweet_length")
    .reset_index(name="count")
)

# One marker per distinct length; marker size encodes how many tweets have it.
fig_2 = pe.scatter(
    data_frame=words,
    x="tweet_length",
    y="count",
    size="count",
    color="tweet_length",
    title="Tweet Lengths",
)
st.plotly_chart(fig_2)