import streamlit as st
import json
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
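

# Streamlit app to browse a sample of OSCAR documents together with their filtering
# statistics (special-character %, stop-word %, flagged-word %, perplexity) and to
# preview how different cutoff choices split the data into kept vs. filtered docs.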
def visualization(path_data, lang, num_docs, num_docs_for_words):
    with open(path_data) as json_file:
        data = json.load(json_file)

    num_docs = min(num_docs, len(data))

    st.title(f"{num_docs} {lang} documents from Oscar with their stats.")

    sentences = [doc["text"].split(" ") for doc in data[:num_docs_for_words]]
    words = set([word for sentence in sentences for word in sentence])
    words_data = [{"len_word": len(word), "word": word} for word in words]
    words_data = pd.DataFrame(words_data)

    data = data[:num_docs]
    data = pd.DataFrame(data)

    columns = list(data)
    keys = []
    values = {}

    st.header("Filtering based on document content")
if "special_%" in columns:
special_ratio = st.sidebar.slider(
"% filtered by special characters ratio", 0.0, 50.0, 0.0, step=0.1
)
cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
st.sidebar.text(f"No docs with <{special_cutoff:.1f}% special chars")
keys.append(("special_%", special_cutoff, True))
if "stop_%" in columns:
stop_ratio = st.sidebar.slider(
"% filtered by stop word ratio", 0.0, 50.0, 0.0, step=0.1
)
cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
st.sidebar.text(f"No docs with >{stop_cutoff:.2f}% stop words")
keys.append(("stop_%", stop_cutoff, False))
    @st.cache(suppress_st_warning=True)
    def recalculate_flagged_words(file):
        def flagged_word_ratio(text: str, flagged_word_list):
            return len([word for word in text.split() if word.lower().strip() in flagged_word_list]) / len(text.split())

        flagged_word_list = [word.decode().strip() for word in file.readlines()]
        flagged_word_ratios = [flagged_word_ratio(text, flagged_word_list) * 100 for text in data["text"]]
        data["flagged_%"] = flagged_word_ratios

    flagged_word_file = st.sidebar.file_uploader("Upload your own list of flagged words (1 word per line)")
    if flagged_word_file is not None:
        # Recompute the flagged-word column from the uploaded list before filtering.
        recalculate_flagged_words(flagged_word_file)
if "flagged_%" in columns:
flagged_ratio = st.sidebar.slider(
"% filtered by flagged words ratio", 0.0, 50.0, 0.0, step=0.1
)
flagged_index = max(0, math.floor((100 - flagged_ratio) * len(data.index) / 100) - 1)
flagged_cutoff = np.partition(data["flagged_%"], flagged_index)[flagged_index]
st.sidebar.text(f"No docs with >{flagged_cutoff:.2f}% flagged words")
keys.append(("flagged_%", flagged_cutoff, True))
if "perplexity" in columns:
ppl_ratio = st.sidebar.slider(
"% filtered by perplexity", 0.0, 50.0, 0.0, step=0.1
)
ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
st.sidebar.text(f"No docs with >{ppl_cutoff:.0f} perplexity")
keys.append(("perplexity", ppl_cutoff, True))
    cond = [
        (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
        for key, cutoff, max_cutoff in keys
    ]
    cond = np.all(cond, axis=0)

    data_not_keep = data.loc[np.invert(cond)]
    st.subheader(f"Filtered data: {np.invert(cond).sum()} docs")
    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
    st.dataframe(data_not_keep)

    data_keep = data.loc[cond]
    st.subheader(f"Kept data: {cond.sum()} docs")
    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
    st.dataframe(data_keep)

    # def plot_hist(dataframe, key, num_bins=50):
    #     st.subheader(" ".join(key.split("_")))
    #     hist_values = dataframe[key].values
    #     max_range = np.max(hist_values)
    #     hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
    #     st.bar_chart(hist_values)
    #     st.markdown(f"Each bin is of size: {max_range/num_bins}.")

    # for key, _, _ in keys:
    #     plot_hist(data, key)
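
    # Word-level view: very long "words" are usually links or concatenated words.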
st.header("Filtering links and concatenated words")
max_len_word = int(np.max(words_data["len_word"])) + 1
cutoff_word = st.sidebar.slider("Word length cutoff", 0, max_len_word, max_len_word)
cond_words = words_data["len_word"] <= cutoff_word
words_keep = words_data.loc[cond_words]
st.subheader(f"Words that we keep (for {num_docs_for_words} documents)")
st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
st.dataframe(words_keep)
words_not_keep = words_data.loc[np.invert(cond_words)]
st.subheader(f"Words that are thrown away (for {num_docs_for_words} documents)")
st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
st.dataframe(words_not_keep)
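
    # Allow downloading the raw JSON file that backs the app.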
st.header("Download data")
with open(path_data) as json_file:
btn = st.download_button(
label="Download data as json",
data=json_file,
file_name="data.json",
)
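

# Default input file and display parameters.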
path_data = "./en_examples_with_stats_ldnoob.json"
lang = "English"
num_docs = 5000
num_docs_for_words = 500
visualization(path_data, lang, num_docs, num_docs_for_words)