import streamlit as st
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from span_marker import SpanMarkerModel
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration, PartOfSpeech
from torch import cuda
from spacy.cli import download
import transformers
from torch import bfloat16
import os
import scipy.cluster.hierarchy as sch  # linkage functions for hierarchical topics
# ------------------------------------------------------------------------------
# Helper: get the language configuration
# ------------------------------------------------------------------------------
def get_language_config(selected_language):
"""
Restituisce un dizionario di configurazione in base alla lingua selezionata.
Include il modello spaCy, il modello linguistico per il rilevamento (SpanMarker)
e i parametri per DataForSEO.
"""
language_options = {
"English (US)": {
"spacy_model": "en_core_web_sm",
"linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
"dataforseo_params": {"language": "en-us"}
},
"English (UK)": {
"spacy_model": "en_core_web_sm", # spaCy non ha un modello UK specifico, si usa quello standard
"linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
"dataforseo_params": {"language": "en-gb"}
},
"Italiano": {
"spacy_model": "it_core_news_sm",
"linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3", # Sostituire con il modello appropriato se disponibile
"dataforseo_params": {"language": "it-it"}
},
"Español": {
"spacy_model": "es_core_news_sm",
"linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
"dataforseo_params": {"language": "es-es"}
},
"Deutsch": {
"spacy_model": "de_core_news_sm",
"linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
"dataforseo_params": {"language": "de-de"}
},
"Français": {
"spacy_model": "fr_core_news_sm",
"linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
"dataforseo_params": {"language": "fr-fr"}
}
}
return language_options.get(selected_language, language_options["English (US)"])
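# Example: get_language_config("Italiano") returns
#   {"spacy_model": "it_core_news_sm",
#    "linguistic_model": "nbroad/span-marker-xdistil-l12-h384-orgs-v3",
#    "dataforseo_params": {"language": "it-it"}}
# Unknown selections fall back to the "English (US)" configuration.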
# ------------------------------------------------------------------------------
# Page configuration
# ------------------------------------------------------------------------------
st.set_page_config(
page_title="Keywords Cluster for SEO",
layout="wide",
initial_sidebar_state="expanded",
menu_items={
'Get Help': 'https://www.linkedin.com/in/francisco-nardi-212b338b/',
'Report a bug': "https://www.linkedin.com/in/francisco-nardi-212b338b/",
        'About': "# A simple keyword clustering tool for SEO purposes."
}
)
# Session state initialization (optional)
if 'model_loaded' not in st.session_state:
st.session_state.model_loaded = False
if 'analysis_complete' not in st.session_state:
st.session_state.analysis_complete = False
if 'current_step' not in st.session_state:
st.session_state.current_step = 0
# Custom CSS styles
st.markdown("""
<style>
.stProgress > div > div > div > div {
background-color: #1f77b4;
}
.success-message {
padding: 1rem;
border-radius: 0.5rem;
background-color: #d4edda;
color: #155724;
border: 1px solid #c3e6cb;
margin-bottom: 1rem;
}
.info-box {
padding: 1rem;
border-radius: 0.5rem;
background-color: #e2f0fd;
border: 1px solid #b8daff;
margin-bottom: 1rem;
}
.sidebar .sidebar-content {
width: 400px !important;
}
</style>
""", unsafe_allow_html=True)
# ------------------------------------------------------------------------------
# 1) Model loading with cache_resource
# ------------------------------------------------------------------------------
@st.cache_resource
def load_models(language_config):
"""Carica i modelli necessari con caching (una sola volta)."""
with st.spinner("Loading models... This may take a few minutes."):
try:
            # Download the spaCy model for the selected language
spacy_model_name = language_config["spacy_model"]
download(spacy_model_name)
            # SpanMarker model: entity detection (Brand/Unbranded)
linguistic_model_name = language_config["linguistic_model"]
if cuda.is_available():
model_filter = SpanMarkerModel.from_pretrained(linguistic_model_name).cuda()
else:
model_filter = SpanMarkerModel.from_pretrained(linguistic_model_name)
            # SentenceTransformer embedding model (the same for every language)
embedding_model = SentenceTransformer("all-mpnet-base-v2")
return model_filter, embedding_model
except Exception as e:
st.error(f"Error loading models: {str(e)}")
raise
# ------------------------------------------------------------------------------
# 2) CSV reading with cache_data
# ------------------------------------------------------------------------------
@st.cache_data
def load_csv(file, skiprows, nrows):
"""Carica il CSV con caching."""
df = pd.read_csv(file, skiprows=skiprows, nrows=nrows)
return df
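# Note: st.cache_data memoizes load_csv per argument combination, so the file is
# re-read only when (file, skiprows, nrows) change.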
# ------------------------------------------------------------------------------
# 3) Brand/Unbranded labeling with cache_data
# ------------------------------------------------------------------------------
@st.cache_data(hash_funcs={SpanMarkerModel: lambda _: None})  # the model object itself is not hashable
def process_keywords(df, model_filter):
"""
Rileva eventuali keyword di tipo 'Brand' utilizzando il modello SpanMarker.
Ritorna la lista di etichette 'Brand' o 'Unbranded' per ciascuna keyword.
"""
results = []
total = len(df)
progress_text = "Processing keywords..."
progress_bar = st.progress(0, text=progress_text)
for i, keyword in enumerate(df['Keyword']):
try:
entities = model_filter.predict([keyword])
label = (
"Brand"
if entities and isinstance(entities[0], list) and any(entity.get("label") == "ORG" for entity in entities[0])
else "Unbranded"
)
results.append(label)
except Exception as e:
st.error(f"Error processing keyword '{keyword}': {str(e)}")
results.append("Unbranded")
progress_bar.progress((i + 1) / total, text=f"{progress_text} ({i+1}/{total})")
progress_bar.empty()
return results
# ------------------------------------------------------------------------------
# 4) Topic model creation
# ------------------------------------------------------------------------------
def create_topic_model(embedding_model, model_params, language_config):
"""Crea e configura il modello di topic modeling."""
try:
        # 4-bit quantization configuration for the Hugging Face LLM
bnb_config = transformers.BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type='nf4',
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=bfloat16
)
        # UMAP configuration
umap_model = UMAP(
n_neighbors=model_params['umap_n_neighbors'],
n_components=model_params['umap_n_components'],
min_dist=model_params['umap_min_dist'],
metric='cosine',
random_state=42
)
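        # Note: fixing random_state makes the reduction reproducible across runs,
        # at the cost of disabling UMAP's multi-threaded optimization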
        # HDBSCAN configuration
hdbscan_model = HDBSCAN(
min_cluster_size=model_params['min_cluster_size'],
min_samples=model_params['min_samples'],
metric='euclidean',
cluster_selection_method='eom',
prediction_data=True
)
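        # prediction_data=True makes HDBSCAN store the extra structures needed for
        # soft clustering, which BERTopic uses when computing document-topic probabilities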
        # CountVectorizer configuration
        # NOTE: stop words are always English here, even when another UI language is selected
        vectorizer_model = CountVectorizer(
            stop_words="english",
min_df=model_params['min_df'],
max_df=model_params['max_df'],
ngram_range=(model_params['ngram_min'], model_params['ngram_max'])
)
        # Llama 2 configuration
model_id = 'meta-llama/Llama-2-7b-chat-hf'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
model_id,
trust_remote_code=True,
quantization_config=bnb_config,
device_map='auto',
)
model.eval()
generator = transformers.pipeline(
model=model,
tokenizer=tokenizer,
task='text-generation',
temperature=model_params['llama_temperature'],
max_new_tokens=model_params['llama_max_tokens'],
repetition_penalty=model_params['llama_repetition_penalty']
)
# Prompt configuration
system_prompt = """
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant for labeling topics.
<</SYS>>
"""
example_prompt = """
I have a topic that contains the following documents:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the worst food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
The topic is described by the following keywords: 'meat, beef, eat, eating, emissions, steak, food, health, processed, chicken'.
Based on the information about the topic above, please create a short label of this topic. Make sure to only return the label and nothing more.
[/INST] Environmental impacts of eating meat
"""
main_prompt = """
[INST]
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the information about the topic above, please create a **short label** of this topic.
**Return only the label** and avoid adding any explanations or extra text such as 'topic'.
[/INST]
"""
prompt = system_prompt + example_prompt + main_prompt
# Create representation models
keybert_model = KeyBERTInspired()
        # Use the spaCy model matching the selected language
pos_model = PartOfSpeech(language_config["spacy_model"])
mmr_model = MaximalMarginalRelevance(diversity=model_params['diversity_factor'])
llama2 = TextGeneration(generator, prompt=prompt)
representation_model = {
"KeyBERT": keybert_model,
"Llama2": llama2,
"MMR": mmr_model,
"POS": pos_model
}
return BERTopic(
embedding_model=embedding_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
representation_model=representation_model,
top_n_words=model_params['top_n_words'],
verbose=True
)
except Exception as e:
st.error(f"Error creating topic model: {str(e)}")
raise
# ------------------------------------------------------------------------------
# 5) Main analysis (the final results are cached)
# ------------------------------------------------------------------------------
@st.cache_data(hash_funcs={
    SpanMarkerModel: lambda _: None,       # skip hashing for SpanMarker
    SentenceTransformer: lambda _: None    # skip hashing for SentenceTransformer
})
def run_analysis(df, model_filter, embedding_model, model_params, exclude_brand_keywords, language_config):
"""
- Etichetta (facoltativo) come 'Brand' o 'Unbranded'
- Filtra i brand se richiesto
- Crea embeddings
- Esegue il topic modeling
- Restituisce il modello e il DataFrame dei risultati
"""
    # If the user chose to exclude brands, label and filter
if exclude_brand_keywords:
df['Label'] = process_keywords(df, model_filter)
filtered_df = df[df['Label'] == 'Unbranded']
else:
df['Label'] = "Unbranded"
filtered_df = df
filtered_keywords = filtered_df['Keyword'].tolist()
if not filtered_keywords:
st.warning("No keywords found for analysis (perhaps all were branded).")
return None, None
    # Generate the embeddings
embeddings = embedding_model.encode(filtered_keywords, show_progress_bar=True)
    # Create and fit the topic model (also passing the language configuration)
topic_model = create_topic_model(embedding_model, model_params, language_config)
topics, probs = topic_model.fit_transform(filtered_keywords, embeddings)
    # The reduced 2-D embeddings remain available on topic_model.umap_model.embedding_;
    # the visualization step in main() reads them from there directly.
    # Use the labels generated by Llama 2 as the final topic labels.
    # Each aspect value is a list of (label, weight) tuples in which only the first
    # label is non-empty, so joining the labels yields the generated label itself.
    llama_topic_labels = {
        topic: "".join(list(zip(*values))[0])
        for topic, values in topic_model.topic_aspects_["Llama2"].items()
    }
    llama_topic_labels[-1] = "Outlier Topic"
    topic_model.set_topic_labels(llama_topic_labels)
    # Get the topic information
topic_info = topic_model.get_topic_info()
topic_labels = dict(zip(topic_info["Topic"], topic_info["CustomName"]))
    # Get the default BERTopic labels
bert_labels = dict(zip(topic_info["Topic"], topic_info["Name"]))
    # Build the results DataFrame
results_df = pd.DataFrame({
"Keyword": filtered_keywords,
"Topic ID": topics,
"Confidence": probs
})
    # Add the Llama and BERT labels
    results_df["Llama label"] = [
        topic_labels.get(topic, "Outlier Topic") for topic in topics
    ]
    results_df["BERT label"] = [
        bert_labels.get(topic, "Outlier Topic") for topic in topics
    ]
    # If the CSV has a 'Volume' column, carry it over
if "Volume" in filtered_df.columns:
results_df["Volume"] = filtered_df["Volume"].values
return topic_model, results_df
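# The returned results_df has one row per clustered keyword, e.g. (illustrative values):
#   Keyword         Topic ID  Confidence  Llama label        BERT label
#   "trail shoes"   3         0.91        Running footwear   3_shoes_running_trail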
# ------------------------------------------------------------------------------
# 6) Main Streamlit App
# ------------------------------------------------------------------------------
def main():
st.title("🔍 Keywords Cluster for SEO")
    # ------------------------------------------------------------------------------
    # Sidebar: language selection and configuration
    # ------------------------------------------------------------------------------
with st.sidebar:
st.header("Configuration")
        # Language selection
selected_language = st.selectbox(
"Select Language",
["English (US)", "English (UK)", "Italiano", "Español", "Deutsch", "Français"],
index=0,
help="Seleziona la lingua per l'analisi. Questo imposterà il modello spaCy, il modello linguistico per il rilevamento e i parametri per DataForSEO."
)
language_config = get_language_config(selected_language)
        # File upload and row-range options
uploaded_file = st.file_uploader(
"Upload CSV file",
type="csv",
help="File must contain a 'Keyword' column"
)
with st.expander("CSV Reading Options"):
min_rows = st.number_input(
"Start reading from row",
min_value=1,
value=1,
help="Define the first row of the CSV file from which data should be read."
)
            max_rows = st.number_input(
                "Read up to row",
                min_value=min_rows,
                value=max(5000, min_rows),
                help="Define the last row of the CSV file to read (inclusive)."
            )
        # Option to exclude brand keywords
exclude_brands = st.checkbox(
"Exclude Organization keywords",
value=False,
help="If enabled, organization-labeled keywords are excluded from the analysis. (ex. company ltd)"
)
        # UMAP parameters
with st.expander("UMAP Parameters"):
umap_n_neighbors = st.slider("N Neighbors", 2, 100, 10)
umap_n_components = st.slider("N Components", 2, 50, 2)
umap_min_dist = st.slider("Min Distance", 0.0, 1.0, 0.0, 0.01)
        # HDBSCAN parameters
with st.expander("HDBSCAN Parameters"):
min_cluster_size = st.slider("Min Cluster Size", 2, 50, 5)
min_samples = st.slider("Min Samples", 1, 20, 5)
        # Vectorizer parameters
with st.expander("Vectorizer Parameters"):
min_df_type = st.radio(
"Min Document Frequency Type",
["Absolute", "Relative"],
help="Absolute: minimum count of documents, Relative: minimum fraction of documents"
)
if min_df_type == "Absolute":
min_df = st.number_input("Min Document Count", 1, 100, 2)
else:
min_df = st.slider("Min Document Fraction", 0.0, 0.5, 0.1, 0.01)
max_df = st.slider(
"Max Document Fraction",
min_value=float(min_df) if isinstance(min_df, float) else 0.5,
max_value=1.0,
value=0.95,
step=0.05
)
            st.info(
                f"Terms must appear in at least "
                f"{f'{min_df} documents' if isinstance(min_df, int) else f'{min_df:.0%} of documents'} "
                f"and at most {int(max_df * 100)}% of documents"
            )
ngram_min = st.number_input("N-gram Min", 1, 3, 1)
ngram_max = st.number_input("N-gram Max", 1, 3, 2)
        # Topic model parameters
with st.expander("Topic Model Parameters"):
top_n_words = st.slider("Top N Words", 5, 30, 10)
diversity_factor = st.slider("Topic Diversity", 0.0, 1.0, 0.3)
        # Llama 2 parameters
with st.expander("Llama 2 Parameters"):
llama_temperature = st.slider("Temperature", 0.0, 1.0, 0.1, 0.1)
llama_max_tokens = st.slider("Max Tokens", 50, 200, 100)
llama_repetition_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1, 0.1)
# Help section
with st.expander("ℹ️ Help"):
st.markdown("""
**How to use this app:**
1. Upload a CSV file with keywords
2. Configure CSV reading options
            3. (Optionally) check "Exclude Organization keywords"
4. Adjust model parameters if needed
5. Click 'Start Analysis'
6. Wait for results to appear
**Advanced Parameters:**
- UMAP: Controls dimensionality reduction
- HDBSCAN: Controls clustering behavior
- Vectorizer: Controls text preprocessing
- Topic Model: Controls topic generation
- Llama 2: Controls topic labeling
**Language Selection:**
            Selecting a language sets:
            - The spaCy model to use (e.g. 'en_core_web_sm' for English or 'it_core_news_sm' for Italian)
            - The entity-detection model (SpanMarker); replace the placeholder with a language-specific model if available
            - The DataForSEO parameters (e.g. the language code: 'en-us', 'it-it', etc.)
""")
    # ------------------------------------------------------------------------------
    # 7) Assemble the parameter dictionary for the topic model
    # ------------------------------------------------------------------------------
model_params = {
'umap_n_neighbors': umap_n_neighbors,
'umap_n_components': umap_n_components,
'umap_min_dist': umap_min_dist,
'min_cluster_size': min_cluster_size,
'min_samples': min_samples,
'min_df': min_df,
'max_df': max_df,
'ngram_min': ngram_min,
'ngram_max': ngram_max,
'top_n_words': top_n_words,
'diversity_factor': diversity_factor,
'llama_temperature': llama_temperature,
'llama_max_tokens': llama_max_tokens,
'llama_repetition_penalty': llama_repetition_penalty
}
    # ------------------------------------------------------------------------------
    # 8) If a file has been uploaded, proceed
    # ------------------------------------------------------------------------------
if uploaded_file is not None:
try:
            # Load the data with caching; skip data rows before min_rows but keep the header row
            df = load_csv(
                file=uploaded_file,
                skiprows=range(1, min_rows),
                nrows=max_rows - min_rows + 1
            )
if 'Keyword' not in df.columns:
st.error("CSV must contain a 'Keyword' column")
return
            # Data preview
with st.expander("Preview Data", expanded=True):
st.write(f"Reading rows {min_rows} to {max_rows}")
st.dataframe(
df.head(),
use_container_width=True
)
st.write(f"Total rows loaded: {len(df)}")
            # Button to start the analysis
if st.button("Start Analysis", type="primary"):
try:
                    # Load the models (cache_resource) with the language configuration
with st.spinner("Loading models..."):
model_filter, embedding_model = load_models(language_config)
                    # Run the analysis (cache_data)
with st.spinner("Processing data..."):
topic_model, results_df = run_analysis(
df,
model_filter,
embedding_model,
model_params,
exclude_brand_keywords=exclude_brands,
language_config=language_config
)
if topic_model is None or results_df is None:
st.error("Analysis failed!")
return
                    # Show the configuration summary
with st.expander("Configuration Summary", expanded=False):
st.subheader("Model Parameters")
st.json(model_params)
st.subheader("Language Configuration")
st.json(language_config)
                    # ------------------------------------------------------------------------------
                    # 9) Show the results
                    # ------------------------------------------------------------------------------
st.write("### Results Table")
st.dataframe(results_df, use_container_width=True, hide_index=True)
                    # Show the interactive dashboard
st.write("### Interactive Topic Visualization")
try:
                        # Reduced embeddings for the document map
fig = topic_model.visualize_documents(
results_df['Keyword'].tolist(),
reduced_embeddings=topic_model.umap_model.embedding_,
hide_annotations=True,
hide_document_hover=False,
custom_labels=True
)
st.plotly_chart(fig, theme="streamlit", use_container_width=True)
                        # Topic overview visualization
st.write("### Topic Overview")
try:
topic_fig = topic_model.visualize_topics(custom_labels=True)
st.plotly_chart(topic_fig, theme="streamlit", use_container_width=True)
except Exception as e:
st.error(f"Error creating topic visualization: {str(e)}")
                        # Topic barchart visualization
st.write("### Topic Distribution")
try:
n_topics = len(topic_model.get_topic_info())
                            n_topics = min(50, max(1, n_topics - 1))  # -1 to exclude the outlier topic
barchart_fig = topic_model.visualize_barchart(
top_n_topics=n_topics,
custom_labels=True
)
st.plotly_chart(barchart_fig, theme="streamlit", use_container_width=True)
except Exception as e:
st.error(f"Error creating barchart visualization: {str(e)}")
                        # (A) ADDED: hierarchical topic visualization
st.write("### Hierarchical Topics")
try:
docs = results_df["Keyword"].tolist()
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(
docs,
linkage_function=linkage_function
)
                            # Hierarchy chart
fig_hierarchy = topic_model.visualize_hierarchy(
hierarchical_topics=hierarchical_topics,
custom_labels=True
)
st.plotly_chart(fig_hierarchy, theme="streamlit", use_container_width=True)
                            # (B) ADDED: textual rendering of the topic tree
st.write("### Hierarchical Topic Tree")
tree = topic_model.get_topic_tree(hierarchical_topics)
                            st.text(tree)  # or st.code(tree) for a formatted block
except Exception as e:
st.error(f"Error creating hierarchical visualization: {str(e)}")
                    # Download the results as CSV
st.download_button(
label="Download Results",
data=results_df.to_csv(index=False),
file_name="keyword_analysis_results.csv",
mime="text/csv",
key="download_results"
)
except Exception as e:
st.error(f"An error occurred: {str(e)}")
except Exception as e:
st.error(f"An error occurred: {str(e)}")
except Exception as e:
st.error(f"Error reading file: {str(e)}")
else:
        # Initial welcome message
st.info("""
👋 Welcome to the Keywords Cluster for SEO!
1. Upload a CSV file with a column named **'Keyword'**.
2. Adjust parameters in the sidebar if needed.
3. Click **"Start Analysis"**.
4. Explore the data.
        5. Download the results (this will refresh the page).
""")
if __name__ == "__main__":
main()