# Source: Hugging Face Spaces - HalalFoodNLP / halalnlp (status: Running)
# File size: 16,387 Bytes

import gradio as gr
from transformers import pipeline, AutoTokenizer
from classifier import MistralForSequenceClassification
import torch
import nltk
import json
import pandas as pd
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import io
import base64
from PIL import Image
from nltk import bigrams
import malaya
from collections import Counter
import os
from flagging import HuggingFaceDatasetSaver

# Hugging Face access token read from the environment; None when the variable is unset.
HF_TOKEN = os.getenv('HUGGINGFACE_HUB_TOKEN')

# Dataset saver from the local `flagging` module; `setup()` and `flag()` are
# called on it further down in this file.
hf_writer = HuggingFaceDatasetSaver(HF_TOKEN, 'HalalFoodNLP/tpb-crowdsourced-dataset')

# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale encoding.
with open('en.json', encoding='utf-8') as fopen:
    # Concatenated onto a list below, so this file is expected to hold a JSON array.
    en = json.load(fopen)

# Combined stopword list: Malaya's stopwords + the entries from en.json +
# forum/chat filler words specific to this corpus.
stopwords = malaya.text.function.get_stopwords()
stopwords = stopwords + en + ['lor', 'quote','Quote','QUOTE','pm', 'long', 'jer', 'time', 'feel', 'liao', 'wow', 'https', 'http', 've', 'ko', 'kena', 'post', 'ni', 'tu', 'don', 'je', 'jeh', 'la', 'tau', 'haha', 'hahaha', 'hahahaha']
# NOTE(review): these entries are multi-word phrases; a filter that compares
# single tokens against this list can never match them - confirm the intent.
stopwords += ['for me', 'to be', 'in the', 'me to', 'for me to']

# Fetch the NLTK resources at start-up (quiet=True suppresses the download log).
# NOTE(review): only nltk.word_tokenize / nltk.bigrams are used in the code
# shown here; 'stopwords' and 'vader_lexicon' are not referenced - confirm
# whether those two downloads are still needed.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)
# A single tokenizer is shared by both classification pipelines below.
tokenizer_tpb = AutoTokenizer.from_pretrained('mesolitica/malaysian-mistral-191M-MLM-512')
# TPB classifier; loaded with HF_TOKEN, unlike the sentiment model below.
model_tpb = MistralForSequenceClassification.from_pretrained('HalalFoodNLP/tpb-model-halal', torch_dtype=torch.bfloat16, token = HF_TOKEN)
# Sentiment classifier, loaded without a token.
model_sentiment = MistralForSequenceClassification.from_pretrained('malaysia-ai/sentiment-mistral-191M-MLM', torch_dtype=torch.bfloat16)
# Both pipelines return a list of dicts; callers read result[0]['label'] and result[0]['score'].
pipeline_tpb = pipeline(task="text-classification", model=model_tpb, tokenizer=tokenizer_tpb)
sentiment_pipeline = pipeline("sentiment-analysis", model=model_sentiment, tokenizer=tokenizer_tpb)

# Load the labelled corpus (JSON Lines: one JSON object per line) that the
# keyword search filters. UTF-8 is requested explicitly so decoding does not
# depend on the platform's locale encoding.
with open('sentiment-tpb-dataset.jsonl', 'r', encoding='utf-8') as file:
    # Blank lines (e.g. a trailing newline at EOF) are skipped because
    # json.loads would raise on them.
    data = [json.loads(line) for line in file if line.strip()]

# One row per record; the search code reads the 'text', 'label' and
# 'tpb_label' columns.
df = pd.DataFrame(data)

def generate_wordcloud(text):
    """Render a word cloud for ``text`` and return it as a PIL image.

    Args:
        text: Raw text whose word frequencies drive the cloud.

    Returns:
        A PIL image decoded from the rendered PNG, or ``None`` when ``text``
        contains no usable words (``None``, empty, whitespace only, or
        rejected by ``WordCloud.generate``).
    """
    # Nothing to draw: bail out before touching WordCloud / matplotlib.
    if not text or not text.strip():
        return None

    try:
        wordcloud = WordCloud(width=300, height=200, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when no words survive its own filtering;
        # report "no image" instead of failing the whole request.
        return None

    # Work on an explicit figure handle rather than pyplot's implicit
    # "current figure", and always close it so a rendering failure cannot
    # leak the figure.
    fig = plt.figure(figsize=(10, 5))
    try:
        ax = fig.add_subplot(111)
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        fig.tight_layout(pad=0)

        # Render the plot into an in-memory PNG.
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)

def generate_bigrams(text):
    """Return the ten most frequent adjacent word pairs in ``text``.

    The text is lower-cased and tokenized with NLTK; tokens that are not
    purely alphanumeric or that appear in the module-level ``stopwords`` are
    dropped before pairing, so a pair may span a removed token.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of up to ten ``((first_word, second_word), count)`` tuples,
        most frequent first.
    """
    # Build the lookup set once per call: set membership is O(1), whereas the
    # stopword list would be rescanned for every token.
    stopword_set = set(stopwords)
    tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stopword_set
    ]
    return Counter(bigrams(tokens)).most_common(10)

def predict_decision(sentiment_label):
    """Translate a sentiment label into a purchase-likelihood message.

    Args:
        sentiment_label: Label to translate; ``'positive'`` and ``'neutral'``
            are recognised explicitly.

    Returns:
        The "High" message for ``'positive'``, the "Moderate" message for
        ``'neutral'``, and the "Low" message for any other value.
    """
    known_outcomes = (
        ('positive', "High likelihood of purchase"),
        ('neutral', "Moderate likelihood of purchase"),
    )
    for label, message in known_outcomes:
        if sentiment_label == label:
            return message
    # Anything that is neither positive nor neutral is treated as negative.
    return "Low likelihood of purchase"
    
# Recommendation text per TPB factor, appended verbatim under that factor's
# heading when its negative share exceeds the report threshold.
_TPB_RECOMMENDATIONS = {
    "attitude": """
**Current Issues:**
- High negative perception regarding product quality
- Concerns about halal certification and its authenticity
- Pricing issues in comparison to perceived value

**Recommended Actions:**

1. **Quality Control Improvements**
   - Implement enhanced product quality measures
   - Obtain globally recognized halal certifications
   - Conduct regular quality audits

2. **Educational Campaigns**
   - Educate customers on halal certification processes
   - Raise awareness about the health benefits of halal products
   - Highlight ethical and sustainable sourcing

3. **Pricing Strategy Adjustment**
   - Reassess pricing to align with customer expectations
   - Introduce discount programs or loyalty initiatives
""",
    "religious knowledge": """
**Current Issues:**
- Lack of awareness and understanding about the halal process
- Customers may be unsure of the religious guidelines followed

**Recommended Actions:**

1. **Religious Knowledge Enhancement**
   - Provide clear educational materials on the halal process
   - Collaborate with religious scholars to endorse products
   - Ensure transparent labeling and certification

2. **Community Engagement**
   - Host webinars or community events about halal
   - Partner with local religious organizations for outreach
   - Share customer testimonials emphasizing trust in your certification
""",
    "subjective norms": """
**Current Issues:**
- Social influence or peer pressure regarding halal compliance is weak
- Lack of community-driven recommendations for the product

**Recommended Actions:**

1. **Influence Social Circles**
   - Engage community leaders or influencers to endorse products
   - Create social campaigns around the halal certification to enhance peer recommendations

2. **Referral Programs**
   - Introduce referral programs where existing customers can promote the product
   - Offer incentives for customers who share their experiences with others

3. **Testimonials and Success Stories**
   - Use customer testimonials and success stories to strengthen social trust
""",
    "perceived behavioural control": """
**Current Issues:**
- Perceived difficulty in understanding or accessing halal-certified products
- Concerns about control over product quality and sourcing transparency

**Recommended Actions:**

1. **Improve Accessibility**
   - Make halal products more accessible through multiple platforms (e-commerce, retail stores)
   - Ensure ease of purchase and fast delivery options

2. **Enhance Transparency**
   - Provide detailed information about sourcing and production processes
   - Use blockchain or similar technology to enhance transparency in halal certification

3. **Customer Empowerment**
   - Offer customer feedback channels to empower users to voice concerns and suggestions
   - Ensure that concerns are addressed promptly to build trust and satisfaction
""",
}


def generate_report(tpb_sentiment_df, negative_threshold=70):
    """Build a Markdown recommendations report from per-factor sentiment shares.

    Args:
        tpb_sentiment_df: DataFrame with a ``tpb_label`` column and one
            percentage column per sentiment label. The ``negative`` column
            may be absent (no negative rows in the underlying data); it is
            then treated as 0%.
        negative_threshold: A factor gets a recommendation section only when
            its negative percentage is strictly greater than this value.

    Returns:
        The report as a Markdown string. It always starts with the report
        title; one section follows for each known TPB factor whose negative
        share exceeds the threshold.
    """
    sections = ["## TPB Factor Analysis and Recommendations Report\n\n"]

    for _, row in tpb_sentiment_df.iterrows():
        tpb_label = row['tpb_label']
        # The sentiment columns only exist for labels present in the data,
        # so a missing 'negative' column means nothing was negative.
        negative_percentage = row.get('negative', 0.0)

        # Written as "> threshold" so a NaN percentage never triggers a section.
        if not negative_percentage > negative_threshold:
            continue

        recommendations = _TPB_RECOMMENDATIONS.get(tpb_label)
        if recommendations is None:
            # Unknown factor: no recommendation text is defined for it.
            continue

        sections.append(f"### {tpb_label.capitalize()} ({negative_percentage:.1f}% Negative)\n")
        sections.append(recommendations)

    return "".join(sections)


def search_company(keyword):
    """Filter the corpus by ``keyword`` and build every Company View output.

    Args:
        keyword: Search text typed by the user. It is matched literally and
            case-insensitively against the ``text`` column of the
            module-level ``df``.

    Returns:
        A 10-tuple in the order expected by the search button's outputs:
        overall sentiment bar chart, overall TPB bar chart, stacked
        sentiment-within-TPB chart, up to five matching rows with text
        shorter than 300 characters, the Markdown report, the word clouds for
        'attitude', 'religious knowledge', 'subjective norms' and
        'perceived behavioural control' (``None`` for a label with no rows),
        and the top-bigram table. All ten entries are ``None`` when the
        keyword is empty or nothing matches.
    """
    no_result = (None,) * 10
    if not keyword:
        return no_result

    # regex=False: the keyword is user input, so characters such as "(" or
    # "+" must be matched literally instead of raising a regex error.
    # na=False: rows with missing text count as "no match" rather than
    # producing a NaN mask that cannot be used for indexing.
    matches = df['text'].str.contains(keyword, case=False, regex=False, na=False)
    filtered_df = df[matches]

    if filtered_df.empty:
        return no_result

    color_map = {
        'negative': 'red',
        'neutral': 'gray',
        'positive': 'blue'
    }

    # Overall sentiment distribution (percentages).
    sentiment_counts = filtered_df['label'].value_counts(normalize=True) * 100
    # Labels other than negative/neutral are drawn in blue in this chart.
    colors = [color_map.get(sentiment, 'blue') for sentiment in sentiment_counts.index]

    sentiment_fig = go.Figure(data=[go.Bar(
        x=sentiment_counts.index,
        y=sentiment_counts.values,
        text=[f'{val:.1f}%' for val in sentiment_counts.values],
        textposition='auto',
        marker_color=colors
    )])
    sentiment_fig.update_layout(
        title='Overall Sentiment Distribution',
        xaxis_title='Sentiment',
        yaxis_title='Percentage'
    )

    # Overall TPB factor distribution (percentages).
    tpb_counts = filtered_df['tpb_label'].value_counts(normalize=True) * 100
    tpb_fig = go.Figure(data=[go.Bar(
        x=tpb_counts.index,
        y=tpb_counts.values,
        text=[f'{val:.1f}%' for val in tpb_counts.values],
        textposition='auto'
    )])
    tpb_fig.update_layout(title='Overall TPB Factor Distribution', xaxis_title='TPB Factor', yaxis_title='Percentage')

    # Sentiment share within each TPB factor: rows are factors, columns are
    # the sentiment labels present in the matches, each row sums to 100.
    tpb_sentiment_df = filtered_df.groupby(['tpb_label', 'label']).size().unstack(fill_value=0)
    tpb_sentiment_df = tpb_sentiment_df.div(tpb_sentiment_df.sum(axis=1), axis=0) * 100

    tpb_sentiment_fig = go.Figure()
    for sentiment in tpb_sentiment_df.columns:
        tpb_sentiment_fig.add_trace(go.Bar(
            name=sentiment,
            x=tpb_sentiment_df.index,
            y=tpb_sentiment_df[sentiment],
            text=[f'{val:.1f}%' for val in tpb_sentiment_df[sentiment]],
            textposition='auto',
            marker_color=color_map.get(sentiment, 'gray')
        ))

    tpb_sentiment_fig.update_layout(
        barmode='stack',
        title='Sentiment Distribution within TPB Factors',
        xaxis_title='TPB Factor',
        yaxis_title='Percentage'
    )

    report = generate_report(tpb_sentiment_df.reset_index())

    # Per-factor word cloud and bigram counts. Domain words that would
    # dominate every cloud are stripped from the text first.
    wordclouds = {}
    bigrams_data = {}
    for label in filtered_df['tpb_label'].unique():
        label_texts = filtered_df[filtered_df['tpb_label'] == label]['text']
        text = ' '.join(label_texts).replace('QUOTE','').replace('quote','').replace('sijil halal','').replace('halal','')
        wordclouds[label] = generate_wordcloud(text)
        bigrams_data[label] = generate_bigrams(text)

    # Wrap each column in a Series: labels can yield different numbers of
    # bigrams, and a dict of unequal-length lists makes DataFrame() raise.
    # Shorter columns are padded with NaN instead.
    bigram_df = pd.DataFrame({
        label: pd.Series([word_pair for word_pair, _ in counts], dtype=object)
        for label, counts in bigrams_data.items()
    })

    bigram_df.index = [f"Top {i+1}" for i in range(len(bigram_df))]

    sample_rows = filtered_df[filtered_df['text'].str.len() < 300].head(5)

    return (sentiment_fig, tpb_fig, tpb_sentiment_fig, sample_rows,
            report, wordclouds.get('attitude'), wordclouds.get('religious knowledge'),
            wordclouds.get('subjective norms'), wordclouds.get('perceived behavioural control'), bigram_df)


def text_classification_and_sentiment(text, keywords_df=None):
    """Classify ``text`` by TPB factor and sentiment and derive a purchase outlook.

    The sentiment is forced to ``'negative'`` with probability 1.0 when the
    text contains any keyword from the first column of the keyword table.
    Every call also records ``[text, tpb_label, sentiment_label]`` through
    ``hf_writer.flag``.

    Args:
        text: The user comment to analyse.
        keywords_df: Optional DataFrame whose first column lists the
            override keywords. When ``None`` (the UI supplies only ``text``),
            the table is read from ``IMG_8137.xlsx``.

    Returns:
        A 3-tuple of display strings: TPB label, sentiment with probability,
        and purchase decision.
    """
    result_tpb = pipeline_tpb(text)
    tpb_label = result_tpb[0]['label']

    result_sentiment = sentiment_pipeline(text)
    sentiment_label = result_sentiment[0]['label']
    sentiment_score = result_sentiment[0]['score']

    # Only fall back to the bundled spreadsheet when no table was passed in.
    if keywords_df is None:
        keywords_df = pd.read_excel('IMG_8137.xlsx')

    # Drop empty cells and coerce to str so a blank or numeric cell cannot
    # crash the comparison.
    keywords = keywords_df.iloc[:, 0].dropna().astype(str).str.lower()
    lowered_text = text.lower()
    # Empty keywords are skipped: "" is a substring of every text and would
    # force every input to negative.
    if any(keyword and keyword in lowered_text for keyword in keywords):
        sentiment_label = 'negative'
        sentiment_score = 1.0

    decision = predict_decision(sentiment_label)

    tpb_output = f"TPB Label: {tpb_label}"
    sentiment_output = f"Sentiment: {sentiment_label}\nProbability: {sentiment_score * 100:.2f}%"
    decision_output = f"Decision: {decision}"

    hf_writer.flag([text,tpb_label, sentiment_label])

    return tpb_output, sentiment_output, decision_output


# Sample comments (in Malay) offered as one-click inputs via gr.Examples in the User View tab.
examples = [
    "Alhamdulillah, hari ni dapat makan dekat restoran halal baru. Rasa puas hati dan tenang bila tau makanan yang kita makan dijamin halal.",
    "Semua orang cakap kena check logo halal sebelum beli makanan. Dah jadi macam second nature dah sekarang. Korang pun sama kan?"
]

# Shared stylesheet passed to gr.Blocks. The :root block defines the colour
# variables; `--body-background-fill` is written with two dashes so it
# overrides the theme variable of that name (the previous four-dash spelling
# declared an unrelated custom property that nothing read).
css = """
:root {
    --bg: #FFFFFF; /* Set the background color to white */
    --col: #191919; /* Define primary text color */
    --bg-dark: #000000; /* Define dark background color if needed */
    --col-dark: #ECF2F7; /* Define dark text color if needed */
    --body-background-fill: #FFFFFF;
}

html, body {
    background-color: var(--bg); /* Set the background color to white for the entire page */
    margin: 0; /* Remove default body margin */
    padding: 0; /* Remove default body padding */
}

.container { 
    max-width: 1000px; 
    margin: auto; 
    padding: 20px; 
}

.title { 
    text-align: center; 
    margin-bottom: 20px; 
}

.nav-buttons { 
    display: flex; 
    justify-content: center; 
    gap: 10px; 
    margin-bottom: 20px; 
}

#recommendation_report {
    background-color: #f9f9f9; /* Keep this background light for the report section */
    padding: 20px;
    border: 2px solid #e0e0e0;
    border-radius: 10px;
    margin-top: 20px;
    font-family: Arial, sans-serif;
    font-size: 14px;
}

.wrap-text { 
    white-space: normal !important; 
    word-wrap: break-word; 
}

.footer {visibility: hidden}

"""

# Build the two-tab Gradio UI. Extra rules are appended to the shared
# stylesheet to force a white, image-free page background.
with gr.Blocks(css=css + """
     body, .gradio-container, .root, .wrap, #root .background .container {
        background-color: white !important;
        background-image: none !important;
        background-fill: white !important;
    }
    
""", theme='aisyahhrazak/miku-aisyah@=1.2.2') as demo:

    with gr.Tabs() as tabs:
        # Tab 0: classify a single free-text comment.
        with gr.TabItem("User View", id=0):
            gr.Markdown("## Text Classification and Sentiment Analysis Based on User Input About Halal Food Acquisition")
            gr.Markdown("Enter a text to see TPB classification, sentiment analysis, and purchase prediction results!")
            input_text = gr.Textbox(lines=2, label="Input Comment", placeholder="Model can make mistakes, we are striving to improve the model.")
            with gr.Row():
                tpb_output = gr.Textbox(lines=3, label="TPB Classification")
                sentiment_output = gr.Textbox(lines=3, label="Sentiment Analysis")
            decision_output = gr.Textbox(lines=3, label="Purchase Prediction")
            # This needs to be called at some point prior to the first call to callback.flag()
            hf_writer.setup([input_text,tpb_output, sentiment_output], "flagged_data_points")
            classify_button = gr.Button("Analyze")

            # Only the comment textbox is passed as input; the handler's
            # second parameter is not supplied from the UI.
            classify_button.click(fn=text_classification_and_sentiment, inputs=input_text, outputs=[tpb_output, sentiment_output, decision_output])
            gr.Examples(examples=examples, inputs=input_text)


        # Tab 1: keyword search over the preloaded corpus with aggregate charts.
        with gr.TabItem("Company View", id=1):
            gr.Markdown("# Sentiment Analysis and Purchase Decision Factor for Halal Food Acquisition")

            # NOTE: this rebinds `input_text`; the User View handlers above were
            # already wired to the first textbox object, so they are unaffected.
            input_text = gr.Textbox(lines=1, label="Search Keyword", placeholder="Enter keyword")
            search_button = gr.Button("Search")

            with gr.Row():
                sentiment_chart = gr.Plot(label="Sentiment Distribution")
                tpb_chart = gr.Plot(label="TPB Factor Distribution")

            tpb_sentiment_chart = gr.Plot(label="Sentiment Distribution within TPB Factors")
            # Word clouds for the four TPB labels are laid out in a single row.
            gr.Markdown("### Word Clouds by TPB Label")

            with gr.Row():
                attitude_wc = gr.Image(label="Attitude Word Cloud", height=200, width=300)
                religious_knowledge_wc = gr.Image(label="Religious Knowledge Word Cloud", height=200, width=300)
                subjective_norms_wc = gr.Image(label="Subjective Norms Word Cloud",height=200, width=300)
                perceived_behavioural_control_wc = gr.Image(label="Perceived Behavioural Control Word Cloud", height=200, width=300)

            with gr.Accordion("See Recommendation Details"):
                # elem_id matches the #recommendation_report rule in the stylesheet.
                report_output = gr.Markdown(label="Recommendation Report", elem_id="recommendation_report")

            gr.Markdown("### Top Bigrams by TPB Label")
            bigram_table = gr.Dataframe(label="Top Bigrams for Each TPB Label")

            # NOTE(review): search_company fills this table with rows sliced
            # from the corpus DataFrame; these headers ("sentiment", "score")
            # may not match its column names - confirm.
            output_table = gr.Dataframe(
                headers=["text", "tpb_label", "sentiment", "score"],
                label="Company Analysis Results",
                wrap=True
            )

            # The order of `outputs` must match the 10-tuple returned by search_company.
            search_button.click(
                fn=search_company, 
                inputs=input_text, 
                outputs=[
                    sentiment_chart, tpb_chart, tpb_sentiment_chart, output_table, report_output,
                    attitude_wc, religious_knowledge_wc, subjective_norms_wc, perceived_behavioural_control_wc,bigram_table
                ]
            )

# Start the Gradio app.
demo.launch()