File size: 16,387 Bytes
7f2db85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535c2fb
516b663
25bff98
4c5415e
25bff98
516b663
25bff98
7f2db85
 
 
 
 
 
 
 
 
 
 
 
c86a287
7f2db85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ce4957
 
7f2db85
 
 
8ce4957
 
7f2db85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ce4957
7f2db85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01c4e47
8cfe38a
 
7f2db85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25bff98
 
7f2db85
25bff98
8cfe38a
 
25bff98
7f2db85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
import gradio as gr
from transformers import pipeline, AutoTokenizer
from classifier import MistralForSequenceClassification
import torch
import nltk
import json
import pandas as pd
import plotly.graph_objects as go
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import io
import base64
from PIL import Image
from nltk import bigrams
import malaya
from collections import Counter
import os
from flagging import HuggingFaceDatasetSaver

# Hugging Face access token, taken from the environment (None when the variable is unset).
HF_TOKEN = os.getenv('HUGGINGFACE_HUB_TOKEN')

# Writer that text_classification_and_sentiment() uses to flag every analysed comment
# into the 'HalalFoodNLP/tpb-crowdsourced-dataset' repo.
hf_writer = HuggingFaceDatasetSaver(HF_TOKEN, 'HalalFoodNLP/tpb-crowdsourced-dataset')

# Extra stopword list bundled with the app. Decode explicitly as UTF-8 so loading
# does not depend on the platform's default locale encoding.
with open('en.json', encoding='utf-8') as fopen:
    en = json.load(fopen)

# Combined stopword list: Malaya's built-in list + en.json + forum/chat filler words.
stopwords = malaya.text.function.get_stopwords()
stopwords = stopwords + en + [
    'lor', 'quote', 'Quote', 'QUOTE', 'pm', 'long', 'jer', 'time', 'feel',
    'liao', 'wow', 'https', 'http', 've', 'ko', 'kena', 'post', 'ni', 'tu',
    'don', 'je', 'jeh', 'la', 'tau', 'haha', 'hahaha', 'hahahaha',
]
# NOTE(review): generate_bigrams() compares single tokens against this list, so
# these multi-word entries never match there - confirm whether they are still needed.
stopwords += ['for me', 'to be', 'in the', 'me to', 'for me to']

# Fetch the NLTK resources quietly at start-up.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'vader_lexicon'):
    nltk.download(_nltk_resource, quiet=True)

# One tokenizer is shared by both classifiers.
tokenizer_tpb = AutoTokenizer.from_pretrained('mesolitica/malaysian-mistral-191M-MLM-512')

# TPB-factor classifier; loaded with the access token.
model_tpb = MistralForSequenceClassification.from_pretrained(
    'HalalFoodNLP/tpb-model-halal',
    torch_dtype=torch.bfloat16,
    token=HF_TOKEN,
)

# Sentiment classifier.
model_sentiment = MistralForSequenceClassification.from_pretrained(
    'malaysia-ai/sentiment-mistral-191M-MLM',
    torch_dtype=torch.bfloat16,
)

# Inference pipelines used by text_classification_and_sentiment().
pipeline_tpb = pipeline(
    task="text-classification",
    model=model_tpb,
    tokenizer=tokenizer_tpb,
)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model_sentiment,
    tokenizer=tokenizer_tpb,
)

# Load the pre-labelled corpus (one JSON object per line) searched by search_company().
# search_company() reads the 'text', 'label' and 'tpb_label' columns of the resulting frame.
with open('sentiment-tpb-dataset.jsonl', 'r', encoding='utf-8') as file:
    # Skip empty lines: json.loads() raises on them and they carry no record.
    data = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(data)

# Render a word cloud for a piece of text and return it as a PIL Image object
def generate_wordcloud(text):
    """Render a word cloud for ``text`` and return it as a PIL image.

    Args:
        text: Raw text whose words are drawn in the cloud.

    Returns:
        A PIL image decoded from the rendered PNG, or ``None`` when ``text``
        is empty or contains no drawable words, so callers can leave the
        image slot empty instead of crashing.
    """
    if not text or not text.strip():
        return None

    try:
        wordcloud = WordCloud(width=300, height=200, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when nothing is left to plot
        # (for example, the text consists only of words it filters out).
        return None

    # Work on an explicit figure rather than pyplot's implicit "current figure",
    # and always close it so a failed render cannot leak the figure.
    figure = plt.figure(figsize=(10, 5))
    try:
        axes = figure.add_subplot(111)
        axes.imshow(wordcloud, interpolation='bilinear')
        axes.axis('off')
        figure.tight_layout(pad=0)

        # Save the plot to an in-memory PNG
        buf = io.BytesIO()
        figure.savefig(buf, format='png')
    finally:
        plt.close(figure)

    # Rewind so PIL reads the PNG from the start
    buf.seek(0)
    return Image.open(buf)

# Count the most frequent adjacent word pairs in a text
def generate_bigrams(text):
    """Return the ten most common bigrams found in ``text``.

    The text is lower-cased and tokenised with NLTK. Tokens that are not
    purely alphanumeric, or that appear in the module-level ``stopwords``
    list, are dropped *before* pairing, so a pair may join two words that
    were separated by a removed token.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``((first_word, second_word), count)`` tuples as produced
        by ``Counter.most_common(10)``.
    """
    # Build the lookup set once per call: membership tests against the raw
    # list would scan every stopword for every token.
    stopword_set = frozenset(stopwords)
    tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stopword_set
    ]
    return Counter(bigrams(tokens)).most_common(10)

def predict_decision(sentiment_label):
    """Translate a sentiment label into a purchase-likelihood sentence.

    'positive' maps to high, 'neutral' to moderate, and every other label
    (including 'negative' and differently-cased variants) to low.
    """
    likelihood_by_sentiment = {
        'positive': "High likelihood of purchase",
        'neutral': "Moderate likelihood of purchase",
    }
    return likelihood_by_sentiment.get(sentiment_label, "Low likelihood of purchase")
    
# Markdown recommendation text for each TPB factor, keyed by the factor's label.
_TPB_RECOMMENDATIONS = {
    "attitude": """
**Current Issues:**
- High negative perception regarding product quality
- Concerns about halal certification and its authenticity
- Pricing issues in comparison to perceived value

**Recommended Actions:**

1. **Quality Control Improvements**
   - Implement enhanced product quality measures
   - Obtain globally recognized halal certifications
   - Conduct regular quality audits

2. **Educational Campaigns**
   - Educate customers on halal certification processes
   - Raise awareness about the health benefits of halal products
   - Highlight ethical and sustainable sourcing

3. **Pricing Strategy Adjustment**
   - Reassess pricing to align with customer expectations
   - Introduce discount programs or loyalty initiatives
""",
    "religious knowledge": """
**Current Issues:**
- Lack of awareness and understanding about the halal process
- Customers may be unsure of the religious guidelines followed

**Recommended Actions:**

1. **Religious Knowledge Enhancement**
   - Provide clear educational materials on the halal process
   - Collaborate with religious scholars to endorse products
   - Ensure transparent labeling and certification

2. **Community Engagement**
   - Host webinars or community events about halal
   - Partner with local religious organizations for outreach
   - Share customer testimonials emphasizing trust in your certification
""",
    "subjective norms": """
**Current Issues:**
- Social influence or peer pressure regarding halal compliance is weak
- Lack of community-driven recommendations for the product

**Recommended Actions:**

1. **Influence Social Circles**
   - Engage community leaders or influencers to endorse products
   - Create social campaigns around the halal certification to enhance peer recommendations

2. **Referral Programs**
   - Introduce referral programs where existing customers can promote the product
   - Offer incentives for customers who share their experiences with others

3. **Testimonials and Success Stories**
   - Use customer testimonials and success stories to strengthen social trust
""",
    "perceived behavioural control": """
**Current Issues:**
- Perceived difficulty in understanding or accessing halal-certified products
- Concerns about control over product quality and sourcing transparency

**Recommended Actions:**

1. **Improve Accessibility**
   - Make halal products more accessible through multiple platforms (e-commerce, retail stores)
   - Ensure ease of purchase and fast delivery options

2. **Enhance Transparency**
   - Provide detailed information about sourcing and production processes
   - Use blockchain or similar technology to enhance transparency in halal certification

3. **Customer Empowerment**
   - Offer customer feedback channels to empower users to voice concerns and suggestions
   - Ensure that concerns are addressed promptly to build trust and satisfaction
""",
}


# Function to generate report based on TPB sentiment
def generate_report(tpb_sentiment_df, negative_threshold=70):
    """Build a Markdown recommendation report from per-factor sentiment shares.

    Args:
        tpb_sentiment_df: DataFrame with one row per TPB factor. It must have
            a ``tpb_label`` column; a ``negative`` column holds the share of
            negative comments as a percentage. When the ``negative`` column
            is absent (no negative comments matched), the share is treated
            as 0 instead of raising ``KeyError``.
        negative_threshold: A factor gets a recommendation section only when
            its negative share is strictly greater than this percentage.

    Returns:
        The report as a Markdown string. It always starts with the report
        heading; factors below the threshold, or with no recommendation text
        defined, add nothing.
    """
    sections = ["## TPB Factor Analysis and Recommendations Report\n\n"]

    for _, row in tpb_sentiment_df.iterrows():
        tpb_label = row['tpb_label']
        # The sentiment columns come from an unstack of the labels actually
        # present, so 'negative' may be missing altogether.
        negative_percentage = row.get('negative', 0)

        # Only factors dominated by negative sentiment get recommendations.
        if not negative_percentage > negative_threshold:
            continue

        recommendation = _TPB_RECOMMENDATIONS.get(tpb_label)
        if recommendation is None:
            continue

        sections.append(f"### {tpb_label.capitalize()} ({negative_percentage:.1f}% Negative)\n")
        sections.append(recommendation)

    return "".join(sections)


def search_company(keyword):
    """Build every Company View output for comments that mention ``keyword``.

    The keyword is matched as a literal, case-insensitive substring of the
    ``text`` column of the module-level ``df``. It is deliberately not
    interpreted as a regular expression, so input such as ``"("`` or
    ``"c++"`` cannot raise. Rows whose text is missing never match.

    Args:
        keyword: Search term typed by the user.

    Returns:
        A 10-tuple, in the order expected by the Gradio outputs:
        (sentiment bar chart, TPB-factor bar chart, stacked sentiment-by-factor
        chart, up to five matching rows with text shorter than 300 characters,
        Markdown recommendation report, word cloud for 'attitude',
        word cloud for 'religious knowledge', word cloud for 'subjective norms',
        word cloud for 'perceived behavioural control', top-bigram table).
        Every element is ``None`` when the keyword is empty or nothing matches.
    """
    no_result = (None,) * 10
    if not keyword:
        return no_result

    matches = df['text'].str.contains(keyword, case=False, regex=False, na=False)
    filtered_df = df[matches]

    if filtered_df.empty:
        return no_result

    color_map = {
        'negative': 'red',
        'neutral': 'gray',
        'positive': 'blue'
    }

    # Calculate sentiment distribution
    sentiment_counts = filtered_df['label'].value_counts(normalize=True) * 100
    # Any label other than negative/neutral is drawn in blue here.
    colors = [color_map.get(sentiment, 'blue') for sentiment in sentiment_counts.index]

    # Create the bar plot
    sentiment_fig = go.Figure(data=[go.Bar(
        x=sentiment_counts.index,
        y=sentiment_counts.values,
        text=[f'{val:.1f}%' for val in sentiment_counts.values],
        textposition='auto',
        marker_color=colors
    )])
    sentiment_fig.update_layout(
        title='Overall Sentiment Distribution',
        xaxis_title='Sentiment',
        yaxis_title='Percentage'
    )

    # Share of matching comments per TPB factor
    tpb_counts = filtered_df['tpb_label'].value_counts(normalize=True) * 100
    tpb_fig = go.Figure(data=[go.Bar(
        x=tpb_counts.index,
        y=tpb_counts.values,
        text=[f'{val:.1f}%' for val in tpb_counts.values],
        textposition='auto'
    )])
    tpb_fig.update_layout(title='Overall TPB Factor Distribution', xaxis_title='TPB Factor', yaxis_title='Percentage')

    # Calculate sentiment distribution within each TPB factor (each row sums to 100)
    tpb_sentiment_df = filtered_df.groupby(['tpb_label', 'label']).size().unstack(fill_value=0)
    tpb_sentiment_df = tpb_sentiment_df.div(tpb_sentiment_df.sum(axis=1), axis=0) * 100

    tpb_sentiment_fig = go.Figure()
    for sentiment in tpb_sentiment_df.columns:
        tpb_sentiment_fig.add_trace(go.Bar(
            name=sentiment,
            x=tpb_sentiment_df.index,
            y=tpb_sentiment_df[sentiment],
            text=[f'{val:.1f}%' for val in tpb_sentiment_df[sentiment]],
            textposition='auto',
            marker_color=color_map.get(sentiment, 'gray')
        ))

    tpb_sentiment_fig.update_layout(
        barmode='stack',
        title='Sentiment Distribution within TPB Factors',
        xaxis_title='TPB Factor',
        yaxis_title='Percentage'
    )

    report = generate_report(tpb_sentiment_df.reset_index())

    wordclouds = {}
    bigrams_data = {}
    for label in filtered_df['tpb_label'].unique():
        # Strip forum quote markers and the ubiquitous "halal" terms so they
        # do not dominate the word cloud and bigram counts.
        text = ' '.join(filtered_df[filtered_df['tpb_label'] == label]['text']).replace('QUOTE','').replace('quote','').replace('sijil halal','').replace('halal','')
        wordclouds[label] = generate_wordcloud(text)
        bigrams_data[label] = generate_bigrams(text)

    # Keep only the word pairs (drop the counts). Each column is wrapped in a
    # Series so labels with fewer bigrams are padded with NaN; plain lists of
    # unequal length would make the DataFrame constructor raise.
    bigram_df = pd.DataFrame({
        label: pd.Series([word_pair for word_pair, _ in top_pairs], dtype=object)
        for label, top_pairs in bigrams_data.items()
    })

    bigram_df.index = [f"Top {i+1}" for i in range(len(bigram_df))]

    sample_rows = filtered_df[filtered_df['text'].str.len() < 300].head(5)

    return (sentiment_fig, tpb_fig, tpb_sentiment_fig, sample_rows,
            report, wordclouds.get('attitude'), wordclouds.get('religious knowledge'),
            wordclouds.get('subjective norms'), wordclouds.get('perceived behavioural control'), bigram_df)


def text_classification_and_sentiment(text, keywords_df):
    result_tpb = pipeline_tpb(text)
    tpb_label = result_tpb[0]['label']
    tpb_score = result_tpb[0]['score']
    
    result_sentiment = sentiment_pipeline(text)
    sentiment_label = result_sentiment[0]['label']
    sentiment_score = result_sentiment[0]['score']

    keywords_df = pd.read_excel('IMG_8137.xlsx')
    
    # Check for keywords in the first column of the DataFrame
    keywords = keywords_df.iloc[:, 0].tolist()
    for keyword in keywords:
        if keyword.lower() in text.lower():
            sentiment_label = 'negative'
            sentiment_score = 1.0 

    decision = predict_decision(sentiment_label)
    
    tpb_output = f"TPB Label: {tpb_label}"
    sentiment_output = f"Sentiment: {sentiment_label}\nProbability: {sentiment_score * 100:.2f}%"
    decision_output = f"Decision: {decision}"

    hf_writer.flag([text,tpb_label, sentiment_label])


    return tpb_output, sentiment_output, decision_output


# Sample Malay-language comments offered as clickable examples for the
# "Input Comment" box in the User View tab (passed to gr.Examples below).
examples = [
    "Alhamdulillah, hari ni dapat makan dekat restoran halal baru. Rasa puas hati dan tenang bila tau makanan yang kita makan dijamin halal.",
    "Semua orang cakap kena check logo halal sebelum beli makanan. Dah jadi macam second nature dah sekarang. Korang pun sama kan?"
]

# Custom stylesheet passed to gr.Blocks: forces a white page background and
# styles the recommendation report panel.
# NOTE(review): the last rule targets a `.footer` class; confirm that selector
# actually matches the footer Gradio renders.
css = """
:root {
    --bg: #FFFFFF; /* Set the background color to white */
    --col: #191919; /* Define primary text color */
    --bg-dark: #000000; /* Define dark background color if needed */
    --col-dark: #ECF2F7; /* Define dark text color if needed */
    --body-background-fill: #FFFFFF;
}

html, body {
    background-color: var(--bg); /* Set the background color to white for the entire page */
    margin: 0; /* Remove default body margin */
    padding: 0; /* Remove default body padding */
}

.container { 
    max-width: 1000px; 
    margin: auto; 
    padding: 20px; 
}

.title { 
    text-align: center; 
    margin-bottom: 20px; 
}

.nav-buttons { 
    display: flex; 
    justify-content: center; 
    gap: 10px; 
    margin-bottom: 20px; 
}

#recommendation_report {
    background-color: #f9f9f9; /* Keep this background light for the report section */
    padding: 20px;
    border: 2px solid #e0e0e0;
    border-radius: 10px;
    margin-top: 20px;
    font-family: Arial, sans-serif;
    font-size: 14px;
}

.wrap-text { 
    white-space: normal !important; 
    word-wrap: break-word; 
}

.footer {visibility: hidden}

"""

# Build the two-tab Gradio interface. The module-level stylesheet is extended
# with an extra rule block that forces white backgrounds on the listed selectors.
with gr.Blocks(css=css + """
     body, .gradio-container, .root, .wrap, #root .background .container {
        background-color: white !important;
        background-image: none !important;
        background-fill: white !important;
    }
    
""", theme='aisyahhrazak/miku-aisyah@=1.2.2') as demo:
                
    with gr.Tabs() as tabs:
        # Tab 1: analyse a single comment typed by the user.
        with gr.TabItem("User View", id=0):
            gr.Markdown("## Text Classification and Sentiment Analysis Based on User Input About Halal Food Acquisition")
            gr.Markdown("Enter a text to see TPB classification, sentiment analysis, and purchase prediction results!")
            input_text = gr.Textbox(lines=2, label="Input Comment", placeholder="Model can make mistakes, we are striving to improve the model.")
            with gr.Row():
                tpb_output = gr.Textbox(lines=3, label="TPB Classification")
                sentiment_output = gr.Textbox(lines=3, label="Sentiment Analysis")
            decision_output = gr.Textbox(lines=3, label="Purchase Prediction")
            # This needs to be called at some point prior to the first call to callback.flag()
            hf_writer.setup([input_text,tpb_output, sentiment_output], "flagged_data_points")
            classify_button = gr.Button("Analyze")

            # Only the comment text is passed to the handler; its three return
            # values fill the three output boxes in order.
            classify_button.click(fn=text_classification_and_sentiment, inputs=input_text, outputs=[tpb_output, sentiment_output, decision_output])
            gr.Examples(examples=examples, inputs=input_text)

        
        # Tab 2: keyword search over the bundled dataset with aggregated charts.
        with gr.TabItem("Company View", id=1):
            gr.Markdown("# Sentiment Analysis and Purchase Decision Factor for Halal Food Acquisition")

            # NOTE: this rebinds the name `input_text`; the User View handlers above
            # were already wired to the earlier Textbox object, so they are unaffected.
            input_text = gr.Textbox(lines=1, label="Search Keyword", placeholder="Enter keyword")
            search_button = gr.Button("Search")
        
            with gr.Row():
                sentiment_chart = gr.Plot(label="Sentiment Distribution")
                tpb_chart = gr.Plot(label="TPB Factor Distribution")

            tpb_sentiment_chart = gr.Plot(label="Sentiment Distribution within TPB Factors")
            # Word cloud outputs are laid out in a single row
            gr.Markdown("### Word Clouds by TPB Label")
            
            with gr.Row():
                attitude_wc = gr.Image(label="Attitude Word Cloud", height=200, width=300)
                religious_knowledge_wc = gr.Image(label="Religious Knowledge Word Cloud", height=200, width=300)
                subjective_norms_wc = gr.Image(label="Subjective Norms Word Cloud",height=200, width=300)
                perceived_behavioural_control_wc = gr.Image(label="Perceived Behavioural Control Word Cloud", height=200, width=300)

            with gr.Accordion("See Recommendation Details"):
                # elem_id ties this component to the #recommendation_report rule in `css`.
                report_output = gr.Markdown(label="Recommendation Report", elem_id="recommendation_report")

            gr.Markdown("### Top Bigrams by TPB Label")
            bigram_table = gr.Dataframe(label="Top Bigrams for Each TPB Label")

            output_table = gr.Dataframe(
                headers=["text", "tpb_label", "sentiment", "score"],
                label="Company Analysis Results",
                wrap=True
            )

            # The order of `outputs` must match the 10-tuple returned by search_company().
            search_button.click(
                fn=search_company, 
                inputs=input_text, 
                outputs=[
                    sentiment_chart, tpb_chart, tpb_sentiment_chart, output_table, report_output,
                    attitude_wc, religious_knowledge_wc, subjective_norms_wc, perceived_behavioural_control_wc,bigram_table
                ]
            )

# Start the Gradio server when the script is run.
demo.launch()