Spaces:

HalalFoodNLP
/

halalnlp

Running

App Files Files Community

aisyahhrazak committed on 27 days ago

Commit

7f2db85

•

1 Parent(s): 68abf71

Upload 7 files

Browse files

Files changed (7) hide show

IMG_8137.xlsx +0 -0
app.py +421 -0
attn_mask_utils.py +160 -0
bidirectional_mistral.py +281 -0
classifier.py +90 -0
en.json +1 -0
sentiment-tpb-dataset.jsonl +0 -0

IMG_8137.xlsx ADDED Viewed

Binary file (14.6 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,421 @@

+import gradio as gr
+from transformers import pipeline, AutoTokenizer
+from classifier import MistralForSequenceClassification
+import torch
+import nltk
+import json
+import pandas as pd
+import plotly.graph_objects as go
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+import io
+import base64
+from PIL import Image
+from nltk import bigrams
+import malaya
+from collections import Counter
+# Extra stopwords loaded from en.json (presumably an English list — confirm against the file).
+with open('en.json') as fopen:
+    en = json.load(fopen)
+# Combined stopword list: malaya's list, the en.json entries, and a hand-written set of extra tokens.
+stopwords = malaya.text.function.get_stopwords()
+stopwords = stopwords + en + ['lor', 'quote','Quote','QUOTE','pm', 'long', 'jer', 'time', 'feel', 'liao', 'wow', 'https', 'http', 've', 'ko', 'kena', 'post', 'ni', 'tu', 'don', 'je', 'jeh', 'la', 'tau', 'haha', 'hahaha', 'hahahaha']
+# NOTE(review): generate_bigrams only tests single alphanumeric tokens against this list,
+# so these multi-word entries cannot match there — confirm whether they are still needed.
+stopwords += ['for me', 'to be', 'in the', 'me to', 'for me to']
+# NLTK resources are downloaded (quietly) at import time.
+nltk.download('punkt', quiet=True)
+nltk.download('punkt_tab', quiet=True)
+nltk.download('stopwords', quiet=True)
+nltk.download('vader_lexicon', quiet=True)
+# One tokenizer is shared by both classification pipelines below.
+tokenizer_tpb = AutoTokenizer.from_pretrained('mesolitica/malaysian-mistral-191M-MLM-512')
+# Both classifiers are loaded with bfloat16 weights.
+model_tpb = MistralForSequenceClassification.from_pretrained('HalalFoodNLP/tpb-model-halal', torch_dtype=torch.bfloat16)
+model_sentiment = MistralForSequenceClassification.from_pretrained('malaysia-ai/sentiment-mistral-191M-MLM', torch_dtype=torch.bfloat16)
+pipeline_tpb = pipeline(task="text-classification", model=model_tpb, tokenizer=tokenizer_tpb)
+sentiment_pipeline = pipeline("sentiment-analysis", model=model_sentiment, tokenizer=tokenizer_tpb)
+# Read the JSONL dataset (one JSON object per line) into the DataFrame that search_company filters.
+data = []
+with open('sentiment-tpb-dataset.jsonl', 'r') as file:
+    for line in file:
+        data.append(json.loads(line))
+df = pd.DataFrame(data)
+# Update the generate_wordcloud function to return a PIL Image object
+def generate_wordcloud(text):
+    # Generate the word cloud
+    wordcloud = WordCloud(width=300, height=200, background_color='white').generate(text)
+    # Create the plot
+    plt.figure(figsize=(10, 5))
+    plt.imshow(wordcloud, interpolation='bilinear')
+    plt.axis('off')
+    plt.tight_layout(pad=0)
+    # Save the plot to a bytes buffer
+    buf = io.BytesIO()
+    plt.savefig(buf, format='png')
+    plt.close()
+    buf.seek(0)
+    # Convert bytes buffer to PIL Image
+    image = Image.open(buf)
+    return image
+# Add a function to generate bigrams
+def generate_bigrams(text):
+    words = nltk.word_tokenize(text.lower())
+    words = [word for word in words if word.isalnum() and word not in stopwords]
+    bi_grams = list(bigrams(words))
+    return Counter(bi_grams).most_common(10)
+def predict_decision(sentiment_label):
+    if sentiment_label == 'positive':
+        return "High likelihood of purchase"
+    elif sentiment_label == 'neutral':
+        return "Moderate likelihood of purchase"
+    else:
+        return "Low likelihood of purchase"
+# Function to generate report based on TPB sentiment
+def generate_report(tpb_sentiment_df):
+    report = "## TPB Factor Analysis and Recommendations Report\n\n"
+    for _, row in tpb_sentiment_df.iterrows():
+        tpb_label = row['tpb_label']
+        positive_percentage = row['positive']
+        negative_percentage = row['negative']
+        if negative_percentage > 70:  # Only generate recommendations for positive < 70%
+            if tpb_label == "attitude":
+                report += f"### {tpb_label.capitalize()} ({negative_percentage:.1f}% Negative)\n"
+                report += """
+**Current Issues:**
+- High negative perception regarding product quality
+- Concerns about halal certification and its authenticity
+- Pricing issues in comparison to perceived value
+**Recommended Actions:**
+1. **Quality Control Improvements**
+   - Implement enhanced product quality measures
+   - Obtain globally recognized halal certifications
+   - Conduct regular quality audits
+2. **Educational Campaigns**
+   - Educate customers on halal certification processes
+   - Raise awareness about the health benefits of halal products
+   - Highlight ethical and sustainable sourcing
+3. **Pricing Strategy Adjustment**
+   - Reassess pricing to align with customer expectations
+   - Introduce discount programs or loyalty initiatives
+"""
+            if tpb_label == "religious knowledge":
+                report += f"### {tpb_label.capitalize()} ({negative_percentage:.1f}% Negative)\n"
+                report += """
+**Current Issues:**
+- Lack of awareness and understanding about the halal process
+- Customers may be unsure of the religious guidelines followed
+**Recommended Actions:**
+1. **Religious Knowledge Enhancement**
+   - Provide clear educational materials on the halal process
+   - Collaborate with religious scholars to endorse products
+   - Ensure transparent labeling and certification
+2. **Community Engagement**
+   - Host webinars or community events about halal
+   - Partner with local religious organizations for outreach
+   - Share customer testimonials emphasizing trust in your certification
+"""
+            if tpb_label == "subjective norms":
+                report += f"### {tpb_label.capitalize()} ({negative_percentage:.1f}% Negative)\n"
+                report += """
+**Current Issues:**
+- Social influence or peer pressure regarding halal compliance is weak
+- Lack of community-driven recommendations for the product
+**Recommended Actions:**
+1. **Influence Social Circles**
+   - Engage community leaders or influencers to endorse products
+   - Create social campaigns around the halal certification to enhance peer recommendations
+2. **Referral Programs**
+   - Introduce referral programs where existing customers can promote the product
+   - Offer incentives for customers who share their experiences with others
+3. **Testimonials and Success Stories**
+   - Use customer testimonials and success stories to strengthen social trust
+"""
+            if tpb_label == "perceived behavioural control":
+                report += f"### {tpb_label.capitalize()} ({negative_percentage:.1f}% Negative)\n"
+                report += """
+**Current Issues:**
+- Perceived difficulty in understanding or accessing halal-certified products
+- Concerns about control over product quality and sourcing transparency
+**Recommended Actions:**
+1. **Improve Accessibility**
+   - Make halal products more accessible through multiple platforms (e-commerce, retail stores)
+   - Ensure ease of purchase and fast delivery options
+2. **Enhance Transparency**
+   - Provide detailed information about sourcing and production processes
+   - Use blockchain or similar technology to enhance transparency in halal certification
+3. **Customer Empowerment**
+   - Offer customer feedback channels to empower users to voice concerns and suggestions
+   - Ensure that concerns are addressed promptly to build trust and satisfaction
+"""
+    return report
+def search_company(keyword):
+    if not keyword:
+        return None, None, None, None
+    filtered_df = df[df['text'].str.contains(keyword, case=False)]
+    if filtered_df.empty:
+        return None, None, None, None
+    # Calculate sentiment distribution
+    sentiment_counts = filtered_df['label'].value_counts(normalize=True) * 100
+    colors = ['red' if sentiment == 'negative' else 'gray' if sentiment == 'neutral' else 'blue' for sentiment in sentiment_counts.index]
+    # Create the bar plot
+    sentiment_fig = go.Figure(data=[go.Bar(
+        x=sentiment_counts.index,
+        y=sentiment_counts.values,
+        text=[f'{val:.1f}%' for val in sentiment_counts.values],
+        textposition='auto',
+        marker_color=colors
+    )])
+    sentiment_fig.update_layout(
+        title='Overall Sentiment Distribution',
+        xaxis_title='Sentiment',
+        yaxis_title='Percentage'
+    )
+    tpb_counts = filtered_df['tpb_label'].value_counts(normalize=True) * 100
+    tpb_fig = go.Figure(data=[go.Bar(
+        x=tpb_counts.index,
+        y=tpb_counts.values,
+        text=[f'{val:.1f}%' for val in tpb_counts.values],
+        textposition='auto'
+    )])
+    tpb_fig.update_layout(title='Overall TPB Factor Distribution', xaxis_title='TPB Factor', yaxis_title='Percentage')
+    # Calculate sentiment distribution within each TPB factor
+    tpb_sentiment_df = filtered_df.groupby(['tpb_label', 'label']).size().unstack(fill_value=0)
+    tpb_sentiment_df = tpb_sentiment_df.div(tpb_sentiment_df.sum(axis=1), axis=0) * 100
+    # Define colors for each sentiment
+    color_map = {
+        'negative': 'red',
+        'neutral': 'gray',
+        'positive': 'blue'
+    }
+    tpb_sentiment_fig = go.Figure()
+    for sentiment in tpb_sentiment_df.columns:
+        tpb_sentiment_fig.add_trace(go.Bar(
+            name=sentiment,
+            x=tpb_sentiment_df.index,
+            y=tpb_sentiment_df[sentiment],
+            text=[f'{val:.1f}%' for val in tpb_sentiment_df[sentiment]],
+            textposition='auto',
+            marker_color=color_map.get(sentiment, 'gray')
+        ))
+    tpb_sentiment_fig.update_layout(
+        barmode='stack',
+        title='Sentiment Distribution within TPB Factors',
+        xaxis_title='TPB Factor',
+        yaxis_title='Percentage'
+    )
+    report = generate_report(tpb_sentiment_df.reset_index())
+    wordclouds = {}
+    bigrams_data = {}
+    for label in filtered_df['tpb_label'].unique():
+        text = ' '.join(filtered_df[filtered_df['tpb_label'] == label]['text']).replace('QUOTE','').replace('quote','').replace('sijil halal','').replace('halal','')
+        wordclouds[label] = generate_wordcloud(text)
+        bigrams_data[label] = generate_bigrams(text)
+    # Extract only the words
+    words_only = {
+        key: [word_pair for word_pair, _ in value]
+        for key, value in bigrams_data.items()
+    }
+    # Create a single DataFrame for bigrams, with only the bigram text (no frequency)
+    bigram_df = pd.DataFrame({
+        label: data for label, data in words_only.items()
+    })
+    print(bigrams_data.items())
+    bigram_df.index = [f"Top {i+1}" for i in range(len(bigram_df))]
+    return (sentiment_fig, tpb_fig, tpb_sentiment_fig, filtered_df[filtered_df['text'].str.len() < 300].head(5),
+            report, wordclouds.get('attitude'), wordclouds.get('religious knowledge'),
+            wordclouds.get('subjective norms'), wordclouds.get('perceived behavioural control'),bigram_df)
+def text_classification_and_sentiment(text, keywords_df):
+    result_tpb = pipeline_tpb(text)
+    tpb_label = result_tpb[0]['label']
+    tpb_score = result_tpb[0]['score']
+    result_sentiment = sentiment_pipeline(text)
+    sentiment_label = result_sentiment[0]['label']
+    sentiment_score = result_sentiment[0]['score']
+    keywords_df = pd.read_excel('IMG_8137.xlsx')
+    # Check for keywords in the first column of the DataFrame
+    keywords = keywords_df.iloc[:, 0].tolist()
+    for keyword in keywords:
+        if keyword.lower() in text.lower():
+            sentiment_label = 'negative'
+            sentiment_score = 1.0
+    decision = predict_decision(sentiment_label)
+    tpb_output = f"TPB Label: {tpb_label}"
+    sentiment_output = f"Sentiment: {sentiment_label}\nProbability: {sentiment_score * 100:.2f}%"
+    decision_output = f"Decision: {decision}"
+    return tpb_output, sentiment_output, decision_output
+# Sample comments offered through gr.Examples in the User View tab.
+examples = [
+    "Alhamdulillah, hari ni dapat makan dekat restoran halal baru. Rasa puas hati dan tenang bila tau makanan yang kita makan dijamin halal.",
+    "Semua orang cakap kena check logo halal sebelum beli makanan. Dah jadi macam second nature dah sekarang. Korang pun sama kan?"
+]
+css = """
+:root {
+    --bg: #FFFFFF; /* Set the background color to white */
+    --col: #191919; /* Define primary text color */
+    --bg-dark: #000000; /* Define dark background color if needed */
+    --col-dark: #ECF2F7; /* Define dark text color if needed */
+    ----body-background-fill:  #FFFFFF;
+}
+html, body {
+    background-color: var(--bg); /* Set the background color to white for the entire page */
+    margin: 0; /* Remove default body margin */
+    padding: 0; /* Remove default body padding */
+}
+.container {
+    max-width: 1000px;
+    margin: auto;
+    padding: 20px;
+}
+.title {
+    text-align: center;
+    margin-bottom: 20px;
+}
+.nav-buttons {
+    display: flex;
+    justify-content: center;
+    gap: 10px;
+    margin-bottom: 20px;
+}
+#recommendation_report {
+    background-color: #f9f9f9; /* Keep this background light for the report section */
+    padding: 20px;
+    border: 2px solid #e0e0e0;
+    border-radius: 10px;
+    margin-top: 20px;
+    font-family: Arial, sans-serif;
+    font-size: 14px;
+}
+.wrap-text {
+    white-space: normal !important;
+    word-wrap: break-word;
+}
+.footer {visibility: hidden}
+"""
+# Build the two-tab Gradio app; extra rules forcing a white background are appended to `css`.
+with gr.Blocks(css=css + """
+     body, .gradio-container, .root, .wrap, #root .background .container {
+        background-color: white !important;
+        background-image: none !important;
+        background-fill: white !important;
+    }
+""", theme='aisyahhrazak/miku-aisyah@=1.2.2') as demo:
+    with gr.Tabs() as tabs:
+        # Tab 1: classify a single user comment.
+        with gr.TabItem("User View", id=0):
+            gr.Markdown("## Text Classification and Sentiment Analysis Based on User Input About Halal Food Acquisition")
+            gr.Markdown("Enter a text to see TPB classification, sentiment analysis, and purchase prediction results!")
+            input_text = gr.Textbox(lines=2, label="Input Comment", placeholder="Model can make mistakes, we are striving to improve the model.")
+            with gr.Row():
+                tpb_output = gr.Textbox(lines=3, label="TPB Classification")
+                sentiment_output = gr.Textbox(lines=3, label="Sentiment Analysis")
+            decision_output = gr.Textbox(lines=3, label="Purchase Prediction")
+            classify_button = gr.Button("Analyze")
+            # Only the comment text is supplied as an input to the handler.
+            classify_button.click(fn=text_classification_and_sentiment, inputs=input_text, outputs=[tpb_output, sentiment_output, decision_output])
+            gr.Examples(examples=examples, inputs=input_text)
+        # Tab 2: keyword search over the bundled dataset.
+        with gr.TabItem("Company View", id=1):
+            gr.Markdown("# Sentiment Analysis and Purchase Decision Factor for Halal Food Acquisition")
+            # Rebinds the name input_text; the User View handlers above were already
+            # wired to the earlier Textbox, so they are unaffected.
+            input_text = gr.Textbox(lines=1, label="Search Keyword", placeholder="Enter keyword")
+            search_button = gr.Button("Search")
+            with gr.Row():
+                sentiment_chart = gr.Plot(label="Sentiment Distribution")
+                tpb_chart = gr.Plot(label="TPB Factor Distribution")
+            tpb_sentiment_chart = gr.Plot(label="Sentiment Distribution within TPB Factors")
+            # The four word clouds share a single row.
+            gr.Markdown("### Word Clouds by TPB Label")
+            with gr.Row():
+                attitude_wc = gr.Image(label="Attitude Word Cloud", height=200, width=300)
+                religious_knowledge_wc = gr.Image(label="Religious Knowledge Word Cloud", height=200, width=300)
+                subjective_norms_wc = gr.Image(label="Subjective Norms Word Cloud",height=200, width=300)
+                perceived_behavioural_control_wc = gr.Image(label="Perceived Behavioural Control Word Cloud", height=200, width=300)
+            with gr.Accordion("See Recommendation Details"):
+                report_output = gr.Markdown(label="Recommendation Report", elem_id="recommendation_report")
+            gr.Markdown("### Top Bigrams by TPB Label")
+            bigram_table = gr.Dataframe(label="Top Bigrams for Each TPB Label")
+            # NOTE(review): these headers may not match the columns of the rows that
+            # search_company returns from the dataset — confirm against the JSONL schema.
+            output_table = gr.Dataframe(
+                headers=["text", "tpb_label", "sentiment", "score"],
+                label="Company Analysis Results",
+                wrap=True
+            )
+            # search_company must return one value per component listed here (10 in total).
+            search_button.click(
+                fn=search_company,
+                inputs=input_text,
+                outputs=[
+                    sentiment_chart, tpb_chart, tpb_sentiment_chart, output_table, report_output,
+                    attitude_wc, religious_knowledge_wc, subjective_norms_wc, perceived_behavioural_control_wc,bigram_table
+                ]
+            )
+# Starts the app as soon as the module is executed.
+demo.launch()

attn_mask_utils.py ADDED Viewed

	@@ -0,0 +1,160 @@

+from typing import List, Optional, Tuple, Union
+import torch
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+def _prepare_4d_causal_attention_mask(
+    attention_mask: Optional[torch.Tensor],
+    input_shape: Union[torch.Size, Tuple, List],
+    inputs_embeds: torch.Tensor,
+    past_key_values_length: int,
+    sliding_window: Optional[int] = None,
+):
+    """
+    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`
+    Args:
+        attention_mask (`torch.Tensor` or `None`):
+            A 2D attention mask of shape `(batch_size, key_value_length)`
+        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
+            The input shape should be a tuple that defines `(batch_size, query_length)`.
+        inputs_embeds (`torch.Tensor`):
+            The embedded inputs as a torch Tensor.
+        past_key_values_length (`int`):
+            The length of the key value cache.
+        sliding_window (`int`, *optional*):
+            If the model uses windowed attention, a sliding window should be passed.
+    """
+    attn_mask_converter = AttentionMaskConverter(
+        is_causal=False, sliding_window=sliding_window
+    )  # is_causal=True in original implementation
+    key_value_length = input_shape[-1] + past_key_values_length
+    # 4d mask is passed through the layers
+    if attention_mask is not None and len(attention_mask.shape) == 2:
+        attention_mask = attn_mask_converter.to_4d(
+            attention_mask,
+            input_shape[-1],
+            key_value_length=key_value_length,
+            dtype=inputs_embeds.dtype,
+        )
+    elif attention_mask is not None and len(attention_mask.shape) == 4:
+        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
+        if tuple(attention_mask.shape) != expected_shape:
+            raise ValueError(
+                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
+            )
+        else:
+            # if the 4D mask has correct shape - invert it and fill with negative infinity
+            inverted_mask = 1.0 - attention_mask
+            attention_mask = inverted_mask.masked_fill(
+                inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
+            )
+    else:
+        attention_mask = attn_mask_converter.to_causal_4d(
+            input_shape[0],
+            input_shape[-1],
+            key_value_length,
+            dtype=inputs_embeds.dtype,
+            device=inputs_embeds.device,
+        )
+    return attention_mask
+# Adapted from _prepare_4d_causal_attention_mask
+def _prepare_4d_causal_attention_mask_for_sdpa(
+    attention_mask: Optional[torch.Tensor],
+    input_shape: Union[torch.Size, Tuple, List],
+    inputs_embeds: torch.Tensor,
+    past_key_values_length: int,
+    sliding_window: Optional[int] = None,
+):
+    """
+    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.
+    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
+    `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks,
+    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
+    Args:
+        attention_mask: 2D or 4D mask, or `None`.
+        input_shape: `(batch_size, query_length)`.
+        inputs_embeds: embedded inputs; supplies the dtype and device of the returned mask.
+        past_key_values_length: length of the key/value cache, added to the query length.
+        sliding_window: forwarded to `AttentionMaskConverter`.
+    Returns:
+        `None` when the mask can be dropped, otherwise a 4D mask tensor.
+    Raises:
+        ValueError: on a 4D mask with an unexpected shape, or when tracing without an `attention_mask`.
+    """
+    attn_mask_converter = AttentionMaskConverter(
+        is_causal=False, sliding_window=sliding_window
+    )  # is_causal=True in original implementation
+    key_value_length = input_shape[-1] + past_key_values_length
+    batch_size, query_length = input_shape
+    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1`
+    # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing.
+    # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).
+    is_tracing = (
+        torch.jit.is_tracing()
+        or isinstance(inputs_embeds, torch.fx.Proxy)
+        or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
+    )
+    if attention_mask is not None:
+        # 4d mask is passed through
+        if len(attention_mask.shape) == 4:
+            expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
+            if tuple(attention_mask.shape) != expected_shape:
+                raise ValueError(
+                    f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
+                )
+            else:
+                # if the 4D mask has correct shape - invert it and fill with negative infinity
+                inverted_mask = 1.0 - attention_mask.to(inputs_embeds.dtype)
+                attention_mask = inverted_mask.masked_fill(
+                    inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
+                )
+                return attention_mask
+        elif not is_tracing and torch.all(attention_mask == 1):
+            # Nothing is masked: the mask may be dropped in the two cases below.
+            if query_length == 1:
+                # For query_length == 1, causal attention and bi-directional attention are the same.
+                attention_mask = None
+            elif key_value_length == query_length:
+                attention_mask = None
+            else:
+                # Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore the attention mask, as SDPA causal mask generation
+                # may be wrong. We will set `is_causal=False` in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here.
+                # Reference: https://github.com/pytorch/pytorch/issues/108108
+                pass
+    elif query_length > 1 and key_value_length != query_length:
+        # See the comment above (https://github.com/pytorch/pytorch/issues/108108).
+        # Ugly: we set it to True here to dispatch in the following controlflow to `to_causal_4d`.
+        attention_mask = True
+    elif is_tracing:
+        raise ValueError(
+            'Attention using SDPA can not be traced with torch.jit.trace when no attention_mask is provided. To solve this issue, please either load your model with the argument `attn_implementation="eager"` or pass an attention_mask input when tracing the model.'
+        )
+    if attention_mask is None:
+        expanded_4d_mask = None
+    elif attention_mask is True:
+        # NOTE(review): the converter above is built with is_causal=False; confirm that
+        # `to_causal_4d` accepts such a converter in the installed transformers version.
+        expanded_4d_mask = attn_mask_converter.to_causal_4d(
+            input_shape[0],
+            input_shape[-1],
+            key_value_length,
+            dtype=inputs_embeds.dtype,
+            device=inputs_embeds.device,
+        )
+    else:
+        expanded_4d_mask = attn_mask_converter.to_4d(
+            attention_mask,
+            input_shape[-1],
+            dtype=inputs_embeds.dtype,
+            key_value_length=key_value_length,
+        )
+        # Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when
+        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        # Details: https://github.com/pytorch/pytorch/issues/110213
+        if not is_tracing and expanded_4d_mask.device.type == "cuda":
+            expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
+                expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min
+            )
+    return expanded_4d_mask

bidirectional_mistral.py ADDED Viewed

	@@ -0,0 +1,281 @@

+from typing import List, Optional, Tuple, Union
+import torch
+from transformers import (
+    MistralModel,
+    MistralPreTrainedModel,
+    MistralForCausalLM,
+    MistralConfig,
+)
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.models.mistral.modeling_mistral import (
+    MistralDecoderLayer,
+    MistralRMSNorm,
+    MistralAttention,
+    MistralFlashAttention2,
+    MistralSdpaAttention,
+    MistralMLP,
+)
+from torch import nn
+from transformers.utils import logging
+from attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
+logger = logging.get_logger(__name__)
+class ModifiedMistralAttention(MistralAttention):
+    """`MistralAttention` variant with `is_causal` forced to False."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Set after the parent constructor has run so that this value is the one that sticks.
+        self.is_causal = False
+class ModifiedMistralFlashAttention2(MistralFlashAttention2):
+    """`MistralFlashAttention2` variant with `is_causal` forced to False."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Set after the parent constructor has run so that this value is the one that sticks.
+        self.is_causal = False
+class ModifiedMistralSdpaAttention(MistralSdpaAttention):
+    """`MistralSdpaAttention` variant with `is_causal` forced to False."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Set after the parent constructor has run so that this value is the one that sticks.
+        self.is_causal = False
+# Maps `config._attn_implementation` to the non-causal attention class used by
+# ModifiedMistralDecoderLayer.
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": ModifiedMistralAttention,
+    "flash_attention_2": ModifiedMistralFlashAttention2,
+    "sdpa": ModifiedMistralSdpaAttention,
+}
+class ModifiedMistralDecoderLayer(MistralDecoderLayer):
+    """Mistral decoder layer whose self-attention comes from `MISTRAL_ATTENTION_CLASSES`."""
+    def __init__(self, config: MistralConfig, layer_idx: int):
+        # Call nn.Module.__init__ directly rather than MistralDecoderLayer.__init__,
+        # so the submodules below are the only ones constructed for this layer.
+        nn.Module.__init__(self)
+        self.hidden_size = config.hidden_size
+        # Attention class is selected by the configured attention implementation.
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](
+            config, layer_idx
+        )
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = MistralRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+class MistralBiModel(MistralModel):
+    def __init__(self, config: MistralConfig):
+        MistralPreTrainedModel.__init__(self, config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(
+            config.vocab_size, config.hidden_size, self.padding_idx
+        )
+        self.layers = nn.ModuleList(
+            [
+                ModifiedMistralDecoderLayer(config, layer_idx)
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self._attn_implementation = config._attn_implementation
+        self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    # Copied from forward() in transformers.models.mistral.modeling_mistral.MistralModel
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError(
+                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+            )
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        past_key_values_length = 0
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if (
+            attention_mask is not None
+            and self._attn_implementation == "flash_attention_2"
+            and use_cache
+        ):
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
+                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                )
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = (
+                attention_mask
+                if (attention_mask is not None and 0 in attention_mask)
+                else None
+            )
+        elif self._attn_implementation == "sdpa" and not output_attentions:
+            # The original implementation is by-passed, see attn_mask_utils.py
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
+        hidden_states = inputs_embeds
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = None
+        if use_cache:
+            next_cache = (
+                next_decoder_cache.to_legacy_cache()
+                if use_legacy_cache
+                else next_decoder_cache
+            )
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class MistralBiForMNTP(MistralForCausalLM):
+    def __init__(self, config):
+        MistralPreTrainedModel.__init__(self, config)
+        self.model = MistralBiModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()

classifier.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from bidirectional_mistral import MistralBiModel
+from transformers import MistralPreTrainedModel
+import torch
+import numpy as np
+from typing import Optional, List
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.modeling_outputs import SequenceClassifierOutputWithPast
+class MistralForSequenceClassification(MistralPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = MistralBiModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        token_type_ids: Optional[bool] = None
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = transformer_outputs[0][:, 0]
+        logits = self.score(pooled_output)
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + transformer_outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )

en.json ADDED Viewed

	@@ -0,0 +1 @@

+ ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't"
,"it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two",
"u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]

sentiment-tpb-dataset.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff