|
--- |
|
license: apache-2.0 |
|
metrics: |
|
- accuracy |
|
pipeline_tag: text-classification |
|
tags: |
|
- cnn |
|
- amazon_reviews |
|
datasets: |
|
- yassiracharki/Amazon_Reviews_Binary_for_Sentiment_Analysis |
|
language: |
|
- en |
|
library_name: keras
|
--- |
|
# Model Card for Binary Sentiment Analysis CNN (Amazon Reviews)

A TensorFlow/Keras convolutional neural network for binary sentiment classification (negative/positive) of English Amazon reviews. The snippets below show how to load the pre-trained model and prepare the data in a Kaggle notebook.
|
|
|
# Downloads |
|
```python
!pip install contractions
!pip install textsearch
!pip install tqdm

import nltk
nltk.download('punkt')
```
|
|
|
```python
# Fundamental classes
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

# Time
import time
import datetime

# Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import contractions
from bs4 import BeautifulSoup
import re
import tqdm
import unicodedata

seed = 3541
np.random.seed(seed)
```
|
|
|
```python
# A dummy loss originally defined to bypass a deserialization error during model loading;
# it is not actually needed here because the model is loaded with compile=False below
def dummy_loss(y_true, y_pred):
    return tf.reduce_mean(y_pred - y_true)

# Loading the model trained on Amazon reviews
modelAmazon = keras.models.load_model(
    '/kaggle/input/pre-trained-model-binary-cnn-nlp-amazon-reviews/tensorflow1/pre_trained_sentiment_analysis_cnn_model_amazon_reviews/1/Binary_Classification_86_Amazon_Reviews_CNN.h5',
    compile=False
)

# Compile the model with the correct loss function and reduction
modelAmazon.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE),
    metrics=['accuracy']
)
```
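As an optional sanity check (not part of the original snippet), the standard Keras `summary()` call prints the loaded architecture and parameter counts:

```python
# Optional: inspect the loaded CNN architecture and parameter counts
modelAmazon.summary()
```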
|
|
|
```python
# Loading the Amazon test data
dataset_test_Amazon = pd.read_csv('/kaggle/input/amazon-reviews-for-sa-binary-negative-positive-csv/amazon_review_sa_binary_csv/test.csv')

# Loading the Amazon train data (only used to fit the label encoder)
dataset_train_Amazon = pd.read_csv('/kaggle/input/amazon-reviews-for-sa-binary-negative-positive-csv/amazon_review_sa_binary_csv/train.csv')

# Shuffling the test and train data
test_Amazon = dataset_test_Amazon.sample(frac=1)
train_Amazon = dataset_train_Amazon.sample(frac=1)

# Keeping only a tiny portion of the train data (it is only needed for the label encoder)
train_Amazon = train_Amazon.iloc[:100, :]

# Keeping only the necessary columns
y_test_Amazon = test_Amazon['class_index'].values
X_train_Amazon = train_Amazon['review_text'].values
y_train_Amazon = train_Amazon['class_index'].values

# Text preprocessing: expand contractions, strip HTML, remove accents and non-letter characters, lowercase
def pre_process_corpus(corpus):
    processed_corpus = []
    for doc in tqdm.tqdm(corpus):
        doc = contractions.fix(doc)
        doc = BeautifulSoup(doc, "html.parser").get_text()
        doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)  # flags must be passed by keyword (the 4th positional arg of re.sub is `count`)
        doc = doc.lower()
        doc = doc.strip()
        processed_corpus.append(doc)
    return processed_corpus

# Preprocessing the data
X_test_Amazon = pre_process_corpus(test_Amazon['review_text'].values)
X_train_Amazon = pre_process_corpus(X_train_Amazon)
```
|
|
|
The remaining steps (creating and fitting the tokenizer, padding the sequences, encoding the labels, and running the evaluation) are covered in the full notebook linked below.
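For orientation, here is a minimal sketch of those remaining steps. It is not the author's original code: the `oov_token`, the `maxlen` value, the label mapping, and fitting the tokenizer on the small training sample loaded above are illustrative assumptions, and the real settings must match the training notebook on Kaggle to reproduce the reported accuracy.

```python
# Minimal sketch of the remaining steps (placeholder settings, not the original notebook's values)

# Fit the tokenizer; the vocabulary must match the one used when the model was trained
t = Tokenizer(oov_token='<UNK>')
t.fit_on_texts(X_train_Amazon)

# Pad the test sequences to a fixed length (placeholder; use the length the model was trained with)
maxlen = 200
X_test_seq = sequence.pad_sequences(t.texts_to_sequences(X_test_Amazon), maxlen=maxlen)

# Encode class_index (assumed 1 = negative, 2 = positive) into 0/1 labels
le = LabelEncoder()
le.fit(y_train_Amazon)
y_test_enc = le.transform(y_test_Amazon)

# Evaluate on the test set (assumes a single sigmoid output, consistent with the binary cross-entropy loss above)
loss, accuracy = modelAmazon.evaluate(X_test_seq, y_test_enc, batch_size=128)
print(f'Test accuracy: {accuracy:.4f}')

# Predict the sentiment of the first few test reviews
preds = (modelAmazon.predict(X_test_seq[:5]) > 0.5).astype(int)
print(preds.ravel())
```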
|
|
|
More information is available on the model's page on Kaggle:
|
|
|
https://www.kaggle.com/models/yacharki/pre-trained-model-binary-cnn-nlp-amazon-reviews |