File size: 2,954 Bytes
d982179 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
---
license: apache-2.0
metrics:
- accuracy
pipeline_tag: text-classification
tags:
- CNN
- NLP
- Yelp
- Reviews
- pre_trained
---
# Model Card — Binary CNN for Yelp Review Sentiment Classification
# Install dependencies
!pip install contractions
!pip install textsearch
!pip install tqdm
import nltk
nltk.download('punkt')
# Fundamental classes
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
# Time
import time
import datetime
# Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import contractions
from bs4 import BeautifulSoup
import re
import tqdm
import unicodedata
seed = 3541
np.random.seed(seed)
# Define a dummy loss to bypass the error during model loading
def dummy_loss(y_true, y_pred):
return tf.reduce_mean(y_pred - y_true)
# Loading the model Trained on Yelp reviews
modelYelp = keras.models.load_model(
'/kaggle/input/pre-trained-model-binary-cnn-nlp-yelpreviews/tensorflow1/pre-trained-model-binary-cnn-nlp-yelp-reviews/1/Binary_Classification_90_Yelp_Reviews_CNN.h5',
compile=False
)
# Compile the model with the correct loss function and reduction
modelYelp.compile(
optimizer='adam',
loss=keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE),
metrics=['accuracy']
)
# Loading Yelp test data
dataset_test_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/test.csv')
# Loading Yelp train data (to be used on the label encoder)
dataset_train_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/train.csv')
# Shuffling the Test Data
test_Yelp = dataset_test_Yelp.sample(frac=1)
train_Yelp = dataset_train_Yelp.sample(frac=1)
# Taking a tiny portion of the database (because it will only be used on the label encoder)
train_Yelp = dataset_train_Yelp.iloc[:100, :]
# Taking only necessary columns
y_test_Yelp = test_Yelp['class_index'].values
X_train_Yelp = train_Yelp['review_text'].values
y_train_Yelp = train_Yelp['class_index'].values
# Preprocess corpus function
def pre_process_corpus(corpus):
processed_corpus = []
for doc in tqdm.tqdm(corpus):
doc = contractions.fix(doc)
doc = BeautifulSoup(doc, "html.parser").get_text()
doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')
doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
doc = doc.lower()
doc = doc.strip()
processed_corpus.append(doc)
return processed_corpus
# Preprocessing the Data
X_test_Yelp = pre_process_corpus(test_Yelp['review_text'].values)
X_train_Yelp = pre_process_corpus(X_train_Yelp)
# Creating and Fitting the Tokenizer
# ... (tokenizer creation/fitting and evaluation code omitted for brevity)
# More info on the model page on Kaggle:
https://www.kaggle.com/models/yacharki/pre-trained-model-binary-cnn-nlp-yelpreviews