Answer To: CSC 180-01/02 Intelligent Systems (Fall 2021) Project 1: Yelp Business Rating Prediction using...
Karthi answered on Sep 23 2021
nn_network
In [ ]:
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn import svm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D, GRU
from keras.layers import Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.neighbors import KNeighborsClassifier
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from keras.models import Sequential
%matplotlib inline
In [ ]:
# Load the Yelp business table and the full review table from local CSV exports.
business = pd.read_csv(filepath_or_buffer="/home/kai/yelp_dataset/business.csv")
review_all = pd.read_csv(filepath_or_buffer="/home/kai/yelp_dataset/review.csv")
In [ ]:
# Keep only businesses whose category string mentions "Restaurant".
# na=False treats a missing category as "not a restaurant", which is what the
# comparison against True achieved before.
a = business[business['categories'].str.contains('Restaurant', na=False)]
# Restrict the reviews to those written about the selected businesses.
rev = review_all[review_all['business_id'].isin(a['business_id'])]
In [ ]:
# Draw a reproducible random sample of reviews, then split it positionally:
# the first 280000 sampled rows for training, the remainder for testing.
rev_samp = rev.sample(n=350000, random_state=42)
train, test = rev_samp.iloc[:280000], rev_samp.iloc[280000:]
In [ ]:
# Display the (rows, columns) of both splits as a sanity check.
(train.shape, test.shape)
Out[ ]:
((280000, 9), (70000, 9))
In [ ]:
# Only the review text and its star rating are used from here on.
train = train.loc[:, ['text', 'stars']]
# Plot the distribution of star ratings in the training split.
train['stars'].hist()
# Preview the first rows (this is the cell's displayed value).
train.head()
Out[ ]:
text stars
2760442 Second time here.... first time had the pulled... 5
3014452 Great place. Like their sauce and lunch specia... 5
2876979 So goooooooood and so simple! I love their pel... 5
469097 We stopped in for a late lunch on a Tuesday af... 3
4971248 A great option to try hakka chinese since its ... 4
In [ ]:
# One-hot encode the star rating into one indicator column per rating value
# (stars_1 ... stars_5 in the output shown below).
train = pd.get_dummies(data=train, columns=['stars'])
train.head()
Out[ ]:
text stars_1 stars_2 stars_3 stars_4 stars_5
2760442 Second time here.... first time had the pulled... 0 0 0 0 1
3014452 Great place. Like their sauce and lunch specia... 0 0 0 0 1
2876979 So goooooooood and so simple! I love their pel... 0 0 0 0 1
469097 We stopped in for a late lunch on a Tuesday af... 0 0 1 0 0
4971248 A great option to try hakka chinese since its ... 0 0 0 1 0
In [ ]:
# Apply the same preparation to the test split: keep text + stars, then
# one-hot encode the star rating.
test = pd.get_dummies(data=test[['text', 'stars']], columns=['stars'])
(train.shape, test.shape)
Out[ ]:
((280000, 6), (70000, 6))
In [ ]:
# Take a reproducible 10% subsample of each split to keep model fitting fast.
train_samp, test_samp = (
    frame.sample(frac=0.1, random_state=42) for frame in (train, test)
)
(train_samp.shape, test_samp.shape)
Out[ ]:
((28000, 6), (7000, 6))
In [ ]:
# Cap on the vocabulary size handed to the TF-IDF vectorizer.
max_features = 2000
# Unfitted vectorizer; it is fitted later as the first pipeline step.
tfidf = TfidfVectorizer(max_features=max_features)
In [ ]:
class NBFeatures(BaseEstimator):
    """Naive Bayes log-ratio feature scaler (Jeremy Howard's NB linear model).

    ``fit`` computes, for every feature column, the log of the ratio between
    the smoothed column statistic of rows labelled 1 and that of rows
    labelled 0.  ``transform`` multiplies its input element-wise by that
    ratio row.

    NOTE(review): ``x`` is presumably a scipy sparse matrix (``transform``
    calls ``x.multiply``); confirm against the upstream pipeline step.
    """

    def __init__(self, alpha):
        # Additive smoothing term applied to both the numerator and the
        # denominator in pr().
        self.alpha = alpha

    def preprocess_x(self, x, r):
        """Return ``x`` multiplied element-wise by the ratio ``r``."""
        return x.multiply(r)

    def pr(self, x, y_i, y):
        """Return the smoothed column statistic for rows labelled ``y_i``.

        The rows of ``x`` whose label equals ``y_i`` are summed column-wise;
        ``alpha`` is added to that sum and to the count of matching rows
        before dividing.
        """
        # Coerce to an ndarray so the comparison is always element-wise and
        # positional: a plain list would otherwise compare as a whole and
        # produce a single False instead of a row mask.
        mask = np.asarray(y) == y_i
        p = x[mask].sum(0)
        return (p + self.alpha) / (mask.sum() + self.alpha)

    def fit(self, x, y=None):
        """Compute the log-ratio row from ``x`` and binary labels ``y``.

        Raises:
            ValueError: if ``y`` is not supplied; the ratio cannot be
                computed without labels.
        """
        if y is None:
            raise ValueError("NBFeatures.fit requires the target labels y")
        labels = np.asarray(y)
        # Log of (class-1 statistic / class-0 statistic), stored sparse so it
        # can be multiplied into the feature matrix in transform().
        self._r = sparse.csr_matrix(
            np.log(self.pr(x, 1, labels) / self.pr(x, 0, labels))
        )
        return self

    def transform(self, x):
        """Scale ``x`` by the ratio row computed in ``fit``."""
        return self.preprocess_x(x, self._r)
In [ ]:
# Pipeline stages: TF-IDF vectorization -> NB log-ratio scaling -> logistic
# regression classifier.
lr = LogisticRegression()
nb = NBFeatures(alpha=1)
p = Pipeline(steps=[('tfidf', tfidf), ('nb', nb), ('lr', lr)])
In [ ]:
# One binary (one-vs-rest) model is cross-validated, fitted and applied per
# star-rating indicator column.
class_names = ['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']
scores = []
# preds[:, i] receives the predicted probability of class_names[i] for each
# test review.
preds = np.zeros(shape=(test_samp.shape[0], len(class_names)))

# The raw review texts do not change between iterations, so extract them once.
train_text = train_samp['text'].values
test_text = test_samp['text'].values

for i, class_name in enumerate(class_names):
    train_target = train_samp[class_name]

    # Mean accuracy over 3 cross-validation folds on the training subsample.
    fold_scores = cross_val_score(
        estimator=p,
        X=train_text,
        y=train_target,
        cv=3,
        scoring='accuracy',
    )
    cv_score = np.mean(fold_scores)
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the whole training subsample, then keep the probability of the
    # second class returned by predict_proba (column index 1).
    p.fit(train_text, train_target)
    preds[:, i] = p.predict_proba(test_text)[:, 1]
CV score for class stars_1 is 0.9282499819604656
CV score for class stars_2 is 0.90339283521352
CV score for class stars_3 is 0.8591786654537303
CV score for class stars_4 is 0.7321071676830603
CV score for class stars_5 is...