Sentiment Analysis of Movie Reviews with a BP Neural Network

1. Preparing the Data

import pandas as pd

data = pd.DataFrame()
data['reviews'] = pd.read_table('reviews.txt', header=None, names=['reviews'])['reviews']
data['labels'] = pd.read_table('labels.txt', header=None, names=['labels'])['labels']
pos_data = data[data['labels'] == 'positive'].reset_index(drop=True)
neg_data = data[data['labels'] == 'negative'].reset_index(drop=True)
pos_data.iloc[:,0][:3]
list(neg_data.iloc[:,0][:1])
['story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane  violent mob by the crazy chantings of it  s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it  s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly .  ']
from pyecharts import Pie, Style

pie = Pie("Positive vs. negative sample split", title_pos='left')
style = Style()
pie_style = style.add(label_pos='left',
                      is_label_show=True,
                      label_text_color=True,
                      label_text_size=20)

pie.add("number of samples", ["positive", "negative"], [len(pos_data), len(neg_data)], **pie_style)
pie

(Screenshot 2018-07-26: pie chart of the positive/negative sample split)

2. A Quick Check of the Idea

Looking at the data, positive reviews tend to contain words of praise while negative reviews tend to contain words of criticism. Can we predict a review's polarity from this feature alone?

Let's run a quick check of this idea.

from collections import Counter
import numpy as np
from time import time
data.iloc[1][0]
'story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane  violent mob by the crazy chantings of it  s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it  s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly .  '
t0 = time()
# Build three counters: word frequencies for positive reviews, for negative reviews, and for all reviews combined.

positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

for words in pos_data['reviews']:
    for word in words.split(" "):
        positive_counts[word] += 1
        total_counts[word] += 1

for words in neg_data['reviews']:
    for word in words.split(" "):
        negative_counts[word] += 1
        total_counts[word] += 1

print("Done.\nElapsed: {} seconds.".format(time() - t0))
Done.
Elapsed: 5.25712513923645 seconds.
# positive_counts.most_common()
# negative_counts.most_common()

Looking at both counters, the most frequent words in positive and negative reviews alike are mostly common words such as a, the, and, of, and so on. Next, let's combine the positive and negative counters and examine them together.

# For each word, compute the ratio of its count in positive reviews to its count in negative reviews, then take the log.

pos_neg_ratios = Counter()

for term, cnt in total_counts.most_common():
    # if cnt > 100:
    pos_neg_ratio = positive_counts[term] / float(negative_counts[term] + 1)
    # The + 1 avoids division by zero.
    pos_neg_ratios[term] = pos_neg_ratio

for word, ratio in pos_neg_ratios.most_common():
    if(ratio > 1):
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
# A ratio below 1 means the word appears mostly in negative reviews, so we take the negative log of its reciprocal; the + 0.01 likewise avoids division by zero.
# In the final result, the further a word's value is from zero, the more strongly it signals sentiment: large positive values indicate positive words, large negative values indicate negative words.
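As a quick sanity check of this log-ratio transform (the counts below are made up purely for illustration): a word seen 100 times in positive reviews and 10 times in negative ones lands around +2.2, while a word with the opposite counts lands around -2.2.

# Made-up counts, for illustration only
print(np.log(100 / float(10 + 1)))                 # ≈ 2.21  -> strongly positive
print(-np.log(1 / (10 / float(100 + 1) + 0.01)))   # ≈ -2.21 -> strongly negative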
# Most positive words
pos_neg_ratios.most_common()[:5]
[('edie', 4.6913478822291435),
 ('antwone', 4.477336814478207),
 ('din', 4.406719247264253),
 ('gunga', 4.189654742026425),
 ('goldsworthy', 4.174387269895637)]
# Most negative words
list(reversed(pos_neg_ratios.most_common()))[:5]
[('whelk', -4.605170185988092),
 ('pressurized', -4.605170185988092),
 ('bellwood', -4.605170185988092),
 ('mwuhahahaa', -4.605170185988092),
 ('insulation', -4.605170185988092)]

These observations give an initial confirmation that the idea is workable: some words appear far more often in positive reviews than in negative ones, and vice versa.

3. Building a Neural Network Prototype

from IPython.display import Image

review = "This is a horrible, terrible movie."

Image(filename='sentiment_network.png')

# This is the simple neural network prototype built on the idea above.

(Figure: sentiment_network.png)

review = 'The movie was excellent.'

Image(filename='sentiment_network_pos.png')

(Figure: sentiment_network_pos.png)

4. Converting Each Review into an Input Vector

For simplicity we just count how many times each word appears in a review. Real data is usually messier than this: the flood of irrelevant words seriously distorts the input. A simple way of dealing with that is introduced later; for a more refined weighting scheme, see TF-IDF (a hedged sketch follows).
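For reference only (it is not used in the rest of this post), here is a minimal TF-IDF sketch; it assumes scikit-learn is installed and reuses the data['reviews'] column loaded earlier.

from sklearn.feature_extraction.text import TfidfVectorizer

# Each review becomes a sparse vector of TF-IDF weights instead of raw counts,
# which automatically down-weights very common words such as "a" and "the".
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['reviews'])
print(tfidf_matrix.shape)  # (number of reviews, vocabulary size)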

vocab = set(total_counts.keys())
vocab_size = len(vocab)
print("Input vector dimension:", vocab_size)
Input vector dimension: 74074
# Create a container; note that this is a 2-D array with a single row
layer_0 = np.zeros((1, vocab_size))
layer_0
array([[0., 0., 0., ..., 0., 0., 0.]])
# Build a word-to-index dictionary
word2index = {}

for i, word in enumerate(vocab):
    word2index[word] = i
# word2index
def update_input_layer(review):
    layer_0 = np.zeros((1, vocab_size))
    for word in review.split(" "):
        layer_0[0][word2index[word]] += 1
    return layer_0

layer_0 = update_input_layer(reviews[0])  # test: convert the review reviews[0] into a numeric vector
layer_0

# For efficiency, the counts could also be normalized directly.
array([[18.,  0.,  0., ...,  0.,  0.,  0.]])
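As the comment above suggests, the counts could be normalized; a minimal sketch (not used later) is to divide each vector by its sum so the entries become word frequencies:

layer_0 = update_input_layer(reviews[0])
layer_0 = layer_0 / layer_0.sum()   # word frequencies instead of raw counts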

Creating the labels

def get_target_for_label(label):
    if label == 'POSITIVE':
        return 1
    else:
        return 0

# X = layer_1
# y = [*map(get_target_for_label, labels)]

5. Building the Neural Network

import time
import sys
import numpy as np

class SentimentNetwork:
    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):

        np.random.seed(1)
        self.pre_process_data(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):

        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)

        self.review_vocab = list(review_vocab)
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        self.label_vocab = list(label_vocab)
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i

        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))
        self.layer_0 = np.zeros((1, input_nodes))

    def update_input_layer(self, review):
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] += 1

    def get_target_for_label(self, label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        return output * (1 - output)

    def train(self, training_reviews, training_labels):
        assert(len(training_reviews) == len(training_labels))
        correct_so_far = 0
        start = time.time()
        for i in range(len(training_reviews)):

            review = training_reviews[i]
            label = training_labels[i]

            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label)  # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)  # errors propagated to the hidden layer
            layer_1_delta = layer_1_error  # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate  # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate  # update input-to-hidden weights with gradient descent step

            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):

        correct = 0
        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):

        self.update_input_layer(review.lower())
        layer_1 = self.layer_0.dot(self.weights_0_1)
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

6. Initial Tests

1. Before any training, check that the untrained network scores about 50% on the test set, i.e. no better than chance. (From here on, reviews and labels are assumed to be plain lists of review strings and upper-case 'POSITIVE'/'NEGATIVE' labels, prepared as in the original notebook.)

mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
mlp.test(reviews[-1000:],labels[-1000:])
Progress:99.9% Speed(reviews/sec):1553. #Correct:500 #Tested:1000 Testing Accuracy:50.0%
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):367.6 #Correct:1251 #Trained:2501 Training Accuracy:50.0%
Progress:20.8% Speed(reviews/sec):366.4 #Correct:2501 #Trained:5001 Training Accuracy:50.0%
Progress:31.2% Speed(reviews/sec):367.6 #Correct:3751 #Trained:7501 Training Accuracy:50.0%
Progress:41.6% Speed(reviews/sec):368.7 #Correct:5001 #Trained:10001 Training Accuracy:50.0%
Progress:52.0% Speed(reviews/sec):368.5 #Correct:6251 #Trained:12501 Training Accuracy:50.0%
Progress:62.5% Speed(reviews/sec):368.6 #Correct:7501 #Trained:15001 Training Accuracy:50.0%
Progress:72.9% Speed(reviews/sec):368.6 #Correct:8751 #Trained:17501 Training Accuracy:50.0%
Progress:83.3% Speed(reviews/sec):368.5 #Correct:10001 #Trained:20001 Training Accuracy:50.0%
Progress:93.7% Speed(reviews/sec):368.7 #Correct:11251 #Trained:22501 Training Accuracy:50.0%
Progress:99.9% Speed(reviews/sec):368.4 #Correct:12000 #Trained:24000 Training Accuracy:50.0%

2. Start training properly. The accuracy stays stuck at 50% with no improvement; perhaps the learning rate is too high for the model to converge.

Lower the learning rate and try again.

mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):334.6 #Correct:1248 #Trained:2501 Training Accuracy:49.9%
Progress:20.8% Speed(reviews/sec):334.0 #Correct:2498 #Trained:5001 Training Accuracy:49.9%
Progress:31.2% Speed(reviews/sec):340.9 #Correct:3748 #Trained:7501 Training Accuracy:49.9%
Progress:41.6% Speed(reviews/sec):348.0 #Correct:4998 #Trained:10001 Training Accuracy:49.9%
Progress:52.0% Speed(reviews/sec):348.3 #Correct:6248 #Trained:12501 Training Accuracy:49.9%
Progress:62.5% Speed(reviews/sec):346.7 #Correct:7490 #Trained:15001 Training Accuracy:49.9%
Progress:72.9% Speed(reviews/sec):348.8 #Correct:8746 #Trained:17501 Training Accuracy:49.9%
Progress:83.3% Speed(reviews/sec):348.0 #Correct:9996 #Trained:20001 Training Accuracy:49.9%
Progress:93.7% Speed(reviews/sec):346.8 #Correct:11246 #Trained:22501 Training Accuracy:49.9%
Progress:99.9% Speed(reviews/sec):347.0 #Correct:11995 #Trained:24000 Training Accuracy:49.9%

3. Lower the learning rate further.

mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.001)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):318.7 #Correct:1263 #Trained:2501 Training Accuracy:50.4%
Progress:20.8% Speed(reviews/sec):319.5 #Correct:2615 #Trained:5001 Training Accuracy:52.2%
Progress:31.2% Speed(reviews/sec):319.9 #Correct:4035 #Trained:7501 Training Accuracy:53.7%
Progress:41.6% Speed(reviews/sec):320.5 #Correct:5566 #Trained:10001 Training Accuracy:55.6%
Progress:52.0% Speed(reviews/sec):320.3 #Correct:7047 #Trained:12501 Training Accuracy:56.3%
Progress:62.5% Speed(reviews/sec):320.1 #Correct:8658 #Trained:15001 Training Accuracy:57.7%
Progress:72.9% Speed(reviews/sec):319.8 #Correct:10202 #Trained:17501 Training Accuracy:58.2%
Progress:83.3% Speed(reviews/sec):319.5 #Correct:11889 #Trained:20001 Training Accuracy:59.4%
Progress:93.7% Speed(reviews/sec):319.3 #Correct:13525 #Trained:22501 Training Accuracy:60.1%
Progress:99.9% Speed(reviews/sec):319.2 #Correct:14574 #Trained:24000 Training Accuracy:60.7%

4. Add more hidden nodes.

mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], hidden_nodes=15, learning_rate=0.0003)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):269.4 #Correct:1304 #Trained:2501 Training Accuracy:52.1%
Progress:20.8% Speed(reviews/sec):269.0 #Correct:2765 #Trained:5001 Training Accuracy:55.2%
Progress:31.2% Speed(reviews/sec):268.6 #Correct:4395 #Trained:7501 Training Accuracy:58.5%
Progress:41.6% Speed(reviews/sec):268.5 #Correct:6083 #Trained:10001 Training Accuracy:60.8%
Progress:52.0% Speed(reviews/sec):267.9 #Correct:7774 #Trained:12501 Training Accuracy:62.1%
Progress:62.5% Speed(reviews/sec):267.2 #Correct:9483 #Trained:15001 Training Accuracy:63.2%
Progress:72.9% Speed(reviews/sec):266.0 #Correct:11199 #Trained:17501 Training Accuracy:63.9%
Progress:83.3% Speed(reviews/sec):265.0 #Correct:13023 #Trained:20001 Training Accuracy:65.1%
Progress:93.7% Speed(reviews/sec):264.3 #Correct:14854 #Trained:22501 Training Accuracy:66.0%
Progress:99.9% Speed(reviews/sec):263.7 #Correct:15993 #Trained:24000 Training Accuracy:66.6%

Summary of the first round of training:

From the runs above: when the learning rate is too large, the model cannot converge at all. At 0.001 it finally starts to improve, though slowly. Normally a model improves quickly at first and then more and more slowly.

Here the improvement already slows down once accuracy reaches about 60%, so adding more iterations alone will not help much. So we ask what might be causing this.

Problems to solve:

  • training is too slow
  • accuracy is too low

Possible causes:

  • the model has too few hidden nodes and is too simple
  • the data itself is noisy, which hurts the model

The test in step 4 shows that increasing the number of hidden nodes (to 20) by itself did not improve the results and even made them worse; only after lowering the learning rate further, to 0.0003, did accuracy improve.
Finding: when you add more hidden nodes, you also need to lower the learning rate.

If we compare the neural network to an excavator, our goal is to dig valuable gold out of the data. When we fail to find gold at first, it is often not because the excavator cannot dig deep enough, but because we are digging in the wrong place or operating it the wrong way. So let's go back to the data set and think about noise versus signal.

7. Analyzing the Noise

About the noise

Looking at the data, whitespace tokens and words such as a, the, at... make up most of each review, sometimes appearing dozens of times. Fed into the model as raw counts, this gives these sentiment-irrelevant words very large weight and drowns out the words that really matter, most of which appear only once. The simplest fix is, when converting a review into a numeric vector, not to accumulate word counts but simply to set each present word to 1, like this:

def update_input_layer(self, review):
    self.layer_0 *= 0
    for word in review.split(" "):
        if(word in self.word2index.keys()):
            self.layer_0[0][self.word2index[word]] = 1

Let's re-run the experiment with this change:

import time
import sys
import numpy as np

class SentimentNetwork:
    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):

        np.random.seed(1)
        self.pre_process_data(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):

        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)

        self.review_vocab = list(review_vocab)
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        self.label_vocab = list(label_vocab)
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i

        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))
        self.layer_0 = np.zeros((1, input_nodes))

    def update_input_layer(self, review):
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] = 1

    def get_target_for_label(self, label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0

    def sigmoid(self, x):
        # return 1 / (1 + np.exp(-x))
        # Note: the expression below is tanh rather than the logistic sigmoid commented out
        # above; its true derivative would be 1 - output**2, not output * (1 - output).
        return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        return output * (1 - output)

    def train(self, training_reviews, training_labels):
        assert(len(training_reviews) == len(training_labels))
        correct_so_far = 0
        start = time.time()
        for i in range(len(training_reviews)):

            review = training_reviews[i]
            label = training_labels[i]

            self.update_input_layer(review)

            # Hidden layer
            layer_1 = self.layer_0.dot(self.weights_0_1)
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label)  # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)  # errors propagated to the hidden layer
            layer_1_delta = layer_1_error  # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate  # update hidden-to-output weights with gradient descent step
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate  # update input-to-hidden weights with gradient descent step

            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):

        correct = 0
        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):

        self.update_input_layer(review.lower())
        layer_1 = self.layer_0.dot(self.weights_0_1)
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], hidden_nodes=15, learning_rate=0.0003)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):206.7 #Correct:1909 #Trained:2501 Training Accuracy:76.3%
Progress:20.8% Speed(reviews/sec):207.2 #Correct:3917 #Trained:5001 Training Accuracy:78.3%
Progress:31.2% Speed(reviews/sec):206.2 #Correct:5979 #Trained:7501 Training Accuracy:79.7%
Progress:41.6% Speed(reviews/sec):204.5 #Correct:8068 #Trained:10001 Training Accuracy:80.6%
Progress:52.0% Speed(reviews/sec):203.6 #Correct:10165 #Trained:12501 Training Accuracy:81.3%
Progress:62.5% Speed(reviews/sec):203.1 #Correct:12236 #Trained:15001 Training Accuracy:81.5%
Progress:72.9% Speed(reviews/sec):202.6 #Correct:14312 #Trained:17501 Training Accuracy:81.7%
Progress:83.3% Speed(reviews/sec):202.2 #Correct:16455 #Trained:20001 Training Accuracy:82.2%
Progress:93.7% Speed(reviews/sec):202.0 #Correct:18598 #Trained:22501 Training Accuracy:82.6%
Progress:99.9% Speed(reviews/sec):201.8 #Correct:19894 #Trained:24000 Training Accuracy:82.8%
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], hidden_nodes=10, learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):264.1 #Correct:1812 #Trained:2501 Training Accuracy:72.4%
Progress:20.8% Speed(reviews/sec):264.1 #Correct:3802 #Trained:5001 Training Accuracy:76.0%
Progress:31.2% Speed(reviews/sec):264.2 #Correct:5896 #Trained:7501 Training Accuracy:78.6%
Progress:41.6% Speed(reviews/sec):264.2 #Correct:8045 #Trained:10001 Training Accuracy:80.4%
Progress:52.0% Speed(reviews/sec):264.2 #Correct:10172 #Trained:12501 Training Accuracy:81.3%
Progress:62.5% Speed(reviews/sec):264.2 #Correct:12319 #Trained:15001 Training Accuracy:82.1%
Progress:72.9% Speed(reviews/sec):264.0 #Correct:14438 #Trained:17501 Training Accuracy:82.4%
Progress:83.3% Speed(reviews/sec):263.7 #Correct:16615 #Trained:20001 Training Accuracy:83.0%
Progress:93.7% Speed(reviews/sec):263.5 #Correct:18796 #Trained:22501 Training Accuracy:83.5%
Progress:99.9% Speed(reviews/sec):263.5 #Correct:20117 #Trained:24000 Training Accuracy:83.8%
  • Check the performance on the test set
mlp.test(reviews[-1000:],labels[-1000:])
Progress:99.9% Speed(reviews/sec):1620. #Correct:849 #Tested:1000 Testing Accuracy:84.9%

8. Analyzing the Network's Computational Inefficiency

The data tweak above greatly improved the model's accuracy, but training is still slow. So how can we speed it up?

Image(filename='sentiment_network.png')

(Figure: sentiment_network.png)

def update_input_layer(review):
    layer_0 = np.zeros((1, vocab_size))
    for word in review.split(" "):
        layer_0[0][word2index[word]] = 1
    return layer_0

layer_0 = update_input_layer(reviews[0])  # convert the review reviews[0] into a numeric vector
layer_0.sum()
93.0

Consider the model again: the input layer has vocab_size (74074) nodes, yet reviews[0] has only 93 non-zero entries. The input vector is extremely sparse; the vast majority of entries are 0, and multiplying a 0 input by a weight still yields 0, which contributes nothing and only adds computation. How can we avoid this?

The idea:
Record the indices of the non-zero elements, and when computing the input-to-hidden pass, only pick up the weights at those indices and sum them. This saves most of the computation.

An example:

layer_0 = np.zeros(10)
layer_0[4] = 1
layer_0[9] = 1

weight_0_1 = np.random.randn(10, 5)
indexs = [4, 9]
layer_1 = np.zeros(5)

for index in indexs:
    layer_1 += layer_0[index] * weight_0_1[index]

layer_1.sum()
-2.131818189044033

Going one step further: the non-zero inputs here are all 1, so even the multiplication by 1 can be skipped; we only need to sum the weight rows at those indices (see the short sketch after the list below).

Now let's build this idea into the network:

  • the hidden layer no longer multiplies by 0
  • the hidden layer no longer multiplies weights by 1
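Here is a minimal sketch of the trick, reusing the toy variables from the example above; for binary inputs it produces the same hidden layer as the full dot product.

# Sum only the selected weight rows; no multiplication by 0 or by 1 is needed.
layer_1 = np.zeros(5)
for index in indexs:
    layer_1 += weight_0_1[index]

# Equivalently, in one vectorized step:
layer_1 = weight_0_1[indexs].sum(axis=0)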
import time
import sys
import numpy as np

class SentimentNetwork:
    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):
        np.random.seed(1)
        self.pre_process_data(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)
        # the number of input nodes equals the vocabulary size

    def pre_process_data(self, reviews, labels):
        # collect every word that appears in the reviews
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)

        # collect the labels
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        self.label_vocab = list(label_vocab)

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # word-to-index dictionary
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i

        # label-to-index dictionary
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    # Initialize the network: input nodes = vocabulary size, 10 hidden nodes by default,
    # 1 output node, learning rate 0.1 by default.
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate

        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))

        self.layer_1 = np.zeros((1, hidden_nodes))

    # convert a label to its target value
    def get_target_for_label(self, label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        return output * (1 - output)

    # ======================= training =============================
    def train(self, training_reviews_raw, training_labels):

        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        # The loop above records, for each review, the vocabulary indices of the words it contains;
        # these index lists become the training input instead of full one-hot count vectors.

        assert(len(training_reviews) == len(training_labels))

        # counter of correct predictions, used to report the running accuracy
        correct_so_far = 0
        start = time.time()

        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]

            # input layer -> hidden layer: just sum the weight rows of the words present
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            # output layer error and gradient
            layer_2_error = layer_2 - self.get_target_for_label(label)  # difference between desired target and actual output
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # backpropagated error and gradient
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)  # errors propagated to the hidden layer
            layer_1_delta = layer_1_error  # the hidden layer has no nonlinearity

            # update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate  # hidden-to-output weights

            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate  # input-to-hidden weights

            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):

        correct = 0
        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]

        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], hidden_nodes=15, learning_rate=0.003)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1871. #Correct:1974 #Trained:2501 Training Accuracy:78.9%
Progress:20.8% Speed(reviews/sec):1753. #Correct:4037 #Trained:5001 Training Accuracy:80.7%
Progress:31.2% Speed(reviews/sec):1741. #Correct:6163 #Trained:7501 Training Accuracy:82.1%
Progress:41.6% Speed(reviews/sec):1696. #Correct:8320 #Trained:10001 Training Accuracy:83.1%
Progress:52.0% Speed(reviews/sec):1692. #Correct:10490 #Trained:12501 Training Accuracy:83.9%
Progress:62.5% Speed(reviews/sec):1695. #Correct:12636 #Trained:15001 Training Accuracy:84.2%
Progress:72.9% Speed(reviews/sec):1696. #Correct:14773 #Trained:17501 Training Accuracy:84.4%
Progress:83.3% Speed(reviews/sec):1675. #Correct:16959 #Trained:20001 Training Accuracy:84.7%
Progress:93.7% Speed(reviews/sec):1678. #Correct:19150 #Trained:22501 Training Accuracy:85.1%
Progress:99.9% Speed(reviews/sec):1674. #Correct:20467 #Trained:24000 Training Accuracy:85.2%
mlp.test(reviews[-1000:], labels[-1000:])
Progress:99.9% Speed(reviews/sec):2611. #Correct:857 #Tested:1000 Testing Accuracy:85.7%

9. Reducing the Noise Further

# Words most characteristic of positive reviews
pos_neg_ratios.most_common()
[('edie', 4.6913478822291435),
 ('antwone', 4.477336814478207),
 ('din', 4.406719247264253),
 ('gunga', 4.189654742026425),
 ('goldsworthy', 4.174387269895637),
 ('gypo', 4.0943445622221),
 ('yokai', 4.0943445622221),
 ('paulie', 4.07753744390572),
 ('visconti', 3.9318256327243257),
 ('flavia', 3.9318256327243257),
 ('blandings', 3.871201010907891),
 ('kells', 3.871201010907891),
 ('brashear', 3.8501476017100584),
 ...]
# Words most characteristic of negative reviews
list(reversed(pos_neg_ratios.most_common()))[0:30]
[('whelk', -4.605170185988092),
 ('pressurized', -4.605170185988092),
 ('bellwood', -4.605170185988092),
 ('mwuhahahaa', -4.605170185988092),
 ('insulation', -4.605170185988092),
 ('hoodies', -4.605170185988092),
 ('yaks', -4.605170185988092),
 ('deamon', -4.605170185988092),
 ('ziller', -4.605170185988092),
 ('lagomorph', -4.605170185988092),
 ('marinaro', -4.605170185988092),
 ('accelerant', -4.605170185988092),
 ('yez', -4.605170185988092),
 ('superhu', -4.605170185988092),
 ('fastidiously', -4.605170185988092),
 ('spotlessly', -4.605170185988092),
 ('dahlink', -4.605170185988092),
 ('rebanished', -4.605170185988092),
 ('unmated', -4.605170185988092),
 ('wushu', -4.605170185988092),
 ('nix', -4.605170185988092),
 ('echance', -4.605170185988092),
 ('vannet', -4.605170185988092),
 ('hodet', -4.605170185988092),
 ('francie', -4.605170185988092),
 ('vivisects', -4.605170185988092),
 ('degeneration', -4.605170185988092),
 ('lowlight', -4.605170185988092),
 ('slackly', -4.605170185988092),
 ('unrurly', -4.605170185988092)]
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()
hist, edges = np.histogram(list(map(lambda x: x[1],
                                    pos_neg_ratios.most_common())),
                           density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       line_color="#555555")
show(p)


frequency_frequency = Counter()

for word, cnt in total_counts.most_common():
    frequency_frequency[cnt] += 1
hist, edges = np.histogram(list(map(lambda x: x[1], frequency_frequency.most_common())), density=True, bins=100)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)


import time
import sys
import numpy as np
from collections import Counter

class SentimentNetwork:
    def __init__(self, reviews, labels, min_count=10, polarity_cutoff=0.1, hidden_nodes=10, learning_rate=0.1):
        np.random.seed(1)
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)
        # the number of input nodes equals the vocabulary size

    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        # --------------- additional noise reduction -------------------
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i].split(" "):
                    positive_counts[word] += 1
                    total_counts[word] += 1
            else:
                for word in reviews[i].split(" "):
                    negative_counts[word] += 1
                    total_counts[word] += 1

        pos_neg_ratios = Counter()

        for term, cnt in list(total_counts.most_common()):
            if(cnt >= 50):
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio

        for word, ratio in pos_neg_ratios.most_common():
            if(ratio > 1):
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))

        # populate review_vocab with the words in the given reviews, but only keep
        # words that occur more than min_count times and, for words with a pos/neg
        # ratio, only those whose absolute log ratio meets the polarity_cutoff
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                if(total_counts[word] > min_count):
                    if(word in pos_neg_ratios.keys()):
                        if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)
        # ----------------------------------------

        self.review_vocab = list(review_vocab)

        # collect the labels
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        self.label_vocab = list(label_vocab)

        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # word-to-index dictionary
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i

        # label-to-index dictionary
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    # Initialize the network: input nodes = vocabulary size, 10 hidden nodes by default,
    # 1 output node, learning rate 0.1 by default.
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate

        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))

        self.layer_1 = np.zeros((1, hidden_nodes))

    # convert a label to its target value
    def get_target_for_label(self, label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        return output * (1 - output)

    # ======================= training =============================
    def train(self, training_reviews_raw, training_labels):

        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        # As before, each review becomes the list of vocabulary indices of the words it contains.

        assert(len(training_reviews) == len(training_labels))

        # counter of correct predictions, used to report the running accuracy
        correct_so_far = 0
        start = time.time()

        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]

            # input layer -> hidden layer: sum the weight rows of the words present
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            # output layer error and gradient
            layer_2_error = layer_2 - self.get_target_for_label(label)  # difference between desired target and actual output
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # backpropagated error and gradient
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)  # errors propagated to the hidden layer
            layer_1_delta = layer_1_error  # the hidden layer has no nonlinearity

            # update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate  # hidden-to-output weights

            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate  # input-to-hidden weights

            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):

        correct = 0
        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]

        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"
mlp = SentimentNetwork(reviews[:-1000], labels[:-1000],
                       min_count=20, polarity_cutoff=0.6,
                       learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])
mlp.test(reviews[-1000:], labels[-1000:])
Progress:99.9% Speed(reviews/sec):5572. #Correct:845 #Tested:1000 Testing Accuracy:84.5%
def get_most_similar_words(focus="horrible"):
    most_similar = Counter()

    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]], mlp_full.weights_0_1[mlp_full.word2index[focus]])

    return most_similar.most_common()
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)
mlp_full.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1712. #Correct:1962 #Trained:2501 Training Accuracy:78.4%
Progress:20.8% Speed(reviews/sec):1646. #Correct:4002 #Trained:5001 Training Accuracy:80.0%
Progress:31.2% Speed(reviews/sec):1545. #Correct:6120 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):1571. #Correct:8271 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):1560. #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):1573. #Correct:12565 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):1567. #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):1531. #Correct:16833 #Trained:20001 Training Accuracy:84.1%
Progress:93.7% Speed(reviews/sec):1512. #Correct:19015 #Trained:22501 Training Accuracy:84.5%
Progress:99.9% Speed(reviews/sec):1504. #Correct:20335 #Trained:24000 Training Accuracy:84.7%
# get_most_similar_words("excellent")
# get_most_similar_words("terrible")
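A quick usage sketch (hypothetical; the exact words returned depend on the trained weights): print the ten words whose weight vectors line up most closely with a chosen word.

for word, similarity in get_most_similar_words("excellent")[:10]:
    print(word, similarity)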
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)

for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos += 1
            colors_list.append("#00ff00")
        else:
            neg += 1
            colors_list.append("#000000")
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                       text_font_size="8pt", text_color="#555555",
                       source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words