TensorFlow: Classifying Movie Reviews
Preface
This article is adapted from TensorFlow练习1: 对评论进行分类 (TensorFlow Exercise 1: Classifying Reviews). It fixes a few bugs in the original code, refactors it, adds comments, and finally shows how to use the trained model to make predictions; I hope it is of some help to beginners. The original author's blog contains many more TensorFlow exercises worth following. Thanks to the author for sharing.
Dataset
This example classifies movie reviews as positive or negative. The original article provides two data files for download; the code below expects them as pos.txt (positive reviews) and neg.txt (negative reviews), with one review per line.
Third-party packages
- numpy
- tensorflow (1.x; the code uses 1.x APIs such as tf.placeholder and tf.Session)
- pickle (actually part of the Python standard library, so no installation is needed)
- nltk
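NLTK's tokenizer and lemmatizer need their data files downloaded once before first use (punkt for word_tokenize, wordnet for WordNetLemmatizer):

import nltk
nltk.download('punkt')    # tokenizer model used by word_tokenize
nltk.download('wordnet')  # lexical database used by WordNetLemmatizer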
Creating the lexicon
Tokenize each review in the dataset into individual words, lemmatize them (e.g. reduce plurals to their base form: cats -> cat), drop words that appear too often or too rarely, and return the remaining words as a list:
# Build the vocabulary (lexicon)
def create_lexicon(p_file, n_file):
    result_lex = []

    # Read a file and tokenize every line into words
    def process_file(txtfile):
        with open(txtfile, 'r') as f:
            arr = []
            lines = f.readlines()
            for line in lines:
                words = word_tokenize(line.lower())
                arr += words
            return arr

    # Tokenize both files
    result_lex += process_file(p_file)
    result_lex += process_file(n_file)
    # Lemmatize (cats -> cat)
    lemmatizer = WordNetLemmatizer()
    result_lex = [lemmatizer.lemmatize(word) for word in result_lex]
    # Count word frequencies
    word_count = Counter(result_lex)
    # Drop words that are too common or too rare
    result_lex = []
    for word in word_count:
        num = word_count[word]
        if 2000 > num > 20:
            result_lex.append(word)
    return result_lex
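A quick sanity check (assuming pos.txt and neg.txt are in the working directory):

lex = create_lexicon('pos.txt', 'neg.txt')
print(len(lex))  # vocabulary size; this becomes the input width of the network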
Preparing the feature vectors
Convert each review into a vector. The idea: suppose the lexicon is ['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is'] (in practice it is much larger). The review 'i think this movie is great' is then converted to [0,1,0,0,0,0,0,1]: each lexicon word that appears in the review is marked 1 at its position, and all other positions stay 0:
# _lex: the lexicon; review: a review string
def word_to_vector(_lex, review):
    words = word_tokenize(review.lower())
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    features = np.zeros(len(_lex))
    for word in words:
        if word in _lex:
            features[_lex.index(word)] = 1
    return features
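A quick check against the example above, using the same mini-lexicon (the real lexicon comes from create_lexicon):

mini_lex = ['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is']
print(word_to_vector(mini_lex, 'i think this movie is great'))
# -> [0. 1. 0. 0. 0. 0. 0. 1.]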
Converting the dataset into feature vectors
Use the function above to convert every review in the dataset into a vector, and append a label marking the review as positive or negative:
# Convert every review into a vector and append its class label
def normalize_dataset(inner_lex):
    ds = []

    # [1, 0] marks a positive review, [0, 1] a negative one
    with open(pos_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = [word_to_vector(inner_lex, line), [1, 0]]
            ds.append(one_sample)
    with open(neg_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = [word_to_vector(inner_lex, line), [0, 1]]
            ds.append(one_sample)

    return ds
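Each sample is a [feature_vector, label] pair; a quick peek (assuming the lexicon _lex has been built):

ds = normalize_dataset(_lex)
print(ds[0][0].shape)  # (len(_lex),) -- the bag-of-words vector
print(ds[0][1])        # [1, 0] for a positive review, [0, 1] for a negative one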
Organizing the data
The steps above build the lexicon and turn the dataset into vectors. Because this is slow, we save the results to disk for later reuse; this function either builds and saves the data, or reloads it from the cache:
# Build the data on the first run, or reload it from the pickle cache
def clear_up(has_dataset):
    if not has_dataset:
        # The lexicon: words that appear in the text
        _lex = create_lexicon(pos_file, neg_file)
        # Convert the dataset into vectors
        ds = normalize_dataset(_lex)
        random.shuffle(ds)
        # Save the prepared data to files for later reuse
        with open('lex.pickle', 'wb') as f:
            pickle.dump(_lex, f)
        with open('dataset.pickle', 'wb') as f:
            pickle.dump(ds, f)
    else:
        with open('lex.pickle', 'rb') as f:
            _lex = pickle.load(f)
        with open('dataset.pickle', 'rb') as f:
            ds = pickle.load(f)
    return _lex, ds
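Typical usage: build and cache on the first run, then load the cached pickles afterwards:

_lex, ds = clear_up(False)  # first run: builds and writes lex.pickle / dataset.pickle
_lex, ds = clear_up(True)   # later runs: loads the cached files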
Defining the neural network
Define a feed-forward network. Each layer needs an input, a weight matrix, a bias, and an output, plus an activation function:
# Define the network to be trained
def neural_network(_lex, data):
    # Input layer: one unit per word in the lexicon
    n_input_layer = len(_lex)
    # Hidden layers: despite the mysterious name, simply the layers between input and output
    n_layer_1 = 2000
    n_layer_2 = 2000
    # Output layer: one unit per class
    n_output_layer = 2

    # Weights and biases of the first hidden layer
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])),
                   'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # Weights and biases of the second hidden layer
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])),
                   'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # Weights and biases of the output layer
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])),
                        'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w*x + b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)  # activation
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)  # activation
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])

    return layer_output
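A minimal shape check with a hypothetical 100-word lexicon (the real input width is len(_lex)):

dummy_lex = ['word%d' % i for i in range(100)]  # hypothetical stand-in lexicon
data = tf.placeholder('float', [None, len(dummy_lex)])
logits = neural_network(dummy_lex, data)
print(logits.shape)  # (?, 2) -- one raw score (logit) per class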
Training the network
Train the network on the prepared data, and save the model when training finishes:
# Train the network with the prepared data
def train_neural_network(has_dataset):
    _lex, _dataset = clear_up(has_dataset)
    _dataset = np.array(_dataset)
    x = tf.placeholder('float', [None, len(_dataset[0][0])])
    y = tf.placeholder('float')

    # Train on 50 samples at a time
    batch_size = 50

    predict = neural_network(_lex, x)
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)  # default learning rate: 0.001

    epochs = 10
    with tf.Session() as session:
        saver = tf.train.Saver()
        session.run(tf.global_variables_initializer())
        train_x = _dataset[:, 0]
        train_y = _dataset[:, 1]
        for epoch in range(epochs):
            i = 0
            epoch_loss = 0
            while i < len(train_x):
                start = i
                end = i + batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]
                _, c = session.run([optimizer, cost_func], feed_dict={x: list(batch_x), y: list(batch_y)})
                epoch_loss += c
                i += batch_size
            print(epoch, ' : ', epoch_loss)

        # Note: accuracy here is evaluated on the same data used for training
        test_x = _dataset[:, 0]
        test_y = _dataset[:, 1]
        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy: ', accuracy.eval({x: list(test_x), y: list(test_y)}))
        # Save the session
        saver.save(session, './model.ckpt')
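Note that the accuracy printed above is measured on the same data the network was trained on, so it overstates real-world performance. A minimal sketch of holding out the last 10% as a test set instead (hypothetical, not part of the code above; the dataset was already shuffled in clear_up):

test_size = int(len(_dataset) * 0.1)
train_x = _dataset[:-test_size, 0]  # train on the first 90%
train_y = _dataset[:-test_size, 1]
test_x = _dataset[-test_size:, 0]   # evaluate on the held-out 10%
test_y = _dataset[-test_size:, 1]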
Predicting with the model
Convert the review string into a vector, restore the saved model, and run a prediction: [0] means a positive review, [1] a negative one:
# Predict with the trained model
def prediction(text):
    # Start from a fresh graph so prediction() can be called repeatedly
    tf.reset_default_graph()
    with open('lex.pickle', 'rb') as f:
        _lex = pickle.load(f)
    x = tf.placeholder('float')
    predict = neural_network(_lex, x)
    with tf.Session() as session:
        saver = tf.train.Saver()
        # restore() loads all variables, so no separate initialization is needed
        saver.restore(session, './model.ckpt')
        features = word_to_vector(_lex, text)
        res = session.run(tf.argmax(predict.eval(feed_dict={x: [features]}), 1))
        return res
Results
Training output:
0 : 154066.731888
1 : 54634.1575928
2 : 28382.4827766
3 : 50698.829367
4 : 23948.3449777
5 : 9888.94352563
6 : 3610.13648503
7 : 1088.47069195
8 : 809.218485014
9 : 868.404643207
Accuracy: 0.975239
Prediction output:
print(prediction("very good"))  # [0]
print(prediction("very bad"))   # [1]
Complete code
import os
import numpy as np
import tensorflow as tf
import random
import pickle
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

pos_file = 'pos.txt'
neg_file = 'neg.txt'


# Build the vocabulary (lexicon)
def create_lexicon(p_file, n_file):
    result_lex = []

    # Read a file and tokenize every line into words
    def process_file(txtfile):
        with open(txtfile, 'r') as f:
            arr = []
            lines = f.readlines()
            for line in lines:
                words = word_tokenize(line.lower())
                arr += words
            return arr

    # Tokenize both files
    result_lex += process_file(p_file)
    result_lex += process_file(n_file)
    # Lemmatize (cats -> cat)
    lemmatizer = WordNetLemmatizer()
    result_lex = [lemmatizer.lemmatize(word) for word in result_lex]
    # Count word frequencies
    word_count = Counter(result_lex)
    # Drop words that are too common or too rare
    result_lex = []
    for word in word_count:
        num = word_count[word]
        if 2000 > num > 20:
            result_lex.append(word)
    return result_lex


# Convert a review into a vector. The idea:
# suppose the lexicon is ['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is']
# (in practice it is much larger); the review 'i think this movie is great' becomes
# [0,1,0,0,0,0,0,1]: each lexicon word appearing in the review is marked 1, the rest 0.
# _lex: the lexicon; review: a review string
def word_to_vector(_lex, review):
    words = word_tokenize(review.lower())
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    features = np.zeros(len(_lex))
    for word in words:
        if word in _lex:
            features[_lex.index(word)] = 1
    return features


# Convert every review into a vector and append its class label
def normalize_dataset(inner_lex):
    ds = []

    # [1, 0] marks a positive review, [0, 1] a negative one
    with open(pos_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = [word_to_vector(inner_lex, line), [1, 0]]
            ds.append(one_sample)
    with open(neg_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            one_sample = [word_to_vector(inner_lex, line), [0, 1]]
            ds.append(one_sample)

    return ds


# Build the data on the first run, or reload it from the pickle cache
def clear_up(has_dataset):
    if not has_dataset:
        # The lexicon: words that appear in the text
        _lex = create_lexicon(pos_file, neg_file)
        # Convert the dataset into vectors
        ds = normalize_dataset(_lex)
        random.shuffle(ds)
        # Save the prepared data to files for later reuse
        with open('lex.pickle', 'wb') as f:
            pickle.dump(_lex, f)
        with open('dataset.pickle', 'wb') as f:
            pickle.dump(ds, f)
    else:
        with open('lex.pickle', 'rb') as f:
            _lex = pickle.load(f)
        with open('dataset.pickle', 'rb') as f:
            ds = pickle.load(f)
    return _lex, ds


# Define the network to be trained
def neural_network(_lex, data):
    # Input layer: one unit per word in the lexicon
    n_input_layer = len(_lex)
    # Hidden layers: despite the mysterious name, simply the layers between input and output
    n_layer_1 = 2000
    n_layer_2 = 2000
    # Output layer: one unit per class
    n_output_layer = 2

    # Weights and biases of the first hidden layer
    layer_1_w_b = {'w_': tf.Variable(tf.random_normal([n_input_layer, n_layer_1])),
                   'b_': tf.Variable(tf.random_normal([n_layer_1]))}
    # Weights and biases of the second hidden layer
    layer_2_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_1, n_layer_2])),
                   'b_': tf.Variable(tf.random_normal([n_layer_2]))}
    # Weights and biases of the output layer
    layer_output_w_b = {'w_': tf.Variable(tf.random_normal([n_layer_2, n_output_layer])),
                        'b_': tf.Variable(tf.random_normal([n_output_layer]))}

    # w*x + b
    layer_1 = tf.add(tf.matmul(data, layer_1_w_b['w_']), layer_1_w_b['b_'])
    layer_1 = tf.nn.relu(layer_1)  # activation
    layer_2 = tf.add(tf.matmul(layer_1, layer_2_w_b['w_']), layer_2_w_b['b_'])
    layer_2 = tf.nn.relu(layer_2)  # activation
    layer_output = tf.add(tf.matmul(layer_2, layer_output_w_b['w_']), layer_output_w_b['b_'])

    return layer_output


# Train the network with the prepared data
def train_neural_network(has_dataset):
    _lex, _dataset = clear_up(has_dataset)
    _dataset = np.array(_dataset)
    x = tf.placeholder('float', [None, len(_dataset[0][0])])
    y = tf.placeholder('float')

    # Train on 50 samples at a time
    batch_size = 50

    predict = neural_network(_lex, x)
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predict, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)  # default learning rate: 0.001

    epochs = 10
    with tf.Session() as session:
        saver = tf.train.Saver()
        session.run(tf.global_variables_initializer())
        train_x = _dataset[:, 0]
        train_y = _dataset[:, 1]
        for epoch in range(epochs):
            i = 0
            epoch_loss = 0
            while i < len(train_x):
                start = i
                end = i + batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]
                _, c = session.run([optimizer, cost_func], feed_dict={x: list(batch_x), y: list(batch_y)})
                epoch_loss += c
                i += batch_size
            print(epoch, ' : ', epoch_loss)

        # Note: accuracy here is evaluated on the same data used for training
        test_x = _dataset[:, 0]
        test_y = _dataset[:, 1]
        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy: ', accuracy.eval({x: list(test_x), y: list(test_y)}))
        # Save the session
        saver.save(session, './model.ckpt')


# Predict with the trained model
def prediction(text):
    # Start from a fresh graph so prediction() can be called repeatedly
    tf.reset_default_graph()
    with open('lex.pickle', 'rb') as f:
        _lex = pickle.load(f)
    x = tf.placeholder('float')
    predict = neural_network(_lex, x)
    with tf.Session() as session:
        saver = tf.train.Saver()
        # restore() loads all variables, so no separate initialization is needed
        saver.restore(session, './model.ckpt')
        features = word_to_vector(_lex, text)
        res = session.run(tf.argmax(predict.eval(feed_dict={x: [features]}), 1))
        return res


if __name__ == "__main__":
    # Train the model
    train_neural_network(False)
    # Predict
    # print(prediction("very good"))
