该章的源代码已经调通,如下, 先记录下来,再慢慢理解


#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pandas as pd
import numpy as np
import pickle

import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation, Dropout, Embedding, Reshape, Dot, Concatenate, Multiply
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
plt.rcParams['figure.figsize']=(20, 10)

# 读入数据

# In[2]:文件下载地址:http://dataset.cs.mcgill.ca/ubuntu-corpus-1.0/ubuntu_blobs.tgz

with open("dataset.pkl", "rb") as f:
    data = pickle.load(f)

# In[3]:

print("size ======= %s" % len(data))

# In[4]:

import gc
gc.collect()

# 看看数据里都是什么

# In[5]:

for j in range(len(data)):
    print("======= %s" % j)
    for i, k in enumerate(data[j]):
        print(k)

# 这里分析最长的句子的长度

# In[6]:

# 这里分析最长的句子的长度
length=map(len, data[0]['c'])
res=list(length)
context_length=np.max(res[:])
print(context_length)

length=map(len, data[0]['r'])
res=list(length)
response_length=np.max(res[:])
print(response_length)

# 这里分析整个词典的大小

# In[7]:

context_size = np.max(list(map(lambda x:  max(x) if len(x)>0 else 0, data[0]['c'])))
print(context_size)
response_size = max(list(map(lambda x:  max(x) if len(x)>0 else 0, data[0]['r'])))
print(response_size)

# In[8]:

max(data[0]['r'][1])

# In[9]:

embedding_dim=64
lstm_dim=64

context_length=np.max(list(map(len, data[0]['c'])))
#print(context_length)
response_length=np.max(list( map(len, data[0]['r'])))
#print(response_length)

Y = data[0]['r']

print('Begin Modeling...')

context_size = np.max(list(map(lambda x:  max(x) if len(x)>0 else 0, data[0]['c'])))
response_size = max(list(map(lambda x:  max(x) if len(x)>0 else 0, data[0]['r'])))
volcabulary_size=max(context_size, response_size)

context_length=120

# 对上下文部分进行嵌入和建模
context=Input(shape=((context_length,)),  name='context_input')
context_embedded=Embedding(input_length=context_length, output_dim=embedding_dim, input_dim=volcabulary_size)(context)
context_lstm=LSTM(lstm_dim)(context_embedded)

# 对回应部分进行嵌入和建模
response_length=120
response=Input(shape=((response_length,)), name='response_input')
response_embedded=Embedding(input_length=response_length, output_dim=embedding_dim, input_dim=volcabulary_size)(response)
response_lstm=LSTM(lstm_dim)(response_embedded)

#print(response_lstm.outputs)

x = Dot([1, 1])([context_lstm, response_lstm])
#x = Multiply()([context_lstm, response_lstm])
yhat = Dense(2, activation='softmax')(x)

model = Model(inputs=[context, response], outputs=yhat)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
print('Finish compiling...')
model.summary()

# In[10]:

# 针对该数据定制的generator。一般应该将三个部分分离以后再编制generator

def data_gen(data, batch_size=100):    
    contextRaw = data['c']
    responseRaw = data['r']
    yRaw = data['y']

    number_of_batches = len(contextRaw) // batch_size
    counter=0

    context_length=np.max(list(map(len, contextRaw)))//3    
    response_length=np.max(list( map(len, responseRaw)))//3

    context_length=120
    response_length=120

    while 1:        
        lowerBound = batch_size*counter
        upperBound = batch_size*(counter+1)
        Ctemp = contextRaw[lowerBound : upperBound]
        C_batch = pad_sequences(Ctemp, maxlen=context_length, padding='post')
        C_res = np.zeros((batch_size, context_length), dtype=np.int)

        Rtemp = responseRaw[lowerBound : upperBound]
        R_batch = pad_sequences(Rtemp, maxlen=response_length, padding='post')
        R_res = np.zeros((batch_size, response_length), dtype=np.int)
        for k in np.arange(batch_size):
            C_res[k, :] = C_batch[k, :]
            R_res[k, :] = R_batch[k, :]
        y_res= keras.utils.to_categorical(yRaw[lowerBound : upperBound])
        counter += 1
        yield([C_res.astype('float32'), R_res.astype('float32')], y_res.astype('float32'))
        if (counter < number_of_batches):            
            counter=0            

# 下面训练这个模型。在6GB显存的GTX 1060上,小批量的大小不能超过200。读者有时间可以试试多次迭代,看看效果。

# In[11]:

#Y = keras.utils.to_categorical(data[0]['y'], num_classes=2)
batch_size=168
model.fit_generator(data_gen(data[0], batch_size=batch_size), 
                    steps_per_epoch=len(data[0]['c'])//batch_size,
                    validation_data = data_gen(data[1]),
                    validation_steps = len(data[1]['c'])//batch_size,
                    epochs=1)

# 下面我们将模型存入磁盘。我们也可以在拟合过程中使用checkponit选项将每一步的结果都分别存入一个磁盘文件中。

# In[12]:

# 将模型结构存为JSON格式
model_json = model.to_json()
with open("dual_lstm_model.json", "w") as json_file:
    json_file.write(model_json)
# 将模型拟合得到的权重存入HDF5文件中
model.save_weights("dual_lstm_model.h5")
print("模型已经写入磁盘")

# In[13]:

# 如果要调用已有模型,可以通过如下方法

# 从磁盘载入模型结构
json_file = open('dual_lstm_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# 从磁盘读入模型权重
model.load_weights("dual_lstm_model.h5")
print("载入模型完毕")
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
print('模型编译完毕...')

# 下面进行预测。输入数据的组织形式应该遵循data generator里面的数据处理和输出组织形式,但是我们可以通过predict_generator方法直接引用现有的data generator,只是用在测试集,而不是训练集上。

# In[14]:

batch_size=256
ypred = model.predict_generator( data_gen(data[2], batch_size=batch_size), steps=(len(data[2]['c'])//batch_size), verbose=1)

# In[15]:

yTest = data[1]['y']

ypred2=(2-(ypred[:,0]>ypred[:,1]))-1
z = [str(ypred2[i])==yTest[i] for i in range(len(ypred2))]
np.mean(z)