回调函数modelcheckpoint导致keras错误

Question

回调函数modelcheckpoint导致keras错误

当我使用回调函数modelcheckpoint时,我似乎得到了这个错误.

我从github问题中读到了解决方案可以使用的问题model.get_weight,但我隐含地只存储它,因为我只存储具有最佳权重的解决方案.

Keras似乎只使用h5来保存权重,这让我觉得有没有其他方法可以使用eras API来存储它们,如果是这样的话怎么样？如果没有,我该如何存储？

举例来重现这个问题:

#!/usr/bin/python


import glob, os
import sys
from os import listdir
from os.path import isfile, join
import numpy as np
import warnings
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from keras.utils import np_utils
from keras import metrics
import keras
from keras import backend as K
from keras.models import Sequential
from keras.optimizers import SGD, Adam
from keras.layers.core import Dense, Activation, Lambda, Reshape,Flatten
from keras.layers import Conv1D,Conv2D,MaxPooling2D, MaxPooling1D, Reshape
#from keras.utils.visualize_util import plot
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers.merge import Concatenate, Add
import h5py
import random
import tensorflow as tf
import math
from keras.callbacks import CSVLogger
from keras.callbacks import ModelCheckpoint


if len(sys.argv) < 5:
    print "Missing Arguments!"
    print "python keras_convolutional_feature_extraction.py <workspace> <totale_frames> <fbank-dim> <window-height> <batch_size>"
    print "Example:"
    print "python keras_convolutional_feature_extraction.py deltas 15 40 5 100"
    sys.exit()


total_frames = int(sys.argv[2])
total_frames_with_deltas = total_frames*3
dim = int(sys.argv[3])
window_height = int(sys.argv[4])
inserted_batch_size = int(sys.argv[5])
stride = 1
splits = ((dim - window_height)+1)/stride

#input_train_data = "/media/carl/E2302E68302E443F/"+str(sys.argv[1])+"/fbank/org_train_total_frames_"+str(total_frames)+"_dim_"+str(dim)+"_winheig_"+str(window_height)+"_batch_"+str(inserted_batch_size)+"_fws_input"
#output_train_data ="/media/carl/E2302E68302E443F/"+str(sys.argv[1])+"/fbank/org_train_total_frames_"+str(total_frames)+"_dim_"+str(dim)+"_winheig_"+str(window_height)+"_batch_"+str(inserted_batch_size)+"_fws_output"
#input_test_data = "/media/carl/E2302E68302E443F/"+str(sys.argv[1])+"/fbank/org_test_total_frames_"+str(total_frames)+"_dim_"+str(dim)+"_winheig_"+str(window_height)+"_batch_"+str(1)+"_fws_input"
#output_test_data = "/media/carl/E2302E68302E443F/"+str(sys.argv[1])+"/fbank/org_test_total_frames_"+str(total_frames)+"_dim_"+str(dim)+"_winheig_"+str(window_height)+"_batch_"+str(1)+"_fws_output"

#train_files =[f for f in listdir(input_train_data) if isfile(join(input_train_data, f))]
#test_files =[f for f in listdir(input_test_data) if isfile(join(input_test_data, f))]

#print len(train_files)
np.random.seed(100)
print "hallo"
def train_generator():
    while True:
#        input = random.choice(train_files)
#        h5f = h5py.File(input_train_data+'/'+input, 'r')
#        train_input = h5f['train_input'][:]
#        train_output = h5f['train_output'][:]
#        h5f.close()
        train_input = np.random.randint(100,size=((inserted_batch_size,splits*total_frames_with_deltas,window_height,3)))
        train_list_list = []
        train_input = train_input.reshape((inserted_batch_size,splits*total_frames_with_deltas,window_height,3))
        train_input_list = np.split(train_input,splits*total_frames_with_deltas,axis=1)
        for i in range(len(train_input_list)):
            train_input_list[i] = train_input_list[i].reshape(inserted_batch_size,window_height,3)


        #for i in range(len(train_input_list)):
        #    train_input_list[i] = train_input_list[i].reshape(inserted_batch_size,33,window_height,1,3)

        train_output = np.random.randint(5, size = (1,total_frames,5))
        middle = int(math.ceil(total_frames/2))

        train_output = train_output[:,middle:middle+1,:].reshape((inserted_batch_size,1,5))
        #print train_output.shape
        #print len(train_input_list)
        #print train_input_list[0].shape
        yield (train_input_list, train_output)
print "hallo"
def test_generator():
    while True:
#        input = random.choice(test_files)
#        h5f = h5py.File(input_test_data+'/'+input, 'r')
#        test_input = h5f['test_input'][:]
#        test_output = h5f['test_output'][:]
#        h5f.close()
        test_input = np.random.randint(100,size=((inserted_batch_size,splits*total_frames_with_deltas,window_height,3)))
        test_input = test_input.reshape((inserted_batch_size,splits*total_frames_with_deltas,window_height,3))
        test_input_list = np.split(test_input,splits*total_frames_with_deltas,axis=1)
        #test_input_list = np.split(test_input,45,axis=3)

        for i in range(len(test_input_list)):
            test_input_list[i] = test_input_list[i].reshape(inserted_batch_size,window_height,3)

        #for i in range(len(test_input_list)):
        #    test_input_list[i] = test_input_list[i].reshape(inserted_batch_size,33,window_height,1,3)

        test_output = np.random.randint(5, size = (1,total_frames,5))

        middle = int(math.ceil(total_frames/2))

        test_output = test_output[:,middle:middle+1,:].reshape((inserted_batch_size,1,5))

        yield (test_input_list, test_output)
print "hallo"

def fws():
    #print "Inside"
    #   Params:
    #   batch ,  lr, decay , momentum, epochs
    #
    #Input shape: (batch_size,40,45,3)
    #output shape: (1,15,50)
    # number of unit in conv_feature_map = splitd
    next(train_generator())
    model_output = []
    list_of_input = [Input(shape=(8,3)) for i in range(splits*total_frames_with_deltas)]
    output = []

    #Conv
    skip = total_frames_with_deltas
    for steps in range(total_frames_with_deltas):
        conv = Conv1D(filters = 100, kernel_size = 8)
        column = 0
        for  _ in range(splits):
            #print "column " + str(column) + "steps: " + str(steps)
            output.append(conv(list_of_input[(column*skip)+steps]))
            column = column + 1

    #print len(output)
    #print splits*total_frames_with_deltas


    conv = []
    for section in range(splits):
        column = 0
        skip = splits
        temp = []
        for _ in range(total_frames_with_deltas):
            temp.append(output[((column*skip)+section)])
            column = column + 1
        conv.append(Add()(temp))
        #print len(conv)



    output_conc = Concatenate()(conv)
    #print output_conc.get_shape
    output_conv = Reshape((splits, -1))(output_conc)
    #print output_conv.get_shape

    #Pool
    pooled = MaxPooling1D(pool_size = 6, strides = 2)(output_conv)
    reshape = Reshape((1,-1))(pooled)

    #Fc
    dense1 = Dense(units = 1024, activation = 'relu',    name = "dense_1")(reshape)
    #dense2 = Dense(units = 1024, activation = 'relu',    name = "dense_2")(dense1)
    dense3 = Dense(units = 1024, activation = 'relu',    name = "dense_3")(dense1)
    final = Dense(units = 5, activation = 'relu',    name = "final")(dense3)

    model = Model(inputs = list_of_input , outputs = final)
    sgd = SGD(lr=0.1, decay=1e-1, momentum=0.9, nesterov=True)
    model.compile(loss="categorical_crossentropy", optimizer=sgd , metrics = ['accuracy'])
    print "compiled"

    model_yaml = model.to_yaml()
    with open("model.yaml", "w") as yaml_file:
        yaml_file.write(model_yaml)

    print "Model saved!"

    log= CSVLogger('/home/carl/kaldi-trunk/dnn/experimental/yesno_cnn_50_training_total_frames_'+str(total_frames)+"_dim_"+str(dim)+"_window_height_"+str(window_height)+".csv")
    filepath='yesno_cnn_50_training_total_frames_'+str(total_frames)+"_dim_"+str(dim)+"_window_height_"+str(window_height)+"weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_weights_only=True, mode='max')


    print "log"
    #plot_model(model, to_file='model.png')
    print "Fit"
    hist_current = model.fit_generator(train_generator(),
                        steps_per_epoch=444,#len(train_files),
                        epochs = 10000,
                        verbose = 1,
                        validation_data = test_generator(),
                        validation_steps=44,#len(test_files),
                        pickle_safe = True,
                        workers = 4,
                        callbacks = [log,checkpoint])

fws()

Run Code Online (Sandbox Code Playgroud)

执行脚本:python name_of_script.py yens 50 40 8 1

这给了我一个完整的追溯:

完全追溯错误:

carl@ca-ThinkPad-T420s:~/Dropbox$ python mini.py yesno 50 40 8 1
Using TensorFlow backend.
Couldn't import dot_parser, loading of dot files will not be possible.
hallo
hallo
hallo
compiled
Model saved!
log
Fit
/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:2252: UserWarning: Expected no kwargs, you passed 1
kwargs passed to function are ignored with Tensorflow backend
  warnings.warn('\n'.join(msg))
Epoch 1/10000
2017-05-26 13:01:45.851125: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-26 13:01:45.851345: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-05-26 13:01:45.851392: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
443/444 [============================>.] - ETA: 4s - loss: 100.1266 - acc: 0.3138Epoch 00000: saving model to yesno_cnn_50_training_total_frames_50_dim_40_window_height_8weights-improvement-00-0.48.hdf5
Traceback (most recent call last):
  File "mini.py", line 205, in <module>

  File "mini.py", line 203, in fws

  File "/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1933, in fit_generator
    callbacks.on_epoch_end(epoch, epoch_logs)
  File "/usr/local/lib/python2.7/dist-packages/keras/callbacks.py", line 77, in on_epoch_end
    callback.on_epoch_end(epoch, logs)
  File "/usr/local/lib/python2.7/dist-packages/keras/callbacks.py", line 411, in on_epoch_end
    self.model.save_weights(filepath, overwrite=True)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2503, in save_weights
    save_weights_to_hdf5_group(f, self.layers)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2746, in save_weights_to_hdf5_group
    f.attrs['layer_names'] = [layer.name.encode('utf8') for layer in layers]
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2684)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2642)
  File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/attrs.py", line 93, in __setitem__
    self.create(name, data=value, dtype=base.guess_dtype(value))
  File "/usr/local/lib/python2.7/dist-packages/h5py/_hl/attrs.py", line 183, in create
    attr = h5a.create(self._id, self._e(tempname), htype, space)
  File "h5py/_objects.pyx", line 54, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2684)
  File "h5py/_objects.pyx", line 55, in h5py._objects.with_phil.wrapper (/tmp/pip-4rPeHA-build/h5py/_objects.c:2642)
  File "h5py/h5a.pyx", line 47, in h5py.h5a.create (/tmp/pip-4rPeHA-build/h5py/h5a.c:1904)
RuntimeError: Unable to create attribute (Object header message is too large)

Run Code Online (Sandbox Code Playgroud)

Answer 1

Tom*_*nys 3

如果您查看 Keras 尝试在layer_names属性下保存的数据量（在正在创建的输出 HDF5 文件内），您会发现它需要超过 64kB。

np.asarray([layer.name.encode('utf8') for layer in model.layers]).nbytes
>> 77100

Run Code Online (Sandbox Code Playgroud)

我引用https://support.hdfgroup.org/HDF5/faq/limits.html：

是否存在对象标头限制？这对 HDF5 有什么影响？

对象标头有一个限制（在 HDF5-1.8 中），即 64 KB。数据集的数据类型存储在对象标头中，因此您可以拥有的数据类型的大小受到限制。（参见 HDFFV-1089）

上面的代码（几乎完全）是从回溯中复制的：

File "/usr/local/lib/python2.7/dist-packages/keras/engine/topology.py", line 2746, in save_weights_to_hdf5_group
f.attrs['layer_names'] = [layer.name.encode('utf8') for layer in layers]

Run Code Online (Sandbox Code Playgroud)

我正在使用 numpyasarray方法快速获取数字，但h5py得到类似的数字（我猜），如果您想查找，请参阅https://github.com/h5py/h5py/blob/master/h5py/_hl/attrs.py#L102确切的数字。

无论如何，要么您需要实现自己的方法来保存/加载权重（或使用现有的解决方法），要么您需要为模型中的所有层提供一个非常短的名称:)，如下所示：

list_of_input = [Input(shape=(8,3), name=('i%x' % i)) for i in range(splits*total_frames_with_deltas)]
conv = Conv1D(filters = 100, kernel_size = 8, name='cv%x' % steps) 
conv.append(Add(name='add%x' % section)(temp))
output_conc = Concatenate(name='ct')(conv)
output_conv = Reshape((splits, -1), name='rs1')(output_conc)
pooled = MaxPooling1D(pool_size = 6, strides = 2, name='pl')(output_conv)
reshape = Reshape((1,-1), name='rs2')(pooled) 
dense1 = Dense(units = 1024, activation = 'relu', name = "d1")(reshape) 
dense2 = Dense(units
= 1024, activation = 'relu', name = "d2")(dense1) 
dense3 = Dense(units = 1024, activation = 'relu', name = "d3")(dense1) 
final = Dense(units = 5, activation = 'relu', name = "fl")(dense3)

Run Code Online (Sandbox Code Playgroud)

您一定不要忘记命名所有图层，因为图层名称转换为的（numpy）字符串数组在保存时使用其中每个单独字符串的最长字符串的大小！

按照上面的建议重命名图层后（大约需要 26kB），模型已成功保存。希望这个详尽的答案对某人有所帮助。

更新：我刚刚向 Keras 提交了一个 PR，应该可以在不实现任何自定义加载/保存方法的情况下解决问题，请参阅7508

归档时间：	8 年，9 月前
查看次数：	3167 次
最近记录：	7 年前