Tags: python, keras, tensorflow, tensorflow2.0
I am using a pretrained model to train an image classifier. The code below runs fine on CPU and on a single GPU (i.e. when #GPUs = 1):
class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, train_tf_data, val_tf_data, CLASSES, logs={}, **kwargs):
        super().__init__(**kwargs)
        # self.keras_metric = tf.keras.metrics.Mean("val_f1_after_epoch")
        self.train_tf_data = train_tf_data
        self.val_tf_data = val_tf_data
        # self.model = model
        self.CLASSES = CLASSES

    def on_epoch_end(self, epoch, logs={}):
        # self.keras_metric.reset_state()

        # for train data
        self.train_reports = test_model(model=self.model, data=self.train_tf_data, CLASSES=self.CLASSES)
        self.train_f1_after_epoch = self.train_reports['f1_score']
        self.train_recall_after_epoch = self.train_reports['recall']
        self.train_prec_after_epoch = self.train_reports['precision']

        # for val data
        self.val_reports = test_model(model=self.model, data=self.val_tf_data, CLASSES=self.CLASSES)
        self.val_f1_after_epoch = self.val_reports['f1_score']
        self.val_recall_after_epoch = self.val_reports['recall']
        self.val_prec_after_epoch = self.val_reports['precision']

        # saving train results to log dir
        logs["f1_after_epoch"] = self.train_f1_after_epoch
        logs['precision_after_epoch'] = self.train_prec_after_epoch
        logs['recall_after_epoch'] = self.train_recall_after_epoch

        # saving val results to log dir
        logs['val_f1_after_epoch'] = self.val_f1_after_epoch
        logs['val_precision_after_epoch'] = self.val_prec_after_epoch
        logs['val_recall_after_epoch'] = self.val_recall_after_epoch

        # self.keras_metric.update_state(self.val_f1_after_epoch)

        print('reports_after_epoch', self.train_reports)
        print('val_reports_after_epoch', self.val_reports)
with strategy.scope():
    pretrained_model = tf.keras.applications.MobileNetV2(
        weights='imagenet',
        include_top=False,
        input_shape=[*IMAGE_SIZE, IMG_CHANNELS])
    pretrained_model.trainable = True  # fine tuning

    q_aware_pretrained_model = tf.keras.models.clone_model(
        pretrained_model,
        clone_function=apply_quantization_to_dense)

    base_model = tf.keras.Sequential([
        # Convert image from int [0, 255] to the format expected by this base model
        tf.keras.layers.Lambda(
            lambda data: tf.keras.applications.mobilenet.preprocess_input(
                tf.cast(data, tf.float32)),
            input_shape=[*IMAGE_SIZE, 3]),
        q_aware_pretrained_model,
        tf.keras.layers.GlobalAveragePooling2D()])

    base_model.layers[1]._name = 'custom_mnet_trainable'

    base_model.add(tf.keras.layers.Dense(64, name='object_dense', kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
    base_model.add(tf.keras.layers.BatchNormalization(scale=False, center=False))
    base_model.add(tf.keras.layers.Activation('relu', name='relu_dense_64'))
    base_model.add(tf.keras.layers.Dropout(rate=0.5, name='dropout_dense_64'))
    base_model.add(tf.keras.layers.Dense(32, name='object_dense_2', kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
    base_model.add(tf.keras.layers.BatchNormalization(scale=False, center=False))
    base_model.add(tf.keras.layers.Activation('relu', name='relu_dense_32'))
    base_model.add(tf.keras.layers.Dropout(rate=0.4, name='dropout_dense_32'))
    base_model.add(tf.keras.layers.Dense(16, name='object_dense_16', kernel_regularizer=tf.keras.regularizers.l2(l2=0.1)))
    base_model.add(tf.keras.layers.Dense(len(CLASS_NAMES), activation='softmax', name='object_prob'))

    m1 = tf.keras.metrics.CategoricalAccuracy()
    m2 = tf.keras.metrics.Recall()
    m3 = tf.keras.metrics.Precision()
    m4 = Metrics(train_tf_data=train_data, val_tf_data=test_data, CLASSES=CLASS_NAMES)

    optimizers = [
        tfa.optimizers.AdamW(learning_rate=lr * .001, weight_decay=wd),
        tfa.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
    ]
    optimizers_and_layers = [(optimizers[0], base_model.layers[0]),
                             (optimizers[1], base_model.layers[1:])]
    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)

    annotated_model = tf.keras.models.clone_model(
        base_model,
        clone_function=apply_quantization_to_dense,
    )
    model = tfmot.quantization.keras.quantize_apply(annotated_model)

    model.compile(
        optimizer=optimizer,
        loss=tfa.losses.SigmoidFocalCrossEntropy(reduction=tf.keras.losses.Reduction.AUTO),
        metrics=[m1, m2, m3],
    )

    tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

    checkpoint_name = os.getcwd() + os.sep + CUSTOM_MODEL_PATH + os.sep + "training_chkpts/cp-{epoch:04d}-{val_f1_after_epoch:.2f}.ckpt"
    checkpoint_dir_path = os.getcwd() + os.sep + CUSTOM_MODEL_PATH + os.sep + "training_chkpts"
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_name,
                                                       monitor='val_f1_after_epoch',
                                                       save_best_only=True,
                                                       save_weights_only=True,
                                                       mode='max',
                                                       save_freq='epoch',
                                                       verbose=1)
    checkpoint_cb._supports_tf_logs = False

    current_dir = os.getcwd()

    history = model.fit(train_data, validation_data=test_data,
                        epochs=N_EPOCHS,
                        callbacks=[m4, checkpoint_cb, tensorboard_cb])
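Note: strategy, apply_quantization_to_dense, and test_model are defined elsewhere in the script and are not shown above. A minimal sketch of how the first two are presumably set up (assuming tf.distribute.MirroredStrategy for multi-GPU and the standard tfmot annotate pattern; the exact definitions are not in the snippet):

import tensorflow as tf
import tensorflow_model_optimization as tfmot

# Assumed: multi-GPU runs would typically use MirroredStrategy, which
# places a replica of the model on every visible GPU.
strategy = tf.distribute.MirroredStrategy()

# Assumed: annotate only Dense layers for quantization-aware training,
# following the standard tfmot clone_function pattern.
def apply_quantization_to_dense(layer):
    if isinstance(layer, tf.keras.layers.Dense):
        return tfmot.quantization.keras.quantize_annotate_layer(layer)
    return layer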
But on a system with more than one GPU (#GPUs > 1), it throws the following error:
Epoch 1/2
6/Unknown - 44s 150ms/step - loss: 19.2255 - categorical_accuracy: 0.0625 - recall: 0.0000e+00 - precision: 0.0000e+00
/bwz_venv/lib/python3.8/site-packages/keras/engine/functional.py:1410: CustomMaskWarning: Custom mask layers require a config and must override get_config. When loading, the custom mask layer must be passed to the custom_objects argument.
  layer_config = serialize_layer_fn(layer)
288/Unknown - 84s 141ms/step - loss: 13.7873 - categorical_accuracy: 0.1788 - recall: 0.0080 - precision: 0.7708
2021-12-30 15:08:31.404434: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at transpose_op.cc:142 : INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/ssd/custom_mnet_v2.py", line 536, in <module>
    history = model.fit(train_data, validation_data=test_data,
  File "bwz_venv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/bwz_venv/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 58, in quick_execute
    tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: 3 root error(s) found.
  (0) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4
      [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]]
      [[div_no_nan_3/ReadVariableOp/_558]]
  (1) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4
      [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]]
      [[assert_less_equal/Assert/AssertGuard/else/_4049/assert_less_equal/Assert/AssertGuard/Assert/data_4/_546]]
  (2) INVALID_ARGUMENT: transpose expects a vector of size 0. But input(1) is a vector of size 4
      [[{{node gradient_tape/replica_1/sequential/custom_mnet_trainable/Conv1/Conv2D/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer}}]]
0 successful operations. 0 derived errors ignored. [Op:__inference_train_function_1079980]

Function call stack:
train_function -> train_function -> train_function
I have already tested a few things. Here is a link to a Google Colab notebook that reproduces the error (please set #GPUs > 1).
I'm about 70% sure this is caused by your Lambda layer. When you run on multiple GPUs, I believe the model has to be serializable so that a copy can be placed on each GPU, and I don't think Lambda layers can be serialized reliably.
See this note at https://keras.io/api/layers/core_layers/lambda/:

WARNING: tf.keras.layers.Lambda layers have (de)serialization limitations!
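To see that limitation in isolation, a minimal sketch (assuming TF 2.x Keras) that round-trips the config of a toy model using the same style of Lambda preprocessing layer:

import tensorflow as tf

# Toy stand-in with the same kind of Lambda preprocessing layer as the
# question, just to exercise get_config()/from_config() on a Lambda.
toy = tf.keras.Sequential([
    tf.keras.layers.Lambda(
        lambda data: tf.keras.applications.mobilenet.preprocess_input(
            tf.cast(data, tf.float32)),
        input_shape=(224, 224, 3)),
    tf.keras.layers.GlobalAveragePooling2D(),
])

# Round-trip the config: the lambda is only carried along as marshalled
# Python bytecode, which is the (de)serialization limitation the Keras
# docs warn about.
rebuilt = tf.keras.Sequential.from_config(toy.get_config())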
Try rewriting the Lambda layer as a proper custom layer that supports serialization, i.e. one that implements get_config(). So instead of
lambda data: tf.keras.applications.mobilenet.preprocess_input(
    tf.cast(data, tf.float32)),
write a proper custom layer, something like:
class Prep(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def get_config(self):
        return super().get_config()

    def call(self, inputs):
        return tf.keras.applications.mobilenet.preprocess_input(
            tf.cast(inputs, tf.float32))
Then add the new Prep layer to your Sequential model (a sketch of how it would slot in is shown below). Let me know if this works; if it doesn't, I'll delete this answer.
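For concreteness, a minimal sketch of how the Prep layer could slot into the Sequential model from the question, reusing IMAGE_SIZE and q_aware_pretrained_model as defined there:

# Prep replaces the Lambda preprocessing layer; everything else in the
# Sequential stack stays the same as in the question.
base_model = tf.keras.Sequential([
    Prep(input_shape=[*IMAGE_SIZE, 3]),
    q_aware_pretrained_model,
    tf.keras.layers.GlobalAveragePooling2D(),
])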