esp32_cam读取并处理图像

Question

esp32_cam读取并处理图像

nop*_*act 5 c++ arduino esp32 tensorflow-lite

我正在尝试在 esp32_cam 上使用 tensorflow-lite 对图像进行分类。我定义了需要解决的以下子任务：

拍照
将照片尺寸缩小至（例如）28x28 像素灰度
使用经过训练的模型进行推理

现在我陷入了第 1 点和第 2 点之间，无法解决。到目前为止我所做的：我使用将图像保存到缓冲区中esp_camera_fb_get()。然后，我将缓冲区中的值放入二维数组中。然而，当我打印出其中一些值时，它们永远不会变成 0 或 255，即使我覆盖了整个镜头或在其附近放置了一个明亮的光源。

我有四个问题：

如何正确录制图像？
如何将其转换为二维数组？
如何将尺寸从（例如）160x120 缩小到 28x28？
如何正确地Serial.print()复制每个像素值并将其绘制在我的计算机上（例如使用 python matplotlib）

#define CAMERA_MODEL_AI_THINKER

 #include <esp_camera.h>
 #include "camera_pins.h"

 #define FRAME_SIZE FRAMESIZE_QQVGA
 #define WIDTH 160
 #define HEIGHT 120    

 uint16_t img_array [HEIGHT][WIDTH] = { 0 };


 bool setup_camera(framesize_t);
 void frame_to_array(camera_fb_t * frame);
 void print_image_shape(camera_fb_t * frame);
 bool capture_image();

 void setup() {
     Serial.begin(115200);
     Serial.println(setup_camera(FRAME_SIZE) ? "OK" : "ERR INIT");
 }

 void loop() {
     if (!capture_image()) {
         Serial.println("Failed capture");
         delay(2000);

         return;
     }

     //print_features();
     delay(3000);
 }



 bool setup_camera(framesize_t frameSize) {
     camera_config_t config;

     config.ledc_channel = LEDC_CHANNEL_0;
     config.ledc_timer = LEDC_TIMER_0;
     config.pin_d0 = Y2_GPIO_NUM;
     config.pin_d1 = Y3_GPIO_NUM;
     config.pin_d2 = Y4_GPIO_NUM;
     config.pin_d3 = Y5_GPIO_NUM;
     config.pin_d4 = Y6_GPIO_NUM;
     config.pin_d5 = Y7_GPIO_NUM;
     config.pin_d6 = Y8_GPIO_NUM;
     config.pin_d7 = Y9_GPIO_NUM;
     config.pin_xclk = XCLK_GPIO_NUM;
     config.pin_pclk = PCLK_GPIO_NUM;
     config.pin_vsync = VSYNC_GPIO_NUM;
     config.pin_href = HREF_GPIO_NUM;
     config.pin_sscb_sda = SIOD_GPIO_NUM;
     config.pin_sscb_scl = SIOC_GPIO_NUM;
     config.pin_pwdn = PWDN_GPIO_NUM;
     config.pin_reset = RESET_GPIO_NUM;
     config.xclk_freq_hz = 20000000;
     config.pixel_format = PIXFORMAT_GRAYSCALE;
     config.frame_size = frameSize;
     config.jpeg_quality = 12;
     config.fb_count = 1;

     bool ok = esp_camera_init(&config) == ESP_OK;

     sensor_t *sensor = esp_camera_sensor_get();
     sensor->set_framesize(sensor, frameSize);

     return ok;
 }



 bool capture_image() {

     camera_fb_t * frame = NULL;
     frame = esp_camera_fb_get();

     print_image_shape(frame);

     frame_to_array(frame);

     esp_camera_fb_return(frame);

     if (!frame)
         return false;

     return true;
 }


 void print_image_shape(camera_fb_t * frame){

     // print shape of image and total length (=heigth*width)
     Serial.print("Width: ");
     Serial.print(frame->width);
     Serial.print("\tHeigth: ");
     Serial.print(frame->height);
     Serial.print("\tLength: ");
     Serial.println(frame->len);
 }

 void frame_to_array(camera_fb_t * frame){

     int len = frame->len;
     char imgBuffer[frame->len];
     int counter = 0;

     uint16_t img_array [HEIGHT][WIDTH] = { 0 };

     int h_counter = 0;
     int w_counter = 0;

     // write values from buffer into 2D Array
     for (int h=0; h < HEIGHT; h++){
         //Serial.println(h);
         for (int w=0; w < WIDTH; w++){
             //Serial.println(w);
             int position = h*(len/HEIGHT)+w;

             //Serial.println(position);
             img_array[h][w] = {frame->buf[position]};

             //Serial.print(img_array[h][w]);
             //Serial.print(",");
             //delay(2);
         }
     }


     //Serial.println("Current frame:");

     Serial.println("=====================");

 }

Run Code Online (Sandbox Code Playgroud)

相机_pin.h：

#if defined(CAMERA_MODEL_WROVER_KIT)
#define PWDN_GPIO_NUM    -1
#define RESET_GPIO_NUM   -1
#define XCLK_GPIO_NUM    21
#define SIOD_GPIO_NUM    26
#define SIOC_GPIO_NUM    27

#define Y9_GPIO_NUM      35
#define Y8_GPIO_NUM      34
#define Y7_GPIO_NUM      39
#define Y6_GPIO_NUM      36
#define Y5_GPIO_NUM      19
#define Y4_GPIO_NUM      18
#define Y3_GPIO_NUM       5
#define Y2_GPIO_NUM       4
#define VSYNC_GPIO_NUM   25
#define HREF_GPIO_NUM    23
#define PCLK_GPIO_NUM    22

#elif defined(CAMERA_MODEL_ESP_EYE)
#define PWDN_GPIO_NUM    -1
#define RESET_GPIO_NUM   -1
#define XCLK_GPIO_NUM    4
#define SIOD_GPIO_NUM    18
#define SIOC_GPIO_NUM    23

#define Y9_GPIO_NUM      36
#define Y8_GPIO_NUM      37
#define Y7_GPIO_NUM      38
#define Y6_GPIO_NUM      39
#define Y5_GPIO_NUM      35
#define Y4_GPIO_NUM      14
#define Y3_GPIO_NUM      13
#define Y2_GPIO_NUM      34
#define VSYNC_GPIO_NUM   5
#define HREF_GPIO_NUM    27
#define PCLK_GPIO_NUM    25

#elif defined(CAMERA_MODEL_M5STACK_PSRAM)
#define PWDN_GPIO_NUM     -1
#define RESET_GPIO_NUM    15
#define XCLK_GPIO_NUM     27
#define SIOD_GPIO_NUM     25
#define SIOC_GPIO_NUM     23

#define Y9_GPIO_NUM       19
#define Y8_GPIO_NUM       36
#define Y7_GPIO_NUM       18
#define Y6_GPIO_NUM       39
#define Y5_GPIO_NUM        5
#define Y4_GPIO_NUM       34
#define Y3_GPIO_NUM       35
#define Y2_GPIO_NUM       32
#define VSYNC_GPIO_NUM    22
#define HREF_GPIO_NUM     26
#define PCLK_GPIO_NUM     21

#elif defined(CAMERA_MODEL_M5STACK_WIDE)
#define PWDN_GPIO_NUM     -1
#define RESET_GPIO_NUM    15
#define XCLK_GPIO_NUM     27
#define SIOD_GPIO_NUM     22
#define SIOC_GPIO_NUM     23

#define Y9_GPIO_NUM       19
#define Y8_GPIO_NUM       36
#define Y7_GPIO_NUM       18
#define Y6_GPIO_NUM       39
#define Y5_GPIO_NUM        5
#define Y4_GPIO_NUM       34
#define Y3_GPIO_NUM       35
#define Y2_GPIO_NUM       32
#define VSYNC_GPIO_NUM    25
#define HREF_GPIO_NUM     26
#define PCLK_GPIO_NUM     21

#elif defined(CAMERA_MODEL_AI_THINKER)
#define PWDN_GPIO_NUM     32
#define RESET_GPIO_NUM    -1
#define XCLK_GPIO_NUM      0
#define SIOD_GPIO_NUM     26
#define SIOC_GPIO_NUM     27

#define Y9_GPIO_NUM       35
#define Y8_GPIO_NUM       34
#define Y7_GPIO_NUM       39
#define Y6_GPIO_NUM       36
#define Y5_GPIO_NUM       21
#define Y4_GPIO_NUM       19
#define Y3_GPIO_NUM       18
#define Y2_GPIO_NUM        5
#define VSYNC_GPIO_NUM    25
#define HREF_GPIO_NUM     23
#define PCLK_GPIO_NUM     22

#else
#error "Camera model not selected"
#endif

Run Code Online (Sandbox Code Playgroud)

Answer 1

PHA*_*HAN 5

我没有使用过 ESP32 相机，所以我不能谈论这个，但我在 STM32 上做过一个类似的项目，所以这就是我能回答的：

1. 如何正确录制图像？

我在微控制器上设置摄像头时也遇到了麻烦，所以我的想法和你一样，通过串口将图像传回电脑。请参考第4点。

2.如何将其转换为二维数组？

我怀疑您想这样做是为了将其复制到 tflite 微模型输入缓冲区。如果是这样的话，你就不需要了！您可以将展平的一维图像数组写入模型输入缓冲区，因为这是 tflite micro 实际期望的：

uint8_t img_array[HEIGHT * WIDTH] = { 0 }; // grayscale goes from 0 to 255. fits in 8bits
TfLiteTensor* model_input = nullptr;
...
void setup(){
    ... // Create your tflite interpreter and rest of your code
    model_input = interpreter->input(0); // get model input pointer
}
void loop() {
    ...
    // tflite model has input shape [batch_size, height, width, channels]
    // which in turn is [1, HEIGHT, WIDTH, 1] one channel because I think you are
    // using grayscale images, otherwise 3(RGB)
    // but tflite micro expects flattened 1D array so you can just do this
    for (uint32_t i = 0; i < HEIGHT*WIDTH; i++){
        // Assuming your model input expects signed 8bit integers
        model_input->data.int8[i] = (int8_t) (img_array[i] - 128);
    }
}

Run Code Online (Sandbox Code Playgroud)

编辑：最后一行获取model_input指向模型输入结构的指针并访问其data成员（如果您不熟悉 C 中的结构指针，请参阅此内容）。然后，由于我假设您的模型输入数据类型是 8 位有符号整数，因此它使用int8. model_input->data.f[i]例如，如果您的模型输入数据类型是 32 位浮点数，您就可以使用。这是包含所有可用访问类型的源代码。正确寻址模型输入缓冲区后，我们分配相应的img_array像素数据。由于像素数据的范围是[0, 255]，我们需要将其转换为有效的有符号8位整数类型和范围，因此必须减去128，得到[-128, 127]范围。

希望你明白了。如果您使用 RGB565 等其他格式，请告诉我，我会给您一个不同的片段。

编辑：如果您要捕获 RGB 图像，最常用的格式是 RGB565，这意味着每 16 位就有一个像素数据（5 为红色，6 为绿色，5 为蓝色）。下面是一个片段，它将以该格式捕获的图像转换为 RGB888（这可能是您的模型所期望的）并将其复制到模型输入缓冲区：

// NOTICE FRAME BUFFER IS NOW uint16_t to store each pixel
uint16_t img_array[HEIGHT * WIDTH] = { 0 }; 
TfLiteTensor* model_input = nullptr;
...
void setup(){
    ... // Create your tflite interpreter and rest of your code
    model_input = interpreter->input(0); // get model input pointer
}
void loop() {
    ...
    // Fill input buffer
    uint32_t input_ix = 0; // index for the model input

    // tflite model has input shape [batch_size, height, width, channels]
    // which in turn is [1, HEIGHT, WIDTH, 3] three channels because RGB
    // but tflite micro expects flattened 1D array so you can just do this
    for (uint32_t pix = 0; i < HEIGHT*WIDTH; pix++){
       // Convert from RGB55 to RGB888 and int8 range
       uint16_t color = img_array[pix];
       int16_t r = ((color & 0xF800) >> 11)*255/0x1F - 128;
       int16_t g = ((color & 0x07E0) >> 5)*255/0x3F - 128;
       int16_t b = ((color & 0x001F) >> 0)*255/0x1F - 128;

       model_input->data.int8[input_ix] =   (int8_t) r;
       model_input->data.int8[input_ix+1] = (int8_t) g;
       model_input->data.int8[input_ix+2] = (int8_t) b;

       input_ix += 3;
    }
}

Run Code Online (Sandbox Code Playgroud)

这是 C 语言中 RGB888 到 RGB565 的分步指南，我只是做了相反的操作。您可能已经注意到屏蔽颜色通道位后的乘法。以红色为例：一旦你屏蔽掉这些位，(color & 0xF800) >> 11)红色值将从 [0, (2^5)-1] 开始，但我们想要一个 [0, 255] 范围，所以我们除以该数字（ (2^5 )-1 = 31 = 0x1F ) 并乘以 255，给出我们想要的范围。然后我们可以减去 128 得到 [-128, 127] 有符号的 8 位范围。事实上，之前进行乘法是为了保持精度。蓝色通道是相同的，在绿色通道中，我们除以 (2^6)-1=63=0x3F，因为它有 6 位。

3. 如何将尺寸从（例如）160x120 缩小到 28x28？

你可以用 C 语言实现一个算法，但我采用了简单的方法：我在已经训练好的模型中添加了一个预处理 lambda 层，它的作用就是：

IMG_SIZE = (28, 28)

def lm_uc_preprocess(inputs):
    # 'nearest' is the ONLY method supported by tflite micro as of October 2020 as you can see in
    # https://github.com/tensorflow/tensorflow/blob/a1e5d73663152b0d7f0d9661e5d602b442acddba/tensorflow/lite/micro/all_ops_resolver.cc#L70
    res_imgs = tf.image.resize(inputs, IMG_SIZE, method='nearest') 
    # Normalize to the range [-1,1] # (OPTIONAL)
    norm_imgs = res_imgs*(1/127.5) -1 # multiply by reciprocal of 127.5 as DIV is not supported by tflite micro
    return norm_imgs

Run Code Online (Sandbox Code Playgroud)

编辑：大多数计算机视觉模型期望图像输入值范围为 [0, 1] 或 [-1, 1]，但像素值通常为 8 位，因此它们的范围为 [0, 255]。要将它们的值标准化到所需的范围 [a, b]，我们可以应用以下公式：
$归一化公式$

在我们的例子中，min(x)=0，max(x)=255，a=-1，b=1。因此，每个归一化值是x_normalized = x_value/127.5 -1。
直观地你可以看到255/127.5 -1 = 1，以及0/255 -1 = -1。这就是 127.5 和 -1 值的来源。

现在您可以定义完整的模型：

capture_height, capture_width, channels = (160, 120, 1)

uc_final_model = keras.models.Sequential([
    keras.layers.InputLayer((capture_height, capture_width, channels), dtype=tf.float32),
    keras.layers.Lambda(lm_uc_preprocess), # (160, 120) to (28, 28)
    my_trained_model
])

# You should quantize your model parameters and inputs to int8 when compressing to tflite after this

Run Code Online (Sandbox Code Playgroud)

这样，最终模型的输入形状等于相机捕获分辨率。这允许我复制图像数组，如第 2 点所示。

4.如何正确地 Serial.print() 每个像素值来复制这些值并将它们绘制在我的计算机上（例如使用 python matplotlib）

我尝试了一些方法，这对我有用：您可以尝试打印这样的值123, 32, 1, 78, 90,（即用逗号分隔），这应该很容易做到。然后，如果您使用 Arduino，您可以使用这个很酷的程序来记录串行数据。如果您不使用 arduino，Putty 具有日志记录功能。然后你可以做这样的事情：

with open("img_test.txt") as f:
    str_img_test = f.read()

img_test = np.array(str_img_test.split(",")[:-1], dtype=np.uint8)
img_test = img_test.reshape(160, 120)

plt.figure()
plt.imshow(img_test)
plt.axis('off')
plt.show()

Run Code Online (Sandbox Code Playgroud)

捕获图像并保存日志的过程有点麻烦，但它不应该太令人沮丧，因为这只是为了调试图像是否正确捕获。

这是一个非常普遍的问题，所以如果我错过了什么或者您想在某些方面有更深入的了解，请告诉我。

编辑

我已在此存储库上公开（并开源）我的完整代码和文档，其中包含与您正在构建的应用程序非常相似的应用程序。此外，我还计划将计算机视觉示例移植到 ESP32。请注意，该存储库正在开发中，并且将持续一段时间，尽管此示例已经完成（待修订）。

我认为很多对微控制器深度学习感兴趣的人会发现该存储库有趣且有用。

归档时间：	5 年前
查看次数：	5878 次
最近记录：	5 年前