使用 Terraform 创建时,ECS Fargate 任务无法进行运行状况检查

Vis*_*rez 4 amazon-web-services node.js docker terraform aws-fargate

我创建了一个 ECS 集群以及一个负载均衡器,以使用 Terraform 在 Fargate 上公开一个基本的 hello-world Node 应用程序。Terraform 成功地创建了我的 AWS 资源,并在 ECS Fargate 上部署了正确的镜像,但该任务从未通过初始运行状况检查,并无限期地重新启动。我认为这是一个端口转发问题,但我相信我的 Dockerfile、负载均衡器和任务定义都公开了正确的端口。

以下是我在 ECS 仪表板上查看服务的“事件”选项卡时看到的错误:

service my-first-service (port 2021) is unhealthy in target-group target-group due to (reason Request timed out).
Run Code Online (Sandbox Code Playgroud)

以下是我用于部署到 Fargate 的应用程序代码、Dockerfile 和 Terraform 文件:

index.js

service my-first-service (port 2021) is unhealthy in target-group target-group due to (reason Request timed out).
Run Code Online (Sandbox Code Playgroud)

index.js

// Minimal Express server: replies to GET / with a fixed greeting.
const express = require('express');

// Port the HTTP server binds to.
const listenPort = 2021;

const app = express();

// Root route: always answers with the same plain-text body.
app.get('/', (_request, response) => {
  response.send('Hello World!');
});

// Start accepting connections and log once the server is up.
app.listen(listenPort, () => {
  console.log(`Example app listening on port ${listenPort}!`);
});
Run Code Online (Sandbox Code Playgroud)

Dockerfile

# Base image: official Node.js 12.7.0 runtime on Alpine
FROM node:12.7.0-alpine

# Every following instruction runs relative to /app
WORKDIR /app

# Bring in the package manifest, then install the dependencies it declares
COPY package.json ./
RUN yarn install

# Add the remaining application sources
COPY . ./

# Declare the port the application listens on (2021)
EXPOSE 2021

# Launch the server when the container starts
CMD ["node", "index.js"]
Run Code Online (Sandbox Code Playgroud)

application_load_balancer_target_group.tf

# Target group the listener forwards to; targets are registered by IP address.
resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  vpc_id      = "${aws_default_vpc.default_vpc.id}" # the default VPC
  target_type = "ip"
  protocol    = "HTTP"
  port        = 80

  # Probe path "/" and treat 200, 301 and 302 responses as healthy.
  health_check {
    path    = "/"
    matcher = "200,301,302"
  }
}

# HTTP listener on port 80 whose default action forwards to the target group.
resource "aws_lb_listener" "listener" {
  load_balancer_arn = "${aws_alb.application_load_balancer.arn}" # the application load balancer
  protocol          = "HTTP"
  port              = "80"

  default_action {
    target_group_arn = "${aws_lb_target_group.target_group.arn}" # the target group
    type             = "forward"
  }
}
Run Code Online (Sandbox Code Playgroud)

application_load_balancer.tf

# Application load balancer spanning the three default subnets.
resource "aws_alb" "application_load_balancer" {
  name               = "test-lb-tf"
  load_balancer_type = "application"

  # Security group that governs traffic to and from the load balancer
  security_groups = ["${aws_security_group.load_balancer_security_group.id}"]

  # Default subnets a, b and c
  subnets = [
    "${aws_default_subnet.default_subnet_a.id}",
    "${aws_default_subnet.default_subnet_b.id}",
    "${aws_default_subnet.default_subnet_c.id}"
  ]
}

# Security group for the load balancer: inbound TCP 80 from anywhere, unrestricted outbound.
resource "aws_security_group" "load_balancer_security_group" {
  # Inbound: only TCP port 80, from any source address
  ingress {
    protocol    = "tcp"
    from_port   = 80
    to_port     = 80
    cidr_blocks = ["0.0.0.0/0"]
  }

  # Outbound: any protocol, any port, any destination
  egress {
    protocol    = "-1" # all protocols
    from_port   = 0    # 0/0 combined with protocol "-1" covers every port
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }
}
Run Code Online (Sandbox Code Playgroud)

ecs_cluster.tf

# ECS cluster that the service below is placed in (referenced as aws_ecs_cluster.my_cluster.id).
resource "aws_ecs_cluster" "my_cluster" {
  name = "my-cluster" # Naming the cluster
}
Run Code Online (Sandbox Code Playgroud)

ecs_service.tf

# Reference to the account's default VPC. Its id is used by the target group, and its
# default subnets (declared below) are used by the load balancer and the ECS service.
resource "aws_default_vpc" "default_vpc" {
}

# Reference to the default subnet in availability zone us-east-2a.
# NOTE: the availability zones of these subnets must belong to the region you deploy to.
resource "aws_default_subnet" "default_subnet_a" {
  availability_zone = "us-east-2a"
}

# Reference to the default subnet in availability zone us-east-2b.
resource "aws_default_subnet" "default_subnet_b" {
  availability_zone = "us-east-2b"
}

# Reference to the default subnet in availability zone us-east-2c.
resource "aws_default_subnet" "default_subnet_c" {
  availability_zone = "us-east-2c"
}


# ECS service that keeps one Fargate task running and registers it with the
# load balancer's target group.
resource "aws_ecs_service" "my_first_service" {
  name            = "my-first-service"                             # Naming our first service
  cluster         = "${aws_ecs_cluster.my_cluster.id}"             # Referencing our created Cluster
  task_definition = "${aws_ecs_task_definition.my_first_task.arn}" # Referencing the task our service will spin up
  launch_type     = "FARGATE"
  desired_count   = 1 # Setting the number of containers we want deployed to 1

  # Registers the task's container with the target group so the load balancer
  # can route requests and health checks to it.
  load_balancer {
    target_group_arn = "${aws_lb_target_group.target_group.arn}" # Referencing our target group
    # NOTE(review): this has to match the container "name" inside the task
    # definition's container_definitions; reusing the family only works when
    # the two are identical — confirm against the task definition.
    container_name   = "${aws_ecs_task_definition.my_first_task.family}"
    container_port   = 2021 # Port the application listens on inside the container
  }

  network_configuration {
    subnets          = ["${aws_default_subnet.default_subnet_a.id}", "${aws_default_subnet.default_subnet_b.id}", "${aws_default_subnet.default_subnet_c.id}"]
    assign_public_ip = true # Providing our containers with public IPs
    # Attach the service security group, which admits traffic from the load
    # balancer's security group. Without this the tasks use the VPC's default
    # security group, the load balancer cannot reach port 2021, and every
    # health check fails with "Request timed out".
    security_groups  = ["${aws_security_group.service_security_group.id}"]
  }
}


# Security group intended for the ECS tasks: inbound only from the load
# balancer's security group, unrestricted outbound.
resource "aws_security_group" "service_security_group" {
  # Inbound: any protocol and port, but only when the source is the load balancer security group
  ingress {
    protocol        = "-1"
    from_port       = 0
    to_port         = 0
    security_groups = ["${aws_security_group.load_balancer_security_group.id}"]
  }

  # Outbound: any protocol, any port, any destination
  egress {
    protocol    = "-1" # all protocols
    from_port   = 0    # 0/0 combined with protocol "-1" covers every port
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }
}

Run Code Online (Sandbox Code Playgroud)

我这里哪里出错了?

cod*_*zer 7

当我从 k8s 迁移到 ECS Fargate 时,我遇到了类似的问题。我的任务无法启动,简直就是噩梦。相同的镜像在 k8s 中使用相同的健康检查却运行良好。

我看到您的 task_definition 中缺少 healthCheck,至少对我来说这就是问题所在。

这是我的containerDefinition

  # Single container definition, serialized to JSON for the task definition.
  container_definitions = jsonencode([{
    name      = "${var.app_name}-container-${var.environment}"
    image     = "${var.container_repository}:${var.container_image_version}"
    essential = true

    # Caller-supplied variables first, followed by the fixed JVM / Spring settings.
    environment = concat(
      var.custom_env_variables,
      [
        {
          name  = "JAVA_TOOL_OPTIONS"
          value = "-Xmx${var.container_memory_max_ram}m -XX:MaxRAM=${var.container_memory_max_ram}m -XX:+UseParallelGC -XX:MinHeapFreeRatio=5 -XX:MaxHeapFreeRatio=10 -XX:GCTimeRatio=4"
        },
        { name = "SPRING_PROFILES_ACTIVE", value = var.spring_profile },
        { name = "APP_NAME", value = var.spring_app_name }
      ]
    )

    # Two TCP ports are exposed: the application port and the actuator port.
    portMappings = [
      { protocol = "tcp", containerPort = var.container_port },
      { protocol = "tcp", containerPort = var.container_actuator_port }
    ]

    # Container-level health check: curl the liveness endpoint from inside the container.
    # NOTE(review): port 8081 is hard-coded here; presumably it equals
    # var.container_actuator_port — confirm.
    healthCheck = {
      retries     = 10
      command     = ["CMD-SHELL", "curl -f http://localhost:8081/actuator/liveness || exit 1"]
      timeout     = 5
      interval    = 10
      startPeriod = var.health_start_period
    }

    # Ship container logs to CloudWatch Logs.
    logConfiguration = {
      logDriver = "awslogs"
      options = {
        awslogs-group         = aws_cloudwatch_log_group.main.name
        awslogs-stream-prefix = "ecs"
        awslogs-region        = var.aws_region
      }
    }

    # Mount the "backend_efs" volume read-write at /data.
    mountPoints = [{
      sourceVolume  = "backend_efs"
      containerPath = "/data"
      readOnly      = false
    }]
  }])
Run Code Online (Sandbox Code Playgroud)

healthCheck 部分:

# Container-level health check: curl the liveness endpoint from inside the
# container; a non-zero exit code marks the check as failed.
healthCheck = {
  command     = ["CMD-SHELL", "curl -f http://localhost:8081/actuator/liveness || exit 1"]
  interval    = 10
  timeout     = 5
  retries     = 10
  startPeriod = var.health_start_period
}
Run Code Online (Sandbox Code Playgroud)

为了启动容器,需要有一种方法来检查任务是否运行正常。我只能通过 curl 来实现:我有一个端点会返回服务是否存活。你需要指定你自己的端点,重要的是它要返回 200。

另外,镜像中默认没有 curl 命令,您需要在 Dockerfile 中安装它。这是我又花了几个小时才发现的下一个问题,因为 ECS 上没有任何明显的报错。

我添加了这一行:

# Install curl so the container health check command can run.
# NOTE(review): apt-get assumes a Debian/Ubuntu-based image; on an Alpine base
# (e.g. node:12.7.0-alpine) the equivalent would presumably be `apk add --no-cache curl` — confirm for your image.
RUN apt-get update && apt-get install -y --no-install-recommends curl
Run Code Online (Sandbox Code Playgroud)