使用日志在 s3 存储桶内搜索

CREATE EXTERNAL TABLE IF NOT EXISTS S3Accesslogs(
  BucketOwner string,
  Bucket string,
  RequestDateTime string,
  RemoteIP string,
  Requester string,
  RequestID string,
  Operation string,
  Key string,
  RequestURI_operation string,
  RequestURI_key string,
  RequestURI_httpProtoversion string,
  HTTPstatus string,
  ErrorCode string,
  BytesSent string,
  ObjectSize string,
  TotalTime string,
  TurnAroundTime string,
  Referrer string,
  UserAgent string,
  VersionId string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
 'serialization.format' = '1',
  'input.regex' = '([^ ]*) ([^ ]*) \\[(.*?)\\] ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) \\\"([^ ]*) ([^ ]*) (- |[^ ]*)\\\" (-|[0-9]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) (\"[^\"]*\") ([^ ]*)$'
) 
LOCATION 's3://s3-server-access/logs/'

Run Code Online (Sandbox Code Playgroud)

然后，您可以使用 SQL 查询它：

SELECT requestdatetime, bucket, remoteip, requesturi_key
FROM s3accesslogs
WHERE bucket IN ('bucket1', 'bucket2')
    AND remoteip = '123.45.67.89'
ORDER BY requestdatetime DESC
LIMIT 100;

Run Code Online (Sandbox Code Playgroud)

归档时间：	9 年，1 月前
查看次数：	12764 次
最近记录：	6 年，5 月前

使用日志在 ​​s3 存储桶内搜索

使用日志在 s3 存储桶内搜索