将格式正确的JSON写入S3以加载Athena/Redshift

Mez*_*Mez 3 amazon-web-services aws-lambda amazon-athena amazon-kinesis-firehose

我有一个触发器，会针对 Kinesis 上收到的每笔交易执行一个 Lambda 函数。生产者通过 PutRecordsRequest() 方法一次发送多笔交易。Lambda 函数如下：

// AWS SDK for JavaScript; only the Firehose client is used in this file.
var AWS = require('aws-sdk');
// Firehose client built with no explicit options (SDK defaults apply).
var firehose = new AWS.Firehose();
// Delivery stream name passed to writeToS3 by the handler.
var fhStreamName = "transactions";

/**
 * Sends one payload to a Firehose delivery stream with firehose.putRecord.
 *
 * Despite the name, the only call made here is to Firehose; where the
 * delivery stream forwards the data is configured outside this code.
 * The payload is logged before sending. When Firehose answers, either the
 * error (with its stack) or the response is logged. Failures are logged and
 * handed to the optional callback; they are never thrown.
 *
 * @param {string} jsonString - Payload placed in Record.Data.
 * @param {string} firehoseStreamName - Value used as DeliveryStreamName.
 * @param {function(Error, Object)} [callback] - Optional. Called with
 *     (err, data) after the result has been logged, so a caller can wait
 *     for the request to finish or react to a failure.
 */
function writeToS3(jsonString, firehoseStreamName, callback) {

    console.log("Writing to S3 : " + jsonString);

    // Prepare storage to postings firehose stream...
    var params = {
        DeliveryStreamName: firehoseStreamName,
        Record: {
            Data: jsonString
        }
    };

    // Store data!
    firehose.putRecord(params, function(err, data) {
        if (err) {
            // This needs to be fired to Kinesis in the future...
            console.log(err, err.stack);
        } else {
            console.log(data);
        }

        // Callers that pass no callback keep the original fire-and-forget
        // behaviour.
        if (typeof callback === "function") {
            callback(err, data);
        }
    });
}

/**
 * Decodes a base64-encoded payload into a UTF-8 string.
 *
 * The handler calls this with the `kinesis.data` field of each record.
 *
 * @param {string} event - Base64-encoded payload.
 * @returns {string} The decoded payload as UTF-8 text.
 */
function processEvent(event) {

    // Buffer.from replaces the deprecated `new Buffer(string, encoding)`
    // constructor; the decoded bytes are the same.
    var buf = Buffer.from(event, "base64");

    // Convert to actual string which is readable
    return buf.toString("utf8");
}

exports.handler = function(event, context) {  

    var result = "";  

    // Loop events and register to firehose...  
    for(var i=0; i<event.Records.length; i++){
        result = result + processEvent(event.Records[i].kinesis.data,fhStreamName); 
    }   

    writeToS3(result,fhStreamName); 

    context.done();
};
Run Code Online (Sandbox Code Playgroud)

但是，这些交易写入 S3 之后并不是一个 JSON 数组。以下是一个例子：

{
  "userName" : "val1",
  "betID" : "val2",
  "anotherID" : val3
}{
  "userName" : "val4",
  "anotherID" : "val5",
  "productID" : val6, 
}
Run Code Online (Sandbox Code Playgroud)

这种格式的数据可以直接加载到 Athena 或 Redshift 中吗，还是必须放在一个有效的 JSON 数组里？根据 http://docs.aws.amazon.com/redshift/latest/dg/copy-usage_notes-copy-from-json.html 的说明，它仍然可以加载到 Redshift 中。

以下是在Athena中创建表时使用的属性...

ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES (
  'serialization.format' = '1'
) LOCATION 's3://asgaard-data/data/'
Run Code Online (Sandbox Code Playgroud)

如何加载此数据才能查询?

Jam*_*mes 5

对于Athena,JSON记录需要是每行一个对象:

{ "userName" : "val1", "betID" : "val2", "anotherID" : val3 }
{ "userName" : "val4", "anotherID" : "val5", "productID" : val6 }
Run Code Online (Sandbox Code Playgroud)

这看似违反直觉，因为生成的文件整体并不是一个格式良好的 JSON 文档，但以换行符分隔的文本适用于 Athena、Hive 和类似的处理工具。我相信同样的结构也适用于 Redshift，不过 Redshift 提供了更多选项。