分析node.js中的大型json日志文件

use*_*549 7 javascript json node.js

我有以下JSON文件:

sensorlogs.json
{"arr":[{"UTCTime":10000001,"s1":22,"s2":32,"s3":42,"s4":12},
{"UTCTime":10000002,"s1":23,"s2":33,"s4":13},
{"UTCTime":10000003,"s1":24,"s2":34,"s3":43,"s4":14},
{"UTCTime":10000005,"s1":26,"s2":36,"s3":44,"s4":16},
{"UTCTime":10000006,"s1":27,"s2":37,"s4":17},
{"UTCTime":10000004,"s1":25,"s2":35,"s4":15},
...
{"UTCTime":12345678,"s1":57,"s2":35,"s3":77,"s4":99}
]}
Run Code Online (Sandbox Code Playgroud)

传感器s1、s2、s3等以不同的频率发送数据(注意s3每2秒发送一次,并且时间戳可能不按顺序排列).

我怎样才能得到类似下面的输出:

Analyzing s1:
s = [[10000001, 22], [10000002, 23],.. [12345678,57]]
s1 had 2 missing entries
Analyzing s2:
s = [[10000001, 32], [10000002, 33],.. [12345678,35]]
s2 had 0 missing entries
Analyzing s3:
s = [[10000001, 42], [10000003, 43],.. [12345678,77]]
s3 had 0 missing entries
Analyzing s4:
s = [[10000001, 12], [10000002, 13],.. [12345678,99]]
s4 had 1 missing entries
Run Code Online (Sandbox Code Playgroud)

sensorlogs.json是16 GB.

可以根据连续UTC时间戳的差异找到缺少的条目.每个传感器以已知频率传输.

由于内存限制,我无法使用多个大型数组进行分析,因此我必须在同一个JSON日志文件上进行多次传递,并且只使用单个大型数组进行分析.

到目前为止我所拥有的是 -

var result = [];
//1. Extract all the keys from the log file
console.log("Extracting keys... \n");
var stream = fs.createReadStream(filePath);
var lineReader = lr.createInterface(
{
  input: stream
});

lineReader.on('line', function (line) 
{
  getKeys(line);//extract all the keys from the JSON
});
// Wait for the line reader's 'close' event rather than the raw stream's
// 'end', so the next phase only starts once the first pass has finished.
lineReader.on('close', function()
{
  //obj -> arr
  for(var key in tmpObj)
    arrStrm.push(key);

  //2. Validate individual sensors
  console.log("Validating the sensor data ...\n");

  // One pass over the file per sensor, strictly one after another.
  // async.each would start every pass at once: currSensor would be
  // overwritten by the last key and all read streams would be open
  // simultaneously. eachSeries waits for done() before starting the next key.
  async.eachSeries(arrStrm, function(key, done)
  {
    currSensor = key;
    console.log("validating " + currSensor + "...\n");

    var sensorStream = fs.createReadStream(filePath);
    var sensorReader = lr.createInterface(
    {
      input: sensorStream
    });

    sensorReader.on('line', function (line) 
    {
      processLine(line);//Create the arrays for the sensors
    });
    sensorReader.on('close', function()
    {
      processSensor(currSensor);//Process the data for the current sensor
      done();//signal async that this sensor's pass is complete
    });
  });
});

/**
 * Registers every key found in one line of the log file in `tmpObj`.
 *
 * Each line is expected to hold a single JSON record, optionally followed by
 * a trailing comma and/or carriage return. The opening line may be prefixed
 * by the array wrapper (e.g. `{"arr":[`); the wrapper is stripped so that the
 * first record's keys are not lost. Lines that still contain `[` or `]`
 * (such as the closing `]}`) and empty lines are ignored. The "debug" key is
 * never registered.
 *
 * @param {string} line - One raw line from the log file.
 * @throws {SyntaxError} If the remaining text is not valid JSON.
 */
function getKeys(line) 
{
    // Wrapper header that shares a line with the first record: `{"<name>":[`.
    var wrapperPrefix = /^\s*\{\s*"[^"]*"\s*:\s*\[/;
    var text = line.replace(wrapperPrefix, '');

    // Anything still carrying a bracket is not a plain one-record line.
    if (text.indexOf('[') >= 0 || text.indexOf(']') >= 0)
        return;
    if (text[text.length-1] === '\r') text = text.slice(0, -1); // discard CR (0x0D)
    if (text[text.length-1] === ',') text = text.slice(0, -1); // discard ,

    if (text.length <= 1)
        return; // ignore empty lines

    var obj = JSON.parse(text); // parse the JSON
    for (var key in obj) 
    {
        // `== null` matches both undefined and null, like the original `== undefined`.
        if (key !== "debug" && tmpObj[key] == null)
            tmpObj[key] = [];
    }
}
Run Code Online (Sandbox Code Playgroud)

当然这不起作用,我无法在网上找到任何解释如何实现的内容.

注意:我可以选用任何语言来开发这个工具(C/C++、C#、Java、Python),但我打算使用JavaScript,因为它能够轻松地解析JSON数组(而且我也想借此提高自己的JS水平).如果JavaScript不是开发这类工具的最佳语言,有人能建议其他替代语言吗?

编辑:一些重要的信息要么不是很清楚,要么我之前没有包含,但看起来重要的是要包含在问题中 -

  1. JSON日志中的数据不是实时流式传输,而是存储在硬盘中的JSON文件
  2. 存储的数据不是按时间顺序排列的,这意味着时间戳可能没有按正确的顺序出现.因此,每个传感器的数据在存入数组之后,都需要按时间戳进行排序
  3. 我不能为每个传感器使用单独的数组(这与在RAM中存储整个16 GB JSON相同),为了节省内存,一次只能使用一个数组.是的,我的日志中不止4个传感器,这里只是一个样本(实际大约有20个,供参考)

我修改了我的JSON和预期输出

一种解决方案可能是对JSON文件进行多次传递,一次将一个传感器数据与时间戳存储在一个数组中,然后对数组进行排序,最后分析数据是否存在损坏和间隙.这就是我在上面的代码中尝试做的事情

Art*_*rez 2

再次编辑以考虑您的编辑:

// Streams sensorlogs.json and reports, for the sensor named on the command
// line, its readings (as [UTCTime, value] pairs) and how many records had no
// value for that sensor.
var fs = require('fs');

// Sensor key to analyse, e.g. "s1"; taken from the first CLI argument.
var sensor = process.argv[2];
if (!sensor) {
    // Without a sensor name every record would be counted as "missing".
    console.error('Usage: node processJson.js <sensor>');
    process.exit(1);
}

var stream = fs.createReadStream('sensorlogs.json', {flags: 'r', encoding: 'utf-8'});
var buffer = '';      // text received from the stream but not yet consumed
var readings = [];    // [UTCTime, value] pairs collected for `sensor`
var missingCont = 0;  // number of records without a value for `sensor`
console.log('Analizying ' + sensor + ':');

stream.on('data', function(d) {
    // With an encoding set, each chunk already arrives as a string.
    buffer += d;
    processBuffer();
});

// Report once, after the whole file has been read. Printing from the 'data'
// handler would dump the partial results again for every chunk of the file.
stream.on('end', function() {
    console.log(readings);
    console.log(sensor + ' had ' + missingCont + ' missing entries');
});

/**
 * Consumes every complete record currently held in the global `buffer`,
 * passing each one to `processLine`, and leaves any trailing incomplete
 * record in `buffer` so the next chunk can finish it.
 *
 * Assumes records are flat objects (no nested `{}`/`[]` and no braces or
 * brackets inside string values), so the only `[` in the input is the one
 * that opens the wrapping array and the first `}` after a `{"` ends a record.
 */
function processBuffer() {
  // Drop the `{"arr":[` wrapper when it is present. indexOf returns -1 on
  // later chunks, and slicing with -1 would keep only the last character.
  var arrayStart = buffer.indexOf('[');
  if (arrayStart !== -1) {
    buffer = buffer.slice(arrayStart + 1);
  }

  var start = buffer.indexOf('{"');
  while (start !== -1) {
    var end = buffer.indexOf('}', start);
    if (end === -1) {
      break; // record is split across chunks; wait for more data
    }
    processLine(buffer.slice(start, end + 1));
    buffer = buffer.slice(end + 1);
    start = buffer.indexOf('{"');
  }

  // Discard separators in front of a pending partial record.
  if (start > 0) {
    buffer = buffer.slice(start);
  }
}

/**
 * Parses one JSON record and folds it into the global analysis state.
 *
 * If the record has no value for the global `sensor` key, `missingCont` is
 * incremented. Otherwise `[UTCTime, value]` is inserted into the global
 * `readings` array so that it stays ordered by ascending UTCTime (a record
 * with an equal timestamp goes after the existing ones).
 *
 * NOTE(review): "missing" here means the record lacks the sensor key; gaps
 * between consecutive timestamps are not examined by this function.
 *
 * @param {string} line - JSON text of a single record; "" is ignored.
 * @throws {SyntaxError} If `line` is non-empty and not valid JSON.
 */
function processLine(line) {
  if (line === "") {
    return;
  }

  var obj = JSON.parse(line);
  var value = obj[sensor];

  // Test for absence explicitly: a truthiness check would count a genuine
  // reading of 0 as missing.
  if (value == null) {
    missingCont++;
    return;
  }

  // Walk back from the tail to the insertion point. Logs are mostly in
  // order, so this usually stops immediately, while yielding the same index
  // as scanning from the front for the first larger timestamp.
  var pos = readings.length;
  while (pos > 0 && readings[pos - 1][0] > obj.UTCTime) {
    pos--;
  }
  readings.splice(pos, 0, [obj.UTCTime, value]);
}
Run Code Online (Sandbox Code Playgroud)

您必须使用要分析的传感器的参数来调用它:

node.exe scripts\processJson.js <param>
Run Code Online (Sandbox Code Playgroud)

为了测试它,我拿了这个样本:

{"arr":[{"UTCTime":10000001,"s1":22,"s2":32,"s3":42,"s4":12},
{"UTCTime":10000005,"s1":20,"s2":30,"s3":40,"s4":10},
{"UTCTime":10000002,"s1":23,"s2":33,"s4":13},
{"UTCTime":10000003,"s1":24,"s2":34,"s3":43,"s4":14},
{"UTCTime":12345678,"s1":57,"s2":35,"s3":77,"s4":99}
]}
Run Code Online (Sandbox Code Playgroud)

输出是:

> node.exe scripts\processJson.js s1
Analizying s1:
[[10000001, 22], [10000002, 23], [10000003, 24], [10000005, 20], [12345678, 57]]
s1 had 0 missing entries

> node.exe scripts\processJson.js s2
Analizying s2:
[[10000001, 32], [10000002, 33], [10000003, 34], [10000005, 30], [12345678, 35]]
s2 had 0 missing entries

> node.exe scripts\processJson.js s3
Analizying s3:
[[10000001, 42], [10000003, 43], [10000005, 40], [12345678, 77]]
s3 had 1 missing entries

> node.exe scripts\processJson.js s4
Analizying s4:
[[10000001, 12], [10000002, 13], [10000003, 14], [10000005, 10], [12345678, 99]]
s4 had 0 missing entries
Run Code Online (Sandbox Code Playgroud)