在客户端JavaScript中读取逐行文件

Question

在客户端JavaScript中读取逐行文件

Ant*_*rin 22 html javascript html5 client-side filereader

你可以帮我解决以下问题.

目标

在客户端(通过JS和HTML5类在浏览器中)逐行读取文件,而不将整个文件加载到内存中.

脚本

我正在处理网页,它应解析客户端的文件.目前,我正在阅读本文中描述的文件.

HTML:

<input type="file" id="files" name="files[]" />

Run Code Online (Sandbox Code Playgroud)

JavaScript的:

$("#files").on('change', function(evt){
    // creating FileReader
    var reader = new FileReader();

    // assigning handler
    reader.onloadend = function(evt) {      
        lines = evt.target.result.split(/\r?\n/);

        lines.forEach(function (line) {
            parseLine(...);
        }); 
    };

    // getting File instance
    var file = evt.target.files[0];

    // start reading
    reader.readAsText(file);
}

Run Code Online (Sandbox Code Playgroud)

问题是FileReader立即读取整个文件,这会导致大文件崩溃的选项卡(大小> = 300 MB).使用reader.onprogress不能解决问题,因为它只会增加结果,直到它达到极限.

发明一个轮子

我已经在互联网上做了一些研究,并没有找到任何简单的方法来做到这一点(有很多文章描述了这个确切的功能,但在服务器端为node.js).

作为解决它的唯一方法,我只看到以下内容:

按块拆分文件(通过File.split(startByte, endByte)方法)
在该块中找到最后一个新行字符('/ n')
读取除最后一个新行字符后的部分以外的那个块,并将其转换为字符串并按行分割
从第2步中找到的最后一个新行字符开始读取下一个块

但我最好使用现有的东西来避免熵增长.

Answer 1

Ant*_*rin 14

最终我创建了新的逐行阅读器,这与以前的阅读器完全不同.

特点是:

基于索引的文件访问(顺序和随机)
针对重复随机读取进行了优化(为已经过去导航的行保存了字节偏移的里程碑),因此在您读取所有文件一次后,访问第43422145行的速度几乎与访问第12行一样快.
在文件中搜索:找到下一个并找到所有.
精确的索引,偏移量和匹配长度,因此您可以轻松突出显示它们

查看这个jsFiddle以获取示例.

用法:

// Initialization
var file; // HTML5 File object
var navigator = new FileNavigator(file);

// Read some amount of lines (best performance for sequential file reading)
navigator.readSomeLines(startingFromIndex, function (err, index, lines, eof, progress) { ... });

// Read exact amount of lines
navigator.readLines(startingFromIndex, count, function (err, index, lines, eof, progress) { ... });

// Find first from index
navigator.find(pattern, startingFromIndex, function (err, index, match) { ... });

// Find all matching lines
navigator.findAll(new RegExp(pattern), indexToStartWith, limitOfMatches, function (err, index, limitHit, results) { ... });

Run Code Online (Sandbox Code Playgroud)

性能与以前的解决方案相同.你可以测量它在jsFiddle中调用'Read'.

GitHub:https://github.com/anpur/client-line-navigator/wiki

Answer 2

Ant*_*rin 8

更新:从我的第二个答案检查LineNavigator,那个读者更好.

我制作了自己的读者,满足了我的需求.

性能

由于问题仅与大文件有关,因此性能是最重要的部分. 在此输入图像描述

如您所见,性能与直接读取几乎相同(如上所述). ~~目前我正在努力使它变得更好,因为更大的时间消费者是异步调用以避免调用堆栈限制命中,这对于执行问题不是必需的.~~性能问题解决了.

质量

以下案例进行了测试:

空的文件
单行文件
最后用新行char填充文件而不用
检查已解析的行
在同一页面上多次运行
没有线丢失,没有订单问题

代码和用法

HTML:

<input type="file" id="file-test" name="files[]" />
<div id="output-test"></div>

Run Code Online (Sandbox Code Playgroud)

用法:

$("#file-test").on('change', function(evt) {
    var startProcessing = new Date();
    var index = 0;
    var file = evt.target.files[0];
    var reader = new FileLineStreamer();
    $("#output-test").html("");

    reader.open(file, function (lines, err) {
        if (err != null) {
            $("#output-test").append('<span style="color:red;">' + err + "</span><br />");
            return;
        }
        if (lines == null) {
            var milisecondsSpend = new Date() - startProcessing;
            $("#output-test").append("<strong>" + index + " lines are processed</strong> Miliseconds spend: " + milisecondsSpend + "<br />");           
            return;
        }

        // output every line
        lines.forEach(function (line) {
            index++;
            //$("#output-test").append(index + ": " + line + "<br />");
        });

        reader.getNextLine();
    });

    reader.getNextLine();   
});

Run Code Online (Sandbox Code Playgroud)

码:

function FileLineStreamer() {   
    var loopholeReader = new FileReader();
    var chunkReader = new FileReader(); 
    var delimiter = "\n".charCodeAt(0); 

    var expectedChunkSize = 15000000; // Slice size to read
    var loopholeSize = 200;         // Slice size to search for line end

    var file = null;
    var fileSize;   
    var loopholeStart;
    var loopholeEnd;
    var chunkStart;
    var chunkEnd;
    var lines;
    var thisForClosure = this;
    var handler;

    // Reading of loophole ended
    loopholeReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole (start: )"));
            return;
        }
        var view = new DataView(evt.target.result);

        var realLoopholeSize = loopholeEnd - loopholeStart;     

        for(var i = realLoopholeSize - 1; i >= 0; i--) {                    
            if (view.getInt8(i) == delimiter) {
                chunkEnd = loopholeStart + i + 1;
                var blob = file.slice(chunkStart, chunkEnd);
                chunkReader.readAsText(blob);
                return;
            }
        }

        // No delimiter found, looking in the next loophole
        loopholeStart = loopholeEnd;
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
        thisForClosure.getNextBatch();
    };

    // Reading of chunk ended
    chunkReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole"));
            return;
        }

        lines = evt.target.result.split(/\r?\n/);       
        // Remove last new line in the end of chunk
        if (lines.length > 0 && lines[lines.length - 1] == "") {
            lines.pop();
        }

        chunkStart = chunkEnd;
        chunkEnd = Math.min(chunkStart + expectedChunkSize, fileSize);
        loopholeStart = Math.min(chunkEnd, fileSize);
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);

        thisForClosure.getNextBatch();
    };

    this.getProgress = function () {
        if (file == null)
            return 0;
        if (chunkStart == fileSize)
            return 100;         
        return Math.round(100 * (chunkStart / fileSize));
    }

    // Public: open file for reading
    this.open = function (fileToOpen, linesProcessed) {
        file = fileToOpen;
        fileSize = file.size;
        loopholeStart = Math.min(expectedChunkSize, fileSize);
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
        chunkStart = 0;
        chunkEnd = 0;
        lines = null;
        handler = linesProcessed;
    };

    // Public: start getting new line async
    this.getNextBatch = function() {
        // File wasn't open
        if (file == null) {     
            handler(null, new Error("You must open a file first"));
            return;
        }
        // Some lines available
        if (lines != null) {
            var linesForClosure = lines;
            setTimeout(function() { handler(linesForClosure, null) }, 0);
            lines = null;
            return;
        }
        // End of File
        if (chunkStart == fileSize) {
            handler(null, null);
            return;
        }
        // File part bigger than expectedChunkSize is left
        if (loopholeStart < fileSize) {
            var blob = file.slice(loopholeStart, loopholeEnd);
            loopholeReader.readAsArrayBuffer(blob);
        }
        // All file can be read at once
        else {
            chunkEnd = fileSize;
            var blob = file.slice(chunkStart, fileSize);
            chunkReader.readAsText(blob);
        }
    };
};

Run Code Online (Sandbox Code Playgroud)

归档时间：	11 年，5 月前
查看次数：	26039 次
最近记录：	8 年，5 月前