Read a file one line at a time in node.js?

I am trying to read a large file one line at a time. I found a question on Quora that dealt with the subject but I'm missing some connections to make the whole thing fit together.

 var Lazy=require("lazy");
 new Lazy(process.stdin)
     .lines
     .forEach(
          function(line) { 
              console.log(line.toString()); 
          }
 );
 process.stdin.resume();

The bit that I'd like to figure out is how I might read one line at a time from a file instead of STDIN as in this sample.

I tried:

 fs.open('./VeryBigFile.csv', 'r', '0666', Process);

 function Process(err, fd) {
    if (err) throw err;
    // DO lazy read 
 }

but it's not working. I know that in a pinch I could fall back to using something like PHP, but I would like to figure this out.

I don't think the other answer would work as the file is much larger than the server I'm running it on has memory for.

Answers


Since Node.js v0.12 and as of Node.js v4.0.0, there is a stable readline core module. Here's the easiest way to read lines from a file, without any external modules:

var lineReader = require('readline').createInterface({
  input: require('fs').createReadStream('file.in')
});

lineReader.on('line', function (line) {
  console.log('Line from file:', line);
});

The last line is read correctly (as of Node v0.12 or later), even if there is no final \n.

UPDATE: this example has been added to Node's API official documentation.


For such a simple operation there shouldn't be any dependency on third-party modules. Go easy.

var fs = require('fs'),
    readline = require('readline');

var rd = readline.createInterface({
    input: fs.createReadStream('/path/to/file'),
    output: process.stdout,
    console: false
});

rd.on('line', function(line) {
    console.log(line);
});

You don't have to open the file, but instead, you have to create a ReadStream.

fs.createReadStream

Then pass that stream to Lazy


there is a very nice module for reading a file line by line, it's called line-reader

with it you simply just write:

var lineReader = require('line-reader');

lineReader.eachLine('file.txt', function(line, last) {
  console.log(line);
  // do whatever you want with line...
  if(last){
    // or check if it's the last one
  }
});

you can even iterate the file with a "java-style" interface, if you need more control:

lineReader.open('file.txt', function(reader) {
  if (reader.hasNextLine()) {
    reader.nextLine(function(line) {
      console.log(line);
    });
  }
});

require('fs').readFileSync('file.txt', 'utf-8').split(/\r?\n/).forEach(function(line){
  console.log(line);
})

You can always roll your own line reader. I have'nt benchmarked this snippet yet, but it correctly splits the incoming stream of chunks into lines without the trailing '\n'

var last = "";

process.stdin.on('data', function(chunk) {
    var lines, i;

    lines = (last+chunk).split("\n");
    for(i = 0; i < lines.length - 1; i++) {
        console.log("line: " + lines[i]);
    }
    last = lines[i];
});

process.stdin.on('end', function() {
    console.log("line: " + last);
});

process.stdin.resume();

I did come up with this when working on a quick log parsing script that needed to accumulate data during the log parsing and I felt that it would nice to try doing this using js and node instead of using perl or bash.

Anyway, I do feel that small nodejs scripts should be self contained and not rely on third party modules so after reading all the answers to this question, each using various modules to handle line parsing, a 13 SLOC native nodejs solution might be of interest .


Old topic, but this works:

var rl = readline.createInterface({
      input : fs.createReadStream('/path/file.txt'),
      output: process.stdout,
      terminal: false
})
rl.on('line',function(line){
     console.log(line) //or parse line
})

Simple. No need for an external module.


With the carrier module:

var carrier = require('carrier');

process.stdin.resume();
carrier.carry(process.stdin, function(line) {
    console.log('got one line: ' + line);
});

I ended up with a massive, massive memory leak using Lazy to read line by line when trying to then process those lines and write them to another stream due to the way drain/pause/resume in node works (see: http://elegantcode.com/2011/04/06/taking-baby-steps-with-node-js-pumping-data-between-streams/ (i love this guy btw)). I haven't looked closely enough at Lazy to understand exactly why, but I couldn't pause my read stream to allow for a drain without Lazy exiting.

I wrote the code to process massive csv files into xml docs, you can see the code here: https://github.com/j03m/node-csv2xml

If you run the previous revisions with Lazy line it leaks. The latest revision doesn't leak at all and you can probably use it as the basis for a reader/processor. Though I have some custom stuff in there.

Edit: I guess I should also note that my code with Lazy worked fine until I found myself writing large enough xml fragments that drain/pause/resume because a necessity. For smaller chunks it was fine.


Edit:

Use a transform stream.


With a BufferedReader you can read lines.

new BufferedReader ("lorem ipsum", { encoding: "utf8" })
    .on ("error", function (error){
        console.log ("error: " + error);
    })
    .on ("line", function (line){
        console.log ("line: " + line);
    })
    .on ("end", function (){
        console.log ("EOF");
    })
    .read ();

I was frustrated by the lack of a comprehensive solution for this, so I put together my own attempt (git / npm). Copy-pasted list of features:

  • Interactive line processing (callback-based, no loading the entire file into RAM)
  • Optionally, return all lines in an array (detailed or raw mode)
  • Interactively interrupt streaming, or perform map/filter like processing
  • Detect any newline convention (PC/Mac/Linux)
  • Correct eof / last line treatment
  • Correct handling of multi-byte UTF-8 characters
  • Retrieve byte offset and byte length information on per-line basis
  • Random access, using line-based or byte-based offsets
  • Automatically map line-offset information, to speed up random access
  • Zero dependencies
  • Tests

NIH? You decide :-)


Since posting my original answer, I found that split is a very easy to use node module for line reading in a file; Which also accepts optional parameters.

var split = require('split');
fs.createReadStream(file)
    .pipe(split())
    .on('data', function (line) {
      //each chunk now is a seperate line! 
    });

Haven't tested on very large files. Let us know if you do.


function createLineReader(fileName){
    var EM = require("events").EventEmitter
    var ev = new EM()
    var stream = require("fs").createReadStream(fileName)
    var remainder = null;
    stream.on("data",function(data){
        if(remainder != null){//append newly received data chunk
            var tmp = new Buffer(remainder.length+data.length)
            remainder.copy(tmp)
            data.copy(tmp,remainder.length)
            data = tmp;
        }
        var start = 0;
        for(var i=0; i<data.length; i++){
            if(data[i] == 10){ //\n new line
                var line = data.slice(start,i)
                ev.emit("line", line)
                start = i+1;
            }
        }
        if(start<data.length){
            remainder = data.slice(start);
        }else{
            remainder = null;
        }
    })

    stream.on("end",function(){
        if(null!=remainder) ev.emit("line",remainder)
    })

    return ev
}


//---------main---------------
fileName = process.argv[2]

lineReader = createLineReader(fileName)
lineReader.on("line",function(line){
    console.log(line.toString())
    //console.log("++++++++++++++++++++")
})

In most cases this should be enough:

const fs = require("fs")

fs.readFile('./file', 'utf-8', (err, file) => {
  const lines = file.split('\n')

  for (let line of lines)
    console.log(line)
});

I wanted to tackle this same problem, basically what in Perl would be:

while (<>) {
    process_line($_);
}

My use case was just a standalone script, not a server, so synchronous was fine. These were my criteria:

  • The minimal synchronous code that could reuse in many projects.
  • No limits on file size or number of lines.
  • No limits on length of lines.
  • Able to handle full Unicode in UTF-8, including characters beyond the BMP.
  • Able to handle *nix and Windows line endings (old-style Mac not needed for me).
  • Line endings character(s) to be included in lines.
  • Able to handle last line with or without end-of-line characters.
  • Not use any external libraries not included in the node.js distribution.

This is a project for me to get a feel for low-level scripting type code in node.js and decide how viable it is as a replacement for other scripting languages like Perl.

After a surprising amount of effort and a couple of false starts this is the code I came up with. It's pretty fast but less trivial than I would've expected: (fork it on GitHub)

var fs            = require('fs'),
    StringDecoder = require('string_decoder').StringDecoder,
    util          = require('util');

function lineByLine(fd) {
  var blob = '';
  var blobStart = 0;
  var blobEnd = 0;

  var decoder = new StringDecoder('utf8');

  var CHUNK_SIZE = 16384;
  var chunk = new Buffer(CHUNK_SIZE);

  var eolPos = -1;
  var lastChunk = false;

  var moreLines = true;
  var readMore = true;

  // each line
  while (moreLines) {

    readMore = true;
    // append more chunks from the file onto the end of our blob of text until we have an EOL or EOF
    while (readMore) {

      // do we have a whole line? (with LF)
      eolPos = blob.indexOf('\n', blobStart);

      if (eolPos !== -1) {
        blobEnd = eolPos;
        readMore = false;

      // do we have the last line? (no LF)
      } else if (lastChunk) {
        blobEnd = blob.length;
        readMore = false;

      // otherwise read more
      } else {
        var bytesRead = fs.readSync(fd, chunk, 0, CHUNK_SIZE, null);

        lastChunk = bytesRead !== CHUNK_SIZE;

        blob += decoder.write(chunk.slice(0, bytesRead));
      }
    }

    if (blobStart < blob.length) {
      processLine(blob.substring(blobStart, blobEnd + 1));

      blobStart = blobEnd + 1;

      if (blobStart >= CHUNK_SIZE) {
        // blobStart is in characters, CHUNK_SIZE is in octets
        var freeable = blobStart / CHUNK_SIZE;

        // keep blob from growing indefinitely, not as deterministic as I'd like
        blob = blob.substring(CHUNK_SIZE);
        blobStart -= CHUNK_SIZE;
        blobEnd -= CHUNK_SIZE;
      }
    } else {
      moreLines = false;
    }
  }
}

It could probably be cleaned up further, it was the result of trial and error.


Generator based line reader: https://github.com/neurosnap/gen-readlines

var fs = require('fs');
var readlines = require('gen-readlines');

fs.open('./file.txt', 'r', function(err, fd) {
  if (err) throw err;
  fs.fstat(fd, function(err, stats) {
    if (err) throw err;

    for (var line of readlines(fd, stats.size)) {
      console.log(line.toString());
    }

  });
});

If you want to read a file line by line and writing this in another:

var fs = require('fs');
var readline = require('readline');
var Stream = require('stream');

function readFileLineByLine(inputFile, outputFile) {

   var instream = fs.createReadStream(inputFile);
   var outstream = new Stream();
   outstream.readable = true;
   outstream.writable = true;

   var rl = readline.createInterface({
      input: instream,
      output: outstream,
      terminal: false
   });

   rl.on('line', function (line) {
        fs.appendFileSync(outputFile, line + '\n');
   });
};

var fs = require('fs');

function readfile(name,online,onend,encoding) {
    var bufsize = 1024;
    var buffer = new Buffer(bufsize);
    var bufread = 0;
    var fd = fs.openSync(name,'r');
    var position = 0;
    var eof = false;
    var data = "";
    var lines = 0;

    encoding = encoding || "utf8";

    function readbuf() {
        bufread = fs.readSync(fd,buffer,0,bufsize,position);
        position += bufread;
        eof = bufread ? false : true;
        data += buffer.toString(encoding,0,bufread);
    }

    function getLine() {
        var nl = data.indexOf("\r"), hasnl = nl !== -1;
        if (!hasnl && eof) return fs.closeSync(fd), online(data,++lines), onend(lines); 
        if (!hasnl && !eof) readbuf(), nl = data.indexOf("\r"), hasnl = nl !== -1;
        if (!hasnl) return process.nextTick(getLine);
        var line = data.substr(0,nl);
        data = data.substr(nl+1);
        if (data[0] === "\n") data = data.substr(1);
        online(line,++lines);
        process.nextTick(getLine);
    }
    getLine();
}

I had the same problem and came up with above solution looks simular to others but is aSync and can read large files very quickly

Hopes this helps


I have a little module which does this well and is used by quite a few other projects npm readline Note thay in node v10 there is a native readline module so I republished my module as linebyline https://www.npmjs.com/package/linebyline

if you dont want to use the module the function is very simple:

var fs = require('fs'),
EventEmitter = require('events').EventEmitter,
util = require('util'),
newlines = [
  13, // \r
  10  // \n
];
var readLine = module.exports = function(file, opts) {
if (!(this instanceof readLine)) return new readLine(file);

EventEmitter.call(this);
opts = opts || {};
var self = this,
  line = [],
  lineCount = 0,
  emit = function(line, count) {
    self.emit('line', new Buffer(line).toString(), count);
  };
  this.input = fs.createReadStream(file);
  this.input.on('open', function(fd) {
    self.emit('open', fd);
  })
  .on('data', function(data) {
   for (var i = 0; i < data.length; i++) {
    if (0 <= newlines.indexOf(data[i])) { // Newline char was found.
      lineCount++;
      if (line.length) emit(line, lineCount);
      line = []; // Empty buffer.
     } else {
      line.push(data[i]); // Buffer new line data.
     }
   }
 }).on('error', function(err) {
   self.emit('error', err);
 }).on('end', function() {
  // Emit last line if anything left over since EOF won't trigger it.
  if (line.length){
     lineCount++;
     emit(line, lineCount);
  }
  self.emit('end');
 }).on('close', function() {
   self.emit('close');
 });
};
util.inherits(readLine, EventEmitter);

Another solution is to run logic via sequential executor nsynjs. It reads file line-by-line using node readline module, and it doesn't use promises or recursion, therefore not going to fail on large files. Here is how the code will looks like:

var nsynjs = require('nsynjs');
var textFile = require('./wrappers/nodeReadline').textFile; // this file is part of nsynjs

function process(textFile) {

    var fh = new textFile();
    fh.open('path/to/file');
    var s;
    while (typeof(s = fh.readLine(nsynjsCtx).data) != 'undefined')
        console.log(s);
    fh.close();
}

var ctx = nsynjs.run(process,{},textFile,function () {
    console.log('done');
});

Code above is based on this exampe: https://github.com/amaksr/nsynjs/blob/master/examples/node-readline/index.js


i use this:

function emitLines(stream, re){
    re = re && /\n/;
    var buffer = '';

    stream.on('data', stream_data);
    stream.on('end', stream_end);

    function stream_data(data){
        buffer += data;
        flush();
    }//stream_data

    function stream_end(){
        if(buffer) stream.emmit('line', buffer);
    }//stream_end


    function flush(){
        var re = /\n/;
        var match;
        while(match = re.exec(buffer)){
            var index = match.index + match[0].length;
            stream.emit('line', buffer.substring(0, index));
            buffer = buffer.substring(index);
            re.lastIndex = 0;
        }
    }//flush

}//emitLines

use this function on a stream and listen to the line events that is will emit.

gr-


While you should probably use the readline module as the top answer suggests, readline appears to be oriented toward command line interfaces rather than line reading. It's also a little bit more opaque regarding buffering. (Anyone who needs a streaming line oriented reader probably will want to tweak buffer sizes). The readline module is ~1000 lines while this, with stats and tests, is 34.

const EventEmitter = require('events').EventEmitter;
class LineReader extends EventEmitter{
    constructor(f, delim='\n'){
        super();
        this.totalChars = 0;
        this.totalLines = 0;
        this.leftover = '';

        f.on('data', (chunk)=>{
            this.totalChars += chunk.length;
            let lines = chunk.split(delim);
            if (lines.length === 1){
                this.leftover += chunk;
                return;
            }
            lines[0] = this.leftover + lines[0];
            this.leftover = lines[lines.length-1];
            if (this.leftover) lines.pop();
            this.totalLines += lines.length;
            for (let l of lines) this.onLine(l);
        });
        // f.on('error', ()=>{});
        f.on('end', ()=>{console.log('chars', this.totalChars, 'lines', this.totalLines)});
    }
    onLine(l){
        this.emit('line', l);
    }
}
//Command line test
const f = require('fs').createReadStream(process.argv[2], 'utf8');
const delim = process.argv[3];
const lineReader = new LineReader(f, delim);
lineReader.on('line', (line)=> console.log(line));

Here's an even shorter version, without the stats, at 19 lines:

class LineReader extends require('events').EventEmitter{
    constructor(f, delim='\n'){
        super();
        this.leftover = '';
        f.on('data', (chunk)=>{
            let lines = chunk.split(delim);
            if (lines.length === 1){
                this.leftover += chunk;
                return;
            }
            lines[0] = this.leftover + lines[0];
            this.leftover = lines[lines.length-1];
            if (this.leftover) 
                lines.pop();
            for (let l of lines)
                this.emit('line', l);
        });
    }
}

const fs = require("fs")

fs.readFile('./file', 'utf-8', (err, data) => {
var innerContent;
    console.log("Asynchronous read: " + data.toString());
    const lines = data.toString().split('\n')
    for (let line of lines)
        innerContent += line + '<br>';


});

I wrap the whole logic of daily line processing as a npm module: line-kit https://www.npmjs.com/package/line-kit

// example
var count = 0
require('line-kit')(require('fs').createReadStream('/etc/issue'),
                    (line) => { count++; },
                    () => {console.log(`seen ${count} lines`)})

I use below code the read lines after verify that its not a directory and its not included in the list of files need not to be check.

(function () {
  var fs = require('fs');
  var glob = require('glob-fs')();
  var path = require('path');
  var result = 0;
  var exclude = ['LICENSE',
    path.join('e2e', 'util', 'db-ca', 'someother-file'),
    path.join('src', 'favicon.ico')];
  var files = [];
  files = glob.readdirSync('**');

  var allFiles = [];

  var patternString = [
    'trade',
    'order',
    'market',
    'securities'
  ];

  files.map((file) => {
    try {
      if (!fs.lstatSync(file).isDirectory() && exclude.indexOf(file) === -1) {
        fs.readFileSync(file).toString().split(/\r?\n/).forEach(function(line){
          patternString.map((pattern) => {
            if (line.indexOf(pattern) !== -1) {
              console.log(file + ' contain `' + pattern + '` in in line "' + line +'";');
              result = 1;
            }
          });
        });
      }
    } catch (e) {
      console.log('Error:', e.stack);
    }
  });
  process.exit(result);

})();

I have looked through all above answers, all of them use third-party library to solve it. It's have a simple solution in Node's API. e.g

const fs= require('fs')

let stream = fs.createReadStream('<filename>', { autoClose: true })

stream.on('data', chunk => {
    let row = chunk.toString('ascii')
}))

Need Your Help

Setting table column width

html css html-table

I've got a simple table that is used for an inbox as follows:

What is the best solution for database connection pooling in python?

python mysql connection-pooling

I have developed some custom DAO-like classes to meet some very specialized requirements for my project that is a server-side process that does not run inside any kind of framework.