Split a CSV string by line skipping newlines conta

2019-03-21 23:33发布

If the following regex can split a csv string by line.

var lines = csv.split(/\r|\r?\n/g);

How could this be adapted to skip newline chars that are contained within a CSV value (Ie between quotes/double-quotes)?

Example:

2,"Evans & Sutherland","230-132-111AA",,"Visual","P
CB",,1,"Offsite",

If you don't see it, here's a version with the newlines visible:

2,"Evans & Sutherland","230-132-111AA",,"Visual","P\r\nCB",,1,"Offsite",\r\n 

The part I'm trying to skip over is the newline contained in the middle of the "PCB" entry.

Update:

I probably should've mentioned this before but this is a part of a dedicated CSV parsing library called jquery-csv. To provide a better context I have added the current parser implementation below.

Here's the code for validating and parsing an entry (ie one line):

$.csvEntry2Array = function(csv, meta) {
  var meta = (meta !== undefined ? meta : {});
  var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;

  // build the CSV validator regex
  var reValid = /^\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*(?:S\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*)*$/;
  reValid = RegExp(reValid.source.replace(/S/g, separator));
  reValid = RegExp(reValid.source.replace(/D/g, delimiter));

  // build the CSV line parser regex
  var reValue = /(?!\s*$)\s*(?:D([^D\\]*(?:\\[\S\s][^D\\]*)*)D|([^SD\s\\]*(?:\s+[^SD\s\\]+)*))\s*(?:S|$)/g;
  reValue = RegExp(reValue.source.replace(/S/g, separator), 'g');
  reValue = RegExp(reValue.source.replace(/D/g, delimiter), 'g');

  // Return NULL if input string is not well formed CSV string.
  if (!reValid.test(csv)) {
    return null;
  }

  // "Walk" the string using replace with callback.
  var output = [];
  csv.replace(reValue, function(m0, m1, m2) {
    // Remove backslash from any delimiters in the value
    if (m1 !== undefined) {
      var reDelimiterUnescape = /\\D/g;              
      reDelimiterUnescape = RegExp(reDelimiterUnescape.source.replace(/D/, delimiter), 'g');
      output.push(m1.replace(reDelimiterUnescape, delimiter));
    } else if (m2 !== undefined) { 
      output.push(m2);
    }
    return '';
  });

  // Handle special case of empty last value.
  var reEmptyLast = /S\s*$/;
  reEmptyLast = RegExp(reEmptyLast.source.replace(/S/, separator));
  if (reEmptyLast.test(csv)) {
    output.push('');
  }

  return output;
};

Note: I haven't tested yet but I think I could probably incorporate the last match into the main split/callback.

This is the code that does the split-by-line part:

$.csv2Array = function(csv, meta) {
  var meta = (meta !== undefined ? meta : {});
  var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;
  var skip = 'skip' in meta ? meta.skip : $.csvDefaults.skip;

  // process by line
  var lines = csv.split(/\r\n|\r|\n/g);
  var output = [];
  for(var i in lines) {
    if(i < skip) {
      continue;
    }
    // process each value
    var line = $.csvEntry2Array(lines[i], {
      delimiter: delimiter,
      separator: separator
    });
    output.push(line);
  }

  return output;
};

For a breakdown on how that reges works take a look at this answer. Mine is a slightly adapted version. I consolidated the single and double quote matching to match just one text delimiter and made the delimiter/separators dynamic. It does a great job of validating entiries but the line-splitting solution I added on top is pretty frail and breaks on the edge case I described above.

I'm just looking for a solution that walks the string extracting valid entries (to pass on to the entry parser) or fails on bad data returning an error indicating the line the parsing failed on.

Update:

splitLines: function(csv, delimiter) {
  var state = 0;
  var value = "";
  var line = "";
  var lines = [];
  function endOfRow() {
    lines.push(value);
    value = "";
    state = 0;
  };
  csv.replace(/(\"|,|\n|\r|[^\",\r\n]+)/gm, function (m0){
    switch (state) {
      // the start of an entry
      case 0:
        if (m0 === "\"") {
          state = 1;
        } else if (m0 === "\n") {
          endOfRow();
        } else if (/^\r$/.test(m0)) {
          // carriage returns are ignored
        } else {
          value += m0;
          state = 3;
        }
        break;
      // delimited input  
      case 1:
        if (m0 === "\"") {
          state = 2;
        } else {
          value += m0;
          state = 1;
        }
        break;
      // delimiter found in delimited input
      case 2:
        // is the delimiter escaped?
        if (m0 === "\"" && value.substr(value.length - 1) === "\"") {
          value += m0;
          state = 1;
        } else if (m0 === ",") {
          value += m0;
          state = 0;
        } else if (m0 === "\n") {
          endOfRow();
        } else if (m0 === "\r") {
          // Ignore
        } else {
          throw new Error("Illegal state");
        }
        break;
      // un-delimited input
      case 3:
        if (m0 === ",") {
          value += m0;
          state = 0;
        } else if (m0 === "\"") {
          throw new Error("Unquoted delimiter found");
        } else if (m0 === "\n") {
          endOfRow();
        } else if (m0 === "\r") {
          // Ignore
        } else {
          throw new Error("Illegal data");
        }
          break;
      default:
        throw new Error("Unknown state");
    }
    return "";
  });
  if (state != 0) {
    endOfRow();
  }
  return lines;
}

All it took is 4 states for a line splitter:

  • 0: the start of an entry
  • 1: the following is quoted
  • 2: a second quote has been encountered
  • 3: the following isn't quoted

It's almost a complete parser. For my use case, I just wanted a line splitter so I could provide a more granual approach to processing CSV data.

Note: Credit for this approach goes to another dev whom I won't name publicly without his permission. All I did was adapt it from a complete parser to a line-splitter.

Update:

Discovered a few broken edge cases in the previous lineSplitter implementation. The one provided should be fully RFC 4180 compliant.

3条回答
趁早两清
2楼-- · 2019-03-21 23:58

Be careful- That newline is PART of that value. It's not PCB, it's P\nCB.

However, why can't you just use string.split(',')? If need be, you can run through the list and cast to ints or remove the padded quotation marks.

查看更多
爷、活的狠高调
3楼-- · 2019-03-22 00:14

As I have noted in a comment there is no complete solution just using single regex.

A novel method using several regexps by splitting on comma and joining back strings with embedded commas is described here:-

Personally I would use a simple finite state machine as described here

The state machine has more code, but the code is cleaner and its clear what each piece of code is doing. Longer term this will be much more reliable and maintainable.

查看更多
萌系小妹纸
4楼-- · 2019-03-22 00:17

It's not a good idea to use regex's to parse. Better to use it to detect the "bad" splits and then merge them back:

var lines = csv.split(/\r?\n/g);
var bad = [];

for(var i=lines.length-1; i> 0; i--) {
    // find all the unescaped quotes on the line:
    var m = lines[i].match(/[^\\]?\"/g);

    // if there are an odd number of them, this line, and the line after it is bad:
    if((m ? m.length : 0) % 2 == 1) { bad.push(i--); }
}

// starting at the bottom of the list, merge lines back, using \r\n
for(var b=0,len=bad.length; b < len; b++) {
    lines.splice(bad[b]-1, 2, lines[bad[b]-1]+"\r\n"+lines[bad[b]]);
}

(This answer is licensed under both CC0 and WTFPL.)

查看更多
登录 后发表回答