If the following regex can split a csv string by line.
var lines = csv.split(/\r|\r?\n/g);
How could this be adapted to skip newline chars that are contained within a CSV value (Ie between quotes/double-quotes)?
Example:
2,"Evans & Sutherland","230-132-111AA",,"Visual","P
CB",,1,"Offsite",
If you don't see it, here's a version with the newlines visible:
2,"Evans & Sutherland","230-132-111AA",,"Visual","P\r\nCB",,1,"Offsite",\r\n
The part I'm trying to skip over is the newline contained in the middle of the "PCB" entry.
Update:
I probably should've mentioned this before but this is a part of a dedicated CSV parsing library called jquery-csv. To provide a better context I have added the current parser implementation below.
Here's the code for validating and parsing an entry (ie one line):
$.csvEntry2Array = function(csv, meta) {
var meta = (meta !== undefined ? meta : {});
var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;
// build the CSV validator regex
var reValid = /^\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*(?:S\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*)*$/;
reValid = RegExp(reValid.source.replace(/S/g, separator));
reValid = RegExp(reValid.source.replace(/D/g, delimiter));
// build the CSV line parser regex
var reValue = /(?!\s*$)\s*(?:D([^D\\]*(?:\\[\S\s][^D\\]*)*)D|([^SD\s\\]*(?:\s+[^SD\s\\]+)*))\s*(?:S|$)/g;
reValue = RegExp(reValue.source.replace(/S/g, separator), 'g');
reValue = RegExp(reValue.source.replace(/D/g, delimiter), 'g');
// Return NULL if input string is not well formed CSV string.
if (!reValid.test(csv)) {
return null;
}
// "Walk" the string using replace with callback.
var output = [];
csv.replace(reValue, function(m0, m1, m2) {
// Remove backslash from any delimiters in the value
if (m1 !== undefined) {
var reDelimiterUnescape = /\\D/g;
reDelimiterUnescape = RegExp(reDelimiterUnescape.source.replace(/D/, delimiter), 'g');
output.push(m1.replace(reDelimiterUnescape, delimiter));
} else if (m2 !== undefined) {
output.push(m2);
}
return '';
});
// Handle special case of empty last value.
var reEmptyLast = /S\s*$/;
reEmptyLast = RegExp(reEmptyLast.source.replace(/S/, separator));
if (reEmptyLast.test(csv)) {
output.push('');
}
return output;
};
Note: I haven't tested yet but I think I could probably incorporate the last match into the main split/callback.
This is the code that does the split-by-line part:
$.csv2Array = function(csv, meta) {
var meta = (meta !== undefined ? meta : {});
var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;
var skip = 'skip' in meta ? meta.skip : $.csvDefaults.skip;
// process by line
var lines = csv.split(/\r\n|\r|\n/g);
var output = [];
for(var i in lines) {
if(i < skip) {
continue;
}
// process each value
var line = $.csvEntry2Array(lines[i], {
delimiter: delimiter,
separator: separator
});
output.push(line);
}
return output;
};
For a breakdown on how that reges works take a look at this answer. Mine is a slightly adapted version. I consolidated the single and double quote matching to match just one text delimiter and made the delimiter/separators dynamic. It does a great job of validating entiries but the line-splitting solution I added on top is pretty frail and breaks on the edge case I described above.
I'm just looking for a solution that walks the string extracting valid entries (to pass on to the entry parser) or fails on bad data returning an error indicating the line the parsing failed on.
Update:
splitLines: function(csv, delimiter) {
var state = 0;
var value = "";
var line = "";
var lines = [];
function endOfRow() {
lines.push(value);
value = "";
state = 0;
};
csv.replace(/(\"|,|\n|\r|[^\",\r\n]+)/gm, function (m0){
switch (state) {
// the start of an entry
case 0:
if (m0 === "\"") {
state = 1;
} else if (m0 === "\n") {
endOfRow();
} else if (/^\r$/.test(m0)) {
// carriage returns are ignored
} else {
value += m0;
state = 3;
}
break;
// delimited input
case 1:
if (m0 === "\"") {
state = 2;
} else {
value += m0;
state = 1;
}
break;
// delimiter found in delimited input
case 2:
// is the delimiter escaped?
if (m0 === "\"" && value.substr(value.length - 1) === "\"") {
value += m0;
state = 1;
} else if (m0 === ",") {
value += m0;
state = 0;
} else if (m0 === "\n") {
endOfRow();
} else if (m0 === "\r") {
// Ignore
} else {
throw new Error("Illegal state");
}
break;
// un-delimited input
case 3:
if (m0 === ",") {
value += m0;
state = 0;
} else if (m0 === "\"") {
throw new Error("Unquoted delimiter found");
} else if (m0 === "\n") {
endOfRow();
} else if (m0 === "\r") {
// Ignore
} else {
throw new Error("Illegal data");
}
break;
default:
throw new Error("Unknown state");
}
return "";
});
if (state != 0) {
endOfRow();
}
return lines;
}
All it took is 4 states for a line splitter:
- 0: the start of an entry
- 1: the following is quoted
- 2: a second quote has been encountered
- 3: the following isn't quoted
It's almost a complete parser. For my use case, I just wanted a line splitter so I could provide a more granual approach to processing CSV data.
Note: Credit for this approach goes to another dev whom I won't name publicly without his permission. All I did was adapt it from a complete parser to a line-splitter.
Update:
Discovered a few broken edge cases in the previous lineSplitter implementation. The one provided should be fully RFC 4180 compliant.
Be careful- That newline is PART of that value. It's not
PCB
, it'sP\nCB
.However, why can't you just use
string.split(',')
? If need be, you can run through the list and cast to ints or remove the padded quotation marks.As I have noted in a comment there is no complete solution just using single regex.
A novel method using several regexps by splitting on comma and joining back strings with embedded commas is described here:-
Personally I would use a simple finite state machine as described here
The state machine has more code, but the code is cleaner and its clear what each piece of code is doing. Longer term this will be much more reliable and maintainable.
It's not a good idea to use regex's to parse. Better to use it to detect the "bad" splits and then merge them back:
(This answer is licensed under both CC0 and WTFPL.)