I'm trying to parse a CSV file (with header line) using boost spirit.
The csv is not in a constant format. Sometimes there is some extra column or the order of the column is mixed. I'm interested in few columns, whose header name is well known.
For instance my CSV may look like:
Name,Surname,Age
John,Doe,32
Or:
Age,Name
32,John
I want to parse only the content of Name and Age (N.B. Age is integer type). At the moment i come out with a very ugly solution where Spirit parses the first line and creates a vector that contains an enum in the positions i'm interested into. And then i have to do the parsing of the terminal symbols by hand...
enum LineItems {
NAME, AGE, UNUSED
};
struct CsvLine {
string name;
int age;
};
using Column = std::string;
using CsvFile = std::vector<CsvLine>;
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
CsvGrammar() :
CsvGrammar::base_type(start) {
using namespace qi;
static const char colsep = ',';
start = qi::omit[header[qi::_a = qi::_1]] >> eol >> line(_a) % eol;
header = (lit("Name")[phx::push_back(phx::ref(qi::_val), LineItems::NAME)]
| lit("Age")[phx::push_back(phx::ref(qi::_val), LineItems::AGE)]
| column[phx::push_back(phx::ref(qi::_val), LineItems::UNUSED)]) % colsep;
line = (column % colsep)[phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1,
qi::_val)];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
}
void convertFunc(std::vector<string>& columns, std::vector<LineItems>& positions, CsvLine &csvLine) {
//terminal symbol parsing here, and assign to csvLine struct.
...
}
private:
qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
qi::rule<It, CsvLine(std::vector<LineItems>), qi::blank_type> line;
qi::rule<It, Column(), qi::blank_type> column;
qi::rule<It, std::string()> quoted;
qi::rule<It, qi::blank_type> empty;
};
Here is the full source.
What if the header parser could prepare a vector<rule<...>*>
and the "line parser" just use this vector to parse itself? a sort of advanced nabialek trick (i've been trying but i couldn't make it).
Or is there any better way to parse this kind of CSV with Spirit?
(any help is appreciated, thank you in advance)
I'd go with the concept that you have,
I think it's plenty elegant (the qi locals even allow reentrant use of this).
To reduce the cruft in the rules (Boost Spirit: "Semantic actions are evil"?) you could move the "conversion function" off into attribute transformation customization points.
Oops. As commented that was too simple. However, you can still reduce the cruftiness quite a bit. With two simple tweaks, the grammar reads:
item.add("Name", NAME)("Age", AGE);
start = omit[ header[_a=_1] ] >> eol >> line(_a) % eol;
header = (item | omit[column] >> attr(UNUSED)) % colsep;
line = (column % colsep) [convert];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
The tweaks:
- using
qi::symbols
to map from header to LineItem
using a raw semantinc action ([convert]
) which directly access the context (see boost spirit semantic action parameters):
struct final {
using Ctx = typename decltype(line)::context_type;
void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
auto& csvLine = boost::fusion::at_c<0>(ctx.attributes);
auto& positions = boost::fusion::at_c<1>(ctx.attributes);
int i =0;
for (LineItems position : positions) {
switch (position) {
case NAME: csvLine.name = columns[i]; break;
case AGE: csvLine.age = atoi(columns[i].c_str()); break;
default: break;
}
i++;
}
pass = true; // returning false fails the `line` rule
}
} convert;
Arguably the upshot is akin to doing auto convert = phx::bind(&CsvGrammar<It>::convertFunc, this, qi::_1, qi::_r1, qi::_val)
but using auto
with Proto/Phoenix/Spirit expressions is notoriously error prone (UB due to dangling refs to temporaries from the expression template), so I'd certainly prefer the way shown above.
Live On Coliru
//#define BOOST_SPIRIT_DEBUG
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <iostream>
#include <boost/fusion/include/at_c.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string>
#include <vector>
namespace qi = boost::spirit::qi;
namespace phx = boost::phoenix;
using std::string;
enum LineItems { NAME, AGE, UNUSED };
struct CsvLine {
string name;
int age;
};
using Column = std::string;
using Columns = std::vector<Column>;
using CsvFile = std::vector<CsvLine>;
template<typename It>
struct CsvGrammar: qi::grammar<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> {
CsvGrammar() : CsvGrammar::base_type(start) {
using namespace qi;
static const char colsep = ',';
item.add("Name", NAME)("Age", AGE);
start = qi::omit[ header[_a=_1] ] >> eol >> line(_a) % eol;
header = (item | omit[column] >> attr(UNUSED)) % colsep;
line = (column % colsep) [convert];
column = quoted | *~char_(",\n");
quoted = '"' >> *("\"\"" | ~char_("\"\n")) >> '"';
BOOST_SPIRIT_DEBUG_NODES((header)(column)(quoted));
}
private:
qi::rule<It, std::vector<LineItems>(), qi::blank_type> header;
qi::rule<It, CsvFile(), qi::locals<std::vector<LineItems>>, qi::blank_type> start;
qi::rule<It, CsvLine(std::vector<LineItems> const&), qi::blank_type> line;
qi::rule<It, Column(), qi::blank_type> column;
qi::rule<It, std::string()> quoted;
qi::rule<It, qi::blank_type> empty;
qi::symbols<char, LineItems> item;
struct final {
using Ctx = typename decltype(line)::context_type;
void operator()(Columns const& columns, Ctx &ctx, bool &pass) const {
auto& csvLine = boost::fusion::at_c<0>(ctx.attributes);
auto& positions = boost::fusion::at_c<1>(ctx.attributes);
int i =0;
for (LineItems position : positions) {
switch (position) {
case NAME: csvLine.name = columns[i]; break;
case AGE: csvLine.age = atoi(columns[i].c_str()); break;
default: break;
}
i++;
}
pass = true; // returning false fails the `line` rule
}
} convert;
};
int main() {
const std::string s = "Surname,Name,Age,\nJohn,Doe,32\nMark,Smith,43";
auto f(begin(s)), l(end(s));
CsvGrammar<std::string::const_iterator> p;
CsvFile parsed;
bool ok = qi::phrase_parse(f, l, p, qi::blank, parsed);
if (ok) {
for (CsvLine line : parsed) {
std::cout << '[' << line.name << ']' << '[' << line.age << ']';
std::cout << std::endl;
}
} else {
std::cout << "Parse failed\n";
}
if (f != l)
std::cout << "Remaining unparsed: '" << std::string(f, l) << "'\n";
}
Prints
[Doe][32]
[Smith][43]