Whilst researching performance trade-offs between Python and C++, I've devised a small example, which mostly focusses on a dumb substring matching.
Here is the relevant C++:
using std::string;
std::vector<string> matches;
std::copy_if(patterns.cbegin(), patterns.cend(), back_inserter(matches),
[&fileContents] (const string &pattern) { return fileContents.find(pattern) != string::npos; } );
The above is built with -O3.
And here is Python:
def getMatchingPatterns(patterns, text):
return filter(text.__contains__, patterns)
Both of them take a large-ish set of patterns and input file, and filter down the list of patterns to the ones found in the file using a dumb substring search.
The versions are:
- gcc - 4.8.2 (Ubuntu) and 4.9.2 (cygwin)
- python - 2.7.6 (Ubuntu) and 2.7.8 (cygwin)
What was surprising to me is the performance. I've run both on a low-spec Ubuntu and Python was faster by about 20%. The same on mid-spec PC with cygwin - Python twice faster. Profiler shows that 99+% of cycles are spent in string matching (string copying and list comprehensions are insignificant).
Obviously, the Python implementation is native C, and I'd expected to be roughly the same as C++, but did not expect it as fast.
Any insight into relevant CPython optimisations in comparison to gcc would be most welcome.
For reference, here are the full examples. The inputs just take a set of 50K HTLMs (all read from disk in each test, no special caching):
Python:
import sys
def getMatchingPatterns(patterns, text):
return filter(text.__contains__, patterns)
def serialScan(filenames, patterns):
return zip(filenames, [getMatchingPatterns(patterns, open(filename).read()) for filename in filenames])
if __name__ == "__main__":
with open(sys.argv[1]) as filenamesListFile:
filenames = filenamesListFile.read().split()
with open(sys.argv[2]) as patternsFile:
patterns = patternsFile.read().split()
resultTuple = serialScan(filenames, patterns)
for filename, patterns in resultTuple:
print ': '.join([filename, ','.join(patterns)])
C++:
#include <iostream>
#include <iterator>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
#include <algorithm>
using namespace std;
using MatchResult = unordered_map<string, vector<string>>;
static const size_t PATTERN_RESERVE_DEFAULT_SIZE = 5000;
MatchResult serialMatch(const vector<string> &filenames, const vector<string> &patterns)
{
MatchResult res;
for (auto &filename : filenames)
{
ifstream file(filename);
const string fileContents((istreambuf_iterator<char>(file)),
istreambuf_iterator<char>());
vector<string> matches;
std::copy_if(patterns.cbegin(), patterns.cend(), back_inserter(matches),
[&fileContents] (const string &pattern) { return fileContents.find(pattern) != string::npos; } );
res.insert(make_pair(filename, std::move(matches)));
}
return res;
}
int main(int argc, char **argv)
{
vector<string> filenames;
ifstream filenamesListFile(argv[1]);
std::copy(istream_iterator<string>(filenamesListFile), istream_iterator<string>(),
back_inserter(filenames));
vector<string> patterns;
patterns.reserve(PATTERN_RESERVE_DEFAULT_SIZE);
ifstream patternsFile(argv[2]);
std::copy(istream_iterator<string>(patternsFile), istream_iterator<string>(),
back_inserter(patterns));
auto matchResult = serialMatch(filenames, patterns);
for (const auto &matchItem : matchResult)
{
cout << matchItem.first << ": ";
for (const auto &matchString : matchItem.second)
cout << matchString << ",";
cout << endl;
}
}