I attempted to compare the built-in strstr() in g++ (Debian 7.2.0-19) with the assembly implementation in Agner Fog's asmlib (http://www.agner.org/optimize/asmlib.zip). The target CPU is a Core i3 supporting SSE4.x and AVX.
To compile:
g++ -O2 -msse4 main.cpp libaelf64.a
(you will need the static library libaelf64.a from the asmlib distribution).
The functions rdtsc64_start() and rdtsc64_end(), used to measure performance, follow Intel's white paper (https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf)
Provided the technique is correct, A_strstr() from asmlib is about 5 times slower than gcc's strstr(). I am making an effort to go through A_strstr()'s assembly to better understand what is going on; meanwhile, does anyone with (or without) experience with asmlib have an explanation? Thanks.
#include <iostream>
#include "asmlib.h"
#include <cstdint>
#include <unistd.h>
#include <string.h>
inline __attribute__ ((always_inline))
static uint64_t rdtsc64_start() {
unsigned long cycles_high, cycles_low;
asm volatile (
"CPUID\n\t"
"RDTSC\n\t"
"mov %%rdx, %0\n\t"
"mov %%rax, %1\n\t": "=r" (cycles_high), "=r"
(cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
return ((uint64_t)cycles_high << 32) | cycles_low;
}
inline __attribute__ ((always_inline))
static uint64_t rdtsc64_end() {
unsigned long cycles_high, cycles_low;
asm volatile(
"RDTSCP\n\t"
"mov %%rdx, %0\n\t"
"mov %%rax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high), "=r"
(cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
return ((uint64_t)cycles_high << 32) | cycles_low;
}
static void rdtsc64_warmup() {
unsigned long cycles_high, cycles_low;
asm volatile (
"CPUID\n\t"
"RDTSC\n\t"
"mov %%rdx, %0\n\t"
"mov %%rax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
"%rax", "%rbx", "%rcx", "%rdx");
asm volatile(
"RDTSCP\n\t"
"mov %%rdx, %0\n\t"
"mov %%rax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax",
"%rbx", "%rcx", "%rdx");
asm volatile (
"CPUID\n\t"
"RDTSC\n\t"
"mov %%rdx, %0\n\t"
"mov %%rax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
"%rax", "%rbx", "%rcx", "%rdx");
asm volatile(
"RDTSCP\n\t"
"mov %%rdx, %0\n\t"
"mov %%rax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax",
"%rbx", "%rcx", "%rdx");
}
int main(void) {
char a[] = "This is a very long string with a WORD inside.";
char b[] = "WORD";
char *p = nullptr;
rdtsc64_warmup();
int64_t start = rdtsc64_start();
for (volatile int64_t i = 0; i < 100000000; ++i) {
char *p = ::strstr(a,b);
}
int64_t end = rdtsc64_end();
std::cerr << "strstr, counter= " << (end - start) << std::endl;
rdtsc64_warmup();
start = rdtsc64_start();
for (volatile int64_t i = 0; i < 100000000; ++i) {
char *p = A_strstr(a, b);
}
end = rdtsc64_end();
std::cerr << "A_strstr, counter=" << (end - start) << std::endl;
return 0;
}