Probable instruction Cache Synchronization issue i

A lot of related questions <How is x86 instruction cache synchronized? > mention x86 should properly handle i-cache synchronization in self modifying code. I wrote the following piece of code which toggles a function call on and off from different threads interleaved with its execution. I am using compare and swap operation as an additional guard so that the modification is atomic. But I am getting intermittent crashes (SIGSEGV, SIGILL) and analyzing the core dump makes me suspicious if the processor is trying to execute partially updated instructions. The code and the analysis given below. May be I am missing something here. Let me know if that's the case.

toggle.c

#include <stdio.h>
#include <inttypes.h>
#include <time.h>
#include <pthread.h>
#include <sys/mman.h>
#include <errno.h>
#include <unistd.h>

int active = 1; // Whether the function is toggled on or off
uint8_t* funcAddr = 0; // Address where function call happens which we need to toggle on/off
uint64_t activeSequence = 0; // Byte sequence for toggling on the function CALL
uint64_t deactiveSequence = 0; // NOP byte sequence for toggling off the function CALL

inline int modify_page_permissions(uint8_t* addr) {

  long page_size = sysconf(_SC_PAGESIZE);
  int code = mprotect((void*)(addr - (((uint64_t)addr)%page_size)), page_size,
    PROT_READ | PROT_WRITE | PROT_EXEC);

  if (code) {
    fprintf(stderr, "mprotect was not successfull! code %d\n", code);
    fprintf(stderr, "errno value is : %d\n", errno);
    return 0;
  }

  // If the 8 bytes we need to modify straddles a page boundary make the next page writable too
  if (page_size - ((uint64_t)addr)%page_size < 8) {
    code = mprotect((void*)(addr-((uint64_t)addr)%page_size+ page_size) , page_size,
      PROT_READ | PROT_WRITE | PROT_EXEC);
    if (code) {
      fprintf(stderr, "mprotect was not successfull! code %d\n", code);
      fprintf(stderr, "errno value is : %d\n", errno);
      return 0;;
    }
  }

  return 1;
}

void* add_call(void* param) {

  struct timespec ts;
  ts.tv_sec = 0;
  ts.tv_nsec = 50000;

  while (1) {
    if (!active) {
      if (activeSequence != 0) {
        int status = modify_page_permissions(funcAddr);
        if (!status) {
          return 0;
        }

        uint8_t* start_addr = funcAddr - 8;

        fprintf(stderr, "Activating foo..\n");
        uint64_t res = __sync_val_compare_and_swap((uint64_t*) start_addr,
                                    *((uint64_t*)start_addr), activeSequence);
        active = 1;
      } else {
        fprintf(stderr, "Active sequence not initialized..\n");
      }
    }

    nanosleep(&ts, NULL);
  }

}

int remove_call(uint8_t* addr) {

  if (active) {
    // Remove gets called first before add so we initialize active and deactive state byte sequences during the first call the remove
    if (deactiveSequence == 0) {
      uint64_t sequence =  *((uint64_t*)(addr-8));
      uint64_t mask = 0x0000000000FFFFFF;
      uint64_t deactive = (uint64_t) (sequence & mask);
      mask = 0x9090909090000000; // We NOP 5 bytes of CALL instruction and leave rest of the 3 bytes as it is

      activeSequence = sequence;
      deactiveSequence = deactive |  mask;
      funcAddr = addr;
    }

    int status = modify_page_permissions(addr);
    if (!status) {
      return -1;
    }

    uint8_t* start_addr = addr - 8;

    fprintf(stderr, "Deactivating foo..\n");
    uint64_t res = __sync_val_compare_and_swap((uint64_t*)start_addr,
                                  *((uint64_t*)start_addr), deactiveSequence);
    active = 0;
    // fprintf(stderr, "Result : %p\n", res);
  }
}

int counter = 0;

void foo(int i) {

  // Use the return address to determine where we need to patch foo CALL instruction (5 bytes)
  uint64_t* addr = (uint64_t*)__builtin_extract_return_addr(__builtin_return_address(0));

  fprintf(stderr, "Foo counter : %d\n", counter++);
  remove_call((uint8_t*)addr);
}

// This thread periodically checks if the method is inactive and if so reactivates it
void spawn_add_call_thread() {
  pthread_t tid;
  pthread_create(&tid, NULL, add_call, (void*)NULL);
}

int main() {

  spawn_add_call_thread();

  int i=0;
  for (i=0; i<1000000; i++) {
    // fprintf(stderr, "i : %d..\n", i);
   foo(i);
  }

  fprintf(stderr, "Final count : %d..\n\n\n", counter);
}

Core dump analysis

Program terminated with signal 4, Illegal instruction.
#0  0x0000000000400a28 in main () at toggle.c:123
(gdb) info frame
 Stack level 0, frame at 0x7fff7c8ee360:
   rip = 0x400a28 in main (toggle.c:123); saved rip 0x310521ed5d
 source language c.
 Arglist at 0x7fff7c8ee350, args:
 Locals at 0x7fff7c8ee350, Previous frame's sp is 0x7fff7c8ee360
 Saved registers:
 rbp at 0x7fff7c8ee350, rip at 0x7fff7c8ee358
(gdb) disas /r 0x400a28,+30
 Dump of assembler code from 0x400a28 to 0x400a46:
  => 0x0000000000400a28 <main+64>:   ff (bad)
     0x0000000000400a29 <main+65>:   ff (bad)
     0x0000000000400a2a <main+66>:   ff eb  ljmpq  *<internal disassembler error>
     0x0000000000400a2c <main+68>:   e7 48  out    %eax,$0x48
 (gdb) disas /r main
  Dump of assembler code for function main:
     0x00000000004009e8 <+0>:    55 push   %rbp
     ...
     0x0000000000400a24 <+60>:   89 c7  mov    %eax,%edi
     0x0000000000400a26 <+62>:   e8 11 ff ff ff callq  0x40093c <foo>
     0x0000000000400a2b <+67>:   eb e7  jmp    0x400a14 <main+44>

So as can be seen the instruction pointer seems to positioned within an address inside the CALL instruction and processor is apparently trying to execute that misaligned instruction causing an illegal instruction fault.

回答1:

I think your problem is that you replaced a 5-byte CALL instruction with 5 1-byte NOPs. Consider what happens when your thread has executed 3 of the NOPs, and then your master thread decides to swap the CALL instruction back in. Your thread's PC will be three bytes in the middle of the CALL instruction and will therefore execute an unexpected and likely illegal instruction.

What you need to do is swap the 5-byte CALL instruction with a 5-byte NOP. You just need to find a multibyte instruction that does nothing (such as or'ing a register against itself) and if you need some extra bytes, prepend some prefix bytes such as a gs override prefix and an address-size override prefix (both of which will do nothing). By using a 5-byte NOP, your thread will be guaranteed to either be at the CALL instruction or past the CALL instruction, but never inside of it.

回答2:

On 80x86 most calls use a relative displacement, not an absolute address. Essentially its "call the code at here + < displacement >" and not "call the code at < address >".

For 64-bit code, the displacement may be 8 bits or 32-bits. It's never 64-bits.

For example, for a 2-byte "call with 8-bit displacement" instruction, you'd be trashing 6 bytes before the call instruction, the call opcode itself, and the instruction's operand (the displacement).

For another example, for a 5-byte "call with 32-bit displacement" instruction, you'd be trashing 3 bytes before the call instruction, the call opcode itself, and the instruction's operand (the displacement).

However...

These aren't the only way to call. For example, you can call using a function pointer, where the address of the code being called is not in the instruction at all (but may be in a register or be a variable in memory). There's also an optimisation called "tail call optimisation" where a call followed by a ret is replaced with a jmp (likely with some additional stack diddling for passing parameters, cleaning up the caller's local variables, etc).

Essentially; your code is severely broken, you can't cover all the possible corner cases, you shouldn't be doing this to begin with, and you probably should be using a function pointer instead of self modifying code (which would be faster and easier and portable too).