Why does this tail-call recursive fibonacci functi

2019-07-03 01:52发布

问题:

I implemented the following tail-call recursive fibonacci function to try out gcc's Tail Call Optimization (as mentioned here):

#include <stdint.h>
uint64_t fib_tc( uint8_t n) {
  uint64_t fib_helper(uint8_t n, uint64_t acc, uint64_t prev) {
    if(n == 0) 
      return acc;
    else
      return fib_helper( n-1, acc+prev, acc);
  }
  fib_helper(n,1,0);
}

Initially, I did not specify a -O level (so it defaults to 1). The function worked as expected with -O1. Later, I tried compiling with -O2, but that broke the function - it always returns 0. Same result when using -O3.

I took a look at the assembly output with -O2 (using: gcc -O2 -std=gnu99 -S fib_tc.c ) and see the following assembly:

    .file   "fib_tc.c"
    .text
    .p2align 4,,15
    .type   fib_helper.1752, @function
fib_helper.1752:
.LFB1:
    .cfi_startproc
    testb   %dil, %dil
    movq    %rsi, %rax
    jne .L4
    rep ret
    .p2align 4,,10
    .p2align 3
.L4:
    leaq    (%rsi,%rdx), %rsi
    subl    $1, %edi
    movq    %rax, %rdx
    movzbl  %dil, %edi
    jmp fib_helper.1752
    .cfi_endproc
.LFE1:
    .size   fib_helper.1752, .-fib_helper.1752
    .p2align 4,,15
    .globl  fib_tc
    .type   fib_tc, @function
fib_tc:
.LFB0:
    .cfi_startproc
    testb   %dil, %dil
    jne .L11
    ret
    .p2align 4,,10
    .p2align 3
.L11:
    subl    $1, %edi
    subq    $8, %rsp
    .cfi_def_cfa_offset 16
    movl    $1, %edx
    movzbl  %dil, %edi
    movl    $1, %esi
    call    fib_helper.1752
    addq    $8, %rsp
   .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE0:
    .size   fib_tc, .-fib_tc
    .ident  "GCC: (GNU) 4.8.2 20131212 (Red Hat 4.8.2-7)"
    .section    .note.GNU-stack,"",@progbits

I notice that with -O2 it seems to have done the TCO (notice the line above: jmp fib_helper.1752). However, this is the non-working version that always returns 0.

I then generated assembly using -O1 to check the difference ( gcc -O1 -std=gnu99 -S fib_tc.c -o fib_tc1.S ):

    .file   "fib_tc.c"
    .text
    .type   fib_helper.1752, @function
fib_helper.1752:
.LFB1:
    .cfi_startproc
    movq    %rsi, %rax
    testb   %dil, %dil
    je  .L6
    subq    $8, %rsp
    .cfi_def_cfa_offset 16
    leaq    (%rsi,%rdx), %rax
    subl    $1, %edi
    movzbl  %dil, %edi
    movq    %rsi, %rdx
    movq    %rax, %rsi
    call    fib_helper.1752
    addq    $8, %rsp
    .cfi_def_cfa_offset 8
.L6:
    rep ret
    .cfi_endproc
.LFE1:
    .size   fib_helper.1752, .-fib_helper.1752
    .globl  fib_tc
    .type   fib_tc, @function
fib_tc:
.LFB0:
    .cfi_startproc
    subq    $8, %rsp
    .cfi_def_cfa_offset 16
    movzbl  %dil, %edi
    movl    $0, %edx
    movl    $1, %esi
    call    fib_helper.1752
    addq    $8, %rsp
    .cfi_def_cfa_offset 8
   ret
   .cfi_endproc
.LFE0:
    .size   fib_tc, .-fib_tc
    .ident  "GCC: (GNU) 4.8.2 20131212 (Red Hat 4.8.2-7)"
   .section .note.GNU-stack,"",@progbits

This version (which works) does not seem to have been tail call optimized (based on the call to fib_helper above).

As shown above I'm using gcc 4.8.2 on Fedora 20 Linux (64 bit).

Before I conclude that there is a bug in gcc related to TCO I wanted to check to make sure I haven't done something wrong in the code or in my analysis.

EDIT: as it was pointed out in the comments that there could be an issue with nested functions, I also tried using separate functions as follows and that always returns 0 as well:

uint64_t fib_helper(uint8_t n, uint64_t acc, uint64_t prev) {
  if(n == 0) 
    return acc;
  else
    return fib_helper( n-1, acc+prev, acc);
}

uint64_t fib_tc( uint8_t n) {
  fib_helper(n,1,0);
}

EDIT2: As pointed out in the comments below, I was missing the return (I guess I've been doing too much OCaml lately :) It should be:

uint64_t fib_tc( uint8_t n) {
  uint64_t fib_helper(uint8_t n, uint64_t acc, uint64_t prev) {
    if(n == 0) 
      return acc;
    else
      return fib_helper( n-1, acc+prev, acc);
  }
  return fib_helper(n,1,0);
}