I implemented the following tail-call recursive fibonacci function to try out gcc's Tail Call Optimization (as mentioned here):
#include <stdint.h>
uint64_t fib_tc( uint8_t n) {
uint64_t fib_helper(uint8_t n, uint64_t acc, uint64_t prev) {
if(n == 0)
return acc;
else
return fib_helper( n-1, acc+prev, acc);
}
fib_helper(n,1,0);
}
Initially, I did not specify a -O level (so it defaults to 1). The function worked as expected with -O1. Later, I tried compiling with -O2, but that broke the function - it always returns 0. Same result when using -O3.
I took a look at the assembly output with -O2 (using: gcc -O2 -std=gnu99 -S fib_tc.c ) and see the following assembly:
.file "fib_tc.c"
.text
.p2align 4,,15
.type fib_helper.1752, @function
fib_helper.1752:
.LFB1:
.cfi_startproc
testb %dil, %dil
movq %rsi, %rax
jne .L4
rep ret
.p2align 4,,10
.p2align 3
.L4:
leaq (%rsi,%rdx), %rsi
subl $1, %edi
movq %rax, %rdx
movzbl %dil, %edi
jmp fib_helper.1752
.cfi_endproc
.LFE1:
.size fib_helper.1752, .-fib_helper.1752
.p2align 4,,15
.globl fib_tc
.type fib_tc, @function
fib_tc:
.LFB0:
.cfi_startproc
testb %dil, %dil
jne .L11
ret
.p2align 4,,10
.p2align 3
.L11:
subl $1, %edi
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $1, %edx
movzbl %dil, %edi
movl $1, %esi
call fib_helper.1752
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size fib_tc, .-fib_tc
.ident "GCC: (GNU) 4.8.2 20131212 (Red Hat 4.8.2-7)"
.section .note.GNU-stack,"",@progbits
I notice that with -O2 it seems to have done the TCO (notice the line above: jmp fib_helper.1752). However, this is the non-working version that always returns 0.
I then generated assembly using -O1 to check the difference ( gcc -O1 -std=gnu99 -S fib_tc.c -o fib_tc1.S ):
.file "fib_tc.c"
.text
.type fib_helper.1752, @function
fib_helper.1752:
.LFB1:
.cfi_startproc
movq %rsi, %rax
testb %dil, %dil
je .L6
subq $8, %rsp
.cfi_def_cfa_offset 16
leaq (%rsi,%rdx), %rax
subl $1, %edi
movzbl %dil, %edi
movq %rsi, %rdx
movq %rax, %rsi
call fib_helper.1752
addq $8, %rsp
.cfi_def_cfa_offset 8
.L6:
rep ret
.cfi_endproc
.LFE1:
.size fib_helper.1752, .-fib_helper.1752
.globl fib_tc
.type fib_tc, @function
fib_tc:
.LFB0:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movzbl %dil, %edi
movl $0, %edx
movl $1, %esi
call fib_helper.1752
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size fib_tc, .-fib_tc
.ident "GCC: (GNU) 4.8.2 20131212 (Red Hat 4.8.2-7)"
.section .note.GNU-stack,"",@progbits
This version (which works) does not seem to have been tail call optimized (based on the call to fib_helper above).
As shown above I'm using gcc 4.8.2 on Fedora 20 Linux (64 bit).
Before I conclude that there is a bug in gcc related to TCO I wanted to check to make sure I haven't done something wrong in the code or in my analysis.
EDIT: as it was pointed out in the comments that there could be an issue with nested functions, I also tried using separate functions as follows and that always returns 0 as well:
uint64_t fib_helper(uint8_t n, uint64_t acc, uint64_t prev) {
if(n == 0)
return acc;
else
return fib_helper( n-1, acc+prev, acc);
}
uint64_t fib_tc( uint8_t n) {
fib_helper(n,1,0);
}
EDIT2: As pointed out in the comments below, I was missing the return (I guess I've been doing too much OCaml lately :) It should be:
uint64_t fib_tc( uint8_t n) {
uint64_t fib_helper(uint8_t n, uint64_t acc, uint64_t prev) {
if(n == 0)
return acc;
else
return fib_helper( n-1, acc+prev, acc);
}
return fib_helper(n,1,0);
}