
Tensorflow “undefined symbol” when loading custom op

Posted 2019-06-07 17:46

Question:

I am using Tensorflow 1.9rc0, compiled from source with Bazel 0.15.0, with CUDA support enabled (CUDA 9.2, cuDNN 7).

I am trying to build a custom op that executes a CUDA kernel. I followed the documentation on this topic and studied a few existing op implementations while developing it.

I ended up with the following code:

kernel_example.h:

#ifndef KERNEL_EXAMPLE_H_
#define KERNEL_EXAMPLE_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/platform/types.h"


namespace functor {

template <typename Device, typename T>
struct ExampleFunctor {
  void operator()(const Device& d,
                  const T* input, const T* filter, T* output,
                  int in_depth, int input_cols, int input_rows,
                  int out_depth, int filter_cols, int filter_rows,
                  int stride_rows, int stride_cols,
                  int n_elements);
};

#if GOOGLE_CUDA

typedef Eigen::GpuDevice GPUDevice;

template <typename T>
struct ExampleFunctor<GPUDevice, T> {
    void operator()(
        const GPUDevice& d,
        const T* input, const T* filter, T* output,
        int in_depth, int input_cols, int input_rows,
        int out_depth, int filter_cols, int filter_rows,
        int stride_rows, int stride_cols,
        int n_elements);
};

#endif //GOOGLE_CUDA

}  // namespace functor

#endif  // KERNEL_EXAMPLE_H_

kernel_example.cc:

#include "kernel_example.h"
#include "tensorflow/core/framework/op_kernel.h"

#include "tensorflow/core/framework/common_shape_fns.h"

using namespace tensorflow;

using CPUDevice = Eigen::ThreadPoolDevice;
using GPUDevice = Eigen::GpuDevice;

REGISTER_OP("MyConvGpu")
    .Input("input: T")
    .Input("filter: T")
    .Output("output: T")
    .SetShapeFn(tensorflow::shape_inference::UnknownShape);

// OpKernel definition.
// template parameter <T> is the datatype of the tensors.
template <typename Device, typename T>
class ExampleOp : public OpKernel {
 public:
  explicit ExampleOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
// Loading op parameters and defining variables (elided in the question).
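
    // Illustrative sketch only, not part of the original post: the input
    // tensors would typically be fetched and the output allocated roughly
    // like this (the output shape here is a placeholder assumption; the
    // real op would compute it from the filter and stride parameters):
    const Tensor& input = context->input(0);
    const Tensor& filter = context->input(1);
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input.shape(), &output));
    // in_depth, input_cols, input_rows, out_depth, filter_cols, filter_rows,
    // stride_rows and stride_cols would be derived from the shapes and op
    // attributes at this point.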

    functor::ExampleFunctor<Device, T> functor;
    functor(
        context->eigen_device<Device>(),
        input.flat<T>().data(),
        filter.flat<T>().data(),
        output->flat<T>().data(),
        in_depth, input_cols, input_rows,
        out_depth, filter_cols, filter_rows,
        stride_rows, stride_cols,
        static_cast<int>(output->NumElements()));
  }
};

#if GOOGLE_CUDA

#define REGISTER_GPU_KERNEL(T)                                    \
  REGISTER_KERNEL_BUILDER(Name("MyConvGpu")                       \
                              .Device(DEVICE_GPU)                 \
                              .TypeConstraint<T>("T"),            \
                              ExampleOp<GPUDevice, T>);

// Register only the types instantiated in kernel_example.cu.cc.
// (Using TF_CALL_GPU_NUMBER_TYPES here as well would register float and
// double a second time, and Eigen::half is not instantiated below.)
//REGISTER_GPU_KERNEL(Eigen::half)
REGISTER_GPU_KERNEL(float)
REGISTER_GPU_KERNEL(double)
REGISTER_GPU_KERNEL(int)

#endif

kernel_example.cu.cc:

#ifdef GOOGLE_CUDA

#define EIGEN_USE_GPU
#include "kernel_example.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

using namespace tensorflow;

namespace functor {

// Define the CUDA kernel.
template <typename T>
__global__ void ExampleCudaKernel(const T* input, const T* filter, T* output,
                               const int in_depth, const int input_cols, const int input_rows,
                               const int out_depth, const int filter_cols, const int filter_rows,
                               const int stride_rows, const int stride_cols,
                               const int n_elements) {
    // Kernel here
}

// Define the GPU implementation that launches the CUDA kernel.
template <typename T>
void ExampleFunctor<GPUDevice, T>::operator()(
    const Eigen::GpuDevice& d,
    const T* input, const T* filter, T* output,
    int in_depth, int input_cols, int input_rows,
    int out_depth, int filter_cols, int filter_rows,
    int stride_rows, int stride_cols,
    int n_elements) {

  // Launch the CUDA kernel on the op's device stream.
  ExampleCudaKernel<T>
        <<<(n_elements + 255) / 256, 256, 0, d.stream()>>>(
            input, filter, output,
            in_depth, input_cols, input_rows,
            out_depth, filter_cols, filter_rows,
            stride_rows, stride_cols,
            n_elements);
}

// Explicitly instantiate functors for the types of OpKernels registered.
template struct ExampleFunctor<GPUDevice, float>;
template struct ExampleFunctor<GPUDevice, double>;
template struct ExampleFunctor<GPUDevice, int>;

} //namespace functor

#endif  // GOOGLE_CUDA

Everything compiles correctly, producing an example.so library file, with the following Bazel BUILD file:

load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")

tf_custom_op_library(
    name = "example.so",
    srcs = ["kernel_example.h", "kernel_example.cc"],
    gpu_srcs = ["kernel_example.h", "kernel_example.cu.cc"],
)
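
For reference, the library is built from inside the Tensorflow source tree with something like the following command (the target label is an assumption; it depends on where the BUILD file lives):

bazel build --config=opt --config=cuda //tensorflow/core/user_ops:example.so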

However, when loading this library into a Tensorflow program using

module = tf.load_op_library('./example.so')

I get the following output:

Traceback (most recent call last):
  File "mnist.py", line 51, in <module>
    my_conv_gpu_module = tf.load_op_library('./example.so')
  File "/usr/lib/python3.6/site-packages/tensorflow/python/framework/load_library.py", line 56, in load_op_library
    lib_handle = py_tf.TF_LoadLibrary(library_filename)
tensorflow.python.framework.errors_impl.NotFoundError: ./example.so: undefined symbol: _ZN10tensorflow16FormatFromStringERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_12TensorFormatE
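
Demangling the missing symbol (for example with c++filt) shows it is an ordinary Tensorflow framework function, referenced through the new libstdc++ string ABI (note the __cxx11 in the argument type):

$ c++filt _ZN10tensorflow16FormatFromStringERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPNS_12TensorFormatE
tensorflow::FormatFromString(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, tensorflow::TensorFormat*)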

I've developed other ops that don't use CUDA acceleration, and they load without any problem, even though their implementation is quite similar.

Also, I've read other threads about this error, but the solution always seems to be adding the --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" flag to the bazel build arguments, since Tensorflow's binary pip packages are built with gcc 4 (which predates the new libstdc++ ABI). I am doing so, but the error persists.
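
That is, the build command ends up looking roughly like this (target label again assumed):

bazel build --config=opt --config=cuda --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" //tensorflow/core/user_ops:example.so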

I've also tried this code in a different environment, with Tensorflow 1.8, CUDA 8 and cuDNN 6, but without any luck.

What am I missing?