-->

Extending Rcpp function to input vector of any typ

2020-07-26 17:45发布

问题:

I have following function which does a simple loop on NumericVector and returns int type value.

  Rcpp::cppFunction({'
  int calc_streak( NumericVector x, int i1, int i2){
  int cur_streak=1;

  if (NumericVector::is_na(x[0])){
    cur_streak = NumericVector::get_na();
  } else {
    cur_streak = 1;
  }

  for(int j = i1; j <= i2 ; ++j) {
    if( x[ j ] == x[ j-1 ]){
      cur_streak += 1;

    } else if(NumericVector::is_na( x[ j ] )){
      cur_streak = NumericVector::get_na();

    } else {
      cur_streak = 1;

    }
  }
  return cur_streak;
}
"})

calc_streak(c(1,1,1,1),i1=0,i2=3)
# [1] 4

Function works fine for me but the real issue is when I'm trying to extend this functionality on other input-types. I've been searching on stack here and here, but those examples doesn't work in my case or I don't know how to use examples properly. I've tried few methods dealing with unknown input type, and none was successful in my case. Three examples below


  1. The simplest one inspired by this - created main function which run one of previously defined functions depending on type of argument TYPEOF(x). This function returns expected value for integer and numeric. For character session crashes

    Rcpp::cppFunction('
    #include <Rcpp.h>
    using namespace Rcpp;
    
    int streak_run_int(IntegerVector x, int i1, int i2){
      int cur_streak=1;
    
      if (IntegerVector::is_na(x[0])){
        cur_streak = NumericVector::get_na();
      } else {
       cur_streak = 1;
      }
    
      for(int j = i1; j <= i2 ; ++j) {
        if( x[ j ] == x[ j-1 ]){
         cur_streak += 1;
    
        } else if(IntegerVector::is_na( x[ j ] )){
          cur_streak = NumericVector::get_na();
    
        } else {
          cur_streak = 1;
    
        }
      }
      return cur_streak;
    }
    
    int streak_run_char(CharacterVector x, int i1, int i2){
      int cur_streak=1;
    
      if (CharacterVector::is_na(x[0])){
        cur_streak = NumericVector::get_na();
      } else {
        cur_streak = 1;
      }
    
      for(int j = i1; j <= i2 ; ++j) {
        if( x[ j ] == x[ j-1 ]){
        cur_streak += 1;
    
        } else if(CharacterVector::is_na( x[ j ] )){
          cur_streak = NumericVector::get_na();
    
        } else {
          cur_streak = 1;
    
        }
      }
      return cur_streak;
    }
    
    
      // [[Rcpp::export]]
    int streak_run4(SEXP x, int i1, int i2) {
      switch (TYPEOF(x)) {
      case INTSXP: {
        return streak_run_int(as<IntegerVector>(x), i1, i2);
      }
      case STRSXP: {
        return streak_run_char(as<CharacterVector>(x), i1, i2);
      }
      default: { return 0; }
      }
    }
    ')
    
    # expected results for int and real - for character session crashes
    streak_run4( c(1,1,1,1),i1=0, i2=3)
    streak_run4( as.integer(c(1,1,1,1)),i1=0, i2=3)
    streak_run4( as.character(c(1,1,1,1)),i1=0, i2=3) 
    

  1. Second function has exactly the same idea, but using template instead of defining multiple functions. Same results as above - session crash on character input

    Rcpp::cppFunction('
    #include <Rcpp.h>
    using namespace Rcpp;
    
    namespace impl {
    
      template <int RTYPE>
        int streak_run_impl(const Vector<RTYPE>& x, int i1, int i2)
      {
        int cur_streak=1;
    
        if (Vector<RTYPE>::is_na(x[0])){
          cur_streak = NumericVector::get_na();
        } else {
          cur_streak = 1;
        }
    
        for(int j = i1; j <= i2 ; ++j) {
          if( x[ j ] == x[ j-1 ]){
            cur_streak += 1;
    
          } else if(Vector<RTYPE>::is_na( x[ j ] )){
            cur_streak = NumericVector::get_na();
    
          } else {
            cur_streak = 1;
    
          }
        }
        return cur_streak;
        }
    
    }
    
    // [[Rcpp::export]]
    int streak_run3(SEXP x, int i1, int i2) {
      switch (TYPEOF(x)) {
      case INTSXP: {
        return impl::streak_run_impl(as<IntegerVector>(x), i1, i2);
      }
      case REALSXP: {
        return impl::streak_run_impl(as<NumericVector>(x), i1, i2);
      }
      case STRSXP: {
        return impl::streak_run_impl(as<CharacterVector>(x), i1, i2);
      }
      case LGLSXP: {
        return impl::streak_run_impl(as<LogicalVector>(x), i1, i2);
      }
      case CPLXSXP: {
        return impl::streak_run_impl(as<ComplexVector>(x), i1, i2);
      }
      default: {
        return 0;
      }
      }
    }
    ')
    
    streak_run3( c(1,1,1,1),i1=0, i2=3)
    streak_run3( as.integer(c(1,1,1,1)),i1=0, i2=3)
    streak_run3( as.character(c(1,1,1,1)),i1=0, i2=3)
    

  1. Another one is inspired by this article, and this time I wasn't even able to compile C++ function, while having an error use of overloaded operator '==' is ambiguous. Anyway, after examining two above examples, I don't expect any other result.

    Rcpp::cppFunction('
    #include <Rcpp.h>
    using namespace Rcpp;
    
    class streak_run2_impl {
      private:
      int i1;
      int i2;
    
      public:
      streak_run2_impl(int i1, int i2) : i1(i1), i2(i2) {}
    
      template <int RTYPE>
      IntegerVector operator()(const Vector<RTYPE>& x)
      {
    
        int cur_streak=1;
    
        if (Vector<RTYPE>::is_na(x[0])){
        cur_streak = NumericVector::get_na();
        } else {
        cur_streak = 1;
        }
    
        for(int j = i1; j <= i2 ; ++j) {
          if( x[ j ] == x[ j-1 ] ){
            cur_streak += 1;
    
          } else if(Vector<RTYPE>::is_na( x[ j ] )){
    
            cur_streak = NumericVector::get_na();
    
          } else {
            cur_streak = 1;
          }
        }
        return cur_streak;
      }
    };
    
    
    // [[Rcpp::export]]
    RObject streak_run2(RObject x, int i1 = 0, int i2=6){
      RCPP_RETURN_VECTOR(streak_run2_impl(i1, i2), x);
    }
    ')
    

So my question is: How to properly define this function to obtain results for input vector of any R class?
I would be obliged for any advices.

回答1:

I think the main error in examples are that you start your loop at j = 0 so you call operator[](-1). The following works for me. Make the following func.cpp

#include <Rcpp.h>
#include <algorithm>
using namespace Rcpp;

template <int RTYPE>
int streak_run_impl(const Vector<RTYPE>& x, int i1, int i2)
{
  int cur_streak = 1;

  if (Vector<RTYPE>::is_na(x[0])){
    cur_streak = NA_INTEGER;
  } else {
    cur_streak = 1;
  }

  for(int j = std::max(i1, 1) /* have to start at one at least */; 
      j < std::min(i2 + 1, (int)x.size()) /* check size of x */; ++j){
    if(x[j] == x[j - 1]){
      cur_streak += 1;

    } else if(Vector<RTYPE>::is_na(x[j])){
      cur_streak = NA_INTEGER;

    } else {
      cur_streak = 1;

    }
  }
  return cur_streak;
}

// [[Rcpp::export]]
int streak_run3(SEXP x, int i1, int i2) {
  switch (TYPEOF(x)) {
    case INTSXP: {
      return streak_run_impl(as<IntegerVector>(x), i1, i2);
    }
    case REALSXP: {
      return streak_run_impl(as<NumericVector>(x), i1, i2);
    }
    case STRSXP: {
      return streak_run_impl(as<CharacterVector>(x), i1, i2);
    }
    case LGLSXP: {
      return streak_run_impl(as<LogicalVector>(x), i1, i2);
    }
    case CPLXSXP: {
      return streak_run_impl(as<ComplexVector>(x), i1, i2);
    }
    default: {
      return 0;
    }
  }
}

Then run this R script with the working directory set to that of the .cpp file

Rcpp::sourceCpp("func.cpp")

streak_run3(c(1,1,1,1), i1=0, i2=3)
streak_run3(as.integer(c(1,1,1,1)), i1=0, i2=3)
streak_run3(as.character(c(1,1,1,1)), i1=0, i2=3)


回答2:

First, nice post! Unfortunately, none of the above resources you've linked are relevant to your problem as its due to a completely different error that wasn't picked up in the prototype function. For why the prototype returned a valid value on call, well that is pure luck.

As @BenjaminChristoffersen pointed out, the code is running into an undefined behavior (UB) due to an out-of-bounds (OOB) error occurring. His solution will effectively "fix" the problem.

However, to diagnose this in the future on your own, switch from using the element accessor [] to (), which checks that the element you are requesting is within the bounds. e.g. Is j in 0 to n - 1?

e.g.

  if (Vector<RTYPE>::is_na( x( 0 ) )){
  // ------------------------^---^

    cur_streak = NumericVector::get_na();
  } else {
    cur_streak = 1;
  }

  for(int j = i1; j <= i2 ; ++j) {
    if( x( j ) == x( j-1 )){
      // ^---^-----^-----^
      cur_streak += 1;

    } else if(Vector<RTYPE>::is_na( x( j ) )){
      // --------------------------- ^   ^

      cur_streak = NumericVector::get_na();

    } else {
      cur_streak = 1;

    }
  }

Running the same commands then give:

streak_run3( c(1,1,1,1),i1=0, i2=3)

Output:

Error in streak_run3(c(1, 1, 1, 1), i1 = 0, i2 = 3) : 
  Index out of bounds: [index=-1; extent=4].

Input:

streak_run3( as.integer(c(1,1,1,1)),i1=0, i2=3)

Output:

Error in streak_run3(as.integer(c(1, 1, 1, 1)), i1 = 0, i2 = 3) : 
  Index out of bounds: [index=-1; extent=4].

Input:

streak_run3( as.character(c(1,1,1,1)),i1=0, i2=3)

Output:

Error in streak_run3(as.character(c(1, 1, 1, 1)), i1 = 0, i2 = 3) : 
  Index out of bounds: [index=-1; extent=4].


标签: c++ r rcpp