I am translating the epsilon-greedy algorithm for multi-armed bandits from here. It is a rather nice demonstration of the power and elegance of Rcpp. However, the results from this version do not tally with those in the link above. I am aware that this is probably a very niche question, but I have no other venue to post it on!
A summary of the code is as follows. We have a set of arms, each of which pays out a reward with a pre-defined probability, and our job is to show that by mostly pulling the arm with the best estimated reward, while occasionally (with probability epsilon) exploring a random arm, we eventually converge on the best arm. A nice explanation of this algorithm is provided by John Myles White.
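For reference: once the value estimates settle, epsilon-greedy should pick the best arm with probability roughly (1 - epsilon) + epsilon/K. With epsilon = 0.1 and the K = 5 arms used below, that is about 0.9 + 0.02 = 0.92, so the arm with payout probability 0.9 ought to dominate the histogram of chosen arms.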
Now, to the code:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::plugins(cpp11)]]

// State of the epsilon-greedy algorithm: the exploration rate plus
// per-arm pull counts and estimated values.
struct EpsilonGreedy {
  double epsilon;
  std::vector<int> counts;
  std::vector<double> values;
};
// Index of the arm with the highest estimated value.
int index_max(std::vector<double>& v) {
  return std::distance(v.begin(), std::max_element(v.begin(), v.end()));
}

// Index of a randomly chosen arm.
int index_rand(std::vector<double>& v) {
  return R::runif(0, v.size() - 1);
}
// Exploit the best arm with probability 1 - epsilon; otherwise explore.
int select_arm(EpsilonGreedy& algo) {
  if (R::runif(0, 1) > algo.epsilon) {
    return index_max(algo.values);
  } else {
    return index_rand(algo.values);
  }
}
// Record the observed reward and update the chosen arm's estimated value.
void update(EpsilonGreedy& algo, int chosen_arm, double reward) {
  algo.counts[chosen_arm] += 1;
  int n = algo.counts[chosen_arm];
  double value = algo.values[chosen_arm];
  algo.values[chosen_arm] = ((n-1)/n) * value + (1/n) * reward;
}
// An arm that pays out 1 with probability p and 0 otherwise.
struct BernoulliArm {
  double p;
};

int draw(BernoulliArm arm) {
  if (R::runif(0, 1) > arm.p) {
    return 0;
  } else {
    return 1;
  }
}
// Run n_sims independent simulations of horizon pulls each and record,
// for every pull, the simulation number, time step, chosen arm and reward.
// [[Rcpp::export]]
DataFrame test_algorithm(double epsilon, std::vector<double>& means,
                         int n_sims, int horizon) {
  std::vector<BernoulliArm> arms;
  for (auto& mu : means) {
    BernoulliArm b = {mu};
    arms.push_back(b);
  }

  std::vector<int> sim_num, time, chosen_arms;
  std::vector<double> rewards;

  for (int sim = 1; sim <= n_sims; ++sim) {
    std::vector<int> counts(means.size(), 0);
    std::vector<double> values(means.size(), 0.0);
    EpsilonGreedy algo = {epsilon, counts, values};

    for (int t = 1; t <= horizon; ++t) {
      int chosen_arm = select_arm(algo);
      double reward = draw(arms[chosen_arm]);
      update(algo, chosen_arm, reward);

      sim_num.push_back(sim);
      time.push_back(t);
      chosen_arms.push_back(chosen_arm);
      rewards.push_back(reward);
    }
  }

  DataFrame results = DataFrame::create(Named("sim_num") = sim_num,
                                        Named("time") = time,
                                        Named("chosen_arm") = chosen_arms,
                                        Named("reward") = rewards);
  return results;
}
/*** R
library(ggplot2)

means <- c(0.1, 0.1, 0.1, 0.1, 0.9)
results <- test_algorithm(0.1, means, 5000, 250)

p2 <- ggplot(results) + geom_bar(aes(x = chosen_arm)) + theme_bw()
print(p2)
*/
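I run this from R with sourceCpp(), which compiles the C++ and then executes the embedded R chunk; the file name below is just a placeholder for wherever the code is saved:

library(Rcpp)
sourceCpp("epsilon_greedy.cpp")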
The plot of the arms chosen during the simulations (i.e., plot p2 above) is as follows:
Obviously, the first arm is being chosen disproportionately often despite having a low payout probability! What is going on?