Insert boost::dynamic_bitset<> into boost::bima

2019-08-21 13:31发布

问题:

I am trying to insert boost::dynamic_bitset<> into boost::bimap. However, it is very slow as compared to inserting integers or strings. Minimal Example is given, the code of which is shown below

// Example program
#include <iostream>
#include <string>
#include <boost/bimap.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>


namespace std {
    template <typename Block, typename Alloc> struct hash<boost::dynamic_bitset<Block, Alloc> > {
        size_t operator()(boost::dynamic_bitset<Block, Alloc> const& bs) const {
            size_t seed = boost::hash_value(bs.size());

            std::vector<Block> blocks(bs.num_blocks());
            boost::hash_range(seed, blocks.begin(), blocks.end());

            return seed;
        }
    };
}


namespace bimaps = boost::bimaps;
    typedef boost::dynamic_bitset<> Bitset;
    typedef boost::bimap<
    bimaps::unordered_set_of<Bitset, std::hash<Bitset>>,
    bimaps::unordered_multiset_of<Bitset, std::hash<Bitset> > > bimap_reference;
    typedef bimap_reference::value_type position;
    bimap_reference reference_index_vector;


class bitsets{
public:
    void encode_string(
            std::string &sequence_content,
            std::string &binary_sequence);

    void encode_positions(
                std::string &pos,
                std::string &pos_binary_sequence);

};




int main()
{
    bitsets b;
    std::string binary_sequence, decoded_string ;
    std::string pos_binary_sequence, decoded_positions;
    std::string sequence_content = "ADGFGGFFAAACGFGCAAFGCAAFGCAFGCAAFGCAFGCAAGGGCAGDDDCGGAFFGCA";

    for(size_t i = 0; i < sequence_content.size(); i++){
        std::string substr = sequence_content.substr(i,15);
        b.encode_string(substr, binary_sequence);
        boost::dynamic_bitset<> bits = boost::dynamic_bitset<> (binary_sequence);
        std::string pos = std::to_string(i);
        b.encode_positions(pos, pos_binary_sequence);
        boost::dynamic_bitset<> pos_bits = boost::dynamic_bitset<> (pos_binary_sequence);
        reference_index_vector.insert(position(pos_bits, bits));
        binary_sequence.clear();
        pos_binary_sequence.clear();
        i += 14;
}
    for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
            iter != iend; ++iter ) {
        std::cout << iter->left << " <--> "<< iter->right <<std::endl;
    }
}



void bitsets::encode_string(std::string &substr, std::string &binary_sequence){
    for (size_t i = 0; i < substr.size(); ++i){
        switch (substr[i]){
        case 'A':
        case 'a':
            binary_sequence += "00";
            break;
        case 'C':
        case 'c':
            binary_sequence += "01";
            break;
        case 'D':
        case 'd':
            binary_sequence += "10";
            break;
        case 'G':
        case 'g':
            binary_sequence += "110";
            break;
        case 'F':
        case 'f':
            binary_sequence += "110";
            break;
        }
    }
}

void bitsets::encode_positions(std::string &pos, std::string &pos_binary_sequence){
    for(size_t i = 0; i < pos.size(); ++i){
        switch (pos[i]){
        case '0':
            pos_binary_sequence += "1101";
            break;
        case '1':
            pos_binary_sequence += "100";
            break;
        case '2':
            pos_binary_sequence += "1110";
            break;
        case '3':
            pos_binary_sequence += "1100";
            break;
        case '4':
            pos_binary_sequence += "101";
            break;
        case '5':
            pos_binary_sequence += "000";
            break;
        case '6':
            pos_binary_sequence += "001";
            break;
        case '7':
            pos_binary_sequence += "011";
            break;
        case '8':
            pos_binary_sequence += "1111";
            break;
        case '9':
            pos_binary_sequence += "010";
            break;
        }
    }
}

For a string of 5 million characters long it takes a lot of time (approx 5 mins) intead of a few seconds for integers/strings to insert into the bimap. Why boost::dynamic_bitset<> is slow and how can I improve it.

回答1:

I refactored a little, fixed the hash function (I think - pls check it) and ran the test for a sequence of 5,000,000 random thingumyjigs (aleles?)

With a release build, the test ran in 900 milliseconds - less than one second.

Here's the code. I make no claims about the correctness or efficiency of the algorithm. I suspect there is room for improvement.

// Example program
#include <iostream>
#include <string>
#include <boost/bimap.hpp>
#include <boost/dynamic_bitset.hpp>
#include <boost/bimap/unordered_set_of.hpp>
#include <boost/bimap/unordered_multiset_of.hpp>
#include <random>
#include <chrono>
#include <thread>

namespace std {
    template <typename Block, typename Alloc>
    struct hash<boost::dynamic_bitset<Block, Alloc> > {

        using bitset_type = boost::dynamic_bitset<Block, Alloc>;
        using block_type = typename bitset_type::block_type ;

        size_t operator()(boost::dynamic_bitset<Block, Alloc> const& bs) const
        {
            thread_local static std::vector<block_type> block_data;
            auto blocks = bs.num_blocks();
            block_data.assign(blocks, 0);
            to_block_range(bs, block_data.begin());
            return boost::hash<std::vector<block_type>>()(block_data);
        }
    };
}



namespace bimaps = boost::bimaps;
typedef boost::dynamic_bitset<> Bitset;
typedef boost::bimap<
        bimaps::unordered_set_of<Bitset, std::hash<Bitset>>,
        bimaps::unordered_multiset_of<Bitset, std::hash<Bitset> > > bimap_reference;
typedef bimap_reference::value_type position;


class bitsets{
public:
    void encode_string(
            std::string &sequence_content,
            std::string &binary_sequence);

    void encode_positions(
            std::string &pos,
            std::string &pos_binary_sequence);

};

bimap_reference test(const std::string& sequence_content)
{
    bimap_reference reference_index_vector;

    bitsets b;
    std::string binary_sequence, decoded_string ;
    std::string pos_binary_sequence, decoded_positions;

    for(size_t i = 0; i < sequence_content.size(); i++){
        std::string substr = sequence_content.substr(i,15);
        b.encode_string(substr, binary_sequence);
        boost::dynamic_bitset<> bits = boost::dynamic_bitset<> (binary_sequence);
        std::string pos = std::to_string(i);
        b.encode_positions(pos, pos_binary_sequence);
        boost::dynamic_bitset<> pos_bits = boost::dynamic_bitset<> (pos_binary_sequence);
        reference_index_vector.insert(position(pos_bits, bits));
        binary_sequence.clear();
        pos_binary_sequence.clear();
        i += 14;
    }

    return reference_index_vector;
}


int main()
{
    static const std::string valid_chars = "ADGF";
    std::random_device rd;
    auto eng = std::default_random_engine(rd());
    auto dist = std::uniform_int_distribution<int>(0, valid_chars.size() - 1);

    static const auto sequence_size = std::size_t(5'000'000);

    auto sequence_content = std::string(sequence_size, ' ');
    std::generate(sequence_content.begin(), sequence_content.end(), [&]{ return valid_chars[dist(eng)]; });

    auto start_time = std::chrono::system_clock::now();
    auto reference_index_vector = test(sequence_content);

    auto end_time = std::chrono::system_clock::now();

    std::cout
            << "test took: "
            << std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count()
            << " milliseconds\n";
    std::this_thread::sleep_for(std::chrono::seconds(5));


    for( bimap_reference::const_iterator iter = reference_index_vector.begin(), iend = reference_index_vector.end();
         iter != iend; ++iter ) {
        std::cout << iter->left << " <--> "<< iter->right <<std::endl;
    }
}



void bitsets::encode_string(std::string &substr, std::string &binary_sequence){
    for (size_t i = 0; i < substr.size(); ++i){
        switch (substr[i]){
            case 'A':
            case 'a':
                binary_sequence += "00";
                break;
            case 'C':
            case 'c':
                binary_sequence += "01";
                break;
            case 'D':
            case 'd':
                binary_sequence += "10";
                break;
            case 'G':
            case 'g':
                binary_sequence += "110";
                break;
            case 'F':
            case 'f':
                binary_sequence += "110";
                break;
        }
    }
}

void bitsets::encode_positions(std::string &pos, std::string &pos_binary_sequence){
    for(size_t i = 0; i < pos.size(); ++i){
        switch (pos[i]){
            case '0':
                pos_binary_sequence += "1101";
                break;
            case '1':
                pos_binary_sequence += "100";
                break;
            case '2':
                pos_binary_sequence += "1110";
                break;
            case '3':
                pos_binary_sequence += "1100";
                break;
            case '4':
                pos_binary_sequence += "101";
                break;
            case '5':
                pos_binary_sequence += "000";
                break;
            case '6':
                pos_binary_sequence += "001";
                break;
            case '7':
                pos_binary_sequence += "011";
                break;
            case '8':
                pos_binary_sequence += "1111";
                break;
            case '9':
                pos_binary_sequence += "010";
                break;
        }
    }
}