C：pthread 多线程的性能比单线程还低 (C: performance of pthread, lower than single thread)

我对自己代码的性能感到困惑：单线程处理只需要 13 秒，但改用多线程后却要耗费 80 秒。我不确定 vector 是否同一时刻只能被一个线程访问；如果是这样，我很可能就得改用结构体数组而不是 vector 来存储数据。有人能好心帮忙看看吗？

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <ctime>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include <bangdb/database.h>
#include "SEQ.h"

#define NUM_THREADS 16

using namespace std;


// Work descriptor passed to a worker thread through pthread_create's
// void* argument. One instance is filled in per worker before it starts.
struct _thread_data_t {
    std::vector<FDT> *Query;   // vector of lookup keys the worker indexes into
    unsigned long start;       // index of the first key this worker handles
    unsigned long end;         // index of the last key this worker handles (inclusive)
    connection* conn;          // database connection used for the lookups
    int thread;                // worker number, used only in log output
};

// Alias kept so existing code can keep using the short name.
typedef struct _thread_data_t thread_data_t;



void *thr_func(void *arg) {

    thread_data_t *data = (thread_data_t *)arg;
    std::vector<FDT> *Query = data->Query;
    unsigned long start = data->start;
    unsigned long end = data->end;
    connection* conn = data->conn;

    printf("thread %d started %lu -> %lu\n", data->thread, start, end);

    for (unsigned long i=start;i<=end ;i++ )
    {
        FDT *fout = conn->get(&((*Query).at(i)));
        if (fout == NULL)
        {
            //printf("%s\tNULL\n", s);

        }
        else
        {
            printf("Thread:%d\t%s\n", data->thread, fout->data);
        }
    }

    pthread_exit(NULL);
}


int main(int argc, char *argv[])
{

    if (argc<2)
    {
        printf("USAGE: ./seq <.txt>\n");
        printf("/home/rd/SCRIPTs/12X18610_L5_I052.R1.clean.code.seq\n");

        exit(-1);
    }
    printf("%s\n", argv[1]);

    vector<FDT> Query;

    FILE* fpin;
    if((fpin=fopen(argv[1],"r"))==NULL)  {
        printf("Can't open Input file %s\n", argv[1]);
        return -1; 
    }

    char *key = (char *)malloc(36);

    while (fscanf(fpin, "%s", key) != EOF)
    {
        SEQ * sequence = new SEQ(key);

        FDT *fk = new FDT( (void*)sequence, sizeof(*sequence) );

        Query.push_back(*fk);
    }

    unsigned long Querysize = (unsigned long)(Query.size());
    std::cout << "myvector stores " << Querysize << " numbers.\n";



    //create database, table and connection
    database* db = new database((char*)"berrydb");

    //get a table, a new one or existing one, walog tells if log is on or off
    table* tbl = db->gettable((char*)"hg19", JUSTOPEN);

    if(tbl == NULL)
    {
        printf("ERROR:table NULL error");
        exit(-1);
    }

    //get a new connection
    connection* conn = tbl->getconnection();
    if(conn == NULL)
    {
        printf("ERROR:connection NULL error");
        exit(-1);
    }

    cerr<<"begin querying...\n";


    time_t begin, end;
    double duration;
    begin = clock();




    unsigned long ThreadDealSize = Querysize/NUM_THREADS;
    cerr<<"Querysize:"<<ThreadDealSize<<endl;



    pthread_t thr[NUM_THREADS];
    int rc;

    thread_data_t thr_data[NUM_THREADS];

    for (int i=0;i<NUM_THREADS ;i++ )
    {
        unsigned long ThreadDealStart = ThreadDealSize*i;
        unsigned long ThreadDealEnd   = ThreadDealSize*(i+1) - 1;

        if (i == (NUM_THREADS-1) )
        {
            ThreadDealEnd = Querysize-1;
        }

        thr_data[i].conn = conn;
        thr_data[i].Query = &Query;
        thr_data[i].start = ThreadDealStart;
        thr_data[i].end = ThreadDealEnd;
        thr_data[i].thread = i;
    }


    for (int i=0;i<NUM_THREADS ;i++ )
    {
        if (rc = pthread_create(&thr[i], NULL, thr_func, &thr_data[i]))
        {
          fprintf(stderr, "error: pthread_create, rc: %d\n", rc);
          return EXIT_FAILURE;
        }
    }


    for (int i = 0; i < NUM_THREADS; ++i) {
        pthread_join(thr[i], NULL);
    }


    cerr<<"done\n"<<endl;
    end = clock();
    duration = double(end - begin) / CLOCKS_PER_SEC;
    cerr << "runtime:   " << duration << "\n" << endl;

    db->closedatabase(OPTIMISTIC);
    delete db;
    printf("Done\n");


  return EXIT_SUCCESS;
}

和标准库中的所有数据结构一样，vector 的方法是可重入的，但不是线程安全的。这意味着不同的实例可以由多个线程各自独立地访问，但同一个实例在同一时刻只能被一个线程访问，而这一点必须由你自己来保证。不过既然你为每个线程使用的是各自独立的 vector，这并不是你的问题所在。

你的问题很可能出在 printf 上。printf 是线程安全的，也就是说你可以从任意数量的线程同时调用它，但代价是它在内部被互斥锁（mutex）包裹着。

你的程序中多线程部分的绝大部分工作都是在 printf 内部完成的。所以很可能发生的情况是：所有线程启动后很快都执行到 printf，除了第一个线程以外，其余线程都会在那里停下来。当 printf 执行完毕并释放互斥锁时，系统会考虑调度那些正在等待这把锁的线程。它很可能会这样做，于是就发生一次相当慢的上下文切换。而且每一次 printf 之后都会重复这个过程。

具体如何发生取决于实际使用的是哪种锁原语，而这又取决于你的操作系统和标准库的版本。系统每次本应只唤醒下一个等待者，但许多实现实际上会把所有等待者都唤醒。因此，除了各个 printf 基本上以轮转（round-robin）的方式执行、每次都要付出一次上下文切换的代价之外，还可能出现相当多额外的虚假唤醒：线程醒来后只是发现锁仍被持有，然后又重新进入休眠。

所以，从中得到的教训是：线程并不会自动让程序变快。它们只在以下情况下才有帮助：

线程把大部分时间花在阻塞式系统调用上。例如在网络服务器中，线程先等待来自套接字的数据，然后等待响应所需的数据从磁盘读出，最后等待网络接受响应。在这种情况下，只要各线程基本上相互独立，使用很多线程是有帮助的。
线程的数量恰好与 CPU 的硬件线程数相同。目前常见的数目是 4（四核，或者带超线程的双核）。更多的线程在物理上无法并行运行，所以它们不会带来任何收益，反而会引入一些开销。因此 16 个线程是过多了。

而当所有线程都在操作同一批对象时，线程永远不会有帮助，因为它们最终还是把大部分时间花在等待锁上。除了你自己加锁的那些对象之外，还要记住，输入和输出的文件句柄在内部同样必须加锁。

内存分配同样需要在内部进行线程间同步，但现代的分配器会为各个线程设置独立的内存池，以避免其中的大部分同步；如果事实证明默认分配器在线程很多时太慢，也有一些专门的分配器可供使用。