Rootbeer CUDA 示例代码的吞吐量增益量化（Rootbeer CUDA example code: quantified throughput gain）

以下是针对 Nvidia CUDA 的 Rootbeer 示例代码。我在一台运行 Ubuntu 12.04（Precise）的笔记本电脑上，通过 Bumblebee 和 optirun 运行它。这台笔记本电脑采用了 NVIDIA Optimus 技术，因此需要使用 optirun。它的 GPU 是 NVIDIA GeForce GT 540M，Nvidia 官网称其有 96 个核心。我几乎没有得到任何吞吐量增益。问题可能出在哪里？

package com.random.test;

import java.util.ArrayList;
import java.util.Formatter;
import java.util.List;

import edu.syr.pcpratts.rootbeer.runtime.Kernel;
import edu.syr.pcpratts.rootbeer.runtime.Rootbeer;

/**
 * Benchmark driver that compares the time of one {@link ArraySum} job called
 * directly as a Java method with the time of {@link #numberOfJobs} such jobs
 * submitted together through {@code Rootbeer.runAll}, and prints the resulting
 * throughput gain.
 */
public class ArraySumApp {
    final static int numberOfJobs = 1024; // 1024 in the original example
    final static int sizeOfArray = 512; // 512 in the original example

    /**
     * Expected sum of one input array, i.e. 0 + 1 + ... + (sizeOfArray - 1).
     * Derived from sizeOfArray so the checks stay valid when the array size is
     * changed (130816 for the original size of 512). Only meaningful while
     * the product fits in an int.
     */
    final static int theAnswer = sizeOfArray * (sizeOfArray - 1) / 2;

    /**
     * Creates one ArraySum job per input array and passes them all to
     * Rootbeer in a single runAll call.
     *
     * @param arrays the arrays to sum; job i reads arrays.get(i)
     * @return an array of arrays.size() elements that job i was given as its
     *         output array, with i as its output index
     */
    public int[] sumArrays(List<int[]> arrays) {

        int[] ret = new int[arrays.size()];
        List<Kernel> jobs = new ArrayList<Kernel>(arrays.size());
        for (int i = 0; i < arrays.size(); ++i) {
            jobs.add(new ArraySum(arrays.get(i), ret, i));
        }

        Rootbeer rootbeer = new Rootbeer();
        rootbeer.runAll(jobs);
        return ret;
    }

    /**
     * Builds a fresh array of sizeOfArray elements holding 0, 1, 2, ...
     */
    private static int[] newRampArray() {
        int[] array = new int[ArraySumApp.sizeOfArray];
        for (int j = 0; j < array.length; ++j) {
            array[j] = j;
        }
        return array;
    }

    /**
     * Times a single job by calling its gpuMethod() directly (an ordinary
     * Java method call, without going through Rootbeer) and prints the result.
     *
     * @return the elapsed time of that one call in milliseconds
     */
    private static long measureOneJob() {

        int[] destination = new int[1];
        Kernel job = new ArraySum(newRampArray(), destination, 0);

        ElapsedTimer et = new ElapsedTimer();
        job.gpuMethod();
        long timeInMs = et.stopInMilliseconds();
        System.out.println("measureOneJob " + et.stringInMilliseconds());

        assert destination[0] == ArraySumApp.theAnswer : "cosmic rays";
        return timeInMs;
    }

    /**
     * Runs the benchmark: measures one job, measures all jobs through
     * Rootbeer, prints the gain factor and verifies every result.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        // the result checks below are plain asserts, so refuse to run without -ea
        Helper.assertAssertionEnabled();

        // measure the time to do one job; the first measurement is discarded
        // and only the second is used (presumably a warm-up run)
        ArraySumApp.measureOneJob();
        long oneJob = ArraySumApp.measureOneJob();

        ArraySumApp app = new ArraySumApp();
        List<int[]> arrays = new ArrayList<int[]>(ArraySumApp.numberOfJobs);

        // you want 1000s of threads to run on the GPU all at once for speedups
        for (int i = 0; i < ArraySumApp.numberOfJobs; ++i) {
            arrays.add(newRampArray());
        }

        ElapsedTimer et = new ElapsedTimer();
        int[] sums = app.sumArrays(arrays);
        long allJobs = et.stopInMilliseconds();
        System.out.println("measureAllJobs " + et.stringInMilliseconds());

        // ratio of (numberOfJobs x single-job time) to the measured batch time
        double gainFactor = ((double) ArraySumApp.numberOfJobs) * oneJob
                / allJobs;
        System.out.println(String.format(
                "throughput gain factor %.1f\nthroughput gain %.1f\n",
                gainFactor, gainFactor - 1.0d));

        // check the number of answers is correct
        assert sums.length == ArraySumApp.numberOfJobs : "cosmic rays";

        // check they all have the answer
        for (int i = 0; i < ArraySumApp.numberOfJobs; i++) {
            assert sums[i] == ArraySumApp.theAnswer : "cosmic rays";
        }
    }
}

/**
 * Kernel that sums the elements of one int array and stores the total in a
 * given slot of an output array. The summation is repeated
 * {@link #repetitionFactor} times, each pass storing the same total again.
 */
class ArraySum implements Kernel {

    final static int repetitionFactor = 100000;

    private int[] source;
    private int[] ret;
    private int index;

    /**
     * @param src the array whose elements are summed
     * @param dst the array that receives the total
     * @param i   the position in dst that this job writes to
     */
    public ArraySum(int[] src, int[] dst, int i) {
        this.source = src;
        this.ret = dst;
        this.index = i;
    }

    /**
     * Recomputes the total of source from scratch on every pass and writes it
     * to ret[index] at the end of each pass.
     */
    public void gpuMethod() {
        int pass = 0;
        while (pass < ArraySum.repetitionFactor) {
            int total = 0;
            int pos = 0;
            while (pos < source.length) {
                total += source[pos];
                ++pos;
            }
            // stored inside the pass loop, once per repetition
            ret[index] = total;
            pass++;
        }
    }
}

/**
 * Static helpers for the benchmark's self-checks. Not instantiable.
 */
class Helper {
    private Helper() {
    }

    /**
     * Returns normally when Java assertions are enabled for this class;
     * otherwise fails through {@link #noteCosmicRays()}.
     */
    static void assertAssertionEnabled() {
        boolean assertionsOn = false;
        // The assignment is only evaluated when assertions are switched on.
        assert assertionsOn = true;
        if (!assertionsOn) {
            Helper.noteCosmicRays();
        }
    }

    /**
     * Signals a programmer design or logic error.
     *
     * @throws RuntimeException always, with the message "cosmic rays"
     */
    static void noteCosmicRays() {
        throw new RuntimeException("cosmic rays");
    }
}

/**
 * Stopwatch that starts when it is constructed. Each stop... method records
 * the time elapsed since construction; the string... methods format the most
 * recently recorded value and report 0 if the timer has never been stopped.
 * The timer may be stopped repeatedly; every stop measures from construction.
 */
class ElapsedTimer {
    private static final long NANOS_PER_MILLI = 1000000L;

    // System.nanoTime() is monotonic, so the measured interval is not affected
    // by adjustments of the wall clock while the timer is running.
    private final long t0;
    private long savedStopInMilliseconds;

    /** Starts the timer. */
    public ElapsedTimer() {
        this.t0 = System.nanoTime();
    }

    /**
     * Records and returns the elapsed time.
     *
     * @return whole milliseconds elapsed since construction
     */
    public long stopInMilliseconds() {
        return stop();
    }

    /**
     * Formats the last recorded elapsed time; relies on a saved stop.
     *
     * @return the saved value as e.g. "26 ms"
     */
    public String stringInMilliseconds() {
        return String.format("%d ms", this.savedStopInMilliseconds);
    }

    /**
     * Stops the timer and formats the elapsed time in milliseconds.
     *
     * @return the elapsed time as e.g. "26 ms"
     */
    public String stopStringInMilliseconds() {
        stop();
        return stringInMilliseconds();
    }

    /**
     * Formats the last recorded elapsed time as fractional seconds; relies on
     * a saved stop.
     *
     * @return the saved value with three decimals, followed by " s"
     */
    public String stringInSecondsAndMilliseconds() {
        return String.format("%5.3f s", this.savedStopInMilliseconds / 1000.0d);
    }

    /**
     * Stops the timer and formats the elapsed time as fractional seconds.
     *
     * @return the elapsed time with three decimals, followed by " s"
     */
    public String stopStringInSecondsAndMilliseconds() {
        stop();
        return stringInSecondsAndMilliseconds();
    }

    /**
     * Records the elapsed time and returns it in seconds.
     *
     * @return elapsed seconds, rounded to the nearest whole second
     */
    public long stopInSeconds() {
        return (stop() + 500L) / 1000L; // rounding
    }

    /**
     * Formats the last recorded elapsed time in whole seconds; relies on a
     * saved stop.
     *
     * @return the saved value rounded to the nearest second, e.g. "24 s"
     */
    public String stringInSeconds() {
        long elapsed = (this.savedStopInMilliseconds + 500L) / 1000L; // rounding
        return String.format("%d s", elapsed);
    }

    /**
     * Stops the timer and formats the elapsed time in whole seconds.
     *
     * @return the elapsed time rounded to the nearest second, e.g. "24 s"
     */
    public String stopStringInSeconds() {
        stop();
        return stringInSeconds();
    }

    /**
     * This is private. Use the stopInMilliseconds method if this is what you
     * need. Saves the elapsed milliseconds so the string... methods can
     * format them later.
     */
    private long stop() {
        savedStopInMilliseconds = (System.nanoTime() - this.t0) / NANOS_PER_MILLI;
        return savedStopInMilliseconds;
    }
}

这是输出：

measureOneJob 110 ms
measureOneJob 26 ms
CudaRuntime2 ctor: elapsedTimeMillis: 609
measureAllJobs 24341 ms
throughput gain factor 1.1
throughput gain 0.1

Answer 1:

Rootbeer 的开发者表示，对数组元素求和的这段示例代码并不是最好的示例，换用其他示例才能体现出吞吐量增益。

Answer 2:

你可以看到： https://github.com/pcpratts/rootbeer1/tree/develop/gtc2013/Matrix

这是为 2013 年 NVIDIA GTC 大会准备的示例。与使用转置的 4 核 Java 矩阵乘法相比，我获得了 20 倍的加速。

这个示例是在 GPU 上使用共享内存的分块（tiled）矩阵乘法。根据 NVIDIA 的文献，使用共享内存是获得良好加速比最重要的方面之一。要使用共享内存，需要让一个线程块中的每个线程把值加载到共享数组中，然后多次重用这些共享的值。这样可以节省从全局内存取数的时间。

在当前的 Tesla 2.0 架构上，一次全局内存读取大约需要 200–300 个时钟周期，而一次共享内存读取大约只需 2–3 个时钟周期。