// maxSort1: one pass of a block-local sort. Each block stages a tile of
// `values` in shared memory, every thread sorts its own chunk of
// THREAD_WORK_SIZE consecutive elements with a small sorting network, the
// chunks are then scattered with stride blockDim.x, and the tile is written
// back. This is a single redistribution pass, not a complete sort by itself.
//
// Launch expectations: blockDim.x == NUM_THREADS; the sorting network below
// hard-codes THREAD_WORK_SIZE == 4 (five comparators over four slots).
//
// NOTE(review): the global offset `blockIdx.x * blockDim.x` advances by only
// blockDim.x elements per block, while each block consumes
// blockDim.x * THREAD_WORK_SIZE elements, so tiles overlap for gridDim.x > 1.
// Preserved as-is — this listing appears to assume a single-block launch;
// confirm against the launch site before running with multiple blocks.
__global__ static void maxSort1(int *values, int N) {
    // 1) COPY-INTO: stage this block's tile in shared memory. Consecutive
    // threads touch consecutive addresses, so the loads coalesce.
    __shared__ int shared[THREAD_WORK_SIZE * NUM_THREADS];
    for (unsigned k = 0; k < THREAD_WORK_SIZE; k++) {
        unsigned loc = k * blockDim.x + threadIdx.x;
        if (loc < N) {
            shared[loc] = values[loc + blockIdx.x * blockDim.x];
        }
    }
    __syncthreads();  // tile fully staged before any thread sorts

    // 2) SORT: each thread runs a 5-comparator sorting network over its own
    // 4-element chunk. Threads whose chunk extends past N skip the network;
    // the divergence here is per-thread and the barrier stays outside the
    // branch, so all threads still reach it.
    int index1 = threadIdx.x * THREAD_WORK_SIZE;
    int index2 = index1 + 1;
    int index3 = index1 + 2;
    int index4 = index1 + 3;
    if (index4 < N) {
        swapIfNecessary(shared, index1, index3);
        swapIfNecessary(shared, index2, index4);
        swapIfNecessary(shared, index1, index2);
        swapIfNecessary(shared, index3, index4);
        swapIfNecessary(shared, index2, index3);
    }
    __syncthreads();

    // 3) SCATTER: redistribute the sorted chunks with stride blockDim.x.
    // BUG FIX: the buffer was declared with a hard-coded factor of 300
    // instead of NUM_THREADS, disagreeing with `shared` above and
    // overflowing whenever blockDim.x > 300.
    // NOTE(review): unlike steps 1 and 4 this loop has no `< N` guard, so a
    // partial tile reads uninitialized shared memory — harmless only if N is
    // always a full tile; confirm with the caller.
    __shared__ int scattered[THREAD_WORK_SIZE * NUM_THREADS];
    unsigned int nextLoc = threadIdx.x;
    for (unsigned i = 0; i < THREAD_WORK_SIZE; i++) {
        scattered[nextLoc] = shared[threadIdx.x * THREAD_WORK_SIZE + i];
        nextLoc += blockDim.x;
    }
    __syncthreads();  // scatter complete before the coalesced write-back

    // 4) COPY-BACK: write the redistributed tile back to global memory.
    for (unsigned k = 0; k < THREAD_WORK_SIZE; k++) {
        unsigned loc = k * blockDim.x + threadIdx.x;
        if (loc < N) {
            values[loc + blockIdx.x * blockDim.x] = scattered[loc];
        }
    }
}
11.6 分岔研究历史
GPU 的历史都比较新，所以关于 GPU 的分岔分析资料也比较新：
Ryoo, S., Rodrigues, C., Baghsorkhi, S., Stone, S., Kirk, D., and Hwu, Wen-Mei. "Optimization principles and application performance evaluation of a multithreaded GPU using CUDA", PPoPP, pp. 73-82 (2008) —— CUDA 介绍
Coutinho, B., Diogo, S., Pereira, F., and Meira, W. "Divergence Analysis and Optimizations", PACT, pp. 320-329 (2011) —— 分岔分析与优化
Sampaio, D., Martins, R., Collange, S., and Pereira, F. "Divergence Analysis", TOPLAS, 2013 —— 分岔分析
// maxSort1: single block-local pass of a larger sort. The block copies a
// tile of `values` into shared memory, each thread orders its private
// THREAD_WORK_SIZE-element chunk via a fixed sorting network, the chunks
// are interleaved across the tile with stride blockDim.x, and the result
// is copied back out.
__global__ static void maxSort1(int *values, int N) {
    // Phase 1 — stage the tile in shared memory (coalesced reads).
    __shared__ int shared[THREAD_WORK_SIZE * NUM_THREADS];
    for (unsigned pass = 0; pass < THREAD_WORK_SIZE; ++pass) {
        unsigned slot = pass * blockDim.x + threadIdx.x;
        if (slot < N) {
            shared[slot] = values[slot + blockIdx.x * blockDim.x];
        }
    }
    __syncthreads();

    // Phase 2 — per-thread 4-element sorting network (5 comparators).
    // The network assumes THREAD_WORK_SIZE == 4.
    int index1 = threadIdx.x * THREAD_WORK_SIZE;
    int index2 = threadIdx.x * THREAD_WORK_SIZE + 1;
    int index3 = threadIdx.x * THREAD_WORK_SIZE + 2;
    int index4 = threadIdx.x * THREAD_WORK_SIZE + 3;
    // Only threads whose whole chunk lies inside the data participate;
    // the barrier below is outside the branch, so every thread reaches it.
    if (index4 < N) {
        swapIfNecessary(shared, index1, index3);
        swapIfNecessary(shared, index2, index4);
        swapIfNecessary(shared, index1, index2);
        swapIfNecessary(shared, index3, index4);
        swapIfNecessary(shared, index2, index3);
    }
    __syncthreads();

    // Phase 3 — interleave the sorted chunks across the tile.
    // NOTE(review): the factor 300 looks like it should be NUM_THREADS to
    // match `shared` above — kept verbatim to preserve behavior; confirm.
    __shared__ int scattered[THREAD_WORK_SIZE * 300];
    unsigned int nextLoc = threadIdx.x;
    for (unsigned chunkPos = 0; chunkPos < THREAD_WORK_SIZE; ++chunkPos) {
        scattered[nextLoc] = shared[threadIdx.x * THREAD_WORK_SIZE + chunkPos];
        nextLoc += blockDim.x;
    }
    __syncthreads();

    // Phase 4 — flush the tile back to global memory (coalesced writes).
    for (unsigned pass = 0; pass < THREAD_WORK_SIZE; ++pass) {
        unsigned slot = pass * blockDim.x + threadIdx.x;
        if (slot < N) {
            values[slot + blockIdx.x * blockDim.x] = scattered[slot];
        }
    }
}
11.6 分岔研究历史
GPU 的历史都比较新，所以关于 GPU 的分岔分析资料也比较新：
Ryoo, S., Rodrigues, C., Baghsorkhi, S., Stone, S., Kirk, D., and Hwu, Wen-Mei. "Optimization principles and application performance evaluation of a multithreaded GPU using CUDA", PPoPP, pp. 73-82 (2008) —— CUDA 介绍
Coutinho, B., Diogo, S., Pereira, F., and Meira, W. "Divergence Analysis and Optimizations", PACT, pp. 320-329 (2011) —— 分岔分析与优化
Sampaio, D., Martins, R., Collange, S., and Pereira, F. "Divergence Analysis", TOPLAS, 2013 —— 分岔分析