## NVIDIA Interview Question

Software Engineer / Developers**Country:**United States

**Interview Type:**Written Test

```
#define M 1000
#define N 5
int list[M];
int sums[N] = { 0 };
void func(int n, int start, int end) {
for (int i = start; i < end; i++) {
sums[n] += list[i];
}
}
int main(int argc, char* argv[])
{
thread* threads[N];
int prev = 0;
int count = 0;
for (int i = 0; i < N; i++) {
if (i == N - 1) {
threads[i] = new thread(count, func, prev, M);
} else {
threads[i] = new thread(count, func, prev, (i + 1) * M / N);
}
count++;
prev = (i + 1) * M / N;
}
int sum = 0;
for (int i = 0; i < N; i++) {
threads[i]->join();
delete threads[i];
sum += sums[i];
}
cout << sum << endl;
return 0;
}
```

Note 1 : ask the interviewer if integer addition is atomic on the system, if it is, there is no need for adding up all the elements together at the end

Note 2 : if the number of threads is huge (as Vik said) you can use spin locks on a global integer and thereby enforce atomicity for integer addition

Each thread have to calculate a part of array. Size of part X is M/N, and last part size is M-X*(N-1); each thread add numbers to a local variable and add this value to global variable via atomic operation.

Yeah but then since you have N threads now merging the answer into a single global variable will need some kind of locking mechanism on that global variable. Now consider this if the number of threads is huge the step of merging the result into a single variable becomes bottleneck and all threads are waiting to take a lock on a single variable so that they can update the total sum...

You can break sums, since is an operation you can do in parallel

IE you have an array of 20 elements and 2 threads when in one thread you can sum 1-10 in one thread and the other 10 elements in other thread, the tricky part here in java is that at the end of every thread you will need to accumulate results in a share variable or something you will need to synchronize this if you don't want to have some race problems.

You may like to review map reduce (this is kind of the idea behind it)

hope it helps~

import java.util.concurrent.CountDownLatch;

public class Sum {

public static Integer sum=new Integer(0);

/**

* @param args

*/

public static void main(String[] args) {

int[] arr = new int[100];

for (int i = 0; i < arr.length; i++) {

arr[i] = 1;

}

int threads = 3;

CountDownLatch count = new CountDownLatch(threads);

int length = arr.length;

int num = length/threads;

for (int i = 0; i < threads-1; i++) {

new Thread(new SumPart(arr, num*i, (num*(i+1))-1, count)).start();

}

new Thread(new SumPart(arr, (threads-1)*num, length-1, count)).start();

try {

count.await();

} catch (Exception e) { }

System.out.println(sum);

}

}

class SumPart implements Runnable{

private CountDownLatch countDownLatch;

private int[] arr = null;

private int start,end=0;

public SumPart(int[] arr, int start, int end, CountDownLatch countDownLatch) {

this.arr = arr;

this.start = start;

this.end = end;

this.countDownLatch = countDownLatch;

}

@Override

public void run() {

while (start<=end) {

Sum.sum = Sum.sum+arr[start++];

}

countDownLatch.countDown();

}

}

Put the result of each thread into a linked-list. Using another thread , add the linked-list nodes deleting the nodes as it gets summed and then put the result in another node.

At the end of first summing phase there will be only one node in the linked-list containing the intermediate sum . Continuing the same process till we have only 1 node in the list for 3-5 times with same value.

DWORD SUBARRAY(DWORD *ARR,DWORD SIZE)

{

INT I=0;

CREATESEMPHORE(FREETHREADCNT,N)

WHILE(I<M)

{

WAITFORSINGLEOBJECT(FREETHREADCNT,INFINITE)

IF(I=M-1)&&(I%2!=0)

{

CREATETHREAD(TSUM,ARR[I],0);

I++;

}

ELSE

{

CREATETHREAD(TSUM,ARR[I],ARR[I+1]);

I=I+2;

}

}

}

VOLITLE DWORD SUM=0;

VOID TSUM(DWORD A, DWORD B)

{

ATOMICADD(SUM,A+B)

RELEASESEMAPHORE(FREETHREADCNT)

}

Language: CUDA/C++

Device specs:

- Tesla C2050

- 448 CUDA cores

- 49152 bytes of shared memory per block

- 3072 MB of global memory

- 1024 max threads per block

This implementation is a two-pass reduction kernel as explained in the CUDA Handbook by Jonathan Wilt, there is room for improvement.

Basically, each block performs a summation on the subarray it is allocated. Then, the sums from all the blocks are added by calling the same kernel again.

For an allocation with blocks of 16 threads, these are the reported times:

```
64 x1 array:0.055840 ms.
256x1 array:0.05616 ms
512x1 array:0.056480 ms
1024x1 array:0.056704 ms
4096x1 array:0.076640 ms
8192x1 array:0.098848 ms.
```

For an allocation with blocks of 256 threads, these are the reported times:

```
64x1 array:0.135904 ms
256x1 array:0.064192 ms
512x1 array:0.064064 ms
1024x1 array:0.064512 ms
4096x1 array:0.064928 ms
8192x1 array:0.067456 ms
```

CUDA Code:

```
#include <cuda.h>
#include <iostream>
#include <stdlib.h>
#include <cstdio>
#include <sys/time.h>
#include <omp.h>
#define BLOCK_SIZE 256
#define ARRAY_SIZE 4096
//to handle API errors
static void HandleError( cudaError_t err,
const char *file,
int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//for timing the time to perform the summation
cudaEvent_t event_gpu_start,event_gpu_end;
float gpu (int * numbers_d, int * blocksSums_d, int * answers_d, int size, int numBlocks, int blockSize);
__global__ void reduction_1 (int * numbers, int * blockSums, int blocks);
int main (int argc, char ** argv){
//array to reduce
//CPU data
int * numbers = new int [ARRAY_SIZE];
int * answer = new int [1];
//arrays to hold GPU data
int * answer_d;
int * numbers_d; //copy of data on device
int * blockSums_d; //holds blocks sums using during the second pass
//allocate the array to be summed. assign ints from 0 to size of array
for (int i=0; i < ARRAY_SIZE; i++) {
numbers[i] = i;
}
//allocate gpu memory space and copy the array to be summed
HANDLE_ERROR(cudaMalloc((void**)&answer_d,sizeof(int)));
HANDLE_ERROR(cudaMalloc((void **)&numbers_d,ARRAY_SIZE*sizeof(int)));
HANDLE_ERROR(cudaMemcpy(numbers_d, numbers, ARRAY_SIZE*sizeof(int),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMalloc((void **)&blockSums_d,(ARRAY_SIZE/BLOCK_SIZE)*sizeof(int)));
//display message
printf ("[INFO]:data allocation done\n");
printf ("[INFO]:reduction started\n");
//call function to initialize and launch threads
float gpu_time = gpu (numbers_d, blockSums_d, answer_d, ARRAY_SIZE, ARRAY_SIZE/BLOCK_SIZE,BLOCK_SIZE);
printf ("[INFO]:reduction done\n");
printf ("[INFO]:time to reduce array %ux%u of integers:%3.6f ms.\n[INFO]:with %u blocks of %u threads\n"
,ARRAY_SIZE,1,gpu_time,ARRAY_SIZE/BLOCK_SIZE,BLOCK_SIZE);
HANDLE_ERROR(cudaMemcpy(answer,answer_d, sizeof(int), cudaMemcpyDeviceToHost));
printf ("[INFO]:the sum is %u\n", answer[0]);
HANDLE_ERROR(cudaFree(numbers_d));
HANDLE_ERROR(cudaFree(blockSums_d));
HANDLE_ERROR(cudaFree(answer_d));
}
float gpu (int * numbers_d, int * blockSums_d, int * answer_d, int size, int numBlocks, int blockSize) {
float gpu_time = 0.00;
//start timing the running time of the kernels
HANDLE_ERROR(cudaEventCreate(&event_gpu_start));
HANDLE_ERROR(cudaEventRecord(event_gpu_start));
unsigned int sharedMemorySize = blockSize*sizeof(int);
//first pass: calculate sum of subarrays
reduction_1 <<<numBlocks,blockSize,sharedMemorySize>>>(numbers_d,blockSums_d, size);
cudaDeviceSynchronize();
//second pass: calculate sum of all the sums of subarrays
reduction_1<<<1,blockSize,sharedMemorySize>>>(blockSums_d,answer_d,numBlocks);
HANDLE_ERROR(cudaEventCreate(&event_gpu_end));
HANDLE_ERROR(cudaEventRecord(event_gpu_end));
//calculate the time elapsed
cudaEventSynchronize(event_gpu_end);
cudaEventElapsedTime(&gpu_time, event_gpu_start, event_gpu_end);
return gpu_time;
}
__global__ void reduction_1 (int * numbers, int * blockSums, int size) {
int sum = 0;
extern __shared__ int interim[]; //dynamic shared memory
//for each thread add together whatever it reads
for (int i = blockIdx.x*blockDim.x + threadIdx.x; i < size; i+=blockDim.x*gridDim.x) {
sum+=numbers[i];
}
//store results in a shared array within a block
interim[threadIdx.x]= sum;
//make sure all threads reach this point
__syncthreads();
//now we have reduce the number of active threads per block
//for an array size = 64 and blocksize = 16;
//the first phase we reassign the same value in sum
//for the second pass
//we will use half of the threads to add together partial sums
//so we start with threads = 16/2=8 that add together x and x+8
//then 8/2 = 4 which adds x and x+4
//then 4/2 = 2 which adds x and x+2
//then 2/2 = 1 which adds x and x+1
for (int threads = blockDim.x>>1; threads; threads>>=1) {
if (threadIdx.x < threads) {
interim[threadIdx.x] +=interim[threads+threadIdx.x];
}
__syncthreads();
}
//the first thread of each block writes the sum of the subarray in an output array
if (threadIdx.x == 0) {
blockSums[blockIdx.x] = interim[threadIdx.x];
}
}
```

I don't think a global lock is necessary. Have each thread return a partial sum and store it in a separate array element. Then the main thread sums up partial sums once partial sum-calculating threads are done

- Anonymous October 09, 2013