1 /*
2 // generate cuda_* version of all v_array functions
3 g++ -E -DCUDA_DEVICE_MODE "path_to_v_array"/v_mem_manager.c > cuda_v_mem_manager.c
4 g++ -E -DCUDA_DEVICE_MODE "path_to_v_array"/v_array.c > cuda_v_array.c
5 // compile
6 nvcc -arch sm_11 -Xcompiler "-fpermissive" -I"path_to_v_array" -L"path_to_v_array" -lv_array -o example example.cu
7 */
8
9 /* for printf, malloc, memcpy and memcmp */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 /* for v_mem_manager_* and v_array_* functions */
14 #include <v_mem_manager.h>
15 #include <v_array.h>
16
17 /* host wrapper functions using OS alloc. func. */
18 __host__ void standard_malloc_func(void** ptr, size_t size, void* user_data) {
19 *ptr = malloc(size);
20 }
21 __host__ void standard_memcpy_func(void* dest_ptr, void* src_ptr, size_t size) {
22 memcpy(dest_ptr, src_ptr, size);
23 }
24 __host__ void standard_free_func(void* ptr, void* user_data) {
25 free(ptr);
26 }
27
28 /* host wrapper functions using CUDA runtime */
29 __host__ void cuda_malloc_func(void** ptr, size_t size, void* user_data) {
30 cudaMalloc(ptr, size);
31 }
32 __host__ void cuda_memcpyH2D_func(void* dest_ptr, void* src_ptr, size_t size) {
33 cudaMemcpy(dest_ptr, src_ptr, size, cudaMemcpyHostToDevice);
34 }
35 __host__ void cuda_memcpyD2H_func(void* dest_ptr, void* src_ptr, size_t size) {
36 cudaMemcpy(dest_ptr, src_ptr, size, cudaMemcpyDeviceToHost);
37 }
38 __host__ void cuda_free_func(void* ptr, void* user_data) {
39 cudaFree(ptr);
40 }
41
42 __device__ void cuda_mem_malloc_func(void**, size_t, void*);
43 __device__ void cuda_mem_memcpy_func(void*, void*, size_t);
44 __device__ void cuda_mem_free_func(void*, void*);
45
46 /* The following workaround is used because CUDA code cannot call a __device__ function from another file */
47 #include "cuda_v_mem_manager.c"
48 #include "cuda_v_array.c"
49
50 /* device wrapper functions using memory manager */
51 __device__ void cuda_mem_malloc_func(void** ptr, size_t size, void* user_data) {
52 *ptr = cuda_v_mem_manager_allocate_chunk((v_mem_manager_data_t*)user_data);
53 }
54 __device__ void cuda_mem_memcpy_func(void* dest_ptr, void* src_ptr, size_t size) {
55 char* dest = (char*)dest_ptr;
56 char* src = (char*)src_ptr;
57
58 while(size-- > 0) {
59 *dest++ = *src++;
60 }
61 }
62 __device__ void cuda_mem_free_func(void* ptr, void* user_data) {
63 cuda_v_mem_manager_free_chunk((v_mem_manager_data_t*)user_data, ptr);
64 }
65
66 /* CUDA kernel */
67 __global__ void exampleKernel(v_mem_manager_data_t* mem, unsigned int array_row_num, V_ARRAY_OFFSET_T* result)
68 {
69 v_array_t a1, a2;
70 V_ARRAY_OFFSET_T offset_a1, offset_a2;
71 v_array_iter_t i1, i2;
72 int i, value;
73
74 /* multiple threads cannot safely access a critical section */
75 if(threadIdx.x == 0) {
76 /* create a new array elements */
77 a1 = cuda_v_array_new_elem(1, V_ARRAY_FALSE, sizeof(int), array_row_num, NULL, mem);
78 a2 = cuda_v_array_new_elem(2, V_ARRAY_FALSE, sizeof(int), array_row_num, NULL, mem);
79
80 /* compute offsets from pointers */
81 offset_a1 = cuda_v_array_get_offset(mem, a1);
82 offset_a2 = cuda_v_array_get_offset(mem, a2);
83
84 /* create iterators */
85 i1.offset = offset_a1;
86 i1.idx = 0;
87 i2.offset = offset_a2;
88 i2.idx = 0;
89
90 /* insert some data */
91 for(i=0; i<2; i++) {
92 value = i + blockIdx.x;
93 i1 = cuda_v_array_append_data(mem, i1, &value, NULL, NULL);
94 i2 = cuda_v_array_append_data(mem, i2, &value, NULL, NULL);
95 }
96
97 /* append a2 to a1 */
98 i1.offset = offset_a1;
99 i1.idx = 0;
100 cuda_v_array_append_copy(mem, i1, a2, NULL, NULL);
101
102 /* store array offset for host */
103 result[blockIdx.x] = offset_a1;
104 }
105 }
106
107 int main()
108 {
109 unsigned int i, block_num = 1, mem_chunk_num = 10, array_row_num = 3;
110 size_t mem_chunk_size = sizeof(v_array_elem_t) + (sizeof(int)*array_row_num);
111 size_t result_size = block_num * sizeof(V_ARRAY_OFFSET_T);
112 v_mem_manager_data_t *d_mem;
113 V_ARRAY_OFFSET_T *d_result, *h_result;
114 v_array_iter_t iter;
115
116 /* STEP 1: memory allocation */
117 /* create GPU memory manager from CPU */
118 d_mem = v_mem_manager_new_data(mem_chunk_num, mem_chunk_size, &cuda_malloc_func, NULL, &cuda_memcpyH2D_func);
119 /* allocate GPU result array */
120 cuda_malloc_func((void**)&d_result, result_size, NULL);
121 /* allocate CPU result array */
122 standard_malloc_func((void**)&h_result, result_size, NULL);
123
124 /* STEP 2: execution */
125 /* exec GPU version */
126 exampleKernel<<<block_num, 1>>>(d_mem, array_row_num, d_result);
127
128 /* STEP 3: retrieve results and display them */
129 /* copy GPU result to CPU memory */
130 cuda_memcpyD2H_func(h_result, d_result, result_size);
131 /* print arrays */
132 for(i=0; i<block_num; i++) {
133 /* copy GPU array to CPU memory */
134 v_array_t h_array = v_array_pre_copy(d_mem, NULL, h_result[i], standard_malloc_func, cuda_memcpyD2H_func);
135 /* print array's content */
136 iter.offset = (V_ARRAY_OFFSET_T)h_array;
137 iter.idx = 0;
138 printf("%d [", i);
139 while(v_array_iter_inside_right(NULL, iter)) {
140 iter = v_array_get_row(NULL, iter);
141 printf("%d ", *(int*)iter.result.data);
142 iter = v_array_iter_next(iter);
143 }
144 printf("]\n");
145 /* desallocate host copy */
146 v_array_free(NULL, (V_ARRAY_OFFSET_T)h_array, standard_free_func, standard_memcpy_func);
147 }
148
149 /* STEP 4: memory desallocation */
150 cuda_free_func(d_mem, NULL);
151 cuda_free_func(d_result, NULL);
152 standard_free_func(h_result, NULL);
153
154 return 0;
155 }