Appendix B

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <string.h>
 4 
 5 __device__ void myCopyFunction(void* dest_ptr,
void* src_ptr, size_t size) { 6 char* dest = (char*)dest_ptr; 7 char* src = (char*)src_ptr; 8 9 while(size-- > 0) { 10 *dest++ = *src++; 11 } 12 } 13 14 __global__ void testKernel1(int* arg, void* mem) 15 { 16 int temp = 20; 17 *(void**)mem = (void*)&temp; 18 } 19 20 __global__ void testKernel2(int* arg, void* mem) 21 { 22 myCopyFunction(arg, *(void**)mem, sizeof(int)); 23 } 24 25 int main() 26 { 27 int arg; 28 size_t arg_size = sizeof(int); 29 int *d_arg, *h_arg; 30 void* d_mem; 31 size_t mem_size = sizeof(void*); 32 33 arg = 10; 34 35 /* STEP 1: memory allocation for GPU exec. */ 36 /* alloc. GPU memory and copy 'arg' value */ 37 cudaMalloc(&d_arg, arg_size); 38 cudaMalloc(&d_mem, mem_size); 39 cudaMemcpy(d_arg, &arg, arg_size,
cudaMemcpyHostToDevice); 40 41 /* alloc. CPU memory for the result */ 42 h_arg = (int*)malloc(arg_size); 43 44 /* STEP 2: execution */ 45 /* exec GPU version */ 46 testKernel1<<<1,1>>>(d_arg, d_mem); 47 testKernel2<<<1,1>>>(d_arg, d_mem); 48 49 /* STEP 3: retrieve and check results */ 50 /* copy GPU result to CPU memory */ 51 cudaMemcpy(h_arg, d_arg, arg_size,
cudaMemcpyDeviceToHost); 52 53 printf("%d\n", *h_arg); 54 55 return 0; 56 }
  1 	.version 2.1
  2 	.target sm_20
  3 	// compiled with /usr/lib//be
  4 	// nvopencc 3.1 built on 2010-06-07
  5 
  6 	.visible .func _Z14myCopyFunctionPvS_m
(.param .u64 __cudaparmf1__Z14myCopyFunctionPvS_m,
.param .u64 __cudaparmf2__Z14myCopyFunctionPvS_m,
.param .u64 __cudaparmf3__Z14myCopyFunctionPvS_m) 7 8 //----------------------------------------------------------- 9 // Compiling test.cpp3.i (/tmp/ccBI#.asnrJg) 10 //----------------------------------------------------------- 11 12 //----------------------------------------------------------- 13 // Options: 14 //----------------------------------------------------------- 15 // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64 16 // -O3 (Optimization level) 17 // -g0 (Debug level) 18 // -m2 (Report advisories) 19 //----------------------------------------------------------- 20 21 // ... content was removed here 22 23 .visible .func _Z14myCopyFunctionPvS_m
(.param .u64 __cudaparmf1__Z14myCopyFunctionPvS_m,
.param .u64 __cudaparmf2__Z14myCopyFunctionPvS_m,
.param .u64 __cudaparmf3__Z14myCopyFunctionPvS_m) 24 { 25 .reg .u32 %r<3>; 26 .reg .u64 %rd<15>; 27 .reg .pred %p<4>; 28 .loc 17 5 0 29 $LDWbegin__Z14myCopyFunctionPvS_m: 30 ld.param.u64 %rd1, [__cudaparmf1__Z14myCopyFunctionPvS_m]; 31 mov.s64 %rd2, %rd1; 32 ld.param.u64 %rd3, [__cudaparmf2__Z14myCopyFunctionPvS_m]; 33 mov.s64 %rd4, %rd3; 34 ld.param.u64 %rd5, [__cudaparmf3__Z14myCopyFunctionPvS_m]; 35 mov.s64 %rd6, %rd5; 36 .loc 17 6 0 37 mov.s64 %rd7, %rd2; 38 .loc 17 7 0 39 mov.s64 %rd8, %rd4; 40 sub.u64 %rd9, %rd6, 1; 41 mov.u64 %rd10, -1; 42 setp.eq.u64 %p1, %rd9, %rd10; 43 @%p1 bra $Lt_0_1282; 44 mov.s64 %rd11, %rd6; 45 mov.s64 %rd12, %rd11; 46 $Lt_0_1794: 47 //<loop> Loop body line 7, nesting depth: 1,
estimated iterations: unknown 48 .loc 17 10 0 49 add.u64 %rd8, %rd8, 1; 50 add.u64 %rd7, %rd7, 1; 51 ld.s8 %r1, [%rd8+-1]; 52 st.s8 [%rd7+-1], %r1; 53 sub.u64 %rd9, %rd9, 1; 54 mov.u64 %rd13, -1; 55 setp.ne.u64 %p2, %rd9, %rd13; 56 @%p2 bra $Lt_0_1794; 57 $Lt_0_1282: 58 .loc 17 12 0 59 ret; 60 $LDWend__Z14myCopyFunctionPvS_m: 61 } // _Z14myCopyFunctionPvS_m 62 63 .entry _Z11testKernel1PiPv ( 64 .param .u64 __cudaparm__Z11testKernel1PiPv_arg, 65 .param .u64 __cudaparm__Z11testKernel1PiPv_mem) 66 { 67 .reg .u64 %rd<4>; 68 .local .s32 __cuda_local_var_24173_6_temp_0; 69 .loc 17 14 0 70 $LDWbegin__Z11testKernel1PiPv: 71 .loc 17 17 0 72 mov.u64 %rd1, __cuda_local_var_24173_6_temp_0; 73 ld.param.u64 %rd2, [__cudaparm__Z11testKernel1PiPv_mem]; 74 st.global.u64 [%rd2+0], %rd1; 75 .loc 17 18 0 76 exit; 77 $LDWend__Z11testKernel1PiPv: 78 } // _Z11testKernel1PiPv 79 80 .entry _Z11testKernel2PiPv ( 81 .param .u64 __cudaparm__Z11testKernel2PiPv_arg, 82 .param .u64 __cudaparm__Z11testKernel2PiPv_mem) 83 { 84 .reg .u32 %r<3>; 85 .reg .u64 %rd<7>; 86 .reg .pred %p<3>; 87 .loc 17 20 0 88 $LDWbegin__Z11testKernel2PiPv: 89 .loc 17 6 0 90 ld.param.u64 %rd1, [__cudaparm__Z11testKernel2PiPv_arg]; 91 .loc 17 7 0 92 ld.param.u64 %rd2, [__cudaparm__Z11testKernel2PiPv_mem]; 93 ldu.global.u64 %rd3, [%rd2+0]; 94 mov.s64 %rd4, 3; 95 $Lt_2_1794: 96 //<loop> Loop body line 7, nesting depth: 1,
estimated iterations: unknown 97 .loc 17 10 0 98 add.u64 %rd3, %rd3, 1; 99 add.u64 %rd1, %rd1, 1; 100 ld.s8 %r1, [%rd3+-1]; 101 st.global.s8 [%rd1+-1], %r1; 102 .loc 17 9 0 103 sub.u64 %rd4, %rd4, 1; 104 mov.u64 %rd5, -1; 105 setp.ne.u64 %p1, %rd4, %rd5; 106 @%p1 bra $Lt_2_1794; 107 .loc 17 23 0 108 exit; 109 $LDWend__Z11testKernel2PiPv: 110 } // _Z11testKernel2PiPv
xhtml valid? | css valid? | last update on September 2010