Appendix A

 1 #include <stdio.h>
 2 #include <stdlib.h>
 3 #include <string.h>
 4 
 5 __device__ void myCopyFunction(void* dest_ptr,
       void* src_ptr, size_t size) { /* Device helper: copies `size` bytes from src_ptr to dest_ptr. */
 6   char* dest = (char*)dest_ptr; /* both buffers are walked as plain bytes */
 7   char* src = (char*)src_ptr;
 8   /* Byte-at-a-time copy: the post-decrement test runs the body exactly `size` times. */
 9   while(size-- > 0) {
10     *dest++ = *src++;
11   }
12 }
13 /* testKernel1: stores the address of its own local variable `temp` into the pointer-sized slot at `mem`; `arg` is unused. */
14 __global__ void testKernel1(int* arg, void* mem)
15 {
16   int temp = 20;
17   *(void**)mem = (void*)&temp; /* the address of a kernel-local variable escapes here and outlives `temp` */
18 }
19 /* testKernel2: reads the pointer stored at `mem` and copies sizeof(int) bytes from that address into *arg. */
20 __global__ void testKernel2(int* arg, void* mem)
21 {
22   myCopyFunction(arg, *(void**)mem, sizeof(int)); /* NOTE(review): when `mem` was filled by testKernel1, the source address belongs to a local of a finished kernel; this looks like the point of the listing -- confirm */
23 }
24 /* Host driver: seeds *d_arg with 10, runs testKernel1 then testKernel2 on the same buffers, and prints the resulting int. */
25 int main()
26 {
27   int arg;
28   size_t arg_size = sizeof(int);
29   int *d_arg, *h_arg;
30   void* d_mem; /* device slot that holds one pointer (see mem_size) */
31   size_t mem_size = sizeof(void*);
32
33   arg = 10; /* initial value of *d_arg; testKernel2 overwrites it */
34
35   /* STEP 1: memory allocation for GPU exec. */
36   /* alloc. GPU memory and copy 'arg' value */
37   cudaMalloc(&d_arg, arg_size); /* NOTE(review): no CUDA return value is checked anywhere in main */
38   cudaMalloc(&d_mem, mem_size);
39   cudaMemcpy(d_arg, &arg, arg_size,
       cudaMemcpyHostToDevice);
40
41   /* alloc. CPU memory for the result */
42   h_arg = (int*)malloc(arg_size); /* NOTE(review): result of malloc is not checked */
43
44   /* STEP 2: execution */
45   /* exec GPU version */
46   testKernel1<<<1,1>>>(d_arg, d_mem); /* one block of one thread: publishes &temp into *d_mem */
47   testKernel2<<<1,1>>>(d_arg, d_mem); /* one block of one thread: copies 4 bytes from that address into *d_arg */
48
49   /* STEP 3: retrieve and check results */
50   /* copy GPU result to CPU memory */
51   cudaMemcpy(h_arg, d_arg, arg_size,
       cudaMemcpyDeviceToHost);
52
53   printf("%d\n", *h_arg); /* prints whatever testKernel2 read through the stored address */
54   /* NOTE(review): d_arg, d_mem and h_arg are never released before returning. */
55   return 0;
56 }
 1 	.version 1.4
 2 	.target sm_10, map_f64_to_f32
 3 	// compiled with /usr/lib//be
 4 	// nvopencc 3.1 built on 2010-06-07
 5 
 6 	//-----------------------------------------------------------
 7 	// Compiling test.cpp3.i (/tmp/ccBI#.KR1vMQ)
 8 	//-----------------------------------------------------------
 9 
10 	//-----------------------------------------------------------
11 	// Options:
12 	//-----------------------------------------------------------
13 	//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
14 	//  -O3	(Optimization level)
15 	//  -g0	(Debug level)
16 	//  -m2	(Report advisories)
17 	//-----------------------------------------------------------
18 
19 	// ... content was removed here ...
20 
21 	.entry _Z11testKernel1PiPv (	// PTX entry for testKernel1(int*, void*)
22 		.param .u64 __cudaparm__Z11testKernel1PiPv_arg,	// `arg`; never read in this entry
23 		.param .u64 __cudaparm__Z11testKernel1PiPv_mem)	// `mem`
24 	{
25 	.reg .u64 %rd<4>;
26 	.local .s32 __cuda_local_var_22192_6_temp_0;	// `temp`, declared in the .local state space; no instruction below stores 20 into it
27 	.loc	17	14	0	// presumably line 14 of the CUDA listing (testKernel1 signature)
28 $LDWbegin__Z11testKernel1PiPv:
29 	.loc	17	17	0	// presumably line 17: *(void**)mem = (void*)&temp;
30 	mov.u64 	%rd1, __cuda_local_var_22192_6_temp_0;	// %rd1 = address of the .local variable
31 	ld.param.u64 	%rd2, [__cudaparm__Z11testKernel1PiPv_mem];	// %rd2 = mem
32 	st.global.u64 	[%rd2+0], %rd1;	// write that 64-bit address to [mem] through the .global state space
33 	.loc	17	18	0
34 	exit;
35 $LDWend__Z11testKernel1PiPv:
36 	} // _Z11testKernel1PiPv
37 
38 	.entry _Z11testKernel2PiPv (	// PTX entry for testKernel2(int*, void*); the copy loop appears inline, no call instruction is emitted
39 		.param .u64 __cudaparm__Z11testKernel2PiPv_arg,
40 		.param .u64 __cudaparm__Z11testKernel2PiPv_mem)
41 	{
42 	.reg .u16 %rh<3>;
43 	.reg .u64 %rd<7>;
44 	.reg .pred %p<3>;
45 	.loc	17	20	0	// presumably line 20 of the CUDA listing (testKernel2 signature)
46 $LDWbegin__Z11testKernel2PiPv:
47 	.loc	17	6	0
48 	ld.param.u64 	%rd1,
[__cudaparm__Z11testKernel2PiPv_arg];	// %rd1 = arg, the destination cursor
49 	.loc	17	7	0
50 	ld.param.u64 	%rd2,
[__cudaparm__Z11testKernel2PiPv_mem];	// %rd2 = mem
51 	ld.global.u64 	%rd3, [%rd2+0];	// %rd3 = pointer stored at [mem], the source cursor
52 	mov.s64 	%rd4, 3;	// loop counter; presumably size - 1 with size = sizeof(int) = 4
53 $Lt_1_1794:
54  //<loop> Loop body line 7, nesting depth: 1,
estimated iterations: unknown	// (wrapped tail of the compiler comment on listing line 54)
55 	.loc	17	10	0	// presumably line 10: *dest++ = *src++;
56 	add.u64 	%rd3, %rd3, 1;	// advance the source cursor first ...
57 	add.u64 	%rd1, %rd1, 1;	// ... and the destination cursor
58 	ld.global.s8 	%rh1, [%rd3+-1];	// read one source byte through the .global state space, at the pre-increment address
59 	st.global.s8 	[%rd1+-1], %rh1;	// write it to the destination byte
60 	.loc	17	9	0	// presumably line 9: while(size-- > 0)
61 	sub.u64 	%rd4, %rd4, 1;
62 	mov.u64 	%rd5, -1;
63 	setp.ne.u64 	%p1, %rd4, %rd5;	// keep looping until the counter wraps to -1
64 	@%p1 bra 	$Lt_1_1794;	// counter goes 3, 2, 1, 0, so the body runs four times
65 	.loc	17	23	0
66 	exit;
67 $LDWend__Z11testKernel2PiPv:
68 	} // _Z11testKernel2PiPv
xhtml valid? | css valid? | last update on September 2010