1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4
5 __device__ void myCopyFunction(void* dest_ptr,
void* src_ptr, size_t size) {
6 char* dest = (char*)dest_ptr;
7 char* src = (char*)src_ptr;
8
9 while(size-- > 0) {
10 *dest++ = *src++;
11 }
12 }
13
14 __global__ void testKernel1(int* arg, void* mem)
15 {
16 int temp = 20;
17 *(void**)mem = (void*)&temp;
18 }
19
20 __global__ void testKernel2(int* arg, void* mem)
21 {
22 myCopyFunction(arg, *(void**)mem, sizeof(int));
23 }
24
25 int main()
26 {
27 int arg;
28 size_t arg_size = sizeof(int);
29 int *d_arg, *h_arg;
30 void* d_mem;
31 size_t mem_size = sizeof(void*);
32
33 arg = 10;
34
35 /* STEP 1: memory allocation for GPU exec. */
36 /* alloc. GPU memory and copy 'arg' value */
37 cudaMalloc(&d_arg, arg_size);
38 cudaMalloc(&d_mem, mem_size);
39 cudaMemcpy(d_arg, &arg, arg_size,
cudaMemcpyHostToDevice);
40
41 /* alloc. CPU memory for the result */
42 h_arg = (int*)malloc(arg_size);
43
44 /* STEP 2: execution */
45 /* exec GPU version */
46 testKernel1<<<1,1>>>(d_arg, d_mem);
47 testKernel2<<<1,1>>>(d_arg, d_mem);
48
49 /* STEP 3: retrieve and check results */
50 /* copy GPU result to CPU memory */
51 cudaMemcpy(h_arg, d_arg, arg_size,
cudaMemcpyDeviceToHost);
52
53 printf("%d\n", *h_arg);
54
55 return 0;
56 }
1 .version 1.4
2 .target sm_10, map_f64_to_f32
3 // compiled with /usr/lib//be
4 // nvopencc 3.1 built on 2010-06-07
5
6 //-----------------------------------------------------------
7 // Compiling test.cpp3.i (/tmp/ccBI#.KR1vMQ)
8 //-----------------------------------------------------------
9
10 //-----------------------------------------------------------
11 // Options:
12 //-----------------------------------------------------------
13 // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
14 // -O3 (Optimization level)
15 // -g0 (Debug level)
16 // -m2 (Report advisories)
17 //-----------------------------------------------------------
18
19 // ... content was removed here ...
20
21 .entry _Z11testKernel1PiPv (
22 .param .u64 __cudaparm__Z11testKernel1PiPv_arg,
23 .param .u64 __cudaparm__Z11testKernel1PiPv_mem)
24 {
25 .reg .u64 %rd<4>;
26 .local .s32 __cuda_local_var_22192_6_temp_0;
27 .loc 17 14 0
28 $LDWbegin__Z11testKernel1PiPv:
29 .loc 17 17 0
30 mov.u64 %rd1, __cuda_local_var_22192_6_temp_0;
31 ld.param.u64 %rd2, [__cudaparm__Z11testKernel1PiPv_mem];
32 st.global.u64 [%rd2+0], %rd1;
33 .loc 17 18 0
34 exit;
35 $LDWend__Z11testKernel1PiPv:
36 } // _Z11testKernel1PiPv
37
38 .entry _Z11testKernel2PiPv (
39 .param .u64 __cudaparm__Z11testKernel2PiPv_arg,
40 .param .u64 __cudaparm__Z11testKernel2PiPv_mem)
41 {
42 .reg .u16 %rh<3>;
43 .reg .u64 %rd<7>;
44 .reg .pred %p<3>;
45 .loc 17 20 0
46 $LDWbegin__Z11testKernel2PiPv:
47 .loc 17 6 0
48 ld.param.u64 %rd1,
[__cudaparm__Z11testKernel2PiPv_arg];
49 .loc 17 7 0
50 ld.param.u64 %rd2,
[__cudaparm__Z11testKernel2PiPv_mem];
51 ld.global.u64 %rd3, [%rd2+0];
52 mov.s64 %rd4, 3;
53 $Lt_1_1794:
54 //<loop> Loop body line 7, nesting depth: 1,
estimated iterations: unknown
55 .loc 17 10 0
56 add.u64 %rd3, %rd3, 1;
57 add.u64 %rd1, %rd1, 1;
58 ld.global.s8 %rh1, [%rd3+-1];
59 st.global.s8 [%rd1+-1], %rh1;
60 .loc 17 9 0
61 sub.u64 %rd4, %rd4, 1;
62 mov.u64 %rd5, -1;
63 setp.ne.u64 %p1, %rd4, %rd5;
64 @%p1 bra $Lt_1_1794;
65 .loc 17 23 0
66 exit;
67 $LDWend__Z11testKernel2PiPv:
68 } // _Z11testKernel2PiPv