1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4
5 __device__ void myCopyFunction(void* dest_ptr,
void* src_ptr, size_t size) {
6 char* dest = (char*)dest_ptr;
7 char* src = (char*)src_ptr;
8
9 while(size-- > 0) {
10 *dest++ = *src++;
11 }
12 }
13
14 __global__ void testKernel1(int* arg, void* mem)
15 {
16 int temp = 20;
17 *(void**)mem = (void*)&temp;
18 }
19
20 __global__ void testKernel2(int* arg, void* mem)
21 {
22 myCopyFunction(arg, *(void**)mem, sizeof(int));
23 }
24
25 int main()
26 {
27 int arg;
28 size_t arg_size = sizeof(int);
29 int *d_arg, *h_arg;
30 void* d_mem;
31 size_t mem_size = sizeof(void*);
32
33 arg = 10;
34
35 /* STEP 1: memory allocation for GPU exec. */
36 /* alloc. GPU memory and copy 'arg' value */
37 cudaMalloc(&d_arg, arg_size);
38 cudaMalloc(&d_mem, mem_size);
39 cudaMemcpy(d_arg, &arg, arg_size,
cudaMemcpyHostToDevice);
40
41 /* alloc. CPU memory for the result */
42 h_arg = (int*)malloc(arg_size);
43
44 /* STEP 2: execution */
45 /* exec GPU version */
46 testKernel1<<<1,1>>>(d_arg, d_mem);
47 testKernel2<<<1,1>>>(d_arg, d_mem);
48
49 /* STEP 3: retrieve and check results */
50 /* copy GPU result to CPU memory */
51 cudaMemcpy(h_arg, d_arg, arg_size,
cudaMemcpyDeviceToHost);
52
53 printf("%d\n", *h_arg);
54
55 return 0;
56 }
1 .version 2.1
2 .target sm_20
3 // compiled with /usr/lib//be
4 // nvopencc 3.1 built on 2010-06-07
5
6 .visible .func _Z14myCopyFunctionPvS_m
(.param .u64 __cudaparmf1__Z14myCopyFunctionPvS_m,
.param .u64 __cudaparmf2__Z14myCopyFunctionPvS_m,
.param .u64 __cudaparmf3__Z14myCopyFunctionPvS_m)
7
8 //-----------------------------------------------------------
9 // Compiling test.cpp3.i (/tmp/ccBI#.asnrJg)
10 //-----------------------------------------------------------
11
12 //-----------------------------------------------------------
13 // Options:
14 //-----------------------------------------------------------
15 // Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
16 // -O3 (Optimization level)
17 // -g0 (Debug level)
18 // -m2 (Report advisories)
19 //-----------------------------------------------------------
20
21 // ... content was removed here
22
23 .visible .func _Z14myCopyFunctionPvS_m
(.param .u64 __cudaparmf1__Z14myCopyFunctionPvS_m,
.param .u64 __cudaparmf2__Z14myCopyFunctionPvS_m,
.param .u64 __cudaparmf3__Z14myCopyFunctionPvS_m)
24 {
25 .reg .u32 %r<3>;
26 .reg .u64 %rd<15>;
27 .reg .pred %p<4>;
28 .loc 17 5 0
29 $LDWbegin__Z14myCopyFunctionPvS_m:
30 ld.param.u64 %rd1, [__cudaparmf1__Z14myCopyFunctionPvS_m];
31 mov.s64 %rd2, %rd1;
32 ld.param.u64 %rd3, [__cudaparmf2__Z14myCopyFunctionPvS_m];
33 mov.s64 %rd4, %rd3;
34 ld.param.u64 %rd5, [__cudaparmf3__Z14myCopyFunctionPvS_m];
35 mov.s64 %rd6, %rd5;
36 .loc 17 6 0
37 mov.s64 %rd7, %rd2;
38 .loc 17 7 0
39 mov.s64 %rd8, %rd4;
40 sub.u64 %rd9, %rd6, 1;
41 mov.u64 %rd10, -1;
42 setp.eq.u64 %p1, %rd9, %rd10;
43 @%p1 bra $Lt_0_1282;
44 mov.s64 %rd11, %rd6;
45 mov.s64 %rd12, %rd11;
46 $Lt_0_1794:
47 //<loop> Loop body line 7, nesting depth: 1,
estimated iterations: unknown
48 .loc 17 10 0
49 add.u64 %rd8, %rd8, 1;
50 add.u64 %rd7, %rd7, 1;
51 ld.s8 %r1, [%rd8+-1];
52 st.s8 [%rd7+-1], %r1;
53 sub.u64 %rd9, %rd9, 1;
54 mov.u64 %rd13, -1;
55 setp.ne.u64 %p2, %rd9, %rd13;
56 @%p2 bra $Lt_0_1794;
57 $Lt_0_1282:
58 .loc 17 12 0
59 ret;
60 $LDWend__Z14myCopyFunctionPvS_m:
61 } // _Z14myCopyFunctionPvS_m
62
63 .entry _Z11testKernel1PiPv (
64 .param .u64 __cudaparm__Z11testKernel1PiPv_arg,
65 .param .u64 __cudaparm__Z11testKernel1PiPv_mem)
66 {
67 .reg .u64 %rd<4>;
68 .local .s32 __cuda_local_var_24173_6_temp_0;
69 .loc 17 14 0
70 $LDWbegin__Z11testKernel1PiPv:
71 .loc 17 17 0
72 mov.u64 %rd1, __cuda_local_var_24173_6_temp_0;
73 ld.param.u64 %rd2, [__cudaparm__Z11testKernel1PiPv_mem];
74 st.global.u64 [%rd2+0], %rd1;
75 .loc 17 18 0
76 exit;
77 $LDWend__Z11testKernel1PiPv:
78 } // _Z11testKernel1PiPv
79
80 .entry _Z11testKernel2PiPv (
81 .param .u64 __cudaparm__Z11testKernel2PiPv_arg,
82 .param .u64 __cudaparm__Z11testKernel2PiPv_mem)
83 {
84 .reg .u32 %r<3>;
85 .reg .u64 %rd<7>;
86 .reg .pred %p<3>;
87 .loc 17 20 0
88 $LDWbegin__Z11testKernel2PiPv:
89 .loc 17 6 0
90 ld.param.u64 %rd1,
[__cudaparm__Z11testKernel2PiPv_arg];
91 .loc 17 7 0
92 ld.param.u64 %rd2,
[__cudaparm__Z11testKernel2PiPv_mem];
93 ldu.global.u64 %rd3, [%rd2+0];
94 mov.s64 %rd4, 3;
95 $Lt_2_1794:
96 //<loop> Loop body line 7, nesting depth: 1,
estimated iterations: unknown
97 .loc 17 10 0
98 add.u64 %rd3, %rd3, 1;
99 add.u64 %rd1, %rd1, 1;
100 ld.s8 %r1, [%rd3+-1];
101 st.global.s8 [%rd1+-1], %r1;
102 .loc 17 9 0
103 sub.u64 %rd4, %rd4, 1;
104 mov.u64 %rd5, -1;
105 setp.ne.u64 %p1, %rd4, %rd5;
106 @%p1 bra $Lt_2_1794;
107 .loc 17 23 0
108 exit;
109 $LDWend__Z11testKernel2PiPv:
110 } // _Z11testKernel2PiPv