admin管理员组文章数量:1025292
nvcc not recognizing a kernel on Windows 11 with cuda 12.5 (I can compile other *.cu file fine with powershell)
nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Kernel from the question. Each thread splits threadIdx.x into
// x = threadIdx.x / 4 and y = threadIdx.x % 4, then stores x into A and y into B
// at element offset x * row + y. Only threadIdx.x is used and no bounds check
// is performed.
// NOTE(review): `uint` is not declared by any header included in this listing;
// the diagnostics quoted after the listing report it as undefined.
__global__ void kernel(uint *A, uint *B, int row) {
// The divisor and modulus are the literal 4, independent of `row`.
auto x = threadIdx.x / 4;
auto y = threadIdx.x % 4;
A[x * row + y] = x;
B[x * row + y] = y;
}
// Host driver from the question. Allocates two SIZE*SIZE uint buffers on the
// host and two on the device, launches `kernel` with one block of 16 threads,
// copies both device buffers back and prints every element as "[x|y] ", one
// matrix row per output line. argc and argv are not used.
// None of the malloc / CUDA runtime return values are checked.
int main(int argc, char **argv) {
uint *Xs, *Ys;
uint *Xs_d, *Ys_d;
uint SIZE = 4;
// Host-side result buffers.
Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));
// Device-side output buffers written by the kernel.
cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));
// One block of 4 * 4 = 16 threads; the row stride passed to the kernel is 4.
dim3 grid_size(1, 1, 1);
dim3 block_size(4 * 4);
kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);
cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
// The explicit synchronize is issued after both copies, not before them.
cudaDeviceSynchronize();
// `row` and `col` are int while SIZE is uint, so the loop conditions compare
// signed against unsigned values.
for (int row = 0; row < SIZE; ++row) {
for (int col = 0; col < SIZE; ++col) {
std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
<< "] ";
}
std::cout << "\n";
}
// Release device buffers, then host buffers.
cudaFree(Xs_d);
cudaFree(Ys_d);
free(Xs);
free(Ys);
}
results in
PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
nvcc not recognizing a kernel on Windows 11 with cuda 12.5 (I can compile other *.cu file fine with powershell)
nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Kernel from the question. Each thread splits threadIdx.x into
// x = threadIdx.x / 4 and y = threadIdx.x % 4, then stores x into A and y into B
// at element offset x * row + y. Only threadIdx.x is used and no bounds check
// is performed.
// NOTE(review): `uint` is not declared by any header included in this listing;
// the diagnostics quoted after the listing report it as undefined.
__global__ void kernel(uint *A, uint *B, int row) {
// The divisor and modulus are the literal 4, independent of `row`.
auto x = threadIdx.x / 4;
auto y = threadIdx.x % 4;
A[x * row + y] = x;
B[x * row + y] = y;
}
// Host driver from the question. Allocates two SIZE*SIZE uint buffers on the
// host and two on the device, launches `kernel` with one block of 16 threads,
// copies both device buffers back and prints every element as "[x|y] ", one
// matrix row per output line. argc and argv are not used.
// None of the malloc / CUDA runtime return values are checked.
int main(int argc, char **argv) {
uint *Xs, *Ys;
uint *Xs_d, *Ys_d;
uint SIZE = 4;
// Host-side result buffers.
Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));
// Device-side output buffers written by the kernel.
cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));
// One block of 4 * 4 = 16 threads; the row stride passed to the kernel is 4.
dim3 grid_size(1, 1, 1);
dim3 block_size(4 * 4);
kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);
cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
// The explicit synchronize is issued after both copies, not before them.
cudaDeviceSynchronize();
// `row` and `col` are int while SIZE is uint, so the loop conditions compare
// signed against unsigned values.
for (int row = 0; row < SIZE; ++row) {
for (int col = 0; col < SIZE; ++col) {
std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
<< "] ";
}
std::cout << "\n";
}
// Release device buffers, then host buffers.
cudaFree(Xs_d);
cudaFree(Ys_d);
free(Xs);
free(Ys);
}
results in
PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
Share
Improve this question
asked Nov 18, 2024 at 13:40
He HuangHe Huang
231 silver badge3 bronze badges
3
|
1 Answer
Reset to default 1
From your error message I can see that you are running on Windows.
`uint` is not a standard C++ type: on Linux the system headers happen to define it, but on Windows it is not defined.
You can fix this easily by using `unsigned` instead, or by defining `uint` yourself. The following fix will make your code compile and run correctly.
Define `uint` as needed (note that `unsigned` is shorthand for `unsigned int`, which is 32 bits wide on the platforms CUDA supports, i.e. the same size as `uint32_t`):
// Declares `uint` as an alias for `unsigned` (i.e. unsigned int).
// NOTE(review): #ifndef only tests whether a preprocessor macro named `uint`
// exists; it does not detect an existing typedef, so unless such a macro is
// defined the typedef below is always emitted.
#ifndef uint
typedef unsigned uint;
#endif
The problem manifests this way because the nvcc preprocessor cannot translate the `__global__`, `__device__`, etc. qualifiers in code that has faulty syntax. If there are (too many) syntax errors, the nvcc system will just feed the code into the C++ compiler as is, without processing the CUDA-specific extensions. That compiler then fails on those extensions, resulting in the errors that you see.
In general, when you see a lot of errors relating to CUDA constructs, the odds are high that a simple error somewhere else (here, the undefined type `uint`) is tripping up the nvcc preprocessor.
The full code (see below) now compiles and runs correctly, outputting the correct result.
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Declares `uint` as an alias for `unsigned` (i.e. unsigned int) so that the
// kernel and main below can use it.
// NOTE(review): #ifndef only tests whether a preprocessor macro named `uint`
// exists; it does not detect an existing typedef, so unless such a macro is
// defined the typedef below is always emitted.
#ifndef uint
typedef unsigned uint;
#endif
/**
 * Writes each thread's decoded (x, y) pair into two output arrays.
 *
 * threadIdx.x is split with a fixed width of 4: the quotient is x and the
 * remainder is y. The value x is stored in A and y in B, both at element
 * offset x * row + y. Only threadIdx.x is consulted (blockIdx is not), and no
 * bounds check is made, so the caller must size A and B to cover every offset
 * the launched threads produce.
 *
 * @param A   output array that receives x
 * @param B   output array that receives y
 * @param row stride, in elements, between consecutive x values
 */
__global__ void kernel(uint* A, uint* B, int row) {
    const unsigned int tid = threadIdx.x;
    const unsigned int quotient = tid / 4;
    const unsigned int remainder = tid % 4;
    // Same arithmetic as x * row + y: `row` is converted to unsigned here.
    const unsigned int offset = quotient * row + remainder;
    A[offset] = quotient;
    B[offset] = remainder;
}
/**
 * Runs `kernel` over a 4x4 matrix and prints the result.
 *
 * Allocates two SIZE*SIZE device buffers, launches one block of SIZE*SIZE
 * threads, copies both buffers back to the host and prints each element as
 * "[x|y] ", one matrix row per output line.
 *
 * Every CUDA runtime call and the kernel launch are checked. On the first
 * failure a message is written to stderr, the matrix is not printed, the
 * device buffers are still released, and the process exits with status 1.
 *
 * @param argc unused
 * @param argv unused
 * @return 0 on success, 1 if any CUDA call failed
 */
int main(int argc, char** argv) {
    (void)argc;  // command-line arguments are not used
    (void)argv;

    constexpr uint SIZE = 4;
    // kernel() decodes threadIdx.x with a hard-coded width of 4.
    static_assert(SIZE == 4, "kernel() assumes a row width of 4");
    constexpr size_t COUNT = static_cast<size_t>(SIZE) * SIZE;
    constexpr size_t BYTES = COUNT * sizeof(uint);

    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto ok = [](cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        return false;
    };

    // Host buffers are owned by std::vector, so they are released on every path.
    std::vector<uint> Xs(COUNT);
    std::vector<uint> Ys(COUNT);
    uint* Xs_d = nullptr;
    uint* Ys_d = nullptr;

    bool success = ok(cudaMalloc((void**)&Xs_d, BYTES), "cudaMalloc(Xs_d)") &&
                   ok(cudaMalloc((void**)&Ys_d, BYTES), "cudaMalloc(Ys_d)");

    if (success) {
        dim3 grid_size(1, 1, 1);
        dim3 block_size(SIZE * SIZE);
        kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, static_cast<int>(SIZE));

        // A launch does not return a status: configuration errors show up via
        // cudaGetLastError(), execution errors at the next synchronizing call.
        success = ok(cudaGetLastError(), "kernel launch") &&
                  ok(cudaDeviceSynchronize(), "kernel execution") &&
                  ok(cudaMemcpy(Xs.data(), Xs_d, BYTES, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(Xs)") &&
                  ok(cudaMemcpy(Ys.data(), Ys_d, BYTES, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(Ys)");
    }

    if (success) {
        // Unsigned indices avoid signed/unsigned comparisons against SIZE.
        for (uint row = 0; row < SIZE; ++row) {
            for (uint col = 0; col < SIZE; ++col) {
                std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
                          << "] ";
            }
            std::cout << "\n";
        }
    }

    // cudaFree accepts a null pointer, so this is safe even if allocation failed.
    const bool freed_x = ok(cudaFree(Xs_d), "cudaFree(Xs_d)");
    const bool freed_y = ok(cudaFree(Ys_d), "cudaFree(Ys_d)");

    return (success && freed_x && freed_y) ? 0 : 1;
}
nvcc not recognizing a kernel on Windows 11 with cuda 12.5 (I can compile other *.cu file fine with powershell)
nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Kernel from the question. Each thread splits threadIdx.x into
// x = threadIdx.x / 4 and y = threadIdx.x % 4, then stores x into A and y into B
// at element offset x * row + y. Only threadIdx.x is used and no bounds check
// is performed.
// NOTE(review): `uint` is not declared by any header included in this listing;
// the diagnostics quoted after the listing report it as undefined.
__global__ void kernel(uint *A, uint *B, int row) {
// The divisor and modulus are the literal 4, independent of `row`.
auto x = threadIdx.x / 4;
auto y = threadIdx.x % 4;
A[x * row + y] = x;
B[x * row + y] = y;
}
// Host driver from the question. Allocates two SIZE*SIZE uint buffers on the
// host and two on the device, launches `kernel` with one block of 16 threads,
// copies both device buffers back and prints every element as "[x|y] ", one
// matrix row per output line. argc and argv are not used.
// None of the malloc / CUDA runtime return values are checked.
int main(int argc, char **argv) {
uint *Xs, *Ys;
uint *Xs_d, *Ys_d;
uint SIZE = 4;
// Host-side result buffers.
Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));
// Device-side output buffers written by the kernel.
cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));
// One block of 4 * 4 = 16 threads; the row stride passed to the kernel is 4.
dim3 grid_size(1, 1, 1);
dim3 block_size(4 * 4);
kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);
cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
// The explicit synchronize is issued after both copies, not before them.
cudaDeviceSynchronize();
// `row` and `col` are int while SIZE is uint, so the loop conditions compare
// signed against unsigned values.
for (int row = 0; row < SIZE; ++row) {
for (int col = 0; col < SIZE; ++col) {
std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
<< "] ";
}
std::cout << "\n";
}
// Release device buffers, then host buffers.
cudaFree(Xs_d);
cudaFree(Ys_d);
free(Xs);
free(Ys);
}
results in
PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
nvcc not recognizing a kernel on Windows 11 with cuda 12.5 (I can compile other *.cu file fine with powershell)
nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Kernel from the question. Each thread splits threadIdx.x into
// x = threadIdx.x / 4 and y = threadIdx.x % 4, then stores x into A and y into B
// at element offset x * row + y. Only threadIdx.x is used and no bounds check
// is performed.
// NOTE(review): `uint` is not declared by any header included in this listing;
// the diagnostics quoted after the listing report it as undefined.
__global__ void kernel(uint *A, uint *B, int row) {
// The divisor and modulus are the literal 4, independent of `row`.
auto x = threadIdx.x / 4;
auto y = threadIdx.x % 4;
A[x * row + y] = x;
B[x * row + y] = y;
}
// Host driver from the question. Allocates two SIZE*SIZE uint buffers on the
// host and two on the device, launches `kernel` with one block of 16 threads,
// copies both device buffers back and prints every element as "[x|y] ", one
// matrix row per output line. argc and argv are not used.
// None of the malloc / CUDA runtime return values are checked.
int main(int argc, char **argv) {
uint *Xs, *Ys;
uint *Xs_d, *Ys_d;
uint SIZE = 4;
// Host-side result buffers.
Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));
// Device-side output buffers written by the kernel.
cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));
// One block of 4 * 4 = 16 threads; the row stride passed to the kernel is 4.
dim3 grid_size(1, 1, 1);
dim3 block_size(4 * 4);
kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);
cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
// The explicit synchronize is issued after both copies, not before them.
cudaDeviceSynchronize();
// `row` and `col` are int while SIZE is uint, so the loop conditions compare
// signed against unsigned values.
for (int row = 0; row < SIZE; ++row) {
for (int col = 0; col < SIZE; ++col) {
std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
<< "] ";
}
std::cout << "\n";
}
// Release device buffers, then host buffers.
cudaFree(Xs_d);
cudaFree(Ys_d);
free(Xs);
free(Ys);
}
results in
PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
Share
Improve this question
asked Nov 18, 2024 at 13:40
He HuangHe Huang
231 silver badge3 bronze badges
3
- Could it be that your header files are messed up somehow? This really shouldn't happen... – einpoklum Commented Nov 19, 2024 at 7:32
- This sometimes happens to me if there is a syntax error somewhere else in the file; this stops the parser from processing the CUDA extensions. Scroll further down the list of errors and see if there are any C++ (not CUDA) issues, fix those and recompile. The `uint` looks suspect; try replacing it with `unsigned` or `uint32_t`. – Johan Commented Nov 19, 2024 at 7:55
- 1 You have other errors on that line, such as the use of `uint` on Windows without a typedef. Fix all other errors and this attribute error will go away: stackoverflow.com/questions/3552094/c-uint-unsigned-int-int – Robert Crovella Commented Nov 19, 2024 at 14:37
1 Answer
Reset to default 1
From your error message I can see that you are running on Windows.
`uint` is not a standard C++ type: on Linux the system headers happen to define it, but on Windows it is not defined.
You can fix this easily by using `unsigned` instead, or by defining `uint` yourself. The following fix will make your code compile and run correctly.
Define `uint` as needed (note that `unsigned` is shorthand for `unsigned int`, which is 32 bits wide on the platforms CUDA supports, i.e. the same size as `uint32_t`):
// Declares `uint` as an alias for `unsigned` (i.e. unsigned int).
// NOTE(review): #ifndef only tests whether a preprocessor macro named `uint`
// exists; it does not detect an existing typedef, so unless such a macro is
// defined the typedef below is always emitted.
#ifndef uint
typedef unsigned uint;
#endif
The problem manifests this way because the nvcc preprocessor cannot translate the `__global__`, `__device__`, etc. qualifiers in code that has faulty syntax. If there are (too many) syntax errors, the nvcc system will just feed the code into the C++ compiler as is, without processing the CUDA-specific extensions. That compiler then fails on those extensions, resulting in the errors that you see.
In general, when you see a lot of errors relating to CUDA constructs, the odds are high that a simple error somewhere else (here, the undefined type `uint`) is tripping up the nvcc preprocessor.
The full code (see below) now compiles and runs correctly, outputting the correct result.
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Declares `uint` as an alias for `unsigned` (i.e. unsigned int) so that the
// kernel and main below can use it.
// NOTE(review): #ifndef only tests whether a preprocessor macro named `uint`
// exists; it does not detect an existing typedef, so unless such a macro is
// defined the typedef below is always emitted.
#ifndef uint
typedef unsigned uint;
#endif
/**
 * Writes each thread's decoded (x, y) pair into two output arrays.
 *
 * threadIdx.x is split with a fixed width of 4: the quotient is x and the
 * remainder is y. The value x is stored in A and y in B, both at element
 * offset x * row + y. Only threadIdx.x is consulted (blockIdx is not), and no
 * bounds check is made, so the caller must size A and B to cover every offset
 * the launched threads produce.
 *
 * @param A   output array that receives x
 * @param B   output array that receives y
 * @param row stride, in elements, between consecutive x values
 */
__global__ void kernel(uint* A, uint* B, int row) {
    const unsigned int tid = threadIdx.x;
    const unsigned int quotient = tid / 4;
    const unsigned int remainder = tid % 4;
    // Same arithmetic as x * row + y: `row` is converted to unsigned here.
    const unsigned int offset = quotient * row + remainder;
    A[offset] = quotient;
    B[offset] = remainder;
}
/**
 * Runs `kernel` over a 4x4 matrix and prints the result.
 *
 * Allocates two SIZE*SIZE device buffers, launches one block of SIZE*SIZE
 * threads, copies both buffers back to the host and prints each element as
 * "[x|y] ", one matrix row per output line.
 *
 * Every CUDA runtime call and the kernel launch are checked. On the first
 * failure a message is written to stderr, the matrix is not printed, the
 * device buffers are still released, and the process exits with status 1.
 *
 * @param argc unused
 * @param argv unused
 * @return 0 on success, 1 if any CUDA call failed
 */
int main(int argc, char** argv) {
    (void)argc;  // command-line arguments are not used
    (void)argv;

    constexpr uint SIZE = 4;
    // kernel() decodes threadIdx.x with a hard-coded width of 4.
    static_assert(SIZE == 4, "kernel() assumes a row width of 4");
    constexpr size_t COUNT = static_cast<size_t>(SIZE) * SIZE;
    constexpr size_t BYTES = COUNT * sizeof(uint);

    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto ok = [](cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        return false;
    };

    // Host buffers are owned by std::vector, so they are released on every path.
    std::vector<uint> Xs(COUNT);
    std::vector<uint> Ys(COUNT);
    uint* Xs_d = nullptr;
    uint* Ys_d = nullptr;

    bool success = ok(cudaMalloc((void**)&Xs_d, BYTES), "cudaMalloc(Xs_d)") &&
                   ok(cudaMalloc((void**)&Ys_d, BYTES), "cudaMalloc(Ys_d)");

    if (success) {
        dim3 grid_size(1, 1, 1);
        dim3 block_size(SIZE * SIZE);
        kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, static_cast<int>(SIZE));

        // A launch does not return a status: configuration errors show up via
        // cudaGetLastError(), execution errors at the next synchronizing call.
        success = ok(cudaGetLastError(), "kernel launch") &&
                  ok(cudaDeviceSynchronize(), "kernel execution") &&
                  ok(cudaMemcpy(Xs.data(), Xs_d, BYTES, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(Xs)") &&
                  ok(cudaMemcpy(Ys.data(), Ys_d, BYTES, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(Ys)");
    }

    if (success) {
        // Unsigned indices avoid signed/unsigned comparisons against SIZE.
        for (uint row = 0; row < SIZE; ++row) {
            for (uint col = 0; col < SIZE; ++col) {
                std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
                          << "] ";
            }
            std::cout << "\n";
        }
    }

    // cudaFree accepts a null pointer, so this is safe even if allocation failed.
    const bool freed_x = ok(cudaFree(Xs_d), "cudaFree(Xs_d)");
    const bool freed_y = ok(cudaFree(Ys_d), "cudaFree(Ys_d)");

    return (success && freed_x && freed_y) ? 0 : 1;
}
本文标签: c++ error: attribute "__global__" does not apply here - Stack Overflow
版权声明:本文标题:c++ - error: attribute "__global__" does not apply here - Stack Overflow 内容由热心网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://it.en369.cn/questions/1745618494a2159433.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
`uint` looks suspect; try replacing it with `unsigned` or `uint32_t`.
– Johan Commented Nov 19, 2024 at 7:55