admin管理员组

文章数量:1025292

nvcc not recognizing a kernel on Windows 11 with cuda 12.5 (I can compile other *.cu file fine with powershell)

nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

__global__ void kernel(uint *A, uint *B, int row) {
  auto x = threadIdx.x / 4;
  auto y = threadIdx.x % 4;
  A[x * row + y] = x;
  B[x * row + y] = y;
}

int main(int argc, char **argv) {
  uint *Xs, *Ys;
  uint *Xs_d, *Ys_d;

  uint SIZE = 4;

  Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
  Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));

  cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
  cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));

  dim3 grid_size(1, 1, 1);
  dim3 block_size(4 * 4);

  kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);

  cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
  cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);

  cudaDeviceSynchronize();

  for (int row = 0; row < SIZE; ++row) {
    for (int col = 0; col < SIZE; ++col) {
      std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
                << "] ";
    }
    std::cout << "\n";
  }

  cudaFree(Xs_d);
  cudaFree(Ys_d);
  free(Xs);
  free(Ys);
}

results in

PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
             ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                              ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                     ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                           ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                    ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                       ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                           ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {

nvcc not recognizing a kernel on Windows 11 with cuda 12.5 (I can compile other *.cu file fine with powershell)

nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

__global__ void kernel(uint *A, uint *B, int row) {
  auto x = threadIdx.x / 4;
  auto y = threadIdx.x % 4;
  A[x * row + y] = x;
  B[x * row + y] = y;
}

int main(int argc, char **argv) {
  uint *Xs, *Ys;
  uint *Xs_d, *Ys_d;

  uint SIZE = 4;

  Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
  Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));

  cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
  cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));

  dim3 grid_size(1, 1, 1);
  dim3 block_size(4 * 4);

  kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);

  cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
  cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);

  cudaDeviceSynchronize();

  for (int row = 0; row < SIZE; ++row) {
    for (int col = 0; col < SIZE; ++col) {
      std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
                << "] ";
    }
    std::cout << "\n";
  }

  cudaFree(Xs_d);
  cudaFree(Ys_d);
  free(Xs);
  free(Ys);
}

results in

PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
             ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                              ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                     ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                           ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                    ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                       ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                           ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
Share Improve this question asked Nov 18, 2024 at 13:40 He HuangHe Huang 231 silver badge3 bronze badges 3
  • Could it be that your header files are messed up somehow? This really shouldn't happen... – einpoklum Commented Nov 19, 2024 at 7:32
  • This sometimes happens to me if there is a syntax error somewhere else in the file, this stops the parser from processing the CUDA extensions. Scroll further down the list of errors and see if there are any c++ (not cuda) issues, fix those and recompile. The uint looks suspect, try and replace it with unsigned or uint32_t – Johan Commented Nov 19, 2024 at 7:55
  • 1 you have other errors on that line such as the use of uint on windows without a typedef. fix all other errors and this attribute error will go away stackoverflow/questions/3552094/c-uint-unsigned-int-int – Robert Crovella Commented Nov 19, 2024 at 14:37
Add a comment  | 

1 Answer 1

Reset to default 1

From your error message I can see that you are running on Windows.

Only on Linux is uint defined, on Windows it is not.

You can fix this easily, by using unsigned instead, or just define uint as needed. The following fix will make your code compile and run correctly.

Add the following to define uint where it is missing (note that unsigned means unsigned int, which is 32 bits wide on the platforms CUDA supports, i.e. the same width as uint32_t):

#ifndef uint
    typedef unsigned uint;
#endif

This problem manifests as it does because the nvcc preprocessor cannot translate the __global__, __device__ etc. directives in code with faulty syntax. If there are (too many) syntax errors, then the nvcc system will just feed the code into the C++ compiler as is, without processing the CUDA-specific extensions. That compiler will then fail on those extensions, resulting in the errors that you see.

In general when you see a lot of errors relating to CUDA stuff the odds are high that you have a simple syntax error somewhere tripping up the nvcc preprocessor.

The full code (see below) now compiles and runs correctly, outputting the correct result.

#include <cuda_runtime.h>
#include <iostream>
#include <vector>

// Provide the `uint` alias for `unsigned int` used throughout this file.
// NOTE: #ifndef only tests for a preprocessor macro named `uint`; it cannot
// detect an existing typedef, so the typedef below is skipped only when
// `uint` has been #define'd beforehand. Where a header already declares
// `uint` as a typedef of unsigned int, this line is an identical typedef
// redeclaration, which C++ permits.
#ifndef uint
   typedef unsigned uint;
#endif

// Records, for each thread, the two coordinates derived from its 1-D index.
//
// threadIdx.x is split by the hard-coded width 4 into a quotient and a
// remainder. The quotient is written to A and the remainder to B, both at
// element offset `quotient * row + remainder`.
//
// Parameters:
//   A   - output buffer receiving the quotient (threadIdx.x / 4).
//   B   - output buffer receiving the remainder (threadIdx.x % 4).
//   row - stride, in elements, applied to the quotient when forming the offset.
//
// There is no bounds check: the caller must size A and B to cover every
// offset the launched block produces. Only threadIdx.x is read, so blockIdx
// does not influence which elements are written.
__global__ void kernel(uint* A, uint* B, int row) {
    const unsigned int tid = threadIdx.x;
    const unsigned int quotient = tid / 4;
    const unsigned int remainder = tid % 4;
    const unsigned int offset = quotient * row + remainder;
    A[offset] = quotient;
    B[offset] = remainder;
}

// Launches `kernel` on one block of SIZE * SIZE threads, copies both output
// buffers back to the host and prints them as a SIZE x SIZE grid of
// "[A|B] " cells, one text line per row.
//
// Every CUDA runtime call is checked. On the first failure a diagnostic is
// written to std::cerr, nothing is printed to std::cout, device memory is
// released and the process exits with status 1. Returns 0 on success.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    // `kernel` splits threadIdx.x by a hard-coded 4, so SIZE must stay 4 for
    // the printed grid to line up with what the kernel writes.
    const uint SIZE = 4;
    const size_t count = static_cast<size_t>(SIZE) * SIZE;
    const size_t bytes = count * sizeof(uint);

    // Reports a failed CUDA runtime call; yields true when the call succeeded.
    auto succeeded = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        return false;
    };

    // Host buffers are owned by std::vector, so there is no unchecked malloc
    // and no manual free on any exit path.
    std::vector<uint> Xs(count);
    std::vector<uint> Ys(count);

    // Initialised to nullptr so the unconditional cudaFree calls below are
    // harmless even when an allocation never happened.
    uint* Xs_d = nullptr;
    uint* Ys_d = nullptr;

    bool ok = succeeded(cudaMalloc((void**)&Xs_d, bytes), "cudaMalloc(Xs_d)") &&
              succeeded(cudaMalloc((void**)&Ys_d, bytes), "cudaMalloc(Ys_d)");

    if (ok) {
        const dim3 grid_size(1, 1, 1);
        const dim3 block_size(SIZE * SIZE);

        kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, static_cast<int>(SIZE));

        // A launch does not return a status: cudaGetLastError surfaces a bad
        // launch configuration, and the synchronize surfaces faults raised
        // while the kernel ran. Both happen before the results are copied.
        ok = succeeded(cudaGetLastError(), "kernel launch") &&
             succeeded(cudaDeviceSynchronize(), "kernel execution") &&
             succeeded(cudaMemcpy(Xs.data(), Xs_d, bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(Xs)") &&
             succeeded(cudaMemcpy(Ys.data(), Ys_d, bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(Ys)");
    }

    if (ok) {
        // Unsigned indices avoid a signed/unsigned comparison against SIZE.
        for (uint row = 0; row < SIZE; ++row) {
            for (uint col = 0; col < SIZE; ++col) {
                const size_t idx = static_cast<size_t>(row) * SIZE + col;
                std::cout << "[" << Xs[idx] << "|" << Ys[idx] << "] ";
            }
            std::cout << "\n";
        }
    }

    // cudaFree on a null pointer performs no operation.
    cudaFree(Xs_d);
    cudaFree(Ys_d);

    return ok ? 0 : 1;
}

nvcc not recognizing a kernel on Windows 11 with cuda 12.5 (I can compile other *.cu file fine with powershell)

nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

__global__ void kernel(uint *A, uint *B, int row) {
  auto x = threadIdx.x / 4;
  auto y = threadIdx.x % 4;
  A[x * row + y] = x;
  B[x * row + y] = y;
}

int main(int argc, char **argv) {
  uint *Xs, *Ys;
  uint *Xs_d, *Ys_d;

  uint SIZE = 4;

  Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
  Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));

  cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
  cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));

  dim3 grid_size(1, 1, 1);
  dim3 block_size(4 * 4);

  kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);

  cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
  cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);

  cudaDeviceSynchronize();

  for (int row = 0; row < SIZE; ++row) {
    for (int col = 0; col < SIZE; ++col) {
      std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
                << "] ";
    }
    std::cout << "\n";
  }

  cudaFree(Xs_d);
  cudaFree(Ys_d);
  free(Xs);
  free(Ys);
}

results in

PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
             ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                              ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                     ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                           ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                    ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                       ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                           ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {

nvcc not recognizing a kernel on Windows 11 with cuda 12.5 (I can compile other *.cu file fine with powershell)

nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>

__global__ void kernel(uint *A, uint *B, int row) {
  auto x = threadIdx.x / 4;
  auto y = threadIdx.x % 4;
  A[x * row + y] = x;
  B[x * row + y] = y;
}

int main(int argc, char **argv) {
  uint *Xs, *Ys;
  uint *Xs_d, *Ys_d;

  uint SIZE = 4;

  Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
  Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));

  cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
  cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));

  dim3 grid_size(1, 1, 1);
  dim3 block_size(4 * 4);

  kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);

  cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
  cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);

  cudaDeviceSynchronize();

  for (int row = 0; row < SIZE; ++row) {
    for (int col = 0; col < SIZE; ++col) {
      std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
                << "] ";
    }
    std::cout << "\n";
  }

  cudaFree(Xs_d);
  cudaFree(Ys_d);
  free(Xs);
  free(Ys);
}

results in

PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
             ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                              ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                     ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                           ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                    ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                       ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
                                                           ^

D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
  __declspec(__global__) void kernel(uint *A, uint *B, int row) {
Share Improve this question asked Nov 18, 2024 at 13:40 He HuangHe Huang 231 silver badge3 bronze badges 3
  • Could it be that your header files are messed up somehow? This really shouldn't happen... – einpoklum Commented Nov 19, 2024 at 7:32
  • This sometimes happens to me if there is a syntax error somewhere else in the file, this stops the parser from processing the CUDA extensions. Scroll further down the list of errors and see if there are any c++ (not cuda) issues, fix those and recompile. The uint looks suspect, try and replace it with unsigned or uint32_t – Johan Commented Nov 19, 2024 at 7:55
  • 1 you have other errors on that line such as the use of uint on windows without a typedef. fix all other errors and this attribute error will go away stackoverflow/questions/3552094/c-uint-unsigned-int-int – Robert Crovella Commented Nov 19, 2024 at 14:37
Add a comment  | 

1 Answer 1

Reset to default 1

From your error message I can see that you are running on Windows.

Only on Linux is uint defined, on Windows it is not.

You can fix this easily, by using unsigned instead, or just define uint as needed. The following fix will make your code compile and run correctly.

Add the following to define uint where it is missing (note that unsigned means unsigned int, which is 32 bits wide on the platforms CUDA supports, i.e. the same width as uint32_t):

#ifndef uint
    typedef unsigned uint;
#endif

This problem manifests as it does because the nvcc preprocessor cannot translate the __global__, __device__ etc. directives in code with faulty syntax. If there are (too many) syntax errors, then the nvcc system will just feed the code into the C++ compiler as is, without processing the CUDA-specific extensions. That compiler will then fail on those extensions, resulting in the errors that you see.

In general when you see a lot of errors relating to CUDA stuff the odds are high that you have a simple syntax error somewhere tripping up the nvcc preprocessor.

The full code (see below) now compiles and runs correctly, outputting the correct result.

#include <cuda_runtime.h>
#include <iostream>
#include <vector>

// Provide the `uint` alias for `unsigned int` used throughout this file.
// NOTE: #ifndef only tests for a preprocessor macro named `uint`; it cannot
// detect an existing typedef, so the typedef below is skipped only when
// `uint` has been #define'd beforehand. Where a header already declares
// `uint` as a typedef of unsigned int, this line is an identical typedef
// redeclaration, which C++ permits.
#ifndef uint
   typedef unsigned uint;
#endif

// Records, for each thread, the two coordinates derived from its 1-D index.
//
// threadIdx.x is split by the hard-coded width 4 into a quotient and a
// remainder. The quotient is written to A and the remainder to B, both at
// element offset `quotient * row + remainder`.
//
// Parameters:
//   A   - output buffer receiving the quotient (threadIdx.x / 4).
//   B   - output buffer receiving the remainder (threadIdx.x % 4).
//   row - stride, in elements, applied to the quotient when forming the offset.
//
// There is no bounds check: the caller must size A and B to cover every
// offset the launched block produces. Only threadIdx.x is read, so blockIdx
// does not influence which elements are written.
__global__ void kernel(uint* A, uint* B, int row) {
    const unsigned int tid = threadIdx.x;
    const unsigned int quotient = tid / 4;
    const unsigned int remainder = tid % 4;
    const unsigned int offset = quotient * row + remainder;
    A[offset] = quotient;
    B[offset] = remainder;
}

// Launches `kernel` on one block of SIZE * SIZE threads, copies both output
// buffers back to the host and prints them as a SIZE x SIZE grid of
// "[A|B] " cells, one text line per row.
//
// Every CUDA runtime call is checked. On the first failure a diagnostic is
// written to std::cerr, nothing is printed to std::cout, device memory is
// released and the process exits with status 1. Returns 0 on success.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    // `kernel` splits threadIdx.x by a hard-coded 4, so SIZE must stay 4 for
    // the printed grid to line up with what the kernel writes.
    const uint SIZE = 4;
    const size_t count = static_cast<size_t>(SIZE) * SIZE;
    const size_t bytes = count * sizeof(uint);

    // Reports a failed CUDA runtime call; yields true when the call succeeded.
    auto succeeded = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess) {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
        return false;
    };

    // Host buffers are owned by std::vector, so there is no unchecked malloc
    // and no manual free on any exit path.
    std::vector<uint> Xs(count);
    std::vector<uint> Ys(count);

    // Initialised to nullptr so the unconditional cudaFree calls below are
    // harmless even when an allocation never happened.
    uint* Xs_d = nullptr;
    uint* Ys_d = nullptr;

    bool ok = succeeded(cudaMalloc((void**)&Xs_d, bytes), "cudaMalloc(Xs_d)") &&
              succeeded(cudaMalloc((void**)&Ys_d, bytes), "cudaMalloc(Ys_d)");

    if (ok) {
        const dim3 grid_size(1, 1, 1);
        const dim3 block_size(SIZE * SIZE);

        kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, static_cast<int>(SIZE));

        // A launch does not return a status: cudaGetLastError surfaces a bad
        // launch configuration, and the synchronize surfaces faults raised
        // while the kernel ran. Both happen before the results are copied.
        ok = succeeded(cudaGetLastError(), "kernel launch") &&
             succeeded(cudaDeviceSynchronize(), "kernel execution") &&
             succeeded(cudaMemcpy(Xs.data(), Xs_d, bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(Xs)") &&
             succeeded(cudaMemcpy(Ys.data(), Ys_d, bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(Ys)");
    }

    if (ok) {
        // Unsigned indices avoid a signed/unsigned comparison against SIZE.
        for (uint row = 0; row < SIZE; ++row) {
            for (uint col = 0; col < SIZE; ++col) {
                const size_t idx = static_cast<size_t>(row) * SIZE + col;
                std::cout << "[" << Xs[idx] << "|" << Ys[idx] << "] ";
            }
            std::cout << "\n";
        }
    }

    // cudaFree on a null pointer performs no operation.
    cudaFree(Xs_d);
    cudaFree(Ys_d);

    return ok ? 0 : 1;
}

本文标签: cerror attribute quotglobalquot does not apply hereStack Overflow