tensorRT---认识cuda RuntimeAPI（认识thrust，认识cuda的错误机制）

thrust#include <stdio.h>#include <thrust/host_vector.h>#include <thrust/device_vector.h>#include <thrust/sort.h>#include <iostream>using namespace std;__host__ __device__

zsffuture

369人浏览 · 2022-05-01 20:40:57

zsffuture · 2022-05-01 20:40:57 发布

thrust


#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <iostream>
using namespace std;

// Ordering predicate callable from both host and device code.
// Returns 1 when `a` is strictly greater than `b`, otherwise 0.
// Passing it to a sort as the "comes before" test yields descending order.
__host__ __device__
int sort_func(int a, int b){
    return (b < a) ? 1 : 0;
}

// Demonstrates thrust::sort on a host vector and on a device vector, then
// prints both results.
//   - array1 lives on the host and is sorted with sort_func (descending).
//   - array2 lives on the device and is sorted with a device lambda (ascending).
// NOTE: the `[]__device__` lambda is an nvcc extended lambda; the file must be
// compiled with --extended-lambda (--expt-extended-lambda on older toolkits).
int main(){

    int data[] = {5, 3, 1, 5, 2, 0};
    int ndata  = sizeof(data) / sizeof(data[0]);

    // Host-side sort using a plain function as the comparator.
    thrust::host_vector<int> array1(data, data + ndata);
    thrust::sort(array1.begin(), array1.end(), sort_func);

    // Device-side sort: the host data is copied to the device on construction.
    thrust::device_vector<int> array2 = thrust::host_vector<int>(data, data + ndata);
    thrust::sort(array2.begin(), array2.end(), []__device__(int a, int b){return a < b;});

    // Bring the sorted device data back in a single transfer. Indexing a
    // device_vector from host code copies one element per access, which is
    // needlessly slow inside a loop.
    thrust::host_vector<int> array2_host = array2;

    printf("array1------------------------\n");
    // size() is unsigned, so use size_t to avoid a signed/unsigned comparison.
    for(size_t i = 0; i < array1.size(); ++i)
        cout << array1[i] << endl;

    printf("array2------------------------\n");
    for(size_t i = 0; i < array2_host.size(); ++i)
        cout << array2_host[i] << endl;
    return 0;
}

cuda错误机制

# 知识点
1. 若cuda核函数出错，由于它是异步执行的，立即执行cudaPeekAtLastError只会拿到对输入参数校验是否正确的状态，而不会拿到核函数是否执行正确的状态
2. 因此需要等待核函数执行完毕后，才真的知道当前核函数是否出错，一般通过设备同步或者流同步进行等待
3. 错误分为可恢复和不可恢复两种：
    - 可恢复：
        - 参数配置错误等，例如每个block的线程数超出上限（一般最大值是1024），shared memory大小超出范围（一般是48KB）
        - 通过cudaGetLastError可以获取错误代码，同时把当前状态恢复为success
        - 该错误在调用核函数后可以立即通过cudaGetLastError/cudaPeekAtLastError拿到
        - 该错误会在下一个函数调用的时候被覆盖
    - 不可恢复：
        - 核函数执行错误，例如访问越界等等异常
        - 该错误则会传递到之后的所有cuda操作上
        - 错误状态通常需要等到核函数执行完毕才能够拿到，也就是有可能在后续的任何流程中突然异常（因为是异步的）


#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
using namespace std;

// Kernel used to demonstrate asynchronous error reporting.
// Every thread computes its flat 1D global index; only the thread whose index
// is 999 does any work, storing 5 into ptr[999]. All other threads return
// without touching memory.
// There is no validation of `ptr`: if it does not point to at least 1000
// writable floats, the store is an invalid memory access.
__global__ void func(float* ptr){

    const int target = 999;
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if(gid != target)
        return;

    ptr[target] = 5;
}

// Walks through how a kernel execution fault surfaces through the CUDA
// runtime: the status is queried right after the launch, again after a device
// synchronization, and once more on an unrelated later API call. The message
// for each status is printed on its own line.
int main(){

    // Deliberately null so that the kernel's write to ptr[999] is invalid.
    float* ptr = nullptr;

    // Kernel launches are asynchronous, so a fault inside the kernel is not
    // necessarily visible immediately after the launch statement.
    func<<<100, 10>>>(ptr);
    // Alternative launch with 1050 threads per block (a launch-configuration
    // error rather than an execution fault):
    //func<<<100, 1050>>>(ptr);
    cudaError_t launch_status = cudaPeekAtLastError();
    cout << cudaGetErrorString(launch_status) << endl;

    // Wait for all outstanding work on the current device; an error raised
    // while the kernel was running is reported here.
    cudaError_t sync_status = cudaDeviceSynchronize();
    cout << cudaGetErrorString(sync_status) << endl;

    // The fault persists, so later runtime calls are expected to fail as well.
    float* new_ptr = nullptr;
    cudaError_t alloc_status = cudaMalloc(&new_ptr, 100);
    cout << cudaGetErrorString(alloc_status) << endl;
    return 0;
}

FlagOS智算系统软件栈

欢迎来到FlagOS开发社区，这里是一个汇聚了AI开发者、数据科学家、机器学习爱好者以及业界专家的活力平台。我们致力于成为业内领先的Triton技术交流与应用分享的殿堂，为推动人工智能技术的普及与深化应用贡献力量。

更多推荐

PyTorch CUDA调试第一步：5分钟学会使用torch_use_cuda_dsa

torch_use_cuda_dsa是PyTorch提供的一个调试工具，它允许你在CUDA设备端（GPU）执行断言检查。简单来说，就是在GPU上运行的代码中加入断言语句，当条件不满足时会触发错误，帮助你快速发现代码中的问题。这对于调试CUDA内核中的错误特别有用，因为设备端的错误通常比主机端更难调试。torch_use_cuda_dsa是一个非常实用的调试工具，尤其适合CUDA内核的调试。通过设备端断言，你可以快速发现代码中的逻辑错误，提高调试效率。希望这篇笔记能帮助

FlagOS智算系统软件栈

如何用AI优化PyTorch CUDA调试：torch_use_cuda_dsa详解

例如，在矩阵乘法中，可以断言矩阵的维度匹配，或者在计算过程中检查中间值是否在合理范围内。通过AI辅助工具，如Kimi-K2模型，我们可以快速生成带有详细注释的代码示例，解释每个参数的作用和调试技巧。AI不仅能帮助我们理解复杂的CUDA调试技术，还能提供自动补全和错误诊断功能，显著提高开发效率。为了更好地理解断言的作用，我们可以故意在代码中引入一些可能触发断言的条件。对于需要进行CUDA调试的开发者

FlagOS智算系统软件栈

解决bitsandbytes安装难题：libcudart.so找不到的终极方案

🚀 **bitsandbytes** 是一个革命性的PyTorch库，通过8位量化技术让大型语言模型变得触手可及。这个强大的工具能够将模型推理和训练的内存消耗降低到原来的几分之一，但安装时经常遇到的"libcudart.so not found"错误让许多开发者头疼不已。今天，我将为你提供一套完整的解决方案，彻底告别这个困扰！## 🔍 为什么会出现libcudart.so找不到的错误？