CUDA笔记1(线程模型/内存模型)

一线程模型

1.hello

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>


// Kernel: every launched thread prints one greeting line through the
// device-side printf.
__global__ void hello_cuda() {
	printf("Hello CUDA world \n");
}

// Launches hello_cuda over a 16 x 4 thread domain split into 8 x 2 blocks.
int main()
{
	// Extent of the thread domain in x and y.
	const int nx = 16;
	const int ny = 4;

	// 8 x 2 threads per block; the grid is the domain divided by the block size.
	const dim3 block(8, 2);
	const dim3 grid(nx / block.x, ny / block.y);

	hello_cuda<<<grid, block>>>();

	cudaDeviceSynchronize(); // block the host until the kernel has finished
	cudaDeviceReset();       // reset the device and release its resources

	return 0;
}

在这里插入图片描述

2.动态分配blocksize

在这里插入图片描述

3.限制

在这里插入图片描述

4.计算索引

左乘右加

二内存模型

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <time.h>

// Kernel: each thread prints its in-block index, its global index and the
// element of `input` at that global index.
// There is no bounds check here, so `input` must hold at least
// gridDim.x * blockDim.x elements (see mem_trs_test2 for the guarded variant).
__global__ void mem_trs_test(int * input)
{
	const int tid = threadIdx.x;
	const int gid = blockIdx.x * blockDim.x + tid;
	printf("tid : %d , gid : %d, value : %d \n", tid, gid, input[gid]);
}

// Kernel: like mem_trs_test, but threads whose global index is >= `size`
// return without touching `input`.
//   input - device array to print from
//   size  - number of valid elements in `input`
__global__ void mem_trs_test2(int * input, int size)
{
	const int tid = threadIdx.x;
	const int gid = blockIdx.x * blockDim.x + tid;

	// Threads past the end of the array have nothing to print.
	if (gid >= size)
		return;

	printf("tid : %d , gid : %d, value : %d \n", tid, gid, input[gid]);
}

// Host driver: fills a host buffer with random values in [0, 255], copies it
// to the device and launches mem_trs_test2 so every in-range thread prints
// one element. Returns 0 on success, 1 if an allocation or the copy fails.
int main()
{
	int size = 150;
	int byte_size = size * sizeof(int);

	// Allocate host memory.
	int * h_input = (int*)malloc(byte_size);
	if (h_input == NULL)
	{
		fprintf(stderr, "host allocation of %d bytes failed\n", byte_size);
		return 1;
	}

	// Seed the generator from the clock and fill the buffer.
	time_t t;
	srand((unsigned)time(&t));
	for (int i = 0; i < size; i++)
	{
		h_input[i] = (int)(rand() & 0xff);
	}

	// Allocate device memory.
	int * d_input = NULL;
	cudaError_t err = cudaMalloc((void**)&d_input, byte_size);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
		free(h_input);
		return 1;
	}

	// Copy host -> device.
	err = cudaMemcpy(d_input, h_input, byte_size, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
	{
		fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
		cudaFree(d_input);
		free(h_input);
		return 1;
	}

	// Round the grid up so that grid * block >= size (5 blocks of 32 for 150
	// elements); the kernel's `gid < size` test masks the surplus threads.
	dim3 block(32);
	dim3 grid((size + block.x - 1) / block.x);

	mem_trs_test2<<<grid, block>>>(d_input, size);
	cudaDeviceSynchronize();

	cudaFree(d_input); // release device memory
	free(h_input);     // release host memory

	cudaDeviceReset();
	return 0;
}

1.内存分类

主机内存
- 普通内存是在栈空间,通过new的对象是在堆空间
内存的类型
- GlobalMemory:全局内存速度:普通，读写。大小: 显存大小 (11GB etc.)
- Constant Memory，常量内存。速度:很快，只读。大小:一般64KB，16bit寻址
- Shared Memory，共享内存,速度:快，读写。大小:2080Ti有48KB
- LocalMemory，本地内存(栈内存,函数内定义的变量,和全局内存共用内存,其实是全局内存)速度:普通，读写。大小: 可用内存/SM数量/SM最大常驻线程数
- TextureMemory(渲染)，纹理内存速度:快，只读。
- Register，寄存器速度:最快，读写。
LocalMemory(栈内存)

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;

// Kernel: reports, via __isLocal, whether a local array, a local scalar and a
// __shared__ variable are located in the local memory space.
static __global__ void test_kernel() {
	int array[3];
	float value = 5;
	__shared__ int shared_value;

	printf("array is local = %s\n", __isLocal(array) ? "true" : "false");
	printf("value is local = %s\n", __isLocal(&value) ? "true" : "false");
	// Plain "%s": the original "% s" carried a stray space flag, which is not
	// defined for the %s conversion.
	printf("shared value is local = %s\n", __isLocal(&shared_value) ? "true" : "false");
}
// Launches test_kernel with a single thread and waits for it to finish so
// that the kernel's printf output is produced before the program exits.
int main() {
	test_kernel<<<1, 1>>>();
	// The call parentheses are required: a bare `cudaDeviceSynchronize;` only
	// names the function and never waits for the kernel.
	cudaDeviceSynchronize();
	return 0;
}

结果:
array is local = true
value is local = true
shared value is local = false

Shared Memory

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;

// Way 2 (file scope): a shared variable cannot be given an initial value; it
// has to be initialised by a thread inside the kernel.
__shared__  int shared_value2;

// Kernel: thread 0 initialises three shared values, the block synchronises,
// and then every thread prints what it sees.
static __global__ void test_kernel() {

    // Way 1: statically sized shared array (always 8 ints), common to all
    // threads of the block.
    __shared__ int shared_array[8];

    // Way 2 (function scope): shared scalar; an initialiser would be
    // meaningless, so a thread has to assign it.
    __shared__ int shared_value1;

    if (threadIdx.x == 0) {
        shared_value1 = 5;
        shared_value2 = 8;
        shared_array[0] = 33;
    }

    // Make thread 0's writes visible to the rest of the block before reading.
    __syncthreads();
    printf("%d.shared value1 = %d, shared value2 = %d\n", threadIdx.x, shared_value1, shared_value2);
    // Two %d conversions need two arguments; the original passed only
    // shared_array[0], so the value was printed in the thread-index slot and
    // the second slot was garbage.
    printf("%d.shared array[0] = %d\n", threadIdx.x, shared_array[0]);
}

// Launches test_kernel with one block of two threads and waits for it to
// finish so that the kernel's printf output is produced before exit.
int main() {
    test_kernel<<<1, 2>>>();
    // Must be a call: the original `cudaDeviceSynchronize;` had no
    // parentheses and therefore never synchronised.
    cudaDeviceSynchronize();
    return 0;
}

结果
0.shared value1 = 5, shared value2 = 8
1.shared value1 = 5, shared value2 = 8
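33.shared array[0] = 5
33.shared array[0] = 5
(注:最后两行的数值错位，是因为第二个printf有两个%d却只传了shared_array[0]一个参数;补上threadIdx.x后应输出 0.shared array[0] = 33 和 1.shared array[0] = 33)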

动态指定共享内存

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;


// Kernel: thread 0 of each block stores the block index in dynamically sized
// shared memory; after a block-wide barrier every thread prints it.
// Launch requirement: the third launch parameter must provide at least
// sizeof(int) bytes of dynamic shared memory.
static __global__ void test_kernel() {

    // Way 3: `extern` declares dynamically sized shared memory whose size is
    // given by the third kernel-launch parameter. The array is declared
    // without a size because the launch, not the declaration, determines it.
    extern __shared__ int shared_array[];


    if (threadIdx.x == 0) {
        shared_array[0] = blockIdx.x;
    }

    // Wait until thread 0 has written before any thread reads.
    __syncthreads();
    printf("%d, %d, shared_array2 = %d\n", threadIdx.x, blockIdx.x, shared_array[0]);
}

// Launches test_kernel with one block of two threads and room for five ints
// of dynamic shared memory, then waits for the kernel to finish.
int main() {
    test_kernel<<<1, 2, sizeof(int) * 5>>>();
    // Must be a call: the original `cudaDeviceSynchronize;` had no
    // parentheses and therefore never synchronised.
    cudaDeviceSynchronize();
    return 0;
}

GlobalMemory1

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;


// Way 2: global memory defined statically with __device__.
__device__ float global_array[100];

// Kernel: prints whether the pointer passed in by the host and the
// __device__ array are reported as global memory by __isGlobal.
static __global__ void test_kernel(float* device_ptr) {
    const char* ptr_answer   = __isGlobal(device_ptr)   ? "true" : "false";
    const char* array_answer = __isGlobal(global_array) ? "true" : "false";

    printf("device ptr is global = %s\n", ptr_answer);
    printf("global array is global = %s\n", array_answer);
}

// Allocates 100 floats with cudaMalloc, hands the pointer to test_kernel,
// waits for the kernel and releases the allocation.
int main() {
    // Way 1: global memory allocated from the host.
    float* device_ptr = nullptr;
    cudaMalloc(&device_ptr, sizeof(float) * 100);
    test_kernel<<<1, 1>>>(device_ptr);
    cudaDeviceSynchronize();
    // The original never released the buffer.
    cudaFree(device_ptr);
    return 0;
}

结果
device ptr is global = true
global array is global = true

GlobalMemory2内存转移
GlobalMemory3 Pinned Memory1(页锁定内存)

默认下，通过new、malloc函数分配的Host Memory是pageable的(可置换页上的内存)。OS可能将pageable内存置换到虚拟内存上。GPU无法安全获取pageable memory，因为host OS在物理上转移数据的时机无法被GPU捕获。因此pageable host memory传送到device时，CUDA驱动会先分配一个page-locked的pinned host memory并将host数据储存到这个临时空间里，然后GPU从这个pinned host memory中获取数据。
通过cudaMallocHost或 cudaHostAlloc(分配主机上的内存)，可以显式的分配Pinned Memory (page-locked memory)。PinnedMemory会常驻在物理内存上，不会被交换，因此可以使用DMA(Direct MemoryAccess)技术，直接在CPU/GPU上使用该内存。
区别cudaMalloc这是分配设备(显卡)上的内存
Pinned Memory相比Pageable Memory可用更少 (后者可以利用虚拟内存技术增大内存)，更珍贵(消耗物理内存空间)。但是访问速度更快(避免了临时空间环节)，可以直接被GPU使用。本质上来讲，Pinned Memory仅仅是GPU可以使用，但是依旧是Host Memory
Global Memory3 Pinned Memory2(页锁定内存)

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;

 

// Kernel: each thread stores its own threadIdx.x (converted to float) in the
// array slot with the same index. `array` must hold at least blockDim.x
// elements.
static __global__ void test_kernel(float* array) {
    const unsigned int slot = threadIdx.x;
    array[slot] = slot;
}

// Allocates a small page-locked host buffer, lets test_kernel write into it
// directly, prints the result on the host and releases the buffer.
int main() {
    int num = 5;
    float* array = nullptr;
    cudaMallocHost(&array, sizeof(float) * num);

    test_kernel<<<1, num>>>(array);
    // The kernel must have finished before the host reads the buffer.
    cudaDeviceSynchronize();

    for (int i = 0; i < num; i++)
    {
        printf("array[%d]=%f\n", i, array[i]);
    }

    // Memory from cudaMallocHost is released with cudaFreeHost; the original
    // never released it.
    cudaFreeHost(array);
    return 0;
}

Pinned Memory依旧是Host Memory。它的使用场景是，代替malloc、new，然后再进行cudaMemcpy
虽然Pinned Memory可以允许GPU直接访问，但是这种操作是低效的。正确的使用场景是作为大数据传输的中间储存，不适合频繁的读写操作

global memory3- Zero Copy Memory (零拷贝内存)
Zero Copy Memory:即内存的复制过程没有CPU的参与，直接由GPU和内存条操作，也就是上面提到的DMA(Direct Memory Access)
ZeroCopy Memory实质上就是Pinned Memory映射到device的空间地址，本质上是等价的
他由cudaMallocHost、 cudaHostAlloc实现分配，cudaFreeHost释放。cudaHostGetDevicePointer可以获取分配的Pinned Memory映射到device的地址
Unified Memory1(统一内存) global memory4
由统一内存系统管理的内存。Unified Virtual Addressing(统一虚拟地址)将CPU和GPU内存看作一个整体进行管理和使用。分配的内存可以被CPU/GPU直接访问
区别:在Linux下内存的位置将会选择在device memory上。因此设备访问速度会快而host访问会慢。在Windows下，将会以zero-copy memory的形式存在host memory上

由cudaMallocManaged 分配(linux是放在gpu上的)，cudaFree释放
Constant Memory(常量内存) const memory5

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;

// Constant-memory array, defined at file scope; the initialiser is optional.
__constant__ float warp_matrix[6] = { 1,2,3,4,5, 6 };

// Kernel: each thread prints the warp_matrix entry selected by its
// threadIdx.x. Constant memory cannot be modified from inside a kernel
// (attempting to do so is an error).
static __global__ void test_kernel( ) {
    const unsigned int idx = threadIdx.x;
    printf("warp_matrix[%d] = %f\n", idx, warp_matrix[idx]);
}

// Overwrites warp_matrix from the host and launches one thread per entry to
// print the new contents.
int main() {
    // Values that replace warp_matrix's initialiser.
    const float host_warp_matrix[6] = { 6, 5, 3, 1, 6, 2 };
    const size_t matrix_bytes = sizeof host_warp_matrix;

    cudaMemcpyToSymbol(warp_matrix, host_warp_matrix, matrix_bytes);

    test_kernel<<<1, 6>>>();
    cudaDeviceSynchronize();

    return 0;
}

三执行模型(线程束)

SIMT、SIMD

SIMT(Single Instruction MultipleThread)单指令多线程，CUDA的执行模型
SIMD(Single Instruction Multiple Data)单指令多数据，多在CPU的流指令集使用

warp(包含32个线程)

一个Stream Multiprocessors有2个Block ( warp),
线程束，则是指thread warp，也就是这里的32个core组成的SM block
注意:thread warp是逻辑上的线程束，硬件执行时的抽象，并不能直接指硬件
Warps和thread blocks
- 任意时刻，任务的执行都是以warp为单位的，warp_size的大小通常是32。也就是启动n个线程，则需要ceil(n / warp_size)个线程束
- 线程束中的线程有两种状态:
  - active:活动，线程启动后的状态
  - inactive:不活跃，线程未启动，或者等待时的状态(分支问题)
- warp类型根据调度器来决定其状态有
  - Selected warp:选中
  - Eligible warp:准备好待执行
  - Stalled warp:没准备好
- 一个block需要的warp数量 = ceil(T / warp_size)，T = block中的线程数。对于一个warp中启动的线程数，如果小于32，则剩余部分线程为inactive状态。inactive的线程依旧消耗了资源
Warps和thread blocks
- 对于gridDim=m,blockDim=n的任务，总共需要的warp数为:m * ceil(n / warp_size)。而实际上物理warp数有限 (2080Ti:68 SM,64 Core per SM,total warp = 68*2=136)。则GPU调度器会根据任务总数，分别对warp进行调度。选中的条件为:32个core空闲，并且指令参数准备就绪
- block内的warps公用shared memory;通常block内的所有warps需要一定程度上的同步(分批执行同一个任务)和并行
- grid内的blocks，通常是独立、并行的。每个block之间使用相互独立的sharedmemory
- 因此设计gridDim、blockDim时，考虑block内通常可以互通数据、并行、同步数据等。而block间相互独立
- block的大小也尽量是warp_size的倍数，尽量避免warp_size无法填满造成的inactive线程消耗

warp 的执行

有序结果(同时加载的)
- 一个warp内有32个线程，可以保证每个线程执行的是同一行代码。一个block内有n个线程(比如512)，此时512个线程是否执行的同一行代码不确定
  如果需要保证block内每一个线程都到达特定行后再执行，在核函数中使用__syncthreads()
  前面的代码如果num>warp_size时，特定时候可能出现打印顺序并非如期
  而这里加了__syncthreads()后的程序则必定按照顺序打印

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;

// Kernel: each thread writes its threadIdx.x into array[threadIdx.x] and then
// prints three "stepN" lines in sequence.
// The block-wide barriers between the steps are commented out; re-enable them
// (adding the missing ';') to make every thread of the block finish one step
// before any thread starts the next.
static __global__ void test_kernel(int* array) {
    array[threadIdx.x] = threadIdx.x;
    //__syncthreads()
    printf("step1 %d\n", threadIdx.x);
    //__syncthreads()
    printf("step2 %d\n", threadIdx.x);
    //__syncthreads()
    printf("step3 %d\n", threadIdx.x);
}

// Allocates a managed array, runs test_kernel with one thread per element,
// prints the array from the host and releases it.
int main() {
    const int num = 5;
    int* array = nullptr;
    cudaMallocManaged(&array, sizeof(int) * num);
    test_kernel<<<1, num>>>(array);
    // The kernel must have finished before the host reads the managed buffer.
    cudaDeviceSynchronize();
    for (int i = 0; i < num; ++i) {
        printf("array[%d] = %d\n", i, array[i]);
    }
    // Managed memory is released with cudaFree; the original never freed it.
    cudaFree(array);
    return 0;
}

结果
step1 0
step1 1
step1 2
step1 3
step1 4
step2 0
step2 1
step2 2
step2 3
step2 4
step3 0
step3 1
step3 2
step3 3
step3 4
array[0] = 0
array[1] = 1
array[2] = 2
array[3] = 3
array[4] = 4

主要由分支语句导致，例如if
一个warp内的所有线程执行都是同一行代码，但是遇到if分支语句时就存在了分歧。会导致逻辑上的错误
因此warp分歧的处理:1、让warp中不满足条件的线程执行分支，为active线程，满足条件的为inactive线程;2、将满足分支的inactive线程激活为active，并执行满足分支;3、继续一同并行
如右图输出的，step2分支先执行，而后step1执行，最后一起执行step3
不满足的先走,满足的走,一起走
为了节约资源,尽量少使用分支语句

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;

// Kernel: branch-divergence demo. Threads with an even threadIdx.x print
// "step1", threads with an odd threadIdx.x print "step2", and afterwards
// every thread prints "step3".
// `array` is accepted but never read or written by this kernel.
static __global__ void test_kernel(int* array) {
    if (threadIdx.x % 2 == 0) {
        printf("step1 %d\n", threadIdx.x);
    }
    else {
        printf("step2 %d\n", threadIdx.x);
    }
    // Reached by all threads once both sides of the branch are done.
    printf("step3 %d\n", threadIdx.x);
}

// Allocates a zeroed managed array, runs the divergence kernel with one
// thread per element, prints the (untouched) array and releases it.
// Returns 0 on success, 1 if the allocation fails.
int main() {
    const int num = 5;
    int* array = nullptr;
    cudaError_t err = cudaMallocManaged(&array, sizeof(int) * num);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // The kernel never writes `array`, so give it defined contents before it
    // is printed below; the original printed memory nothing had initialised.
    for (int i = 0; i < num; ++i) {
        array[i] = 0;
    }
    test_kernel<<<1, num>>>(array);
    cudaDeviceSynchronize();
    for (int i = 0; i < num; ++i) {
        printf("array[%d] = %d\n", i, array[i]);
    }
    // The original never released the managed buffer.
    cudaFree(array);
    return 0;
}

结果
step2 1
step2 3
step1 0
step1 2
step1 4
step3 0
step3 1
step3 2
step3 3
step3 4
array[0] = 0
array[1] = 0
array[2] = 0
array[3] = 0
array[4] = 0

编程范例

规约求和

#include <stdio.h>
#include "cuda_runtime.h"

using namespace std;

// Kernel: block-level tree reduction followed by one atomic add per block.
//   array      - input values
//   sum_output - accumulator; each block atomically adds its partial sum
//   n          - number of valid elements in `array`
// Launch requirements: blockDim.x * sizeof(int) bytes of dynamic shared
// memory (one slot per thread). The stride is halved on every pass, so
// blockDim.x is expected to be a power of two.
static __global__ void test_kernel(int* array, int* sum_output, int n) {
    extern __shared__ int cache[];  // per-block scratch buffer

    const int tid = threadIdx.x;
    const int global_index = blockIdx.x * blockDim.x + tid;

    // Threads beyond the end of the input contribute 0.
    int partial = (global_index < n) ? array[global_index] : 0;

    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        cache[tid] = partial;
        __syncthreads();  // every thread has published its partial sum
        if (tid < stride) {
            partial += cache[tid + stride];
        }
        __syncthreads();  // every thread has read before the next overwrite
    }

    // Only thread 0 of each block reports and accumulates the block total.
    if (tid != 0) {
        return;
    }
    printf("block %d value = %d\n", blockIdx.x, partial);
    atomicAdd(sum_output, partial);
}

// Sums 0..num-1 on the GPU with test_kernel and prints the result next to a
// host-computed reference, then releases both managed buffers.
int main() {
    const int num = 10;
    int* array = nullptr;
    int* sum_output = nullptr;
    cudaMallocManaged(&array, sizeof(int) * num);
    cudaMallocManaged(&sum_output, sizeof(int));
    *sum_output = 0;

    // Fill the input and compute the reference sum on the host.
    int ground_truth = 0;
    for (int i = 0; i < num; ++i) {
        array[i] = i;
        ground_truth += i;
    }

    // Keep num_thread_per_block a power of two: the in-block reduction halves
    // its stride on every pass.
    int num_thread_per_block = 8;
    int num_block_per_grid = (num + num_thread_per_block - 1) / num_thread_per_block;
    printf("num_block_per_grid = %d,num_thread_per_block = %d\n", num_block_per_grid, num_thread_per_block);

    // One int of dynamic shared memory per thread in the block.
    int shared_bytes = sizeof(int) * num_thread_per_block;
    test_kernel<<<num_block_per_grid, num_thread_per_block, shared_bytes>>>(array, sum_output, num);

    // The kernel must have finished before the host reads *sum_output.
    cudaDeviceSynchronize();
    printf("sum output = %d, ground truth = %d\n", *sum_output, ground_truth);

    // The original never released the managed allocations.
    cudaFree(array);
    cudaFree(sum_output);
    return 0;
}

指令的等待延迟Latency Hiding1

当涉及指令时，会存在延迟。指令主要分为两种:
1.Arithmetic instruction，算术指令
2.Memory instruction，内存指令

他们之间的耗时一般估计为:
1.10-20 cycle for arithmetic operations
2.400-800 cycles for global memory accesses
算数计算相比内存访问而言要快很多。内存访问的合理性也容易成为系统性能影响的关键
这里内存访问指: global memory、shared memory、pinned memory、constant memory之间的选择造成的影响
而影响指令性能的另一个方面是，并行性优化。例如所有线程都执行了同一个算数指令时，其性能和吞吐最佳
再例如:
1.所有的线程都访问了同一个内存地址时，可以被合并为一个访问操作
2.所有的线程都访问了相邻内存，也可以合并为一次大的连续的内存访问
主要指内存访问，我们应当尽量减少内存的访问次数，方法是尽可能使得线程访问的是连续的内存区域，使得内存访问操作可以被并行优化

性能优化总结

1 .尽量warp内的线程访问的内存是连续的
2.尽量少的使用分支(if、switch)，造成部分线程inactive影响性能
3.block的大小设置应当是warp_size的整数倍
4.block不宜太小，一般为256、512
5.尽量使用sharedmemory做缓存，避免频繁的与globalmemory交互
6.pinnedmemory应该是内存复制到device的媒介，避免gpu中直接访问
7.善用constantmemory，对于常量性质的数据，可以利用并加速
8.尽量使用pinned memory(page-locked memory)，而非pageable memory
9.当需要简化代码内存管理时，可以使用cudaMallocManaged