Bootstrap

昇腾AI原生创新算子挑战赛(S1赛季)复盘- Addcmul算子

  • 很亢奋,今年终迎来了算子大比。在交流前先感谢算子相关工作人员的辛勤付出,才能有牛刀小试的机会,辛苦了
  • 其次,这里分享的个人的一点小小的不成器思路,如果有更好的解题思路,或者优化方案,还望大家多交流,不吝指教

  • 具体的Pytorch调用公式如下,公式计算不难
    • 首先要考虑value在Pytorch是标量,但是案例中是tensor
    • 那么我们需要tensor输入转value标量,再调用标量乘法API
    • 通过tiling引入plusvalue变量并输入this->plusvalue

half buf = static_cast<half>(this->plusvalue);
Muls(calcBuf1Local,calcBuf1Local,buf,this->tileLength);复制

cke_33519.png

  • 关于这道题目,官方提供了5个测试案例,个人认为第四题是当仁不让的难中之王,理由如下
    • 需要考虑INT8的格式转换,查找官方文档Muls计算并不支持INT8格式
    • Atlas 200/500 A2推理产品,支持的数据类型为:Tensor(half/int16_t/float/int32_t)
    • 所以个人思路是需要将输入INT8格式转换为half类型或者其他类型,进行运算
    • 最后将运算后的结果重新再转换为INT8格式输出

cke_209979.png

cke_83928.png

  • 具体的一个API思路如下
Cast(calcBuf1Local,x1LocalLocal,RoundMode::CAST_NONE,this->tileLength);
Cast(calcBuf2Local,x2LocalLocal,RoundMode::CAST_NONE,this->tileLength);
    
Mul(calcBuf1Local,calcBuf1Local,calcBuf2Local,this->tileLength);

half buf = static_cast<half>(this->plusvalue);
Muls(calcBuf1Local,calcBuf1Local,buf,this->tileLength);
Cast(calcBuf2Local,input_dataLocal,RoundMode::CAST_NONE,this->tileLength);
Add(calcBuf1Local,calcBuf1Local,calcBuf2Local, this->tileLength);
Cast(yLocal,calcBuf1Local,RoundMode::CAST_CEIL,this->tileLength);复制

  • 类型转换API完成,那么如何嵌入主题代码呢?
  • 个人思路是通过tiling中对类型进行识别再通过 this->inputtype 自定义变量来辨别计算公式
  __aicore__ inline void Process() {

    CopyInit();
    int32_t loopCount = this->tileNumy * BUFFER_NUM;
    if (this->inputtype != 2) {
      for (int32_t i = 0; i < loopCount; i++) {
        CopyIn(i);
        Compute(i);
        CopyOut(i);
      }
    }
    else if (this->inputtype == 2) {
      for (int32_t i = 0; i < loopCount; i++) {
        CopyIn(i);
        Compute_int8(i);
        CopyOut(i);
    }
    }

  }复制

  • 前面提到类型识别,具体API是什么呢?
  • dt 与 数字绑定 在文档里搜索就可以
  auto dt = context->GetInputTensor(0)->GetDataType();//获取DataType类型的属性值。
  tiling.set_inputtype(dt);//如果真要改,这里还要考虑多数据类型的计算
  if (dt == 1) {//DT_FLOAT16 = 1, // fp16 type
    sizeofdatatype = 2;
  }else if (dt == 0) {// DT_FLOAT = 0,  // float type
    sizeofdatatype = 4;
  }else if (dt == 2){ // DT_INT8 = 2,    // int8 type
    sizeofdatatype = 1;
  }else if(dt == 3){ //  DT_INT32 = 3
    sizeofdatatype = 4;
  }else
  {
    return ge::GRAPH_FAILED;
  }复制

  • 解决了类型转换后,最重要的问题就是如何解决{1,32}Tensor与{32,32}Tensor相乘问题
  • 解决方案就是需要广播,但是不幸的是AscendC广播指令具体API在下一个版本才公布,那么不就是瞎子摸象
  • 个人的思路是只解决二维Tensor之间的乘法,更高维的Tensor暂时不支持
  • 解决方案就是暴力拆解
    • 如果是{1,32}1行32列,那么将行复制32次,构成{32,32}
    • 如果是{32,1}32行1列,那么讲列复制32次,构成{32,32}
  • 看上去很简单,但是实施起来还要考虑很多问题
    • GM 与 LocalTensor之间的内存交换问题
    • BUFFER_NUM乒乓操作的内存分割问题
    • tileLength和lasttileLength的内存使用问题
    • 数据对齐后使得blocklength变大,取值难度增加问题
  • 这里只粘贴部分代码展示思路
 if (BUFFER_NUM == 2) {
      //开启double
      //buffer时,由于将输入数据分成了相等的2部分,分块大小为不开启double
      //buffer的一半, 所以需要对最后两个分块数据的起始地址做处理
      if ((progress == (this->tileNumy * BUFFER_NUM - 2)) ||
          (progress == (this->tileNumy * BUFFER_NUM - 1))) {
        //分块大小变为tileLength的一半
        //倒数第2个分块数据的起始地址向前移动(tileLength-lasttileLength),最后一个分块的起始地址以此为基础进行移动

        // DataCopy(inLocal[this->tileLength + this->tileLength1],x2Gm[(progress - 2) * (this->tileLength2) + this->lasttileLength2],(this->tileLength2));
        if(this->OneX2Buf[1] == 1)
        {
          // DataCopy(inLocal[this->tileLength + this->tileLength1],x2Gm[((progress - 2) * (this->tileLengthy) + this->lasttileLengthy)%(this->axis0)], this->tileLengthy);
          for (i = 0;i<this->tileLengthy;i++)
          {
            inLocal.SetValue(this->tileLength + this->tileLength1 + i,x2Gm.GetValue(((progress - 2) * (this->tileLengthy) + this->lasttileLengthy+i)%(this->axis0)));
          }
        }
        else if(this->OneX2Buf[0] == 1)
        {
          if(((progress + 1)* this->tileLengthy) > (this->axis0 * this->axisy))
          {
            for (i = 0;i<(this->axis0 * this->axisy - ((progress)* this->tileLengthy));i++)
            {
              inLocal.SetValue(this->tileLength + this->tileLength1 + i,x2Gm.GetValue(this->axisy-1));
            }
            for (i = 0;i<(((progress + 1)* this->tileLengthy) - this->axis0 * (this->axisy));i++)
            {          
              inLocal.SetValue(this->tileLength + this->tileLength1 + i + (this->axis0 * this->axisy - ((progress)* this->tileLengthy)),x2Gm.GetValue(this->axisy));
            }
            this->axisy = this->axisy + 1;
          }
          else if (((progress + 1)* this->tileLengthy) == (this->axis0 * this->axisy))
          {
            for (i = 0;i<this->tileLengthy;i++)
            {
              inLocal.SetValue(this->tileLength + this->tileLength1 + i,x2Gm.GetValue(this->axisy-1));
            }
            this->axisy = 1;
          }
          else if (((progress + 1)*this->tileLengthy) < (this->axis0 * this->axisy))
          {
            for (i = 0;i<this->tileLengthy;i++)
            {
              inLocal.SetValue(this->tileLength + this->tileLength1 + i,x2Gm.GetValue(this->axisy-1));
            }
          }
        }
        else{
          DataCopy(
              inLocal[this->tileLength + this->tileLength1],
              x2Gm[(progress - 2) * (this->tileLengthy) + this->lasttileLengthy],
              (this->tileLengthy));
        }        
      }
      else {
        // inLocal.SetValue(this->tileLength + this->tileLength1+i,calcBuf4Local.GetValue(progress * this->tileLengthy +i));
        if(this->OneX2Buf[1] == 1)
        {
          // DataCopy(inLocal[this->tileLength + this->tileLength1],x2Gm[((progress )* this->tileLengthy)%(this->axis0)], this->tileLengthy);
          for (i = 0;i<this->tileLengthy;i++)
          {
            inLocal.SetValue(this->tileLength + this->tileLength1 + i,x2Gm.GetValue(((progress )* this->tileLengthy + i)%(this->axis0)));
          }
        }
        else if(this->OneX2Buf[0] == 1)
        {
          if(((progress + 1)* this->tileLengthy) > (this->axis0 * this->axisy))
          {
            for (i = 0;i<(this->axis0 * this->axisy - ((progress)* this->tileLengthy));i++)
            {
              inLocal.SetValue(this->tileLength + this->tileLength1 + i,x2Gm.GetValue(this->axisy-1));
            }
            for (i = 0;i<(((progress + 1)* this->tileLengthy) - this->axis0 * (this->axisy));i++)
            {          
              inLocal.SetValue(this->tileLength + this->tileLength1 + i + (this->axis0 * this->axisy - ((progress)* this->tileLengthy)),x2Gm.GetValue(this->axisy));
            }
            this->axisy = this->axisy + 1;
          }
          else if (((progress + 1)* this->tileLengthy) == (this->axis0 * this->axisy))
          {
            for (i = 0;i<this->tileLengthy;i++)
            {
              inLocal.SetValue(this->tileLength + this->tileLength1 + i,x2Gm.GetValue(this->axisy-1));
            }
            this->axisy = this->axisy + 1;
          }
          else if (((progress + 1)*this->tileLengthy) < (this->axis0 * this->axisy))
          {
            for (i = 0;i<this->tileLengthy;i++)
            {
              inLocal.SetValue(this->tileLength + this->tileLength1 + i,x2Gm.GetValue(this->axisy-1));
            }
          }
        }
        else{
          DataCopy(inLocal[this->tileLength + this->tileLength1],x2Gm[progress * this->tileLengthy], this->tileLengthy);
        }
        
      }
    }


    inQueueIN.EnQue(inLocal);
  }复制

  • 解决了这个问题后还有一个大问题需要解决
    • Tensor的范围是【-10,10】
    • 这就有问题了,可能会超过范围,那么溢出时,得到数值是多少,会跟C编译器一样吗?
    • 这个见文档可知,cast 将src按照round_mode取整,以int8_t格式(溢出默认按照饱和处理)2^7-1存入dst中
    • 这里本人并没有着手处理,只是修改范围【-5,5】然后测试

  • 在编程过程中肯定遇到很多迷之BUG,那么如何定位呢?
    • op_host直接printf
    • op_kernel ,dump_tensor-PRINTF本版本暂不支持,只能看看/var/log/npu/slog/debug/device-app-974443
      • 但是个人看来,定位能力还是较弱
      • 如果有更好定位方式,望告知

  • 最后展示下5个测试案例的测试结果
  • 案例一

 

cke_1819397.png

  • 案例二

cke_1835269.png

  •  案例三

cke_1877908.png

  • 案例四

cke_1898725.png

  • 案例五

cke_1915215.png

;