diff options
| author | Jianhui Li <jian.hui.li@intel.com> | 2025-11-22 07:46:04 +0000 |
|---|---|---|
| committer | Jianhui Li <jian.hui.li@intel.com> | 2025-11-22 07:46:04 +0000 |
| commit | bfae01fa3f6453ee1d0f67e98c3d6c2b1fcee8f2 (patch) | |
| tree | c99e2184ee9d522577f5963dd24cd83639e4d884 | |
| parent | b3f2a4ab3d57ff906e03dd03b6365ba99d2169bf (diff) | |
propogation hornor pre-defined layout at anchor opusers/Jianhui-Li/XeGPU/anchor-op-layout
7 files changed, 328 insertions, 237 deletions
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 9ddc408a17f7..70c61a445e8a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -847,7 +847,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, - OptionalAttr<DistributeLayoutAttr>:$layout); + OptionalAttr<DistributeLayoutAttr>:$anchor_layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -906,7 +906,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::DistributeLayoutAttr": $layout)> + "xegpu::DistributeLayoutAttr": $anchor_layout)> ]; let hasVerifier = 1; @@ -991,7 +991,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, - OptionalAttr<DistributeLayoutAttr>:$layout); + OptionalAttr<DistributeLayoutAttr>:$anchor_layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index b3a780abd3f1..6d45a51ab026 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -387,6 +387,8 @@ private: ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results); + bool hasAnchorLayout(xegpu::DistributeLayoutAttr anchorLayout); + public: LayoutInfoPropagation(DataFlowSolver &solver, SymbolTableCollection &symbolTable, @@ -475,49 +477,72 @@ LogicalResult LayoutInfoPropagation::visitOperation( return success(); } +bool LayoutInfoPropagation::hasAnchorLayout( + xegpu::DistributeLayoutAttr anchorLayout) { + if (anchorLayout == nullptr) { + return false; + } + if (layoutKind == LayoutKind::InstData) { + return !(anchorLayout.getEffectiveInstDataAsInt().empty()); + } else if (layoutKind == LayoutKind::Lane) { + return !(anchorLayout.getEffectiveLaneLayoutAsInt().empty() || + anchorLayout.getEffectiveLaneDataAsInt().empty()); + } + return false; +} + void LayoutInfoPropagation::visitPrefetchNdOp( xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results) { - // Here we assign the default layout to the tensor descriptor operand of - // prefetch. - auto tdescTy = prefetch.getTensorDescType(); - - auto uArch = getUArch(getChipStr(prefetch).value_or("")); - const auto *uArchInstruction = - dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>( - uArch->getInstruction( - xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch)); - - auto blockWHC = - uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType()); - if (!blockWHC) - prefetch.emitWarning("No known block params found for the element type."); - auto [bWidth, bHeight, bCount] = blockWHC.value(); - SmallVector<int> instData; - int instWidth = xegpu::getLargestDivisor( - static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, - bCount); - if (instWidth == -1) - prefetch.emitWarning( - "No suitable instruction multiple found for the given shape."); - if (tdescTy.getRank() == 1) - instData = {instWidth}; - else { - int instHeight = xegpu::getLargestDivisor( - static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); - if (instHeight == -1) + + LayoutInfo prefetchLayout; + xegpu::DistributeLayoutAttr anchorLayout = prefetch.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + prefetchLayout = LayoutInfo(anchorLayout); + } else { + // Here we assign the default layout to the tensor descriptor operand of + // prefetch. + auto tdescTy = prefetch.getTensorDescType(); + + auto uArch = getUArch(getChipStr(prefetch).value_or("")); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch)); + + auto blockWHC = + uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType()); + if (!blockWHC) + prefetch.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector<int> instData; + int instWidth = xegpu::getLargestDivisor( + static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) prefetch.emitWarning( "No suitable instruction multiple found for the given shape."); - instData = {instHeight, instWidth}; - } - LayoutInfo prefetchLayout; - if (layoutKind == LayoutKind::InstData) - prefetchLayout = - LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData)); - else - prefetchLayout = getDefaultSIMTLayoutInfo( - tdescTy, uArch, uArchInstruction->getPackedFormatBitSize()); + if (tdescTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = xegpu::getLargestDivisor( + static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); + if (instHeight == -1) + prefetch.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } + + if (layoutKind == LayoutKind::InstData) + prefetchLayout = + LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData)); + else + prefetchLayout = getDefaultSIMTLayoutInfo( + tdescTy, uArch, uArchInstruction->getPackedFormatBitSize()); + prefetch.setAnchorLayoutAttr( + dyn_cast<xegpu::DistributeLayoutAttr>(prefetchLayout.get())); + } // Propagate the layout to the source tensor descriptor. propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); } @@ -617,69 +642,96 @@ void LayoutInfoPropagation::visitUpdateNdOffsetOp( void LayoutInfoPropagation::visitDpasOp( xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results) { - VectorType aTy = dpas.getLhsType(); - VectorType bTy = dpas.getRhsType(); - - auto uArch = getUArch(getChipStr(dpas).value_or("")); - const int subgroupSize = uArch->getSubgroupSize(); - const auto *uArchInstruction = - dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction( - xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc)); - - const unsigned dataALen = aTy.getShape().front(); - auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); - const int maxALen = - xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); - if (maxALen == -1) - dpas.emitWarning( - "No suitable instruction multiple found for the given shape."); - - const unsigned dataBLen = bTy.getShape().back(); - auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); - const int maxBLen = - xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); - if (maxBLen == -1) - dpas.emitWarning( - "No suitable instruction multiple found for the given shape."); - SmallVector<int> instDataA = {maxALen, subgroupSize}; - SmallVector<int> instDataB = {subgroupSize, maxBLen}; LayoutInfo dpasALayout; LayoutInfo dpasBLayout; LayoutInfo dpasCLayout; - if (layoutKind == LayoutKind::InstData) { - dpasALayout = - LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA)); - dpasBLayout = - LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB)); + xegpu::DistributeLayoutAttr anchorLayoutC = dpas.getAnchorLayoutCdAttr(); + if (hasAnchorLayout(anchorLayoutC)) { + xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getAnchorLayoutAAttr(); + xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getAnchorLayoutBAttr(); + assert(hasAnchorLayout(anchorLayoutA) && + "Expected anchor layout for DPAS A operand."); + assert(hasAnchorLayout(anchorLayoutB) && + "Expected anchor layout for DPAS B operand."); + dpasALayout = LayoutInfo(anchorLayoutA); + dpasBLayout = LayoutInfo(anchorLayoutB); + dpasCLayout = LayoutInfo(anchorLayoutC); + } else { - dpasALayout = getSIMTLayoutInfoForDPASOperand( - aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA()); - dpasBLayout = getSIMTLayoutInfoForDPASOperand( - bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB()); - } - propagateIfChanged(operands[0], operands[0]->meet(dpasALayout)); - propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout)); - if (operands.size() > 2) { - VectorType cTy = dpas.getAccType(); - const unsigned dataCLen = bTy.getShape().back(); - auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); - const int maxCLen = - xegpu::getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); - if (maxCLen == -1) + VectorType aTy = dpas.getLhsType(); + VectorType bTy = dpas.getRhsType(); + + auto uArch = getUArch(getChipStr(dpas).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction( + xegpu::uArch::InstructionKind::SubgroupMatrixMultiplyAcc)); + + const unsigned dataALen = aTy.getShape().front(); + auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); + const int maxALen = + xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); + if (maxALen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); - SmallVector<int> instDataC = {maxALen, maxCLen}; - if (layoutKind == LayoutKind::InstData) - dpasCLayout = - LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC)); - else - dpasCLayout = getSIMTLayoutInfoForDPASOperand( - cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB()); + const unsigned dataBLen = bTy.getShape().back(); + auto supportedBLen = uArchInstruction->getSupportedN(bTy.getElementType()); + + const int maxBLen = + xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); + + if (maxBLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector<int> instDataA = {maxALen, subgroupSize}; + SmallVector<int> instDataB = {subgroupSize, maxBLen}; + + if (layoutKind == LayoutKind::InstData) { + dpasALayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA)); + dpasBLayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB)); + } else { + dpasALayout = getSIMTLayoutInfoForDPASOperand( + aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA()); + dpasBLayout = getSIMTLayoutInfoForDPASOperand( + bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB()); + } + if (operands.size() > 2) { + VectorType cTy = dpas.getAccType(); + if (layoutKind == LayoutKind::InstData) { + const unsigned dataCLen = bTy.getShape().back(); + auto supportedCLen = + uArchInstruction->getSupportedN(bTy.getElementType()); + const int maxCLen = xegpu::getLargestDivisor( + dataCLen, ArrayRef<unsigned>(supportedCLen)); + if (maxCLen == -1) + dpas.emitWarning( + "No suitable instruction multiple found for the given shape."); + SmallVector<int> instDataC = {maxALen, maxCLen}; + dpasCLayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC)); + } else + dpasCLayout = getSIMTLayoutInfoForDPASOperand( + cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB()); + + dpas.setAnchorLayoutCdAttr( + dyn_cast<xegpu::DistributeLayoutAttr>(dpasCLayout.get())); + } + dpas.setAnchorLayoutAAttr( + dyn_cast<xegpu::DistributeLayoutAttr>(dpasALayout.get())); + dpas.setAnchorLayoutBAttr( + dyn_cast<xegpu::DistributeLayoutAttr>(dpasBLayout.get())); + } + + propagateIfChanged(operands[0], operands[0]->meet(dpasALayout)); + propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout)); + if (operands.size() > 2) { propagateIfChanged(operands[2], operands[2]->meet(dpasCLayout)); } } @@ -689,43 +741,51 @@ void LayoutInfoPropagation::visitStoreNdOp( xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results) { - auto uArch = getUArch(getChipStr(store).value_or("")); - const auto *uArchInstruction = - dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>( - uArch->getInstruction( - xegpu::uArch::InstructionKind::Subgroup2DBlockStore)); - VectorType dataTy = store.getValueType(); - auto blockWHC = uArchInstruction->getBlockWidthHeightCount( - store.getValueType().getElementType()); - if (!blockWHC) - store.emitWarning("No known block params found for the element type."); - auto [bWidth, bHeight, bCount] = blockWHC.value(); - SmallVector<int> instData; - int instWidth = xegpu::getLargestDivisor( - static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, - bCount); - if (instWidth == -1) - store.emitWarning( - "No suitable instruction multiple found for the given shape."); - if (dataTy.getRank() == 1) - instData = {instWidth}; - else { - int instHeight = xegpu::getLargestDivisor( - static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); - if (instHeight == -1) + LayoutInfo storeLayout; + xegpu::DistributeLayoutAttr anchorLayout = store.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + storeLayout = LayoutInfo(anchorLayout); + } else { + auto uArch = getUArch(getChipStr(store).value_or("")); + const auto *uArchInstruction = + dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>( + uArch->getInstruction( + xegpu::uArch::InstructionKind::Subgroup2DBlockStore)); + VectorType dataTy = store.getValueType(); + auto blockWHC = uArchInstruction->getBlockWidthHeightCount( + store.getValueType().getElementType()); + if (!blockWHC) + store.emitWarning("No known block params found for the element type."); + auto [bWidth, bHeight, bCount] = blockWHC.value(); + SmallVector<int> instData; + int instWidth = xegpu::getLargestDivisor( + static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, + bCount); + if (instWidth == -1) store.emitWarning( "No suitable instruction multiple found for the given shape."); - instData = {instHeight, instWidth}; - } + if (dataTy.getRank() == 1) + instData = {instWidth}; + else { + int instHeight = xegpu::getLargestDivisor( + static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); + if (instHeight == -1) + store.emitWarning( + "No suitable instruction multiple found for the given shape."); + instData = {instHeight, instWidth}; + } - LayoutInfo storeLayout; - if (layoutKind == LayoutKind::InstData) - storeLayout = - LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData)); - else - storeLayout = - getDefaultSIMTLayoutInfo(store.getValueType(), uArch, - uArchInstruction->getPackedFormatBitSize()); + if (layoutKind == LayoutKind::InstData) + storeLayout = + LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData)); + else + storeLayout = + getDefaultSIMTLayoutInfo(store.getValueType(), uArch, + uArchInstruction->getPackedFormatBitSize()); + store.setAnchorLayoutAttr( + dyn_cast<xegpu::DistributeLayoutAttr>(storeLayout.get())); + } + // Propagate the layout to the value operand. // Both operands should have the same layout for (LayoutInfoLattice *operand : operands) propagateIfChanged(operand, operand->meet(storeLayout)); @@ -736,21 +796,31 @@ void LayoutInfoPropagation::visitStoreNdOp( void LayoutInfoPropagation::visitLoadNdOp( xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results) { - LayoutInfo valueLayout = results[0]->getValue(); - // Need the layout of the value to propagate to the tensor descriptor. - if (!valueLayout.isAssigned()) - return; - LayoutInfo tensorDescLayout = valueLayout; - // LoadNdOp has the transpose effect. However, at the stage of this analysis - // this effect is not expected and should be abstracted away. Emit a - // warning. - if (auto transpose = load.getTranspose()) { - load.emitWarning("Transpose effect is not expected for LoadNdOp at " - "LayoutInfoPropagation stage."); - tensorDescLayout = valueLayout.transpose(transpose.value()); + + LayoutInfo loadLayout; + xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + loadLayout = LayoutInfo(anchorLayout); + } else { + + LayoutInfo valueLayout = results[0]->getValue(); + // Need the layout of the value to propagate to the tensor descriptor. + if (!valueLayout.isAssigned()) + return; + loadLayout = valueLayout; + // LoadNdOp has the transpose effect. However, at the stage of this analysis + // this effect is not expected and should be abstracted away. Emit a + // warning. + if (auto transpose = load.getTranspose()) { + load.emitWarning("Transpose effect is not expected for LoadNdOp at " + "LayoutInfoPropagation stage."); + loadLayout = valueLayout.transpose(transpose.value()); + } + load.setAnchorLayoutAttr( + dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get())); } // Propagate the new layout to the tensor descriptor operand. - propagateIfChanged(operands[0], operands[0]->meet(tensorDescLayout)); + propagateIfChanged(operands[0], operands[0]->meet(loadLayout)); } /// For vector::TransposeOp, the layout of the result is transposed and @@ -840,37 +910,49 @@ void LayoutInfoPropagation::visitVectorBitcastOp( void LayoutInfoPropagation::visitLoadGatherOp( xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results) { - // The layout is strictly determined by the payload type. - auto payloadTy = dyn_cast<VectorType>(load.getValueType()); - if (!payloadTy) { - load.emitWarning("Not propagating, non-vector payload supplied."); - return; - } - auto uArch = getUArch(getChipStr(load).value_or("")); - const int subgroupSize = uArch->getSubgroupSize(); - SmallVector<int> instData{subgroupSize}; - if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1) - instData.push_back(chunkSize); - else if (auto srcTdescTy = - dyn_cast<xegpu::TensorDescType>(load.getSourceType())) { - if (srcTdescTy.getChunkSizeAsInt() > 1) + + LayoutInfo loadLayout; + LayoutInfo maskLayout; + xegpu::DistributeLayoutAttr anchorLayout = load.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + loadLayout = LayoutInfo(anchorLayout); + maskLayout = loadLayout; + } else { + + // The layout is strictly determined by the payload type. + auto payloadTy = dyn_cast<VectorType>(load.getValueType()); + if (!payloadTy) { + load.emitWarning("Not propagating, non-vector payload supplied."); + return; + } + auto uArch = getUArch(getChipStr(load).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + SmallVector<int> instData{subgroupSize}; + if (auto chunkSize = load.getChunkSize().value_or(0); chunkSize > 1) instData.push_back(chunkSize); - } - LayoutInfo layout; - if (layoutKind == LayoutKind::InstData) - layout = LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData)); - else - layout = getDefaultSIMTLayoutInfo(payloadTy, uArch, - uArch->getGeneralPackedFormatBitSize(), - /*scattered*/ true); - - // Mask operand should have 1D default layout. - LayoutInfo maskLayout = - getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); + else if (auto srcTdescTy = + dyn_cast<xegpu::TensorDescType>(load.getSourceType())) { + if (srcTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + + if (layoutKind == LayoutKind::InstData) + loadLayout = + LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData)); + else + loadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(), + /*scattered*/ true); + + // Mask operand should have 1D default layout. + maskLayout = getDefaultSIMTLayoutInfo(load->getContext(), 1, subgroupSize); + load.setAnchorLayoutAttr( + dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get())); + } // Propagate the new layout to the tensor descriptor operand. if (isa<xegpu::TensorDescType>(load.getSourceType())) - propagateIfChanged(operands[0], operands[0]->meet(layout)); + propagateIfChanged(operands[0], operands[0]->meet(loadLayout)); // Propagate the new layout to the mask and optional offset operand. propagateIfChanged(operands[1], operands[1]->meet(maskLayout)); if (load.getOffsets()) @@ -898,21 +980,26 @@ void LayoutInfoPropagation::visitCreateDescOp( void LayoutInfoPropagation::visitStoreScatterOp( xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands, ArrayRef<const LayoutInfoLattice *> results) { - // Currently, for 2D StoreScatterOp we expect that the height dimension of - // the tensor descriptor is equal to the subgroup size. This is ensured by - // the op verifier. - auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType()); - if (!payloadTy) { - storeScatter.emitWarning("Not propagating, non-vector payload supplied."); - return; - } - LayoutInfo payloadLayout; - auto uArch = getUArch(getChipStr(storeScatter).value_or("")); - const int subgroupSize = uArch->getSubgroupSize(); - if (auto layout = storeScatter.getLayoutAttr()) { - payloadLayout = LayoutInfo(layout); + LayoutInfo payloadLayout; + LayoutInfo maskLayout; + xegpu::DistributeLayoutAttr anchorLayout = storeScatter.getAnchorLayoutAttr(); + if (hasAnchorLayout(anchorLayout)) { + payloadLayout = LayoutInfo(anchorLayout); + maskLayout = payloadLayout; } else { + // Currently, for 2D StoreScatterOp we expect that the height dimension of + // the tensor descriptor is equal to the subgroup size. This is ensured by + // the op verifier. + auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType()); + if (!payloadTy) { + storeScatter.emitWarning("Not propagating, non-vector payload supplied."); + return; + } + + auto uArch = getUArch(getChipStr(storeScatter).value_or("")); + const int subgroupSize = uArch->getSubgroupSize(); + if (layoutKind == LayoutKind::InstData) { SmallVector<int> instData{subgroupSize}; if (auto chunkSize = storeScatter.getChunkSize().value_or(0); @@ -936,10 +1023,13 @@ void LayoutInfoPropagation::visitStoreScatterOp( payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(), /*scattered=*/true); } - } - LayoutInfo maskLayout = - getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); + maskLayout = + getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); + + storeScatter.setAnchorLayoutAttr( + dyn_cast<xegpu::DistributeLayoutAttr>(payloadLayout.get())); + } // Propagate the payload operand layout propagateIfChanged(operands[0], operands[0]->meet(payloadLayout)); // Propagate the destination (if tdesc) operand layout diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index b0b748c3409c..c644f784606e 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } - auto layout = op.getLayoutAttr(); + auto layout = op.getAnchorLayoutAttr(); if (layout) layout = layout.dropInstData(); @@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets SmallVector<Value> convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); - auto layout = op.getLayoutAttr(); + auto layout = op.getAnchorLayoutAttr(); if (layout) layout = layout.dropInstData(); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 4fe35a16b399..572e5442760b 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -147,7 +147,7 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { // check for "permament" layout only after "temporary" layout name lookup // for backward compatibility if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp)) - return loadGatherOp.getLayoutAttr(); + return loadGatherOp.getAnchorLayoutAttr(); } if (auto arg = dyn_cast<BlockArgument>(value)) { @@ -178,7 +178,7 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { // check for "permament" layout only after "temporary" layout name lookup if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op)) - if (auto layout = storeScatterOp.getLayoutAttr()) + if (auto layout = storeScatterOp.getAnchorLayoutAttr()) return layout; return getDistributeLayoutAttr(opr.get()); @@ -193,7 +193,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, xegpu::DistributeLayoutAttr candidate = layout; if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) { - if (auto perm = loadOp.getLayoutAttr()) + if (auto perm = loadOp.getAnchorLayoutAttr()) candidate = perm; } @@ -211,7 +211,7 @@ maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) { if (idx == 0) { - if (auto perm = storeOp.getLayoutAttr()) + if (auto perm = storeOp.getAnchorLayoutAttr()) candidate = perm; } } diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir index c31ef323a94d..62a33a4797d2 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -5,14 +5,14 @@ // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]> // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout<inst_data = [16, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [16, 16]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout<inst_data = [8, 16]>, anchor_layout_b = #xegpu.layout<inst_data = [16, 16]>, anchor_layout_cd = #xegpu.layout<inst_data = [8, 16]>, layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<inst_data = [8, 16]>> gpu.module @test { func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { @@ -46,7 +46,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : + //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout<inst_data = [8, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [8, 16]>} : //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout<inst_data = [8, 16]>> -> vector<16x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> @@ -85,7 +85,7 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} : + //CHECK: xegpu.load_nd {{.*}} <{anchor_layout = #xegpu.layout<inst_data = [4, 16]>}> {layout_result_0 = #xegpu.layout<inst_data = [4, 16]>} : //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout<inst_data = [4, 16]>> -> vector<12x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> @@ -113,9 +113,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> -// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> +// CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}> // CHECK-SAME: {layout_result_0 = #xegpu.layout<inst_data = [16, 8]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{anchor_layout = #xegpu.layout<inst_data = [16, 8]>, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index eb004932af4b..d1bee47dd6d3 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -6,14 +6,14 @@ gpu.module @test { // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32> // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, anchor_layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -32,7 +32,8 @@ func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: me gpu.module @test { // CHECK-LABEL: func.func @dpas_i8( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: vector<8x32xi8>, %[[ARG1:[0-9a-zA-Z]+]]: vector<32x16xi8>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xi32>) { -// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], +// CHECK: %[[T0:.*]] = xegpu.dpas %[[ARG0]], %[[ARG1]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} + func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) { %c0 = arith.constant 0 : index %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> @@ -46,8 +47,8 @@ func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memre gpu.module @test { // CHECK-LABEL: func.func @load_with_transpose_effect( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{transpose = array<i64: 1, 0>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : -// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>> -> vector<16x16xf16> +// CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -108,7 +109,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> // CHECK-NEXT: %[[T2:.*]] = xegpu.create_tdesc %[[ARG1]], %[[CST]] : memref<256xf16>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T2]], %[[CST0]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.scatter_tdesc_attr<chunk_size = 16 : i64>, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>, vector<16xi1> -> vector<16x16xf16> func.func @load_gather_with_chunksize(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index @@ -135,7 +136,7 @@ gpu.module @test { // CHECK-NEXT: %[[CST0:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> // CHECK-NEXT: %[[T0:.*]] = xegpu.create_tdesc %[[ARG0]], %[[CST]] : memref<256xf32>, vector<16xindex> -> // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : +// CHECK-NEXT: %{{.*}} = xegpu.load %[[T0]], %[[CST0]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>, vector<16xi1> -> vector<16xf32> func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) { %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex> @@ -183,9 +184,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> -// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, chunk_size = 8 : i64}> // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> -// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>, chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -204,7 +205,7 @@ gpu.module @test { // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK: xegpu.store %[[LOAD_VEC]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -220,10 +221,10 @@ gpu.module @test { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> // CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-SAME <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -239,11 +240,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> // CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> -// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] // CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> -// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xf16> // CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] -// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +// CHECK-SAME <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> %offset = arith.constant dense<12> : vector<16xindex> @@ -256,9 +257,9 @@ func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( -// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} +// CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16> -// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} +// CHECK: %[[LOAD1:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} // CHECK-SAME: !xegpu.tensor_desc<16x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xi16> // CHECK: %{{.*}} = vector.bitcast %[[LOAD0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: vector<8x16xi16> to vector<8x16xf16> @@ -281,7 +282,7 @@ func.func @vector_bitcast_i16_to_f16(%arg0: memref<8x16xi16>, %arg1: memref<16x1 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i32_to_f16( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} // CHECK-SAME: vector<16x8xi32> to vector<16x16xf16> @@ -302,7 +303,7 @@ func.func @vector_bitcast_i32_to_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x8 // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_i32( -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %{{.*}} <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} // CHECK-SAME: !xegpu.tensor_desc<8x32xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>> -> vector<8x32xi16> // CHECK-NEXT: %{{.*}} = vector.bitcast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: vector<8x32xi16> to vector<8x16xi32> @@ -339,9 +340,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> -// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK-NEXT: %[[T2:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> // CHECK-NEXT: %{{.*}} = arith.addf %[[T1]], %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : vector<16x16xf16> func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) { @@ -362,9 +363,9 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { // CHECK: %[[T2:.*]] = arith.addf %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16> -// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> -// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK: %[[T3:.*]] = xegpu.dpas %{{.*}}, %[[T2]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> +// CHECK-NEXT: xegpu.store_nd %[[T3]], %[[ARG2]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK-NEXT: xegpu.store_nd %[[T2]], %[[ARG3]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) { %0 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> %1 = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> @@ -385,11 +386,11 @@ gpu.module @test { // CHECK-NEXT: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} dense<0.000000e+00> : vector<8x16xf32> // CHECK-NEXT: %[[T2:.*]]:3 = scf.for %{{.*}} iter_args(%[[ARG4:.*]] = %[[T0]], %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[CST]]) -> // CHECK-SAME: (!xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32>) { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG4]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xf16> -// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK-NEXT: %[[T5:.*]] = xegpu.load_nd %[[ARG5]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> -// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-NEXT: %[[T6:.*]] = xegpu.dpas %[[T4]], %[[T5]], %[[ARG6]] {anchor_layout_a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, anchor_layout_b = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>, anchor_layout_cd = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> // CHECK-NEXT: %[[T7:.*]] = xegpu.update_nd_offset %[[ARG4]], [{{.*}}] : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NEXT: %[[T8:.*]] = xegpu.update_nd_offset %[[ARG5]], [{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> @@ -397,7 +398,7 @@ gpu.module @test { // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, vector<8x16xf32> // CHECK-NEXT: } {layout_result_2 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-NEXT: %[[T3:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK-NEXT: xegpu.store_nd %[[T2]]#2, %[[T3]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index @@ -425,11 +426,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>>, // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { // CHECK: %{{.*}} = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} @@ -455,11 +456,11 @@ gpu.module @test { // CHECK-SAME: %[[ARG2:[0-9a-zA-Z]+]]: i1, %[[ARG3:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG4:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { // CHECK: %[[T1:.*]] = scf.if %[[ARG2]] -> (vector<16x16xf16>) { -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T3]] : vector<16x16xf16> // CHECK-NEXT: } else { -// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : +// CHECK-NEXT: %[[T4:.*]] = xegpu.load_nd %[[ARG1]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> // CHECK-NEXT: scf.yield %[[T4]] : vector<16x16xf16> // CHECK-NEXT: } {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} @@ -539,7 +540,7 @@ gpu.module @test { // CHECK-LABEL: func.func @prefetch_2d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}, %{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> func.func @prefetch_2d(%arg0: memref<256x256xf16>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> @@ -552,7 +553,7 @@ gpu.module @test { // CHECK-LABEL: func.func @prefetch_1d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> -// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> +// CHECK-NEXT: xegpu.prefetch_nd %[[T0]] <{anchor_layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16, #xegpu.layout<lane_layout = [16], lane_data = [1]>> func.func @prefetch_1d(%arg0: memref<256xf16>){ %c0 = arith.constant 0 : index %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> @@ -599,7 +600,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim1_distributed( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> @@ -621,7 +622,7 @@ gpu.module @test { // CHECK-LABEL: func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) { -// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %arg0 <{anchor_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> // CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}} // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 3760737cf51f..171cadeeaeaf 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -286,7 +286,7 @@ gpu.module @test_distribution { // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16> // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1> - // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}> + // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{anchor_layout = #xegpu.layout<inst_data = [8]>, chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>, // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> @@ -554,7 +554,7 @@ gpu.module @test_distribution { %offset = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [32], inst_data = [16]> } dense<1> : vector<256xi1> - // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>}> + // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{anchor_layout = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>, chunk_size = 1 : i64}> // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout<inst_data = [8, 16]>, dims = [0]>} : // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 8], sg_data = [32, 32], inst_data = [8, 16]>, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> |
