//===- TensorTilingInterface.cpp - Tiling Interface models *- C++ ------*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/TilingInterface.h" using namespace mlir; using namespace mlir::tensor; namespace { struct PadOpTiling : public TilingInterface::ExternalModel { SmallVector getLoopIteratorTypes(Operation *op) const { auto padOp = cast(op); SmallVector iteratorTypes( padOp.getResultType().getRank(), utils::IteratorType::parallel); return iteratorTypes; } SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { ReifiedRankedShapedTypeDims reifiedShapes; (void)reifyResultShapes(b, op, reifiedShapes); OpFoldResult zero = b.getIndexAttr(0); OpFoldResult one = b.getIndexAttr(1); // Initialize all the ranges to {zero, one, one}. All the `ub`s are // overwritten. SmallVector loopRanges(reifiedShapes[0].size(), {zero, one, one}); for (const auto &ub : enumerate(reifiedShapes[0])) loopRanges[ub.index()].size = ub.value(); return loopRanges; } FailureOr getTiledImplementation(Operation *op, OpBuilder &b, ArrayRef offsets, ArrayRef sizes) const { FailureOr result = tensor::bubbleUpPadSlice(b, cast(op), offsets, sizes); if (failed(result)) return failure(); return result.value(); } LogicalResult getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes, SmallVector &resultOffsets, SmallVector &resultSizes) const { resultOffsets.assign(offsets.begin(), offsets.end()); resultSizes.assign(sizes.begin(), sizes.end()); return success(); } LogicalResult getIterationDomainTileFromResultTile( Operation *op, OpBuilder &b, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes, SmallVectorImpl &iterDomainOffsets, SmallVectorImpl &iterDomainSizes) const { iterDomainOffsets.assign(offsets.begin(), offsets.end()); iterDomainSizes.assign(sizes.begin(), sizes.end()); return success(); } FailureOr generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes) const { return getTiledImplementation(op, b, offsets, sizes); } }; } // namespace FailureOr tensor::bubbleUpPadSlice(OpBuilder &b, tensor::PadOp padOp, ArrayRef offsets, ArrayRef sizes, bool generateZeroSliceGuard) { // Only constant padding value supported. Value padValue = padOp.getConstantPaddingValue(); if (!padValue) return failure(); // Helper variables and functions for various arithmetic operations. These // are used extensively for computing new offset/length and padding values. Location loc = padOp->getLoc(); AffineExpr dim0, dim1; bindDims(b.getContext(), dim0, dim1); // Subtract two integers. auto subMap = AffineMap::get(2, 0, {dim0 - dim1}); auto sub = [&](OpFoldResult v1, OpFoldResult v2) { return affine::makeComposedFoldedAffineApply(b, loc, subMap, {v1, v2}); }; // Take the minimum of two integers. auto idMap = AffineMap::getMultiDimIdentityMap(2, b.getContext()); auto min = [&](OpFoldResult v1, OpFoldResult v2) { return affine::makeComposedFoldedAffineMin(b, loc, idMap, {v1, v2}); }; // Take the maximum of two integers. auto max = [&](OpFoldResult v1, OpFoldResult v2) { return affine::makeComposedFoldedAffineMax(b, loc, idMap, {v1, v2}); }; // Zero index-typed integer. OpFoldResult zero = b.getIndexAttr(0); // Compute new offsets, lengths, low padding, high padding. SmallVector newOffsets, newLengths; SmallVector newLows, newHighs; // Set to true if the original data source is not read at all. bool hasZeroLen = false; // Same as hasZeroLen, but for dynamic dimension sizes. This condition // is true if the original data source turns out to be unused at runtime. Value dynHasZeroLenCond; int64_t rank = padOp.getSourceType().getRank(); // Only unit stride supported. SmallVector newStrides(rank, b.getIndexAttr(1)); for (unsigned dim = 0; dim < rank; ++dim) { auto low = padOp.getMixedLowPad()[dim]; bool hasLowPad = !isZeroInteger(low); auto high = padOp.getMixedHighPad()[dim]; bool hasHighPad = !isZeroInteger(high); auto offset = offsets[dim]; auto length = sizes[dim]; // If the dim has no padding, we dont need to calculate new values for that // dim as the exisiting ones are correct even after the pattern. if (!hasLowPad && !hasHighPad) { newOffsets.push_back(offset); newLengths.push_back(length); newLows.push_back(low); newHighs.push_back(high); continue; } auto srcSize = tensor::getMixedSize(b, loc, padOp.getSource(), dim); // The new amount of low padding is `low - offset`. Except for the case // where none of the low padding is read. In that case, the new amount of // low padding is zero. // // Optimization: If low = 0, then newLow = 0. OpFoldResult newLow = hasLowPad ? max(zero, sub(low, offset)) : zero; newLows.push_back(newLow); // Start reading the data from position `offset - low`. Since the original // read may have started in the low padding zone, this value could be // negative. Therefore, start reading from: // // max(offset - low, 0) // // The original read could also have started in the high padding zone. // In that case, set the offset to the end of source tensor. The new // ExtractSliceOp length will be zero in that case. (Effectively reading // no data from the source.) // // Optimization: If low = 0, then the formula can be simplified. OpFoldResult newOffset = hasLowPad ? min(max(sub(offset, low), zero), srcSize) : min(offset, srcSize); newOffsets.push_back(newOffset); // The original ExtractSliceOp was reading until position `offset + // length`. Therefore, the corresponding position within the source tensor // is: // // offset + length - low // // In case the original ExtractSliceOp stopped reading within the low // padding zone, this value can be negative. In that case, the end // position of the read should be zero. (Similar to newOffset.) // // The original read could also have stopped in the high padding zone. // In that case, set the end positition of the read should be the end of // the source tensor. (Similar to newOffset.) // srcSize - newOffset represents how much length we have available // and length - newLow represents how much length we want at most. // Note that there are many ways to order this indexing math to compute // newLength, but we want to make sure that the final affine.min ops in the // sequence are bounding the index to as small a value as possible. If // ValueBoundsOpInterface is used, this calculation will get upper bounds // from the affine.min ops, so we want to use the smallest known value to // set the bound at the end of the computation sequence. In this case, the // index will be upper bounded by length - newLow. OpFoldResult newLength = min(sub(srcSize, newOffset), sub(length, newLow)); // Optimization: If low = 0, then newLow = 0. then newLength >= 0 assuming // length >= 0. if (hasLowPad) newLength = max(newLength, zero); newLengths.push_back(newLength); // Check if newLength is zero. In that case, no SubTensorOp should be // executed. if (isZeroInteger(newLength)) { hasZeroLen = true; } else if (!hasZeroLen) { Value check = arith::CmpIOp::create( b, loc, arith::CmpIPredicate::eq, getValueOrCreateConstantIndexOp(b, loc, newLength), getValueOrCreateConstantIndexOp(b, loc, zero)); dynHasZeroLenCond = dynHasZeroLenCond ? arith::OrIOp::create(b, loc, check, dynHasZeroLenCond) : check; } // The amount of high padding is simply the number of elements remaining, // so that the result has the same length as the original ExtractSliceOp. // As an optimization, if the original high padding is zero, then the new // high padding must also be zero. OpFoldResult newHigh = hasHighPad ? sub(sub(length, newLength), newLow) : zero; newHighs.push_back(newHigh); } // The shape of the result can be obtained from the sizes passed in. SmallVector dynDims; SmallVector shape; dispatchIndexOpFoldResults(sizes, dynDims, shape); RankedTensorType resultType = RankedTensorType::get(shape, padOp.getResultType().getElementType()); // Insert cast to ensure that types match. (May be folded away.) auto castResult = [&](Value val) -> Value { if (resultType == val.getType()) return val; return tensor::CastOp::create(b, loc, resultType, val); }; // In cases where the original data source is unused: Emit a GenerateOp and // do not generate a SliceOp. (The result shape of the SliceOp would // have a dimension of size 0, the semantics of which is unclear.) auto createGenerateOp = [&]() { // Create GenerateOp. auto generateOp = tensor::GenerateOp::create( b, loc, resultType, dynDims, [&](OpBuilder &builder, Location gLoc, ValueRange indices) { tensor::YieldOp::create(builder, gLoc, padValue); }); return generateOp; }; // Emit a SliceOp and a PadOp. Should not be used in cases where // the result shape of the new SliceOp has a zero dimension. auto createPadOfExtractSlice = [&]() { // Create pad(extract_slice(x)). auto newSliceOp = tensor::ExtractSliceOp::create( b, loc, padOp.getSource(), newOffsets, newLengths, newStrides); auto newPadOp = PadOp::create( b, loc, Type(), newSliceOp, newLows, newHighs, /*nofold=*/padOp.getNofold(), getPrunedAttributeList(padOp, PadOp::getAttributeNames())); // Copy region to new PadOp. IRMapping bvm; padOp.getRegion().cloneInto(&newPadOp.getRegion(), bvm); // Cast result and return. return std::make_tuple(newPadOp, newSliceOp); }; // Rewrite extract_slice(pad(x)) into a GenerateOp it is statically known that // the original data source x is not used. if (hasZeroLen) { Operation *generateOp = createGenerateOp(); return TilingResult{{generateOp}, {castResult(generateOp->getResult(0))}, /*generatedSlices=*/{}}; } // If there are dynamic dimensions: Generate an scf.if check to avoid // creating SliceOps with result dimensions of size 0 at runtime. if (generateZeroSliceGuard && dynHasZeroLenCond) { Operation *thenOp; Operation *elseOp; Operation *sliceOp; auto result = scf::IfOp::create( b, loc, dynHasZeroLenCond, /*thenBuilder=*/ [&](OpBuilder &b, Location loc) { thenOp = createGenerateOp(); scf::YieldOp::create(b, loc, castResult(thenOp->getResult(0))); }, /*elseBuilder=*/ [&](OpBuilder &b, Location loc) { std::tie(elseOp, sliceOp) = createPadOfExtractSlice(); scf::YieldOp::create(b, loc, castResult(elseOp->getResult(0))); }); return TilingResult{ {elseOp}, SmallVector(result->getResults()), {sliceOp}}; } auto [newPadOp, sliceOp] = createPadOfExtractSlice(); return TilingResult{ {newPadOp}, {castResult(newPadOp->getResult(0))}, {sliceOp}}; } void mlir::tensor::registerTilingInterfaceExternalModels( DialectRegistry ®istry) { registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) { tensor::PadOp::attachInterface(*ctx); }); }