update cnm-to-gpu conversion

ge0mk committed Sep 16, 2024
1 parent dcd57f8 commit cf2fda5
Showing 7 changed files with 310 additions and 199 deletions.
9 changes: 9 additions & 0 deletions cinnamon/include/cinm-mlir/Conversion/CommonPatterns.h
@@ -6,6 +6,8 @@
 #include <llvm/ADT/ArrayRef.h>
 #include <llvm/ADT/STLExtras.h>
 
+#include <llvm/ADT/SmallVector.h>
+#include <mlir/IR/AffineMap.h>
 #include <mlir/IR/Builders.h>
 #include <mlir/IR/Location.h>
 #include <mlir/IR/OpDefinition.h>
Expand Down Expand Up @@ -37,4 +39,11 @@ struct ConvertCnmSetZeroToAffine : public OpConversionPattern<cnm::SetZeroOp> {
ConversionPatternRewriter &) const override;
};

SmallVector<Value> createAffineApply(OpBuilder &builder, Location loc,
AffineMap map, ValueRange values);

void createMemrefSubviewCopy(OpBuilder &builder, Location loc, Value src,
Value dst, ArrayRef<int64_t> sliceShape,
ValueRange srcOffsets, ValueRange dstOffsets);

} // namespace mlir
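The definitions of these two helpers are not part of the hunks shown here (they presumably live in the corresponding CommonPatterns.cpp). A minimal sketch of plausible implementations, inferred only from the signatures above and from the call sites in CnmToGPU.cpp below; the names and signatures come from the diff, but the bodies are an assumption, not the committed code:

// Sketch only: one affine.apply per result expression of the map.
SmallVector<Value> createAffineApply(OpBuilder &builder, Location loc,
                                     AffineMap map, ValueRange values) {
  SmallVector<Value> result;
  result.reserve(map.getNumResults());
  for (unsigned i = 0; i < map.getNumResults(); ++i)
    result.push_back(builder.create<affine::AffineApplyOp>(
        loc, map.getSubMap({i}), values));
  return result;
}

// Sketch only: rank-reduced subviews of `sliceShape` elements on both sides,
// bridged by a plain memref.copy. Assumes the given offsets address the
// leading dimensions and the slice occupies the trailing ones.
void createMemrefSubviewCopy(OpBuilder &builder, Location loc, Value src,
                             Value dst, ArrayRef<int64_t> sliceShape,
                             ValueRange srcOffsets, ValueRange dstOffsets) {
  const auto makeView = [&](Value memref, ValueRange offsets) -> Value {
    const auto type = memref.getType().cast<MemRefType>();
    // Dynamic offsets on the leading dims, static sizes, unit strides.
    SmallVector<OpFoldResult> off(type.getRank(), builder.getIndexAttr(0));
    SmallVector<OpFoldResult> sizes(type.getRank(), builder.getIndexAttr(1));
    SmallVector<OpFoldResult> strides(type.getRank(), builder.getIndexAttr(1));
    for (size_t i = 0; i < offsets.size(); i++)
      off[i] = offsets[i];
    for (size_t i = 0; i < sliceShape.size(); i++)
      sizes[type.getRank() - sliceShape.size() + i] =
          builder.getIndexAttr(sliceShape[i]);
    const Type resultType = memref::SubViewOp::inferRankReducedResultType(
        sliceShape, type, off, sizes, strides);
    return builder.create<memref::SubViewOp>(
        loc, resultType.cast<MemRefType>(), memref, off, sizes, strides);
  };
  builder.create<memref::CopyOp>(loc, makeView(src, srcOffsets),
                                 makeView(dst, dstOffsets));
}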
19 changes: 18 additions & 1 deletion cinnamon/justfile
@@ -59,11 +59,28 @@ cinm-opt-help: (cinm-opt "--help")
 debug-cinm-opt *ARGS:
     gdb --args {{build_dir}}/bin/cinm-opt {{ARGS}}
 
+cinm-to-cnm FILE *ARGS: (
+    cinm-opt FILE
+    "--cinm-tiling"
+    "--affine-loop-unroll='unroll-full unroll-full-threshold=1'"
+    "--convert-cinm-to-cnm"
+    "--lower-affine"
+    "--one-shot-bufferize='bufferize-function-boundaries function-boundary-type-conversion=identity-layout-map'"
+    "--convert-linalg-to-affine-loops"
+    "--lower-affine"
+    "--buffer-loop-hoisting"
+    "--buffer-hoisting"
+    "--cse"
+    ARGS
+)
+
 cnm-to-gpu FILE *ARGS: (cinm-opt FILE "--convert-cnm-to-gpu" ARGS)
+cinm-to-gpu FILE *ARGS: (cinm-to-cnm FILE "--convert-cnm-to-gpu" ARGS)
 
 cinm-vulkan-runner FILE *ARGS:
     {{build_dir}}/bin/cinm-vulkan-runner {{FILE}} \
-        --shared-libs=../llvm-project/build/lib/libvulkan-runtime-wrappers.so,../llvm-project/build/lib/libmlir_runner_utils.so.17 \
+        --shared-libs={{llvm_prefix}}/lib/libvulkan-runtime-wrappers.so,{{llvm_prefix}}/lib/libmlir_runner_utils.so \
        --entry-point-result=void \
        {{ARGS}}
 
 genBench NAME: (doNinja "cinm-opt")
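For orientation: given these recipes, running e.g. `just cinm-to-cnm kernel.mlir` (with `kernel.mlir` as a placeholder input) invokes `cinm-opt` with the full pipeline above (tiling, full loop unrolling, the cinm-to-cnm conversion, affine lowering, one-shot bufferization, buffer hoisting, and CSE), while `just cinm-to-gpu kernel.mlir` runs the same pipeline and appends `--convert-cnm-to-gpu`. The old `cnm-to-gpu` recipe, kept for input that is already in the cnm dialect, applies only the final conversion. The runner recipe now locates the Vulkan wrapper and runner-utils libraries via `{{llvm_prefix}}` instead of a hard-coded relative build path.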
238 changes: 85 additions & 153 deletions cinnamon/lib/Conversion/CnmToGPU/CnmToGPU.cpp
@@ -27,6 +27,7 @@
 #include <mlir/IR/BuiltinOps.h>
 #include <mlir/IR/BuiltinTypeInterfaces.h>
 #include <mlir/IR/BuiltinTypes.h>
+#include <mlir/IR/Location.h>
 #include <mlir/IR/MLIRContext.h>
 #include <mlir/IR/OpDefinition.h>
 #include <mlir/IR/PatternMatch.h>
@@ -41,61 +42,28 @@
 
 namespace mlir::cnm {
 namespace cnmtogpu {
-SmallVector<int64_t, 2> getBufferTypeShape(cnm::BufferType bufferType) {
-  SmallVector<int64_t> shape{bufferType.getShape()};
-  while (shape.size() < bufferType.getWorkgroupShape().size()) {
-    shape.insert(shape.begin(), 1);
-  }
-  return shape;
-}
-
 MemRefType convertCnmBufferToMemRefType(cnm::BufferType bufferType) {
-  ArrayRef<int64_t> workgroupShape = bufferType.getWorkgroupShape();
-  SmallVector<int64_t, 2> shape = getBufferTypeShape(bufferType);
-  for (size_t i = 0; i < workgroupShape.size(); i++) {
-    shape[i] *= workgroupShape[i];
-  }
+  SmallVector<int64_t> shape{bufferType.getWorkgroupShape()};
+  shape.append(bufferType.getShape().begin(), bufferType.getShape().end());
   return MemRefType::get(shape, bufferType.getElementType());
 }
 
-SmallVector<Value, 2> createCalculateScatterIndices(Location loc,
-                                                    OpBuilder &builder,
-                                                    const AffineMap &scatterMap,
-                                                    ValueRange indices,
-                                                    BufferType bufferType) {
-  SmallVector<Value> bufferIndices;
-  ArrayRef<int64_t> workgroupShape = bufferType.getWorkgroupShape();
-  for (size_t i = 0; i < workgroupShape.size(); i++) {
-    const AffineExpr indexExpr =
-        scatterMap.getResult(i) * workgroupShape[i] +
-        scatterMap.getResult(workgroupShape.size() + i);
-    bufferIndices.push_back(builder.create<affine::AffineApplyOp>(
-        loc, AffineMap::get(indices.size(), 0, indexExpr), indices));
-  }
-  return bufferIndices;
-}
-
 void convertLaunchParameter(ConversionPatternRewriter &rewriter, Location loc,
                             Value buffer, ValueRange threadIds,
-                            ArrayRef<int64_t> workgroupShape,
                             BlockArgument arg) {
   const BufferType bufferType = buffer.getType().dyn_cast<cnm::BufferType>();
-  const SmallVector<int64_t, 2> bufferShape = getBufferTypeShape(bufferType);
+  const MemRefType memrefType = convertCnmBufferToMemRefType(bufferType);
 
   const Value source = createOrFoldUnrealizedConversionCast(
       loc, rewriter, convertCnmBufferToMemRefType(bufferType),
       rewriter.getRemappedValue(buffer));
 
-  const SmallVector<int64_t, 2> staticOffsets(workgroupShape.size(),
-                                              ShapedType::kDynamic);
-  const SmallVector<int64_t, 2> staticSizes{bufferShape};
-  const SmallVector<int64_t, 2> staticStrides(workgroupShape.size(), 1);
-
-  SmallVector<Value, 2> dynamicOffsets;
-  for (size_t i = 0; i < workgroupShape.size(); i++) {
-    const AffineExpr indexExpr = rewriter.getAffineDimExpr(0) * bufferShape[i];
-    dynamicOffsets.push_back(rewriter.create<affine::AffineApplyOp>(
-        loc, AffineMap::get(1, 0, indexExpr), ValueRange{threadIds[i]}));
+  SmallVector<int64_t> staticOffsets(memrefType.getRank(), 0);
+  SmallVector<int64_t> staticSizes{memrefType.getShape()};
+  const SmallVector<int64_t> staticStrides(memrefType.getRank(), 1);
+  for (unsigned i = 0; i < threadIds.size(); i++) {
+    staticSizes[i] = 1;
+    staticOffsets[i] = ShapedType::kDynamic;
   }
 
   const Type resultType = memref::SubViewOp::inferRankReducedResultType(
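The net effect of the new convertCnmBufferToMemRefType is that workgroup dimensions stay as separate leading memref dimensions instead of being folded into the buffer dimensions. As a made-up example, a buffer of shape [8] (f32) on a 2x4 workgroup previously lowered to memref<2x32xf32>: the old helper left-padded the buffer shape to [1, 8], then multiplied each dimension by the workgroup extent. It now lowers to memref<2x4x8xf32>, and convertLaunchParameter hands each thread a rank-reduced subview of sizes 1x1x8 taken at its dynamic thread indices.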
@@ -104,7 +72,7 @@ void convertLaunchParameter(ConversionPatternRewriter &rewriter, Location loc,
 
   const Value subview =
       rewriter
-          .create<memref::SubViewOp>(loc, resultType, source, dynamicOffsets,
+          .create<memref::SubViewOp>(loc, resultType, source, threadIds,
                                      ValueRange{}, ValueRange{}, staticOffsets,
                                      staticSizes, staticStrides)
           .getResult();
@@ -129,8 +97,15 @@ struct ConvertCnmAllocToGPU : public OpConversionPattern<cnm::AllocOp> {
   LogicalResult
   matchAndRewrite(cnm::AllocOp op, OpAdaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    rewriter.replaceOpWithNewOp<memref::AllocOp>(
-        op, convertCnmBufferToMemRefType(op.getType()));
+    Type asyncToken;
+    ValueRange asyncDependencies;
+    ValueRange dynamicSizes;
+    ValueRange symbolOperands;
+    UnitAttr hostShared;
+
+    rewriter.replaceOpWithNewOp<gpu::AllocOp>(
+        op, convertCnmBufferToMemRefType(op.getType()), asyncToken,
+        asyncDependencies, dynamicSizes, symbolOperands, hostShared);
     return success();
   }
 };
@@ -141,44 +116,29 @@ struct ConvertCnmScatterToGPU : public OpConversionPattern<cnm::ScatterOp> {
   LogicalResult
   matchAndRewrite(cnm::ScatterOp op, OpAdaptor,
                   ConversionPatternRewriter &rewriter) const override {
     const WorkgroupType workgroupType = op.getWg().getType();
     const ArrayRef<int64_t> workgroupShape = workgroupType.getShape();
     const cnm::BufferType bufferType =
         op.getOperandTypes()[1].dyn_cast<cnm::BufferType>();
-    const SmallVector<int64_t, 2> bufferShape = getBufferTypeShape(bufferType);
 
-    Value memref = rewriter.getRemappedValue(op.getOperand(1));
-    memref = createOrFoldUnrealizedConversionCast(
-        op.getLoc(), rewriter, convertCnmBufferToMemRefType(bufferType),
-        memref);
-
-    const Value tensor = op.getOperand(0);
-    const RankedTensorType tensorType =
-        tensor.getType().dyn_cast<RankedTensorType>();
-
-    SmallVector<affine::AffineForOp, 2> loops;
-    SmallVector<Value> indices;
-
-    for (int64_t size : tensorType.getShape()) {
-      affine::AffineForOp loop =
-          rewriter.create<affine::AffineForOp>(op.getLoc(), 0, size, 1);
-      loops.push_back(loop);
-      indices.push_back(loop.getBody()->getArgument(0));
-      rewriter.setInsertionPointToStart(loop.getBody());
-    }
-
-    // inner most loop body
-    const AffineMap scatterMap = op.getScatterMap();
-    SmallVector<Value> bufferIndices = createCalculateScatterIndices(
-        op.getLoc(), rewriter, scatterMap, indices, bufferType);
-
-    const Value element =
-        rewriter.create<tensor::ExtractOp>(op.getLoc(), tensor, indices);
-    rewriter.create<memref::StoreOp>(op.getLoc(), element, memref,
-                                     bufferIndices);
-
-    // replace token with const 0
-    rewriter.setInsertionPointAfter(loops[0]);
-    rewriter.replaceOpWithNewOp<arith::ConstantIndexOp>(op, 0);
+    Value src = rewriter.getRemappedValue(op.getOperand(0));
+    Value dst = rewriter.getRemappedValue(op.getOperand(1));
+    dst = createOrFoldUnrealizedConversionCast(
+        op.getLoc(), rewriter, convertCnmBufferToMemRefType(bufferType), dst);
+
+    const SmallVector<int64_t> loopSteps(workgroupShape.size(), 1);
+    createNestedAffineForLoops(
+        rewriter, op.getLoc(), workgroupShape, loopSteps, ValueRange{},
+        [&](OpBuilder &builder, Location loc, ValueRange indices,
+            ValueRange) -> SmallVector<Value> {
+          const SmallVector<Value> mappedIndices =
+              createAffineApply(builder, loc, op.getScatterMap(), indices);
+          createMemrefSubviewCopy(builder, loc, src, dst, bufferType.getShape(),
+                                  mappedIndices, indices);
+          return {};
+        });
+
+    rewriter.eraseOp(op);
     return success();
   }
 };
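Where the old lowering built one affine.for per tensor dimension and moved scalars with tensor.extract/memref.store, the new one loops over the workgroup shape and moves a whole buffer-shaped slice per workgroup element: createAffineApply evaluates the scatter map on the workgroup index, and createMemrefSubviewCopy copies the slice between the scattered tensor and the converted buffer. The token result is no longer replaced by a constant index; the op is erased outright. ConvertCnmGatherToGPU below gets the symmetric treatment, with the source and destination offsets swapped.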
@@ -189,55 +149,29 @@ struct ConvertCnmGatherToGPU : public OpConversionPattern<cnm::GatherOp> {
   LogicalResult
   matchAndRewrite(cnm::GatherOp op, OpAdaptor,
                   ConversionPatternRewriter &rewriter) const override {
     const WorkgroupType workgroupType = op.getWg().getType();
     const ArrayRef<int64_t> workgroupShape = workgroupType.getShape();
     const cnm::BufferType bufferType =
         op.getOperandTypes()[0].dyn_cast<cnm::BufferType>();
-    const SmallVector<int64_t, 2> bufferShape = getBufferTypeShape(bufferType);
 
-    Value memref = rewriter.getRemappedValue(op.getOperand(0));
-    memref = createOrFoldUnrealizedConversionCast(
-        op.getLoc(), rewriter, convertCnmBufferToMemRefType(bufferType),
-        memref);
-
-    const RankedTensorType tensorType =
-        op.getResultTypes()[0].cast<RankedTensorType>();
-    const Value tensor = rewriter.create<tensor::EmptyOp>(
-        op.getLoc(), tensorType.getShape(), tensorType.getElementType());
-
-    SmallVector<affine::AffineForOp, 2> loops;
-    SmallVector<Value> indices;
-
-    for (int64_t size : tensorType.getShape()) {
-      const Value iterArg =
-          loops.empty() ? tensor : loops.back().getBody()->getArgument(1);
-      affine::AffineForOp loop = rewriter.create<affine::AffineForOp>(
-          op.getLoc(), 0, size, 1, SmallVector<Value, 1>{iterArg});
-      indices.push_back(loop.getBody()->getArgument(0));
-
-      if (!loops.empty()) {
-        rewriter.create<affine::AffineYieldOp>(op.getLoc(), loop.getResult(0));
-      }
-
-      rewriter.setInsertionPointToStart(loop.getBody());
-      loops.push_back(loop);
-    }
-
-    // inner most loop body
-    const Value iterArg = loops.back().getBody()->getArgument(1);
-
-    const AffineMap gatherMap = op.getGatherMap();
-    SmallVector<Value> bufferIndices = createCalculateScatterIndices(
-        op.getLoc(), rewriter, gatherMap, indices, bufferType);
-    const Value element =
-        rewriter.create<memref::LoadOp>(op.getLoc(), memref, bufferIndices);
-    const Value result = rewriter.create<tensor::InsertOp>(op.getLoc(), element,
-                                                           iterArg, indices);
-    rewriter.create<affine::AffineYieldOp>(op.getLoc(), result);
-
-    // replace token with const 0
-    rewriter.setInsertionPointAfter(loops[0]);
-    const Value token = rewriter.create<arith::ConstantIndexOp>(op.getLoc(), 0);
-    rewriter.replaceOp(op, {loops.front().getResult(0), token});
+    Value src = rewriter.getRemappedValue(op.getOperand(0));
+    src = createOrFoldUnrealizedConversionCast(
+        op.getLoc(), rewriter, convertCnmBufferToMemRefType(bufferType), src);
+    Value dst = rewriter.getRemappedValue(op.getOperand(2));
+
+    const SmallVector<int64_t> loopSteps(workgroupShape.size(), 1);
+    createNestedAffineForLoops(
+        rewriter, op.getLoc(), workgroupShape, loopSteps, ValueRange{},
+        [&](OpBuilder &builder, Location loc, ValueRange indices,
+            ValueRange) -> SmallVector<Value> {
+          const SmallVector<Value> mappedIndices =
+              createAffineApply(builder, loc, op.getGatherMap(), indices);
+          createMemrefSubviewCopy(builder, loc, src, dst, bufferType.getShape(),
+                                  indices, mappedIndices);
+          return {};
+        });
+
+    rewriter.eraseOp(op);
     return success();
   }
 };
@@ -252,12 +186,11 @@ struct ConvertCnmLaunchToGPU : public OpConversionPattern<cnm::LaunchOp> {
     const ArrayRef<int64_t> workgroupShape = workgroupType.getShape();
 
     const Value one = rewriter.create<arith::ConstantIndexOp>(op.getLoc(), 1);
-    const Value gridSizeX = one, gridSizeY = one, gridSizeZ = one;
-    const Value blockSizeX =
-        rewriter.create<arith::ConstantIndexOp>(op.getLoc(), workgroupShape[0]);
-    const Value blockSizeY =
-        rewriter.create<arith::ConstantIndexOp>(op.getLoc(), workgroupShape[1]);
-    const Value blockSizeZ = one;
+    SmallVector<Value, 6> launchDimensions(6, one);
+    for (size_t i = 0; i < workgroupShape.size(); i++) {
+      launchDimensions[i] = rewriter.create<arith::ConstantIndexOp>(
+          op.getLoc(), workgroupShape[i]);
+    }
 
     const Value dynamicSharedMemorySize;
     const Type asyncTokenType;
@@ -266,23 +199,26 @@ struct ConvertCnmLaunchToGPU : public OpConversionPattern<cnm::LaunchOp> {
     const TypeRange privateAttributions;
 
     gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
-        op.getLoc(), gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY,
-        blockSizeZ, dynamicSharedMemorySize, asyncTokenType, asyncDependencies,
-        workgroupAttributions, privateAttributions);
-
-    const SmallVector<Value, 3> threadIds{
-        launchOp.getThreadIds().x,
-        launchOp.getThreadIds().y,
-        launchOp.getThreadIds().z,
+        op.getLoc(), launchDimensions[0], launchDimensions[1],
+        launchDimensions[2], launchDimensions[3], launchDimensions[4],
+        launchDimensions[5], dynamicSharedMemorySize, asyncTokenType,
+        asyncDependencies, workgroupAttributions, privateAttributions);
+
+    const SmallVector<Value, 6> allThreadIds{
+        launchOp.getBlockIds().x,  launchOp.getBlockIds().y,
+        launchOp.getBlockIds().z,  launchOp.getThreadIds().x,
+        launchOp.getThreadIds().y, launchOp.getThreadIds().z,
     };
+    const ValueRange usedThreadIds =
+        ValueRange{allThreadIds}.take_front(workgroupShape.size());
 
     rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
 
     // convert cnm.buffer parameters to memref subviews
-    int64_t i = 0;
+    size_t i = 0;
     for (const Value &buffer : op.getParams()) {
-      convertLaunchParameter(rewriter, op.getLoc(), buffer, threadIds,
-                             workgroupShape, op.getBody().getArgument(i++));
+      convertLaunchParameter(rewriter, op.getLoc(), buffer, usedThreadIds,
+                             op.getBody().getArgument(i++));
    }
 
    launchOp.getBody().front().getOperations().splice(
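Launch sizing is now rank-generic instead of hard-coded for rank-2 workgroups: the workgroup extents fill the first workgroupShape.size() of the six gpu.launch dimensions in the order grid x, y, z, then block x, y, z, with the rest left at 1, and the matching ids (block ids first, then thread ids) are taken as usedThreadIds. For example, a workgroup of shape [2, 3] launches a 2x3x1 grid of 1x1x1 blocks, and buffer subviews are indexed by blockIds.x and blockIds.y; the old code instead mapped workgroupShape[0] and workgroupShape[1] onto the block dimensions and indexed with thread ids.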
@@ -312,21 +248,25 @@ void populateCnmToGPUFinalTypeConversions(TypeConverter &typeConverter) {
       [&](cnm::BufferType bufferType) -> std::optional<Type> {
         return cnmtogpu::convertCnmBufferToMemRefType(bufferType);
       });
+
+  typeConverter.addConversion([&](cnm::WorkgroupType t) -> std::optional<Type> {
+    return IndexType::get(t.getContext());
+  });
 }
 
 void populateCnmToGPUConversionPatterns(RewritePatternSet &patterns,
-                                        MLIRContext *context) {
+                                        MLIRContext *ctx) {
   patterns
       .add<cnmtogpu::ConvertCnmWorkgroupToGPU, cnmtogpu::ConvertCnmAllocToGPU,
           ConvertCnmSetZeroToAffine, cnmtogpu::ConvertCnmScatterToGPU,
           cnmtogpu::ConvertCnmGatherToGPU, cnmtogpu::ConvertCnmLaunchToGPU,
-           cnmtogpu::ConvertCnmTerminatorToGPU>(context);
+           cnmtogpu::ConvertCnmTerminatorToGPU>(ctx);
 }
 
 struct ConvertCnmToGPUPass
     : public ::impl::ConvertCnmToGPUPassBase<ConvertCnmToGPUPass> {
   void runOnOperation() final {
-    TypeConverter converter{};
+    TypeConverter converter;
     populateCnmToGPUFinalTypeConversions(converter);
     const auto addUnrealizedCast = [](OpBuilder &builder, Type type,
                                       ValueRange inputs, Location loc) {
@@ -341,15 +281,7 @@ struct ConvertCnmToGPUPass
    populateReconcileUnrealizedCastsPatterns(patterns);
 
    ConversionTarget target(getContext());
-    // target.addIllegalDialect<cnm::CnmDialect>();
-    target.addIllegalOp<cnm::WorkgroupOp>();
-    target.addIllegalOp<cnm::AllocOp>();
-    target.addIllegalOp<cnm::SetZeroOp>();
-    target.addIllegalOp<cnm::ScatterOp>();
-    target.addIllegalOp<cnm::GatherOp>();
-    target.addIllegalOp<cnm::LaunchOp>();
-    target.addIllegalOp<cnm::TerminatorOp>();
-
+    target.addIllegalDialect<cnm::CnmDialect>();
    target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
 
    if (failed(
[Diffs for the remaining 4 changed files in this commit were not loaded in this view.]