0x1. 背景

本文介绍如何在 OneFlow 中实现 Megatron-LM 论文（Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM）的 4.2 节中提到的编译优化（融合 bias_add 和 dropout）来加速模型训练：

（Megatron-LM 论文 4.2 节）

``````>>> import oneflow as flow
>>> x = flow.randn(2, 3)
>>> y = flow.randn(3)
>>> z = flow._C.bias_add(x, y, axis=1)
``````

nn.Dropout 算子：Dropout算子相信大家非常熟悉，不需要多解释，可以参考下方OneFlow算子文档。

``````>>> import numpy as np
>>> import oneflow as flow
>>> m = flow.nn.Dropout(p=0)
>>> arr = np.array(
...    [
...        [-0.7797, 0.2264, 0.2458, 0.4163],
...        [0.4299, 0.3626, -0.4892, 0.4141],
...        [-1.4115, 1.2183, -0.5503, 0.6520],
...    ]
... )
>>> x = flow.Tensor(arr)
>>> y = m(x)
>>> y
tensor([[-0.7797,  0.2264,  0.2458,  0.4163],
[ 0.4299,  0.3626, -0.4892,  0.4141],
[-1.4115,  1.2183, -0.5503,  0.6520]], dtype=oneflow.float32)
``````

其中，bias_add 算子接受输入 Tensor `a` 和 bias Tensor `b`，并通过属性 `axis` 指定 bias 相加的维度；dropout 算子接受概率参数 `p`。

nn.Dropout 实际上是通过 Dropout Op 来生成的，这个 Op 接受一个输入 Tensor 和概率属性 `p`。

0x3. Pattern匹配和重写

``````def GetDefaultSeed :
NativeCodeCall<"mlir::oneflow::GetDefaultSeed($_builder)">;
(
OneFlow_DropoutOp: $dropout_res
(
$a,
$b,
),
$dropout_op_name,
$dropout_device_tag,
$dropout_device_name,
$dropout_scope_symbol_id,
$dropout_hierarchy,
$dropout_op_rate
),
[
(
$dropout_res__0,
(
$a,
$dropout_device_tag,
$dropout_device_name,
$dropout_scope_symbol_id,
$dropout_hierarchy,
$dropout_op_rate,
(GetDefaultSeed)
)
),
],
>;
``````

`NativeCodeCall` 用于在声明式的 Pattern 中调用一段 C++ 代码：

``````def GetDefaultSeed :
NativeCodeCall<"mlir::oneflow::GetDefaultSeed($_builder)">;
``````

``````mlir::IntegerAttr GetDefaultSeed(::mlir::PatternRewriter& rewriter) {
const auto gen = CHECK_JUST(::oneflow::one::DefaultAutoGenerator());
return getSI64IntegerAttr(rewriter, (int64_t)gen->current_seed());
}
``````

``````::llvm::SmallVector<::mlir::Value, 4> CreateFusedBiasAddMaskScale(::mlir::PatternRewriter& rewriter,
OpResult dropout_result,
if (auto dropout_op = llvm::dyn_cast<oneflow::DropoutOp>(dropout_result.getDefiningOp())) {
SmallVector<Value, 4> operands;
auto res = rewriter
dropout_op->getLoc(), dropout_op->getResultTypes().front(), operands,
->getResults();
// bias_add and dropout op is expected to be erased if it is not used
return res;
}
}
return {};
}
``````

``````bool IsAddToOutputNone(ValueRange value) { return (int)value.size() > 0 ? false : true; }
``````

NativeCodeCall的限制引发的问题

random_mask_like Op 具有根据输入 Tensor 生成随机 mask 的功能，比较适合这里的占位作用。

``````def OneFlow_RandomMaskLikeOp : OneFlow_BaseOp<"random_mask_like", [NoSideEffect, NoGrad, DeclareOpInterfaceMethods<UserOpCompatibleInterface>]> {
let input = (ins
OneFlow_Tensor:$like
);
let output = (outs
OneFlow_Tensor:$out
);
let attrs = (ins
DefaultValuedAttr<F32Attr, "0.">:$rate,
DefaultValuedAttr<SI64Attr, "0">:$seed
);
let builders = [
OpBuilder<(ins
"Value":$like,
"StringRef":$op_name,
"StringRef":$device_tag,
"ArrayAttr":$device_name,
"IntegerAttr":$scope_symbol_id,
"ArrayAttr":$hierarchy,
"FloatAttr":$rate,
"IntegerAttr":$seed
)>
];
let has_check_fn = 1;
let has_logical_tensor_desc_infer_fn = 1;
let has_physical_tensor_desc_infer_fn = 1;
let has_get_sbp_fn = 1;
let has_data_type_infer_fn = 1;
}
``````
``````

``````void RandomMaskLikeOp::build(mlir::OpBuilder& odsBuilder, mlir::OperationState& odsState,
mlir::Value like, StringRef op_name, StringRef device_tag,
ArrayAttr device_name, IntegerAttr scope_symbol_id,
ArrayAttr hierarchy, mlir::FloatAttr rate, mlir::IntegerAttr seed) {
if (scope_symbol_id) {
}
if (hierarchy) { odsState.addAttribute(hierarchyAttrName(odsState.name), hierarchy); }
}
``````

``````python3: /home/xxx/oneflow/build/oneflow/ir/llvm_monorepo-src/mlir/lib/IR/PatternMatch.cpp:328: void mlir::RewriterBase::replaceOpWithResultsOfAnotherOp(mlir::Operation*, mlir::Operation*): Assertion `op->getNumResults() == newOp->getNumResults() && "replacement op doesn't match results of original op"' failed
``````

0x4. 测试

``````import unittest
import numpy as np
import os
os.environ["ONEFLOW_MLIR_ENABLE_ROUND_TRIP"] = "1"
import oneflow as flow
import oneflow.unittest
x = flow.randn(2, 3, 4, 5)
bias = flow.randn(5)
dropout = flow.nn.Dropout(p=prob)
if with_cuda:
x = x.cuda()
bias = bias.to("cuda")
dropout.to("cuda")
class GraphToRun(flow.nn.Graph):
def __init__(self):
super().__init__()
self.dropout = dropout
def build(self, x, bias):
graph_to_run = GraphToRun()
lazy_res = graph_to_run(x, bias)
test_case.assertTrue(np.array_equal(eager_res.numpy(), lazy_res.numpy()))
@flow.unittest.skip_unless_1n1d()
if __name__ == "__main__":
unittest.main()
``````

``````module {
oneflow.job @GraphToRun_0(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<5xf32>) -> tensor<2x3x4x5xf32> {
%output = "oneflow.input"(%arg0) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_GraphToRun_0_input.0.0_2", output_lbns = ["_GraphToRun_0_input.0.0_2/out"], scope_symbol_id = 4611686018427420671 : i64, shape = [2 : si64, 3 : si64, 4 : si64, 5 : si64]} : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32>
%output_0 = "oneflow.input"(%arg1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_GraphToRun_0_input.0.1_3", output_lbns = ["_GraphToRun_0_input.0.1_3/out"], scope_symbol_id = 4611686018427420671 : i64, shape = [5 : si64]} : (tensor<5xf32>) -> tensor<5xf32>
%0 = "oneflow.bias_add"(%output, %output_0) {axis = 3 : si32, device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], op_name = "bias_add-0", output_lbns = ["bias_add-0/out_0"], scope_symbol_id = 4611686018427420671 : i64} : (tensor<2x3x4x5xf32>, tensor<5xf32>) -> tensor<2x3x4x5xf32>
%out, %mask = "oneflow.dropout"(%0) {device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], op_name = "dropout-dropout-1", output_lbns = ["dropout-dropout-1/out_0", "dropout-dropout-1/mask_0"], rate = 1.000000e+00 : f32, scope_symbol_id = 4611686018427428863 : i64} : (tensor<2x3x4x5xf32>) -> (tensor<2x3x4x5xf32>, tensor<2x3x4x5xi8>)
%output_1 = "oneflow.output"(%out) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_GraphToRun_0_output.0.0_2", output_lbns = ["_GraphToRun_0_output.0.0_2/out"], scope_symbol_id = 4611686018427420671 : i64, shape = [2 : si64, 3 : si64, 4 : si64, 5 : si64]} : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32>
oneflow.return %output_1 : tensor<2x3x4x5xf32>
}
}
``````

``````module {
oneflow.job @GraphToRun_0(%arg0: tensor<2x3x4x5xf32>, %arg1: tensor<5xf32>) -> tensor<2x3x4x5xf32> {
%output = "oneflow.input"(%arg0) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_GraphToRun_0_input.0.0_2", output_lbns = ["_GraphToRun_0_input.0.0_2/out"], scope_symbol_id = 4611686018427420671 : i64, shape = [2 : si64, 3 : si64, 4 : si64, 5 : si64]} : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32>
%output_0 = "oneflow.input"(%arg1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_GraphToRun_0_input.0.1_3", output_lbns = ["_GraphToRun_0_input.0.1_3/out"], scope_symbol_id = 4611686018427420671 : i64, shape = [5 : si64]} : (tensor<5xf32>) -> tensor<5xf32>
%0 = "oneflow.random_mask_like"(%output) {device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], op_name = "bias_add-0", rate = 1.000000e+00 : f32, scope_symbol_id = 4611686018427428863 : i64, seed = 4920936260932536 : si64} : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32>
%1 = "oneflow.fused_bias_add_mask_scale"(%output, %output_0, %0) {axis = 3 : si32, device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], op_name = "dropout-dropout-1", output_lbns = ["dropout-dropout-1/out_0", "dropout-dropout-1/mask_0"], scale = 1.000000e+00 : f32, scope_symbol_id = 4611686018427428863 : i64} : (tensor<2x3x4x5xf32>, tensor<5xf32>, tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32>
%output_1 = "oneflow.output"(%1) {data_type = 2 : i32, device_name = ["@0:0"], device_tag = "gpu", hierarchy = [1], is_dynamic = false, nd_sbp = ["B"], op_name = "_GraphToRun_0_output.0.0_2", output_lbns = ["_GraphToRun_0_output.0.0_2/out"], scope_symbol_id = 4611686018427420671 : i64, shape = [2 : si64, 3 : si64, 4 : si64, 5 : si64]} : (tensor<2x3x4x5xf32>) -> tensor<2x3x4x5xf32>
oneflow.return %output_1 : tensor<2x3x4x5xf32>
}
}
``````

0x6. 资料

https://github.com/Oneflow-Inc/oneflow

https://mlir.llvm.org/docs/DeclarativeRewrites/