-
Notifications
You must be signed in to change notification settings - Fork 816
Open
Description
int8 MaxPool performance is worse than fp16 on Apple M2 Pro.
ACL version: v52.8.0
Reproducer:
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"
#include "tests/SimpleTensor.h"
#include "tests/Utils.h"
#include "utils/TypePrinter.h"
#include "utils/Utils.h"
#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>
using namespace std;
using namespace arm_compute;
using namespace arm_compute::test;
int main() {
const bool useFp16 = false;
const DataLayout dataLayout = DataLayout::NHWC;
const TensorShape inTensorShape = TensorShape(64, 112, 112, 1);
const TensorShape outTensorShape = TensorShape(64, 56, 56, 1);
NEPoolingLayer pooling;
Tensor inputt;
Tensor outputt;
const DataType tensorDataType = useFp16 ? DataType::F16 : DataType::QASYMM8;
TensorInfo inputInfo(inTensorShape, 1, tensorDataType, dataLayout);
TensorInfo outputInfo(outTensorShape, 1, tensorDataType, dataLayout);
if (!useFp16) {
inputInfo.set_quantization_info(QuantizationInfo(1.0f));
outputInfo.set_quantization_info(QuantizationInfo(1.0f, 0));
}
inputt.allocator()->init(inputInfo);
outputt.allocator()->init(outputInfo);
PoolingLayerInfo pool_info;
pool_info.pool_type = PoolingType::MAX;
pool_info.pool_size = arm_compute::Size2D(3, 3);
pool_info.pad_stride_info = arm_compute::PadStrideInfo(
2,
2,
1,
1,
1,
1,
DimensionRoundingType::FLOOR);
pool_info.data_layout = DataLayout::NHWC;
pool_info.exclude_padding = false;
pool_info.is_global_pooling = false;
pool_info.fp_mixed_precision = false;
pool_info.use_inf_as_limit = true;
pool_info.use_kernel_indices = false;
pooling.configure(&inputt, &outputt, pool_info);
inputt.allocator()->allocate();
outputt.allocator()->allocate();
std::cout << "mode=" << (useFp16 ? "fp16" : "int8") << std::endl;
// Warm-up.
pooling.run();
const auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; i++) {
pooling.run();
}
const auto finish = std::chrono::high_resolution_clock::now();
const uint64_t total_duration =
std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
std::cout << "time: " << total_duration / 100 << std::endl;
}
Results (average time per pooling.run() call, in microseconds, over 100 iterations):
mode=fp16
time: 104
---
mode=int8
time: 1424
Reactions are currently unavailable