Skip to content

NEPoolingLayer - int8 vs fp16 performance #1273

@alvoron

Description

@alvoron

int8 MaxPool performance is worse than fp16 on Apple M2 Pro.
ACL version: v52.8.0

Reproducer:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"
#include "tests/SimpleTensor.h"
#include "utils/TypePrinter.h"
#include "utils/Utils.h"

#include "tests/Utils.h"
#include "tests/AssetsLibrary.h"
#include "tests/NEON/Accessor.h"

#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>

using namespace std;
using namespace arm_compute;
using namespace arm_compute::test;

/// Benchmark reproducer for NEPoolingLayer: times 3x3/stride-2 MAX pooling on a
/// 64x112x112 NHWC tensor and prints the average per-run latency in microseconds.
/// Flip `useFp16` to compare the F16 path against the QASYMM8 (int8) path.
int main() {
    // Toggle between the two data types under comparison.
    const bool useFp16 = false;
    const DataLayout dataLayout = DataLayout::NHWC;
    // TensorShape is (C, W, H, N) for NHWC: 64 channels, 112x112 spatial, batch 1.
    const TensorShape inTensorShape = TensorShape(64, 112, 112, 1);
    const TensorShape outTensorShape = TensorShape(64, 56, 56, 1);

    NEPoolingLayer pooling;

    Tensor inputt;
    Tensor outputt;

    const DataType tensorDataType = useFp16 ? DataType::F16 : DataType::QASYMM8;
    TensorInfo inputInfo(inTensorShape, 1, tensorDataType, dataLayout);
    TensorInfo outputInfo(outTensorShape, 1, tensorDataType, dataLayout);

    // QASYMM8 tensors require quantization info; use identity scale / zero offset.
    if (!useFp16) {
        inputInfo.set_quantization_info(QuantizationInfo(1.0f));
        outputInfo.set_quantization_info(QuantizationInfo(1.0f, 0));
    }

    inputt.allocator()->init(inputInfo);
    outputt.allocator()->init(outputInfo);

    // 3x3 MAX pooling, stride 2, symmetric padding of 1 on each side.
    PoolingLayerInfo pool_info;
    pool_info.pool_type = PoolingType::MAX;
    pool_info.pool_size = arm_compute::Size2D(3, 3);
    pool_info.pad_stride_info = arm_compute::PadStrideInfo(
        2,  // stride x
        2,  // stride y
        1,  // pad left
        1,  // pad right
        1,  // pad top
        1,  // pad bottom
        DimensionRoundingType::FLOOR);
    pool_info.data_layout = DataLayout::NHWC;
    pool_info.exclude_padding = false;
    pool_info.is_global_pooling = false;
    pool_info.fp_mixed_precision = false;
    pool_info.use_inf_as_limit = true;
    pool_info.use_kernel_indices = false;

    // Configure before allocation so the function can request any padding it needs.
    pooling.configure(&inputt, &outputt, pool_info);
    inputt.allocator()->allocate();
    outputt.allocator()->allocate();

    std::cout << "mode=" << (useFp16 ? "fp16" : "int8") << std::endl;

    // Warm-up run so one-time setup cost is excluded from the measurement.
    pooling.run();

    // Single constant drives both the loop bound and the average, so they can
    // never drift apart. steady_clock is guaranteed monotonic, unlike
    // high_resolution_clock, making it the correct clock for interval timing.
    constexpr int kNumIterations = 100;
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < kNumIterations; i++) {
        pooling.run();
    }
    const auto finish = std::chrono::steady_clock::now();
    const uint64_t total_duration =
        std::chrono::duration_cast<std::chrono::microseconds>(finish - start).count();
    // Average microseconds per run (output format kept identical to the report).
    std::cout << "time: " << total_duration / kNumIterations << std::endl;
}

Results (average time per run over 100 iterations, in microseconds):

mode=fp16
time: 104
---
mode=int8
time: 1424

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions