Skip to content
This repository was archived by the owner on Sep 23, 2024. It is now read-only.

Commit 76d5f0f

Browse files
committed
Use NUMA to allocate memory
(cherry picked from commit 96b9a59)
1 parent 3f31227 commit 76d5f0f

5 files changed

Lines changed: 44 additions & 22 deletions

File tree

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
1818
if(CPU_EXTENSIONS_ENABLE_LOG)
1919
target_compile_definitions(${PROJECT_NAME} PRIVATE ENABLE_LOG)
2020
endif()
21+
target_link_libraries(${PROJECT_NAME} PUBLIC numa)
2122

2223
set(CMAKE_DST lib/cmake/${PROJECT_NAME})
2324
# header files

src/common/memory_alloc.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// Copyright (C) 2018-2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
#include <cstdlib>
5+
#include <stdio.h>
6+
#include <stdlib.h>
7+
#include <stdint.h>
8+
#include <sched.h>
9+
#include <numa.h>
10+
#include "memory_alloc.hpp"
11+
12+
void* llmdnn_alloc(size_t aligned_size, size_t size, bool hint_numa) {
13+
if (hint_numa && numa_available() != -1) {
14+
int cur_cpu = sched_getcpu();
15+
auto cur_numa_node = numa_node_of_cpu(cur_cpu);
16+
return numa_alloc_onnode(size, cur_numa_node);
17+
} else {
18+
return aligned_alloc(aligned_size, size);
19+
}
20+
}
21+
22+
void llmdnn_free(void* p, size_t size, bool hint_numa) {
23+
if (hint_numa && numa_available() != -1) {
24+
numa_free(p, size);
25+
} else {
26+
::free(p);
27+
}
28+
}

src/common/memory_alloc.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Copyright (C) 2018-2023 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once

#include <memory.h>
#include <cstddef>  // size_t — <memory.h> is non-standard and not guaranteed to provide it

// Allocate `size` bytes. When `hint_numa` is true and NUMA is available the
// memory is bound to the calling thread's NUMA node; otherwise it comes from
// aligned_alloc with `aligned_size` alignment. Returns nullptr on failure.
void* llmdnn_alloc(size_t aligned_size, size_t size, bool hint_numa = true);

// Release memory returned by llmdnn_alloc. `size` and `hint_numa` must match
// the allocation call so the matching deallocator (numa_free vs ::free) is
// selected.
void llmdnn_free(void* p, size_t size, bool hint_numa = true);

src/common/tensor2d.hpp

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,7 @@
99
#include <iostream>
1010
#include <functional>
1111
#include <assert.h>
12-
#ifdef ENABLE_NUMA
13-
#include "numa.h"
14-
#endif
12+
#include "memory_alloc.hpp"
1513
#include "log.hpp"
1614
#include "bf16.hpp"
1715

@@ -30,7 +28,7 @@ struct tensor2D {
3028
tensor2D() = default;
3129
tensor2D(const tensor2D&) = delete;
3230
~tensor2D() {
33-
if (own && data) ::free(data);
31+
if (own && data) llmdnn_free(data, capacity);
3432
}
3533

3634
operator bool() {
@@ -104,22 +102,11 @@ struct tensor2D {
104102
if (capacity < need_capacity) {
105103
if (!is_const)
106104
need_capacity *= 2;
107-
capacity = need_capacity;
108105
// align begin address to cache line is vital, so tile load can
109106
// use all bandwidth (L1D/L2 only deliver data in unit of 64-byte aligned cache-line)
110-
111-
#ifdef ENABLE_NUMA
112-
if (USE_NUMA) {
113-
data = std::shared_ptr<T>(
114-
reinterpret_cast<T*>(numa_alloc_local(capacity)),
115-
[need_capacity](void * p){ numa_free(p, need_capacity); });
116-
} else {
117-
#else
118-
{
119-
#endif
120-
if (data) ::free(data);
121-
data = reinterpret_cast<T*>(aligned_alloc(64, capacity));
122-
}
107+
if (data) llmdnn_free(data, capacity);
108+
data = reinterpret_cast<T*>(llmdnn_alloc(64, need_capacity));
109+
capacity = need_capacity;
123110
if (is_const)
124111
memset(static_cast<void*>(data), 0, need_capacity);
125112
if (reinterpret_cast<uintptr_t>(data) % 64)

src/mm_kernel_common_amx.hpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,6 @@
1616
#include <x86intrin.h>
1717
#endif
1818

19-
#ifdef ENABLE_NUMA
20-
#include "numa.h"
21-
#endif
22-
2319
using namespace llmdnn;
2420

2521
namespace amx_kernel {

0 commit comments

Comments (0)