From eb638e8d0b22e313e8693568b09482bcc71e67d7 Mon Sep 17 00:00:00 2001 From: Rohan Yadav Date: Tue, 31 Oct 2023 12:26:12 -0700 Subject: [PATCH 1/2] performance improvement with manual cholesky mapping --- cunumeric/linalg/cholesky.py | 57 +++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/cunumeric/linalg/cholesky.py b/cunumeric/linalg/cholesky.py index 9bba033619..097b7f84af 100644 --- a/cunumeric/linalg/cholesky.py +++ b/cunumeric/linalg/cholesky.py @@ -31,6 +31,19 @@ from ..deferred import DeferredArray from ..runtime import Runtime +def get_gpu_lower_triangular(point): + assert(len(point) == 2) + from legate.core import get_machine + from legate.core.machine import ProcessorKind + machine = get_machine() + num_gpus = machine.count(ProcessorKind.GPU) + gpus = machine.only(ProcessorKind.GPU) + # The linearized block-cyclic lower-triangular-blocked decomposition + # mapping of 2-d points to blocks is: + def mapping(i, j): + return (j + (i * (i + 1)) // 2) % num_gpus + return gpus[mapping(point[0], point[1])] + def transpose_copy_single( context: Context, input: Store, output: Store @@ -83,7 +96,8 @@ def potrf(context: Context, p_output: StorePartition, i: int) -> None: task.throws_exception(LinAlgError) task.add_output(p_output) task.add_input(p_output) - task.execute() + with get_gpu_lower_triangular((i, i)): + task.execute() def trsm( @@ -95,14 +109,15 @@ def trsm( rhs = p_output.get_child_store(i, i) lhs = p_output - launch_domain = Rect(lo=(lo, i), hi=(hi, i + 1)) - task = context.create_manual_task( - CuNumericOpCode.TRSM, launch_domain=launch_domain - ) - task.add_output(lhs) - task.add_input(rhs) - task.add_input(lhs) - task.execute() + for point in Rect(lo=(lo, i), hi=(hi, i + 1)): + task = context.create_manual_task( + CuNumericOpCode.TRSM, launch_domain=Rect(lo=point, hi=point, exclusive=False) + ) + task.add_output(lhs) + task.add_input(rhs) + task.add_input(lhs) + with get_gpu_lower_triangular(point): + task.execute() def syrk(context: Context, p_output: StorePartition, k: int, i: int) -> None: @@ -116,7 +131,8 @@ def syrk(context: Context, p_output: StorePartition, k: int, i: int) -> None: task.add_output(lhs) task.add_input(rhs) task.add_input(lhs) - task.execute() + with get_gpu_lower_triangular((k, k)): + task.execute() def gemm( @@ -134,18 +150,19 @@ def gemm( lhs = p_output rhs1 = p_output - launch_domain = Rect(lo=(lo, k), hi=(hi, k + 1)) - task = context.create_manual_task( - CuNumericOpCode.GEMM, launch_domain=launch_domain - ) - task.add_output(lhs) - task.add_input(rhs1, proj=lambda p: (p[0], i)) - task.add_input(rhs2) - task.add_input(lhs) - task.execute() + for point in Rect(lo=(lo, k), hi=(hi, k + 1)): + task = context.create_manual_task( + CuNumericOpCode.GEMM, launch_domain=Rect(lo=point, hi=point, exclusive=False) + ) + task.add_output(lhs) + task.add_input(rhs1, proj=lambda p: (p[0], i)) + task.add_input(rhs2) + task.add_input(lhs) + with get_gpu_lower_triangular(point): + task.execute() -MIN_CHOLESKY_TILE_SIZE = 2048 +MIN_CHOLESKY_TILE_SIZE = 4096 MIN_CHOLESKY_MATRIX_SIZE = 8192 From defdcf434d8c9ab68982cda2b1addaba74751bdb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Oct 2023 19:28:08 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cunumeric/linalg/cholesky.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cunumeric/linalg/cholesky.py b/cunumeric/linalg/cholesky.py index 097b7f84af..4cad697442 100644 --- a/cunumeric/linalg/cholesky.py +++ b/cunumeric/linalg/cholesky.py @@ -31,17 +31,21 @@ from ..deferred import DeferredArray from ..runtime import Runtime + def get_gpu_lower_triangular(point): - assert(len(point) == 2) + assert len(point) == 2 from legate.core import get_machine from legate.core.machine import ProcessorKind + machine = get_machine() num_gpus = machine.count(ProcessorKind.GPU) gpus = machine.only(ProcessorKind.GPU) + # The linearized block-cyclic lower-triangular-blocked decomposition # mapping of 2-d points to blocks is: def mapping(i, j): return (j + (i * (i + 1)) // 2) % num_gpus + return gpus[mapping(point[0], point[1])] @@ -111,7 +115,8 @@ def trsm( for point in Rect(lo=(lo, i), hi=(hi, i + 1)): task = context.create_manual_task( - CuNumericOpCode.TRSM, launch_domain=Rect(lo=point, hi=point, exclusive=False) + CuNumericOpCode.TRSM, + launch_domain=Rect(lo=point, hi=point, exclusive=False), ) task.add_output(lhs) task.add_input(rhs) @@ -152,7 +157,8 @@ def gemm( for point in Rect(lo=(lo, k), hi=(hi, k + 1)): task = context.create_manual_task( - CuNumericOpCode.GEMM, launch_domain=Rect(lo=point, hi=point, exclusive=False) + CuNumericOpCode.GEMM, + launch_domain=Rect(lo=point, hi=point, exclusive=False), ) task.add_output(lhs) task.add_input(rhs1, proj=lambda p: (p[0], i))