diff --git a/test/Project.toml b/test/Project.toml index 90670d48..1e877a7e 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -10,6 +10,7 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NEO_jll = "700fe977-ac61-5f37-bbc8-c6c4b2b6a9fd" +ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -21,3 +22,6 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" libigc_jll = "94295238-5935-5bd7-bb0f-b00942e9bdd5" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36" + +[compat] +ParallelTestRunner = "2.2" diff --git a/test/execution.jl b/test/execution.jl index cd3db014..1e2e1797 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -108,10 +108,10 @@ end export external_dummy external_dummy() = return end - import ...KernelModule + import .KernelModule @oneapi KernelModule.external_dummy() @eval begin - using ...KernelModule + using .KernelModule @oneapi external_dummy() end diff --git a/test/runtests.jl b/test/runtests.jl index 36773dc5..06d77db2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,390 +1,114 @@ -using Distributed -using Dates -import REPL -using Printf: @sprintf -using Base.Filesystem: path_separator +# using Distributed +# using Dates +# import REPL +# using Printf: @sprintf +# using Base.Filesystem: path_separator -# parse some command-line arguments -function extract_flag!(args, flag, default=nothing) - for f in args - if startswith(f, flag) - # Check if it's just `--flag` or if it's `--flag=foo` - if f != flag - val = split(f, '=')[2] - if default !== nothing && !(typeof(default) <: AbstractString) - val = parse(typeof(default), val) - end - else - val = default - end - - # Drop this value from our args - filter!(x -> x != 
f, args) - return (true, val) - end - end - return (false, default) -end -do_help, _ = extract_flag!(ARGS, "--help") -if do_help - println(""" - Usage: runtests.jl [--help] [--list] [--jobs=N] [TESTS...] - - --help Show this text. - --list List all available tests. - --quickfail Fail the entire run as soon as a single test errored. - --jobs=N Launch `N` processes to perform tests (default: Sys.CPU_THREADS). +using ParallelTestRunner +using oneAPI - Remaining arguments filter the tests that will be executed.""") - exit(0) -end -_, jobs = extract_flag!(ARGS, "--jobs", Sys.CPU_THREADS) -do_quickfail, _ = extract_flag!(ARGS, "--quickfail") +oneAPI.functional() || error("oneAPI.jl is not functional on this system") -include("setup.jl") # make sure everything is precompiled @info "System information:\n" * sprint(io->oneAPI.versioninfo(io)) if Sys.islinux() -@info "Using oneAPI support library at " * oneAPI.Support.liboneapi_support + @info "Using oneAPI support library at " * oneAPI.Support.liboneapi_support end -@info "Running $jobs tests in parallel. If this is too many, specify the `--jobs` argument to the tests, or set the JULIA_CPU_THREADS environment variable." # choose tests -const tests = [] -const test_runners = Dict() -## files in the test folder -for (rootpath, dirs, files) in walkdir(@__DIR__) - # find Julia files - filter!(files) do file - endswith(file, ".jl") && file !== "setup.jl" && file !== "runtests.jl" - end - isempty(files) && continue - - # strip extension - files = map(files) do file - file[1:end-3] - end - - # prepend subdir - subdir = relpath(rootpath, @__DIR__) - if subdir != "." 
- files = map(files) do file - joinpath(subdir, file) - end - end - - # unify path separators - files = map(files) do file - replace(file, path_separator => '/') - end - - append!(tests, files) - for file in files - test_runners[file] = ()->include("$(@__DIR__)/$file.jl") - end -end -sort!(tests; by=(file)->stat("$(@__DIR__)/$file.jl").size, rev=true) -## GPUArrays testsuite +testsuite = find_tests(@__DIR__) +## GPUArrays test suite +import GPUArrays +gpuarrays = pathof(GPUArrays) +gpuarrays_root = dirname(dirname(gpuarrays)) +gpuarrays_testsuite = joinpath(gpuarrays_root, "test", "testsuite.jl") +include(gpuarrays_testsuite) for name in keys(TestSuite.tests) - pushfirst!(tests, "gpuarrays/$name") - test_runners["gpuarrays/$name"] = ()->TestSuite.tests[name](oneArray) + testsuite["gpuarrays/$name"] = :(TestSuite.tests[$name](oneArray)) end -## finalize -unique!(tests) -# parse some more command-line arguments -## --list to list all available tests -do_list, _ = extract_flag!(ARGS, "--list") -if do_list - println("Available tests:") - for test in sort(tests) - println(" - $test") - end - exit(0) -end -## no options should remain -optlike_args = filter(startswith("-"), ARGS) -if !isempty(optlike_args) - error("Unknown test options `$(join(optlike_args, " "))` (try `--help` for usage instructions)") -end -## the remaining args filter tests -if !isempty(ARGS) - filter!(tests) do test - any(arg->startswith(test, arg), ARGS) - end -end +args = parse_args(ARGS) -# add workers -const test_exeflags = Base.julia_cmd() -filter!(test_exeflags.exec) do c - return !(startswith(c, "--depwarn") || startswith(c, "--check-bounds")) -end -push!(test_exeflags.exec, "--check-bounds=yes") -push!(test_exeflags.exec, "--startup-file=no") -push!(test_exeflags.exec, "--depwarn=yes") -push!(test_exeflags.exec, "--project=$(Base.active_project())") -const test_exename = popfirst!(test_exeflags.exec) -function addworker(X; kwargs...) 
- withenv("JULIA_NUM_THREADS" => 1, "OPENBLAS_NUM_THREADS" => 1) do - procs = addprocs(X; exename=test_exename, exeflags=test_exeflags, kwargs...) - @everywhere procs include($(joinpath(@__DIR__, "setup.jl"))) - procs - end -end -addworker(min(jobs, length(tests))) +init_worker_code = quote + using oneAPI, Adapt -# pretty print information about gc and mem usage -testgroupheader = "Test" -workerheader = "(Worker)" -name_align = maximum([textwidth(testgroupheader) + textwidth(" ") + - textwidth(workerheader); map(x -> textwidth(x) + - 3 + ndigits(nworkers()), tests)]) -elapsed_align = textwidth("Time (s)") -gc_align = textwidth("GC (s)") -percent_align = textwidth("GC %") -alloc_align = textwidth("Alloc (MB)") -rss_align = textwidth("RSS (MB)") -printstyled(" "^(name_align + textwidth(testgroupheader) - 3), " | ") -printstyled(" | ---------------- CPU ---------------- |\n", color=:white) -printstyled(testgroupheader, color=:white) -printstyled(lpad(workerheader, name_align - textwidth(testgroupheader) + 1), " | ", color=:white) -printstyled("Time (s) | GC (s) | GC % | Alloc (MB) | RSS (MB) |\n", color=:white) -print_lock = stdout isa Base.LibuvStream ? stdout.lock : ReentrantLock() -if stderr isa Base.LibuvStream - stderr.lock = print_lock -end -function print_testworker_stats(test, wrkr, resp) - @nospecialize resp - lock(print_lock) - try - printstyled(test, color=:white) - printstyled(lpad("($wrkr)", name_align - textwidth(test) + 1, " "), " | ", color=:white) - time_str = @sprintf("%7.2f",resp[2]) - printstyled(lpad(time_str, elapsed_align, " "), " | ", color=:white) + import GPUArrays + include($gpuarrays_testsuite) + testf(f, xs...; kwargs...) = TestSuite.compare(f, oneArray, xs...; kwargs...) 
- cpu_gc_str = @sprintf("%5.2f", resp[4]) - printstyled(lpad(cpu_gc_str, gc_align, " "), " | ", color=:white) - # since there may be quite a few digits in the percentage, - # the left-padding here is less to make sure everything fits - cpu_percent_str = @sprintf("%4.1f", 100 * resp[4] / resp[2]) - printstyled(lpad(cpu_percent_str, percent_align, " "), " | ", color=:white) - cpu_alloc_str = @sprintf("%5.2f", resp[3] / 2^20) - printstyled(lpad(cpu_alloc_str, alloc_align, " "), " | ", color=:white) + const eltypes = [Int16, Int32, Int64, + Complex{Int16}, Complex{Int32}, Complex{Int64}, + Float16, Float32, + ComplexF32] - cpu_rss_str = @sprintf("%5.2f", resp[6] / 2^20) - printstyled(lpad(cpu_rss_str, rss_align, " "), " |\n", color=:white) - finally - unlock(print_lock) + const float16_supported = oneL0.module_properties(device()).fp16flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP16 == oneL0.ZE_DEVICE_MODULE_FLAG_FP16 + if float16_supported + append!(eltypes, [#=Float16,=# ComplexF16]) end -end -global print_testworker_started = (name, wrkr)->begin -end -function print_testworker_errored(name, wrkr) - lock(print_lock) - try - printstyled(name, color=:red) - printstyled(lpad("($wrkr)", name_align - textwidth(name) + 1, " "), " |", - " "^elapsed_align, " failed at $(now())\n", color=:red) - finally - unlock(print_lock) + const float64_supported = oneL0.module_properties(device()).fp64flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 == oneL0.ZE_DEVICE_MODULE_FLAG_FP64 + if float64_supported + append!(eltypes, [Float64, ComplexF64]) end -end + TestSuite.supported_eltypes(::Type{<:oneArray}) = eltypes + + + const validation_layer = parse(Bool, get(ENV, "ZE_ENABLE_VALIDATION_LAYER", "false")) + const parameter_validation = parse(Bool, get(ENV, "ZE_ENABLE_PARAMETER_VALIDATION", "false")) -# run tasks -t0 = now() -results = [] -all_tasks = Task[] -all_tests = copy(tests) -try - # Monitor stdin and kill this task on ^C - # but don't do this on Windows, because it may deadlock in the kernel - t 
= current_task() - running_tests = Dict{String, DateTime}() - if !Sys.iswindows() && isa(stdin, Base.TTY) - stdin_monitor = @async begin - term = REPL.Terminals.TTYTerminal("xterm", stdin, stdout, stderr) - try - REPL.Terminals.raw!(term, true) - while true - c = read(term, Char) - if c == '\x3' - Base.throwto(t, InterruptException()) - break - elseif c == '?' - println("Currently running: ") - tests = sort(collect(running_tests), by=x->x[2]) - foreach(tests) do (test, date) - println(test, " (running for ", round(now()-date, Minute), ")") - end + # NOTE: based on test/pkg.jl::capture_stdout, but doesn't discard exceptions + macro grab_output(ex) + quote + mktemp() do fname, fout + ret = nothing + open(fname, "w") do fout + redirect_stdout(fout) do + ret = $(esc(ex)) end end - catch e - isa(e, InterruptException) || rethrow() - finally - REPL.Terminals.raw!(term, false) + ret, read(fname, String) end end end - @sync begin - function recycle_worker(p) - rmprocs(p, waitfor=30) - return nothing - end - - for p in workers() - @async begin - push!(all_tasks, current_task()) - while length(tests) > 0 - test = popfirst!(tests) - - # sometimes a worker failed, and we need to spawn a new one - if p === nothing - p = addworker(1)[1] - end - wrkr = p - - local resp - - # run the test - running_tests[test] = now() - try - resp = remotecall_fetch(runtests, wrkr, test_runners[test], test) - catch e - isa(e, InterruptException) && return - resp = Any[e] - end - delete!(running_tests, test) - push!(results, (test, resp)) - # act on the results - if resp[1] isa Exception - print_testworker_errored(test, wrkr) - do_quickfail && Base.throwto(t, InterruptException()) - - # the worker encountered some failure, recycle it - # so future tests get a fresh environment - p = recycle_worker(p) - else - print_testworker_stats(test, wrkr, resp) - - cpu_rss = resp[6] - if haskey(ENV, "CI") && cpu_rss > 3*2^30 - # XXX: collecting garbage - # after each test, we are leaking CPU memory somewhere. 
- # this is a problem on CI, where we don't have much RAM. - work around this by periodically recycling the worker. - p = recycle_worker(p) - end - end + # Run some code on-device + macro on_device(ex...) + code = ex[end] + kwargs = ex[1:end-1] + + @gensym kernel + esc(quote + let + function $kernel() + $code + return end - if p !== nothing - recycle_worker(p) - end - end - end + oneAPI.@sync @oneapi $(kwargs...) $kernel() end - end + end) end -catch e - isa(e, InterruptException) || rethrow() - # If the test suite was merely interrupted, still print the - # summary, which can be useful to diagnose what's going on - foreach(task -> begin - istaskstarted(task) || return - istaskdone(task) && return - try - schedule(task, InterruptException(); error=true) - catch ex - @error "InterruptException" exception=ex,catch_backtrace() - end - end, all_tasks) - for t in all_tasks - # NOTE: we can't just wait, but need to discard the exception, - # because the throwto for --quickfail also kills the worker. - try - wait(t) - catch e - showerror(stderr, e) - end - end -finally - if @isdefined stdin_monitor - schedule(stdin_monitor, InterruptException(); error=true) - end -end -t1 = now() -elapsed = canonicalize(Dates.CompoundPeriod(t1-t0)) -println("Testing finished in $elapsed") -# construct a testset to render the test results -o_ts = Test.DefaultTestSet("Overall") -Test.push_testset(o_ts) -completed_tests = Set{String}() -for (testname, (resp,)) in results - push!(completed_tests, testname) - if isa(resp, Test.DefaultTestSet) - Test.push_testset(resp) - Test.record(o_ts, resp) - Test.pop_testset() - elseif isa(resp, Tuple{Int,Int}) - fake = Test.DefaultTestSet(testname) - for i in 1:resp[1] - Test.record(fake, Test.Pass(:test, nothing, nothing, nothing, nothing)) - end - for i in 1:resp[2] - Test.record(fake, Test.Broken(:test, nothing)) - end - Test.push_testset(fake) - Test.record(o_ts, fake) - Test.pop_testset() - elseif isa(resp, RemoteException) && isa(resp.captured.ex, 
Test.TestSetException) - println("Worker $(resp.pid) failed running test $(testname):") - Base.showerror(stdout, resp.captured) - println() - fake = Test.DefaultTestSet(testname) - for i in 1:resp.captured.ex.pass - Test.record(fake, Test.Pass(:test, nothing, nothing, nothing, nothing)) - end - for i in 1:resp.captured.ex.broken - Test.record(fake, Test.Broken(:test, nothing)) - end - for t in resp.captured.ex.errors_and_fails - Test.record(fake, t) - end - Test.push_testset(fake) - Test.record(o_ts, fake) - Test.pop_testset() - else - if !isa(resp, Exception) - resp = ErrorException(string("Unknown result type : ", typeof(resp))) - end - # If this test raised an exception that is not a remote testset exception, - # i.e. not a RemoteException capturing a TestSetException that means - # the test runner itself had some problem, so we may have hit a segfault, - # deserialization errors or something similar. Record this testset as Errored. - fake = Test.DefaultTestSet(testname) - Test.record(fake, Test.Error(:nontest_error, testname, nothing, Any[(resp, [])], LineNumberNode(1))) - Test.push_testset(fake) - Test.record(o_ts, fake) - Test.pop_testset() - end + # helper function for sinking a value to prevent the callee from getting optimized away + @inline sink(i::Int32) = + Base.llvmcall("""%slot = alloca i32 + store volatile i32 %0, i32* %slot + %value = load volatile i32, i32* %slot + ret i32 %value""", Int32, Tuple{Int32}, i) + @inline sink(i::Int64) = + Base.llvmcall("""%slot = alloca i64 + store volatile i64 %0, i64* %slot + %value = load volatile i64, i64* %slot + ret i64 %value""", Int64, Tuple{Int64}, i) end -for test in all_tests - (test in completed_tests) && continue - fake = Test.DefaultTestSet(test) - Test.record(fake, Test.Error(:test_interrupted, test, nothing, - [("skipped", [])], LineNumberNode(1))) - Test.push_testset(fake) - Test.record(o_ts, fake) - Test.pop_testset() -end -println() -Test.print_test_results(o_ts, 1) -if !o_ts.anynonpass - println(" 
\033[32;1mSUCCESS\033[0m") -else - println(" \033[31;1mFAILURE\033[0m\n") - Test.print_test_errors(o_ts) - throw(Test.FallbackTestSetException("Test run finished with errors")) + +init_code = quote + using oneAPI, Adapt + + import ..TestSuite, ..testf + import ..eltypes, ..float16_supported, ..float64_supported, + ..validation_layer, ..parameter_validation, + ..@grab_output, ..@on_device, ..sink end +runtests(oneAPI, args; testsuite, init_code, init_worker_code) diff --git a/test/setup.jl b/test/setup.jl deleted file mode 100644 index 269d5b9c..00000000 --- a/test/setup.jl +++ /dev/null @@ -1,136 +0,0 @@ -using Distributed, Test, oneAPI - -oneAPI.functional() || error("oneAPI.jl is not functional on this system") - -# GPUArrays has a testsuite that isn't part of the main package. -# Include it directly. -import GPUArrays -gpuarrays = pathof(GPUArrays) -gpuarrays_root = dirname(dirname(gpuarrays)) -include(joinpath(gpuarrays_root, "test", "testsuite.jl")) -testf(f, xs...; kwargs...) = TestSuite.compare(f, oneArray, xs...; kwargs...) 
- -const eltypes = [Int16, Int32, Int64, - Complex{Int16}, Complex{Int32}, Complex{Int64}, - Float16, Float32, - ComplexF32] -const float16_supported = oneL0.module_properties(device()).fp16flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP16 == oneL0.ZE_DEVICE_MODULE_FLAG_FP16 -if float16_supported - append!(eltypes, [#=Float16,=# ComplexF16]) -end -const float64_supported = oneL0.module_properties(device()).fp64flags & oneL0.ZE_DEVICE_MODULE_FLAG_FP64 == oneL0.ZE_DEVICE_MODULE_FLAG_FP64 -if float64_supported - append!(eltypes, [Float64, ComplexF64]) -end -TestSuite.supported_eltypes(::Type{<:oneArray}) = eltypes - -const validation_layer = parse(Bool, get(ENV, "ZE_ENABLE_VALIDATION_LAYER", "false")) -const parameter_validation = parse(Bool, get(ENV, "ZE_ENABLE_PARAMETER_VALIDATION", "false")) - -using Random - - -## entry point - -function runtests(f, name) - old_print_setting = Test.TESTSET_PRINT_ENABLE[] - Test.TESTSET_PRINT_ENABLE[] = false - - try - # generate a temporary module to execute the tests in - mod_name = Symbol("Test", rand(1:100), "Main_", replace(name, '/' => '_')) - mod = @eval(Main, module $mod_name end) - @eval(mod, using Test, Random, oneAPI) - - let id = myid() - wait(@spawnat 1 print_testworker_started(name, id)) - end - - ex = quote - GC.gc(true) - Random.seed!(1) - oneAPI.allowscalar(false) - - @timed @testset $"$name" begin - $f() - end - end - data = Core.eval(mod, ex) - #data[1] is the testset - - # process results - cpu_rss = Sys.maxrss() - if VERSION >= v"1.11.0-DEV.1529" - tc = Test.get_test_counts(data[1]) - passes,fails,error,broken,c_passes,c_fails,c_errors,c_broken = - tc.passes, tc.fails, tc.errors, tc.broken, tc.cumulative_passes, - tc.cumulative_fails, tc.cumulative_errors, tc.cumulative_broken - else - passes,fails,errors,broken,c_passes,c_fails,c_errors,c_broken = - Test.get_test_counts(data[1]) - end - if data[1].anynonpass == false - data = ((passes+c_passes,broken+c_broken), - data[2], - data[3], - data[4], - data[5]) - end - res = 
vcat(collect(data), cpu_rss) - - GC.gc(true) - res - finally - Test.TESTSET_PRINT_ENABLE[] = old_print_setting - end -end - - -## auxiliary stuff - -# NOTE: based on test/pkg.jl::capture_stdout, but doesn't discard exceptions -macro grab_output(ex) - quote - mktemp() do fname, fout - ret = nothing - open(fname, "w") do fout - redirect_stdout(fout) do - ret = $(esc(ex)) - end - end - ret, read(fname, String) - end - end -end - -# Run some code on-device -macro on_device(ex...) - code = ex[end] - kwargs = ex[1:end-1] - - @gensym kernel - esc(quote - let - function $kernel() - $code - return - end - - oneAPI.@sync @oneapi $(kwargs...) $kernel() - end - end) -end - -# helper function for sinking a value to prevent the callee from getting optimized away -@inline sink(i::Int32) = - Base.llvmcall("""%slot = alloca i32 - store volatile i32 %0, i32* %slot - %value = load volatile i32, i32* %slot - ret i32 %value""", Int32, Tuple{Int32}, i) -@inline sink(i::Int64) = - Base.llvmcall("""%slot = alloca i64 - store volatile i64 %0, i64* %slot - %value = load volatile i64, i64* %slot - ret i64 %value""", Int64, Tuple{Int64}, i) - -nothing # File is loaded via a remotecall to "include". Ensure it returns "nothing".