Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 14 additions & 19 deletions lib/attributes/backend/launcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ void collectLoops(OklLoopInfo& loopInfo, std::list<OklLoopInfo*>& out) {
}
#endif

std::pair<LoopMetaData, LoopMetaData> splitTileAttr(OklLoopInfo& loopInfo, const oklt::Rewriter& r) {
std::pair<LoopMetaData, LoopMetaData> splitTileAttr(OklLoopInfo& loopInfo,
const oklt::Rewriter& r) {
auto sz = util::parseStrTo<size_t>(loopInfo.tileSize);

// Prepare first loop
Expand All @@ -210,15 +211,11 @@ std::pair<LoopMetaData, LoopMetaData> splitTileAttr(OklLoopInfo& loopInfo, const
if (sz.value_or(1024) > 0) {
if (firstMeta.inc.val.empty()) {
firstMeta.inc.val = loopInfo.tileSize;
switch (firstMeta.inc.op.uo) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But we still don't handle UnOp::Other case, which will leave firstMeta.inc.op.bo uninitialized

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would ask the author of this code what should happen

Copy link
Collaborator

@kchabSS kchabSS May 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the ::Other were originally there as default values.
In case OP cannot be properly detected (or there is no increment/decrement at all), the default Other is set.
If the Other is detected at the end of loop parsing, an error must be raised.
That is how it was originally written for launcher PoC.

case UnOp::PreInc:
case UnOp::PostInc:
firstMeta.inc.op.bo = BinOp::AddAssign;
break;
case UnOp::PreDec:
case UnOp::PostDec:
firstMeta.inc.op.bo = BinOp::RemoveAssign;
break;
if (firstMeta.inc.op.uo == UnOp::PreInc || firstMeta.inc.op.uo == UnOp::PostInc) {
firstMeta.inc.op.bo = BinOp::AddAssign;
}
if (firstMeta.inc.op.uo == UnOp::PreDec || firstMeta.inc.op.uo == UnOp::PostDec) {
firstMeta.inc.op.bo = BinOp::RemoveAssign;
}
} else {
firstMeta.inc.val = "(" + loopInfo.tileSize + " * " + firstMeta.inc.val + ")";
Expand All @@ -228,13 +225,11 @@ std::pair<LoopMetaData, LoopMetaData> splitTileAttr(OklLoopInfo& loopInfo, const
// Prepare second loop
auto secondMeta = LoopMetaData(loopInfo, r);
secondMeta.range.start = firstMeta.var.name;
switch (secondMeta.condition.op) {
case BinOp::Le:
secondMeta.condition.op = BinOp::Lt;
break;
case BinOp::Ge:
secondMeta.condition.op = BinOp::Gt;
break;
if (secondMeta.condition.op == BinOp::Le) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that a correct behavior that LessEqual translates to LessThan?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that okay that we skip other BinOp cases?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is how it was. I would imagine that there could be any kind of Binary/Unary operation for example:

struct MyInfo { int sizes[3] };
....
@kernel void func(..., MyInfo* ptr) {
    @outer for(int i =0 ; i <  *ptr.sizes[0]; i += *ptr.sizes[2]) {
       ...
     }
}

As far as I can see we just pass it as is

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that a correct behavior that LessEqual translates to LessThan?
Yes. That's correct.

secondMeta.condition.op = BinOp::Lt;
}
if (secondMeta.condition.op == BinOp::Ge) {
secondMeta.condition.op = BinOp::Gt;
}
if (sz.value_or(1024) > 0) {
secondMeta.range.end = "(" + firstMeta.var.name + " + " + loopInfo.tileSize + ")";
Expand Down Expand Up @@ -374,8 +369,8 @@ HandleResult handleLauncherKernelAttribute(SessionStage& s,
auto& rewriter = s.getRewriter();

if (!sema.getParsingKernelInfo()) {
return tl::make_unexpected(Error{OkltPipelineErrorCode::INTERNAL_ERROR_KERNEL_INFO_NULL,
"handleKernelAttribute"});
return tl::make_unexpected(
Error{OkltPipelineErrorCode::INTERNAL_ERROR_KERNEL_INFO_NULL, "handleKernelAttribute"});
}

auto kernelInfo = *sema.getParsingKernelInfo();
Expand Down
8 changes: 6 additions & 2 deletions lib/core/utils/for_stmt_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ tl::expected<OklLoopInfo, Error> parseForStmt(SessionStage& stage,
const clang::Attr* a) {
auto& ctx = stage.getCompiler().getASTContext();
OklLoopInfo ret{.attr = a, .stmt = s};
const Expr *start, *end = nullptr;
const Expr* start = nullptr;
const Expr* end = nullptr;

if (isa<DeclStmt>(s.getInit())) {
auto d = dyn_cast<DeclStmt>(s.getInit());
Expand All @@ -90,8 +91,11 @@ tl::expected<OklLoopInfo, Error> parseForStmt(SessionStage& stage,
start = rsh->getSubExpr();
}
ret.range.start = start;
}

auto child_count = std::distance(start->children().begin(), start->children().end());
if (!start) {
return tl::make_unexpected(
Error{std::error_code(), "loop parse: not supported init statement"});
}

// Condition
Expand Down
Original file line number Diff line number Diff line change
@@ -1,62 +1,63 @@
#include <cuda_runtime.h>
#include <cuda_pipeline_primitives.h>
#include <cuda_runtime.h>

namespace {
// Math functions

// Single precision
[[maybe_unused]] inline __device__ float okl_exp10f(float x) { return exp10f(x); }
[[maybe_unused]] inline __device__ float okl_exp10f(float x) {
return exp10f(x);
}

// Warp Shuffle Functions
template <class T>
inline __device__ T okl_shfl_sync(unsigned mask, T var, int srcLane,
int width = warpSize) {
return __shfl_sync(mask, var, srcLane, width);
inline __device__ T okl_shfl_sync(unsigned mask, T var, int srcLane, int width = warpSize) {
return __shfl_sync(mask, var, srcLane, width);
}

template <class T>
inline __device__ T okl_shfl_up_sync(unsigned mask, T var, unsigned int delta,
int width = warpSize) {
return __shfl_up_sync(mask, var, delta, width);
inline __device__ T
okl_shfl_up_sync(unsigned mask, T var, unsigned int delta, int width = warpSize) {
return __shfl_up_sync(mask, var, delta, width);
}

template <class T>
inline __device__ T okl_shfl_down_sync(unsigned mask, T var, unsigned int delta,
int width = warpSize) {
return __shfl_down_sync(mask, var, delta, width);
inline __device__ T
okl_shfl_down_sync(unsigned mask, T var, unsigned int delta, int width = warpSize) {
return __shfl_down_sync(mask, var, delta, width);
}

template <class T>
inline __device__ T okl_shfl_xor_sync(unsigned mask, T var, int laneMask,
int width = warpSize) {
return __shfl_xor_sync(mask, laneMask, width);
inline __device__ T okl_shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize) {
return __shfl_xor_sync(mask, var, laneMask, width);
}

// Pipeline Primitives Interface
[[maybe_unused]] _CUDA_PIPELINE_STATIC_QUALIFIER void
okl_memcpy_async(void *__restrict__ dst_shared,
const void *__restrict__ src_global,
size_t size_and_align, size_t zfill = 0) {
__pipeline_memcpy_async(dst_shared, src_global, size_and_align);
[[maybe_unused]] _CUDA_PIPELINE_STATIC_QUALIFIER void okl_memcpy_async(
void* __restrict__ dst_shared,
const void* __restrict__ src_global,
size_t size_and_align,
size_t zfill = 0) {
__pipeline_memcpy_async(dst_shared, src_global, size_and_align);
}

[[maybe_unused]] _CUDA_PIPELINE_STATIC_QUALIFIER void okl_pipeline_commit() {
__pipeline_commit();
__pipeline_commit();
}

[[maybe_unused]] _CUDA_PIPELINE_STATIC_QUALIFIER void
okl_pipeline_wait_prior(size_t N) { __pipeline_wait_prior(N); }
} // namespace
[[maybe_unused]] _CUDA_PIPELINE_STATIC_QUALIFIER void okl_pipeline_wait_prior(size_t N) {
__pipeline_wait_prior(N);
}
} // namespace

extern "C" __global__
__launch_bounds__(1) void _occa_intrinsic_builtin_0(const float *fVec,
float *fSum) {
{
int i = (0) + blockIdx.x;
extern "C" __global__ __launch_bounds__(1) void _occa_intrinsic_builtin_0(const float* fVec,
float* fSum) {
{
int j = (0) + threadIdx.x;
float value = okl_exp10f(fVec[i]);
atomicAdd(&(*fSum), value);
int i = (0) + blockIdx.x;
{
int j = (0) + threadIdx.x;
float value = okl_exp10f(fVec[i]);
atomicAdd(&(*fSum), value);
}
}
}
}