From fd74204825abbedcd4d7c85eb9040b48b4d00a94 Mon Sep 17 00:00:00 2001
From: brendancol <brendan@makepath.com>
Date: Mon, 22 Dec 2025 10:17:28 -0500
Subject: [PATCH] added example files

---
 examples/cuda_utils.py |  46 +++++++++
 examples/hillshade.py  | 228 +++++++++++++++++++++++++++++++++++++++++
 examples/mesh_utils.py | 129 +++++++++++++++++++++++
 examples/playground.py | 122 ++++++++++++++++++++++
 examples/viewshed.py   | 221 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 746 insertions(+)
 create mode 100644 examples/cuda_utils.py
 create mode 100644 examples/hillshade.py
 create mode 100644 examples/mesh_utils.py
 create mode 100644 examples/playground.py
 create mode 100644 examples/viewshed.py

diff --git a/examples/cuda_utils.py b/examples/cuda_utils.py
new file mode 100644
index 0000000..e73690a
--- /dev/null
+++ b/examples/cuda_utils.py
@@ -0,0 +1,46 @@
+import numba as nb
+import numpy as np
+
+def calc_dims(shape):
+    threadsperblock = (32,32)
+    blockspergrid = (
+        (shape[0] + (threadsperblock[0] - 1)) // threadsperblock[0],
+        (shape[1] + (threadsperblock[1] - 1)) // threadsperblock[1]
+    )
+    return blockspergrid, threadsperblock
+
+@nb.cuda.jit(device=True)
+def add(a, b):
+    return float3(a[0]+b[0], a[1]+b[1], a[2]+b[2])
+
+@nb.cuda.jit(device=True)
+def diff(a, b):
+    return float3(a[0]-b[0], a[1]-b[1], a[2]-b[2])
+
+@nb.cuda.jit(device=True)
+def mul(a, b):
+    return float3(a[0]*b, a[1]*b, a[2]*b)
+
+@nb.cuda.jit(device=True)
+def multColor(a, b):
+    return float3(a[0]*b[0], a[1]*b[1], a[2]*b[2])
+
+@nb.cuda.jit(device=True)
+def dot(a, b):
+    return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
+
+@nb.cuda.jit(device=True)
+def mix(a, b, k):
+    return add(mul(a, k), mul(b, 1-k))
+
+@nb.cuda.jit(device=True)
+def make_float3(a, offset):
+    return float3(a[offset], a[offset+1], a[offset+2])
+
+@nb.cuda.jit(device=True)
+def invert(a):
+    return float3(-a[0], -a[1], -a[2])
+
+@nb.cuda.jit(device=True)
+def float3(a, b, c):
+    return (np.float32(a), np.float32(b), np.float32(c))
diff --git a/examples/hillshade.py b/examples/hillshade.py
new file mode 100644
index 0000000..9276e1d
--- /dev/null
+++ b/examples/hillshade.py
@@ -0,0 +1,228 @@
+import numpy as np
+import numba as nb
+
+import cupy
+import xarray as xr
+from rtxpy import RTX
+
+from typing import Optional
+
+from scipy.spatial.transform import Rotation as R
+
+from raytrace.cuda_utils import *
+from raytrace import mesh_utils
+
+@nb.cuda.jit
+def _generatePrimaryRays(data, x_coords, y_coords, H, W):
+    """
+    A GPU kernel that given a set of x and y discrete coordinates on a raster terrain
+    generates in @data a list of parallel rays that represent camera rays generated from an ortographic camera
+    that is looking straight down at the surface from an origin height 10000
+    """
+    i, j = nb.cuda.grid(2)
+    if i>=0 and i < H and j>=0 and j < W:
+        #data[i,j,0] = j + 1e-6 # x_coords[j] + 1e-6
+        #data[i,j,1] = i + 1e-6 # y_coords[i] + 1e-6
+
+        if (j == W-1):
+            data[i,j,0] = j - 1e-3
+        else:
+            data[i,j,0] = j + 1e-3
+
+        if (i == H-1):
+            data[i,j,1] = i - 1e-3
+        else:
+            data[i,j,1] = i + 1e-3
+
+        data[i,j,2] = 10000 # Location of the camera (height)
+        data[i,j,3] = 1e-3
+        data[i,j,4] = 0
+        data[i,j,5] = 0
+        data[i,j,6] = -1
+        data[i,j,7] = np.inf
+
+
+def generatePrimaryRays(rays, x_coords, y_coords, H, W):
+    griddim, blockdim = calc_dims((H, W))
+    _generatePrimaryRays[griddim, blockdim](rays, x_coords, y_coords, H, W)
+    return 0
+
+
+@nb.cuda.jit
+def _generateShadowRays(rays, hits, normals, H, W, sunDir):
+    """
+    A GPU kernel that given a set rays and their respective intersection points,
+    generates in rays (overwriting the original content) a new set of rays (shadow rays)
+    That have their origins at the point of intersection of their parent ray and direction - the direction towards the sun
+    The normals vectors at the point of intersection of the original rays are cached in @normals
+    Thus we can later use them to do lambertian shading, after the shadow rays have been traced
+    """
+    i, j = nb.cuda.grid(2)
+    if i>=0 and i < H and j>=0 and j < W:
+        dist = hits[i,j,0]
+        norm = make_float3(hits[i,j], 1)
+        if (norm[2] < 0):
+            norm = invert(norm)
+        ray = rays[i,j]
+        rayOrigin = make_float3(ray, 0)
+        rayDir    = make_float3(ray, 4)
+        p = add(rayOrigin, mul(rayDir,dist))
+
+        newOrigin = add(p, mul(norm, 1e-3))
+        ray[0] = newOrigin[0]
+        ray[1] = newOrigin[1]
+        ray[2] = newOrigin[2]
+        ray[3] = 1e-3
+        ray[4] = sunDir[0]
+        ray[5] = sunDir[1]
+        ray[6] = sunDir[2]
+        ray[7] = np.inf if dist > 0 else 0
+
+        normals[i,j,0] = norm[0]
+        normals[i,j,1] = norm[1]
+        normals[i,j,2] = norm[2]
+
+def generateShadowRays(rays, hits, normals, H, W, sunDir):
+    griddim, blockdim = calc_dims((H, W))
+    _generateShadowRays[griddim, blockdim](rays, hits, normals, H, W, sunDir)
+    return 0
+
+
+@nb.cuda.jit
+def _shadeLambert(hits, normals, output, H, W, sunDir, castShadows):
+    """
+    This kernel does a simple Lambertian shading
+    The hits array contains the results of tracing the shadow rays through the scene.
+    If the value in hits[x,y,0] is > 0, then a valid intersection occurred and that means that the point
+    at location x,y is in shadow.
+    The normals array stores the normal at the intersecion point of each camera ray
+    We then use the information for light visibility and normal to apply Lambert's cosine law
+    The final result is stored in output which is an RGB array
+    """
+    i, j = nb.cuda.grid(2)
+    if i>=0 and i < H and j>=0 and j < W:
+        # Normal at the intersection of camera ray (i,j) with the scene
+        norm = make_float3(normals[i,j], 0)
+
+        # Below is same as existing algorithm without shadows and is OK with shadows.
+        # Could be improved with a bit of antialiasing at edges of shadow???
+
+        light_dir = make_float3(sunDir, 0)  # Might have to make it zero if back cull.
+        cos_theta = dot(light_dir, norm)  # light_dir and norm are already normalised.
+
+        temp = (cos_theta + 1) / 2
+
+        if castShadows and hits[i, j, 0] >= 0:
+            temp = temp / 2
+
+        if temp > 1:
+            temp = 1
+        elif temp < 0:
+            temp = 0
+
+        output[i, j] = temp
+
+
+
+def shadeLambert(hits, normals, output, H, W, sunDir, castShadows):
+    griddim, blockdim = calc_dims((H, W))
+    _shadeLambert[griddim, blockdim](hits, normals, output, H, W, sunDir, castShadows)
+    return 0
+
+
+def getSunDir(angle_altitude, azimuth):
+    """
+    Calculate the vector towards the sun based on sun altitude angle and azimuth
+    """
+    north = (0,1,0)
+    rx = R.from_euler('x', angle_altitude, degrees=True)
+    rz = R.from_euler('z', azimuth+180, degrees=True)
+    sunDir = rx.apply(north)
+    sunDir = rz.apply(sunDir)
+    return sunDir
+
+
+def hillshade_rt(raster: xr.DataArray,
+              optix: RTX,
+              shadows: bool = False,
+              azimuth: int = 225,
+              angle_altitude: int = 25,
+              name: Optional[str] = 'hillshade') -> xr.DataArray:
+
+    H,W = raster.shape
+    sunDir = cupy.array(getSunDir(angle_altitude, azimuth))
+
+    #output = np.zeros((H,W,3), np.float32)
+
+    # Device buffers
+    d_rays   = cupy.empty((H,W,8), np.float32)
+    d_hits   = cupy.empty((H,W,4), np.float32)
+    d_aux    = cupy.empty((H,W,3), np.float32)
+    d_output = cupy.empty((H,W), np.float32)
+
+    y_coords = cupy.array(raster.indexes.get('y').values)
+    x_coords = cupy.array(raster.indexes.get('x').values)
+
+    generatePrimaryRays(d_rays, x_coords, y_coords, H, W)
+    device = cupy.cuda.Device(0)
+    device.synchronize()
+    res = optix.trace(d_rays, d_hits, W*H)
+
+    generateShadowRays(d_rays, d_hits, d_aux, H, W, sunDir)
+    if shadows:
+        device.synchronize()
+        res = optix.trace(d_rays, d_hits, W*H)
+
+    shadeLambert(d_hits, d_aux, d_output, H, W, sunDir, shadows)
+
+    if isinstance(raster.data, np.ndarray):
+        output = cupy.asnumpy(d_output[:, :])
+        nanValue = np.nan
+    else:
+        output = d_output[:, :]
+        nanValue = cupy.nan
+
+    output[0, :] = nanValue
+    output[-1, :] = nanValue
+    output[:, 0] = nanValue
+    output[:, -1] = nanValue
+
+    hill = xr.DataArray(output,
+                    name=name,
+                    coords=raster.coords,
+                    dims=raster.dims,
+                    attrs=raster.attrs)
+    return hill
+
+def hillshade_gpu(raster: xr.DataArray,
+              shadows: bool = False,
+              azimuth: int = 225,
+              angle_altitude: int = 25,
+              name: Optional[str] = 'hillshade') -> xr.DataArray:
+    # Move the terrain to GPU for testing the GPU path
+    if not isinstance(raster.data, cupy.ndarray):
+        print("WARNING: raster.data is not a cupy array. Additional overhead will be incurred")
+    H,W = raster.shape
+    optix = RTX()
+
+    datahash = np.uint64(hash(str(raster.data.get())))
+    optixhash = np.uint64(optix.getHash())
+    if (optixhash != datahash):
+        numTris = (H - 1) * (W - 1) * 2
+        verts = cupy.empty(H * W * 3, np.float32)
+        triangles = cupy.empty(numTris * 3, np.int32)
+
+        # Generate a mesh from the terrain (buffers are on the GPU, so generation happens also on GPU)
+        res = mesh_utils.triangulateTerrain(verts, triangles, raster)
+        if res:
+            raise ValueError("Failed to generate mesh from terrain. Error code:{}".format(res))
+        res = optix.build(datahash, verts, triangles)
+        if res:
+            raise ValueError("OptiX failed to build GAS with error code:{}".format(res))
+        #Clear some GPU memory that we no longer need
+        verts = None
+        triangles = None
+        cupy.get_default_memory_pool().free_all_blocks()
+
+    hill = hillshade_rt(raster, optix, azimuth=azimuth, angle_altitude=angle_altitude, shadows=shadows, name=name)
+    return hill
diff --git a/examples/mesh_utils.py b/examples/mesh_utils.py
new file mode 100644
index 0000000..e847558
--- /dev/null
+++ b/examples/mesh_utils.py
@@ -0,0 +1,129 @@
+
+import numba as nb
+import numpy as np
+import cupy
+from raytrace.cuda_utils import calc_dims
+
+@nb.cuda.jit
+def _triangulateTerrain(verts, triangles, data, H, W, scale, stride):
+    globalId = stride + nb.cuda.grid(1)
+    if globalId < W*H:
+        h = globalId // W
+        w = globalId % W
+        meshMapIndex = h * W + w
+
+        val = data[h,w]
+
+        offset = 3*meshMapIndex
+        verts[offset]   = w # x_coords[w] # w
+        verts[offset+1] = h # y_coords[h] # h
+        verts[offset+2] = val * scale
+
+        if w != W - 1 and h != H - 1:
+            offset = 6*(h * (W-1) + w)
+            triangles[offset+0]= np.int32(meshMapIndex + W)
+            triangles[offset+1]= np.int32(meshMapIndex + W + 1)
+            triangles[offset+2]= np.int32(meshMapIndex)
+            triangles[offset+3]= np.int32(meshMapIndex + W + 1)
+            triangles[offset+4]= np.int32(meshMapIndex + 1)
+            triangles[offset+5]= np.int32(meshMapIndex)
+
+@nb.njit(parallel=True)
+def triangulateCPU(verts, triangles, data, H, W, scale):
+    for h in nb.prange(H):
+        for w in range(W):
+            meshMapIndex = h * W + w
+
+            val = data[h,w]
+
+            offset = 3*meshMapIndex
+            verts[offset]   = w # x_coords[w] # w
+            verts[offset+1] = h # y_coords[h] # h
+            verts[offset+2] = val * scale
+
+            if w != W - 1 and h != H - 1:
+                offset = 6*(h * (W-1) + w)
+                triangles[offset+0]= np.int32(meshMapIndex + W)
+                triangles[offset+1]= np.int32(meshMapIndex + W + 1)
+                triangles[offset+2]= np.int32(meshMapIndex)
+                triangles[offset+3]= np.int32(meshMapIndex + W+1)
+                triangles[offset+4]= np.int32(meshMapIndex + 1)
+                triangles[offset+5]= np.int32(meshMapIndex)
+
+
+def triangulateTerrain(verts, triangles, terrain, scale=1):
+    H,W = terrain.shape
+    if isinstance(terrain.data, np.ndarray):
+        triangulateCPU(verts, triangles, terrain.data, H, W, scale)
+    if isinstance(terrain.data, cupy.ndarray):
+        jobSize = H*W
+        blockdim = 1024
+        griddim = (jobSize + blockdim - 1) // 1024
+        d = 100
+        offset = 0
+        while(jobSize>0):
+            batch = min(d, griddim)
+            _triangulateTerrain[batch, blockdim](verts, triangles, terrain.data, H, W, scale, offset)
+            offset += batch*blockdim
+            jobSize -= batch*blockdim
+    return 0
+
+
+@nb.jit(nopython=True)
+def fillContents(content, verts, triangles, numTris):
+    v = np.empty(12, np.float32)
+    pad = np.zeros(2, np.int8)
+    offset = 0
+    for i in range(numTris):
+        t0 = triangles[3*i+0]
+        t1 = triangles[3*i+1]
+        t2 = triangles[3*i+2]
+        v[3*0+0] = 0
+        v[3*0+1] = 0
+        v[3*0+2] = 0
+        v[3*1+0] = verts[3*t0+0]
+        v[3*1+1] = verts[3*t0+1]
+        v[3*1+2] = verts[3*t0+2]
+        v[3*2+0] = verts[3*t1+0]
+        v[3*2+1] = verts[3*t1+1]
+        v[3*2+2] = verts[3*t1+2]
+        v[3*3+0] = verts[3*t2+0]
+        v[3*3+1] = verts[3*t2+1]
+        v[3*3+2] = verts[3*t2+2]
+
+        offset = 50*i
+        content[offset:offset+48] = v.view(np.uint8)
+        content[offset+48:offset+50] = pad
+
+def write(name, verts, triangles):
+    """
+    Save a triangulated raster to a standard STL file.
+    Windows has a default STL viewer and probably all 3D viewers have native support for it
+    because of its simplicity. Can be used to verify the correctness of the algorithm or
+    to visualize the mesh to get a notion of the size/complexity etc.
+    @param name - The name of the mesh file we're going to save. Should end in .stl
+    @param verts - A numpy array containing all the vertices of the mesh. Format is 3 float32 per vertex (vertex buffer)
+    @param triangles - A numpy array containing all the triangles of the mesh. Format is 3 int32 per triangle (index buffer)
+    """
+    ib = triangles
+    vb = verts
+    if isinstance(ib, cupy.ndarray):
+        ib = cupy.asnumpy(ib)
+    if isinstance(vb, cupy.ndarray):
+        vb = cupy.asnumpy(vb)
+
+    header = np.zeros(80, np.uint8)
+    nf = np.empty(1, np.uint32)
+    numTris = triangles.shape[0] // 3
+    nf[0] = numTris
+    f=open(name,'wb')
+    f.write(header)
+    f.write(nf)
+
+    # size of 1 triangle in STL is 50 bytes
+    # 12 floats (each 4 bytes) for a total of 48
+    # And additional 2 bytes for padding
+    content = np.empty(numTris*(50), np.uint8)
+    fillContents(content, vb, ib, numTris)
+    f.write(content)
+    f.close()
\ No newline at end of file
diff --git a/examples/playground.py b/examples/playground.py
new file mode 100644
index 0000000..271ad16
--- /dev/null
+++ b/examples/playground.py
@@ -0,0 +1,122 @@
+from datashader.transfer_functions import shade
+from datashader.transfer_functions import stack
+from datashader.transfer_functions import dynspread
+
+import matplotlib.pyplot as plt
+
+import numpy as np
+import cupy
+
+import os, sys
+#Add the parent folder to python search space
+sys.path.append(os.path.abspath(os.getcwd())+"/..")
+sys.path.append(os.path.abspath(os.getcwd()))
+
+from raytrace import hillshade_gpu
+from raytrace import viewshed_gpu
+
+from rtxpy import RTX
+
+import xarray as xr;
+
+from raytrace.hillshade import getSunDir
+#getSunDir(angle_altitude, azimuth)
+
+#terrain = xr.open_dataarray("olympic_national_park.nc")
+#terrain.data = terrain.data * 0.025 #scale down
+terrain = xr.open_dataarray("crater_lake_national_park.nc")
+
+terrain = terrain[::2, ::2]
+
+terrain.data = terrain.data * 0.2 #scale down
+
+azimuth = 225
+
+
+def debug(x,y):
+    rays = cupy.float32([x,y,10000,0,0,0,-1,np.inf])
+    hits = cupy.float32([0,0,0,0])
+    optix = RTX()
+    res = optix.trace(rays, hits, 1)
+    norm = cupy.asnumpy(hits[1:])
+    sun = getSunDir(25, azimuth)
+
+    return res
+
+def onclick(event):
+    """
+    Simple click handler for live adjustment of the viewshed origin when running matplotlib live session
+    """
+    ix, iy = event.xdata, event.ydata
+    print ('x = {}, y = {}'.format(ix, iy))
+
+    nix = ix/terrain.shape[1]
+    niy = iy/terrain.shape[0]
+
+    x_coords = terrain.indexes.get('x').values
+    y_coords = terrain.indexes.get('y').values
+    rangex = x_coords.max() - x_coords.min()
+    rangey = y_coords.max() - y_coords.min()
+
+    global vsw, vsh
+    vsw = x_coords.min() + nix*rangex
+    vsh = y_coords.max() - niy*rangey
+
+    #debug(ix, iy)
+    return None
+
+def test():
+    runs = 360
+    H,W = terrain.data.shape
+    if isinstance(terrain.data, np.ndarray):
+        terrain.data = cupy.array(terrain.data)
+
+    fig = plt.figure()
+    cid = fig.canvas.mpl_connect('button_press_event', onclick)
+    colors = np.uint8(np.zeros((H,W,3)))
+    imgplot = plt.imshow(colors)
+
+    x_coords = terrain.indexes.get('x').values
+    y_coords = terrain.indexes.get('y').values
+    midx = x_coords.min() + (x_coords.max() - x_coords.min()) / 2
+    midy = y_coords.min() + (y_coords.max() - y_coords.min()) / 2
+
+    global vsw, vsh
+    global azimuth
+    vsw = midx
+    vsh = midy
+
+    import time
+    for i in range(runs):
+        print("Begin Frame ", i)
+        azimuth = azimuth + 5
+        if (azimuth > 360):
+            azimuth -= 360
+
+        beforeRT = time.time()
+        hs = hillshade_gpu(terrain, shadows=True, azimuth=azimuth, angle_altitude=25)
+        vs = viewshed_gpu(terrain, x=vsw, y=vsh, observer_elev=0.01)
+        afterRT = time.time()
+        print("  RT took ", afterRT-beforeRT)
+
+        img = np.uint8(hs.data.get()*225)
+
+        withViewshed = True#False
+        if withViewshed:
+            visBuf = np.uint8(vs.data.get() > 0) * 255
+            view = np.maximum(visBuf,img)
+        else:
+            view = img
+        colors[:,:,0] = view
+        colors[:,:,1] = img
+        colors[:,:,2] = img
+        imgplot.set_data(colors)
+        plt.pause(0.001)
+
+    plt.show()
+
+    return
+
+res = test()
+
+print("Done")
diff --git a/examples/viewshed.py b/examples/viewshed.py
new file mode 100644
index 0000000..502c7d9
--- /dev/null
+++ b/examples/viewshed.py
@@ -0,0 +1,221 @@
+import numpy as np
+import numba as nb
+
+import cupy
+import xarray as xr
+from rtxpy import RTX
+import math
+
+from typing import Union
+
+from scipy.spatial.transform import Rotation as R
+
+from raytrace.cuda_utils import *
+from raytrace import mesh_utils
+
+# view options default values
+OBS_ELEV = 0
+TARGET_ELEV = 0
+
+# if a cell is invisible, its value is set to -1
+INVISIBLE = -1
+
+@nb.cuda.jit
+def _generatePrimaryRays(data, x_coords, y_coords, H, W):
+    i, j = nb.cuda.grid(2)
+    if i>=0 and i < H and j>=0 and j < W:
+#        if (j == W-1):
+#            data[i,j,0] = y_coords[j] - 1e-6
+#        else:
+#            data[i,j,0] = y_coords[j] + 1e-6
+#
+#        if (i == H-1):
+#            data[i,j,1] = x_coords[i] - 1e-6
+#        else:
+#            data[i,j,1] = x_coords[i] + 1e-6
+
+        if (j == W-1):
+            data[i,j,0] = j - 1e-3
+        else:
+            data[i,j,0] = j + 1e-3
+
+        if (i == H-1):
+            data[i,j,1] = i - 1e-3
+        else:
+            data[i,j,1] = i + 1e-3
+
+        data[i,j,2] = 10000 # Location of the camera (height)
+        data[i,j,3] = 1e-3
+        data[i,j,4] = 0
+        data[i,j,5] = 0
+        data[i,j,6] = -1
+        data[i,j,7] = np.inf
+
+def generatePrimaryRays(rays, x_coords, y_coords, H, W):
+    griddim, blockdim = calc_dims((H, W))
+    d_y_coords = cupy.array(y_coords)
+    d_x_coords = cupy.array(x_coords)
+    _generatePrimaryRays[griddim, blockdim](rays, d_x_coords, d_y_coords, H, W)
+    return 0
+
+@nb.cuda.jit
+def _generateViewshedRays(camRays, hits, vsrays, visibility_grid, H, W, vp):
+    i, j = nb.cuda.grid(2)
+    if i>=0 and i < H and j>=0 and j < W:
+        elevationOffset = vp[2]
+        targetElevation = vp[3]
+        dist = hits[i,j,0]  # distance to surface from camera
+        norm = make_float3(hits[i,j], 1)  # normal vector at intersection with surface
+        #inorm = norm
+        if (norm[2] < 0):  # if back hit, face forward
+            norm = invert(norm)
+        cameraRay = camRays[i,j]
+        rayOrigin = make_float3(cameraRay, 0)  # get the camera ray origin
+        rayDir    = make_float3(cameraRay, 4)  # get the camera ray direction
+        hitP = add(rayOrigin, mul(rayDir,dist))  # calculate intersection point
+        newOrigin = add(hitP, mul(norm, 1e-3))  # generate new ray origin, and a little offset to avoid self-intersection
+        newOrigin = add(newOrigin, float3(0, 0, targetElevation))  # move the new origin up by the selected by user targetElevation factor
+
+        w = int(vp[0])
+        h = int(vp[1])
+        viewshedRay = camRays[h,w]  # get the camera ray that was cast for the location of the viewshed origin
+        dist = hits[h,w,0]  # get the distance from the camera to the viewshed point
+        rayOrigin = make_float3(viewshedRay, 0)  # get the origin on the camera of the ray towards VP point
+        rayDir    = make_float3(viewshedRay, 4)  # get the direction from camera to VP point
+        hitP = add(rayOrigin, mul(rayDir,dist))  # calculate distance from camera to VP
+        viewshedPoint = add(hitP, float3(0, 0, elevationOffset))  # calculate the VP location on the surface and add the VP offset
+
+        newDir = diff(viewshedPoint, newOrigin)  # calculate vector from SurfaceHit to VP
+        length = math.sqrt(dot(newDir, newDir))  # calculate distance from surface to VP
+        newDir = mul(newDir, 1/length)  # normalize the direction (vector v)
+
+        cosine = dot(norm, newDir)  # cosine of the angle between n and v
+        theta = math.acos(cosine)  # Cosine angle in radians
+        theta = (180*theta)/math.pi  # Cosine angle in degrees
+
+        # prepare a viewshed ray to cast to determine visibility
+        vsray = vsrays[i,j]
+        vsray[0] = newOrigin[0]
+        vsray[1] = newOrigin[1]
+        vsray[2] = newOrigin[2]
+        vsray[3] = 0
+        vsray[4] = newDir[0]
+        vsray[5] = newDir[1]
+        vsray[6] = newDir[2]
+        vsray[7] = length
+
+        visibility_grid[i,j] = theta
+
+def generateViewshedRays(rays, hits, vsrays, visibility_grid, H, W, vp):
+    griddim, blockdim = calc_dims((H, W))
+    _generateViewshedRays[griddim, blockdim](rays, hits, vsrays, visibility_grid, H, W, vp)
+    return 0
+
+@nb.cuda.jit
+def _calcViewshed(hits, visibility_grid, H, W):
+    i, j = nb.cuda.grid(2)
+    if i>=0 and i < H and j>=0 and j < W:
+        dist = hits[i,j,0]
+        # We traced the viewshed rays and now hits contains the intersection data
+        # if dist > 0, then we were able to hit something along the length of the ray
+        # which means that the pixel we targeted is not directly visible from the view point
+        if dist>=0:
+            visibility_grid[i,j] = INVISIBLE
+
+def calcViewshed(hits, visibility_grid, H, W):
+    griddim, blockdim = calc_dims((H, W))
+    _calcViewshed[griddim, blockdim](hits, visibility_grid, H, W)
+    return 0
+
+def viewshed_rt(raster: xr.DataArray,
+             optix: RTX,
+             x: Union[int, float],
+             y: Union[int, float],
+             observer_elev: float = OBS_ELEV,
+             target_elev: float = TARGET_ELEV) -> xr.DataArray:
+
+    H,W = raster.shape
+
+    y_coords = raster.indexes.get('y').values
+    x_coords = raster.indexes.get('x').values
+
+    # validate x arg
+    if x < x_coords.min():
+        raise ValueError("x argument outside of raster x_range")
+    elif x > x_coords.max():
+        raise ValueError("x argument outside of raster x_range")
+
+    # validate y arg
+    if y < y_coords.min():
+        raise ValueError("y argument outside of raster y_range")
+    elif y > y_coords.max():
+        raise ValueError("y argument outside of raster y_range")
+
+    selection = raster.sel(x=[x], y=[y], method='nearest')
+    x = selection.x.values[0]
+    y = selection.y.values[0]
+
+    y_view = np.where(y_coords == y)[0][0]
+    x_view = np.where(x_coords == x)[0][0]
+
+    # Device buffers
+    d_rays = cupy.empty((H,W,8), np.float32)
+    d_hits = cupy.empty((H,W,4), np.float32)
+    d_visgrid = cupy.empty((H,W), np.float32)
+    d_vsrays  = cupy.empty((H,W,8), np.float32)
+
+    generatePrimaryRays(d_rays, x_coords, y_coords, H, W)
+    device = cupy.cuda.Device(0)
+    device.synchronize()
+    res = optix.trace(d_rays, d_hits, W*H)
+
+    generateViewshedRays(d_rays, d_hits, d_vsrays, d_visgrid, H, W, (x_view, y_view, observer_elev, target_elev))
+    device.synchronize()
+    res = optix.trace(d_vsrays, d_hits, W*H)
+
+    calcViewshed(d_hits, d_visgrid, H, W)
+
+    if isinstance(raster.data, np.ndarray):
+        visgrid = cupy.asnumpy(d_visgrid)
+    else:
+        visgrid = d_visgrid
+
+    view = xr.DataArray(visgrid,
+                    name="viewshed",
+                    coords=raster.coords,
+                    dims=raster.dims,
+                    attrs=raster.attrs)
+    return view
+
+def viewshed_gpu(raster: xr.DataArray,
+             x: Union[int, float],
+             y: Union[int, float],
+             observer_elev: float = OBS_ELEV,
+             target_elev: float = TARGET_ELEV) -> xr.DataArray:
+    if not isinstance(raster.data, cupy.ndarray):
+        raise ValueError("raster.data must be a cupy array")
+
+    H,W = raster.shape
+    optix = RTX()
+
+    datahash = np.uint64(hash(str(raster.data.get())))
+    optixhash = np.uint64(optix.getHash())
+    if (optixhash != datahash):
+        numTris = (H - 1) * (W - 1) * 2
+        verts = cupy.empty(H * W * 3, np.float32)
+        triangles = cupy.empty(numTris * 3, np.int32)
+
+        # Generate a mesh from the terrain (buffers are on the GPU, so generation happens also on GPU)
+        res = mesh_utils.triangulateTerrain(verts, triangles, raster)
+        if res:
+            raise ValueError("Failed to generate mesh from terrain. Error code:{}".format(res))
+        res = optix.build(datahash, verts, triangles)
+        if res:
+            raise ValueError("OptiX failed to build GAS with error code:{}".format(res))
+        #Clear some GPU memory that we no longer need
+        verts = None
+        triangles = None
+        cupy.get_default_memory_pool().free_all_blocks()
+
+    view = viewshed_rt(raster, optix, x, y, observer_elev, target_elev)
+    return view
\ No newline at end of file