Skip to content
Merged
35 changes: 28 additions & 7 deletions cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);

#ifdef CUAMPCOR_DEBUG
r_maxval->outputToFile("r_maxval", stream);
r_maxval->outputToFile("r_corrBatchRawMaxVal", stream);
r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream);
i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid", stream);
r_corrBatchSum->outputToFile("r_corrBatchSum", stream);
Expand Down Expand Up @@ -107,9 +107,17 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
maxLocShift->outputToFile("i_maxLocShift", stream);
#endif

// deramp reference
cuDeramp(param->derampMethod, c_referenceBatchRaw, param->derampAxis, stream);

#ifdef CUAMPCOR_DEBUG
// dump the deramped reference image(s)
c_referenceBatchRaw->outputToFile("c_referenceBatchRawDeramped", stream);
#endif

// oversample reference
// (deramping included in oversampler)
referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod);
referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled);

// take amplitudes
cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, stream);

Expand All @@ -127,15 +135,28 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream);
#endif

// extract secondary and oversample
// extract secondary images around the max location with a smaller search range
cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit, stream);
secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod);

#ifdef CUAMPCOR_DEBUG
// dump the extracted raw secondary image
c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInRaw", stream);
#endif

// deramp secondary
cuDeramp(param->derampMethod, c_secondaryBatchZoomIn, param->derampAxis, stream);

#ifdef CUAMPCOR_DEBUG
// dump the deramped secondary image(s)
c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInDeramped", stream);
#endif

// oversample secondary
secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled);
// take amplitudes
cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream);

#ifdef CUAMPCOR_DEBUG
// dump the extracted raw secondary image
c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn", stream);
// dump the oversampled secondary image(s)
c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream);
r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream);
Expand Down
3 changes: 2 additions & 1 deletion cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ cuAmpcorParameter::cuAmpcorParameter()
algorithm = 0; //0 freq; 1 time
deviceID = 0;
nStreams = 1;
derampMethod = 1;
derampMethod = 1; // average deramp
derampAxis = 2; // both directions

windowSizeWidthRaw = 64;
windowSizeHeightRaw = 64;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class cuAmpcorParameter{
int deviceID; ///< Targeted GPU device ID: use -1 to auto select
int nStreams; ///< Number of streams to asynchonize data transfers and compute kernels
int derampMethod; ///< Method for deramping 0=None, 1=average
int derampAxis; ///< Axis for deramping 0=down (azimuth) 1=across (range), 2=both axes

// chip or window size for raw data
int windowSizeHeightRaw; ///< Template window height (original size)
Expand Down
7 changes: 5 additions & 2 deletions cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,11 @@ void cuArraysC2R(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t
void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);

// cuDeramp.cu: deramping phase
// `cuDeramp` calls a deramp implementation (or does nothing) based on the value of `method`:
// `method=1` for cuLinearDeramp, any other value for no-op
// `cuLinearDeramp` estimates the phase gradient over the chip and removes it.
// `axis` selects the deramp direction(s): 0=down (azimuth), 1=across (range), 2=both.
void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream);
void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream);

// cuArraysPadding.cu: various utilities for oversampling padding
void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
Expand Down
155 changes: 86 additions & 69 deletions cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
* Method 0 or else: skip deramping
*
*/
#include "cuArrays.h"
#include "float2.h"

#include "cuArrays.h"
#include "float2.h"
#include <cfloat>
#include "cudaError.h"
#include "cudaUtil.h"
Expand All @@ -27,14 +27,14 @@
// cuda does not have a good support on volatile vector struct, e.g. float2
// have to use regular float type for shared memory (volatile) data
// the following methods are defined to operate float2/complex objects through float
// Store complex value x into volatile shared memory as two separate floats:
// real part at s[i], imaginary part at s[i+block].
inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }

// Load complex value x back from volatile shared memory:
// real part from s[i], imaginary part from s[i+block].
inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }


// Accumulate s[i] += s[i+j] for both the real plane and the imaginary plane
// (offset by `block`) of the packed complex data in shared memory.
inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block)
{ s[i] += s[i+j]; s[i+block] += s[i+j+block];}


// Block-wide sum reduction of one complex partial sum per thread.
// On return every thread's `sum` holds the total over the block.
// Preconditions: blockDim.x == nthreads (template parameter, power of two,
// <= 1024) and `shmem` holds at least 2*nthreads floats.
// NOTE(review): the last 6 steps use the legacy volatile-shared-memory idiom
// with no __syncwarp() between warp-level adds; this assumes implicit warp
// synchrony -- confirm behavior on Volta+ (independent thread scheduling).
__device__ void complexSumReduceBlock(float2& sum, volatile float *shmem)
{
    const int tid = threadIdx.x;
    copyToShared(shmem, tid, sum, nthreads);
    __syncthreads();

    // tree reduction: halve the number of active threads each step
    if (nthreads >=1024) { if (tid < 512) { addInShared(shmem, tid, 512, nthreads); } __syncthreads(); }
    if (nthreads >= 512) { if (tid < 256) { addInShared(shmem, tid, 256, nthreads); } __syncthreads(); }
    if (nthreads >= 256) { if (tid < 128) { addInShared(shmem, tid, 128, nthreads); } __syncthreads(); }
    if (nthreads >= 128) { if (tid < 64)  { addInShared(shmem, tid, 64,  nthreads); } __syncthreads(); }
    if (tid < 32)
    {
        // final warp: no barriers, relies on volatile shared memory
        addInShared(shmem, tid, 32, nthreads);
        addInShared(shmem, tid, 16, nthreads);
        addInShared(shmem, tid, 8, nthreads);
        addInShared(shmem, tid, 4, nthreads);
        addInShared(shmem, tid, 2, nthreads);
        addInShared(shmem, tid, 1, nthreads);
    }
    __syncthreads();
    // broadcast the total (at index 0) back to every thread
    copyFromShared(sum, shmem, 0, nthreads);
}

// cuda kernel for cuLinearDeramp
// Launch: one thread block per image in the batch; blockDim.x == nthreads.
// Estimates the average linear phase ramp of each chip from conjugate
// products of adjacent pixels, then removes the ramp in place.
// axis: 0 = down (azimuth) only, 1 = across (range) only, 2 = both.
// NOTE(review): normCoef is currently unused (normalization commented out);
// atan2 of the un-normalized sum gives the same angle.
template<const int nthreads>
__global__ void cuLinearDeramp_kernel(float2 *images, const int imageNX, int const imageNY,
    const int imageSize, const int nImages, const float normCoef, const int axis)
{
    __shared__ float shmem[2*nthreads];
    int pixelIdx, pixelIdxX, pixelIdxY;

    const int bid = blockIdx.x;
    if(bid >= nImages) return;
    float2 *image = images + bid*imageSize;
    const int tid = threadIdx.x;

    // average phase ramp along row/range direction
    double phaseY = 0.0;
    if (axis != 0)
    {
        float2 phaseDiffY = make_float2(0.0f, 0.0f);
        for (int i = tid; i < imageSize; i += nthreads) {
            pixelIdxY = i % imageNY;
            if(pixelIdxY < imageNY -1) {
                pixelIdx = i;
                // conjugate product of horizontally adjacent pixels
                float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]);
                phaseDiffY += cprod;
            }
        }
        complexSumReduceBlock<nthreads>(phaseDiffY, shmem);
        //phaseDiffY *= normCoef;
        phaseY = atan2(phaseDiffY.y, phaseDiffY.x);
    }

    // average phase ramp along column/azimuth direction
    double phaseX = 0.0;
    if (axis != 1)
    {
        float2 phaseDiffX = make_float2(0.0f, 0.0f);
        for (int i = tid; i < imageSize; i += nthreads) {
            pixelIdxX = i / imageNY;
            if(pixelIdxX < imageNX -1) {
                pixelIdx = i;
                // conjugate product of vertically adjacent pixels (stride = row width)
                float2 cprod = complexMulConj(image[i], image[i+imageNY]);
                phaseDiffX += cprod;
            }
        }
        complexSumReduceBlock<nthreads>(phaseDiffX, shmem);
        //phaseDiffX *= normCoef;
        phaseX = atan2(phaseDiffX.y, phaseDiffX.x);
    }

    // deramp with the estimated phase ramps
    for (int i = tid; i < imageSize; i += nthreads)
    {
        pixelIdxX = i / imageNY;
        pixelIdxY = i % imageNY;
        // use double to improve accuracy
        double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
        double phase_sin, phase_cos;
        sincos(phase, &phase_sin, &phase_cos);
        // complex multiply image[i] by exp(i*phase)
        image[i] = make_float2(
            image[i].x*phase_cos - image[i].y*phase_sin,
            image[i].x*phase_sin + image[i].y*phase_cos);
    }
}

/**
Expand All @@ -120,38 +135,40 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co
* @param[in,out] images input/output complex signals
* @param[in] stream cuda stream
*/
/**
 * Remove the average linear phase ramp from a batch of complex chips in place.
 * @param[in,out] images batched complex images (one CUDA block per image)
 * @param[in] axis deramp direction: 0=down (azimuth), 1=across (range), 2=both
 * @param[in] stream cuda stream
 * @throws std::invalid_argument if axis is not 0, 1, or 2
 */
void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream)
{
    if ((axis < 0) or (axis > 2)) {
        throw std::invalid_argument("deramp axis must be 0, 1, or 2");
    }
    const dim3 grid(images->count);
    const int imageSize = images->width*images->height;
    const float invSize = 1.0f/imageSize;

    // choose the block size (template parameter) from the chip size
    if(imageSize <=64) {
        cuLinearDeramp_kernel<64> <<<grid, 64, 0, stream>>>
            (images->devData, images->height, images->width,
             imageSize, images->count, invSize, axis); }
    else if(imageSize <=128) {
        cuLinearDeramp_kernel<128> <<<grid, 128, 0, stream>>>
            (images->devData, images->height, images->width,
             imageSize, images->count, invSize, axis); }
    else if(imageSize <=256) {
        cuLinearDeramp_kernel<256> <<<grid, 256, 0, stream>>>
            (images->devData, images->height, images->width,
             imageSize, images->count, invSize, axis); }
    else {
        cuLinearDeramp_kernel<512> <<<grid, 512, 0, stream>>>
            (images->devData, images->height, images->width,
             imageSize, images->count, invSize, axis); }
    getLastCudaError("cuLinearDeramp kernel error\n");
}
void cuDeramp(int method, cuArrays<float2> *images, cudaStream_t stream)

void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream)
{
switch(method) {
case 1:
cuDerampMethod1(images, stream);
cuLinearDeramp(images, axis, stream);
break;
default:
break;
Expand Down
Loading