diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu index 8f35fa075..5825ad37d 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu @@ -76,7 +76,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream); #ifdef CUAMPCOR_DEBUG - r_maxval->outputToFile("r_maxval", stream); + r_maxval->outputToFile("r_corrBatchRawMaxVal", stream); r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream); i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid", stream); r_corrBatchSum->outputToFile("r_corrBatchSum", stream); @@ -107,9 +107,17 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) maxLocShift->outputToFile("i_maxLocShift", stream); #endif + // deramp reference + cuDeramp(param->derampMethod, c_referenceBatchRaw, param->derampAxis, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the deramped reference image(s) + c_referenceBatchRaw->outputToFile("c_referenceBatchRawDeramped", stream); +#endif + // oversample reference - // (deramping included in oversampler) - referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod); + referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled); + // take amplitudes cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, stream); @@ -127,15 +135,28 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream); #endif - // extract secondary and oversample + // extract secondary images around the max location with a smaller search range cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit, stream); - secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod); + +#ifdef CUAMPCOR_DEBUG + // dump the extracted raw secondary image + c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInRaw", stream); +#endif + + // deramp secondary + cuDeramp(param->derampMethod, c_secondaryBatchZoomIn, param->derampAxis, stream); + +#ifdef CUAMPCOR_DEBUG + // dump the deramped secondary image(s) + c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInDeramped", stream); +#endif + + // oversample secondary + secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled); // take amplitudes cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream); #ifdef CUAMPCOR_DEBUG - // dump the extracted raw secondary image - c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn", stream); // dump the oversampled secondary image(s) c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream); r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu index 9acc6b294..6e058df0c 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu @@ -22,7 +22,8 @@ cuAmpcorParameter::cuAmpcorParameter() algorithm = 0; //0 freq; 1 time deviceID = 0; nStreams = 1; - derampMethod = 1; + derampMethod = 1; // average deramp + derampAxis = 2; // both directions windowSizeWidthRaw = 64; windowSizeHeightRaw = 64; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h index 357f36ba9..6c5a01b49 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h @@ -44,6 +44,7 @@ class cuAmpcorParameter{ int deviceID; ///< Targeted GPU device ID: use -1 to auto select int nStreams; ///< Number of streams to asynchonize data transfers and compute kernels int derampMethod; ///< Method for deramping 0=None, 1=average + int derampAxis; ///< Axis for deramping 0=down (azimuth) 1=across (range), 2=both axes // chip or window size for raw data int windowSizeHeightRaw; ///< Template window height (original size) diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h index d56a01c67..3339d82b0 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h @@ -46,8 +46,11 @@ void cuArraysC2R(cuArrays *image1, cuArrays *image2, cudaStream_t void cuArraysAbs(cuArrays *image1, cuArrays *image2, cudaStream_t stream); // cuDeramp.cu: deramping phase -void cuDeramp(int method, cuArrays *images, cudaStream_t stream); -void cuDerampMethod1(cuArrays *images, cudaStream_t stream); +// `cuDeramp` calls a deramp implementation (or does nothing) based on the value of `method`: +// `method=1` for cuLinearDeramp, any other value for no-op +// `cuLinearDeramp` Estimates the phase gradient over the chip and removes it. +void cuDeramp(const int method, cuArrays *images, const int axis, cudaStream_t stream); +void cuLinearDeramp(cuArrays *images, const int axis, cudaStream_t stream); // cuArraysPadding.cu: various utilities for oversampling padding void cuArraysPadding(cuArrays *image1, cuArrays *image2, cudaStream_t stream); diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu index cd46e7893..81d5d2733 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu @@ -11,9 +11,9 @@ * Method 0 or else: skip deramping * */ - -#include "cuArrays.h" -#include "float2.h" + +#include "cuArrays.h" +#include "float2.h" #include #include "cudaError.h" #include "cudaUtil.h" @@ -27,14 +27,14 @@ // cuda does not have a good support on volatile vector struct, e.g. float2 // have to use regular float type for shared memory (volatile) data // the following methods are defined to operate float2/complex objects through float -inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block) +inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block) { s[i] = x.x; s[i+block] = x.y; } -inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block) +inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block) { x.x = s[i]; x.y = s[i+block]; } -inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block) +inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block) { s[i] += s[i+j]; s[i+block] += s[i+j+block];} @@ -45,72 +45,87 @@ __device__ void complexSumReduceBlock(float2& sum, volatile float *shmem) const int tid = threadIdx.x; copyToShared(shmem, tid, sum, nthreads); __syncthreads(); - + if (nthreads >=1024) { if (tid < 512) { addInShared(shmem, tid, 512, nthreads); } __syncthreads(); } if (nthreads >= 512) { if (tid < 256) { addInShared(shmem, tid, 256, nthreads); } __syncthreads(); } if (nthreads >= 256) { if (tid < 128) { addInShared(shmem, tid, 128, nthreads); } __syncthreads(); } if (nthreads >= 128) { if (tid < 64) { addInShared(shmem, tid, 64, nthreads); } __syncthreads(); } if (tid < 32) - { + { addInShared(shmem, tid, 32, nthreads); addInShared(shmem, tid, 16, nthreads); addInShared(shmem, tid, 8, nthreads); addInShared(shmem, tid, 4, nthreads); addInShared(shmem, tid, 2, nthreads); - addInShared(shmem, tid, 1, nthreads); + addInShared(shmem, tid, 1, nthreads); } __syncthreads(); copyFromShared(sum, shmem, 0, nthreads); } -// cuda kernel for cuDerampMethod1 +// cuda kernel for cuLinearDeramp with Method 1 template -__global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY, - const int imageSize, const int nImages, const float normCoef) +__global__ void cuLinearDeramp_kernel(float2 *images, const int imageNX, int const imageNY, + const int imageSize, const int nImages, const float normCoef, const int axis) { __shared__ float shmem[2*nthreads]; int pixelIdx, pixelIdxX, pixelIdxY; - - const int bid = blockIdx.x; + + const int bid = blockIdx.x; if(bid >= nImages) return; float2 *image = images+ bid*imageSize; - const int tid = threadIdx.x; - float2 phaseDiffY = make_float2(0.0f, 0.0f); - for (int i = tid; i < imageSize; i += nthreads) { - pixelIdxY = i % imageNY; - if(pixelIdxY < imageNY -1) { - pixelIdx = i; - float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]); - phaseDiffY += cprod; - } - } - complexSumReduceBlock(phaseDiffY, shmem); - //phaseDiffY *= normCoef; - float phaseY=atan2f(phaseDiffY.y, phaseDiffY.x); - - float2 phaseDiffX = make_float2(0.0f, 0.0f); - for (int i = tid; i < imageSize; i += nthreads) { - pixelIdxX = i / imageNY; - if(pixelIdxX < imageNX -1) { - pixelIdx = i; - float2 cprod = complexMulConj(image[i], image[i+imageNY]); - phaseDiffX += cprod; + const int tid = threadIdx.x; + + // average phase ramp along row/range direction + double phaseY = 0.0; + if (axis != 0) + { + float2 phaseDiffY = make_float2(0.0f, 0.0f); + for (int i = tid; i < imageSize; i += nthreads) { + pixelIdxY = i % imageNY; + if(pixelIdxY < imageNY -1) { + pixelIdx = i; + float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]); + phaseDiffY += cprod; + } + } + complexSumReduceBlock(phaseDiffY, shmem); + //phaseDiffY *= normCoef; + phaseY=atan2(phaseDiffY.y, phaseDiffY.x); + } + + // average phase ramp along column/azimuth direction + double phaseX = 0.0; + if (axis != 1) + { + float2 phaseDiffX = make_float2(0.0f, 0.0f); + for (int i = tid; i < imageSize; i += nthreads) { + pixelIdxX = i / imageNY; + if(pixelIdxX < imageNX -1) { + pixelIdx = i; + float2 cprod = complexMulConj(image[i], image[i+imageNY]); + phaseDiffX += cprod; + } } - } - - complexSumReduceBlock(phaseDiffX, shmem); - - //phaseDiffX *= normCoef; - float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON - + + complexSumReduceBlock(phaseDiffX, shmem); + + //phaseDiffX *= normCoef; + phaseX = atan2(phaseDiffX.y, phaseDiffX.x); //+FLT_EPSILON + } + // deramp with the estimated phase ramps for (int i = tid; i < imageSize; i += nthreads) - { - pixelIdxX = i%imageNY; - pixelIdxY = i/imageNY; - float phase = pixelIdxX*phaseX + pixelIdxY*phaseY; - float2 phase_factor = make_float2(cosf(phase), sinf(phase)); - image[i] *= phase_factor; - } + { + pixelIdxX = i / imageNY; + pixelIdxY = i % imageNY; + // use double to improve accuracy + double phase = pixelIdxX*phaseX + pixelIdxY*phaseY; + double phase_sin, phase_cos; + sincos(phase, &phase_sin, &phase_cos); + image[i] = make_float2( + image[i].x*phase_cos - image[i].y*phase_sin, + image[i].x*phase_sin + image[i].y*phase_cos); + } } /** @@ -120,38 +135,40 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co * @param[in,out] images input/output complex signals * @param[in] stream cuda stream */ -void cuDerampMethod1(cuArrays *images, cudaStream_t stream) +void cuLinearDeramp(cuArrays *images, const int axis, cudaStream_t stream) { - + if ((axis < 0) or (axis > 2)) { + throw std::invalid_argument("deramp axis must be 0, 1, or 2"); + } const dim3 grid(images->count); const int imageSize = images->width*images->height; const float invSize = 1.0f/imageSize; if(imageSize <=64) { - cuDerampMethod1_kernel<64> <<>> - (images->devData, images->height, images->width, - imageSize, images->count, invSize); } - else if(imageSize <=128) { - cuDerampMethod1_kernel<128> <<>> - (images->devData, images->height, images->width, - imageSize, images->count, invSize); } - else if(imageSize <=256) { - cuDerampMethod1_kernel<256> <<>> - (images->devData, images->height, images->width, - imageSize, images->count, invSize); } + cuLinearDeramp_kernel<64> <<>> + (images->devData, images->height, images->width, + imageSize, images->count, invSize, axis); } + else if(imageSize <=128) { + cuLinearDeramp_kernel<128> <<>> + (images->devData, images->height, images->width, + imageSize, images->count, invSize, axis); } + else if(imageSize <=256) { + cuLinearDeramp_kernel<256> <<>> + (images->devData, images->height, images->width, + imageSize, images->count, invSize, axis); } else { - cuDerampMethod1_kernel<512> <<>> - (images->devData, images->height, images->width, - imageSize, images->count, invSize); } - getLastCudaError("cuDerampMethod1 kernel error\n"); + cuLinearDeramp_kernel<512> <<>> + (images->devData, images->height, images->width, + imageSize, images->count, invSize, axis); } + getLastCudaError("cuLinearDeramp kernel error\n"); } - -void cuDeramp(int method, cuArrays *images, cudaStream_t stream) + +void cuDeramp(const int method, cuArrays *images, const int axis, cudaStream_t stream) { switch(method) { case 1: - cuDerampMethod1(images, stream); + cuLinearDeramp(images, axis, stream); break; default: break; diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu index 1b6ab6267..d60399023 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu @@ -1,4 +1,4 @@ -/* +/* * @file cuOverSampler.cu * @brief Implementations of cuOverSamplerR2R (C2C) class */ @@ -22,17 +22,17 @@ */ cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_) { - + int inNXp2 = inNX; int inNYp2 = inNY; int outNXp2 = outNX; int outNYp2 = outNY; - + /* if expanded to 2^n int inNXp2 = nextpower2(inNX); int inNYp2 = nextpower2(inNY); int outNXp2 = inNXp2*outNX/inNX; - int outNYp2 = inNYp2*outNY/inNY; + int outNYp2 = inNYp2*outNY/inNY; */ // set up work arrays @@ -67,25 +67,23 @@ void cuOverSamplerC2C::setStream(cudaStream_t stream_) * Execute fft oversampling * @param[in] imagesIn input batch of images * @param[out] imagesOut output batch of images - * @param[in] method phase deramping method */ -void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut, int method) -{ - cuDeramp(method, imagesIn, stream); +void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut) +{ cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_INVERSE )); cuArraysPaddingMany(workIn, workOut, stream); cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_FORWARD)); } /// destructor -cuOverSamplerC2C::~cuOverSamplerC2C() +cuOverSamplerC2C::~cuOverSamplerC2C() { // destroy fft handles cufft_Error(cufftDestroy(forwardPlan)); cufft_Error(cufftDestroy(backwardPlan)); // deallocate work arrays delete(workIn); - delete(workOut); + delete(workOut); } // end of cuOverSamplerC2C @@ -99,7 +97,7 @@ cuOverSamplerC2C::~cuOverSamplerC2C() */ cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream) { - + int inNXp2 = inNX; int inNYp2 = inNY; int outNXp2 = outNX; @@ -144,14 +142,14 @@ void cuOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *image cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_INVERSE)); cuArraysPaddingMany(workSizeIn, workSizeOut, stream); cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData,CUFFT_FORWARD )); - cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream); + cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream); } /// destructor -cuOverSamplerR2R::~cuOverSamplerR2R() +cuOverSamplerR2R::~cuOverSamplerR2R() { cufft_Error(cufftDestroy(forwardPlan)); - cufft_Error(cufftDestroy(backwardPlan)); + cufft_Error(cufftDestroy(backwardPlan)); workSizeIn->deallocate(); workSizeOut->deallocate(); } diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h index 9ddce96b2..a3154b613 100644 --- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h +++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h @@ -31,7 +31,7 @@ class cuOverSamplerC2C // set cuda stream void setStream(cudaStream_t stream_); // execute oversampling - void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); + void execute(cuArrays *imagesIn, cuArrays *imagesOut); // destructor ~cuOverSamplerC2C(); }; diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp index 7e8b6fb98..8f5fb65ad 100644 --- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp +++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp @@ -85,7 +85,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount); #ifdef CUAMPCOR_DEBUG - r_maxval->outputToFile("r_maxval"); + r_maxval->outputToFile("r_corrBatchRawMaxVal"); r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn"); i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid"); r_corrBatchSum->outputToFile("r_corrBatchSum"); @@ -115,9 +115,16 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) maxLocShift->outputToFile("i_maxLocShift"); #endif + // deramp reference + cuDeramp(param->derampMethod, c_referenceBatchRaw, param->derampAxis); + +#ifdef CUAMPCOR_DEBUG + // dump the deramped reference image(s) + c_referenceBatchRaw->outputToFile("c_referenceBatchRawDeramped"); +#endif + // oversample reference - // (deramping included in oversampler) - referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod); + referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled); // take amplitudes cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled); @@ -135,15 +142,28 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_) r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean"); #endif - // extract secondary and oversample + // extract secondary for smaller search window cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit); - secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod); - // take amplitudes - cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled); #ifdef CUAMPCOR_DEBUG // dump the extracted raw secondary image c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn"); +#endif + + // deramp secondary + cuDeramp(param->derampMethod, c_secondaryBatchZoomIn, param->derampAxis); +#ifdef CUAMPCOR_DEBUG + // dump the deramped secondary image(s) + c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInDeramped"); +#endif + + // oversample secondary + secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled); + + // take amplitudes + cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled); + +#ifdef CUAMPCOR_DEBUG // dump the oversampled secondary image(s) c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled"); r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled"); diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp index e10e38fc1..b82313fc9 100644 --- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp +++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp @@ -25,6 +25,7 @@ cuAmpcorParameter::cuAmpcorParameter() deviceID = 0; nStreams = 1; derampMethod = 1; + derampAxis = 2; // both directions windowSizeWidthRaw = 64; windowSizeHeightRaw = 64; diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h index 5c9258d04..ce6d095bd 100644 --- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h +++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h @@ -46,6 +46,7 @@ class cuAmpcorParameter{ int deviceID; ///< Targeted GPU device ID: use -1 to auto select int nStreams; ///< Number of streams to asynchonize data transfers and compute kernels int derampMethod; ///< Method for deramping 0=None, 1=average + int derampAxis; ///< Axis for deramping 0=down (azimuth) 1=across (range), 2=both axes // chip or window size for raw data int windowSizeHeightRaw; ///< Template window height (original size) diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h index be318fc9a..7a1b6d373 100644 --- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h +++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h @@ -42,8 +42,8 @@ void cuArraysCopyPadded(cuArrays *imageIn, cuArrays *imageOut); void cuArraysAbs(cuArrays *image1, cuArrays *image2); // cuDeramp.cu: deramping phase -void cuDeramp(int method, cuArrays *images); -void cuDerampMethod1(cuArrays *images); +void cuDeramp(const int method, cuArrays *images, const int axis); +void cuLinearDeramp(cuArrays *images, const int axis); // cuArraysPadding.cu: various utilities for oversampling padding void cuArraysPaddingMany(cuArrays *image1, cuArrays *image2); diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp index a1d015395..6924818a2 100644 --- a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp +++ b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp @@ -23,40 +23,51 @@ namespace isce3::matchtemplate::pycuampcor { -// kernel for cuDerampMethod1 -static void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY, - const int imageSize, const int nImages, const float normCoef) +// kernel for linear deramping +static void cuLinearDeramp_kernel(float2 *images, const int imageNX, const int imageNY, + const int imageSize, const int nImages, const float normCoef, const int axis) { for (int k = 0; k < nImages; k++) { float2* image = images + k * imageSize; - double2 phaseDiffY = make_double2(0.0, 0.0); - for (int j = 0; j < imageNX; j++) { - for (int i = 0; i < imageNY - 1; i++) { - const int pixelIdx = j * imageNY + i; - float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+1]); - phaseDiffY += cprod; + double phaseY = 0.0; + if(axis != 0) + { + double2 phaseDiffY = make_double2(0.0, 0.0); + for (int j = 0; j < imageNX; j++) { + for (int i = 0; i < imageNY - 1; i++) { + const int pixelIdx = j * imageNY + i; + float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+1]); + phaseDiffY += cprod; + } } + phaseY = atan2(phaseDiffY.y, phaseDiffY.x); } - double2 phaseDiffX = make_double2(0.0, 0.0); - for (int j = 0; j < imageNX - 1; j++) { - for (int i = 0; i < imageNY; i++) { - const int pixelIdx = j * imageNY + i; - float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+imageNY]); - phaseDiffX += cprod; + double phaseX = 0.0; + if(axis != 1) + { + double2 phaseDiffX = make_double2(0.0, 0.0); + for (int j = 0; j < imageNX - 1; j++) { + for (int i = 0; i < imageNY; i++) { + const int pixelIdx = j * imageNY + i; + float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+imageNY]); + phaseDiffX += cprod; + } } + phaseX = atan2(phaseDiffX.y, phaseDiffX.x); } - float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x); - float phaseY = atan2f(phaseDiffY.y, phaseDiffY.x); for (int i = 0; i < imageSize; i++) { - const int pixelIdxX = i%imageNY; - const int pixelIdxY = i/imageNY; - float phase = pixelIdxX*phaseX + pixelIdxY*phaseY; - float2 phase_factor = make_float2(cosf(phase), sinf(phase)); - image[i] *= phase_factor; + const int pixelIdxX = i / imageNY; + const int pixelIdxY = i % imageNY; + double phase = pixelIdxX*phaseX + pixelIdxY*phaseY; + double phase_cos = cos(phase); + double phase_sin = sin(phase); + image[i] = make_float2( + image[i].x*phase_cos - image[i].y*phase_sin, + image[i].x*phase_sin + image[i].y*phase_cos); } } } @@ -67,20 +78,20 @@ static void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const * and the average phase shift is obtained as atan(\sum imag / \sum real). * @param[inout] images input/output complex signals */ -void cuDerampMethod1(cuArrays *images) +void cuLinearDeramp(cuArrays *images, const int axis) { const int imageSize = images->width*images->height; const float invSize = 1.0f/imageSize; - cuDerampMethod1_kernel(images->devData, images->height, images->width, - imageSize, images->count, invSize); + cuLinearDeramp_kernel(images->devData, images->height, images->width, + imageSize, images->count, invSize, axis); } -void cuDeramp(int method, cuArrays *images) +void cuDeramp(const int method, cuArrays *images, const int axis) { switch(method) { case 1: - cuDerampMethod1(images); + cuLinearDeramp(images, axis); break; default: break; diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp index 4b84e6aa6..e94de10ff 100644 --- a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp +++ b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp @@ -1,4 +1,4 @@ -/* +/* * @file cuOverSampler.cu * @brief Implementations of cuOverSamplerR2R (C2C) class */ @@ -23,17 +23,17 @@ cuOverSamplerC2C::cuOverSamplerC2C( cuArrays *imagesIn, cuArrays *imagesOut, int inNX, int inNY, int outNX, int outNY, int nImages) { - + int inNXp2 = inNX; int inNYp2 = inNY; int outNXp2 = outNX; int outNYp2 = outNY; - + /* if expanded to 2^n int inNXp2 = nextpower2(inNX); int inNYp2 = nextpower2(inNY); int outNXp2 = inNXp2*outNX/inNX; - int outNYp2 = inNYp2*outNY/inNY; + int outNYp2 = inNYp2*outNY/inNY; */ // set up work arrays @@ -70,25 +70,23 @@ cuOverSamplerC2C::cuOverSamplerC2C( * Execute fft oversampling * @param[in] imagesIn input batch of images * @param[out] imagesOut output batch of images - * @param[in] method phase deramping method */ -void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut, int method) -{ - cuDeramp(method, imagesIn); +void cuOverSamplerC2C::execute(cuArrays *imagesIn, cuArrays *imagesOut) +{ fftwf_execute(forwardPlan); cuArraysPaddingMany(workIn, workOut); fftwf_execute(backwardPlan); } /// destructor -cuOverSamplerC2C::~cuOverSamplerC2C() +cuOverSamplerC2C::~cuOverSamplerC2C() { // destroy fft handles fftwf_destroy_plan(forwardPlan); fftwf_destroy_plan(backwardPlan); // deallocate work arrays delete(workIn); - delete(workOut); + delete(workOut); } // end of cuOverSamplerC2C @@ -101,7 +99,7 @@ cuOverSamplerC2C::~cuOverSamplerC2C() */ cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages) { - + int inNXp2 = inNX; int inNYp2 = inNY; int outNXp2 = outNX; @@ -151,11 +149,11 @@ void cuOverSamplerR2R::execute(cuArrays *imagesIn, cuArrays *image fftwf_execute(forwardPlan); cuArraysPaddingMany(workSizeIn, workSizeOut); fftwf_execute(backwardPlan); - cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0)); + cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0)); } /// destructor -cuOverSamplerR2R::~cuOverSamplerR2R() +cuOverSamplerR2R::~cuOverSamplerR2R() { fftwf_destroy_plan(forwardPlan); fftwf_destroy_plan(backwardPlan); diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h index 2b79a67b2..6a57b9509 100644 --- a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h +++ b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h @@ -33,7 +33,7 @@ class cuOverSamplerC2C cuOverSamplerC2C(cuArrays *imagesIn, cuArrays *imagesOut, int inNX, int inNY, int outNX, int outNY, int nImages); // execute oversampling - void execute(cuArrays *imagesIn, cuArrays *imagesOut, int deramp_method=0); + void execute(cuArrays *imagesIn, cuArrays *imagesOut); // destructor ~cuOverSamplerC2C(); }; diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp index 7e884ba45..036376c56 100644 --- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp @@ -32,6 +32,7 @@ void addbinding_pycuampcor(pybind11::module& m) .DEF_PARAM(int, deviceID) .DEF_PARAM(int, nStreams) .DEF_PARAM(int, derampMethod) + .DEF_PARAM(int, derampAxis) .DEF_PARAM(str, referenceImageName) .DEF_PARAM(int, referenceImageHeight) diff --git a/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp index d4ff33b32..f783ae709 100644 --- a/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp +++ b/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp @@ -32,6 +32,7 @@ void addbinding_pycuampcor_cpu(pybind11::module& m) .DEF_PARAM(int, deviceID) .DEF_PARAM(int, nStreams) .DEF_PARAM(int, derampMethod) + .DEF_PARAM(int, derampAxis) .DEF_PARAM(str, referenceImageName) .DEF_PARAM(int, referenceImageHeight) diff --git a/python/packages/nisar/workflows/dense_offsets.py b/python/packages/nisar/workflows/dense_offsets.py index e897b19cd..4a1feddd3 100644 --- a/python/packages/nisar/workflows/dense_offsets.py +++ b/python/packages/nisar/workflows/dense_offsets.py @@ -202,7 +202,21 @@ def set_optional_attributes(ampcor_obj, cfg, length, width): if cfg['deramping_method'] is not None: deramp = cfg['deramping_method'] - ampcor_obj.derampMethod = 0 if deramp == "magnitude" else 1 + if deramp == "magnitude": + ampcor_obj.derampMethod = 0 + elif deramp == "complex": + ampcor_obj.derampMethod = 1 + else: # skip deramping + ampcor_obj.derampMethod = 2 + + if cfg['deramping_axis'] is not None: + deramp_axis = cfg['deramping_axis'] + if deramp_axis == "azimuth": + ampcor_obj.derampAxis = 0 + elif deramp_axis == "range": + ampcor_obj.derampAxis = 1 + else: # both directions + ampcor_obj.derampAxis = 2 if cfg['correlation_statistics_zoom'] is not None: ampcor_obj.corrStatWindowSize = cfg['correlation_statistics_zoom'] diff --git a/python/packages/nisar/workflows/offsets_product.py b/python/packages/nisar/workflows/offsets_product.py index ed5ef8358..5b29ac94e 100644 --- a/python/packages/nisar/workflows/offsets_product.py +++ b/python/packages/nisar/workflows/offsets_product.py @@ -256,8 +256,27 @@ def set_ampcor_params(cfg, ampcor_obj): ampcor_obj.algorithm = 0 if cfg['cross_correlation_domain'] == \ 'frequency' else 1 ampcor_obj.rawDataOversamplingFactor = cfg['slc_oversampling_factor'] - ampcor_obj.derampMethod = 0 if cfg['deramping_method'] == \ - 'magnitude' else 1 + + if cfg['deramping_method'] is not None: + deramp = cfg['deramping_method'] + if deramp == "magnitude": + ampcor_obj.derampMethod = 0 + elif deramp == "complex": + ampcor_obj.derampMethod = 1 + else: # skip deramping + ampcor_obj.derampMethod = 2 + + if cfg['deramping_axis'] is not None: + deramp_axis = cfg['deramping_axis'] + if deramp_axis == "azimuth": + ampcor_obj.derampAxis = 0 + elif deramp_axis == "range": + ampcor_obj.derampAxis = 1 + elif deramp_axis == "both": + ampcor_obj.derampAxis = 2 + else: + raise ValueError(f"invalid {deramp_axis=}") + ampcor_obj.corrStatWindowSize = cfg['correlation_statistics_zoom'] ampcor_obj.corrSurfaceZoomInWindow = cfg['correlation_surface_zoom'] ampcor_obj.corrSurfaceOverSamplingFactor = cfg[ diff --git a/share/nisar/defaults/insar.yaml b/share/nisar/defaults/insar.yaml index 9280e7604..1072ccf3e 100644 --- a/share/nisar/defaults/insar.yaml +++ b/share/nisar/defaults/insar.yaml @@ -408,8 +408,10 @@ runconfig: # Anti-aliasing oversampling factor to apply to reference/secondary SLC # prior to cross-correlation computation slc_oversampling_factor: 2 - # Deramp data prior to FFT: magnitude or complex (linear phase ramp) + # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp' deramping_method: 'complex' + # Deramp data axis prior to FFT: range, azimuth, or both + deramping_axis: 'azimuth' # Flag to use constant range/azimuth offsets in dense offsets estimation use_gross_offsets: True # Constant offset along slant range to guide dense offset estimation @@ -459,8 +461,10 @@ runconfig: start_pixel_azimuth: # Cross-correlation domain cross_correlation_domain: 'frequency' - # Deramp data prior to FFT: magnitude or complex (linear phase ramp) + # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp' deramping_method: 'complex' + # Deramp data axis prior to FFT: range, azimuth, or both + deramping_axis: 'azimuth' # Anti-aliasing oversampling factor to apply to reference/secondary SLC # prior to cross-correlation computation slc_oversampling_factor: 2 diff --git a/share/nisar/schemas/insar.yaml b/share/nisar/schemas/insar.yaml index 999dc97ad..c9e4439e9 100644 --- a/share/nisar/schemas/insar.yaml +++ b/share/nisar/schemas/insar.yaml @@ -329,8 +329,11 @@ dense_offsets_options: # prior to cross-correlation computation slc_oversampling_factor: int(min=2, max=5, required=False) - # Deramp data prior to FFT: magnitude or complex (linear phase ramp) - deramping_method: enum('magnitude', 'complex', required=False) + # Deramp data prior to FFT: 'magnitude' , 'complex' (linear phase ramp), or 'complex_no_deramp' + deramping_method: enum('magnitude', 'complex', 'complex_no_deramp', required=False) + + # Deramp data axis prior to FFT: range, azimuth, or both + deramping_axis: enum('range', 'azimuth', 'both', required=False) # Flag to use constant range/azimuth offsets in dense offsets estimation use_gross_offsets: bool(required=False) @@ -409,8 +412,11 @@ offsets_product_options: # prior to cross-correlation computation slc_oversampling_factor: int(min=2, max=5, required=False) - # Deramp data prior to FFT: magnitude or complex (linear phase ramp) - deramping_method: enum('magnitude', 'complex', required=False) + # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp' + deramping_method: enum('magnitude', 'complex', 'complex_no_deramp', required=False) + + # Deramp data axis prior to FFT: range, azimuth, or both + deramping_axis: enum('range', 'azimuth', 'both', required=False) # Flag to use constant range/azimuth offsets in dense offsets estimation use_gross_offsets: bool(required=False) diff --git a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak index f71dbd19c..30107f364 100644 Binary files a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak and b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak differ diff --git a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets index 93a7ba97f..b2398d3f3 100644 Binary files a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets and b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets differ diff --git a/tests/python/packages/isce3/matchtemplate/test_ampcor.py b/tests/python/packages/isce3/matchtemplate/test_ampcor.py index ea01ff724..09e75fd2b 100644 --- a/tests/python/packages/isce3/matchtemplate/test_ampcor.py +++ b/tests/python/packages/isce3/matchtemplate/test_ampcor.py @@ -83,6 +83,7 @@ def test_ampcor(): ampcor.algorithm = 0 # frequency ampcor.corrSurfaceOverSamplingMethod = ovs ampcor.derampMethod = 1 + ampcor.derampAxis = 0 ampcor.corrStatWindowSize = 21 ampcor.corrSurfaceZoomInWindow = 8 @@ -159,7 +160,8 @@ def test_ampcor(): meantol = 2e-2 tol = 1e-1 elif fname == "correlation_peak": - meantol = 1e-2 + meantol = 2e-2 + tol = 5e-2 else: meantol = 1 / 64 / 5 tol = 1 / 64