Skip to content
Merged
35 changes: 28 additions & 7 deletions cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);

#ifdef CUAMPCOR_DEBUG
r_maxval->outputToFile("r_maxval", stream);
r_maxval->outputToFile("r_corrBatchRawMaxVal", stream);
r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream);
i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid", stream);
r_corrBatchSum->outputToFile("r_corrBatchSum", stream);
Expand Down Expand Up @@ -107,9 +107,17 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
maxLocShift->outputToFile("i_maxLocShift", stream);
#endif

// deramp reference
cuDeramp(param->derampMethod, c_referenceBatchRaw, param->derampAxis, stream);

#ifdef CUAMPCOR_DEBUG
// dump the deramped reference image(s)
c_referenceBatchRaw->outputToFile("c_referenceBatchRawDeramped", stream);
#endif

// oversample reference
// (deramping included in oversampler)
referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod);
referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled);

// take amplitudes
cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, stream);

Expand All @@ -127,15 +135,28 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream);
#endif

// extract secondary and oversample
// extract secondary images around the max location with a smaller search range
cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit, stream);
secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod);

#ifdef CUAMPCOR_DEBUG
// dump the extracted raw secondary image
c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInRaw", stream);
#endif

// deramp secondary
cuDeramp(param->derampMethod, c_secondaryBatchZoomIn, param->derampAxis, stream);

#ifdef CUAMPCOR_DEBUG
// dump the deramped secondary image(s)
c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInDeramped", stream);
#endif

// oversample secondary
secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled);
// take amplitudes
cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream);

#ifdef CUAMPCOR_DEBUG
// dump the extracted raw secondary image
c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn", stream);
// dump the oversampled secondary image(s)
c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream);
r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream);
Expand Down
3 changes: 2 additions & 1 deletion cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ cuAmpcorParameter::cuAmpcorParameter()
algorithm = 0; //0 freq; 1 time
deviceID = 0;
nStreams = 1;
derampMethod = 1;
derampMethod = 1; // average deramp
derampAxis = 2; // both directions

windowSizeWidthRaw = 64;
windowSizeHeightRaw = 64;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class cuAmpcorParameter{
int deviceID; ///< Targeted GPU device ID: use -1 to auto select
int nStreams; ///< Number of streams to asynchonize data transfers and compute kernels
int derampMethod; ///< Method for deramping 0=None, 1=average
int derampAxis; ///< Axis for deramping 0=down (azimuth) 1=across (range), 2=both axes

// chip or window size for raw data
int windowSizeHeightRaw; ///< Template window height (original size)
Expand Down
7 changes: 5 additions & 2 deletions cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,11 @@ void cuArraysC2R(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t
void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);

// cuDeramp.cu: deramping phase
// `cuDeramp` calls a deramp implementation (or does nothing) based on the value of `method`:
// `method=1` for cuLinearDeramp, any other value for no-op
// `cuLinearDeramp` estimates the phase gradient over the chip and removes it.
// `axis` selects the deramp direction(s): 0=down (azimuth), 1=across (range), 2=both.
void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream);
void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream);

// cuArraysPadding.cu: various utilities for oversampling padding
void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
Expand Down
155 changes: 86 additions & 69 deletions cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
* Method 0 or else: skip deramping
*
*/
#include "cuArrays.h"
#include "float2.h"

#include "cuArrays.h"
#include "float2.h"
#include <cfloat>
#include "cudaError.h"
#include "cudaUtil.h"
Expand All @@ -27,14 +27,14 @@
// cuda does not have a good support on volatile vector struct, e.g. float2
// have to use regular float type for shared memory (volatile) data
// the following methods are defined to operate float2/complex objects through float
// Store complex value x into volatile shared memory as two separate floats:
// real part at s[i], imaginary part at s[i+block].
inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block)
{ s[i] = x.x; s[i+block] = x.y; }

// Load complex value x back from volatile shared memory:
// real part from s[i], imaginary part from s[i+block].
inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block)
{ x.x = s[i]; x.y = s[i+block]; }


// Accumulate s[i] += s[i+j] for both the real plane and the imaginary plane
// (offset by `block`) of the packed complex data in shared memory.
inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block)
{ s[i] += s[i+j]; s[i+block] += s[i+j+block];}


// Block-wide sum reduction of one complex partial sum per thread.
// On return every thread's `sum` holds the total over the block.
// Preconditions: blockDim.x == nthreads (template parameter, power of two,
// <= 1024) and `shmem` holds at least 2*nthreads floats.
// NOTE(review): the last 6 steps use the legacy volatile-shared-memory idiom
// with no __syncwarp() between warp-level adds; this assumes implicit warp
// synchrony -- confirm behavior on Volta+ (independent thread scheduling).
__device__ void complexSumReduceBlock(float2& sum, volatile float *shmem)
{
    const int tid = threadIdx.x;
    copyToShared(shmem, tid, sum, nthreads);
    __syncthreads();

    // tree reduction: halve the number of active threads each step
    if (nthreads >=1024) { if (tid < 512) { addInShared(shmem, tid, 512, nthreads); } __syncthreads(); }
    if (nthreads >= 512) { if (tid < 256) { addInShared(shmem, tid, 256, nthreads); } __syncthreads(); }
    if (nthreads >= 256) { if (tid < 128) { addInShared(shmem, tid, 128, nthreads); } __syncthreads(); }
    if (nthreads >= 128) { if (tid < 64)  { addInShared(shmem, tid, 64,  nthreads); } __syncthreads(); }
    if (tid < 32)
    {
        // final warp: no barriers, relies on volatile shared memory
        addInShared(shmem, tid, 32, nthreads);
        addInShared(shmem, tid, 16, nthreads);
        addInShared(shmem, tid, 8, nthreads);
        addInShared(shmem, tid, 4, nthreads);
        addInShared(shmem, tid, 2, nthreads);
        addInShared(shmem, tid, 1, nthreads);
    }
    __syncthreads();
    // broadcast the total (at index 0) back to every thread
    copyFromShared(sum, shmem, 0, nthreads);
}

// cuda kernel for cuLinearDeramp
// Launch: one thread block per image in the batch; blockDim.x == nthreads.
// Estimates the average linear phase ramp of each chip from conjugate
// products of adjacent pixels, then removes the ramp in place.
// axis: 0 = down (azimuth) only, 1 = across (range) only, 2 = both.
// NOTE(review): normCoef is currently unused (normalization commented out);
// atan2 of the un-normalized sum gives the same angle.
template<const int nthreads>
__global__ void cuLinearDeramp_kernel(float2 *images, const int imageNX, int const imageNY,
    const int imageSize, const int nImages, const float normCoef, const int axis)
{
    __shared__ float shmem[2*nthreads];
    int pixelIdx, pixelIdxX, pixelIdxY;

    const int bid = blockIdx.x;
    if(bid >= nImages) return;
    float2 *image = images + bid*imageSize;
    const int tid = threadIdx.x;

    // average phase ramp along row/range direction
    double phaseY = 0.0;
    if (axis != 0)
    {
        float2 phaseDiffY = make_float2(0.0f, 0.0f);
        for (int i = tid; i < imageSize; i += nthreads) {
            pixelIdxY = i % imageNY;
            if(pixelIdxY < imageNY -1) {
                pixelIdx = i;
                // conjugate product of horizontally adjacent pixels
                float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]);
                phaseDiffY += cprod;
            }
        }
        complexSumReduceBlock<nthreads>(phaseDiffY, shmem);
        //phaseDiffY *= normCoef;
        phaseY = atan2(phaseDiffY.y, phaseDiffY.x);
    }

    // average phase ramp along column/azimuth direction
    double phaseX = 0.0;
    if (axis != 1)
    {
        float2 phaseDiffX = make_float2(0.0f, 0.0f);
        for (int i = tid; i < imageSize; i += nthreads) {
            pixelIdxX = i / imageNY;
            if(pixelIdxX < imageNX -1) {
                pixelIdx = i;
                // conjugate product of vertically adjacent pixels (stride = row width)
                float2 cprod = complexMulConj(image[i], image[i+imageNY]);
                phaseDiffX += cprod;
            }
        }
        complexSumReduceBlock<nthreads>(phaseDiffX, shmem);
        //phaseDiffX *= normCoef;
        phaseX = atan2(phaseDiffX.y, phaseDiffX.x);
    }

    // deramp with the estimated phase ramps
    for (int i = tid; i < imageSize; i += nthreads)
    {
        pixelIdxX = i / imageNY;
        pixelIdxY = i % imageNY;
        // use double to improve accuracy
        double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
        double phase_sin, phase_cos;
        sincos(phase, &phase_sin, &phase_cos);
        // complex multiply image[i] by exp(i*phase)
        image[i] = make_float2(
            image[i].x*phase_cos - image[i].y*phase_sin,
            image[i].x*phase_sin + image[i].y*phase_cos);
    }
}

/**
Expand All @@ -120,38 +135,40 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co
* @param[in,out] images input/output complex signals
* @param[in] stream cuda stream
*/
/**
 * Remove the average linear phase ramp from a batch of complex chips in place.
 * @param[in,out] images batched complex images (one CUDA block per image)
 * @param[in] axis deramp direction: 0=down (azimuth), 1=across (range), 2=both
 * @param[in] stream cuda stream
 * @throws std::invalid_argument if axis is not 0, 1, or 2
 */
void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream)
{
    if ((axis < 0) or (axis > 2)) {
        throw std::invalid_argument("deramp axis must be 0, 1, or 2");
    }
    const dim3 grid(images->count);
    const int imageSize = images->width*images->height;
    const float invSize = 1.0f/imageSize;

    // choose the block size (template parameter) from the chip size
    if(imageSize <=64) {
        cuLinearDeramp_kernel<64> <<<grid, 64, 0, stream>>>
            (images->devData, images->height, images->width,
             imageSize, images->count, invSize, axis); }
    else if(imageSize <=128) {
        cuLinearDeramp_kernel<128> <<<grid, 128, 0, stream>>>
            (images->devData, images->height, images->width,
             imageSize, images->count, invSize, axis); }
    else if(imageSize <=256) {
        cuLinearDeramp_kernel<256> <<<grid, 256, 0, stream>>>
            (images->devData, images->height, images->width,
             imageSize, images->count, invSize, axis); }
    else {
        cuLinearDeramp_kernel<512> <<<grid, 512, 0, stream>>>
            (images->devData, images->height, images->width,
             imageSize, images->count, invSize, axis); }
    getLastCudaError("cuLinearDeramp kernel error\n");
}
void cuDeramp(int method, cuArrays<float2> *images, cudaStream_t stream)

void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream)
{
switch(method) {
case 1:
cuDerampMethod1(images, stream);
cuLinearDeramp(images, axis, stream);
break;
default:
break;
Expand Down
Loading