From 34ece86ddf11e01e3dc324a4438a648ebbc71bb6 Mon Sep 17 00:00:00 2001
From: Brian Hawkins <1729052+bhawkins@users.noreply.github.com>
Date: Thu, 4 Dec 2025 18:16:40 -0800
Subject: [PATCH 01/13] Use consistent X and Y indexing.

---
 cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
index cd46e7893..25be2e59f 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
@@ -105,8 +105,8 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co
      
     for (int i = tid; i < imageSize; i += nthreads)
     { 
-        pixelIdxX = i%imageNY;
-        pixelIdxY = i/imageNY;
+        pixelIdxX = i / imageNY;
+        pixelIdxY = i % imageNY;
         float phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
         float2 phase_factor = make_float2(cosf(phase), sinf(phase));
         image[i] *= phase_factor;

From d66149ca7a12835f3d642e787c0c2fc2b7ed8dcd Mon Sep 17 00:00:00 2001
From: Brian Hawkins <1729052+bhawkins@users.noreply.github.com>
Date: Thu, 4 Dec 2025 19:53:23 -0800
Subject: [PATCH 02/13] Use consistent X and Y indexing in the CPU version.

---
 cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
index a1d015395..fa8b2482a 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
@@ -52,8 +52,8 @@ static void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const
         float phaseY = atan2f(phaseDiffY.y, phaseDiffY.x);
 
         for (int i = 0; i < imageSize; i++) {
-            const int pixelIdxX = i%imageNY;
-            const int pixelIdxY = i/imageNY;
+            const int pixelIdxX = i / imageNY;
+            const int pixelIdxY = i % imageNY;
             float phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
             float2 phase_factor = make_float2(cosf(phase), sinf(phase));
             image[i] *= phase_factor;

From 70ef962a8567acf52dda3753376b5c6c1c1c0d7d Mon Sep 17 00:00:00 2001
From: Brian Hawkins <1729052+bhawkins@users.noreply.github.com>
Date: Fri, 5 Dec 2025 13:32:05 -0800
Subject: [PATCH 03/13] update golden data

---
 .../ovs128-rho0.8/golden/correlation_peak     | Bin 624 -> 624 bytes
 .../ovs128-rho0.8/golden/dense_offsets        | Bin 1248 -> 1248 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak
index f71dbd19c66aa4d5119b3d522220f88f11594853..2d13833a8d0b5a7c09a3958e194580c245fc9d4c 100644
GIT binary patch
literal 624
zcmWN_dn^=i0KoBHnvlpNne~UQx{0PTS>yY&T9KNXn)imxA8n0Z^GCBX{Si%Vnbl@7
zkA}F+*m+(O?%WYa?zlVe$Mv{8dT=34eg6DhFzaaOwiQkeqd0g^;}>YezeCT=u4Ymb
z6x6Lxv0gTUo8K03vs+8RF+E9UGj_W-X^-9`BvLC;d&N!UY#+iy_JzpNX6h^IQJq&&
z?k*?7p^1ynErd<JM;&FMs7y;usD}8xMp8;te2cfRTH!3bTL&plTxVm*N+>%G9PjI)
zQ>Wp3@G#e=28c*g5>TS0A<0Cgmy1|=YAbwFS4gy$<E8#VUbT+Qs1{1fCn$*6WU;HC
z(1Jh65=QV%ouR?rMR-^qvQMpKEU1msIX%?tm&y3>3zNjiql973MGa#~86YdiMf9CC
z@Nr|5z_Df$-_BwkU?N@8L2_d)eTVl8YuhE<GrNgR(NLN-M^N|@r=)rux7mrK`IWfG
z%##>AOMQGV)j>vTrp9^J+)sAl0&*tNp0OACh9){w%ej}UAVcpWLf(|HsL`?zr{HRf
zn(SLsI2Rjn%dO+U+#u6#bKI5vzdbsnrR|7A3xquw=Z24gOPR~8MfDQvx=KjC9?y6E
z*vm(GAzLDApcV586PuSO@J&&2driShk59<+mSJ`tn<qNb?<?{6+>4~Hhob1MqBh)w
z=c^W?6>WIN4Y7D<kR4|Ygf2UY!NOu<`g?htro#ODD>kwYc1qP;mo@U&u?GJmoxE0U
V5;^>ns`6>74r;J=9V1!V#Xo!IF`obc

literal 624
zcmV~$dn^=i0KoAhkJ@5p9_zK43Yn=!rs(@ClUCEB(IiHh*h(vH(Xged^+(fMjb-N5
zwMjbCYV)S!+~f{-PCI9GoX0UbHP50xA5WV_($ZC=l*~}jxW-aMHO*FY3{ST*9q<j0
z^L8R`z{KKRI}w$tBY*))i#IX-o#w}~wUA|XFy`VSo+c<MOf*yavyo@}Ti7_JCO&_e
zrt1nW$!F=X?ZGy6jjos>l5&PPq|&hxufP(q$u3_{aUkdq8w;~saCQ;DUs?;VX*GpK
zdIGwdk(B9qJfbB0q(n&l){w@!3#WVQ`1?AFO1YUxnX8bSTS@-pAY4n#d|;f~hidM$
z$XTpbl9M~e>qkyvrMCv@b0xC&Vcd+J9KSe^By*V?SFA*8`2_t(jI`g^kY4X5{2!Vq
z@X!$#-Av>!HId!pbWK^<o%ELae|to>OAA#VO}yHx#VuRU;G0oOcN&nyuM=utMwsn5
zYRWN24R9-B5Q(aX@Yoqb@|HN9IEH=UI<g&;1fSKS-M4^r*hctWQ<0VOo|J<+!Xk&r
zJKc&Rr;bTa8Sg^6F_jq!wrI%h>!RV5fz=Exb#p34l8gklYDkuU<;58jUos4=q)iZ1
ztEWh(r?+?#Lu@|{6$1pGlT+fYVfONO9ymA&$1Nk(NeW_Y+t@zUhjXTsk5_xBowZO?
zxy5I1E!#|e$eySO%{NftzKZqHX6jY#_~iFe<TXc|or;9pQ&c->@U*GoW>hEb(ev2*
SOt2iQBUm=diMCPt8R0*0vn`JR

diff --git a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets
index 93a7ba97f558029e629b40257a948919f178383d..9b1a3d0ad363943e0cadcd30fa2c1f10dde6ce40 100644
GIT binary patch
literal 1248
zcmY+Bfo;P;3<EWa2XTOm;!%E-jL-p6z@uc8-qBK$3J|ELNd7yiT6?8Sd{)(O)yaGp
zId7o5`kLi!%f0ifb(6Q;+xsV1E_%4^^xygp=6E>gzSe{zTU_oy>vRu)dIsgqj*eY;
zXx}qB_hyeS8qU!?KWO&&_3D}2;H~Nb@7wjN)pNjkF3!&SQk>%h-8oNXCTD8X=YAii
z^PBU|lQ}*)%Guv$yyAnFS9|E$5i%#YcO}BRO9p$-(cn4Q-@7Mkw4&KR==6BM`jp7d
z{mYqlUbJ%>nA+s1?DtD+et0~3Eip5=^neF=r#<_QybJW3QzE<ZsjcP;ogUx5p4a~w
z9o*KMO+9_kYEsSVEjZEKLG_jy<4LZX&hXA~=uZ5TRh{Sg&8cbMyv2O)RG<6C_l@r8
zc=wO){VusFv#4Y`^jkN1bY?8tz3Bd5p-C<n-R;^&s~+B-yU)>09-3&ina$a|;uEQb
QXYj3+(CZF7Y5l?a1IDTD`v3p{

literal 1248
zcmYk3(T#&Z3`G5TWl#u6ltL+#kP=b|ciIxJq`Wt*BP?0AXU1bTsWQ&1YLK;&zf~`)
z^SR1aEnp>2s6CjYe_(Gg^$WYEI(K&P_SHvxpfl_bwCW7`Ay3K~YV~yvMsFNw^$zA9
zxLCZ=$`#f4YMAYp=GL2UP3}CfuxNX~nzLTyoaK|=r}wrs--0aPGO=UUc+Wk3)F<HM
z)iX7m^GZ_X4d3y-vEA>N7k`l@&#WdpV)Ww9!Km%4)`(smsaFC{{seQ+|IJx#%_?r}
z?ue1c<Ce4g%@<rSaP`4yfBtS?IdMJI+9&DrzPYnBzrmZtLiJXe=+djBA7Jrh6i<S?
zds@NNh_|=;^y1ge>s5LZ;d%yhR}Xpn_e1BLTUJt?m^rRH^NtOSS#N9ZQTFN8SSq{j
zmkIAqJQ`#4Hyh5^{r+W}x0sE-pF8e@>utI>sQoSRM<SM+Q^S<KH{So#Lnu0%zB%iT
f#=Cmh$IFUe-?BeRBNc7lCIG(Um8zFI_s#kMJEieH


From ee27f43f1b6f5df26660c1489612961e85d300d8 Mon Sep 17 00:00:00 2001
From: Lijun Zhu <ljzhu@gps.caltech.edu>
Date: Thu, 11 Dec 2025 01:36:28 -0800
Subject: [PATCH 04/13] deramp improvement

 1) fix the pixelIdxX/pixelIdxY indexing error (thanks to @bhawkins)
 2) use double precision for the phase correction
 3) add a derampAxis parameter to select the deramping direction
    (0=down/azimuth, 1=across/range, 2=both axes)
 4) move cuDeramp from the oversampler to the chunk processor, which
    makes the code more readable
---
 .../matchtemplate/pycuampcor/cuAmpcorChunk.cu |  35 +++-
 .../pycuampcor/cuAmpcorParameter.cu           |   3 +-
 .../pycuampcor/cuAmpcorParameter.h            |   1 +
 .../matchtemplate/pycuampcor/cuAmpcorUtil.h   |   4 +-
 .../cuda/matchtemplate/pycuampcor/cuDeramp.cu | 149 ++++++++++--------
 .../matchtemplate/pycuampcor/cuOverSampler.cu |  26 ++-
 .../matchtemplate/pycuampcor/cuOverSampler.h  |   2 +-
 .../pycuampcor/cuAmpcorChunk.cpp              |  34 +++-
 .../pycuampcor/cuAmpcorParameter.cpp          |   1 +
 .../pycuampcor/cuAmpcorParameter.h            |   1 +
 .../matchtemplate/pycuampcor/cuAmpcorUtil.h   |   4 +-
 .../matchtemplate/pycuampcor/cuDeramp.cpp     |  61 ++++---
 .../pycuampcor/cuOverSampler.cpp              |  24 ++-
 .../matchtemplate/pycuampcor/cuOverSampler.h  |   2 +-
 .../cuda/matchtemplate/pycuampcor.cpp         |   1 +
 .../pybind_isce3/matchtemplate/pycuampcor.cpp |   1 +
 .../packages/nisar/workflows/dense_offsets.py |  16 +-
 17 files changed, 224 insertions(+), 141 deletions(-)

diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu
index 8f35fa075..5825ad37d 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu
@@ -76,7 +76,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
 
 #ifdef CUAMPCOR_DEBUG
-    r_maxval->outputToFile("r_maxval", stream);
+    r_maxval->outputToFile("r_corrBatchRawMaxVal", stream);
     r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream);
     i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid", stream);
     r_corrBatchSum->outputToFile("r_corrBatchSum", stream);
@@ -107,9 +107,17 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     maxLocShift->outputToFile("i_maxLocShift", stream);
 #endif
 
+    // deramp reference
+    cuDeramp(param->derampMethod, c_referenceBatchRaw, param->derampAxis, stream);
+
+#ifdef CUAMPCOR_DEBUG
+    // dump the deramped reference image(s)
+    c_referenceBatchRaw->outputToFile("c_referenceBatchRawDeramped", stream);
+#endif
+
     // oversample reference
-    // (deramping included in oversampler)
-    referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod);
+    referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled);
+
     // take amplitudes
     cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, stream);
 
@@ -127,15 +135,28 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream);
 #endif
 
-    // extract secondary and oversample
+    // extract secondary images around the max location with a smaller search range
     cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit, stream);
-    secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod);
+
+#ifdef CUAMPCOR_DEBUG
+    // dump the extracted raw secondary image
+    c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInRaw", stream);
+#endif
+
+    // deramp secondary
+    cuDeramp(param->derampMethod, c_secondaryBatchZoomIn, param->derampAxis, stream);
+
+#ifdef CUAMPCOR_DEBUG
+    // dump the deramped secondary image(s)
+    c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInDeramped", stream);
+#endif
+
+    // oversample secondary
+    secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled);
     // take amplitudes
     cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream);
 
 #ifdef CUAMPCOR_DEBUG
-    // dump the extracted raw secondary image
-    c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn", stream);
     // dump the oversampled secondary image(s)
     c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream);
     r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream);
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu
index 9acc6b294..6e058df0c 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu
@@ -22,7 +22,8 @@ cuAmpcorParameter::cuAmpcorParameter()
     algorithm = 0; //0 freq; 1 time
     deviceID = 0;
     nStreams = 1;
-    derampMethod = 1;
+    derampMethod = 1; // average deramp
+    derampAxis = 2; // both directions
 
     windowSizeWidthRaw = 64;
     windowSizeHeightRaw = 64;
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h
index 357f36ba9..6c5a01b49 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h
@@ -44,6 +44,7 @@ class cuAmpcorParameter{
     int deviceID;       ///< Targeted GPU device ID: use -1 to auto select
     int nStreams;       ///< Number of streams to asynchonize data transfers and compute kernels
     int derampMethod;   ///< Method for deramping 0=None, 1=average
+    int derampAxis;     ///< Axis for deramping 0=down (azimuth) 1=across (range), 2=both axes
 
     // chip or window size for raw data
     int windowSizeHeightRaw;        ///< Template window height (original size)
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
index d56a01c67..59be4cbc0 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
@@ -46,8 +46,8 @@ void cuArraysC2R(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t
 void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);
 
 // cuDeramp.cu: deramping phase
-void cuDeramp(int method, cuArrays<float2> *images, cudaStream_t stream);
-void cuDerampMethod1(cuArrays<float2> *images, cudaStream_t stream);
+void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream);
+void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream);
 
 // cuArraysPadding.cu: various utilities for oversampling padding
 void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
index 25be2e59f..786b9d4dd 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
@@ -11,9 +11,9 @@
  * Method 0 or else: skip deramping
  *
  */
- 
-#include "cuArrays.h" 
-#include "float2.h" 
+
+#include "cuArrays.h"
+#include "float2.h"
 #include <cfloat>
 #include "cudaError.h"
 #include "cudaUtil.h"
@@ -27,14 +27,14 @@
 // cuda does not have a good support on volatile vector struct, e.g. float2
 // have to use regular float type for shared memory (volatile) data
 // the following methods are defined to operate float2/complex objects through float
-inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block) 
+inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block)
 { s[i] = x.x; s[i+block] = x.y; }
 
-inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block) 
+inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block)
 { x.x = s[i]; x.y = s[i+block]; }
 
 
-inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block) 
+inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block)
 { s[i] += s[i+j]; s[i+block] += s[i+j+block];}
 
 
@@ -45,72 +45,87 @@ __device__ void complexSumReduceBlock(float2& sum, volatile float *shmem)
     const int tid = threadIdx.x;
     copyToShared(shmem, tid, sum, nthreads);
     __syncthreads();
-    
+
     if (nthreads >=1024) { if (tid < 512) { addInShared(shmem, tid, 512, nthreads); } __syncthreads(); }
     if (nthreads >= 512) { if (tid < 256) { addInShared(shmem, tid, 256, nthreads); } __syncthreads(); }
     if (nthreads >= 256) { if (tid < 128) { addInShared(shmem, tid, 128, nthreads); } __syncthreads(); }
     if (nthreads >= 128) { if (tid <  64) { addInShared(shmem, tid,  64, nthreads); } __syncthreads(); }
     if (tid < 32)
-    {	
+    {
         addInShared(shmem, tid, 32, nthreads);
         addInShared(shmem, tid, 16, nthreads);
         addInShared(shmem, tid,  8, nthreads);
         addInShared(shmem, tid,  4, nthreads);
         addInShared(shmem, tid,  2, nthreads);
-        addInShared(shmem, tid,  1, nthreads); 
+        addInShared(shmem, tid,  1, nthreads);
     }
     __syncthreads();
     copyFromShared(sum, shmem, 0, nthreads);
 }
 
-// cuda kernel for cuDerampMethod1
+// cuda kernel for cuLinearDeramp with Method 1
 template<const int nthreads>
-__global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY, 
-    const int imageSize, const int nImages, const float normCoef)
+__global__ void cuLinearDeramp_kernel(float2 *images, const int imageNX, int const imageNY,
+    const int imageSize, const int nImages, const float normCoef, const int axis)
 {
     __shared__ float shmem[2*nthreads];
     int pixelIdx, pixelIdxX, pixelIdxY;
-    
-    const int bid = blockIdx.x;    
+
+    const int bid = blockIdx.x;
     if(bid >= nImages) return;
     float2 *image = images+ bid*imageSize;
-    const int tid = threadIdx.x;  
-    float2 phaseDiffY  = make_float2(0.0f, 0.0f);
-    for (int i = tid; i < imageSize; i += nthreads) {
-        pixelIdxY = i % imageNY;
-        if(pixelIdxY < imageNY -1) {
-            pixelIdx = i;
-            float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]);   
-            phaseDiffY += cprod;
-        } 
-    }       
-    complexSumReduceBlock<nthreads>(phaseDiffY, shmem);
-    //phaseDiffY *= normCoef;
-    float phaseY=atan2f(phaseDiffY.y, phaseDiffY.x);
-
-    float2 phaseDiffX  = make_float2(0.0f, 0.0f);
-    for (int i = tid; i < imageSize; i += nthreads)  {
-        pixelIdxX = i / imageNY; 
-        if(pixelIdxX < imageNX -1) {
-            pixelIdx = i;
-            float2 cprod = complexMulConj(image[i], image[i+imageNY]);
-            phaseDiffX += cprod;
+    const int tid = threadIdx.x;
+
+    // average phase ramp along row/range direction
+    double phaseY = 0.0;
+    if (axis != 0)
+    {
+        float2 phaseDiffY  = make_float2(0.0f, 0.0f);
+        for (int i = tid; i < imageSize; i += nthreads) {
+            pixelIdxY = i % imageNY;
+            if(pixelIdxY < imageNY -1) {
+                pixelIdx = i;
+                float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]);
+                phaseDiffY += cprod;
+            }
         }
-    }   
-    
-    complexSumReduceBlock<nthreads>(phaseDiffX, shmem);
-   
-    //phaseDiffX *= normCoef;
-    float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x);  //+FLT_EPSILON
-     
+        complexSumReduceBlock<nthreads>(phaseDiffY, shmem);
+        //phaseDiffY *= normCoef;
+        phaseY=atan2(phaseDiffY.y, phaseDiffY.x);
+    }
+
+    // average phase ramp along column/azimuth direction
+    double phaseX = 0.0;
+    if (axis != 1)
+    {
+        float2 phaseDiffX  = make_float2(0.0f, 0.0f);
+        for (int i = tid; i < imageSize; i += nthreads)  {
+            pixelIdxX = i / imageNY;
+            if(pixelIdxX < imageNX -1) {
+                pixelIdx = i;
+                float2 cprod = complexMulConj(image[i], image[i+imageNY]);
+                phaseDiffX += cprod;
+            }
+        }
+
+        complexSumReduceBlock<nthreads>(phaseDiffX, shmem);
+
+        //phaseDiffX *= normCoef;
+        phaseX = atan2(phaseDiffX.y, phaseDiffX.x);  //+FLT_EPSILON
+    }
+    // deramp with the estimated phase ramps
     for (int i = tid; i < imageSize; i += nthreads)
-    { 
+    {
         pixelIdxX = i / imageNY;
         pixelIdxY = i % imageNY;
-        float phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
-        float2 phase_factor = make_float2(cosf(phase), sinf(phase));
-        image[i] *= phase_factor;
-    }     
+        // use double to improve accuracy
+        double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
+        double phase_cos = cos(phase);
+        double phase_sin = sin(phase);
+        image[i] = make_float2(
+            image[i].x*phase_cos - image[i].y*phase_sin,
+            image[i].x*phase_sin + image[i].y*phase_cos);
+    }
 }
 
 /**
@@ -120,38 +135,38 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co
  * @param[in,out] images input/output complex signals
  * @param[in] stream cuda stream
  */
-void cuDerampMethod1(cuArrays<float2> *images, cudaStream_t stream)
+void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream)
 {
-    
+
     const dim3 grid(images->count);
     const int imageSize = images->width*images->height;
     const float invSize = 1.0f/imageSize;
 
     if(imageSize <=64) {
-        cuDerampMethod1_kernel<64> <<<grid, 64, 0, stream>>>
-        (images->devData, images->height, images->width, 
-        imageSize, images->count, invSize); }
-     else if(imageSize <=128) {
-        cuDerampMethod1_kernel<128> <<<grid, 128, 0, stream>>>
-        (images->devData, images->height, images->width, 
-        imageSize, images->count, invSize); }   
-     else if(imageSize <=256) {
-        cuDerampMethod1_kernel<256> <<<grid, 256, 0, stream>>>
-        (images->devData, images->height, images->width, 
-        imageSize, images->count, invSize); }  
+        cuLinearDeramp_kernel<64> <<<grid, 64, 0, stream>>>
+        (images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis); }
+    else if(imageSize <=128) {
+        cuLinearDeramp_kernel<128> <<<grid, 128, 0, stream>>>
+        (images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis); }
+    else if(imageSize <=256) {
+        cuLinearDeramp_kernel<256> <<<grid, 256, 0, stream>>>
+        (images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis); }
     else  {
-        cuDerampMethod1_kernel<512> <<<grid, 512, 0, stream>>>
-        (images->devData, images->height, images->width, 
-        imageSize, images->count, invSize); }
-    getLastCudaError("cuDerampMethod1 kernel error\n");
+        cuLinearDeramp_kernel<512> <<<grid, 512, 0, stream>>>
+        (images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis); }
+    getLastCudaError("cuLinearDeramp kernel error\n");
 
 }
-        
-void cuDeramp(int method, cuArrays<float2> *images, cudaStream_t stream)
+
+void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream)
 {
     switch(method) {
     case 1:
-        cuDerampMethod1(images, stream);
+        cuLinearDeramp(images, axis, stream);
         break;
     default:
         break;
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu
index 1b6ab6267..d60399023 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu
@@ -1,4 +1,4 @@
-/* 
+/*
  * @file cuOverSampler.cu
  * @brief Implementations of cuOverSamplerR2R (C2C) class
  */
@@ -22,17 +22,17 @@
  */
 cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_)
 {
-    
+
     int inNXp2 = inNX;
     int inNYp2 = inNY;
     int outNXp2 = outNX;
     int outNYp2 = outNY;
-    
+
     /* if expanded to 2^n
     int inNXp2 = nextpower2(inNX);
     int inNYp2 = nextpower2(inNY);
     int outNXp2 = inNXp2*outNX/inNX;
-    int outNYp2 = inNYp2*outNY/inNY; 
+    int outNYp2 = inNYp2*outNY/inNY;
     */
 
     // set up work arrays
@@ -67,25 +67,23 @@ void cuOverSamplerC2C::setStream(cudaStream_t stream_)
  * Execute fft oversampling
  * @param[in] imagesIn input batch of images
  * @param[out] imagesOut output batch of images
- * @param[in] method phase deramping method
  */
-void cuOverSamplerC2C::execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int method)
-{   
-    cuDeramp(method, imagesIn, stream);         
+void cuOverSamplerC2C::execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut)
+{
     cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_INVERSE ));
     cuArraysPaddingMany(workIn, workOut, stream);
     cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_FORWARD));
 }
 
 /// destructor
-cuOverSamplerC2C::~cuOverSamplerC2C() 
+cuOverSamplerC2C::~cuOverSamplerC2C()
 {
     // destroy fft handles
     cufft_Error(cufftDestroy(forwardPlan));
     cufft_Error(cufftDestroy(backwardPlan));
     // deallocate work arrays
     delete(workIn);
-    delete(workOut);	
+    delete(workOut);
 }
 
 // end of cuOverSamplerC2C
@@ -99,7 +97,7 @@ cuOverSamplerC2C::~cuOverSamplerC2C()
  */
 cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream)
 {
-    
+
     int inNXp2 = inNX;
     int inNYp2 = inNY;
     int outNXp2 = outNX;
@@ -144,14 +142,14 @@ void cuOverSamplerR2R::execute(cuArrays<float> *imagesIn, cuArrays<float> *image
     cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_INVERSE));
     cuArraysPaddingMany(workSizeIn, workSizeOut, stream);
     cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData,CUFFT_FORWARD ));
-    cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream);	
+    cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream);
 }
 
 /// destructor
-cuOverSamplerR2R::~cuOverSamplerR2R() 
+cuOverSamplerR2R::~cuOverSamplerR2R()
 {
     cufft_Error(cufftDestroy(forwardPlan));
-    cufft_Error(cufftDestroy(backwardPlan));	
+    cufft_Error(cufftDestroy(backwardPlan));
     workSizeIn->deallocate();
     workSizeOut->deallocate();
 }
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h
index 9ddce96b2..a3154b613 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h
@@ -31,7 +31,7 @@ class cuOverSamplerC2C
      // set cuda stream
      void setStream(cudaStream_t stream_);
      // execute oversampling
-     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int deramp_method=0);
+     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut);
      // destructor
      ~cuOverSamplerC2C();
 };
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp
index 7e8b6fb98..8f5fb65ad 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp
@@ -85,7 +85,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount);
 
 #ifdef CUAMPCOR_DEBUG
-    r_maxval->outputToFile("r_maxval");
+    r_maxval->outputToFile("r_corrBatchRawMaxVal");
     r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn");
     i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid");
     r_corrBatchSum->outputToFile("r_corrBatchSum");
@@ -115,9 +115,16 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     maxLocShift->outputToFile("i_maxLocShift");
 #endif
 
+    // deramp reference
+    cuDeramp(param->derampMethod, c_referenceBatchRaw, param->derampAxis);
+
+#ifdef CUAMPCOR_DEBUG
+    // dump the deramped reference image(s)
+    c_referenceBatchRaw->outputToFile("c_referenceBatchRawDeramped");
+#endif
+
     // oversample reference
-    // (deramping included in oversampler)
-    referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod);
+    referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled);
     // take amplitudes
     cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled);
 
@@ -135,15 +142,28 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean");
 #endif
 
-    // extract secondary and oversample
+    // extract secondary for smaller search window
     cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit);
-    secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod);
-    // take amplitudes
-    cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled);
 
 #ifdef CUAMPCOR_DEBUG
     // dump the extracted raw secondary image
     c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn");
+#endif
+
+    // deramp secondary
+    cuDeramp(param->derampMethod, c_secondaryBatchZoomIn, param->derampAxis);
+#ifdef CUAMPCOR_DEBUG
+    // dump the deramped secondary image(s)
+    c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInDeramped");
+#endif
+
+    // oversample secondary
+    secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled);
+
+    // take amplitudes
+    cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled);
+
+#ifdef CUAMPCOR_DEBUG
     // dump the oversampled secondary image(s)
     c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled");
     r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled");
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp
index e10e38fc1..b82313fc9 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp
@@ -25,6 +25,7 @@ cuAmpcorParameter::cuAmpcorParameter()
     deviceID = 0;
     nStreams = 1;
     derampMethod = 1;
+    derampAxis = 2; // both directions
 
     windowSizeWidthRaw = 64;
     windowSizeHeightRaw = 64;
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h
index 5c9258d04..ce6d095bd 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h
@@ -46,6 +46,7 @@ class cuAmpcorParameter{
     int deviceID;       ///< Targeted GPU device ID: use -1 to auto select
     int nStreams;       ///< Number of streams to asynchonize data transfers and compute kernels
     int derampMethod;   ///< Method for deramping 0=None, 1=average
+    int derampAxis;     ///< Axis for deramping 0=down (azimuth) 1=across (range), 2=both axes
 
     // chip or window size for raw data
     int windowSizeHeightRaw;        ///< Template window height (original size)
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h
index be318fc9a..7a1b6d373 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h
@@ -42,8 +42,8 @@ void cuArraysCopyPadded(cuArrays<T_in> *imageIn, cuArrays<T_out> *imageOut);
 void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2);
 
 // cuDeramp.cu: deramping phase
-void cuDeramp(int method, cuArrays<float2> *images);
-void cuDerampMethod1(cuArrays<float2> *images);
+void cuDeramp(const int method, cuArrays<float2> *images, const int axis);
+void cuLinearDeramp(cuArrays<float2> *images, const int axis);
 
 // cuArraysPadding.cu: various utilities for oversampling padding
 void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2);
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
index fa8b2482a..6924818a2 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
@@ -23,40 +23,51 @@
 
 namespace isce3::matchtemplate::pycuampcor {
 
-// kernel for cuDerampMethod1
-static void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY,
-    const int imageSize, const int nImages, const float normCoef)
+// kernel for linear deramping
+static void cuLinearDeramp_kernel(float2 *images, const int imageNX, const int imageNY,
+    const int imageSize, const int nImages, const float normCoef, const int axis)
 {
     for (int k = 0; k < nImages; k++) {
 
         float2* image = images + k * imageSize;
 
-        double2 phaseDiffY = make_double2(0.0, 0.0);
-        for (int j = 0; j < imageNX; j++) {
-            for (int i = 0; i < imageNY - 1; i++) {
-                const int pixelIdx = j * imageNY + i;
-                float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+1]);
-                phaseDiffY += cprod;
+        double phaseY = 0.0;
+        if(axis != 0)
+        {
+            double2 phaseDiffY = make_double2(0.0, 0.0);
+            for (int j = 0; j < imageNX; j++) {
+                for (int i = 0; i < imageNY - 1; i++) {
+                    const int pixelIdx = j * imageNY + i;
+                    float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+1]);
+                    phaseDiffY += cprod;
+                }
             }
+            phaseY = atan2(phaseDiffY.y, phaseDiffY.x);
         }
 
-        double2 phaseDiffX = make_double2(0.0, 0.0);
-        for (int j = 0; j < imageNX - 1; j++) {
-            for (int i = 0; i < imageNY; i++) {
-                const int pixelIdx = j * imageNY + i;
-                float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+imageNY]);
-                phaseDiffX += cprod;
+        double phaseX = 0.0;
+        if(axis != 1)
+        {
+            double2 phaseDiffX = make_double2(0.0, 0.0);
+            for (int j = 0; j < imageNX - 1; j++) {
+                for (int i = 0; i < imageNY; i++) {
+                    const int pixelIdx = j * imageNY + i;
+                    float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+imageNY]);
+                    phaseDiffX += cprod;
+                }
             }
+            phaseX = atan2(phaseDiffX.y, phaseDiffX.x);
         }
-        float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x);
-        float phaseY = atan2f(phaseDiffY.y, phaseDiffY.x);
 
         for (int i = 0; i < imageSize; i++) {
             const int pixelIdxX = i / imageNY;
             const int pixelIdxY = i % imageNY;
-            float phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
-            float2 phase_factor = make_float2(cosf(phase), sinf(phase));
-            image[i] *= phase_factor;
+            double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
+            double phase_cos = cos(phase);
+            double phase_sin = sin(phase);
+            image[i] = make_float2(
+                image[i].x*phase_cos - image[i].y*phase_sin,
+                image[i].x*phase_sin + image[i].y*phase_cos);
         }
     }
 }
@@ -67,20 +78,20 @@ static void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const
  *   and the average phase shift is obtained as atan(\sum imag / \sum real).
  * @param[inout] images input/output complex signals
  */
-void cuDerampMethod1(cuArrays<float2> *images)
+void cuLinearDeramp(cuArrays<float2> *images, const int axis)
 {
     const int imageSize = images->width*images->height;
     const float invSize = 1.0f/imageSize;
 
-    cuDerampMethod1_kernel(images->devData, images->height, images->width,
-        imageSize, images->count, invSize);
+    cuLinearDeramp_kernel(images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis);
 }
 
-void cuDeramp(int method, cuArrays<float2> *images)
+void cuDeramp(const int method, cuArrays<float2> *images, const int axis)
 {
     switch(method) {
     case 1:
-        cuDerampMethod1(images);
+        cuLinearDeramp(images, axis);
         break;
     default:
         break;
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp
index 4b84e6aa6..e94de10ff 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * @file cuOverSampler.cu
  * @brief Implementations of cuOverSamplerR2R (C2C) class
  */
@@ -23,17 +23,17 @@ cuOverSamplerC2C::cuOverSamplerC2C(
         cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut,
         int inNX, int inNY, int outNX, int outNY, int nImages)
 {
-    
+
     int inNXp2 = inNX;
     int inNYp2 = inNY;
     int outNXp2 = outNX;
     int outNYp2 = outNY;
-    
+
     /* if expanded to 2^n
     int inNXp2 = nextpower2(inNX);
     int inNYp2 = nextpower2(inNY);
     int outNXp2 = inNXp2*outNX/inNX;
-    int outNYp2 = inNYp2*outNY/inNY; 
+    int outNYp2 = inNYp2*outNY/inNY;
     */
 
     // set up work arrays
@@ -70,25 +70,23 @@ cuOverSamplerC2C::cuOverSamplerC2C(
  * Execute fft oversampling
  * @param[in] imagesIn input batch of images
  * @param[out] imagesOut output batch of images
- * @param[in] method phase deramping method
  */
-void cuOverSamplerC2C::execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int method)
-{   
-    cuDeramp(method, imagesIn);
+void cuOverSamplerC2C::execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut)
+{
     fftwf_execute(forwardPlan);
     cuArraysPaddingMany(workIn, workOut);
     fftwf_execute(backwardPlan);
 }
 
 /// destructor
-cuOverSamplerC2C::~cuOverSamplerC2C() 
+cuOverSamplerC2C::~cuOverSamplerC2C()
 {
     // destroy fft handles
     fftwf_destroy_plan(forwardPlan);
     fftwf_destroy_plan(backwardPlan);
     // deallocate work arrays
     delete(workIn);
-    delete(workOut);	
+    delete(workOut);
 }
 
 // end of cuOverSamplerC2C
@@ -101,7 +99,7 @@ cuOverSamplerC2C::~cuOverSamplerC2C()
  */
 cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages)
 {
-    
+
     int inNXp2 = inNX;
     int inNYp2 = inNY;
     int outNXp2 = outNX;
@@ -151,11 +149,11 @@ void cuOverSamplerR2R::execute(cuArrays<float> *imagesIn, cuArrays<float> *image
     fftwf_execute(forwardPlan);
     cuArraysPaddingMany(workSizeIn, workSizeOut);
     fftwf_execute(backwardPlan);
-    cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0));	
+    cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0));
 }
 
 /// destructor
-cuOverSamplerR2R::~cuOverSamplerR2R() 
+cuOverSamplerR2R::~cuOverSamplerR2R()
 {
     fftwf_destroy_plan(forwardPlan);
     fftwf_destroy_plan(backwardPlan);
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h
index 2b79a67b2..6a57b9509 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h
@@ -33,7 +33,7 @@ class cuOverSamplerC2C
      cuOverSamplerC2C(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut,
              int inNX, int inNY, int outNX, int outNY, int nImages);
      // execute oversampling
-     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int deramp_method=0);
+     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut);
      // destructor
      ~cuOverSamplerC2C();
 };
diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp
index 7e884ba45..036376c56 100644
--- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp
+++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp
@@ -32,6 +32,7 @@ void addbinding_pycuampcor(pybind11::module& m)
         .DEF_PARAM(int, deviceID)
         .DEF_PARAM(int, nStreams)
         .DEF_PARAM(int, derampMethod)
+        .DEF_PARAM(int, derampAxis)
 
         .DEF_PARAM(str, referenceImageName)
         .DEF_PARAM(int, referenceImageHeight)
diff --git a/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp
index d4ff33b32..f783ae709 100644
--- a/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp
+++ b/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp
@@ -32,6 +32,7 @@ void addbinding_pycuampcor_cpu(pybind11::module& m)
         .DEF_PARAM(int, deviceID)
         .DEF_PARAM(int, nStreams)
         .DEF_PARAM(int, derampMethod)
+        .DEF_PARAM(int, derampAxis)
 
         .DEF_PARAM(str, referenceImageName)
         .DEF_PARAM(int, referenceImageHeight)
diff --git a/python/packages/nisar/workflows/dense_offsets.py b/python/packages/nisar/workflows/dense_offsets.py
index e897b19cd..54c785323 100644
--- a/python/packages/nisar/workflows/dense_offsets.py
+++ b/python/packages/nisar/workflows/dense_offsets.py
@@ -202,7 +202,21 @@ def set_optional_attributes(ampcor_obj, cfg, length, width):
 
     if cfg['deramping_method'] is not None:
         deramp = cfg['deramping_method']
-        ampcor_obj.derampMethod = 0 if deramp == "magnitude" else 1
+        if deramp == "magnitude":
+            ampcor_obj.derampMethod = 0
+        elif deramp == "linear":
+            ampcor_obj.derampMethod = 1
+        else: # skip deramping
+            ampcor_obj.derampMethod = 2
+
+    if cfg['deramping_axis'] is not None:
+        deramp_axis = cfg['deramping_axis']
+        if deramp_axis == "azimuth":
+            ampcor_obj.derampAxis = 0
+        elif deramp_axis == "range":
+            ampcor_obj.derampAxis = 1
+        else: # both directions
+            ampcor_obj.derampAxis = 2
 
     if cfg['correlation_statistics_zoom'] is not None:
         ampcor_obj.corrStatWindowSize = cfg['correlation_statistics_zoom']

From 619df3a063a606e60c227cf7fc2214e6989ae1de Mon Sep 17 00:00:00 2001
From: Xiaodong Huang <xhuang@nisar-adt-dev-3.jpl.nasa.gov>
Date: Thu, 11 Dec 2025 19:50:29 +0000
Subject: [PATCH 05/13] update the runconfig

---
 .../packages/nisar/workflows/dense_offsets.py |  2 +-
 .../nisar/workflows/offsets_product.py        | 21 +++++++++++++++++--
 share/nisar/defaults/insar.yaml               |  2 ++
 share/nisar/schemas/insar.yaml                |  3 +++
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/python/packages/nisar/workflows/dense_offsets.py b/python/packages/nisar/workflows/dense_offsets.py
index 54c785323..4a1feddd3 100644
--- a/python/packages/nisar/workflows/dense_offsets.py
+++ b/python/packages/nisar/workflows/dense_offsets.py
@@ -204,7 +204,7 @@ def set_optional_attributes(ampcor_obj, cfg, length, width):
         deramp = cfg['deramping_method']
         if deramp == "magnitude":
             ampcor_obj.derampMethod = 0
-        elif deramp == "linear":
+        elif deramp == "complex":
             ampcor_obj.derampMethod = 1
         else: # skip deramping
             ampcor_obj.derampMethod = 2
diff --git a/python/packages/nisar/workflows/offsets_product.py b/python/packages/nisar/workflows/offsets_product.py
index ed5ef8358..724517b35 100644
--- a/python/packages/nisar/workflows/offsets_product.py
+++ b/python/packages/nisar/workflows/offsets_product.py
@@ -256,8 +256,25 @@ def set_ampcor_params(cfg, ampcor_obj):
     ampcor_obj.algorithm = 0 if cfg['cross_correlation_domain'] == \
                                 'frequency' else 1
     ampcor_obj.rawDataOversamplingFactor = cfg['slc_oversampling_factor']
-    ampcor_obj.derampMethod = 0 if cfg['deramping_method'] == \
-                                   'magnitude' else 1
+
+    if cfg['deramping_method'] is not None:
+        deramp = cfg['deramping_method']
+        if deramp == "magnitude":
+            ampcor_obj.derampMethod = 0
+        elif deramp == "complex":
+            ampcor_obj.derampMethod = 1
+        else: # skip deramping
+            ampcor_obj.derampMethod = 2
+
+    if cfg['deramping_axis'] is not None:
+        deramp_axis = cfg['deramping_axis']
+        if deramp_axis == "azimuth":
+            ampcor_obj.derampAxis = 0
+        elif deramp_axis == "range":
+            ampcor_obj.derampAxis = 1
+        else: # both directions
+            ampcor_obj.derampAxis = 2
+
     ampcor_obj.corrStatWindowSize = cfg['correlation_statistics_zoom']
     ampcor_obj.corrSurfaceZoomInWindow = cfg['correlation_surface_zoom']
     ampcor_obj.corrSurfaceOverSamplingFactor = cfg[
diff --git a/share/nisar/defaults/insar.yaml b/share/nisar/defaults/insar.yaml
index 9280e7604..815987953 100644
--- a/share/nisar/defaults/insar.yaml
+++ b/share/nisar/defaults/insar.yaml
@@ -410,6 +410,8 @@ runconfig:
                 slc_oversampling_factor: 2
                 # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
                 deramping_method: 'complex'
+                # Deramp data axis prior to FFT: range, azimuth, or both
+                deramping_axis: 'azimuth'
                 # Flag to use constant range/azimuth offsets in dense offsets estimation
                 use_gross_offsets: True
                 # Constant offset along slant range to guide dense offset estimation
diff --git a/share/nisar/schemas/insar.yaml b/share/nisar/schemas/insar.yaml
index 999dc97ad..c8455d154 100644
--- a/share/nisar/schemas/insar.yaml
+++ b/share/nisar/schemas/insar.yaml
@@ -332,6 +332,9 @@ dense_offsets_options:
     # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
     deramping_method: enum('magnitude', 'complex', required=False)
 
+    # Deramp data axis prior to FFT: range, azimuth, or both
+    deramping_axis: enum('range', 'azimuth', 'both', required=False)
+
     # Flag to use constant range/azimuth offsets in dense offsets estimation
     use_gross_offsets: bool(required=False)
 

From 8bc8712e270e60f0c79b86a887f3d3c4546da502 Mon Sep 17 00:00:00 2001
From: Xiaodong Huang <xhuang@nisar-adt-dev-5.jpl.nasa.gov>
Date: Wed, 17 Dec 2025 19:57:09 +0000
Subject: [PATCH 06/13] add the complex_no_deramp option

---
 share/nisar/defaults/insar.yaml |  6 ++++--
 share/nisar/schemas/insar.yaml  | 11 +++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/share/nisar/defaults/insar.yaml b/share/nisar/defaults/insar.yaml
index 815987953..a4ef7a5ef 100644
--- a/share/nisar/defaults/insar.yaml
+++ b/share/nisar/defaults/insar.yaml
@@ -408,7 +408,7 @@ runconfig:
                 # Anti-aliasing oversampling factor to apply to reference/secondary SLC
                 # prior to cross-correlation computation
                 slc_oversampling_factor: 2
-                # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
+                # Deramp data prior to FFT: magnitude , complex (linear phase ramp), or complex without deramp
                 deramping_method: 'complex'
                 # Deramp data axis prior to FFT: range, azimuth, or both
                 deramping_axis: 'azimuth'
@@ -461,8 +461,10 @@ runconfig:
                 start_pixel_azimuth:
                 # Cross-correlation domain
                 cross_correlation_domain: 'frequency'
-                # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
+                # Deramp data prior to FFT: magnitude , complex (linear phase ramp), or complex without deramp
                 deramping_method: 'complex'
+                # Deramp data axis prior to FFT: range, azimuth, or both
+                deramping_axis: 'azimuth'
                 # Anti-aliasing oversampling factor to apply to reference/secondary SLC
                 # prior to cross-correlation computation
                 slc_oversampling_factor: 2
diff --git a/share/nisar/schemas/insar.yaml b/share/nisar/schemas/insar.yaml
index c8455d154..6381ccdc7 100644
--- a/share/nisar/schemas/insar.yaml
+++ b/share/nisar/schemas/insar.yaml
@@ -329,8 +329,8 @@ dense_offsets_options:
     # prior to cross-correlation computation
     slc_oversampling_factor: int(min=2, max=5, required=False)
 
-    # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
-    deramping_method: enum('magnitude', 'complex', required=False)
+    # Deramp data prior to FFT: magnitude , complex (linear phase ramp), or complex without deramp
+    deramping_method: enum('magnitude', 'complex', 'complex_no_deramp', required=False)
 
     # Deramp data axis prior to FFT: range, azimuth, or both
     deramping_axis: enum('range', 'azimuth', 'both', required=False)
@@ -412,8 +412,11 @@ offsets_product_options:
     # prior to cross-correlation computation
     slc_oversampling_factor: int(min=2, max=5, required=False)
 
-    # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
-    deramping_method: enum('magnitude', 'complex', required=False)
+    # Deramp data prior to FFT: magnitude , complex (linear phase ramp), or complex without deramp
+    deramping_method: enum('magnitude', 'complex', 'complex_no_deramp', required=False)
+
+    # Deramp data axis prior to FFT: range, azimuth, or both
+    deramping_axis: enum('range', 'azimuth', 'both', required=False)
 
     # Flag to use constant range/azimuth offsets in dense offsets estimation
     use_gross_offsets: bool(required=False)

From e84ec704dc1c2bba0edfdf10ddd8b46ea921a92d Mon Sep 17 00:00:00 2001
From: Lijun Zhu <38738201+lijun99@users.noreply.github.com>
Date: Wed, 14 Jan 2026 16:28:36 -0800
Subject: [PATCH 07/13] Apply suggestions from code review

from @bhawkins :
  - commenting and value checks on deramp_method and deramp_axis
  - use explicit sincos for potential performance boost

Co-authored-by: Brian Hawkins <1729052+bhawkins@users.noreply.github.com>
---
 cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h | 3 +++
 cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu    | 8 +++++---
 python/packages/nisar/workflows/offsets_product.py     | 4 +++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
index 59be4cbc0..3339d82b0 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
@@ -46,6 +46,9 @@ void cuArraysC2R(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t
 void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);
 
 // cuDeramp.cu: deramping phase
+// `cuDeramp` calls a deramp implementation (or does nothing) based on the value of `method`:
+//  `method=1` for cuLinearDeramp, any other value for no-op
+// `cuLinearDeramp` estimates the phase gradient over the chip and removes it.
 void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream);
 void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream);
 
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
index 786b9d4dd..81d5d2733 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
@@ -120,8 +120,8 @@ __global__ void cuLinearDeramp_kernel(float2 *images, const int imageNX, int con
         pixelIdxY = i % imageNY;
         // use double to improve accuracy
         double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
-        double phase_cos = cos(phase);
-        double phase_sin = sin(phase);
+        double phase_sin, phase_cos;
+        sincos(phase, &phase_sin, &phase_cos);
         image[i] = make_float2(
             image[i].x*phase_cos - image[i].y*phase_sin,
             image[i].x*phase_sin + image[i].y*phase_cos);
@@ -137,7 +137,9 @@ __global__ void cuLinearDeramp_kernel(float2 *images, const int imageNX, int con
  */
 void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream)
 {
-
+    if ((axis < 0) or (axis > 2)) {
+        throw std::invalid_argument("deramp axis must be 0, 1, or 2");
+    }
     const dim3 grid(images->count);
     const int imageSize = images->width*images->height;
     const float invSize = 1.0f/imageSize;
diff --git a/python/packages/nisar/workflows/offsets_product.py b/python/packages/nisar/workflows/offsets_product.py
index 724517b35..5b29ac94e 100644
--- a/python/packages/nisar/workflows/offsets_product.py
+++ b/python/packages/nisar/workflows/offsets_product.py
@@ -272,8 +272,10 @@ def set_ampcor_params(cfg, ampcor_obj):
             ampcor_obj.derampAxis = 0
         elif deramp_axis == "range":
             ampcor_obj.derampAxis = 1
-        else: # both directions
+        elif deramp_axis == "both":
             ampcor_obj.derampAxis = 2
+        else:
+            raise ValueError(f"invalid {deramp_axis=}")
 
     ampcor_obj.corrStatWindowSize = cfg['correlation_statistics_zoom']
     ampcor_obj.corrSurfaceZoomInWindow = cfg['correlation_surface_zoom']

From 75b7f18266073e8fcb0bb7889ec6156a52344356 Mon Sep 17 00:00:00 2001
From: Lijun Zhu <ljzhu@gps.caltech.edu>
Date: Wed, 14 Jan 2026 16:34:45 -0800
Subject: [PATCH 08/13] use explicit sincos for ampcor cpu module as well

---
 cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
index 6924818a2..0a993d1f4 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
@@ -63,8 +63,8 @@ static void cuLinearDeramp_kernel(float2 *images, const int imageNX, const int i
             const int pixelIdxX = i / imageNY;
             const int pixelIdxY = i % imageNY;
             double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
-            double phase_cos = cos(phase);
-            double phase_sin = sin(phase);
+            double phase_sin, phase_cos;
+            sincos(phase, &phase_sin, &phase_cos);
             image[i] = make_float2(
                 image[i].x*phase_cos - image[i].y*phase_sin,
                 image[i].x*phase_sin + image[i].y*phase_cos);

From ed931b504a50fbe150bce8768ee33f99548b7748 Mon Sep 17 00:00:00 2001
From: Lijun Zhu <ljzhu@gps.caltech.edu>
Date: Wed, 14 Jan 2026 16:48:50 -0800
Subject: [PATCH 09/13] add comments on deramp_method in runconfig

---
 share/nisar/defaults/insar.yaml | 4 ++--
 share/nisar/schemas/insar.yaml  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/share/nisar/defaults/insar.yaml b/share/nisar/defaults/insar.yaml
index a4ef7a5ef..1072ccf3e 100644
--- a/share/nisar/defaults/insar.yaml
+++ b/share/nisar/defaults/insar.yaml
@@ -408,7 +408,7 @@ runconfig:
                 # Anti-aliasing oversampling factor to apply to reference/secondary SLC
                 # prior to cross-correlation computation
                 slc_oversampling_factor: 2
-                # Deramp data prior to FFT: magnitude , complex (linear phase ramp), or complex without deramp
+                # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp'
                 deramping_method: 'complex'
                 # Deramp data axis prior to FFT: range, azimuth, or both
                 deramping_axis: 'azimuth'
@@ -461,7 +461,7 @@ runconfig:
                 start_pixel_azimuth:
                 # Cross-correlation domain
                 cross_correlation_domain: 'frequency'
-                # Deramp data prior to FFT: magnitude , complex (linear phase ramp), or complex without deramp
+                # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp'
                 deramping_method: 'complex'
                 # Deramp data axis prior to FFT: range, azimuth, or both
                 deramping_axis: 'azimuth'
diff --git a/share/nisar/schemas/insar.yaml b/share/nisar/schemas/insar.yaml
index 6381ccdc7..c9e4439e9 100644
--- a/share/nisar/schemas/insar.yaml
+++ b/share/nisar/schemas/insar.yaml
@@ -329,7 +329,7 @@ dense_offsets_options:
     # prior to cross-correlation computation
     slc_oversampling_factor: int(min=2, max=5, required=False)
 
-    # Deramp data prior to FFT: magnitude , complex (linear phase ramp), or complex without deramp
+    # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp'
     deramping_method: enum('magnitude', 'complex', 'complex_no_deramp', required=False)
 
     # Deramp data axis prior to FFT: range, azimuth, or both
@@ -412,7 +412,7 @@ offsets_product_options:
     # prior to cross-correlation computation
     slc_oversampling_factor: int(min=2, max=5, required=False)
 
-    # Deramp data prior to FFT: magnitude , complex (linear phase ramp), or complex without deramp
+    # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp'
     deramping_method: enum('magnitude', 'complex', 'complex_no_deramp', required=False)
 
     # Deramp data axis prior to FFT: range, azimuth, or both

From aa8d2bb7d0aa17697dbcbab4f7ee3d05879c1f43 Mon Sep 17 00:00:00 2001
From: Lijun Zhu <ljzhu@gps.caltech.edu>
Date: Wed, 14 Jan 2026 17:07:01 -0800
Subject: [PATCH 10/13] add math.h for sincos

---
 cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
index 0a993d1f4..a5dc67808 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
@@ -19,6 +19,7 @@
 #include <iostream>
 #include <iomanip>
 #include <cmath>
+#include <math.h>
 #include <limits>
 
 namespace isce3::matchtemplate::pycuampcor {

From 1abc35ecd8786e019c4c7589bbe759651a650e11 Mon Sep 17 00:00:00 2001
From: Lijun Zhu <ljzhu@gps.caltech.edu>
Date: Thu, 15 Jan 2026 13:18:41 -0800
Subject: [PATCH 11/13] update ampcor test golden data and adjust the tolerance
 for correlation peak

---
 .../ovs128-rho0.8/golden/correlation_peak     | Bin 624 -> 624 bytes
 .../ovs128-rho0.8/golden/dense_offsets        | Bin 1248 -> 1248 bytes
 .../isce3/matchtemplate/test_ampcor.py        |   4 +++-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak
index 2d13833a8d0b5a7c09a3958e194580c245fc9d4c..30107f3642f7f6f77fee81f937271c7c0f5c336b 100644
GIT binary patch
literal 624
zcmWNOZAepL0EHv$!-CxA?!9~8d+#<UNEVrqMPK+N7TAwMlMFS4EJDpPEFvk>a)^sa
z%?e5?5KAdB&BExz3^V&tfsmP@4$@!|64}BavhM$LIOjY^?$kM#>85VTB=U=(<DXz-
zjls599pjIa1yyz~%r%&E)Whlt$@eh>N1mj+#lhn!8*jcUMEE5qlO<Kzf)jr=TCb~I
z3wj9l_;@;|ayB4HOYkwh+{=p|okW{SzOHifm_{&UP<lnfHsm6GM(0_i!jf7C1HB3}
zsZMUMRM=eOqEkvfxD346lKG<slPMBklZskyW04^^vrqE-wTi1=uq;wy59>6Pxfu)!
zJ|0o2eV|ZZ9Yy69jpus=HEx}f!*0T}Ch5I`vh@au#gajrWU5rB^PQJqfk{EXp#Pi6
zqaTurg$Cn|lHqAd$5w-pRVHgv1iABc`Yd0Y)R+huY>M;{(IwgcO49yT(2B|XwO&T^
z4Net$84p-24{0=&IBCp_Wi}kcu%%7*D{P2Uh`y^4Xcn|PZIq-$;b~FWy+Wh-yd<yB
z$~vy`GD4v-Lr@oIvfUrU(jqrmF$SM=ti2Zn{z)%|%^tcce5e@)w<=7UPis6mDA=)A
z;ZCkfW|E+-TjAj@lS>KKr}fdy+%uT6eD|$B@#_@Q1{FkxO3p7yDAmQtr)ZLJQGQv{
sQ|Y2%R5D;OD!O5P5VBLf*v52)gR5aXvGE!w4+yeCZnA$W^p`pK58;5+t^fc4

literal 624
zcmWN_dn^=i0KoBHnvlpNne~UQx{0PTS>yY&T9KNXn)imxA8n0Z^GCBX{Si%Vnbl@7
zkA}F+*m+(O?%WYa?zlVe$Mv{8dT=34eg6DhFzaaOwiQkeqd0g^;}>YezeCT=u4Ymb
z6x6Lxv0gTUo8K03vs+8RF+E9UGj_W-X^-9`BvLC;d&N!UY#+iy_JzpNX6h^IQJq&&
z?k*?7p^1ynErd<JM;&FMs7y;usD}8xMp8;te2cfRTH!3bTL&plTxVm*N+>%G9PjI)
zQ>Wp3@G#e=28c*g5>TS0A<0Cgmy1|=YAbwFS4gy$<E8#VUbT+Qs1{1fCn$*6WU;HC
z(1Jh65=QV%ouR?rMR-^qvQMpKEU1msIX%?tm&y3>3zNjiql973MGa#~86YdiMf9CC
z@Nr|5z_Df$-_BwkU?N@8L2_d)eTVl8YuhE<GrNgR(NLN-M^N|@r=)rux7mrK`IWfG
z%##>AOMQGV)j>vTrp9^J+)sAl0&*tNp0OACh9){w%ej}UAVcpWLf(|HsL`?zr{HRf
zn(SLsI2Rjn%dO+U+#u6#bKI5vzdbsnrR|7A3xquw=Z24gOPR~8MfDQvx=KjC9?y6E
z*vm(GAzLDApcV586PuSO@J&&2driShk59<+mSJ`tn<qNb?<?{6+>4~Hhob1MqBh)w
z=c^W?6>WIN4Y7D<kR4|Ygf2UY!NOu<`g?htro#ODD>kwYc1qP;mo@U&u?GJmoxE0U
V5;^>ns`6>74r;J=9V1!V#Xo!IF`obc

diff --git a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets
index 9b1a3d0ad363943e0cadcd30fa2c1f10dde6ce40..b2398d3f347741b35d307677dce36aebdc69d6f5 100644
GIT binary patch
literal 1248
zcmZ9Kfo;Pu3<Ncb2T_2I(osB^3uv_7y-CFh0p`j7C#f_YV_ajL<oF}OWj*iOen;0}
z{MC28W;tKzT|>D~&r$omv%P9~e;&89Zy%*Vk00+~_YQh@cerO%?~d$gu261~M(aU|
z;>B?GRWEDa=#6N;LycaHPU-hhH|1J)c07D^w>EEH&D8%f2fe<yv%GnaGnM{roHY*a
z`4%z$`NrG9CtO0T+WQ~D{J7q~kG0<3X@9(KkF$$jP4s<w^9E3_ni&mUP4DKdXU&7{
zZC!sttl7qRnMu6(bl#p^7f9+!jc&bfIi$xB&H1S5SMg%@K6;#=`|ExCHu7(CqEYjf
zXVtwoI91+v{hrRm+dH0|=9&q*(~xrxT;Q^=I+`8>?o_MY2e@xtRfFpC?utGi8uFqa
oX#QX5$qhLo`>DgD|FA3f^gS}V{^qLAZs*BwzccaoPVl_`0YOCJUH||9

literal 1248
zcmY+Bfo;P;3<EWa2XTOm;!%E-jL-p6z@uc8-qBK$3J|ELNd7yiT6?8Sd{)(O)yaGp
zId7o5`kLi!%f0ifb(6Q;+xsV1E_%4^^xygp=6E>gzSe{zTU_oy>vRu)dIsgqj*eY;
zXx}qB_hyeS8qU!?KWO&&_3D}2;H~Nb@7wjN)pNjkF3!&SQk>%h-8oNXCTD8X=YAii
z^PBU|lQ}*)%Guv$yyAnFS9|E$5i%#YcO}BRO9p$-(cn4Q-@7Mkw4&KR==6BM`jp7d
z{mYqlUbJ%>nA+s1?DtD+et0~3Eip5=^neF=r#<_QybJW3QzE<ZsjcP;ogUx5p4a~w
z9o*KMO+9_kYEsSVEjZEKLG_jy<4LZX&hXA~=uZ5TRh{Sg&8cbMyv2O)RG<6C_l@r8
zc=wO){VusFv#4Y`^jkN1bY?8tz3Bd5p-C<n-R;^&s~+B-yU)>09-3&ina$a|;uEQb
QXYj3+(CZF7Y5l?a1IDTD`v3p{

diff --git a/tests/python/packages/isce3/matchtemplate/test_ampcor.py b/tests/python/packages/isce3/matchtemplate/test_ampcor.py
index ea01ff724..09e75fd2b 100644
--- a/tests/python/packages/isce3/matchtemplate/test_ampcor.py
+++ b/tests/python/packages/isce3/matchtemplate/test_ampcor.py
@@ -83,6 +83,7 @@ def test_ampcor():
             ampcor.algorithm = 0  # frequency
             ampcor.corrSurfaceOverSamplingMethod = ovs
             ampcor.derampMethod = 1
+            ampcor.derampAxis = 0
 
             ampcor.corrStatWindowSize = 21
             ampcor.corrSurfaceZoomInWindow = 8
@@ -159,7 +160,8 @@ def test_ampcor():
                     meantol = 2e-2
                     tol = 1e-1
                 elif fname == "correlation_peak":
-                    meantol = 1e-2
+                    meantol = 2e-2
+                    tol = 5e-2
                 else:
                     meantol = 1 / 64 / 5
                     tol = 1 / 64

From b44b6fd715a1523017eea15e9fe4ebaf38fcc179 Mon Sep 17 00:00:00 2001
From: Lijun Zhu <ljzhu@gps.caltech.edu>
Date: Thu, 15 Jan 2026 14:21:52 -0800
Subject: [PATCH 12/13] fix sincos for macOS

---
 cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
index a5dc67808..d041d8fae 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
@@ -19,7 +19,6 @@
 #include <iostream>
 #include <iomanip>
 #include <cmath>
-#include <math.h>
 #include <limits>
 
 namespace isce3::matchtemplate::pycuampcor {
@@ -65,7 +64,11 @@ static void cuLinearDeramp_kernel(float2 *images, const int imageNX, const int i
             const int pixelIdxY = i % imageNY;
             double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
             double phase_sin, phase_cos;
+#if defined(__APPLE__) || defined(__MACOSX__)
+            __sincos(phase, &phase_sin, &phase_cos);
+#else
             sincos(phase, &phase_sin, &phase_cos);
+#endif
             image[i] = make_float2(
                 image[i].x*phase_cos - image[i].y*phase_sin,
                 image[i].x*phase_sin + image[i].y*phase_cos);

From a6b0e9b575ca4bc1d2b3506c7812d37aecda4763 Mon Sep 17 00:00:00 2001
From: Lijun Zhu <ljzhu@gps.caltech.edu>
Date: Fri, 16 Jan 2026 14:18:25 -0800
Subject: [PATCH 13/13] revert sincos in the cpu module to separate calls
 (optimize by -ffast-math)

---
 cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
index d041d8fae..6924818a2 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
@@ -63,12 +63,8 @@ static void cuLinearDeramp_kernel(float2 *images, const int imageNX, const int i
             const int pixelIdxX = i / imageNY;
             const int pixelIdxY = i % imageNY;
             double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
-            double phase_sin, phase_cos;
-#if defined(__APPLE__) || defined(__MACOSX__)
-            __sincos(phase, &phase_sin, &phase_cos);
-#else
-            sincos(phase, &phase_sin, &phase_cos);
-#endif
+            double phase_cos = cos(phase);
+            double phase_sin = sin(phase);
             image[i] = make_float2(
                 image[i].x*phase_cos - image[i].y*phase_sin,
                 image[i].x*phase_sin + image[i].y*phase_cos);