diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu
index 8f35fa075..5825ad37d 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorChunk.cu
@@ -76,7 +76,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount, stream);
 
 #ifdef CUAMPCOR_DEBUG
-    r_maxval->outputToFile("r_maxval", stream);
+    r_maxval->outputToFile("r_corrBatchRawMaxVal", stream);
     r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn", stream);
     i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid", stream);
     r_corrBatchSum->outputToFile("r_corrBatchSum", stream);
@@ -107,9 +107,17 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     maxLocShift->outputToFile("i_maxLocShift", stream);
 #endif
 
+    // deramp reference
+    cuDeramp(param->derampMethod, c_referenceBatchRaw, param->derampAxis, stream);
+
+#ifdef CUAMPCOR_DEBUG
+    // dump the deramped reference image(s)
+    c_referenceBatchRaw->outputToFile("c_referenceBatchRawDeramped", stream);
+#endif
+
     // oversample reference
-    // (deramping included in oversampler)
-    referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod);
+    referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled);
+
     // take amplitudes
     cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled, stream);
 
@@ -127,15 +135,28 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean",stream);
 #endif
 
-    // extract secondary and oversample
+    // extract secondary images around the max location with a smaller search range
     cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit, stream);
-    secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod);
+
+#ifdef CUAMPCOR_DEBUG
+    // dump the extracted raw secondary image
+    c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInRaw", stream);
+#endif
+
+    // deramp secondary
+    cuDeramp(param->derampMethod, c_secondaryBatchZoomIn, param->derampAxis, stream);
+
+#ifdef CUAMPCOR_DEBUG
+    // dump the deramped secondary image(s)
+    c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInDeramped", stream);
+#endif
+
+    // oversample secondary
+    secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled);
     // take amplitudes
     cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled, stream);
 
 #ifdef CUAMPCOR_DEBUG
-    // dump the extracted raw secondary image
-    c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn", stream);
     // dump the oversampled secondary image(s)
     c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled", stream);
     r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled", stream);
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu
index 9acc6b294..6e058df0c 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.cu
@@ -22,7 +22,8 @@ cuAmpcorParameter::cuAmpcorParameter()
     algorithm = 0; //0 freq; 1 time
     deviceID = 0;
     nStreams = 1;
-    derampMethod = 1;
+    derampMethod = 1; // average deramp
+    derampAxis = 2; // both directions
 
     windowSizeWidthRaw = 64;
     windowSizeHeightRaw = 64;
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h
index 357f36ba9..6c5a01b49 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorParameter.h
@@ -44,6 +44,7 @@ class cuAmpcorParameter{
     int deviceID;       ///< Targeted GPU device ID: use -1 to auto select
     int nStreams;       ///< Number of streams to asynchonize data transfers and compute kernels
     int derampMethod;   ///< Method for deramping 0=None, 1=average
+    int derampAxis;     ///< Axis for deramping 0=down (azimuth) 1=across (range), 2=both axes
 
     // chip or window size for raw data
     int windowSizeHeightRaw;        ///< Template window height (original size)
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
index d56a01c67..3339d82b0 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuAmpcorUtil.h
@@ -46,8 +46,11 @@ void cuArraysC2R(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t
 void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2, cudaStream_t stream);
 
 // cuDeramp.cu: deramping phase
-void cuDeramp(int method, cuArrays<float2> *images, cudaStream_t stream);
-void cuDerampMethod1(cuArrays<float2> *images, cudaStream_t stream);
+// `cuDeramp` calls a deramp implementation (or does nothing) based on the value of `method`:
+//  `method=1` for cuLinearDeramp, any other value for no-op
+// `cuLinearDeramp` Estimates the phase gradient over the chip and removes it.
+void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream);
+void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream);
 
 // cuArraysPadding.cu: various utilities for oversampling padding
 void cuArraysPadding(cuArrays<float2> *image1, cuArrays<float2> *image2, cudaStream_t stream);
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
index cd46e7893..81d5d2733 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuDeramp.cu
@@ -11,9 +11,9 @@
  * Method 0 or else: skip deramping
  *
  */
- 
-#include "cuArrays.h" 
-#include "float2.h" 
+
+#include "cuArrays.h"
+#include "float2.h"
 #include <cfloat>
 #include "cudaError.h"
 #include "cudaUtil.h"
@@ -27,14 +27,14 @@
 // cuda does not have a good support on volatile vector struct, e.g. float2
 // have to use regular float type for shared memory (volatile) data
 // the following methods are defined to operate float2/complex objects through float
-inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block) 
+inline static __device__ void copyToShared(volatile float *s, const int i, const float2 x, const int block)
 { s[i] = x.x; s[i+block] = x.y; }
 
-inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block) 
+inline static __device__ void copyFromShared(float2 &x, volatile float *s, const int i, const int block)
 { x.x = s[i]; x.y = s[i+block]; }
 
 
-inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block) 
+inline static __device__ void addInShared(volatile float *s, const int i, const int j, const int block)
 { s[i] += s[i+j]; s[i+block] += s[i+j+block];}
 
 
@@ -45,72 +45,87 @@ __device__ void complexSumReduceBlock(float2& sum, volatile float *shmem)
     const int tid = threadIdx.x;
     copyToShared(shmem, tid, sum, nthreads);
     __syncthreads();
-    
+
     if (nthreads >=1024) { if (tid < 512) { addInShared(shmem, tid, 512, nthreads); } __syncthreads(); }
     if (nthreads >= 512) { if (tid < 256) { addInShared(shmem, tid, 256, nthreads); } __syncthreads(); }
     if (nthreads >= 256) { if (tid < 128) { addInShared(shmem, tid, 128, nthreads); } __syncthreads(); }
     if (nthreads >= 128) { if (tid <  64) { addInShared(shmem, tid,  64, nthreads); } __syncthreads(); }
     if (tid < 32)
-    {	
+    {
         addInShared(shmem, tid, 32, nthreads);
         addInShared(shmem, tid, 16, nthreads);
         addInShared(shmem, tid,  8, nthreads);
         addInShared(shmem, tid,  4, nthreads);
         addInShared(shmem, tid,  2, nthreads);
-        addInShared(shmem, tid,  1, nthreads); 
+        addInShared(shmem, tid,  1, nthreads);
     }
     __syncthreads();
     copyFromShared(sum, shmem, 0, nthreads);
 }
 
-// cuda kernel for cuDerampMethod1
+// cuda kernel for cuLinearDeramp with Method 1
 template<const int nthreads>
-__global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY, 
-    const int imageSize, const int nImages, const float normCoef)
+__global__ void cuLinearDeramp_kernel(float2 *images, const int imageNX, int const imageNY,
+    const int imageSize, const int nImages, const float normCoef, const int axis)
 {
     __shared__ float shmem[2*nthreads];
     int pixelIdx, pixelIdxX, pixelIdxY;
-    
-    const int bid = blockIdx.x;    
+
+    const int bid = blockIdx.x;
     if(bid >= nImages) return;
     float2 *image = images+ bid*imageSize;
-    const int tid = threadIdx.x;  
-    float2 phaseDiffY  = make_float2(0.0f, 0.0f);
-    for (int i = tid; i < imageSize; i += nthreads) {
-        pixelIdxY = i % imageNY;
-        if(pixelIdxY < imageNY -1) {
-            pixelIdx = i;
-            float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]);   
-            phaseDiffY += cprod;
-        } 
-    }       
-    complexSumReduceBlock<nthreads>(phaseDiffY, shmem);
-    //phaseDiffY *= normCoef;
-    float phaseY=atan2f(phaseDiffY.y, phaseDiffY.x);
-
-    float2 phaseDiffX  = make_float2(0.0f, 0.0f);
-    for (int i = tid; i < imageSize; i += nthreads)  {
-        pixelIdxX = i / imageNY; 
-        if(pixelIdxX < imageNX -1) {
-            pixelIdx = i;
-            float2 cprod = complexMulConj(image[i], image[i+imageNY]);
-            phaseDiffX += cprod;
+    const int tid = threadIdx.x;
+
+    // average phase ramp along row/range direction
+    double phaseY = 0.0;
+    if (axis != 0)
+    {
+        float2 phaseDiffY  = make_float2(0.0f, 0.0f);
+        for (int i = tid; i < imageSize; i += nthreads) {
+            pixelIdxY = i % imageNY;
+            if(pixelIdxY < imageNY -1) {
+                pixelIdx = i;
+                float2 cprod = complexMulConj( image[pixelIdx], image[pixelIdx+1]);
+                phaseDiffY += cprod;
+            }
+        }
+        complexSumReduceBlock<nthreads>(phaseDiffY, shmem);
+        //phaseDiffY *= normCoef;
+        phaseY=atan2(phaseDiffY.y, phaseDiffY.x);
+    }
+
+    // average phase ramp along column/azimuth direction
+    double phaseX = 0.0;
+    if (axis != 1)
+    {
+        float2 phaseDiffX  = make_float2(0.0f, 0.0f);
+        for (int i = tid; i < imageSize; i += nthreads)  {
+            pixelIdxX = i / imageNY;
+            if(pixelIdxX < imageNX -1) {
+                pixelIdx = i;
+                float2 cprod = complexMulConj(image[i], image[i+imageNY]);
+                phaseDiffX += cprod;
+            }
         }
-    }   
-    
-    complexSumReduceBlock<nthreads>(phaseDiffX, shmem);
-   
-    //phaseDiffX *= normCoef;
-    float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x);  //+FLT_EPSILON
-     
+
+        complexSumReduceBlock<nthreads>(phaseDiffX, shmem);
+
+        //phaseDiffX *= normCoef;
+        phaseX = atan2(phaseDiffX.y, phaseDiffX.x);  //+FLT_EPSILON
+    }
+    // deramp with the estimated phase ramps
     for (int i = tid; i < imageSize; i += nthreads)
-    { 
-        pixelIdxX = i%imageNY;
-        pixelIdxY = i/imageNY;
-        float phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
-        float2 phase_factor = make_float2(cosf(phase), sinf(phase));
-        image[i] *= phase_factor;
-    }     
+    {
+        pixelIdxX = i / imageNY;
+        pixelIdxY = i % imageNY;
+        // use double to improve accuracy
+        double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
+        double phase_sin, phase_cos;
+        sincos(phase, &phase_sin, &phase_cos);
+        image[i] = make_float2(
+            image[i].x*phase_cos - image[i].y*phase_sin,
+            image[i].x*phase_sin + image[i].y*phase_cos);
+    }
 }
 
 /**
@@ -120,38 +135,40 @@ __global__ void cuDerampMethod1_kernel(float2 *images, const int imageNX, int co
  * @param[in,out] images input/output complex signals
  * @param[in] stream cuda stream
  */
-void cuDerampMethod1(cuArrays<float2> *images, cudaStream_t stream)
+void cuLinearDeramp(cuArrays<float2> *images, const int axis, cudaStream_t stream)
 {
-    
+    if ((axis < 0) or (axis > 2)) {
+        throw std::invalid_argument("deramp axis must be 0, 1, or 2");
+    }
     const dim3 grid(images->count);
     const int imageSize = images->width*images->height;
     const float invSize = 1.0f/imageSize;
 
     if(imageSize <=64) {
-        cuDerampMethod1_kernel<64> <<<grid, 64, 0, stream>>>
-        (images->devData, images->height, images->width, 
-        imageSize, images->count, invSize); }
-     else if(imageSize <=128) {
-        cuDerampMethod1_kernel<128> <<<grid, 128, 0, stream>>>
-        (images->devData, images->height, images->width, 
-        imageSize, images->count, invSize); }   
-     else if(imageSize <=256) {
-        cuDerampMethod1_kernel<256> <<<grid, 256, 0, stream>>>
-        (images->devData, images->height, images->width, 
-        imageSize, images->count, invSize); }  
+        cuLinearDeramp_kernel<64> <<<grid, 64, 0, stream>>>
+        (images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis); }
+    else if(imageSize <=128) {
+        cuLinearDeramp_kernel<128> <<<grid, 128, 0, stream>>>
+        (images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis); }
+    else if(imageSize <=256) {
+        cuLinearDeramp_kernel<256> <<<grid, 256, 0, stream>>>
+        (images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis); }
     else  {
-        cuDerampMethod1_kernel<512> <<<grid, 512, 0, stream>>>
-        (images->devData, images->height, images->width, 
-        imageSize, images->count, invSize); }
-    getLastCudaError("cuDerampMethod1 kernel error\n");
+        cuLinearDeramp_kernel<512> <<<grid, 512, 0, stream>>>
+        (images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis); }
+    getLastCudaError("cuLinearDeramp kernel error\n");
 
 }
-        
-void cuDeramp(int method, cuArrays<float2> *images, cudaStream_t stream)
+
+void cuDeramp(const int method, cuArrays<float2> *images, const int axis, cudaStream_t stream)
 {
     switch(method) {
     case 1:
-        cuDerampMethod1(images, stream);
+        cuLinearDeramp(images, axis, stream);
         break;
     default:
         break;
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu
index 1b6ab6267..d60399023 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.cu
@@ -1,4 +1,4 @@
-/* 
+/*
  * @file cuOverSampler.cu
  * @brief Implementations of cuOverSamplerR2R (C2C) class
  */
@@ -22,17 +22,17 @@
  */
 cuOverSamplerC2C::cuOverSamplerC2C(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream_)
 {
-    
+
     int inNXp2 = inNX;
     int inNYp2 = inNY;
     int outNXp2 = outNX;
     int outNYp2 = outNY;
-    
+
     /* if expanded to 2^n
     int inNXp2 = nextpower2(inNX);
     int inNYp2 = nextpower2(inNY);
     int outNXp2 = inNXp2*outNX/inNX;
-    int outNYp2 = inNYp2*outNY/inNY; 
+    int outNYp2 = inNYp2*outNY/inNY;
     */
 
     // set up work arrays
@@ -67,25 +67,23 @@ void cuOverSamplerC2C::setStream(cudaStream_t stream_)
  * Execute fft oversampling
  * @param[in] imagesIn input batch of images
  * @param[out] imagesOut output batch of images
- * @param[in] method phase deramping method
  */
-void cuOverSamplerC2C::execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int method)
-{   
-    cuDeramp(method, imagesIn, stream);         
+void cuOverSamplerC2C::execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut)
+{
     cufft_Error(cufftExecC2C(forwardPlan, imagesIn->devData, workIn->devData, CUFFT_INVERSE ));
     cuArraysPaddingMany(workIn, workOut, stream);
     cufft_Error(cufftExecC2C(backwardPlan, workOut->devData, imagesOut->devData, CUFFT_FORWARD));
 }
 
 /// destructor
-cuOverSamplerC2C::~cuOverSamplerC2C() 
+cuOverSamplerC2C::~cuOverSamplerC2C()
 {
     // destroy fft handles
     cufft_Error(cufftDestroy(forwardPlan));
     cufft_Error(cufftDestroy(backwardPlan));
     // deallocate work arrays
     delete(workIn);
-    delete(workOut);	
+    delete(workOut);
 }
 
 // end of cuOverSamplerC2C
@@ -99,7 +97,7 @@ cuOverSamplerC2C::~cuOverSamplerC2C()
  */
 cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages, cudaStream_t stream)
 {
-    
+
     int inNXp2 = inNX;
     int inNYp2 = inNY;
     int outNXp2 = outNX;
@@ -144,14 +142,14 @@ void cuOverSamplerR2R::execute(cuArrays<float> *imagesIn, cuArrays<float> *image
     cufft_Error(cufftExecC2C(forwardPlan, workSizeIn->devData, workSizeIn->devData, CUFFT_INVERSE));
     cuArraysPaddingMany(workSizeIn, workSizeOut, stream);
     cufft_Error(cufftExecC2C(backwardPlan, workSizeOut->devData, workSizeOut->devData,CUFFT_FORWARD ));
-    cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream);	
+    cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0), stream);
 }
 
 /// destructor
-cuOverSamplerR2R::~cuOverSamplerR2R() 
+cuOverSamplerR2R::~cuOverSamplerR2R()
 {
     cufft_Error(cufftDestroy(forwardPlan));
-    cufft_Error(cufftDestroy(backwardPlan));	
+    cufft_Error(cufftDestroy(backwardPlan));
     workSizeIn->deallocate();
     workSizeOut->deallocate();
 }
diff --git a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h
index 9ddce96b2..a3154b613 100644
--- a/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h
+++ b/cxx/isce3/cuda/matchtemplate/pycuampcor/cuOverSampler.h
@@ -31,7 +31,7 @@ class cuOverSamplerC2C
      // set cuda stream
      void setStream(cudaStream_t stream_);
      // execute oversampling
-     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int deramp_method=0);
+     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut);
      // destructor
      ~cuOverSamplerC2C();
 };
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp
index 7e8b6fb98..8f5fb65ad 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorChunk.cpp
@@ -85,7 +85,7 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     cuArraysSumCorr(r_corrBatchRawZoomIn, i_corrBatchZoomInValid, r_corrBatchSum, i_corrBatchValidCount);
 
 #ifdef CUAMPCOR_DEBUG
-    r_maxval->outputToFile("r_maxval");
+    r_maxval->outputToFile("r_corrBatchRawMaxVal");
     r_corrBatchRawZoomIn->outputToFile("r_corrBatchRawStatZoomIn");
     i_corrBatchZoomInValid->outputToFile("i_corrBatchZoomInValid");
     r_corrBatchSum->outputToFile("r_corrBatchSum");
@@ -115,9 +115,16 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     maxLocShift->outputToFile("i_maxLocShift");
 #endif
 
+    // deramp reference
+    cuDeramp(param->derampMethod, c_referenceBatchRaw, param->derampAxis);
+
+#ifdef CUAMPCOR_DEBUG
+    // dump the deramped reference image(s)
+    c_referenceBatchRaw->outputToFile("c_referenceBatchRawDeramped");
+#endif
+
     // oversample reference
-    // (deramping included in oversampler)
-    referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled, param->derampMethod);
+    referenceBatchOverSampler->execute(c_referenceBatchRaw, c_referenceBatchOverSampled);
     // take amplitudes
     cuArraysAbs(c_referenceBatchOverSampled, r_referenceBatchOverSampled);
 
@@ -135,15 +142,28 @@ void cuAmpcorChunk::run(int idxDown_, int idxAcross_)
     r_referenceBatchOverSampled->outputToFile("r_referenceBatchOverSampledSubMean");
 #endif
 
-    // extract secondary and oversample
+    // extract secondary for smaller search window
     cuArraysCopyExtract(c_secondaryBatchRaw, c_secondaryBatchZoomIn, offsetInit);
-    secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled, param->derampMethod);
-    // take amplitudes
-    cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled);
 
 #ifdef CUAMPCOR_DEBUG
     // dump the extracted raw secondary image
     c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomIn");
+#endif
+
+    // deramp secondary
+    cuDeramp(param->derampMethod, c_secondaryBatchZoomIn, param->derampAxis);
+#ifdef CUAMPCOR_DEBUG
+    // dump the deramped secondary image(s)
+    c_secondaryBatchZoomIn->outputToFile("c_secondaryBatchZoomInDeramped");
+#endif
+
+    // oversample secondary
+    secondaryBatchOverSampler->execute(c_secondaryBatchZoomIn, c_secondaryBatchOverSampled);
+
+    // take amplitudes
+    cuArraysAbs(c_secondaryBatchOverSampled, r_secondaryBatchOverSampled);
+
+#ifdef CUAMPCOR_DEBUG
     // dump the oversampled secondary image(s)
     c_secondaryBatchOverSampled->outputToFile("c_secondaryBatchOverSampled");
     r_secondaryBatchOverSampled->outputToFile("r_secondaryBatchOverSampled");
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp
index e10e38fc1..b82313fc9 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.cpp
@@ -25,6 +25,7 @@ cuAmpcorParameter::cuAmpcorParameter()
     deviceID = 0;
     nStreams = 1;
     derampMethod = 1;
+    derampAxis = 2; // both directions
 
     windowSizeWidthRaw = 64;
     windowSizeHeightRaw = 64;
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h
index 5c9258d04..ce6d095bd 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorParameter.h
@@ -46,6 +46,7 @@ class cuAmpcorParameter{
     int deviceID;       ///< Targeted GPU device ID: use -1 to auto select
     int nStreams;       ///< Number of streams to asynchonize data transfers and compute kernels
     int derampMethod;   ///< Method for deramping 0=None, 1=average
+    int derampAxis;     ///< Axis for deramping 0=down (azimuth) 1=across (range), 2=both axes
 
     // chip or window size for raw data
     int windowSizeHeightRaw;        ///< Template window height (original size)
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h
index be318fc9a..7a1b6d373 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuAmpcorUtil.h
@@ -42,8 +42,8 @@ void cuArraysCopyPadded(cuArrays<T_in> *imageIn, cuArrays<T_out> *imageOut);
 void cuArraysAbs(cuArrays<float2> *image1, cuArrays<float> *image2);
 
 // cuDeramp.cu: deramping phase
-void cuDeramp(int method, cuArrays<float2> *images);
-void cuDerampMethod1(cuArrays<float2> *images);
+void cuDeramp(const int method, cuArrays<float2> *images, const int axis);
+void cuLinearDeramp(cuArrays<float2> *images, const int axis);
 
 // cuArraysPadding.cu: various utilities for oversampling padding
 void cuArraysPaddingMany(cuArrays<float2> *image1, cuArrays<float2> *image2);
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
index a1d015395..6924818a2 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuDeramp.cpp
@@ -23,40 +23,51 @@
 
 namespace isce3::matchtemplate::pycuampcor {
 
-// kernel for cuDerampMethod1
-static void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const imageNY,
-    const int imageSize, const int nImages, const float normCoef)
+// kernel for linear deramping
+static void cuLinearDeramp_kernel(float2 *images, const int imageNX, const int imageNY,
+    const int imageSize, const int nImages, const float normCoef, const int axis)
 {
     for (int k = 0; k < nImages; k++) {
 
         float2* image = images + k * imageSize;
 
-        double2 phaseDiffY = make_double2(0.0, 0.0);
-        for (int j = 0; j < imageNX; j++) {
-            for (int i = 0; i < imageNY - 1; i++) {
-                const int pixelIdx = j * imageNY + i;
-                float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+1]);
-                phaseDiffY += cprod;
+        double phaseY = 0.0;
+        if(axis != 0)
+        {
+            double2 phaseDiffY = make_double2(0.0, 0.0);
+            for (int j = 0; j < imageNX; j++) {
+                for (int i = 0; i < imageNY - 1; i++) {
+                    const int pixelIdx = j * imageNY + i;
+                    float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+1]);
+                    phaseDiffY += cprod;
+                }
             }
+            phaseY = atan2(phaseDiffY.y, phaseDiffY.x);
         }
 
-        double2 phaseDiffX = make_double2(0.0, 0.0);
-        for (int j = 0; j < imageNX - 1; j++) {
-            for (int i = 0; i < imageNY; i++) {
-                const int pixelIdx = j * imageNY + i;
-                float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+imageNY]);
-                phaseDiffX += cprod;
+        double phaseX = 0.0;
+        if(axis != 1)
+        {
+            double2 phaseDiffX = make_double2(0.0, 0.0);
+            for (int j = 0; j < imageNX - 1; j++) {
+                for (int i = 0; i < imageNY; i++) {
+                    const int pixelIdx = j * imageNY + i;
+                    float2 cprod = complexMulConj(image[pixelIdx], image[pixelIdx+imageNY]);
+                    phaseDiffX += cprod;
+                }
             }
+            phaseX = atan2(phaseDiffX.y, phaseDiffX.x);
         }
-        float phaseX = atan2f(phaseDiffX.y, phaseDiffX.x);
-        float phaseY = atan2f(phaseDiffY.y, phaseDiffY.x);
 
         for (int i = 0; i < imageSize; i++) {
-            const int pixelIdxX = i%imageNY;
-            const int pixelIdxY = i/imageNY;
-            float phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
-            float2 phase_factor = make_float2(cosf(phase), sinf(phase));
-            image[i] *= phase_factor;
+            const int pixelIdxX = i / imageNY;
+            const int pixelIdxY = i % imageNY;
+            double phase = pixelIdxX*phaseX + pixelIdxY*phaseY;
+            double phase_cos = cos(phase);
+            double phase_sin = sin(phase);
+            image[i] = make_float2(
+                image[i].x*phase_cos - image[i].y*phase_sin,
+                image[i].x*phase_sin + image[i].y*phase_cos);
         }
     }
 }
@@ -67,20 +78,20 @@ static void cuDerampMethod1_kernel(float2 *images, const int imageNX, int const
  *   and the average phase shift is obtained as atan(\sum imag / \sum real).
  * @param[inout] images input/output complex signals
  */
-void cuDerampMethod1(cuArrays<float2> *images)
+void cuLinearDeramp(cuArrays<float2> *images, const int axis)
 {
     const int imageSize = images->width*images->height;
     const float invSize = 1.0f/imageSize;
 
-    cuDerampMethod1_kernel(images->devData, images->height, images->width,
-        imageSize, images->count, invSize);
+    cuLinearDeramp_kernel(images->devData, images->height, images->width,
+        imageSize, images->count, invSize, axis);
 }
 
-void cuDeramp(int method, cuArrays<float2> *images)
+void cuDeramp(const int method, cuArrays<float2> *images, const int axis)
 {
     switch(method) {
     case 1:
-        cuDerampMethod1(images);
+        cuLinearDeramp(images, axis);
         break;
     default:
         break;
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp
index 4b84e6aa6..e94de10ff 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.cpp
@@ -1,4 +1,4 @@
-/* 
+/*
  * @file cuOverSampler.cu
  * @brief Implementations of cuOverSamplerR2R (C2C) class
  */
@@ -23,17 +23,17 @@ cuOverSamplerC2C::cuOverSamplerC2C(
         cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut,
         int inNX, int inNY, int outNX, int outNY, int nImages)
 {
-    
+
     int inNXp2 = inNX;
     int inNYp2 = inNY;
     int outNXp2 = outNX;
     int outNYp2 = outNY;
-    
+
     /* if expanded to 2^n
     int inNXp2 = nextpower2(inNX);
     int inNYp2 = nextpower2(inNY);
     int outNXp2 = inNXp2*outNX/inNX;
-    int outNYp2 = inNYp2*outNY/inNY; 
+    int outNYp2 = inNYp2*outNY/inNY;
     */
 
     // set up work arrays
@@ -70,25 +70,23 @@ cuOverSamplerC2C::cuOverSamplerC2C(
  * Execute fft oversampling
  * @param[in] imagesIn input batch of images
  * @param[out] imagesOut output batch of images
- * @param[in] method phase deramping method
  */
-void cuOverSamplerC2C::execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int method)
-{   
-    cuDeramp(method, imagesIn);
+void cuOverSamplerC2C::execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut)
+{
     fftwf_execute(forwardPlan);
     cuArraysPaddingMany(workIn, workOut);
     fftwf_execute(backwardPlan);
 }
 
 /// destructor
-cuOverSamplerC2C::~cuOverSamplerC2C() 
+cuOverSamplerC2C::~cuOverSamplerC2C()
 {
     // destroy fft handles
     fftwf_destroy_plan(forwardPlan);
     fftwf_destroy_plan(backwardPlan);
     // deallocate work arrays
     delete(workIn);
-    delete(workOut);	
+    delete(workOut);
 }
 
 // end of cuOverSamplerC2C
@@ -101,7 +99,7 @@ cuOverSamplerC2C::~cuOverSamplerC2C()
  */
 cuOverSamplerR2R::cuOverSamplerR2R(int inNX, int inNY, int outNX, int outNY, int nImages)
 {
-    
+
     int inNXp2 = inNX;
     int inNYp2 = inNY;
     int outNXp2 = outNX;
@@ -151,11 +149,11 @@ void cuOverSamplerR2R::execute(cuArrays<float> *imagesIn, cuArrays<float> *image
     fftwf_execute(forwardPlan);
     cuArraysPaddingMany(workSizeIn, workSizeOut);
     fftwf_execute(backwardPlan);
-    cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0));	
+    cuArraysCopyExtract(workSizeOut, imagesOut, make_int2(0,0));
 }
 
 /// destructor
-cuOverSamplerR2R::~cuOverSamplerR2R() 
+cuOverSamplerR2R::~cuOverSamplerR2R()
 {
     fftwf_destroy_plan(forwardPlan);
     fftwf_destroy_plan(backwardPlan);
diff --git a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h
index 2b79a67b2..6a57b9509 100644
--- a/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h
+++ b/cxx/isce3/matchtemplate/pycuampcor/cuOverSampler.h
@@ -33,7 +33,7 @@ class cuOverSamplerC2C
      cuOverSamplerC2C(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut,
              int inNX, int inNY, int outNX, int outNY, int nImages);
      // execute oversampling
-     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut, int deramp_method=0);
+     void execute(cuArrays<float2> *imagesIn, cuArrays<float2> *imagesOut);
      // destructor
      ~cuOverSamplerC2C();
 };
diff --git a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp
index 7e884ba45..036376c56 100644
--- a/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp
+++ b/python/extensions/pybind_isce3/cuda/matchtemplate/pycuampcor.cpp
@@ -32,6 +32,7 @@ void addbinding_pycuampcor(pybind11::module& m)
         .DEF_PARAM(int, deviceID)
         .DEF_PARAM(int, nStreams)
         .DEF_PARAM(int, derampMethod)
+        .DEF_PARAM(int, derampAxis)
 
         .DEF_PARAM(str, referenceImageName)
         .DEF_PARAM(int, referenceImageHeight)
diff --git a/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp b/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp
index d4ff33b32..f783ae709 100644
--- a/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp
+++ b/python/extensions/pybind_isce3/matchtemplate/pycuampcor.cpp
@@ -32,6 +32,7 @@ void addbinding_pycuampcor_cpu(pybind11::module& m)
         .DEF_PARAM(int, deviceID)
         .DEF_PARAM(int, nStreams)
         .DEF_PARAM(int, derampMethod)
+        .DEF_PARAM(int, derampAxis)
 
         .DEF_PARAM(str, referenceImageName)
         .DEF_PARAM(int, referenceImageHeight)
diff --git a/python/packages/nisar/workflows/dense_offsets.py b/python/packages/nisar/workflows/dense_offsets.py
index e897b19cd..4a1feddd3 100644
--- a/python/packages/nisar/workflows/dense_offsets.py
+++ b/python/packages/nisar/workflows/dense_offsets.py
@@ -202,7 +202,21 @@ def set_optional_attributes(ampcor_obj, cfg, length, width):
 
     if cfg['deramping_method'] is not None:
         deramp = cfg['deramping_method']
-        ampcor_obj.derampMethod = 0 if deramp == "magnitude" else 1
+        if deramp == "magnitude":
+            ampcor_obj.derampMethod = 0
+        elif deramp == "complex":
+            ampcor_obj.derampMethod = 1
+        else: # skip deramping
+            ampcor_obj.derampMethod = 2
+
+    if cfg['deramping_axis'] is not None:
+        deramp_axis = cfg['deramping_axis']
+        if deramp_axis == "azimuth":
+            ampcor_obj.derampAxis = 0
+        elif deramp_axis == "range":
+            ampcor_obj.derampAxis = 1
+        else: # both directions
+            ampcor_obj.derampAxis = 2
 
     if cfg['correlation_statistics_zoom'] is not None:
         ampcor_obj.corrStatWindowSize = cfg['correlation_statistics_zoom']
diff --git a/python/packages/nisar/workflows/offsets_product.py b/python/packages/nisar/workflows/offsets_product.py
index ed5ef8358..5b29ac94e 100644
--- a/python/packages/nisar/workflows/offsets_product.py
+++ b/python/packages/nisar/workflows/offsets_product.py
@@ -256,8 +256,27 @@ def set_ampcor_params(cfg, ampcor_obj):
     ampcor_obj.algorithm = 0 if cfg['cross_correlation_domain'] == \
                                 'frequency' else 1
     ampcor_obj.rawDataOversamplingFactor = cfg['slc_oversampling_factor']
-    ampcor_obj.derampMethod = 0 if cfg['deramping_method'] == \
-                                   'magnitude' else 1
+
+    if cfg['deramping_method'] is not None:
+        deramp = cfg['deramping_method']
+        if deramp == "magnitude":
+            ampcor_obj.derampMethod = 0
+        elif deramp == "complex":
+            ampcor_obj.derampMethod = 1
+        else: # skip deramping
+            ampcor_obj.derampMethod = 2
+
+    if cfg['deramping_axis'] is not None:
+        deramp_axis = cfg['deramping_axis']
+        if deramp_axis == "azimuth":
+            ampcor_obj.derampAxis = 0
+        elif deramp_axis == "range":
+            ampcor_obj.derampAxis = 1
+        elif deramp_axis == "both":
+            ampcor_obj.derampAxis = 2
+        else:
+            raise ValueError(f"invalid {deramp_axis=}")
+
     ampcor_obj.corrStatWindowSize = cfg['correlation_statistics_zoom']
     ampcor_obj.corrSurfaceZoomInWindow = cfg['correlation_surface_zoom']
     ampcor_obj.corrSurfaceOverSamplingFactor = cfg[
diff --git a/share/nisar/defaults/insar.yaml b/share/nisar/defaults/insar.yaml
index 9280e7604..1072ccf3e 100644
--- a/share/nisar/defaults/insar.yaml
+++ b/share/nisar/defaults/insar.yaml
@@ -408,8 +408,10 @@ runconfig:
                 # Anti-aliasing oversampling factor to apply to reference/secondary SLC
                 # prior to cross-correlation computation
                 slc_oversampling_factor: 2
-                # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
+                # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp'
                 deramping_method: 'complex'
+                # Deramp data axis prior to FFT: range, azimuth, or both
+                deramping_axis: 'azimuth'
                 # Flag to use constant range/azimuth offsets in dense offsets estimation
                 use_gross_offsets: True
                 # Constant offset along slant range to guide dense offset estimation
@@ -459,8 +461,10 @@ runconfig:
                 start_pixel_azimuth:
                 # Cross-correlation domain
                 cross_correlation_domain: 'frequency'
-                # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
+                # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp'
                 deramping_method: 'complex'
+                # Deramp data axis prior to FFT: range, azimuth, or both
+                deramping_axis: 'azimuth'
                 # Anti-aliasing oversampling factor to apply to reference/secondary SLC
                 # prior to cross-correlation computation
                 slc_oversampling_factor: 2
diff --git a/share/nisar/schemas/insar.yaml b/share/nisar/schemas/insar.yaml
index 999dc97ad..c9e4439e9 100644
--- a/share/nisar/schemas/insar.yaml
+++ b/share/nisar/schemas/insar.yaml
@@ -329,8 +329,11 @@ dense_offsets_options:
     # prior to cross-correlation computation
     slc_oversampling_factor: int(min=2, max=5, required=False)
 
-    # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
-    deramping_method: enum('magnitude', 'complex', required=False)
+    # Deramp data prior to FFT: 'magnitude' , 'complex' (linear phase ramp), or 'complex_no_deramp'
+    deramping_method: enum('magnitude', 'complex', 'complex_no_deramp', required=False)
+
+    # Deramp data axis prior to FFT: range, azimuth, or both
+    deramping_axis: enum('range', 'azimuth', 'both', required=False)
 
     # Flag to use constant range/azimuth offsets in dense offsets estimation
     use_gross_offsets: bool(required=False)
@@ -409,8 +412,11 @@ offsets_product_options:
     # prior to cross-correlation computation
     slc_oversampling_factor: int(min=2, max=5, required=False)
 
-    # Deramp data prior to FFT: magnitude or complex (linear phase ramp)
-    deramping_method: enum('magnitude', 'complex', required=False)
+    # Deramp data prior to FFT: 'magnitude', 'complex' (linear phase ramp), or 'complex_no_deramp'
+    deramping_method: enum('magnitude', 'complex', 'complex_no_deramp', required=False)
+
+    # Deramp data axis prior to FFT: range, azimuth, or both
+    deramping_axis: enum('range', 'azimuth', 'both', required=False)
 
     # Flag to use constant range/azimuth offsets in dense offsets estimation
     use_gross_offsets: bool(required=False)
diff --git a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak
index f71dbd19c..30107f364 100644
Binary files a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak and b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/correlation_peak differ
diff --git a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets
index 93a7ba97f..b2398d3f3 100644
Binary files a/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets and b/tests/data/ampcor/accuracy-testdata/ovs128-rho0.8/golden/dense_offsets differ
diff --git a/tests/python/packages/isce3/matchtemplate/test_ampcor.py b/tests/python/packages/isce3/matchtemplate/test_ampcor.py
index ea01ff724..09e75fd2b 100644
--- a/tests/python/packages/isce3/matchtemplate/test_ampcor.py
+++ b/tests/python/packages/isce3/matchtemplate/test_ampcor.py
@@ -83,6 +83,7 @@ def test_ampcor():
             ampcor.algorithm = 0  # frequency
             ampcor.corrSurfaceOverSamplingMethod = ovs
             ampcor.derampMethod = 1
+            ampcor.derampAxis = 0
 
             ampcor.corrStatWindowSize = 21
             ampcor.corrSurfaceZoomInWindow = 8
@@ -159,7 +160,8 @@ def test_ampcor():
                     meantol = 2e-2
                     tol = 1e-1
                 elif fname == "correlation_peak":
-                    meantol = 1e-2
+                    meantol = 2e-2
+                    tol = 5e-2
                 else:
                     meantol = 1 / 64 / 5
                     tol = 1 / 64