Increment the lambda step index on the GPU (it is faster, although I originally wanted to implement it differently)

kocimil1 · kocimil1 · commit 9caf8bc4cf6d · 2026-02-23T15:35:57.000+01:00
diff --git a/cpp/common/molecular/MolWorld_sp3_multi.h b/cpp/common/molecular/MolWorld_sp3_multi.h
@@ -393,8 +393,12 @@ void TI_step(double lambda, double dE, double sigma, double dLambda, int nMDstep
                         if(si_count < nCVs){
                             Quat4f acon  = Quat4f{initial_positions[si_count].x, initial_positions[si_count].y, initial_positions[si_count].z, JEforceconst}; 
                             Quat4f aconK = Quat4f{final_positions[si_count].x,   final_positions[si_count].y,   final_positions[si_count].z,   0.0f}; 
+                            if(si_count == 0){ // for incrementing of lambda step index in the kernel
+                                aconK.w = 1.0f;
+                            }                            
                             constr [isys*ocl.nAtoms + ia] = acon;
                             constrK[isys*ocl.nAtoms + ia] = aconK;
+
                         }
                         si_count++;
                     }
@@ -406,7 +410,7 @@ void TI_step(double lambda, double dE, double sigma, double dLambda, int nMDstep
             double beta = 1.0 / (const_kB * go.T_target);
             double dLambda = 1.0 / (double)(nLambda - 1);
             nPerVFs = nPerVFs_;
-            int nBatches = nMDsteps / (nLambda * nPerVFs * nSystems);
+            int nBatches = nMDsteps / (nLambda * nSystems);
             if( nBatches < 1 ) nBatches = 1;
             
             printf("  Running %d batches of %d pulling steps each...\n", nBatches, nLambda);
@@ -423,15 +427,16 @@ void TI_step(double lambda, double dE, double sigma, double dLambda, int nMDstep
                 run_ocl_opt( nEQsteps, Fconv );
 
                 // 2. Pulling
+                // nPerVFs = nLambda;
                 nPerVFs = nPerVFs_;
-                for(int isys=0; isys<nSystems; isys++){ jeParams[isys].x = 0; jeParams[isys].y = nLambda; jeParams[isys].z = nPerVFs; jeParams[isys].w = 0; }
+                for(int isys=0; isys<nSystems; isys++){ jeParams[isys].x = 0; jeParams[isys].y = nLambda; }
                 ocl.upload( ocl.ibuff_jeParams, jeParams );
 
                 for(int i=0; i<nSystems * nLambda; i++) gpu_work[i] = 0;
                 ocl.upload( ocl.ibuff_work, gpu_work );
 
-                printf("  Pulling for %d * %d = %d steps...\n", nLambda, nPerVFs, nLambda*nPerVFs);
-                run_ocl_opt( nLambda*nPerVFs, Fconv );
+                printf("  Pulling for %d steps...\n", nLambda);
+                run_ocl_opt( nLambda, Fconv );
 
                 // 3. Download work and accumulate
                 ocl.download( ocl.ibuff_work, gpu_work );
@@ -441,6 +446,9 @@ void TI_step(double lambda, double dE, double sigma, double dLambda, int nMDstep
                     double W_traj = 0;
                     for(int i=0; i<nLambda; i++){
                         float dW = gpu_work[ isys * nLambda + i ]*dLambda;
+                        // if(isys == 0 && i%1000 == 0){ 
+                        //     printf("  System %d, lambda %f, dW %f, cumulative W %f\n", isys, (float)i/(float)(nLambda-1), dW, W_traj + dW);
+                        // }
                         W_traj += (double)dW;
                         sum_exp_W[i] += exp(-beta * W_traj);
                     }
@@ -1173,10 +1181,6 @@ double evalVFs( double Fconv=1e-6 ){
             //     TDrive[isys].z = 0;
             // }
             // printf("evalVFs() TDrive[isys].z = %f\n", TDrive[isys].z);
-            if( jeParams && jeParams[isys].x >= 0 ){
-                jeParams[isys].x += 1;
-                jeParams[isys].w = 0;
-            }
             // printf("evalVFs() TDrive[isys].z = %f\n", TDrive[isys].z);
             TDrive[isys].w = randf(-1.0,1.0); 
         }else{
@@ -1189,7 +1193,6 @@ double evalVFs( double Fconv=1e-6 ){
     //printf( "MDpars{%g,%g,%g,%g}\n", MDpars[0].x,MDpars[0].y,MDpars[0].z,MDpars[0].w );
     err |= ocl.upload( ocl.ibuff_MDpars, MDpars );
     err |= ocl.upload( ocl.ibuff_TDrive, TDrive );
-    if( jeParams)err |= ocl.upload( ocl.ibuff_jeParams, jeParams );
     err |= ocl.upload( ocl.ibuff_cvf   , cvfs   );
     // //printf("MolWorld_sp3_multi::evalVFs() bGroupUpdate=%i \n", bGroupUpdate );
     // if(bGroupUpdate){
diff --git a/cpp/common_resources/cl/relax_multi.cl b/cpp/common_resources/cl/relax_multi.cl
@@ -342,8 +342,8 @@ __kernel void getMMFFf4(
             // --- Evaluate bond-length stretching energy and forces
             if(iG<ing){
                 // Bond stretching with proper MMFF parameters from bL[i] and bK[i]
-                E+= evalBond( h.xyz, l-bL[i], bK[i], &f1 );  fbs[i]-=f1;  fa+=f1;   // harmonic bond stretching, fa is force on center atom, fbs[i] is recoil force on i-th neighbor,
-                //E+= evalBond( h.xyz, l-1.198f, 40.f, &f1 );  fbs[i]-=f1;  fa+=f1;   // harmonic bond stretching, fa is force on center atom, fbs[i] is recoil force on i-th neighbor,
+                //E+= evalBond( h.xyz, l-bL[i], bK[i], &f1 );  fbs[i]-=f1;  fa+=f1;   // harmonic bond stretching, fa is force on center atom, fbs[i] is recoil force on i-th neighbor,
+                E+= evalBond( h.xyz, l-1.198f, 40.f, &f1 );  fbs[i]-=f1;  fa+=f1;   // harmonic bond stretching, fa is force on center atom, fbs[i] is recoil force on i-th neighbor,
 
                 // pi-pi alignment interaction            
                 float kpp = Kppi[i];
@@ -385,26 +385,26 @@ __kernel void getMMFFf4(
                 const int jnga = jng+i0a;
                 const float4 hj = hs[j];  
                       
-                E += evalAngleCosHalf( hi, hj, par.xy, par.z, &f1, &f2 );    // evaluate angular force and energy using cos(angle/2) formulation        
-                fa    -= f1+f2;
-
-                //if(bSubtractVdW)
-                { // Remove non-bonded interactions from atoms that are bonded to common neighbor
-                    float4 REQi=REQKs[inga];   // non-bonding parameters of i-th neighbor
-                    float4 REQj=REQKs[jnga];   // non-bonding parameters of j-th neighbor
-                    // combine non-bonding parameters of i-th and j-th neighbors using mixing rules
-                    float4 REQij;             
-                    REQij.x  = REQi.x  + REQj.x;
-                    REQij.yz = REQi.yz * REQj.yz; 
+                // E += evalAngleCosHalf( hi, hj, par.xy, par.z, &f1, &f2 );    // evaluate angular force and energy using cos(angle/2) formulation        
+                // fa    -= f1+f2;
+
+                // //if(bSubtractVdW)
+                // { // Remove non-bonded interactions from atoms that are bonded to common neighbor
+                //     float4 REQi=REQKs[inga];   // non-bonding parameters of i-th neighbor
+                //     float4 REQj=REQKs[jnga];   // non-bonding parameters of j-th neighbor
+                //     // combine non-bonding parameters of i-th and j-th neighbors using mixing rules
+                //     float4 REQij;             
+                //     REQij.x  = REQi.x  + REQj.x;
+                //     REQij.yz = REQi.yz * REQj.yz; 
                     
-                    float3 dp = (hj.xyz/hj.w) - (hi.xyz/hi.w);   // recover vector between i-th and j-th neighbors using stored vectos and inverse bond lengths, this should be faster than dp=apos[jngv].xyz-apos[ingv].xyz; from global memory
-                    float4 fij = getLJQH( dp, REQij, 1.0f );     // compute non-bonded interaction between i-th and j-th neighbors using Lennard-Jones and Coulomb interactions and Hydrogen bond correction
-                    f1 -=  fij.xyz;
-                    f2 +=  fij.xyz;
-                }
-
-                fbs[i]+= f1;
-                fbs[j]+= f2;
+                //     float3 dp = (hj.xyz/hj.w) - (hi.xyz/hi.w);   // recover vector between i-th and j-th neighbors using stored vectos and inverse bond lengths, this should be faster than dp=apos[jngv].xyz-apos[ingv].xyz; from global memory
+                //     float4 fij = getLJQH( dp, REQij, 1.0f );     // compute non-bonded interaction between i-th and j-th neighbors using Lennard-Jones and Coulomb interactions and Hydrogen bond correction
+                //     f1 -=  fij.xyz;
+                //     f2 +=  fij.xyz;
+                // }
+
+                // fbs[i]+= f1;
+                // fbs[j]+= f2;
             }
         }
 
@@ -1004,54 +1004,57 @@ __kernel void updateAtomsMMFFf4(
         // ------- constrains
         float4 cons = constr[ iaa ]; // constraints (x,y,z,K)
 
-        if( (cons.w > 0.f) && jeParams && (jeParams[iS].x > -1) ){
-            // Jarzynski equality 
+        if( (cons.w > 0.f) && jeParams ){
+            // Jarzynski equality or Setup Equilibration
             // We use standard "constr" for initial position and "constrK" for final position
             // But we use "cons.w" as stiffness
-            // jeParams are (iLambda, nLambda, nPerVF, iStep) for Jarzynski Equality
             
             float4 consEnd = constrK[ iaa ];
-            int nLambda    = jeParams[iS].y;
-            float k = cons.w;
-            
-            float lambda = (float)jeParams[iS].x/(float)(nLambda-1);
-            
-            float3 p0 = cons.xyz;
-            float3 p1 = consEnd.xyz;
-            
-            float3 target = p0 + (p1 - p0) * lambda;
-            
-            // Compute Force (Harmonic)
-            // Force on atom = k * (target - pe)
-            float3 fc = (target - pe.xyz) * (float3){k,k,k};
-            fe.xyz += fc;
-            
-            // Accumulate Work
-            // Work done ON system = integral of (dH/dLambda) dLambda
-            // H_spring = 0.5 * k * (x - x0(lambda))^2
-            // dH/dLambda = k * (x - x0) * (-dx0/dLambda)
-            //            = k * (x - x0) * -(p1 - p0)
-            //            = k * (x0 - x) * (p1 - p0)  = fc * (p1 - p0)
-            // So we accumulate dot(fc, dir).
-            
-            float3 dir = p1 - p0;
-            float work_term = dot(fc, dir);
-            if( (jeParams[iS].w >= jeParams[iS].z - 1) && (jeParams[iS].x >= 0) ){
-                // Record work at this step if buffer provided
-                {
-                    volatile __global float* addr = &work[ nLambda * iS + jeParams[iS].x ];
-                    float old_val, new_val;
-                    do {
-                        old_val = *addr;
-                        new_val = old_val + work_term;
-                    } while (atomic_cmpxchg((volatile __global int*)addr, as_int(old_val), as_int(new_val)) != as_int(old_val));
-
-
-                }                    
-            }
-            else if(iG==0){
-                jeParams[iS].w += 1;
-            }
+            int step       = jeParams[iS].x;
+            if (step > -1) {
+                int nLambda    = jeParams[iS].y;
+                float k = cons.w;
+                
+                // RESTRICTION: Ensure step does not overrun nLambda and cause OOB memory access
+                if (step < nLambda) {
+                    // Calculate lambda continuously
+                    float lambda = min(1.0f, (float)step / (float)((nLambda - 1)));
+                    
+                    float3 p0 = cons.xyz;
+                    float3 p1 = consEnd.xyz;
+                    
+                    float3 target = p0 + (p1 - p0) * lambda;
+                    
+                    // Compute Force (Harmonic)
+                    float3 fc = (target - pe.xyz) * (float3){k,k,k};
+                    fe.xyz += fc;
+                    
+                    // Accumulate Work
+                    float3 dir = p1 - p0;
+                    float work_term = dot(fc, dir);
+                    
+                    // Record work at this safely bounded step
+                    {
+                        volatile __global float* addr = &work[ nLambda * iS + step ];
+                        float old_val, new_val;
+                        do {
+                            old_val = *addr;
+                            new_val = old_val + work_term;
+                        } while (atomic_cmpxchg((volatile __global int*)addr, as_int(old_val), as_int(new_val)) != as_int(old_val));
+                    }
+                } // End of Restriction 
+                
+                if(consEnd.w>0.0f && step < nLambda){
+                    jeParams[iS].x += 1; // Increment step index for next step exactly once per system
+                }
+            } else if (step == -1) {
+                // Initial Equilibrium Step
+                float k = cons.w;
+
+                float3 target = cons.xyz;
+                float3 fc = (target - pe.xyz) * (float3){k,k,k};
+                fe.xyz += fc;
+            }  
             cons.w = 0.0f; // Disable standard logic
         }
 
diff --git a/examples/tFreeEnergy_multi/run_ES.sh b/examples/tFreeEnergy_multi/run_ES.sh
@@ -45,11 +45,11 @@ echo "Step 2: Running Free Energy Calculation (Mode: $MODE)..."
 echo "----------------------------------------"
 python3 run_ES.py \
     --mode $MODE \
-    --nSys 10 \
+    --nSys 20 \
     --xyz_name "../tMMFF/data/entropic_spring_$N.xyz" \
     --system_name "entropic_spring_$N" \
     --nLambda 100000 \
-    --nMDsteps 10000000 \
+    --nMDsteps 2000000 \
     --nEQsteps 50000 \
     --Fconv 1e-6 \
     --constraints "constraints_ES.txt" \
@@ -77,8 +77,8 @@ echo "  Completed successfully!"
 echo "=========================================="
 echo ""
 echo "Output files:"
-echo "  - entropic_spring_${N}_TI.dat (raw data)"
-echo "  - entropic_spring_${N}_TI_interactive.html (interactive plot)"
+echo "  - entropic_spring_${N}_free_energy.dat (raw data)"
+echo "  - entropic_spring_${N}_free_energy_interactive.html (interactive plot)"
 echo ""
-echo "To view the interactive plot, open entropic_spring_${N}_TI_interactive.html in a web browser"
+echo "To view the interactive plot, open entropic_spring_${N}_free_energy_interactive.html in a web browser"
 echo ""