Merge branch 'main' of https://github.com/ligerzero-ai/utils

Han Lin Mai · Han Lin Mai · commit 0fe481faddb6 · 2025-04-04T22:38:54.000+02:00
diff --git a/jobscript_templates/CustodianScripts/Static_Custodian_Setonix.sh b/jobscript_templates/CustodianScripts/Static_Custodian_Setonix.sh
@@ -6,8 +6,8 @@
 #SBATCH --time={WALLTIMESTRING}
 #SBATCH --partition=work
 #SBATCH --export=NONE
-#SBATCH --exclusive
-
+##SBATCH --exclusive
+#SBATCH --mem=32GB
 module load vasp/5.4.4
 cd "$PBS_O_WORKDIR"
 
diff --git a/jobscript_templates/jobfile-Setonix-StaticImage-DDEC6 b/jobscript_templates/jobfile-Setonix-StaticImage-DDEC6
@@ -27,9 +27,9 @@ handlers = [VaspErrorHandler(output_filename=output_filename), UnconvergedErrorH
 jobs = [VaspJob(sys.argv[1:], output_file=output_filename, suffix = "",
                 settings_override = [{"dict": "INCAR", "action": {"_set": {"NSW": 1, "LAECHG": True, "LCHARGE": True, "NELM": 500, "EDIFF": 1E-5}}}])]
 c = Custodian(handlers, jobs, max_errors=10)
-c.run()' > StaticImage-DDEC6-custodian.py
+c.run()' > custodian.py
 
-python StaticImage-DDEC6-custodian.py $run_cmd vasp_std &> vasp.log
+python custodian.py $run_cmd vasp_std &> vasp.log
 
 echo '<net charge>
 0.0 <-- specifies the net charge of the unit cell (defaults to 0.0 if nothing specified)
diff --git a/utils/vasp/resubmitter.py b/utils/vasp/resubmitter.py
@@ -3,12 +3,12 @@
 import tarfile
 import subprocess
 import pandas as pd
-import filecmp
+from tqdm import tqdm
+import glob
 
 from utils.vasp.database import find_vasp_directories, check_convergence
 from utils.generic import get_latest_file_iteration
 from utils.jobfile import jobfile
-from tqdm import tqdm
 
 def get_slurm_jobs_working_directories(username="hmai"):
     command = f'squeue -u {username} -o "%i %Z"'
@@ -78,7 +78,7 @@ def reconverge_all(
         # Prioritize directories without vasp.log
         dirs_to_check = dirs_without_log + dirs_with_log
         
-        for i, dir in enumerate(dirs_to_check):
+        for i, dir in enumerate(tqdm(dirs_to_check, desc="Reconverging Directories")):
             if not check_convergence(dir):
                 if i + len(running_queued_job_directories) > self.max_submissions:
                     leftover_calcs_exceeding_queue_limit.append(dir)
@@ -87,9 +87,6 @@ def reconverge_all(
                         dir, calc_type, HPC, VASP_version, CPU, walltime, cpu_per_node
                     )
                     dirs_to_search_next_time.append(dir)
-                    print(f"RERUNNING: {dir}")
-            else:
-                print(f"CONVERGED: {dir}")
 
         self.update_resubmit_log(
             dirs_to_search_next_time
@@ -99,16 +96,19 @@ def reconverge_all(
         return dirs_to_search_next_time
 
     def load_non_converged_paths(self, from_dataframe_path):
-        if from_dataframe_path:
-            df = pd.read_pickle(from_dataframe_path)
-            return [
-                (
-                    path.rstrip(os.sep + "OUTCAR")
-                    if path.endswith(os.sep + "OUTCAR")
-                    else path
-                )
-                for path in df["filepath"].tolist()
-            ]
+        if from_dataframe_path is not None:
+            try:
+                df = pd.read_pickle(from_dataframe_path)
+                return [
+                    (
+                        path.rstrip(os.sep + "OUTCAR")
+                        if path.endswith(os.sep + "OUTCAR")
+                        else path
+                    )
+                    for path in df["filepath"].tolist()
+                ]
+            except Exception as e:
+                print(f"Failed to read dataframe: {from_dataframe_path} with exception {e}")
         return self.reconverge_from_log_file()
 
     def update_resubmit_log(self, dirs_to_search_next_time):
@@ -126,6 +126,9 @@ def reconverge(
         walltime=24,
         cpu_per_node=128,
     ):
+        if glob.fnmatch.fnmatch(os.path.basename(dirpath), 'error_run*'):
+            #print(f"Skipping {dirpath} as it contains an error_run folder")
+            return
         self.handle_error_run_files(dirpath)
         reconverge_methods = {
             "static": self.reconverge_static,
@@ -163,7 +166,7 @@ def handle_error_run_files(self, dirpath, filename_to_compare="vasp.log"):
             )
             os.makedirs(error_run_folder_path, exist_ok=True)
             self.move_files_to_error_run_folder(dirpath, error_run_folder_path)
-
+            
     def move_files_to_error_run_folder(self, dirpath, error_run_folder_path):
         for f in os.listdir(dirpath):
             if ("error" in f and "tar" in f) or f.endswith(".sh"):
@@ -283,7 +286,7 @@ def reconverge_generic(
         template_type = template_filename.split('_')[1].split('.py')[0]
         script_name = os.path.join(
             self.script_template_dir,
-            f"{template_type}_Custodian_{HPC}.sh",
+            f"{template_filename.split('_')[1].split('.py')[0]}_Custodian_{HPC}.sh"
         )
         job = jobfile(
             file_path=script_name,