Skip to content

Commit 07787ab

Browse files
committed
add: support sampling and warmup instrumentation policies
1 parent 53d90ca commit 07787ab

8 files changed

Lines changed: 191 additions & 7 deletions

File tree

traincheck/collect_trace.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,20 @@ def main():
380380
help="Indicate whether to use torch.compile to speed up the model, necessary to realize compatibility",
381381
)
382382

383+
## instrumentation policy configs
384+
parser.add_argument(
385+
"--sampling-interval",
386+
type=int,
387+
default=None,
388+
help="Interval of steps to instrument (e.g., 10 for every 10th step).",
389+
)
390+
parser.add_argument(
391+
"--warm-up-steps",
392+
type=int,
393+
default=0,
394+
help="Number of initial steps to always instrument.",
395+
)
396+
383397
args = parser.parse_args()
384398

385399
# read the configuration file
@@ -508,6 +522,8 @@ def main():
508522
instr_descriptors=args.instr_descriptors,
509523
no_auto_var_instr=args.no_auto_var_instr,
510524
use_torch_compile=args.use_torch_compile,
525+
sampling_interval=args.sampling_interval,
526+
warm_up_steps=args.warm_up_steps,
511527
)
512528

513529
if args.copy_all_files:

traincheck/config/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@
9595
TYPE_ERR_THRESHOLD = 3
9696
RECURSION_ERR_THRESHOLD = 5
9797

98+
INSTRUMENTATION_POLICY = {
99+
"interval": 1,
100+
"warm_up": 1,  # defaults to 1 so the first step is always instrumented: while warm-up steps remain, every step is instrumented (interval=1); once warm-up is depleted, instrumentation follows the specified interval
101+
}
102+
103+
DISABLE_WRAPPER = False
104+
98105

99106
class InstrOpt:
100107
def __init__(

traincheck/developer/annotations.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import traincheck.config.config as config
12
import traincheck.instrumentor.tracer as tracer
23
from traincheck.config.config import ALL_STAGE_NAMES
34
from traincheck.instrumentor import META_VARS
@@ -16,8 +17,13 @@ def annotate_stage(stage_name: str):
1617
stage_name in ALL_STAGE_NAMES
1718
), f"Invalid stage name: {stage_name}, valid ones are {ALL_STAGE_NAMES}"
1819

20+
old_stage = META_VARS.get("stage", None)
1921
META_VARS["stage"] = stage_name
2022

23+
# We always reset the wrapper when stage changes, and let the policy decide later if we should skip
24+
if old_stage != stage_name:
25+
config.DISABLE_WRAPPER = False
26+
2127

2228
def annotate_answer_start_token_ids(
2329
answer_start_token_id: int, include_start_token: bool = False

traincheck/instrumentor/proxy_wrapper/proxy.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import torch
1010

11+
import traincheck.config.config as config
1112
import traincheck.instrumentor.proxy_wrapper.proxy_config as proxy_config # HACK: cannot directly import config variables as then they would be local variables
1213
import traincheck.instrumentor.proxy_wrapper.proxy_methods as proxy_methods
1314
from traincheck.config.config import should_disable_proxy_dumping
@@ -158,6 +159,9 @@ def __deepcopy__(self, memo):
158159
return new_copy
159160

160161
def dump_trace(self, phase, dump_loc):
162+
if config.DISABLE_WRAPPER:
163+
return
164+
161165
obj = self._obj
162166
var_name = self.__dict__["var_name"]
163167
assert var_name is not None # '' is allowed as a var_name (root object)

traincheck/instrumentor/proxy_wrapper/proxy_observer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import functools
22
import typing
33

4+
import traincheck.config.config as config
45
from traincheck.config.config import should_disable_proxy_dumping
56
from traincheck.instrumentor.proxy_wrapper.subclass import ProxyParameter
67
from traincheck.utils import typename
@@ -21,6 +22,8 @@ def observe_proxy_var(
2122
phase,
2223
observe_api_name: str,
2324
):
25+
if config.DISABLE_WRAPPER:
26+
return
2427

2528
# update the proxy object's timestamp
2629
var.update_timestamp()

traincheck/instrumentor/proxy_wrapper/subclass.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import torch
77
from torch import nn
88

9+
import traincheck.config.config as config
910
from traincheck.config.config import should_disable_proxy_dumping
1011
from traincheck.instrumentor.dumper import dump_trace_VAR
1112
from traincheck.instrumentor.proxy_wrapper.dumper import dump_attributes, get_meta_vars
@@ -178,6 +179,9 @@ def register_object(self):
178179
)
179180

180181
def dump_trace(self, phase, dump_loc):
182+
if config.DISABLE_WRAPPER:
183+
return
184+
181185
# TODO
182186
var_name = self.__dict__["var_name"]
183187
# assert var_name is not None # '' is allowed as a var_name (root object)

traincheck/instrumentor/source_file.py

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ def __init__(
3333
use_full_instr: bool,
3434
funcs_to_instr: list[str] | None,
3535
API_dump_stack_trace: bool,
36+
sampling_interval: int,
37+
warm_up_steps: int,
3638
):
3739
super().__init__()
3840
if not modules_to_instr:
@@ -44,10 +46,27 @@ def __init__(
4446
self.use_full_instr = use_full_instr
4547
self.funcs_to_instr = funcs_to_instr
4648
self.API_dump_stack_trace = API_dump_stack_trace
49+
self.sampling_interval = sampling_interval
50+
self.warm_up_steps = warm_up_steps
51+
self.current_function = None
52+
53+
def visit_FunctionDef(self, node):
54+
old_function = self.current_function
55+
self.current_function = node.name
56+
self.generic_visit(node)
57+
self.current_function = old_function
58+
return node
59+
60+
def visit_AsyncFunctionDef(self, node):
61+
old_function = self.current_function
62+
self.current_function = node.name
63+
self.generic_visit(node)
64+
self.current_function = old_function
65+
return node
4766

4867
def get_instrument_node(self, module_name: str):
4968
return ast.parse(
50-
f"from traincheck.instrumentor.tracer import Instrumentor; Instrumentor({module_name}, scan_proxy_in_args={self.scan_proxy_in_args}, use_full_instr={self.use_full_instr}, funcs_to_instr={str(self.funcs_to_instr)}, API_dump_stack_trace={self.API_dump_stack_trace}).instrument()"
69+
f"from traincheck.instrumentor.tracer import Instrumentor; Instrumentor({module_name}, scan_proxy_in_args={self.scan_proxy_in_args}, use_full_instr={self.use_full_instr}, funcs_to_instr={str(self.funcs_to_instr)}, API_dump_stack_trace={self.API_dump_stack_trace}, sampling_interval={str(self.sampling_interval)}, warm_up_steps={str(self.warm_up_steps)}).instrument()"
5170
).body
5271

5372
def visit_Import(self, node):
@@ -65,8 +84,6 @@ def visit_Import(self, node):
6584
instrument_nodes.append(self.get_instrument_node(n.asname))
6685
else:
6786
instrument_nodes.append(self.get_instrument_node(n.name))
68-
# let's see if there are aliases, if yes, use them
69-
# if not, let's use the module name directly
7087
return [node] + instrument_nodes
7188

7289
def visit_ImportFrom(self, node):
@@ -87,6 +104,105 @@ def visit_ImportFrom(self, node):
87104
instrument_nodes.append(self.get_instrument_node(n.name))
88105
return [node] + instrument_nodes
89106

107+
def _get_loop_context(self, node):
108+
# Heuristic: Inject into loops that look like training loops.
109+
# Check for calls to .step() or .backward()
110+
has_training_signal = False
111+
for child in ast.walk(node):
112+
if isinstance(child, ast.Call):
113+
if isinstance(child.func, ast.Attribute):
114+
if child.func.attr in ["step", "backward"]:
115+
has_training_signal = True
116+
117+
if has_training_signal:
118+
return "training"
119+
120+
# If no explicit training signal, check if we are in an eval/test function
121+
if self.current_function:
122+
name_lower = self.current_function.lower()
123+
if "test" in name_lower or "eval" in name_lower or "valid" in name_lower:
124+
return "eval"
125+
126+
return None
127+
128+
def _inject_call(self, node, func_name):
129+
import_stmt = ast.ImportFrom(
130+
module="traincheck.instrumentor.control",
131+
names=[ast.alias(name=func_name, asname=None)],
132+
level=0,
133+
)
134+
call_stmt = ast.Expr(
135+
value=ast.Call(
136+
func=ast.Name(id=func_name, ctx=ast.Load()), args=[], keywords=[]
137+
)
138+
)
139+
node.body.insert(0, call_stmt)
140+
node.body.insert(0, import_stmt)
141+
return node
142+
143+
def visit_For(self, node):
144+
self.generic_visit(node)
145+
context = self._get_loop_context(node)
146+
if context == "training":
147+
return self._inject_call(node, "start_step")
148+
elif context == "eval":
149+
return self._inject_call(node, "start_eval_step")
150+
return node
151+
152+
def visit_While(self, node):
153+
self.generic_visit(node)
154+
context = self._get_loop_context(node)
155+
if context == "training":
156+
return self._inject_call(node, "start_step")
157+
elif context == "eval":
158+
return self._inject_call(node, "start_eval_step")
159+
return node
160+
161+
def _should_inject_control(self, node):
162+
# Heuristic: Inject into loops that look like training loops.
163+
# Check for calls to .step() or .backward()
164+
for child in ast.walk(node):
165+
if isinstance(child, ast.Call):
166+
if isinstance(child.func, ast.Attribute):
167+
if child.func.attr in ["step", "backward"]:
168+
return True
169+
return False
170+
171+
def _inject_start_step(self, node):
172+
import_stmt = ast.ImportFrom(
173+
module="traincheck.instrumentor.control",
174+
names=[ast.alias(name="start_step", asname=None)],
175+
level=0,
176+
)
177+
call_stmt = ast.Expr(
178+
value=ast.Call(
179+
func=ast.Name(id="start_step", ctx=ast.Load()), args=[], keywords=[]
180+
)
181+
)
182+
# We need to insert the import at the top of the file ideally,
183+
# but inserting inside the loop works if we deal with python scoping (imports are valid statements).
184+
# Actually proper way is to add import at module level.
185+
# But `visit_Module` is not here.
186+
# For simplicity, let's just use fully qualified name or inject import in the loop (a bit inefficient but works).
187+
# Better: Inject `import traincheck.instrumentor.control` at top of loop or use `traincheck.instrumentor.control.start_step()` with import logic handled elsewhere?
188+
# The `InsertTracerVisitor` modifies the module. We can add an import to the module body if we had access.
189+
# `visit_Import` adds imports.
190+
# Let's assume `traincheck` is importable.
191+
192+
# Helper to create `traincheck.instrumentor.control.start_step()` call
193+
# And ensure import is present.
194+
# Actually `InsertTracerVisitor` is used on the whole file.
195+
# Let's just blindly insert the call logic and rely on the fact that we can insert an import at the top of the loop
196+
# or just assume the user code can handle it if we inject the import statement right before the call.
197+
198+
# Let's inject:
199+
# from traincheck.instrumentor.control import start_step
200+
# start_step()
201+
202+
node.body.insert(0, call_stmt)
203+
node.body.insert(0, import_stmt)
204+
return node
205+
90206

91207
def instrument_library(
92208
source: str,
@@ -95,6 +211,8 @@ def instrument_library(
95211
use_full_instr: bool,
96212
funcs_to_instr: list[str] | None,
97213
API_dump_stack_trace: bool,
214+
sampling_interval: int,
215+
warm_up_steps: int,
98216
) -> str:
99217
"""
100218
Instruments the given source code and returns the instrumented source code.
@@ -116,6 +234,8 @@ def instrument_library(
116234
use_full_instr,
117235
funcs_to_instr,
118236
API_dump_stack_trace,
237+
sampling_interval,
238+
warm_up_steps,
119239
)
120240
root = visitor.visit(root)
121241
source = ast.unparse(root)
@@ -811,6 +931,8 @@ def instrument_file(
811931
instr_descriptors: bool,
812932
no_auto_var_instr: bool,
813933
use_torch_compile: bool,
934+
sampling_interval: int = 1,
935+
warm_up_steps: int = 0,
814936
) -> str:
815937
"""
816938
Instruments the given file and returns the instrumented source code.
@@ -827,6 +949,8 @@ def instrument_file(
827949
use_full_instr,
828950
funcs_to_instr,
829951
API_dump_stack_trace,
952+
sampling_interval,
953+
warm_up_steps,
830954
)
831955
# annotate stages
832956
instrumented_source = annotate_stage(instrumented_source)

traincheck/instrumentor/tracer.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,9 @@ def function_wrapper(
162162
TypeError: function_wrapper() got multiple values for argument 'arg_name'
163163
"""
164164

165-
global DISABLE_WRAPPER
166165
global PROCESS_ID
167166

168-
if DISABLE_WRAPPER:
167+
if config.DISABLE_WRAPPER:
169168
# TODO: all meta vars update should be done outside the function_wrapper (e.g. step increment) by applying a separate wrapper
170169
return original_function(*args, **kwargs)
171170

@@ -403,7 +402,7 @@ def wrapper(
403402
increment_step = False
404403
if original_function_name.endswith(".step"):
405404
owner = get_owner_class(original_function)
406-
if issubclass(owner, torch.optim.Optimizer):
405+
if owner and issubclass(owner, torch.optim.Optimizer):
407406
increment_step = True
408407
# determine statically whether to dump the trace
409408
if not disable_dump:
@@ -412,7 +411,14 @@ def wrapper(
412411
@functools.wraps(original_function)
413412
def wrapped(*args, **kwargs):
414413
if increment_step:
415-
META_VARS["step"] += 1
414+
# Meta var update for step is now handled by traincheck.instrumentor.control.start_step
415+
# which is injected into training loops.
416+
# However, for backward compatibility or if injection fails, we might want to keep basic step counting?
417+
# User specifically asked to move logic. If we keep it here, we might double count if both run.
418+
# But injection is "An easy way out".
419+
# Let's check META_VARS.
420+
pass
421+
416422
return function_wrapper(
417423
original_function,
418424
original_function_name,
@@ -554,6 +560,8 @@ def __init__(
554560
use_full_instr: bool,
555561
funcs_to_instr: Optional[list[str]] = None,
556562
API_dump_stack_trace: bool = False,
563+
sampling_interval: int = 1,
564+
warm_up_steps: int = 0,
557565
):
558566
"""
559567
Instruments the specified target with additional tracing functionality.
@@ -576,12 +584,24 @@ def __init__(
576584
and the functions in this list will be instrumented with dump enabled. NOTE: If this list is provided, use_full_str must be set to False. WRAP_WITHOUT_DUMP will be ignored.
577585
API_dump_stack_trace (bool):
578586
Whether to dump the stack trace of the function call. Enabling this will add the stack trace to the trace log.
587+
sampling_interval (int):
588+
The interval for sampling-based instrumentation. Every Nth step will be instrumented. Defaults to 1.
589+
warm_up_steps (int):
590+
The number of initial steps to always instrument. Defaults to 0.
579591
580592
Indirectly, at initialization, the instrumentor will also load the instr_opts.json file if it exists.
581593
This file is automatically generated by the `collect_trace` script when `--invariants` is provided.
582594
The user should not need to interact with this file directly.
583595
584596
"""
597+
if sampling_interval:
598+
if config.INSTRUMENTATION_POLICY is None:
599+
config.INSTRUMENTATION_POLICY = {}
600+
config.INSTRUMENTATION_POLICY["interval"] = sampling_interval
601+
if warm_up_steps is not None:
602+
if config.INSTRUMENTATION_POLICY is None:
603+
config.INSTRUMENTATION_POLICY = {}
604+
config.INSTRUMENTATION_POLICY["warm_up"] = warm_up_steps
585605

586606
self.instrumenting = True
587607
if isinstance(target, types.ModuleType):

0 commit comments

Comments
 (0)