
Commit df1c0ff

load in 8bit correctly

1 parent 0182a64 · commit df1c0ff

2 files changed: 43 additions & 29 deletions

elk/utils/hf_utils.py (30 additions & 14 deletions)
@@ -20,15 +20,11 @@
 _AUTOREGRESSIVE_SUFFIXES = ["ConditionalGeneration"] + _DECODER_ONLY_SUFFIXES
 
 
-def instantiate_model(
+def determine_dtypes(
     model_str: str,
-    device: str | torch.device = "cpu",
-    **kwargs,
-) -> PreTrainedModel:
-    """Instantiate a model string with the appropriate `Auto` class."""
-    device = torch.device(device)
-    kwargs["device_map"] = {"": device}
-
+    is_cpu: bool,
+    load_in_8bit: bool,
+) -> torch.dtype | str:
     with prevent_name_conflicts():
         model_cfg = AutoConfig.from_pretrained(model_str)
 
@@ -37,27 +33,47 @@ def instantiate_model(
         fp32_weights = model_cfg.torch_dtype in (None, torch.float32)
 
         # Required by `bitsandbytes` to load in 8-bit.
-        if kwargs.get("load_in_8bit"):
+        if load_in_8bit:
             # Sanity check: we probably shouldn't be loading in 8-bit if the checkpoint
             # is in fp32. `bitsandbytes` only supports mixed fp16/int8 inference, and
             # we can't guarantee that there won't be overflow if we downcast to fp16.
             if fp32_weights:
                 raise ValueError("Cannot load in 8-bit if weights are fp32")
 
-            kwargs["torch_dtype"] = torch.float16
+            torch_dtype = torch.float16
 
         # CPUs generally don't support anything other than fp32.
-        elif device.type == "cpu":
-            kwargs["torch_dtype"] = torch.float32
+        elif is_cpu:
+            torch_dtype = torch.float32
 
         # If the model is fp32 but bf16 is available, convert to bf16.
         # Usually models with fp32 weights were actually trained in bf16, and
         # converting them doesn't hurt performance.
         elif fp32_weights and torch.cuda.is_bf16_supported():
-            kwargs["torch_dtype"] = torch.bfloat16
+            torch_dtype = torch.bfloat16
             print("Weights seem to be fp32, but bf16 is available. Loading in bf16.")
         else:
-            kwargs["torch_dtype"] = "auto"
+            torch_dtype = "auto"
+        return torch_dtype
+
+
+def instantiate_model(
+    model_str: str,
+    load_in_8bit: bool,
+    is_cpu: bool,
+    **kwargs,
+) -> PreTrainedModel:
+    """Instantiate a model string with the appropriate `Auto` class."""
+
+    with prevent_name_conflicts():
+        model_cfg = AutoConfig.from_pretrained(model_str)
+        # If a torch_dtype was not specified, try to infer it.
+        if "torch_dtype" not in kwargs:
+            kwargs["torch_dtype"] = determine_dtypes(
+                model_str=model_str, is_cpu=is_cpu, load_in_8bit=load_in_8bit
+            )
+        # Add load_in_8bit to kwargs
+        kwargs["load_in_8bit"] = load_in_8bit
 
         archs = model_cfg.architectures
         if not isinstance(archs, list):
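
For context, a minimal usage sketch of the refactored API (not part of the commit; the model name is illustrative, and it assumes the elk package and its dependencies are importable):

import torch

from elk.utils.hf_utils import determine_dtypes, instantiate_model

# dtype selection is now a standalone, testable step. A checkpoint whose
# config leaves torch_dtype unset counts as fp32, so on CPU it loads in fp32.
dtype = determine_dtypes("gpt2", is_cpu=True, load_in_8bit=False)
assert dtype == torch.float32

# instantiate_model now takes the flags explicitly instead of fishing them
# out of **kwargs; callers supply device_map themselves when they need one.
model = instantiate_model("gpt2", load_in_8bit=False, is_cpu=True)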

elk/utils/multi_gpu.py (13 additions & 15 deletions)
@@ -29,18 +29,16 @@ def is_single_gpu(self) -> bool:
     def used_devices(self) -> list[str]:
         return [self.first_device] + self.other_devices
 
+    @property
+    def has_cpu_device(self) -> bool:
+        devices = [torch.device(device) for device in self.used_devices]
+        return any(device.type == "cpu" for device in devices)
+
 
 def instantiate_model_with_devices(
     cfg: "Extract", device_config: ModelDevices, is_verbose: bool, **kwargs
 ) -> PreTrainedModel:
     first_device = device_config.first_device
-    if cfg.int8:
-        # Required by `bitsandbytes`
-        torch_dtype = torch.float16
-    elif device_config == "cpu":
-        torch_dtype = torch.float32
-    else:
-        torch_dtype = "auto"
 
     # TODO: Maybe we should ensure the device map is the same
     # for all the extract processes? This is because the device map
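
The new property is easy to sanity-check in isolation; a sketch of the same logic outside the class (the device list is made up):

import torch

# Same check as ModelDevices.has_cpu_device: does any device string the
# model is mapped to resolve to a CPU device?
used_devices = ["cuda:0", "cpu"]  # hypothetical device list
print(any(torch.device(d).type == "cpu" for d in used_devices))  # True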
@@ -51,8 +49,7 @@ def instantiate_model_with_devices(
         if device_config.is_single_gpu
         else create_device_map(
             model_str=cfg.model,
-            use_8bit=cfg.int8,
-            torch_dtype=torch_dtype,
+            load_in_8bit=cfg.int8,
             model_devices=device_config,
             verbose=is_verbose,
         )
@@ -67,23 +64,24 @@ def instantiate_model_with_devices(
         cfg.model,
         device_map=device_map,
         load_in_8bit=cfg.int8,
-        torch_dtype=torch_dtype,
+        is_cpu=device_config.has_cpu_device,
         **kwargs,
     )
     return model
 
 
 def create_device_map(
     model_str: str,
-    use_8bit: float,
-    torch_dtype: dtype | str,
+    load_in_8bit: bool,
     model_devices: ModelDevices,
     verbose: bool,
 ) -> dict[str, str]:
     """Creates a device map for a model running on multiple GPUs."""
     with init_empty_weights():
         # Need to first instantiate an empty model to get the layer class
-        model = instantiate_model(model_str=model_str, torch_dtype=torch_dtype)
+        model = instantiate_model(
+            model_str=model_str, load_in_8bit=load_in_8bit, is_cpu=False
+        )
 
     # e.g. {"cuda:0": 16000, "cuda:1": 16000}
     max_memory_all_devices: dict[str, int] = get_available_memory_for_devices()
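
The init_empty_weights trick above can be seen standalone; a sketch (model name illustrative) of instantiating a weightless meta-model just to inspect its layer classes, which is all create_device_map needs before planning the map:

from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

# Parameters land on the "meta" device, so no real memory is allocated.
with init_empty_weights():
    config = AutoConfig.from_pretrained("gpt2")
    empty_model = AutoModelForCausalLM.from_config(config)

print(type(empty_model.transformer.h[0]).__name__)  # GPT2Block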
@@ -97,7 +95,7 @@ def create_device_map(
     max_memory_used_devices[model_devices.first_device] = (
         max_memory_used_devices[model_devices.first_device] * 0.6
     )
-    if use_8bit:
+    if load_in_8bit:
         print("Using 8bit")
         # If 8bit, multiply the memory by 2
         # This is because we instantiated our empty model in (probably) float16
@@ -107,7 +105,7 @@ def create_device_map(
             device: max_memory_used_devices[device] * 2
             for device in max_memory_used_devices
         }
-        if use_8bit
+        if load_in_8bit
         else max_memory_used_devices
     )
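
The budget arithmetic can also be checked standalone; a sketch with made-up per-device budgets (units are whatever get_available_memory_for_devices returns):

load_in_8bit = True
max_memory_used_devices = {"cuda:0": 16_000, "cuda:1": 16_000}  # assumed budgets

# Reserve headroom on the first device for inputs and activations.
max_memory_used_devices["cuda:0"] *= 0.6

# The empty model was metered in fp16, and int8 weights take half the
# bytes, so each device's cap is doubled when loading in 8-bit.
max_memory = (
    {device: mem * 2 for device, mem in max_memory_used_devices.items()}
    if load_in_8bit
    else max_memory_used_devices
)
print(max_memory)  # {'cuda:0': 19200.0, 'cuda:1': 32000}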
