CLIP code refactor and improvements.

A more generic CLIP model class that can be used with more types of text encoders. Don't apply the weighting algorithm when the weight is 1.0. Don't compute an empty token output when it's not needed.
2025-08-03 07:26:31 +08:00 · 2023-11-06 13:43:50 -05:00
parent b3fcd64c6c
commit 656c0b5d90
3 changed files with 62 additions and 33 deletions
--- a/comfy/sdxl_clip.py
+++ b/comfy/sdxl_clip.py
@@ -9,9 +9,8 @@ class SDXLClipG(sd1_clip.SDClipModel):
            layer_idx=-2

        textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_config_bigg.json")
-        super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, textmodel_path=textmodel_path, dtype=dtype)
-        self.empty_tokens = [[49406] + [49407] + [0] * 75]
-        self.layer_norm_hidden_state = False
+        super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, textmodel_path=textmodel_path, dtype=dtype,
+                         special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False)

    def load_sd(self, sd):
        return super().load_sd(sd)
@@ -38,8 +37,7 @@ class SDXLTokenizer:
 class SDXLClipModel(torch.nn.Module):
    def __init__(self, device="cpu", dtype=None):
        super().__init__()
-        self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=11, device=device, dtype=dtype)
-        self.clip_l.layer_norm_hidden_state = False
+        self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=11, device=device, dtype=dtype, layer_norm_hidden_state=False)
        self.clip_g = SDXLClipG(device=device, dtype=dtype)

    def clip_layer(self, layer_idx):