Move text projection into the CLIP model code.

Fix an issue where the SSD1B CLIP model was not loaded correctly.
2025-08-02 23:14:49 +08:00 · 2024-02-25 01:41:08 -05:00
parent 6533b172c1
commit 1cb3f6a83b
5 changed files with 33 additions and 15 deletions
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -98,8 +98,22 @@ def transformers_convert(sd, prefix_from, prefix_to, number):
                    p = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"]
                    k_to = "{}encoder.layers.{}.{}.{}".format(prefix_to, resblock, p[x], y)
                    sd[k_to] = weights[shape_from*x:shape_from*(x + 1)]
+
    return sd

+def clip_text_transformers_convert(sd, prefix_from, prefix_to):
+    sd = transformers_convert(sd, prefix_from, "{}text_model.".format(prefix_to), 32)
+
+    tp = "{}text_projection.weight".format(prefix_from)
+    if tp in sd:
+        sd["{}text_projection.weight".format(prefix_to)] = sd.pop(tp)
+
+    tp = "{}text_projection".format(prefix_from)
+    if tp in sd:
+        sd["{}text_projection.weight".format(prefix_to)] = sd.pop(tp).transpose(0, 1)
+    return sd
+
+
 UNET_MAP_ATTENTIONS = {
    "proj_in.weight",
    "proj_in.bias",