data-archetype
/

semdisdiffae

image-reconstruction

image-tokenizer

semantic-alignment

Model card Files Files and versions

data-archetype committed 20 days ago

Commit

518072c

·

verified ·

1 Parent(s): 1e745d3

Upload folder using huggingface_hub

Files changed (1) hide show

capacitor_diffae/model.py +51 -0

capacitor_diffae/model.py CHANGED Viewed

@@ -65,10 +65,22 @@ class CapacitorDiffAE(nn.Module):
         recon = model.reconstruct(images)
     """
     def __init__(self, config: CapacitorDiffAEConfig) -> None:
         super().__init__()
         self.config = config
         self.encoder = Encoder(
             in_channels=config.in_channels,
             patch_size=config.patch_size,
@@ -154,6 +166,45 @@ class CapacitorDiffAE(nn.Module):
         model.eval()
         return model
     def encode(self, images: Tensor) -> Tensor:
         """Encode images to latents (posterior mode).

         recon = model.reconstruct(images)
     """
+    _LATENT_NORM_EPS: float = 1e-4
     def __init__(self, config: CapacitorDiffAEConfig) -> None:
         super().__init__()
         self.config = config
+        # Latent running stats for whitening/dewhitening
+        self.register_buffer(
+            "latent_norm_running_mean",
+            torch.zeros((config.bottleneck_dim,), dtype=torch.float32),
+        )
+        self.register_buffer(
+            "latent_norm_running_var",
+            torch.ones((config.bottleneck_dim,), dtype=torch.float32),
+        )
         self.encoder = Encoder(
             in_channels=config.in_channels,
             patch_size=config.patch_size,
         model.eval()
         return model
+    def _latent_norm_stats(self) -> tuple[Tensor, Tensor]:
+        """Return (mean, std) tensors for latent whitening, shaped [1,C,1,1]."""
+        # Reshape the per-channel running-stat buffers to [1, C, 1, 1] so they
+        # broadcast against 4-D latents along the channel axis.
+        mean = self.latent_norm_running_mean.view(1, -1, 1, 1)
+        var = self.latent_norm_running_var.view(1, -1, 1, 1)
+        # Add the class-level epsilon before the square root so that std stays
+        # away from zero when a channel's running variance is (near) zero;
+        # whiten() divides by this value.
+        std = torch.sqrt(var.to(torch.float32) + self._LATENT_NORM_EPS)
+        # Both returned tensors are float32, independent of the buffers' dtype.
+        return mean.to(torch.float32), std
+    def whiten(self, latents: Tensor) -> Tensor:
+        """Whiten encoder latents using per-channel running stats.
+        Use this before passing latents to a downstream latent-space
+        diffusion model. The whitened latents have approximately zero mean
+        and unit variance per channel.
+        Args:
+            latents: [B, bottleneck_dim, h, w] raw encoder output.
+        Returns:
+            Whitened latents [B, bottleneck_dim, h, w] in float32.
+        """
+        # Work in float32 regardless of the input dtype; the result is not
+        # cast back to the input dtype.
+        z = latents.to(torch.float32)
+        mean, std = self._latent_norm_stats()
+        # The stat buffers are moved to the input's device before use, so the
+        # input does not have to be on the same device as the module buffers.
+        # Computes (z - mean) / std with [1, C, 1, 1] broadcasting.
+        return (z - mean.to(device=z.device)) / std.to(device=z.device)
+    def dewhiten(self, latents: Tensor) -> Tensor:
+        """Undo whitening to recover raw encoder latent scale.
+        Use this before passing whitened latents back to ``decode()``.
+        Args:
+            latents: [B, bottleneck_dim, h, w] whitened latents.
+        Returns:
+            Dewhitened latents [B, bottleneck_dim, h, w] in float32.
+        """
+        # Work in float32 regardless of the input dtype; the result is not
+        # cast back to the input dtype.
+        z = latents.to(torch.float32)
+        mean, std = self._latent_norm_stats()
+        # Inverse of whiten(): z * std + mean, using the same (mean, std) pair
+        # moved to the input's device.
+        return z * std.to(device=z.device) + mean.to(device=z.device)
     def encode(self, images: Tensor) -> Tensor:
         """Encode images to latents (posterior mode).