fix: improve scales change and revert conditional

4b10c8c3 · drbh · ab4d480d · 4b10c8c3
Commit 4b10c8c3 authored 11 months ago by drbh
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 2 deletions

server/text_generation_server/layers/marlin/fp8.py server/text_generation_server/layers/marlin/fp8.py +5 -2

No files found.
--- a/server/text_generation_server/layers/marlin/fp8.py
+++ b/server/text_generation_server/layers/marlin/fp8.py
@@ -38,9 +38,12 @@ class GPTQMarlinFP8Linear(nn.Module):

        log_once(logger.info, "GPU does not support FP8, using Marlin FP8 kernel")

+        # promote a scalar (0-D) scales tensor to 1-D so the unsqueeze below yields a 2-D tensor
+        if scales.dim() == 0:
+            scales = scales.unsqueeze(0)
+
        scales = scales.unsqueeze(0)
-        # repack weights for Marlin if a single scale is provided
-        if scales.size(0) == 1:
+        if scales.shape[1] == 1:
            out_features, in_features = qweight.shape
            scales = scales.repeat(1, out_features)
        qweight, scales = repack_fp8_for_marlin(qweight, scales)