diff --git a/x/create/client/quantize.go b/x/create/client/quantize.go
index d425f72e9..58638a74e 100644
--- a/x/create/client/quantize.go
+++ b/x/create/client/quantize.go
@@ -70,9 +70,13 @@ func loadAndQuantizeArray(r io.Reader, name, quantize string, arrays map[string]
 	if info, ok := header[inputKey]; ok && info.Dtype == "F8_E4M3" {
 		scaleKey := inputKey + ".scale_inv"
 		scaleInv := st.Get(scaleKey)
+		if scaleInv == nil {
+			scaleKey = inputKey + ".scale"
+			scaleInv = st.Get(scaleKey)
+		}
 		if scaleInv == nil {
 			st.Free()
-			return tmpPath, nil, nil, fmt.Errorf("missing companion tensor %q for fp8 source tensor %q", scaleKey, inputKey)
+			return tmpPath, nil, nil, fmt.Errorf("missing companion tensor %q or %q for fp8 source tensor %q", inputKey+".scale_inv", inputKey+".scale", inputKey)
 		}
 		arr, err = decodeSourceFP8Tensor(arr, scaleInv)
 		if err != nil {
@@ -560,13 +564,13 @@ func safetensorsKey(preferred string, header map[string]safetensorsHeaderEntry)
 	return keys[0], nil
 }
 
-func decodeSourceFP8Tensor(weight, scaleInv *mlx.Array) (*mlx.Array, error) {
-	if weight == nil || scaleInv == nil {
+func decodeSourceFP8Tensor(weight, scale *mlx.Array) (*mlx.Array, error) {
+	if weight == nil || scale == nil {
 		return nil, fmt.Errorf("fp8 weight and scale tensors are required")
 	}
 
 	weightShape := weight.Dims()
-	scaleShape := scaleInv.Dims()
+	scaleShape := scale.Dims()
 	if len(weightShape) != 2 || len(scaleShape) != 2 {
 		return nil, fmt.Errorf("expected 2D fp8 weight and scale tensors, got %v and %v", weightShape, scaleShape)
 	}
@@ -596,7 +600,7 @@ func decodeSourceFP8Tensor(weight, scaleInv *mlx.Array) (*mlx.Array, error) {
 	}
 
 	decoded = mlx.Reshape(decoded, int32(scaleShape[0]), int32(blockRows), int32(scaleShape[1]), int32(blockCols))
-	decoded = mlx.Mul(decoded, mlx.ExpandDims(mlx.ExpandDims(scaleInv, 1), 3))
+	decoded = mlx.Mul(decoded, mlx.ExpandDims(mlx.ExpandDims(scale, 1), 3))
 	decoded = mlx.Reshape(decoded, int32(rows+padBottom), int32(cols+padSide))
 	if padBottom > 0 || padSide > 0 {
 		decoded = mlx.SliceStartStop(decoded, []int32{0, 0}, []int32{int32(rows), int32(cols)})
diff --git a/x/create/client/quantize_test.go b/x/create/client/quantize_test.go
new file mode 100644
index 000000000..3e4a5f8bb
--- /dev/null
+++ b/x/create/client/quantize_test.go
@@ -0,0 +1,24 @@
+package client
+
+import (
+	"testing"
+
+	"github.com/ollama/ollama/x/mlxrunner/mlx"
+)
+
+func TestDecodeSourceFP8TensorAcceptsWeightScale(t *testing.T) {
+	if err := mlx.CheckInit(); err != nil {
+		t.Skipf("MLX unavailable: %v", err)
+	}
+
+	weight := mlx.FromValues([]uint8{0, 1, 2, 3}, 2, 2)
+	scale := mlx.FromValues([]float32{1}, 1, 1).AsType(mlx.DTypeBFloat16)
+	got, err := decodeSourceFP8Tensor(weight, scale)
+	if err != nil {
+		t.Fatal(err)
+	}
+	mlx.Eval(got)
+	if dims := got.Dims(); len(dims) != 2 || dims[0] != 2 || dims[1] != 2 {
+		t.Fatalf("decoded dims = %v, want [2 2]", dims)
+	}
+}
diff --git a/x/create/create.go b/x/create/create.go
index 54beed3ee..1388b6067 100644
--- a/x/create/create.go
+++ b/x/create/create.go
@@ -1,9 +1,11 @@
 package create
 
 import (
+	"encoding/binary"
 	"encoding/json"
 	"fmt"
 	"io"
+	"math"
 	"os"
 	"path/filepath"
 	"regexp"
@@ -325,6 +327,77 @@ func isStackedExpertWeight(name string) bool {
 		strings.Contains(name, ".moe.experts.")
 }
 
+func sourceFP8BF16PromotionQuantization(name string, shape []int32, requested string) string {
+	quantNorm := normalizeQuantType(requested)
+	if quantNorm == "" {
+		return ""
+	}
+
+	switch quantNorm {
+	case "nvfp4", "mxfp4", "mxfp8":
+	default:
+		return ""
+	}
+
+	if !sourceFP8CanPromoteBF16Weight(name, shape) {
+		return ""
+	}
+
+	return "mxfp8"
+}
+
+func sourceFP8TensorQuantization(name string, shape []int32, requested string, fallback string) string {
+	quantNorm := normalizeQuantType(requested)
+	switch quantNorm {
+	case "nvfp4", "mxfp4":
+		if sourceFP8ShouldPromoteLowBitTensor(name, shape) {
+			return "mxfp8"
+		}
+	}
+	return fallback
+}
+
+func sourceFP8ShouldPromoteLowBitTensor(name string, shape []int32) bool {
+	if len(shape) != 2 || !isAligned(shape, "mxfp8") {
+		return false
+	}
+
+	return strings.Contains(name, "down_proj") ||
+		strings.Contains(name, ".v_proj") ||
+		strings.Contains(name, ".k_proj")
+}
+
+func sourceFP8CanPromoteBF16Weight(name string, shape []int32) bool {
+	if !strings.HasSuffix(name, ".weight") || len(shape) != 2 {
+		return false
+	}
+
+	var elems int64 = 1
+	for _, d := range shape {
+		elems *= int64(d)
+	}
+	if elems < 1024 {
+		return false
+	}
+
+	if !isAligned(shape, "mxfp8") {
+		return false
+	}
+
+	switch {
+	case strings.Contains(name, "audio_tower") || strings.Contains(name, "embed_audio"):
+		return false
+	case strings.Contains(name, "norm") || strings.Contains(name, "ln_") || strings.Contains(name, "layernorm"):
+		return false
+	case strings.Contains(name, "router") || strings.Contains(name, "score_correction"):
+		return false
+	case strings.Contains(name, "mlp.gate.weight") && !strings.Contains(name, "_proj"):
+		return false
+	default:
+		return true
+	}
+}
+
 // GetTensorQuantization returns the appropriate quantization type for a tensor.
 // Returns "" if the tensor should not be quantized.
 // This implements mixed-precision quantization:
@@ -390,6 +463,7 @@ func GetTensorQuantization(name string, shape []int32, quantize string) string {
 }
 
 var expertLayerPrefixRegexp = regexp.MustCompile(`^(?:model\.language_model\.|language_model(?:\.model)?\.|model\.)?layers\.\d+$`)
+var prequantizedExpertSuffixRegexp = regexp.MustCompile(`^\.(\d+)\.(.+)$`)
 
 // ExpertGroupPrefix returns the group prefix for expert tensors that should be packed together.
 // For example:
@@ -442,8 +516,17 @@ type sourceQuantization struct {
 	Bits            int     `json:"bits"`
 	GroupSize       int     `json:"group_size"`
 	Mode            string  `json:"mode"`
+	Format          string  `json:"format"`
 	QuantMethod     string  `json:"quant_method"`
 	WeightBlockSize []int32 `json:"weight_block_size"`
+	ConfigGroups    map[string]struct {
+		Format  string `json:"format"`
+		Weights struct {
+			BlockStructure []int32 `json:"block_structure"`
+			NumBits        int     `json:"num_bits"`
+			Type           string  `json:"type"`
+		} `json:"weights"`
+	} `json:"config_groups"`
 }
 
 type sourceModelConfig struct {
@@ -451,10 +534,12 @@ type sourceModelConfig struct {
 	Architectures      []string           `json:"architectures"`
 	Quantization       sourceQuantization `json:"quantization"`
 	QuantizationConfig sourceQuantization `json:"quantization_config"`
+	CompressionConfig  sourceQuantization `json:"compression_config"`
 	TextConfig         struct {
 		ModelType          string             `json:"model_type"`
 		Quantization       sourceQuantization `json:"quantization"`
 		QuantizationConfig sourceQuantization `json:"quantization_config"`
+		CompressionConfig  sourceQuantization `json:"compression_config"`
 	} `json:"text_config"`
 }
 
@@ -489,8 +574,10 @@ func (cfg sourceModelConfig) QuantMetadata() map[string]string {
 	for _, candidate := range []sourceQuantization{
 		cfg.Quantization,
 		cfg.QuantizationConfig,
+		cfg.CompressionConfig,
 		cfg.TextConfig.Quantization,
 		cfg.TextConfig.QuantizationConfig,
+		cfg.TextConfig.CompressionConfig,
 	} {
 		if candidate.Bits != 0 {
 			q = candidate
@@ -515,21 +602,32 @@ type sourceQuantizedKind string
 const (
 	sourceQuantizedKindNone         sourceQuantizedKind = ""
 	sourceQuantizedKindPrequantized sourceQuantizedKind = "prequantized"
-	sourceQuantizedKindHFFP8        sourceQuantizedKind = "hf_fp8"
+	sourceQuantizedKindSourceFP8    sourceQuantizedKind = "source_fp8"
 )
 
 func (cfg sourceModelConfig) quantizationConfigs() []sourceQuantization {
 	return []sourceQuantization{
 		cfg.Quantization,
 		cfg.QuantizationConfig,
+		cfg.CompressionConfig,
 		cfg.TextConfig.Quantization,
 		cfg.TextConfig.QuantizationConfig,
+		cfg.TextConfig.CompressionConfig,
 	}
 }
 
 func (cfg sourceModelConfig) HFFP8WeightBlockSize() (rows, cols int32, ok bool) {
 	for _, q := range cfg.quantizationConfigs() {
 		if !strings.EqualFold(q.QuantMethod, "fp8") || len(q.WeightBlockSize) != 2 {
+			if !strings.EqualFold(q.QuantMethod, "compressed-tensors") && !strings.EqualFold(q.Format, "float-quantized") {
+				continue
+			}
+			for _, group := range q.ConfigGroups {
+				if !strings.EqualFold(group.Format, "float-quantized") || group.Weights.NumBits != 8 || !strings.EqualFold(group.Weights.Type, "float") || len(group.Weights.BlockStructure) != 2 {
+					continue
+				}
+				return group.Weights.BlockStructure[0], group.Weights.BlockStructure[1], true
+			}
 			continue
 		}
 		return q.WeightBlockSize[0], q.WeightBlockSize[1], true
@@ -537,13 +635,28 @@ func (cfg sourceModelConfig) HFFP8WeightBlockSize() (rows, cols int32, ok bool)
 	return 0, 0, false
 }
 
+func (cfg sourceModelConfig) hasPackedNVFP4Format() bool {
+	for _, q := range cfg.quantizationConfigs() {
+		if strings.EqualFold(q.Format, "nvfp4-pack-quantized") {
+			return true
+		}
+	}
+	return false
+}
+
 func inspectSourceQuantization(modelDir string, cfg sourceModelConfig) (sourceQuantizedKind, error) {
+	// Check for NVIDIA ModelOpt hf_quant_config.json (NVFP4)
+	if detectModelOptQuantization(modelDir) {
+		return sourceQuantizedKindPrequantized, nil
+	}
+
 	entries, err := os.ReadDir(modelDir)
 	if err != nil {
 		return sourceQuantizedKindNone, err
 	}
 
-	hasScaleInv := false
+	hasFP8Scale := false
+	hasPackedNVFP4 := false
 	for _, entry := range entries {
 		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".safetensors") {
 			continue
@@ -559,23 +672,57 @@ func inspectSourceQuantization(modelDir string, cfg sourceModelConfig) (sourceQu
 			case strings.HasSuffix(name, ".scales"):
 				extractor.Close()
 				return sourceQuantizedKindPrequantized, nil
+			case strings.HasSuffix(name, ".weight_packed"):
+				hasPackedNVFP4 = true
 			case strings.HasSuffix(name, ".weight_scale_inv"):
-				hasScaleInv = true
+				hasFP8Scale = true
+			case strings.HasSuffix(name, ".weight_scale"):
+				hasFP8Scale = true
 			}
 		}
 		extractor.Close()
 	}
 
-	if hasScaleInv {
+	if hasPackedNVFP4 && cfg.hasPackedNVFP4Format() {
+		return sourceQuantizedKindPrequantized, nil
+	}
+
+	if hasFP8Scale {
 		if _, _, ok := cfg.HFFP8WeightBlockSize(); ok {
-			return sourceQuantizedKindHFFP8, nil
+			return sourceQuantizedKindSourceFP8, nil
 		}
 	}
 
 	return sourceQuantizedKindNone, nil
 }
 
+// modelOptQuantConfig represents the hf_quant_config.json format from
+// NVIDIA ModelOpt (TensorRT Model Optimizer).
+type modelOptQuantConfig struct {
+	Producer struct {
+		Name    string `json:"name"`
+		Version string `json:"version"`
+	} `json:"producer"`
+	Quantization struct {
+		QuantAlgo      string   `json:"quant_algo"`
+		GroupSize      int      `json:"group_size"`
+		ExcludeModules []string `json:"exclude_modules"`
+	} `json:"quantization"`
+}
+
+func detectModelOptQuantization(modelDir string) bool {
+	data, err := os.ReadFile(filepath.Join(modelDir, "hf_quant_config.json"))
+	if err != nil {
+		return false
+	}
+	var cfg modelOptQuantConfig
+	if err := json.Unmarshal(data, &cfg); err != nil {
+		return false
+	}
+	return strings.ToUpper(cfg.Quantization.QuantAlgo) == "NVFP4"
+}
+
 func resolveEffectiveQuantization(cfg sourceModelConfig, sourceKind sourceQuantizedKind, requested string) (string, error) {
 	switch sourceKind {
 	case sourceQuantizedKindNone:
@@ -585,10 +732,7 @@ func resolveEffectiveQuantization(cfg sourceModelConfig, sourceKind sourceQuanti
 			return "", fmt.Errorf("cannot requantize already-quantized source model with --quantize %q", requested)
 		}
 		return "", nil
-	case sourceQuantizedKindHFFP8:
-		if requested != "" {
-			return "", fmt.Errorf("cannot requantize already-quantized fp8 source model with --quantize %q", requested)
-		}
+	case sourceQuantizedKindSourceFP8:
 		rows, cols, ok := cfg.HFFP8WeightBlockSize()
 		if !ok {
 			return "", fmt.Errorf("fp8 source model missing weight_block_size metadata")
@@ -596,12 +740,36 @@ func resolveEffectiveQuantization(cfg sourceModelConfig, sourceKind sourceQuanti
 		if rows != 128 || cols != 128 {
 			return "", fmt.Errorf("unsupported fp8 source block size %dx%d", rows, cols)
 		}
+		if requested != "" {
+			requested = normalizeQuantType(requested)
+			switch requested {
+			case "nvfp4", "mxfp4", "mxfp8":
+				return requested, nil
+			default:
+				return "", fmt.Errorf("cannot convert already-quantized fp8 source model with --quantize %q", requested)
+			}
+		}
 		return "mxfp8", nil
 	default:
 		return "", fmt.Errorf("unsupported source quantization kind %q", sourceKind)
 	}
 }
 
+func importQuantizationStatus(sourceKind sourceQuantizedKind, effectiveQuantize string) string {
+	if effectiveQuantize == "" {
+		if sourceKind == sourceQuantizedKindPrequantized {
+			return ", preserving source quantization"
+		}
+		return ""
+	}
+	switch sourceKind {
+	case sourceQuantizedKindSourceFP8:
+		return fmt.Sprintf(", converting source E4M3 block-FP8 to MLX %s", effectiveQuantize)
+	default:
+		return fmt.Sprintf(", quantizing to %s", effectiveQuantize)
+	}
+}
+
 type tensorImportTransform interface {
 	skipTensor(name string) bool
 	transformTensor(td *safetensors.TensorData) ([]*safetensors.TensorData, error)
@@ -666,6 +834,10 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 		return err
 	}
 	sourceQuantMetadata := sourceConfig.QuantMetadata()
+	sourceTensorFiles, err := readSourceTensorFiles(modelDir)
+	if err != nil {
+		return fmt.Errorf("failed to read source tensor index: %w", err)
+	}
 	importTransform, err := newTensorImportTransform(modelDir, sourceConfig)
 	if err != nil {
 		return fmt.Errorf("failed to construct import transform for architecture %q: %w", sourceConfig.Architecture(), err)
@@ -680,16 +852,22 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 	// Readers reference file-backed SectionReaders, so we keep extractors
 	// open until each group is flushed to avoid buffering tensor data in memory.
 	expertGroups := make(map[string][]PackedTensorInput)
+	prequantizedExpertGroups := make(map[string][]*safetensors.TensorData)
 	var expertGroupOrder []string
 
 	// Track open extractors so we can close them after flushing groups
 	var openExtractors []*safetensors.TensorExtractor
+	crossFileExtractors := make(map[string]*safetensors.TensorExtractor)
 	closeExtractors := func() {
 		for _, ext := range openExtractors {
 			ext.Close()
 		}
 		openExtractors = nil
+		for _, ext := range crossFileExtractors {
+			ext.Close()
+		}
+		clear(crossFileExtractors)
 	}
 
 	entries, err := os.ReadDir(modelDir)
@@ -717,11 +895,7 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 		for _, name := range tensorNames {
 			tensorSet[name] = struct{}{}
 		}
-		quantizeMsg := ""
-		if effectiveQuantize != "" {
-			quantizeMsg = fmt.Sprintf(", quantizing to %s", effectiveQuantize)
-		}
-		fn(fmt.Sprintf("importing %s (%d tensors%s)", entry.Name(), len(tensorNames), quantizeMsg))
+		fn(fmt.Sprintf("importing %s (%d tensors%s)", entry.Name(), len(tensorNames), importQuantizationStatus(sourceQuantKind, effectiveQuantize)))
 
 		// Track whether this extractor has expert tensors that need to stay open
 		hasExpertTensors := false
@@ -730,10 +904,10 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 			if importTransform.skipTensor(tensorName) {
 				continue
 			}
-			if shouldSkipSourceCompanion(tensorName, tensorSet) {
+			if shouldSkipSourceCompanion(tensorName, tensorSet, sourceTensorFiles) {
 				continue
 			}
-			sourceFP8ScaleName, hasSourceFP8Scale := sourceFP8Companion(tensorName, tensorSet)
+			sourceFP8ScaleName, hasSourceFP8Scale := sourceFP8Companion(tensorName, tensorSet, sourceTensorFiles)
 
 			td, err := extractor.GetTensor(tensorName)
 			if err != nil {
@@ -742,6 +916,28 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 				return fmt.Errorf("failed to get tensor %s: %w", tensorName, err)
 			}
 
+			if packedCreator != nil {
+				if packedWeightName := strings.TrimSuffix(tensorName, "_packed"); packedWeightName != tensorName {
+					groupPrefix := ExpertGroupPrefix(packedWeightName)
+					if groupPrefix != "" {
+						packedTensors, ok, err := packedNVFP4TensorData(modelDir, extractor, crossFileExtractors, td, tensorName, tensorSet, sourceTensorFiles)
+						if err != nil {
+							extractor.Close()
+							closeExtractors()
+							return err
+						}
+						if ok {
+							hasExpertTensors = true
+							if _, exists := prequantizedExpertGroups[groupPrefix]; !exists {
+								expertGroupOrder = append(expertGroupOrder, groupPrefix)
+							}
+							prequantizedExpertGroups[groupPrefix] = append(prequantizedExpertGroups[groupPrefix], packedTensors...)
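+							// Flushing is deferred until all shards are read: companions
+							// for these experts may live in other safetensors files.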
+							continue
+						}
+					}
+				}
+			}
+
 			if effectiveQuantize == "" {
 				layer, ok, err := createPrequantizedLayer(extractor, td, tensorName, tensorSet, sourceQuantMetadata, createLayer)
 				if err != nil {
@@ -753,6 +949,27 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 					layers = append(layers, layer)
 					continue
 				}
+				layer, ok, err = createPackedNVFP4Layer(modelDir, extractor, crossFileExtractors, td, tensorName, tensorSet, sourceTensorFiles, sourceQuantMetadata, createLayer)
+				if err != nil {
+					extractor.Close()
+					closeExtractors()
+					return err
+				}
+				if ok {
+					layers = append(layers, layer)
+					continue
+				}
+				// Try ModelOpt NVFP4 format (weight_scale + weight_scale_2)
+				layer, ok, err = createModelOptFP4Layer(extractor, td, tensorName, tensorSet, sourceQuantMetadata, createLayer)
+				if err != nil {
+					extractor.Close()
+					closeExtractors()
+					return err
+				}
+				if ok {
+					layers = append(layers, layer)
+					continue
+				}
 			}
 
 			outputTensors, err := importTransform.transformTensor(td)
@@ -767,10 +984,16 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 			// GetTensorQuantization handles mixed-precision (e.g., Q8 for attention, Q4 for FFN)
 			quantizeType := ""
 			switch {
-			case sourceQuantKind == sourceQuantizedKindHFFP8 && hasSourceFP8Scale:
-				quantizeType = "mxfp8"
-			case sourceQuantKind == sourceQuantizedKindHFFP8:
-				quantizeType = ""
+			case sourceQuantKind == sourceQuantizedKindSourceFP8 && hasSourceFP8Scale:
+				quantizeType = importTransform.quantizationType(outTD.Name, outTD.Shape, effectiveQuantize)
+				if quantizeType == "" && effectiveQuantize == "mxfp8" {
+					// Source FP8 tensors are already quantized weights, and small
+					// synthetic tests may not pass the generic import size filter.
+					quantizeType = "mxfp8"
+				}
+				quantizeType = sourceFP8TensorQuantization(outTD.Name, outTD.Shape, quantize, quantizeType)
+			case sourceQuantKind == sourceQuantizedKindSourceFP8:
+				quantizeType = sourceFP8BF16PromotionQuantization(outTD.Name, outTD.Shape, quantize)
 			case effectiveQuantize != "":
 				quantizeType = importTransform.quantizationType(outTD.Name, outTD.Shape, effectiveQuantize)
 			}
@@ -784,15 +1007,15 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 				if quantizeType == "" {
 					extractor.Close()
 					closeExtractors()
-					return fmt.Errorf("source fp8 tensor %s was not scheduled for mxfp8 conversion", tensorName)
+					return fmt.Errorf("source fp8 tensor %s was not scheduled for %s conversion", tensorName, effectiveQuantize)
 				}
-				scaleTD, err := extractor.GetTensor(sourceFP8ScaleName)
+				scaleTD, err := getTensorFromSource(modelDir, extractor, crossFileExtractors, sourceTensorFiles, sourceFP8ScaleName)
 				if err != nil {
 					extractor.Close()
 					closeExtractors()
 					return fmt.Errorf("failed to get fp8 scale tensor %s: %w", sourceFP8ScaleName, err)
 				}
-				reader = buildSourceFP8Reader(outTD, scaleTD.WithName(outTD.Name+".scale_inv"))
+				reader = buildSourceFP8Reader(outTD, scaleTD)
 			}
 
 			// Check if this tensor belongs to an expert group for packing
@@ -843,6 +1066,31 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 	if packedCreator != nil {
 		sort.Strings(expertGroupOrder)
 		for _, groupName := range expertGroupOrder {
+			if tensors := prequantizedExpertGroups[groupName]; len(tensors) > 0 {
+				layer, ok, err := createPackedNVFP4ExpertGroupLayer(groupName, tensors, createLayer)
+				if err != nil {
+					closeExtractors()
+					return fmt.Errorf("failed to create packed prequantized layer for %s: %w", groupName, err)
+				}
+				if ok {
+					layers = append(layers, layer)
+					continue
+				}
+				layer, err = createLayer(
+					safetensors.BuildPackedSafetensorsReaderWithMetadata(tensors, map[string]string{
+						"quant_type": "nvfp4",
+						"group_size": "16",
+					}),
+					"application/vnd.ollama.image.tensor",
+					groupName,
+				)
+				if err != nil {
+					closeExtractors()
+					return fmt.Errorf("failed to create packed prequantized layer for %s: %w", groupName, err)
+				}
+				layers = append(layers, layer)
+				continue
+			}
 			tensors := expertGroups[groupName]
 			fn(fmt.Sprintf("packing %s (%d tensors)", groupName, len(tensors)))
 			layer, err := packedCreator(groupName, tensors)
@@ -904,7 +1152,7 @@ func CreateSafetensorsModel(modelName, modelDir, quantize string, createLayer La
 	return nil
 }
 
-func shouldSkipSourceCompanion(name string, tensorSet map[string]struct{}) bool {
+func shouldSkipSourceCompanion(name string, tensorSet map[string]struct{}, sourceTensorFiles map[string]string) bool {
 	switch {
 	case strings.HasSuffix(name, ".scales"):
 		_, ok := tensorSet[strings.TrimSuffix(name, ".scales")+".weight"]
@@ -915,23 +1163,70 @@ func shouldSkipSourceCompanion(name string, tensorSet map[string]struct{}) bool
 	case strings.HasSuffix(name, ".weight_scale_inv"):
 		_, ok := tensorSet[strings.TrimSuffix(name, "_scale_inv")]
 		return ok
+	case strings.HasSuffix(name, ".weight_scale"):
+		base := strings.TrimSuffix(name, "_scale")
+		if _, ok := tensorSet[base]; ok {
+			return true
+		}
+		if _, ok := sourceTensorFiles[base+"_packed"]; ok {
+			return true
+		}
+		_, ok := tensorSet[base+"_packed"]
+		return ok
+	// ModelOpt NVFP4 companion tensors
+	case strings.HasSuffix(name, ".weight_scale_2"):
+		_, ok := tensorSet[strings.TrimSuffix(name, "_scale_2")]
+		return ok
+	case strings.HasSuffix(name, ".input_scale"):
+		// Activation scale for ModelOpt — not needed for weight-only inference
+		base := strings.TrimSuffix(name, ".input_scale")
+		_, ok := tensorSet[base+".weight"]
+		return ok
+	case strings.HasSuffix(name, ".weight_global_scale"):
+		base := strings.TrimSuffix(name, ".weight_global_scale")
+		if _, ok := sourceTensorFiles[base+".weight_packed"]; ok {
+			return true
+		}
+		_, ok := tensorSet[base+".weight_packed"]
+		return ok
+	case strings.HasSuffix(name, ".input_global_scale"):
+		base := strings.TrimSuffix(name, ".input_global_scale")
+		if _, ok := sourceTensorFiles[base+".weight_packed"]; ok {
+			return true
+		}
+		_, ok := tensorSet[base+".weight_packed"]
+		return ok
 	default:
 		return false
 	}
 }
 
-func sourceFP8Companion(weightName string, tensorSet map[string]struct{}) (scaleName string, ok bool) {
+func sourceFP8Companion(weightName string, tensorSet map[string]struct{}, sourceTensorFiles map[string]string) (scaleName string, ok bool) {
 	if !strings.HasSuffix(weightName, ".weight") {
 		return "", false
 	}
 	scaleName = weightName + "_scale_inv"
-	_, ok = tensorSet[scaleName]
+	if _, ok = tensorSet[scaleName]; ok {
+		return scaleName, true
+	}
+	if _, ok = sourceTensorFiles[scaleName]; ok {
+		return scaleName, true
+	}
+	scaleName = weightName + "_scale"
+	if _, ok = tensorSet[scaleName]; ok {
+		return scaleName, true
+	}
+	_, ok = sourceTensorFiles[scaleName]
 	return scaleName, ok
 }
 
 func buildSourceFP8Reader(weightTD, scaleTD *safetensors.TensorData) io.Reader {
-	return safetensors.BuildPackedSafetensorsReader([]*safetensors.TensorData{weightTD, scaleTD})
+	scaleName := weightTD.Name + ".scale_inv"
+	if strings.HasSuffix(scaleTD.Name, "_scale") && !strings.HasSuffix(scaleTD.Name, "_scale_inv") {
+		scaleName = weightTD.Name + ".scale"
+	}
+	return safetensors.BuildPackedSafetensorsReader([]*safetensors.TensorData{weightTD, scaleTD.WithName(scaleName)})
 }
 
 func createPrequantizedLayer(
@@ -991,3 +1286,475 @@ func prequantizedCompanions(weightName string, tensorSet map[string]struct{}) (s
 	}
 	return scaleName, biasName, true
 }
+
+// createModelOptFP4Layer creates a pre-quantized layer from NVIDIA ModelOpt
+// NVFP4 tensors. The weight (U8) and scale (F8_E4M3 stored as uint8) are
+// packed with the per-tensor global scale (weight_scale_2) into a single
+// safetensors blob. The tensor names are mapped to our standard format:
+//   - source.weight → tensorName (weight data, kept as-is)
+//   - source.weight_scale → tensorName.scale (FP8 E4M3 bytes as uint8)
+//   - source.weight_scale_2 → tensorName.global_scale (F32 scalar)
+func createModelOptFP4Layer(
+	extractor *safetensors.TensorExtractor,
+	td *safetensors.TensorData,
+	tensorName string,
+	tensorSet map[string]struct{},
+	metadata map[string]string,
+	createLayer LayerCreator,
+) (LayerInfo, bool, error) {
+	scaleName, globalScaleName, ok := modelOptFP4Companions(tensorName, tensorSet)
+	if !ok {
+		return LayerInfo{}, false, nil
+	}
+
+	// NVIDIA packs FP4 as U8 (2 values/byte), MLX expects U32 (8 values/uint32).
+	// Repack: view the U8 data as U32 (4 consecutive bytes → 1 uint32) and
+	// adjust the shape from [out, in/2] to [out, in/8].
+	weightTD := td.WithName(tensorName)
+	if strings.ToUpper(weightTD.Dtype) == "U8" && len(weightTD.Shape) == 2 {
+		weightTD.Dtype = "U32"
+		weightTD.Shape = []int32{weightTD.Shape[0], weightTD.Shape[1] / 4}
+	}
+	tensors := []*safetensors.TensorData{weightTD}
+
+	scaleTD, err := extractor.GetTensor(scaleName)
+	if err != nil {
+		return LayerInfo{}, false, fmt.Errorf("failed to get tensor %s: %w", scaleName, err)
+	}
+	// F8_E4M3 scales stored as uint8 — fix the dtype for our loader
+	scaleRenamed := scaleTD.WithName(tensorName + ".scale")
+	if strings.ToUpper(scaleRenamed.Dtype) == "F8_E4M3" {
+		scaleRenamed.Dtype = "U8"
+	}
+	tensors = append(tensors, scaleRenamed)
+
+	if globalScaleName != "" {
+		gsTD, err := extractor.GetTensor(globalScaleName)
+		if err != nil {
+			return LayerInfo{}, false, fmt.Errorf("failed to get tensor %s: %w", globalScaleName, err)
+		}
+		gsTD, err = validateScalarFloat32TensorData(gsTD, tensorName+".global_scale")
+		if err != nil {
+			return LayerInfo{}, false, fmt.Errorf("failed to normalize tensor %s: %w", globalScaleName, err)
+		}
+		tensors = append(tensors, gsTD)
+	}
+
+	// Add nvfp4 quant metadata
+	md := make(map[string]string)
+	for k, v := range metadata {
+		md[k] = v
+	}
+	md["quant_type"] = "nvfp4"
+
+	layer, err := createLayer(
+		safetensors.BuildPackedSafetensorsReaderWithMetadata(tensors, md),
+		"application/vnd.ollama.image.tensor",
+		tensorName,
+	)
+	if err != nil {
+		return LayerInfo{}, false, fmt.Errorf("failed to create ModelOpt FP4 layer for %s: %w", tensorName, err)
+	}
+	return layer, true, nil
+}
+
+// createPackedNVFP4Layer creates a pre-quantized layer from packed NVFP4
+// tensors that use the newer source layout:
+//   - source.weight_packed -> tensorName (U32 repacked weight)
+//   - source.weight_scale -> tensorName.scale
+//   - source.weight_global_scale -> reciprocal stored as tensorName.global_scale
+//   - source.input_global_scale -> ignored for weight-only inference
+func createPackedNVFP4Layer(
+	modelDir string,
+	extractor *safetensors.TensorExtractor,
+	crossFileExtractors map[string]*safetensors.TensorExtractor,
+	td *safetensors.TensorData,
+	tensorName string,
+	tensorSet map[string]struct{},
+	sourceTensorFiles map[string]string,
+	metadata map[string]string,
+	createLayer LayerCreator,
+) (LayerInfo, bool, error) {
+	weightName, scaleName, weightGlobalScaleName, _, ok := packedNVFP4Companions(tensorName, tensorSet, sourceTensorFiles)
+	if !ok {
+		return LayerInfo{}, false, nil
+	}
+
+	weightTD := td.WithName(weightName)
+	if strings.ToUpper(weightTD.Dtype) == "U8" && len(weightTD.Shape) == 2 {
+		weightTD.Dtype = "U32"
+		weightTD.Shape = []int32{weightTD.Shape[0], weightTD.Shape[1] / 4}
+	}
+	tensors := []*safetensors.TensorData{weightTD}
+
+	scaleTD, err := getTensorFromSource(modelDir, extractor, crossFileExtractors, sourceTensorFiles, scaleName)
+	if err != nil {
+		return LayerInfo{}, false, fmt.Errorf("failed to get tensor %s: %w", scaleName, err)
+	}
+	scaleRenamed := scaleTD.WithName(weightName + ".scale")
+	if strings.ToUpper(scaleRenamed.Dtype) == "F8_E4M3" {
+		scaleRenamed.Dtype = "U8"
+	}
+	tensors = append(tensors, scaleRenamed)
+
+	if weightGlobalScaleName != "" {
+		gsTD, err := getTensorFromSource(modelDir, extractor, crossFileExtractors, sourceTensorFiles, weightGlobalScaleName)
+		if err != nil {
+			return LayerInfo{}, false, fmt.Errorf("failed to get tensor %s: %w", weightGlobalScaleName, err)
+		}
+		gsTD, err = invertScalarFloat32TensorData(gsTD, weightName+".global_scale")
+		if err != nil {
+			return LayerInfo{}, false, fmt.Errorf("failed to normalize tensor %s: %w", weightGlobalScaleName, err)
+		}
+		tensors = append(tensors, gsTD)
+	}
+
+	md := make(map[string]string)
+	for k, v := range metadata {
+		md[k] = v
+	}
+	md["quant_type"] = "nvfp4"
+	if _, ok := md["group_size"]; !ok {
+		md["group_size"] = "16"
+	}
+
+	layer, err := createLayer(
+		safetensors.BuildPackedSafetensorsReaderWithMetadata(tensors, md),
+		"application/vnd.ollama.image.tensor",
+		weightName,
+	)
+	if err != nil {
+		return LayerInfo{}, false, fmt.Errorf("failed to create packed NVFP4 layer for %s: %w", tensorName, err)
+	}
+	return layer, true, nil
+}
+
+type stackedTempTensor struct {
+	tensor *safetensors.TensorData
+	file   *os.File
+	path   string
+}
+
+func createPackedNVFP4ExpertGroupLayer(groupName string, tensors []*safetensors.TensorData, createLayer LayerCreator) (LayerInfo, bool, error) {
+	stacked, metadata, ok, err := stackPackedNVFP4ExpertGroup(groupName, tensors)
+	if err != nil || !ok {
+		return LayerInfo{}, ok, err
+	}
+	defer func() {
+		for _, td := range stacked {
+			if td.file != nil {
+				td.file.Close()
+			}
+			if td.path != "" {
+				os.Remove(td.path)
+			}
+		}
+	}()
+
+	packed := make([]*safetensors.TensorData, 0, len(stacked))
+	for _, td := range stacked {
+		packed = append(packed, td.tensor)
+	}
+	layer, err := createLayer(
+		safetensors.BuildPackedSafetensorsReaderWithMetadata(packed, metadata),
+		"application/vnd.ollama.image.tensor",
+		groupName,
+	)
+	if err != nil {
+		return LayerInfo{}, true, err
+	}
+	return layer, true, nil
+}
+
+func stackPackedNVFP4ExpertGroup(groupName string, tensors []*safetensors.TensorData) ([]stackedTempTensor, map[string]string, bool, error) {
+	if !strings.HasSuffix(groupName, ".experts") {
+		return nil, nil, false, nil
+	}
+
+	type namedExpertTensor struct {
+		expert int
+		name   string
+		td     *safetensors.TensorData
+	}
+
+	grouped := make(map[string][]namedExpertTensor)
+	for _, td := range tensors {
+		suffix := strings.TrimPrefix(td.Name, groupName)
+		m := prequantizedExpertSuffixRegexp.FindStringSubmatch(suffix)
+		if m == nil {
+			return nil, nil, false, nil
+		}
+		expert, err := strconv.Atoi(m[1])
+		if err != nil {
+			return nil, nil, false, fmt.Errorf("invalid expert index in %q: %w", td.Name, err)
+		}
+		grouped[m[2]] = append(grouped[m[2]], namedExpertTensor{
+			expert: expert,
+			name:   td.Name,
+			td:     td,
+		})
+	}
+	if len(grouped) == 0 {
+		return nil, nil, false, nil
+	}
+
+	groupBase := strings.TrimSuffix(groupName, ".experts") + ".switch_mlp."
+	names := make([]string, 0, len(grouped))
+	for name := range grouped {
+		names = append(names, name)
+	}
+	sort.Strings(names)
+
+	var stacked []stackedTempTensor
+	metadata := map[string]string{
+		"quant_type": "nvfp4",
+		"group_size": "16",
+	}
+	cleanup := func() {
+		for _, td := range stacked {
+			if td.file != nil {
+				td.file.Close()
+			}
+			if td.path != "" {
+				os.Remove(td.path)
+			}
+		}
+	}
+
+	for _, name := range names {
+		if strings.HasSuffix(name, ".input_global_scale") {
+			continue
+		}
+		experts := grouped[name]
+		sort.Slice(experts, func(i, j int) bool { return experts[i].expert < experts[j].expert })
+		if len(experts) == 0 {
+			continue
+		}
+
+		stackedName := groupBase + name
+		baseShape := append([]int32(nil), experts[0].td.Shape...)
+		stackedShape := make([]int32, 0, len(baseShape)+1)
+		stackedShape = append(stackedShape, int32(len(experts)))
+		switch {
+		case strings.HasSuffix(name, ".global_scale"), strings.HasSuffix(name, ".input_global_scale"):
+			stackedShape = append(stackedShape, 1, 1)
+		default:
+			stackedShape = append(stackedShape, baseShape...)
+		}
+
+		f, err := os.CreateTemp("", "ollama-packed-nvfp4-*.bin")
+		if err != nil {
+			cleanup()
+			return nil, nil, false, fmt.Errorf("create temp tensor for %s: %w", stackedName, err)
+		}
+
+		var size int64
+		for _, expert := range experts {
+			if expert.td.Dtype != experts[0].td.Dtype || !slices.Equal(expert.td.Shape, experts[0].td.Shape) {
+				f.Close()
+				os.Remove(f.Name())
+				cleanup()
+				return nil, nil, false, fmt.Errorf("mismatched expert tensor layout in %s", stackedName)
+			}
+			written, err := io.Copy(f, expert.td.Reader())
+			if err != nil {
+				f.Close()
+				os.Remove(f.Name())
+				cleanup()
+				return nil, nil, false, fmt.Errorf("stack tensor %s: %w", expert.name, err)
+			}
+			size += written
+		}
+
+		stacked = append(stacked, stackedTempTensor{
+			tensor: safetensors.NewTensorDataFromReaderAt(stackedName, experts[0].td.Dtype, stackedShape, f, size),
+			file:   f,
+			path:   f.Name(),
+		})
+
+		if strings.HasSuffix(name, ".weight") {
+			metadata[stackedName+".quant_type"] = "nvfp4"
+			metadata[stackedName+".group_size"] = "16"
+		}
+	}
+
+	return stacked, metadata, true, nil
+}
+
+func packedNVFP4TensorData(
+	modelDir string,
+	extractor *safetensors.TensorExtractor,
+	crossFileExtractors map[string]*safetensors.TensorExtractor,
+	td *safetensors.TensorData,
+	tensorName string,
+	tensorSet map[string]struct{},
+	sourceTensorFiles map[string]string,
+) ([]*safetensors.TensorData, bool, error) {
+	weightName, scaleName, weightGlobalScaleName, _, ok := packedNVFP4Companions(tensorName, tensorSet, sourceTensorFiles)
+	if !ok {
+		return nil, false, nil
+	}
+
+	weightTD := td.WithName(weightName)
+	if strings.ToUpper(weightTD.Dtype) == "U8" && len(weightTD.Shape) == 2 {
+		weightTD.Dtype = "U32"
+		weightTD.Shape = []int32{weightTD.Shape[0], weightTD.Shape[1] / 4}
+	}
+	tensors := []*safetensors.TensorData{weightTD}
+
+	scaleTD, err := getTensorFromSource(modelDir, extractor, crossFileExtractors, sourceTensorFiles, scaleName)
+	if err != nil {
+		return nil, false, fmt.Errorf("failed to get tensor %s: %w", scaleName, err)
+	}
+	scaleRenamed := scaleTD.WithName(weightName + ".scale")
+	if strings.ToUpper(scaleRenamed.Dtype) == "F8_E4M3" {
+		scaleRenamed.Dtype = "U8"
+	}
+	tensors = append(tensors, scaleRenamed)
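+
+	// The source's weight_global_scale is the encode-time multiplier; the
+	// reciprocal is what gets stored as .global_scale below.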
+
+	if weightGlobalScaleName != "" {
+		gsTD, err := getTensorFromSource(modelDir, extractor, crossFileExtractors, sourceTensorFiles, weightGlobalScaleName)
+		if err != nil {
+			return nil, false, fmt.Errorf("failed to get tensor %s: %w", weightGlobalScaleName, err)
+		}
+		gsTD, err = invertScalarFloat32TensorData(gsTD, weightName+".global_scale")
+		if err != nil {
+			return nil, false, fmt.Errorf("failed to normalize tensor %s: %w", weightGlobalScaleName, err)
+		}
+		tensors = append(tensors, gsTD)
+	}
+
+	return tensors, true, nil
+}
+
+func validateScalarFloat32TensorData(td *safetensors.TensorData, name string) (*safetensors.TensorData, error) {
+	if td == nil {
+		return nil, nil
+	}
+	if strings.ToUpper(td.Dtype) != "F32" {
+		return nil, fmt.Errorf("expected F32 tensor, got %s", td.Dtype)
+	}
+	n := int32(1)
+	for _, dim := range td.Shape {
+		n *= dim
+	}
+	if n != 1 {
+		return nil, fmt.Errorf("expected scalar F32 tensor, got shape %v", td.Shape)
+	}
+	return td.WithName(name), nil
+}
+
+func invertScalarFloat32TensorData(td *safetensors.TensorData, name string) (*safetensors.TensorData, error) {
+	td, err := validateScalarFloat32TensorData(td, name)
+	if err != nil {
+		return nil, err
+	}
+	raw, err := io.ReadAll(td.Reader())
+	if err != nil {
+		return nil, err
+	}
+	if len(raw)%4 != 0 {
+		return nil, fmt.Errorf("invalid F32 tensor byte length %d", len(raw))
+	}
+	out := make([]byte, len(raw))
+	for i := 0; i < len(raw); i += 4 {
+		v := math.Float32frombits(binary.LittleEndian.Uint32(raw[i : i+4]))
+		if v == 0 {
+			return nil, fmt.Errorf("cannot invert zero F32 scale")
+		}
+		binary.LittleEndian.PutUint32(out[i:i+4], math.Float32bits(1/v))
+	}
+	return safetensors.NewTensorDataFromBytes(name, td.Dtype, td.Shape, out), nil
+}
+
+// modelOptFP4Companions finds the companion tensors for a ModelOpt NVFP4
+// quantized weight: weight_scale (per-group FP8 E4M3 scales) and optional
+// weight_scale_2 (per-tensor global scale).
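+// For example, "linear.weight" pairs with "linear.weight_scale" and, when
+// present, "linear.weight_scale_2".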
+func modelOptFP4Companions(weightName string, tensorSet map[string]struct{}) (scaleName, globalScaleName string, ok bool) {
+	if !strings.HasSuffix(weightName, ".weight") {
+		return "", "", false
+	}
+
+	scaleName = weightName + "_scale"
+	if _, ok := tensorSet[scaleName]; !ok {
+		return "", "", false
+	}
+
+	globalScaleName = weightName + "_scale_2"
+	if _, ok := tensorSet[globalScaleName]; !ok {
+		globalScaleName = ""
+	}
+	return scaleName, globalScaleName, true
+}
+
+func packedNVFP4Companions(weightPackedName string, tensorSet map[string]struct{}, sourceTensorFiles map[string]string) (weightName, scaleName, weightGlobalScaleName, inputGlobalScaleName string, ok bool) {
+	if !strings.HasSuffix(weightPackedName, ".weight_packed") {
+		return "", "", "", "", false
+	}
+
+	weightName = strings.TrimSuffix(weightPackedName, "_packed")
+	scaleName = strings.TrimSuffix(weightPackedName, "_packed") + "_scale"
+	if _, ok := tensorSet[scaleName]; !ok {
+		if _, ok := sourceTensorFiles[scaleName]; !ok {
+			return "", "", "", "", false
+		}
+	}
+
+	weightGlobalScaleName = strings.TrimSuffix(weightPackedName, "_packed") + "_global_scale"
+	if _, ok := tensorSet[weightGlobalScaleName]; !ok {
+		if _, ok := sourceTensorFiles[weightGlobalScaleName]; !ok {
+			weightGlobalScaleName = ""
+		}
+	}
+
+	inputGlobalScaleName = strings.TrimSuffix(weightPackedName, ".weight_packed") + ".input_global_scale"
+	if _, ok := tensorSet[inputGlobalScaleName]; !ok {
+		if _, ok := sourceTensorFiles[inputGlobalScaleName]; !ok {
+			inputGlobalScaleName = ""
+		}
+	}
+
+	return weightName, scaleName, weightGlobalScaleName, inputGlobalScaleName, true
+}
+
+func readSourceTensorFiles(modelDir string) (map[string]string, error) {
+	indexPath := filepath.Join(modelDir, "model.safetensors.index.json")
+	data, err := os.ReadFile(indexPath)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, nil
+		}
+		return nil, err
+	}
+	var index struct {
+		WeightMap map[string]string `json:"weight_map"`
+	}
+	if err := json.Unmarshal(data, &index); err != nil {
+		return nil, err
+	}
+	return index.WeightMap, nil
+}
+
+func getTensorFromSource(modelDir string, current *safetensors.TensorExtractor, cache map[string]*safetensors.TensorExtractor, sourceTensorFiles map[string]string, name string) (*safetensors.TensorData, error) {
+	if td, err := current.GetTensor(name); err == nil {
+		return td, nil
+	}
+	if sourceTensorFiles == nil {
+		return nil, fmt.Errorf("tensor %s not found in current shard and no source index available", name)
+	}
+	fileName, ok := sourceTensorFiles[name]
+	if !ok {
+		return nil, fmt.Errorf("tensor %s not found in source index", name)
+	}
+	ext := cache[fileName]
+	if ext == nil {
+		path := filepath.Join(modelDir, fileName)
+		var err error
+		ext, err = safetensors.OpenForExtraction(path)
+		if err != nil {
+			return nil, err
+		}
+		cache[fileName] = ext
+	}
+	return ext.GetTensor(name)
+}
diff --git a/x/create/create_test.go b/x/create/create_test.go
index 0acbb6613..3dfd18756 100644
--- a/x/create/create_test.go
+++ b/x/create/create_test.go
@@ -4,7 +4,9 @@ import (
 	"bytes"
 	"encoding/binary"
 	"encoding/json"
+	"fmt"
 	"io"
+	"math"
 	"os"
 	"path/filepath"
 	"slices"
@@ -59,6 +61,43 @@ func TestIsTensorModelDir(t *testing.T) {
 	}
 }
 
+func TestValidateScalarFloat32TensorData(t *testing.T) {
+	td := st.NewTensorDataFromBytes("linear.weight_scale_2", "F32", []int32{}, encodeFloat32s(2))
+
+	got, err := validateScalarFloat32TensorData(td, "linear.weight.global_scale")
+	if err != nil {
+		t.Fatalf("validateScalarFloat32TensorData returned error: %v", err)
+	}
+
+	if got.Name != "linear.weight.global_scale" {
+		t.Fatalf("name = %q, want %q", got.Name, "linear.weight.global_scale")
+	}
+	if got.Dtype != "F32" {
+		t.Fatalf("dtype = %q, want F32", got.Dtype)
+	}
+	if len(got.Shape) != 0 {
+		t.Fatalf("shape = %v, want scalar", got.Shape)
+	}
+}
+
+func TestValidateScalarFloat32TensorDataRejectsNonScalar(t *testing.T) {
+	td := st.NewTensorDataFromBytes("linear.weight_scale_2", "F32", []int32{2}, encodeFloat32s(2, 4))
+
+	_, err := validateScalarFloat32TensorData(td, "linear.weight.global_scale")
+	if err == nil || !strings.Contains(err.Error(), "expected scalar F32 tensor") {
+		t.Fatalf("validateScalarFloat32TensorData error = %v, want scalar-shape failure", err)
+	}
+}
+
+func TestInvertScalarFloat32TensorDataRejectsNonF32(t *testing.T) {
+	td := st.NewTensorDataFromBytes("linear.weight_global_scale", "BF16", []int32{}, []byte{0, 0})
+
+	_, err := invertScalarFloat32TensorData(td, "linear.weight.global_scale")
+	if err == nil || !strings.Contains(err.Error(), "expected F32 tensor") {
+		t.Fatalf("invertScalarFloat32TensorData error = %v, want dtype failure", err)
+	}
+}
+
 func TestIsSafetensorsModelDir(t *testing.T) {
 	tests := []struct {
 		name string
@@ -246,6 +285,41 @@ func readSingleTensorRaw(t *testing.T, data []byte) []byte {
 	return nil
 }
 
+func encodeFloat32s(vals ...float32) []byte {
+	raw := make([]byte, 4*len(vals))
+	for i, v := range vals {
+		binary.LittleEndian.PutUint32(raw[i*4:(i+1)*4], math.Float32bits(v))
+	}
+	return raw
+}
+
+func readPackedTensorRaw(t *testing.T, data []byte, tensorName string) []byte {
+	t.Helper()
+
+	var headerSize uint64
+	if err := binary.Read(bytes.NewReader(data[:8]), binary.LittleEndian, &headerSize); err != nil {
+		t.Fatalf("failed to read header size: %v", err)
+	}
+
+	var header map[string]struct {
+		Dtype       string  `json:"dtype"`
+		Shape       []int32 `json:"shape"`
+		DataOffsets [2]int  `json:"data_offsets"`
+	}
+	if err := json.Unmarshal(data[8:8+headerSize], &header); err != nil {
+		t.Fatalf("failed to parse header: %v", err)
+	}
+
+	info, ok := header[tensorName]
+	if !ok {
+		t.Fatalf("tensor %q not found in header", tensorName)
+	}
+
+	start := 8 + int(headerSize) + info.DataOffsets[0]
+	end := 8 + int(headerSize) + info.DataOffsets[1]
+	return data[start:end]
+}
+
 func readSafetensorsHeaderNames(t *testing.T, data []byte) []string {
 	t.Helper()
@@ -612,10 +686,22 @@ func TestCreateSafetensorsModel_HFFP8AutoConvertsToMXFP8(t *testing.T) {
 	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
 
-	if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, func(string) {}); err != nil {
+	var statusMessages []string
+	progressFn := func(status string) {
+		statusMessages = append(statusMessages, status)
+	}
+
+	if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn); err != nil {
 		t.Fatalf("CreateSafetensorsModel failed: %v", err)
 	}
 
+	if len(statusMessages) == 0 {
+		t.Fatal("no status messages received")
+	}
+	if got, want := statusMessages[0], "importing model.safetensors (4 tensors, converting source E4M3 block-FP8 to MLX mxfp8)"; got != want {
+		t.Fatalf("status = %q, want %q", got, want)
+	}
+
 	if got := quantizeByName["linear.weight"]; got != "mxfp8" {
 		t.Fatalf("linear.weight quantization = %q, want %q", got, "mxfp8")
 	}
@@ -643,6 +729,166 @@ func TestCreateSafetensorsModel_HFFP8AutoConvertsToMXFP8(t *testing.T) {
 	}
 }
 
+func TestCreateSafetensorsModel_CompressedTensorsFP8WeightScale(t *testing.T) {
+	dir := t.TempDir()
+
+	configJSON := `{
+		"model_type": "test",
+		"architectures": ["TestModel"],
+		"compression_config": {
+			"quant_method": "compressed-tensors",
+			"format": "float-quantized",
+			"config_groups": {
+				"group_0": {
+					"format": "float-quantized",
+					"weights": {
+						"type": "float",
+						"num_bits": 8,
+						"block_structure": [128, 128]
+					}
+				}
+			}
+		}
+	}`
+	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
+		t.Fatalf("failed to write config.json: %v", err)
+	}
+
+	createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
+		st.NewTensorDataFromBytes("linear.weight", "F8_E4M3", []int32{2, 2}, []byte{1, 2, 3, 4}),
+		st.NewTensorDataFromBytes("linear.weight_scale", "BF16", []int32{1, 1}, make([]byte, 2)),
+		st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{2}, make([]byte, 4)),
+	})
+
+	quantizeByName := make(map[string]string)
+	headerNamesByName := make(map[string][]string)
+
+	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
+		if _, err := io.ReadAll(r); err != nil {
+			return LayerInfo{}, err
+		}
+		return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
+	}
+	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
+		data, err := io.ReadAll(r)
+		if err != nil {
+			return nil, err
+		}
+		quantizeByName[name] = quantize
+		headerNamesByName[name] = readSafetensorsHeaderNames(t, data)
+		return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
+	}
+	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
+
+	var statusMessages []string
+	progressFn := func(status string) {
+		statusMessages = append(statusMessages, status)
+	}
+
+	if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn); err != nil {
+		t.Fatalf("CreateSafetensorsModel failed: %v", err)
+	}
+	if len(statusMessages) == 0 {
+		t.Fatal("no status messages received")
+	}
+	if got, want := statusMessages[0], "importing model.safetensors (3 tensors, converting source E4M3 block-FP8 to MLX mxfp8)"; got != want {
+		t.Fatalf("status = %q, want %q", got, want)
+	}
+	if got := quantizeByName["linear.weight"]; got != "mxfp8" {
+		t.Fatalf("linear.weight quantization = %q, want mxfp8", got)
+	}
+	if _, ok := quantizeByName["linear.weight_scale"]; ok {
+		t.Fatal("linear.weight_scale should not be imported as a standalone tensor")
+	}
+	if got := headerNamesByName["linear.weight"]; !slices.Equal(got, []string{"linear.weight", "linear.weight.scale"}) {
+		t.Fatalf("linear.weight blob tensors = %v, want %v", got, []string{"linear.weight", "linear.weight.scale"})
+	}
+}
+
+func TestCreateSafetensorsModel_HFFP8SourceCanConvertToNVFP4(t *testing.T) {
+	dir := t.TempDir()
+
+	configJSON := `{
+		"model_type": "test",
+		"architectures": ["TestModel"],
+		"quantization_config": {"quant_method": "fp8", "weight_block_size": [128, 128]}
+	}`
+	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
+		t.Fatalf("failed to write config.json: %v", err)
+	}
+
+	createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
+		st.NewTensorDataFromBytes("linear.weight", "F8_E4M3", []int32{128, 128}, make([]byte, 128*128)),
+		st.NewTensorDataFromBytes("linear.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
+		st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.down_proj.weight", "F8_E4M3", []int32{128, 128}, make([]byte, 128*128)),
+		st.NewTensorDataFromBytes("model.layers.0.mlp.experts.0.down_proj.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
+		st.NewTensorDataFromBytes("model.layers.0.self_attn.q_proj.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
+		st.NewTensorDataFromBytes("model.embed_tokens.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
+		st.NewTensorDataFromBytes("lm_head.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
+		st.NewTensorDataFromBytes("model.layers.0.mlp.gate.weight", "BF16", []int32{128, 128}, make([]byte, 128*128*2)),
+		st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{128}, make([]byte, 256)),
+	})
+
+	quantizeByName := make(map[string]string)
+	headerNamesByName := make(map[string][]string)
+
+	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
+		if _, err := io.ReadAll(r); err != nil {
+			return LayerInfo{}, err
+		}
+		return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
+	}
+	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
+		data, err := io.ReadAll(r)
+		if err != nil {
+			return nil, err
+		}
+		quantizeByName[name] = quantize
+		headerNamesByName[name] = readSafetensorsHeaderNames(t, data)
+		return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
+	}
+	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
+
+	var statusMessages []string
+	progressFn := func(status string) {
+		statusMessages = append(statusMessages, status)
+	}
+
+	if err := CreateSafetensorsModel("test-model", dir, "nvfp4", createLayer, createTensorLayer, writeManifest, progressFn); err != nil {
+		t.Fatalf("CreateSafetensorsModel failed: %v", err)
+	}
+	if len(statusMessages) == 0 {
+		t.Fatal("no status messages received")
+	}
+	if got, want := statusMessages[0], "importing model.safetensors (9 tensors, converting source E4M3 block-FP8 to MLX nvfp4)"; got != want {
+		t.Fatalf("status = %q, want %q", got, want)
+	}
+	if got := quantizeByName["linear.weight"]; got != "nvfp4" {
+		t.Fatalf("linear.weight quantization = %q, want nvfp4", got)
+	}
+	if got := quantizeByName["model.layers.0.mlp.experts.0.down_proj.weight"]; got != "mxfp8" {
+		t.Fatalf("source fp8 down_proj quantization = %q, want mxfp8", got)
+	}
+	for _, name := range []string{
+		"model.layers.0.self_attn.q_proj.weight",
+		"model.embed_tokens.weight",
+		"lm_head.weight",
+	} {
+		if got := quantizeByName[name]; got != "mxfp8" {
+			t.Fatalf("%s quantization = %q, want mxfp8", name, got)
+		}
+	}
+	if got := quantizeByName["model.layers.0.mlp.gate.weight"]; got != "" {
+		t.Fatalf("router gate quantization = %q, want empty", got)
+	}
+	if got := quantizeByName["norm.weight"]; got != "" {
+		t.Fatalf("norm.weight quantization = %q, want empty", got)
+	}
+	if got := headerNamesByName["linear.weight"]; !slices.Equal(got, []string{"linear.weight", "linear.weight.scale_inv"}) {
+		t.Fatalf("linear.weight blob tensors = %v, want %v", got, []string{"linear.weight", "linear.weight.scale_inv"})
+	}
+}
+
 func TestCreateSafetensorsModel_RejectsRequantizingQuantizedSources(t *testing.T) {
 	tests := []struct {
 		name string
@@ -670,7 +916,20 @@ func TestCreateSafetensorsModel_RejectsRequantizingQuantizedSources(t *testing.T
 				st.NewTensorDataFromBytes("linear.weight", "F8_E4M3", []int32{2, 2}, []byte{1, 2, 3, 4}),
 				st.NewTensorDataFromBytes("linear.weight_scale_inv", "BF16", []int32{1, 1}, make([]byte, 2)),
 			},
-			wantErr: `cannot requantize already-quantized fp8 source model with --quantize "int4"`,
+			wantErr: `cannot convert already-quantized fp8 source model with --quantize "int4"`,
+		},
+		{
+			name: "packed nvfp4 source",
+			configJSON: `{
+				"model_type": "test",
+				"architectures": ["TestModel"],
+				"compression_config": {"format": "nvfp4-pack-quantized"}
+			}`,
+			tensors: []*st.TensorData{
+				st.NewTensorDataFromBytes("linear.weight_packed", "U8", []int32{16, 8}, make([]byte, 128)),
+				st.NewTensorDataFromBytes("linear.weight_scale", "F8_E4M3", []int32{16, 1}, make([]byte, 16)),
+			},
+			wantErr: `cannot requantize already-quantized source model with --quantize "int4"`,
 		},
 	}
@@ -701,6 +960,317 @@ func TestCreateSafetensorsModel_RejectsRequantizingQuantizedSources(t *testing.T
 	}
 }
 
+func TestCreateSafetensorsModel_PackedNVFP4PreservesSourceLayout(t *testing.T) {
+	dir := t.TempDir()
+
+	configJSON := `{
+		"model_type": "test",
+		"architectures": ["TestModel"],
+		"compression_config": {"format": "nvfp4-pack-quantized"}
+	}`
+	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
+		t.Fatalf("failed to write config.json: %v", err)
+	}
+
+	createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
+		st.NewTensorDataFromBytes("linear.weight_packed", "U8", []int32{16, 8}, make([]byte, 128)),
+		st.NewTensorDataFromBytes("linear.weight_scale", "F8_E4M3", []int32{16, 1}, make([]byte, 16)),
+		st.NewTensorDataFromBytes("linear.weight_global_scale", "F32", []int32{}, encodeFloat32s(4)),
+		st.NewTensorDataFromBytes("linear.input_global_scale", "F32", []int32{}, encodeFloat32s(8)),
+		st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{16}, make([]byte, 32)),
+	})
+
+	var statusMessages []string
+	layerHeaders := make(map[string]map[string]json.RawMessage)
+	layerData := make(map[string][]byte)
+	var tensorLayerNames []string
+
+	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
+		data, err := io.ReadAll(r)
+		if err != nil {
+			return LayerInfo{}, err
+		}
+		if mediaType == "application/vnd.ollama.image.tensor" {
+			if len(data) < 8 {
+				return LayerInfo{}, io.ErrUnexpectedEOF
+			}
+			var headerSize uint64
+			if err := binary.Read(bytes.NewReader(data[:8]), binary.LittleEndian, &headerSize); err != nil {
+				return LayerInfo{}, err
+			}
+			var header map[string]json.RawMessage
+			if err := json.Unmarshal(data[8:8+headerSize], &header); err != nil {
+				return LayerInfo{}, err
+			}
+			layerHeaders[name] = header
+			layerData[name] = data
+		}
+		return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
+	}
+	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
+		if _, err := io.ReadAll(r); err != nil {
+			return nil, err
+		}
+		tensorLayerNames = append(tensorLayerNames, name)
+		return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
+	}
+	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
+	progressFn := func(status string) { statusMessages = append(statusMessages, status) }
+
+	if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, progressFn); err != nil {
+		t.Fatalf("CreateSafetensorsModel failed: %v", err)
+	}
+
+	if len(statusMessages) == 0 {
+		t.Fatal("no status messages received")
+	}
+	if got, want := statusMessages[0], "importing model.safetensors (5 tensors, preserving source quantization)"; got != want {
+		t.Fatalf("status = %q, want %q", got, want)
+	}
+
+	if slices.Contains(tensorLayerNames, "linear.weight_scale") || slices.Contains(tensorLayerNames, "linear.weight_global_scale") || slices.Contains(tensorLayerNames, "linear.input_global_scale") {
+		t.Fatalf("packed nvfp4 companions unexpectedly emitted as standalone tensor layers: %v", tensorLayerNames)
+	}
+
+	packedHeader := layerHeaders["linear.weight"]
+	if packedHeader == nil {
+		t.Fatalf("missing packed layer header for linear.weight")
+	}
+	for _, key := range []string{
+		"linear.weight",
+		"linear.weight.scale",
+		"linear.weight.global_scale",
+	} {
+		if _, ok := packedHeader[key]; !ok {
+			t.Fatalf("packed header missing %s: %v", key, packedHeader)
+		}
+	}
+	if _, ok := packedHeader["linear.weight.input_global_scale"]; ok {
+		t.Fatalf("packed header unexpectedly includes input_global_scale: %v", packedHeader)
+	}
+	globalRaw := readPackedTensorRaw(t, layerData["linear.weight"], "linear.weight.global_scale")
+	if got := math.Float32frombits(binary.LittleEndian.Uint32(globalRaw)); got != 0.25 {
+		t.Fatalf("linear.weight.global_scale = %v, want 0.25", got)
+	}
+
+	var metadata map[string]string
+	if metaRaw, ok := packedHeader["__metadata__"]; ok {
+		if err := json.Unmarshal(metaRaw, &metadata); err != nil {
+			t.Fatalf("failed to parse metadata: %v", err)
+		}
+	}
+	if metadata["quant_type"] != "nvfp4" {
+		t.Fatalf("quant_type = %q, want %q", metadata["quant_type"], "nvfp4")
+	}
+	if metadata["group_size"] != "16" {
+		t.Fatalf("group_size = %q, want %q", metadata["group_size"], "16")
+	}
+}
+
+func TestCreateSafetensorsModel_PackedNVFP4CrossShardCompanions(t *testing.T) {
+	dir := t.TempDir()
+
+	configJSON := `{
+		"model_type": "test",
+		"architectures": ["TestModel"],
+		"compression_config": {"format": "nvfp4-pack-quantized"}
+	}`
+	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
+		t.Fatalf("failed to write config.json: %v", err)
+	}
+
+	createTestSafetensors(t, filepath.Join(dir, "model-00001-of-00002.safetensors"), []*st.TensorData{
+		st.NewTensorDataFromBytes("linear.weight_packed", "U8", []int32{16, 8}, make([]byte, 128)),
+		st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{16}, make([]byte, 32)),
+	})
+	createTestSafetensors(t, filepath.Join(dir, "model-00002-of-00002.safetensors"), []*st.TensorData{
+		st.NewTensorDataFromBytes("linear.weight_scale", "F8_E4M3", []int32{16, 1}, make([]byte, 16)),
+		st.NewTensorDataFromBytes("linear.weight_global_scale", "F32", []int32{}, encodeFloat32s(2)),
+		st.NewTensorDataFromBytes("linear.input_global_scale", "F32", []int32{}, encodeFloat32s(8)),
+	})
+	indexJSON := `{
+		"metadata": {"total_size": 152},
+		"weight_map": {
+			"linear.weight_packed": "model-00001-of-00002.safetensors",
+			"norm.weight": "model-00001-of-00002.safetensors",
+			"linear.weight_scale": "model-00002-of-00002.safetensors",
+			"linear.weight_global_scale": "model-00002-of-00002.safetensors",
+			"linear.input_global_scale": "model-00002-of-00002.safetensors"
+		}
+	}`
+	if err := os.WriteFile(filepath.Join(dir, "model.safetensors.index.json"), []byte(indexJSON), 0o644); err != nil {
+		t.Fatalf("failed to write index: %v", err)
+	}
+
+	layerHeaders := make(map[string]map[string]json.RawMessage)
+	var tensorLayerNames []string
+
+	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
+		data, err := io.ReadAll(r)
+		if err != nil {
+			return LayerInfo{}, err
+		}
+		if mediaType == "application/vnd.ollama.image.tensor" {
+			var headerSize uint64
+			if err := binary.Read(bytes.NewReader(data[:8]), binary.LittleEndian, &headerSize); err != nil {
+				return LayerInfo{}, err
+			}
+			var header map[string]json.RawMessage
+			if err := json.Unmarshal(data[8:8+headerSize], &header); err != nil {
+				return LayerInfo{}, err
+			}
+			layerHeaders[name] = header
+		}
+		return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
+	}
+	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
+		if _, err := io.ReadAll(r); err != nil {
+			return nil, err
+		}
+		tensorLayerNames = append(tensorLayerNames, name)
+		return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
+	}
+	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
+
+	packedCreator := func(groupName string, tensors []PackedTensorInput) (LayerInfo, error) {
+		return LayerInfo{}, fmt.Errorf("unexpected packedCreator call for %s", groupName)
+	}
+	if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, func(string) {}, packedCreator); err != nil {
+		t.Fatalf("CreateSafetensorsModel failed: %v", err)
+	}
+
+	if slices.Contains(tensorLayerNames, "linear.weight_packed") || slices.Contains(tensorLayerNames, "linear.weight_scale") || slices.Contains(tensorLayerNames, "linear.weight_global_scale") || slices.Contains(tensorLayerNames, "linear.input_global_scale") {
+		t.Fatalf("packed nvfp4 tensors unexpectedly emitted as standalone tensor layers: %v", tensorLayerNames)
+	}
+
+	packedHeader := layerHeaders["linear.weight"]
+	if packedHeader == nil {
+		t.Fatalf("missing packed layer header for linear.weight")
+	}
+	for _, key := range []string{
+		"linear.weight",
+		"linear.weight.scale",
+		"linear.weight.global_scale",
+	} {
+		if _, ok := packedHeader[key]; !ok {
+			t.Fatalf("packed header missing %s: %v", key, packedHeader)
+		}
+	}
+	if _, ok := packedHeader["linear.weight.input_global_scale"]; ok {
+		t.Fatalf("packed header unexpectedly includes input_global_scale: %v", packedHeader)
+	}
+}
+
+func TestCreateSafetensorsModel_PackedNVFP4StacksExperts(t *testing.T) {
+	dir := t.TempDir()
+
+	configJSON := `{
+		"model_type": "test",
+		"architectures": ["TestModel"],
+		"compression_config": {"format": "nvfp4-pack-quantized"}
+	}`
+	if err := os.WriteFile(filepath.Join(dir, "config.json"), []byte(configJSON), 0o644); err != nil {
+		t.Fatalf("failed to write config.json: %v", err)
+	}
+
+	createTestSafetensors(t, filepath.Join(dir, "model.safetensors"), []*st.TensorData{
+		st.NewTensorDataFromBytes("model.layers.1.mlp.experts.0.gate_proj.weight_packed", "U8", []int32{2, 8}, make([]byte, 16)),
+		st.NewTensorDataFromBytes("model.layers.1.mlp.experts.0.gate_proj.weight_scale", "F8_E4M3", []int32{2, 1}, make([]byte, 2)),
+		st.NewTensorDataFromBytes("model.layers.1.mlp.experts.0.gate_proj.weight_global_scale", "F32", []int32{1}, encodeFloat32s(2)),
+		st.NewTensorDataFromBytes("model.layers.1.mlp.experts.0.gate_proj.input_global_scale", "F32", []int32{1}, encodeFloat32s(32)),
+		st.NewTensorDataFromBytes("model.layers.1.mlp.experts.1.gate_proj.weight_packed", "U8", []int32{2, 8}, make([]byte, 16)),
+		st.NewTensorDataFromBytes("model.layers.1.mlp.experts.1.gate_proj.weight_scale", "F8_E4M3", []int32{2, 1}, make([]byte, 2)),
+		st.NewTensorDataFromBytes("model.layers.1.mlp.experts.1.gate_proj.weight_global_scale", "F32", []int32{1}, encodeFloat32s(4)),
+		st.NewTensorDataFromBytes("model.layers.1.mlp.experts.1.gate_proj.input_global_scale", "F32", []int32{1}, encodeFloat32s(64)),
+		st.NewTensorDataFromBytes("norm.weight", "BF16", []int32{2}, make([]byte, 4)),
+	})
+
+	layerHeaders := make(map[string]map[string]json.RawMessage)
+	layerData := make(map[string][]byte)
+	createLayer := func(r io.Reader, mediaType, name string) (LayerInfo, error) {
+		data, err := io.ReadAll(r)
+		if err != nil {
+			return LayerInfo{}, err
+		}
+		if mediaType == "application/vnd.ollama.image.tensor" {
+			var headerSize uint64
+			if err := binary.Read(bytes.NewReader(data[:8]), binary.LittleEndian, &headerSize); err != nil {
+				return LayerInfo{}, err
+			}
+			var header map[string]json.RawMessage
+			if err := json.Unmarshal(data[8:8+headerSize], &header); err != nil {
+				return LayerInfo{}, err
+			}
+			layerHeaders[name] = header
+			layerData[name] = data
+		}
+		return LayerInfo{Name: name, Digest: "sha256:" + name, MediaType: mediaType}, nil
+	}
+	createTensorLayer := func(r io.Reader, name, dtype string, shape []int32, quantize string) ([]LayerInfo, error) {
+		if _, err := io.ReadAll(r); err != nil {
+			return nil, err
+		}
+		return []LayerInfo{{Name: name, Digest: "sha256:tensor_" + name, MediaType: "application/vnd.ollama.image.tensor"}}, nil
+	}
+	writeManifest := func(modelName string, config LayerInfo, layers []LayerInfo) error { return nil }
+	packedCreator := func(groupName string, tensors []PackedTensorInput) (LayerInfo, error) {
+		return LayerInfo{}, fmt.Errorf("unexpected packedCreator call for %s", groupName)
+	}
+
+	if err := CreateSafetensorsModel("test-model", dir, "", createLayer, createTensorLayer, writeManifest, func(string) {}, packedCreator); err != nil {
+		t.Fatalf("CreateSafetensorsModel failed: %v", err)
+	}
+
+	header := layerHeaders["model.layers.1.mlp.experts"]
+	if header == nil {
+		t.Fatalf("missing packed expert layer header")
+	}
+	for _, key := range []string{
+		"model.layers.1.mlp.switch_mlp.gate_proj.weight",
+		"model.layers.1.mlp.switch_mlp.gate_proj.weight.scale",
+		"model.layers.1.mlp.switch_mlp.gate_proj.weight.global_scale",
+	} {
+		if _, ok := header[key]; !ok {
+			t.Fatalf("stacked header missing %s: %v", key, header)
+		}
+	}
+	if _, ok := header["model.layers.1.mlp.switch_mlp.gate_proj.weight.input_global_scale"]; ok {
+		t.Fatalf("stacked header unexpectedly includes input_global_scale: %v", header)
+	}
+	if _, ok := header["model.layers.1.mlp.experts.0.gate_proj.weight"]; ok {
+		t.Fatalf("unexpected per-expert tensor left in packed header: %v", header)
+	}
+
+	var weightInfo struct {
+		Dtype string  `json:"dtype"`
+		Shape []int32 `json:"shape"`
+	}
+	if err := json.Unmarshal(header["model.layers.1.mlp.switch_mlp.gate_proj.weight"], &weightInfo); err != nil {
+		t.Fatalf("failed to unmarshal stacked weight info: %v", err)
+	}
+	if weightInfo.Dtype != "U32" || !slices.Equal(weightInfo.Shape, []int32{2, 2, 2}) {
+		t.Fatalf("stacked weight = dtype %s shape %v, want U32 [2 2 2]", weightInfo.Dtype, weightInfo.Shape)
+	}
+
+	var globalInfo struct {
+		Dtype string  `json:"dtype"`
+		Shape []int32 `json:"shape"`
+	}
+	if err := json.Unmarshal(header["model.layers.1.mlp.switch_mlp.gate_proj.weight.global_scale"], &globalInfo); err != nil {
+		t.Fatalf("failed to unmarshal 
stacked global scale info: %v", err) + } + if globalInfo.Dtype != "F32" || !slices.Equal(globalInfo.Shape, []int32{2, 1, 1}) { + t.Fatalf("stacked global scale = dtype %s shape %v, want F32 [2 1 1]", globalInfo.Dtype, globalInfo.Shape) + } + globalRaw := readPackedTensorRaw(t, layerData["model.layers.1.mlp.experts"], "model.layers.1.mlp.switch_mlp.gate_proj.weight.global_scale") + if got0 := math.Float32frombits(binary.LittleEndian.Uint32(globalRaw[0:4])); got0 != 0.5 { + t.Fatalf("stacked global scale[0] = %v, want 0.5", got0) + } + if got1 := math.Float32frombits(binary.LittleEndian.Uint32(globalRaw[4:8])); got1 != 0.25 { + t.Fatalf("stacked global scale[1] = %v, want 0.25", got1) + } +} + func TestCreateSafetensorsModel_HFFP8PacksExperts(t *testing.T) { dir := t.TempDir() @@ -777,6 +1347,26 @@ func TestCreateSafetensorsModel_HFFP8PacksExperts(t *testing.T) { t.Fatalf("expected mxfp8 quantize for %s, got %q", tensor.Name, tensor.Quantize) } } + + packedLayerNames = nil + packedLayerTensors = nil + if err := CreateSafetensorsModel("test-model", dir, "nvfp4", createLayer, createTensorLayer, writeManifest, func(string) {}, createPackedLayer); err != nil { + t.Fatalf("CreateSafetensorsModel nvfp4 failed: %v", err) + } + + if len(packedLayerNames) != 1 { + t.Fatalf("expected 1 packed layer for nvfp4, got %d: %v", len(packedLayerNames), packedLayerNames) + } + + for _, tensor := range packedLayerTensors[0] { + want := "nvfp4" + if strings.Contains(tensor.Name, "down_proj") { + want = "mxfp8" + } + if tensor.Quantize != want { + t.Fatalf("nvfp4 packed tensor %s quantize = %q, want %q", tensor.Name, tensor.Quantize, want) + } + } } func TestCreateSafetensorsModel_Qwen35Transforms(t *testing.T) { diff --git a/x/create/dtype.go b/x/create/dtype.go index a7181df79..b5411d016 100644 --- a/x/create/dtype.go +++ b/x/create/dtype.go @@ -19,6 +19,10 @@ func DTypeSize(dtype string) (int, error) { return 4, nil case "F64": return 8, nil + case "U8", "I8": + return 1, nil + case "F8_E4M3", "F8_E5M2", "F8_E4M3FN", "F8_E5M2FNUZ": + return 1, nil default: return 0, fmt.Errorf("unsupported dtype %q", dtype) } diff --git a/x/imagegen/safetensors/safetensors.go b/x/imagegen/safetensors/safetensors.go index df7b52465..6ad3dc161 100644 --- a/x/imagegen/safetensors/safetensors.go +++ b/x/imagegen/safetensors/safetensors.go @@ -64,6 +64,8 @@ func dtypeFromString(s string) mlx.Dtype { return mlx.DtypeInt64 case "U8", "UINT8": return mlx.DtypeUint8 + case "F8_E4M3", "F8_E5M2", "F8_E4M3FN", "F8_E5M2FNUZ": + return mlx.DtypeUint8 // FP8 types stored as raw uint8 bytes default: return mlx.DtypeFloat32 } diff --git a/x/mlxrunner/mlx/io.go b/x/mlxrunner/mlx/io.go index 52560dcd0..e15034fa1 100644 --- a/x/mlxrunner/mlx/io.go +++ b/x/mlxrunner/mlx/io.go @@ -7,6 +7,7 @@ import ( "fmt" "iter" "runtime" + "sort" "unsafe" ) @@ -121,10 +122,17 @@ func SaveSafetensorsWithMetadata(path string, arrays map[string]*Array, metadata cArrays := C.mlx_map_string_to_array_new() defer C.mlx_map_string_to_array_free(cArrays) + arrayNames := make([]string, 0, len(arrays)) for name, arr := range arrays { if arr == nil { continue } + arrayNames = append(arrayNames, name) + } + sort.Strings(arrayNames) + + for _, name := range arrayNames { + arr := arrays[name] cName := C.CString(name) C.mlx_map_string_to_array_insert(cArrays, cName, arr.ctx) C.free(unsafe.Pointer(cName)) @@ -133,7 +141,14 @@ func SaveSafetensorsWithMetadata(path string, arrays map[string]*Array, metadata cMetadata := C.mlx_map_string_to_string_new() defer 
C.mlx_map_string_to_string_free(cMetadata) - for key, value := range metadata { + metadataKeys := make([]string, 0, len(metadata)) + for key := range metadata { + metadataKeys = append(metadataKeys, key) + } + sort.Strings(metadataKeys) + + for _, key := range metadataKeys { + value := metadata[key] cKey := C.CString(key) cValue := C.CString(value) C.mlx_map_string_to_string_insert(cMetadata, cKey, cValue) diff --git a/x/mlxrunner/model/linear.go b/x/mlxrunner/model/linear.go index 788e4e3f0..cfbcfd3a7 100644 --- a/x/mlxrunner/model/linear.go +++ b/x/mlxrunner/model/linear.go @@ -74,14 +74,23 @@ func MakeLinearLayer( scales, ) + // Check for per-tensor global scale (NVIDIA double-scale nvfp4). + // NVIDIA ModelOpt stores this as "weight_scale_2"; our import + // pipeline maps it to "weight.global_scale". + globalScale := tensors[path+".weight.global_scale"] + if globalScale == nil { + globalScale = tensors[path+".weight_scale_2"] + } + return &nn.QuantizedLinear{ - Weight: w, - Scales: scales, - QBiases: qbiases, - Bias: bias, - GroupSize: groupSize, - Bits: bits, - Mode: mode, + Weight: w, + Scales: scales, + QBiases: qbiases, + Bias: bias, + GlobalScale: globalScale, + GroupSize: groupSize, + Bits: bits, + Mode: mode, } } diff --git a/x/models/nn/nn.go b/x/models/nn/nn.go index 56e727617..4410848b7 100644 --- a/x/models/nn/nn.go +++ b/x/models/nn/nn.go @@ -78,13 +78,14 @@ func (l *Linear) OutputDim() int32 { // QuantizedLinear applies an affine transformation using quantized weights. type QuantizedLinear struct { - Weight *mlx.Array // Quantized weight data - Scales *mlx.Array // Scale factors for dequantization - QBiases *mlx.Array // Quantization biases (nil for nvfp4) - Bias *mlx.Array // Layer bias [output_dims] or nil - GroupSize int - Bits int - Mode string + Weight *mlx.Array // Quantized weight data + Scales *mlx.Array // Scale factors for dequantization + QBiases *mlx.Array // Quantization biases (nil for nvfp4) + Bias *mlx.Array // Layer bias [output_dims] or nil + GlobalScale *mlx.Array // Per-tensor global scale for double-scale nvfp4 (nil for standard) + GroupSize int + Bits int + Mode string } func NewQuantizedLinear(weight *mlx.Array, bias *mlx.Array, groupSize, bits int, mode string) *QuantizedLinear { @@ -106,7 +107,18 @@ func NewQuantizedLinear(weight *mlx.Array, bias *mlx.Array, groupSize, bits int, } func (ql *QuantizedLinear) Forward(x *mlx.Array) *mlx.Array { - out := mlx.QuantizedMatmul(x, ql.Weight, ql.Scales, ql.QBiases, true, ql.GroupSize, ql.Bits, ql.Mode) + var out *mlx.Array + if ql.GlobalScale != nil { + // Double-scale nvfp4 (e.g., NVIDIA ModelOpt): standard quantized_matmul + // followed by global_scale multiply. The global_scale is a per-tensor + // F32 scalar (weight_scale_2 in NVIDIA's format). + // TODO: switch to a fused double-scale matmul once MLX has kernel + // coverage for this path. 
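+		// Worked example (illustrative): with nvfp4's group size of 16,
+		// each group of 16 FP4 weights shares one FP8 block scale, and
+		// GlobalScale is a single F32 scalar for the whole tensor. Because
+		// the matmul is linear in W, multiplying the output by GlobalScale
+		// is equivalent to scaling every dequantized weight by it.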
+ out = mlx.QuantizedMatmul(x, ql.Weight, ql.Scales, ql.QBiases, true, ql.GroupSize, ql.Bits, ql.Mode) + out = mlx.Mul(out, ql.GlobalScale) + } else { + out = mlx.QuantizedMatmul(x, ql.Weight, ql.Scales, ql.QBiases, true, ql.GroupSize, ql.Bits, ql.Mode) + } if ql.Bias != nil && ql.Bias.Valid() { out = out.Add(ql.Bias) } diff --git a/x/safetensors/extractor.go b/x/safetensors/extractor.go index f4f7e5d87..28d79226c 100644 --- a/x/safetensors/extractor.go +++ b/x/safetensors/extractor.go @@ -110,6 +110,19 @@ func NewTensorDataFromBytes(name, dtype string, shape []int32, rawData []byte) * } } +// NewTensorDataFromReaderAt creates a TensorData backed by an arbitrary +// io.ReaderAt. This is useful for constructing large synthetic tensors from +// temporary files without loading the full payload into memory. +func NewTensorDataFromReaderAt(name, dtype string, shape []int32, readerAt io.ReaderAt, size int64) *TensorData { + return &TensorData{ + Name: name, + Dtype: dtype, + Shape: shape, + Size: size, + reader: io.NewSectionReader(readerAt, 0, size), + } +} + // ExtractRawFromSafetensors reads a safetensors-wrapped reader and extracts // the raw tensor data bytes (stripping the header). func ExtractRawFromSafetensors(r io.Reader) ([]byte, error) { diff --git a/x/server/show.go b/x/server/show.go index 71007d56b..310c4ba71 100644 --- a/x/server/show.go +++ b/x/server/show.go @@ -306,15 +306,16 @@ func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) { } // GetSafetensorsDtype returns the quantization type for a safetensors model. -// Reads quant_type from the first tensor blob's __metadata__. -// Falls back to torch_dtype from config.json if no quant metadata. +// Reads tensor headers until quantized weights are found. +// Falls back to torch_dtype from config.json if no quant metadata exists. func GetSafetensorsDtype(name model.Name) (string, error) { mf, err := manifest.ParseNamedManifest(name) if err != nil { return "", fmt.Errorf("failed to load manifest: %w", err) } - // Check first tensor blob for quant_type metadata + // Mixed models can start with unquantized embeddings or heads, so scan until + // any tensor blob reports quantized weight metadata. for _, layer := range mf.Layers { if layer.MediaType != manifest.MediaTypeImageTensor { continue @@ -323,15 +324,20 @@ func GetSafetensorsDtype(name model.Name) (string, error) { if err != nil { continue } - info, err := readSafetensorsHeader(blobPath) + f, err := os.Open(blobPath) if err != nil { continue } - if quantType := canonicalQuantType(info.QuantType); quantType != "" { - return quantType, nil + infos, err := parseSafetensorsAllHeaders(f) + _ = f.Close() + if err != nil { + continue + } + for _, info := range infos { + if quantType := canonicalQuantType(info.QuantType); quantType != "" { + return quantType, nil + } } - // Only check the first tensor blob - break } // Not quantized - return torch_dtype from config.json @@ -354,86 +360,6 @@ type safetensorsTensorInfo struct { GroupSize string // from __metadata__.group_size (e.g., "32", "64") } -// readSafetensorsHeader reads the JSON header from a safetensors file to get tensor metadata. -// Safetensors format: 8-byte header size (little endian) + JSON header + tensor data -func readSafetensorsHeader(path string) (*safetensorsTensorInfo, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - defer f.Close() - - return parseSafetensorsHeader(f) -} - -// parseSafetensorsHeader parses a safetensors header from a reader. 
-// This is separated for testability. -// Parses __metadata__ for quant_type and group_size if present. -func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) { - // Read header size (8 bytes, little endian) - var headerSize uint64 - if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil { - return nil, fmt.Errorf("failed to read header size: %w", err) - } - - // Sanity check - header shouldn't be too large - if headerSize > 1024*1024 { - return nil, fmt.Errorf("header size too large: %d", headerSize) - } - - // Read header JSON - headerBytes := make([]byte, headerSize) - if _, err := io.ReadFull(r, headerBytes); err != nil { - return nil, fmt.Errorf("failed to read header: %w", err) - } - - // Parse as map of tensor name -> info - var header map[string]json.RawMessage - if err := json.Unmarshal(headerBytes, &header); err != nil { - return nil, fmt.Errorf("failed to parse header: %w", err) - } - - // Parse metadata if present - var quantType, groupSize string - if metaRaw, ok := header["__metadata__"]; ok { - var meta map[string]string - if json.Unmarshal(metaRaw, &meta) == nil { - quantType = meta["quant_type"] - groupSize = meta["group_size"] - } - } - - // Find the main tensor entry (not __metadata__, .scale, or .bias) - for name, raw := range header { - if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") { - continue - } - var info safetensorsTensorInfo - if err := json.Unmarshal(raw, &info); err != nil { - return nil, fmt.Errorf("failed to parse tensor info: %w", err) - } - info.QuantType = quantType - info.GroupSize = groupSize - return &info, nil - } - - // Fall back to first non-metadata tensor entry - for name, raw := range header { - if name == "__metadata__" { - continue - } - var info safetensorsTensorInfo - if err := json.Unmarshal(raw, &info); err != nil { - return nil, fmt.Errorf("failed to parse tensor info: %w", err) - } - info.QuantType = quantType - info.GroupSize = groupSize - return &info, nil - } - - return nil, fmt.Errorf("no tensor found in header") -} - // parseSafetensorsAllHeaders parses all tensor entries from a safetensors header. // Returns one safetensorsTensorInfo per main tensor (skipping __metadata__, .scale, .bias). // For packed blobs this returns multiple entries; for single-tensor blobs, one entry. 
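Aside (not part of the patch): a minimal sketch of the header framing the new scanning path consumes per blob — an 8-byte little-endian header size, then a JSON map of tensor name to {dtype, shape, data_offsets}, with quant_type/group_size carried in an optional __metadata__ string map. The readQuantType helper below is hypothetical and named only for illustration; the real code path is parseSafetensorsAllHeaders plus canonicalQuantType, and the size cap shown is a placeholder for whatever limit that parser enforces.

package main

import (
	"encoding/binary"
	"encoding/json"
	"fmt"
	"io"
	"os"
)

// readQuantType reads one safetensors header: an 8-byte little-endian
// header size, then headerSize bytes of JSON. It returns
// __metadata__.quant_type, or "" for an unquantized blob so the caller
// can move on to the next tensor layer.
func readQuantType(r io.Reader) (string, error) {
	var headerSize uint64
	if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
		return "", fmt.Errorf("failed to read header size: %w", err)
	}
	if headerSize > 100*1024*1024 { // illustrative sanity cap; the real parser has its own limit
		return "", fmt.Errorf("header size too large: %d", headerSize)
	}
	headerBytes := make([]byte, headerSize)
	if _, err := io.ReadFull(r, headerBytes); err != nil {
		return "", fmt.Errorf("failed to read header: %w", err)
	}
	var header map[string]json.RawMessage
	if err := json.Unmarshal(headerBytes, &header); err != nil {
		return "", fmt.Errorf("failed to parse header: %w", err)
	}
	metaRaw, ok := header["__metadata__"]
	if !ok {
		return "", nil // no metadata at all: treat as unquantized
	}
	var meta map[string]string
	if err := json.Unmarshal(metaRaw, &meta); err != nil {
		return "", nil // metadata present but not a string map: no quant info
	}
	return meta["quant_type"], nil
}

func main() {
	f, err := os.Open(os.Args[1])
	if err != nil {
		panic(err)
	}
	defer f.Close()
	qt, err := readQuantType(f)
	if err != nil {
		panic(err)
	}
	fmt.Printf("quant_type=%q\n", qt)
}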
diff --git a/x/server/show_test.go b/x/server/show_test.go index 01d2a36fd..6f2812a49 100644 --- a/x/server/show_test.go +++ b/x/server/show_test.go @@ -9,6 +9,7 @@ import ( "testing" "github.com/ollama/ollama/manifest" + "github.com/ollama/ollama/types/model" ) func TestBuildModelInfo(t *testing.T) { @@ -286,168 +287,7 @@ func TestBuildModelInfo_BytesPerParam(t *testing.T) { } } -func TestParseSafetensorsHeader(t *testing.T) { - tests := []struct { - name string - header map[string]any - wantDtype string - wantShape []int64 - wantQuantType string - wantGroupSize string - wantErr bool - }{ - { - name: "simple tensor", - header: map[string]any{ - "weight": map[string]any{ - "dtype": "BF16", - "shape": []int64{2560, 262144}, - "data_offsets": []int64{0, 1342177280}, - }, - }, - wantDtype: "BF16", - wantShape: []int64{2560, 262144}, - }, - { - name: "tensor keyed by name", - header: map[string]any{ - "model.layers.0.weight": map[string]any{ - "dtype": "BF16", - "shape": []int64{2560, 2560}, - "data_offsets": []int64{0, 13107200}, - }, - }, - wantDtype: "BF16", - wantShape: []int64{2560, 2560}, - }, - { - name: "with int4 quant metadata", - header: map[string]any{ - "__metadata__": map[string]any{ - "quant_type": "int4", - "group_size": "32", - }, - "model.layers.0.mlp.up_proj.weight": map[string]any{ - "dtype": "U32", - "shape": []int64{2560, 320}, - "data_offsets": []int64{0, 3276800}, - }, - "model.layers.0.mlp.up_proj.weight.scale": map[string]any{ - "dtype": "BF16", - "shape": []int64{2560, 80}, - "data_offsets": []int64{3276800, 3686400}, - }, - "model.layers.0.mlp.up_proj.weight.bias": map[string]any{ - "dtype": "BF16", - "shape": []int64{2560, 80}, - "data_offsets": []int64{3686400, 4096000}, - }, - }, - wantDtype: "U32", - wantShape: []int64{2560, 320}, - wantQuantType: "int4", - wantGroupSize: "32", - }, - { - name: "int8 quant metadata", - header: map[string]any{ - "__metadata__": map[string]any{ - "quant_type": "int8", - "group_size": "64", - }, - "model.layers.0.mlp.down_proj.weight": map[string]any{ - "dtype": "U32", - "shape": []int64{2560, 640}, - "data_offsets": []int64{0, 6553600}, - }, - "model.layers.0.mlp.down_proj.weight.scale": map[string]any{ - "dtype": "BF16", - "shape": []int64{2560, 40}, - "data_offsets": []int64{6553600, 6963200}, - }, - }, - wantDtype: "U32", - wantShape: []int64{2560, 640}, - wantQuantType: "int8", - wantGroupSize: "64", - }, - { - name: "with old-style format metadata", - header: map[string]any{ - "__metadata__": map[string]any{ - "format": "pt", - }, - "bias": map[string]any{ - "dtype": "F32", - "shape": []int64{1024}, - "data_offsets": []int64{0, 4096}, - }, - }, - wantDtype: "F32", - wantShape: []int64{1024}, - }, - { - name: "float16 tensor", - header: map[string]any{ - "layer.weight": map[string]any{ - "dtype": "F16", - "shape": []int64{512, 512, 3, 3}, - "data_offsets": []int64{0, 4718592}, - }, - }, - wantDtype: "F16", - wantShape: []int64{512, 512, 3, 3}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Create safetensors format: 8-byte size + JSON header - headerJSON, err := json.Marshal(tt.header) - if err != nil { - t.Fatalf("failed to marshal header: %v", err) - } - - var buf bytes.Buffer - if err := binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))); err != nil { - t.Fatalf("failed to write header size: %v", err) - } - buf.Write(headerJSON) - - info, err := parseSafetensorsHeader(&buf) - if (err != nil) != tt.wantErr { - t.Errorf("parseSafetensorsHeader() error = %v, wantErr %v", err, 
tt.wantErr) - return - } - if tt.wantErr { - return - } - - if info.Dtype != tt.wantDtype { - t.Errorf("Dtype = %v, want %v", info.Dtype, tt.wantDtype) - } - - if len(info.Shape) != len(tt.wantShape) { - t.Errorf("Shape length = %v, want %v", len(info.Shape), len(tt.wantShape)) - } else { - for i, s := range info.Shape { - if s != tt.wantShape[i] { - t.Errorf("Shape[%d] = %v, want %v", i, s, tt.wantShape[i]) - } - } - } - - if info.QuantType != tt.wantQuantType { - t.Errorf("QuantType = %v, want %v", info.QuantType, tt.wantQuantType) - } - if info.GroupSize != tt.wantGroupSize { - t.Errorf("GroupSize = %v, want %v", info.GroupSize, tt.wantGroupSize) - } - }) - } -} - -func TestParseSafetensorsHeader_Errors(t *testing.T) { +func TestParseSafetensorsAllHeaders_Errors(t *testing.T) { tests := []struct { name string data []byte @@ -467,7 +307,7 @@ func TestParseSafetensorsHeader_Errors(t *testing.T) { name: "header size too large", data: func() []byte { var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, uint64(2*1024*1024)) // 2MB + binary.Write(&buf, binary.LittleEndian, uint64(200*1024*1024)) // 200 MiB return buf.Bytes() }(), wantErr: "header size too large", @@ -510,7 +350,7 @@ func TestParseSafetensorsHeader_Errors(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - _, err := parseSafetensorsHeader(bytes.NewReader(tt.data)) + _, err := parseSafetensorsAllHeaders(bytes.NewReader(tt.data)) if err == nil { t.Error("expected error, got nil") return @@ -1209,44 +1049,77 @@ func TestGetTensorInfoFromManifest_Packed(t *testing.T) { } } -func TestReadSafetensorsHeader(t *testing.T) { - // Create a temp file with a valid safetensors header - tempDir := t.TempDir() +func TestGetSafetensorsDtypeScansPastUnquantizedFirstBlob(t *testing.T) { + t.Setenv("OLLAMA_MODELS", t.TempDir()) - header := map[string]any{ - "test_tensor": map[string]any{ - "dtype": "BF16", - "shape": []int64{1024, 768}, - "data_offsets": []int64{0, 1572864}, - }, - } - headerJSON, _ := json.Marshal(header) + writeSafetensorsLayer := func(t *testing.T, header map[string]any, name string) manifest.Layer { + t.Helper() - var buf bytes.Buffer - binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))) - buf.Write(headerJSON) + headerJSON, err := json.Marshal(header) + if err != nil { + t.Fatalf("failed to marshal header: %v", err) + } - filePath := filepath.Join(tempDir, "test.safetensors") - if err := os.WriteFile(filePath, buf.Bytes(), 0o644); err != nil { - t.Fatalf("failed to write test file: %v", err) + var buf bytes.Buffer + if err := binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))); err != nil { + t.Fatalf("failed to write header size: %v", err) + } + buf.Write(headerJSON) + + layer, err := manifest.NewLayer(&buf, manifest.MediaTypeImageTensor) + if err != nil { + t.Fatalf("failed to create tensor layer: %v", err) + } + layer.Name = name + return layer } - info, err := readSafetensorsHeader(filePath) + configData, err := json.Marshal(map[string]any{ + "model_format": "safetensors", + }) if err != nil { - t.Fatalf("readSafetensorsHeader() error = %v", err) + t.Fatalf("failed to marshal config: %v", err) + } + configLayer, err := manifest.NewLayer(bytes.NewReader(configData), "application/vnd.docker.container.image.v1+json") + if err != nil { + t.Fatalf("failed to create config layer: %v", err) } - if info.Dtype != "BF16" { - t.Errorf("Dtype = %v, want BF16", info.Dtype) - } - if len(info.Shape) != 2 || info.Shape[0] != 1024 || info.Shape[1] != 768 { - 
t.Errorf("Shape = %v, want [1024, 768]", info.Shape) - } -} + unquantized := writeSafetensorsLayer(t, map[string]any{ + "model.embed_tokens.weight": map[string]any{ + "dtype": "BF16", + "shape": []int64{16, 8}, + "data_offsets": []int64{0, 256}, + }, + }, "model.embed_tokens.weight") -func TestReadSafetensorsHeader_FileNotFound(t *testing.T) { - _, err := readSafetensorsHeader("/nonexistent/path/file.safetensors") - if err == nil { - t.Error("expected error for nonexistent file") + quantized := writeSafetensorsLayer(t, map[string]any{ + "__metadata__": map[string]string{ + "quant_type": "mxfp8", + "group_size": "32", + }, + "model.layers.0.mlp.down_proj.weight": map[string]any{ + "dtype": "U32", + "shape": []int64{16, 4}, + "data_offsets": []int64{0, 256}, + }, + "model.layers.0.mlp.down_proj.weight.scale": map[string]any{ + "dtype": "BF16", + "shape": []int64{16, 1}, + "data_offsets": []int64{256, 288}, + }, + }, "model.layers.0.mlp.down_proj.weight") + + name := model.ParseName("mixed-fp8-safetensors") + if err := manifest.WriteManifest(name, configLayer, []manifest.Layer{unquantized, quantized}); err != nil { + t.Fatalf("failed to write manifest: %v", err) + } + + got, err := GetSafetensorsDtype(name) + if err != nil { + t.Fatalf("GetSafetensorsDtype() error = %v", err) + } + if got != "mxfp8" { + t.Fatalf("GetSafetensorsDtype() = %q, want mxfp8", got) } }