From 004b84880505b9d1cd2c4a3cbe4e8a4a12ccb1bd Mon Sep 17 00:00:00 2001 From: Takuo Kuroki Date: Sat, 25 Apr 2026 23:32:03 +0900 Subject: [PATCH 1/4] Preserve original vectors for Faiss + cosine derived source Faiss does not natively support cosine similarity. Previously the plugin L2-normalized vectors in-place in KNNVectorFieldMapper#parseCreateField before they were written to BinaryDocValues. As a result, derived source returned normalized vectors instead of the user-indexed ones (issue #3083). Defer the normalization to the native index build path so that doc values keep the original vectors. A new NormalizingKNNFloatVectorValues wraps the KNNFloatVectorValues fed to the Faiss builder and returns L2-normalized copies on demand. Derived source reconstruction reads BinaryDocValues directly and therefore returns the original vectors without any additional denormalization step. Changes: - KNNVectorFieldMapper: drop in-place L2 normalization in parseCreateField - KNN80DocValuesConsumer: wrap KNNVectorValues with NormalizingKNNFloatVectorValues when the field is Faiss + cosine + float - NormalizingKNNFloatVectorValues: new decorator that returns normalized copies - DerivedSourceIT: add integration tests covering original vector restoration, zero vector handling, and multi-segment merge behaviour Related: #3083 Signed-off-by: Takuo Kuroki --- CHANGELOG.md | 1 + .../KNN80Codec/KNN80DocValuesConsumer.java | 47 ++- .../index/mapper/KNNVectorFieldMapper.java | 4 +- .../NormalizingKNNFloatVectorValues.java | 47 +++ .../opensearch/knn/integ/DerivedSourceIT.java | 309 ++++++++++++++++++ 5 files changed, 404 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/opensearch/knn/index/vectorvalues/NormalizingKNNFloatVectorValues.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fa80d5320..91ad435317 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Bug Fixes +* Preserve original (unnormalized) vectors in doc values for Faiss + cosine similarity so that derived source returns the user-indexed vector [#3083](https://github.com/opensearch-project/k-NN/issues/3083) ### Refactoring diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java index 6b8e52eb74..5357e2790d 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java @@ -6,6 +6,7 @@ package org.opensearch.knn.index.codec.KNN80Codec; import lombok.extern.log4j.Log4j2; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.codecs.DocValuesConsumer; @@ -15,12 +16,18 @@ import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.opensearch.common.StopWatch; +import org.opensearch.knn.common.KNNConstants; +import org.opensearch.knn.index.SpaceType; import org.opensearch.knn.index.VectorDataType; import org.opensearch.knn.index.codec.nativeindex.NativeIndexWriter; import org.opensearch.knn.index.engine.KNNEngine; import org.opensearch.knn.index.mapper.KNNVectorFieldMapper; +import org.opensearch.knn.index.vectorvalues.KNNFloatVectorValues; import org.opensearch.knn.index.vectorvalues.KNNVectorValues; import org.opensearch.knn.index.vectorvalues.KNNVectorValuesFactory; +import org.opensearch.knn.index.vectorvalues.NormalizingKNNFloatVectorValues; +import org.opensearch.knn.indices.ModelMetadata; +import org.opensearch.knn.indices.ModelUtil; import org.opensearch.knn.plugin.stats.KNNGraphValue; import java.io.IOException; @@ -67,15 +74,49 @@ private boolean isKNNBinaryFieldRequired(FieldInfo field) { public void addKNNBinaryField(FieldInfo field, DocValuesProducer valuesProducer, boolean isMerge) throws IOException { final VectorDataType vectorDataType = extractVectorDataType(field); - final KNNVectorValues knnVectorValues = KNNVectorValuesFactory.getVectorValues(vectorDataType, valuesProducer.getBinary(field)); + KNNVectorValues knnVectorValues = KNNVectorValuesFactory.getVectorValues(vectorDataType, valuesProducer.getBinary(field)); + + // Faiss does not natively support cosine similarity. When the field is configured for + // Faiss + cosine, we store the original (unnormalized) vectors in doc values and only + // normalize when feeding vectors to the native index build path. This keeps doc values + // consistent with what the user indexed and enables derived source to return the original + // vector. See issue #3083. + if (needsNormalizationForFaissBuild(field, vectorDataType)) { + knnVectorValues = new NormalizingKNNFloatVectorValues((KNNFloatVectorValues) knnVectorValues); + } + final KNNVectorValues finalVectorValues = knnVectorValues; // For BDV it is fine to use knnVectorValues.totalLiveDocs() as we already run the full loop to calculate total // live docs if (isMerge) { - NativeIndexWriter.getWriter(field, state).mergeIndex(() -> knnVectorValues, (int) knnVectorValues.totalLiveDocs()); + NativeIndexWriter.getWriter(field, state).mergeIndex(() -> finalVectorValues, (int) finalVectorValues.totalLiveDocs()); } else { - NativeIndexWriter.getWriter(field, state).flushIndex(() -> knnVectorValues, (int) knnVectorValues.totalLiveDocs()); + NativeIndexWriter.getWriter(field, state).flushIndex(() -> finalVectorValues, (int) finalVectorValues.totalLiveDocs()); + } + } + + /** + * Returns true when the vectors fed to the native (Faiss) build must be L2-normalized because + * the user-facing space type is cosine similarity. Model-based fields resolve the space type + * through {@link ModelUtil#getModelMetadata(String)} which reads from cluster state (no network I/O). + */ + private boolean needsNormalizationForFaissBuild(FieldInfo field, VectorDataType vectorDataType) { + if (vectorDataType != VectorDataType.FLOAT) { + return false; + } + if (extractKNNEngine(field) != KNNEngine.FAISS) { + return false; + } + final String spaceTypeStr = field.getAttribute(KNNConstants.SPACE_TYPE); + if (StringUtils.isNotEmpty(spaceTypeStr)) { + return SpaceType.COSINESIMIL.getValue().equals(spaceTypeStr); + } + final String modelId = field.getAttribute(KNNConstants.MODEL_ID); + if (StringUtils.isNotEmpty(modelId)) { + final ModelMetadata modelMetadata = ModelUtil.getModelMetadata(modelId); + return modelMetadata != null && modelMetadata.getSpaceType() == SpaceType.COSINESIMIL; } + return false; } /** diff --git a/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java index 6d1345dc9d..dbb92b8773 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java @@ -789,7 +789,9 @@ protected void parseCreateField(ParseContext context, int dimension, VectorDataT } final float[] array = floatsArrayOptional.get(); getVectorValidator().validateVector(array); - getVectorTransformer().transform(array, true); + // Intentionally NOT normalizing the vector here even for Faiss+cosine. Normalization for Faiss + // cosine is deferred to the native index build path so that BinaryDocValues keeps the original + // (unnormalized) vectors. See issue #3083 for context. context.doc().addAll(getFieldsForFloatVector(array, isDerivedEnabled(context))); } else { throw new IllegalArgumentException( diff --git a/src/main/java/org/opensearch/knn/index/vectorvalues/NormalizingKNNFloatVectorValues.java b/src/main/java/org/opensearch/knn/index/vectorvalues/NormalizingKNNFloatVectorValues.java new file mode 100644 index 0000000000..3d320f46c6 --- /dev/null +++ b/src/main/java/org/opensearch/knn/index/vectorvalues/NormalizingKNNFloatVectorValues.java @@ -0,0 +1,47 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.vectorvalues; + +import org.apache.lucene.util.VectorUtil; + +import java.io.IOException; +import java.util.Arrays; + +/** + * Wraps a {@link KNNFloatVectorValues} and returns L2-normalized copies of the vectors. + * + *

This is used by the Faiss native index build path when the space type is cosine similarity. + * Faiss does not natively support cosine similarity; instead we convert cosine to inner product + * and feed unit vectors to the index. Storing the original (unnormalized) vectors in doc values + * while only normalizing at build time preserves the original data for downstream consumers such + * as derived source reconstruction. + */ +public class NormalizingKNNFloatVectorValues extends KNNFloatVectorValues { + + private final KNNFloatVectorValues delegate; + + public NormalizingKNNFloatVectorValues(final KNNFloatVectorValues delegate) { + super(delegate.getVectorValuesIterator()); + this.delegate = delegate; + } + + @Override + public float[] getVector() throws IOException { + final float[] original = delegate.getVector(); + // Keep local caches consistent with the delegate (dimension/bytesPerVector are populated on first getVector()). + this.dimension = delegate.dimension; + this.bytesPerVector = delegate.bytesPerVector; + final float[] copy = Arrays.copyOf(original, original.length); + VectorUtil.l2normalize(copy); + return copy; + } + + @Override + public float[] conditionalCloneVector() throws IOException { + // getVector() already returns a fresh copy, so no further clone is needed. + return getVector(); + } +} diff --git a/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java b/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java index 49cda134d4..d3e5c33fb0 100644 --- a/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java +++ b/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java @@ -814,5 +814,314 @@ private void assertMappingLevelSourceFiltering(String indexName, String[] expect ); } } + /** + * Test that Faiss + cosinesimil + derived source returns original (non-normalized) vectors in _source. + * When cosine similarity is used with Faiss engine, vectors are L2-normalized at index time. + * With derived source enabled, the _source should still return the original vectors by using + * the stored norm to denormalize. + */ + @SneakyThrows + public void testCosineSimDerivedSourceReturnsOriginalVector() { + String indexEnabled = "cosine-derived-enabled"; + String indexDisabled = "cosine-derived-disabled"; + String fieldName = "test_vector"; + int dimension = 8; + int docCount = 10; + + String mapping = XContentFactory.jsonBuilder() + .startObject() + .startObject("properties") + .startObject(fieldName) + .field("type", "knn_vector") + .field("dimension", dimension) + .startObject("method") + .field("engine", "faiss") + .field("space_type", "cosinesimil") + .field("name", "hnsw") + .endObject() + .endObject() + .endObject() + .endObject() + .toString(); + + // Create index with derived source enabled + createKnnIndex( + indexEnabled, + Settings.builder().put("index.knn", true).put("index.knn.derived_source.enabled", true).build(), + mapping + ); + // Create index with derived source disabled (baseline) + createKnnIndex( + indexDisabled, + Settings.builder().put("index.knn", true).put("index.knn.derived_source.enabled", false).build(), + mapping + ); + + // Index documents with non-unit vectors + Random random = new Random(42); + float[][] originalVectors = new float[docCount][dimension]; + for (int i = 0; i < docCount; i++) { + for (int j = 0; j < dimension; j++) { + originalVectors[i][j] = random.nextFloat() * 10 - 5; // range [-5, 5] + } + Float[] vector = new Float[dimension]; + for (int j = 0; j < dimension; j++) { + vector[j] = originalVectors[i][j]; + } + addKnnDoc(indexEnabled, String.valueOf(i), fieldName, vector); + addKnnDoc(indexDisabled, String.valueOf(i), fieldName, vector); + } + + refreshAllIndices(); + + // Verify _source returns original vectors (not normalized) for both indices + for (int i = 0; i < docCount; i++) { + Map sourceEnabled = getKnnDoc(indexEnabled, String.valueOf(i)); + Map sourceDisabled = getKnnDoc(indexDisabled, String.valueOf(i)); + + List sourceVectorEnabled = (List) sourceEnabled.get(fieldName); + List sourceVectorDisabled = (List) sourceDisabled.get(fieldName); + + assertNotNull("Derived source enabled: vector should not be null for doc " + i, sourceVectorEnabled); + assertNotNull("Derived source disabled: vector should not be null for doc " + i, sourceVectorDisabled); + assertEquals(dimension, sourceVectorEnabled.size()); + + // Compare with original vector - allow small floating point tolerance due to norm * denorm roundtrip + for (int j = 0; j < dimension; j++) { + assertEquals( + "Doc " + i + " dim " + j + ": derived source vector should match original", + originalVectors[i][j], + sourceVectorEnabled.get(j).floatValue(), + 1e-4f + ); + } + } + + // Verify after force merge (reads from segment doc values, not translog) + forceMergeKnnIndex(indexEnabled, 1); + refreshAllIndices(); + + for (int i = 0; i < docCount; i++) { + Map sourceEnabled = getKnnDoc(indexEnabled, String.valueOf(i)); + List sourceVector = (List) sourceEnabled.get(fieldName); + + for (int j = 0; j < dimension; j++) { + assertEquals( + "After merge - Doc " + i + " dim " + j + ": derived source vector should match original", + originalVectors[i][j], + sourceVector.get(j).floatValue(), + 1e-4f + ); + } + } + + deleteKNNIndex(indexEnabled); + deleteKNNIndex(indexDisabled); + } + + /** + * Test that Faiss + cosinesimil + derived source correctly rejects zero vectors at index time. + * A zero vector has L2 norm of 0, which makes cosine similarity undefined. + * Both derived-source-enabled and disabled indices should reject zero vectors consistently. + * Also verifies that normal vectors indexed alongside the rejection attempt are unaffected. + */ + @SneakyThrows + public void testCosineSimDerivedSourceWithZeroVector() { + String indexEnabled = "cosine-derived-zero-enabled"; + String indexDisabled = "cosine-derived-zero-disabled"; + String fieldName = "test_vector"; + int dimension = 8; + + String mapping = XContentFactory.jsonBuilder() + .startObject() + .startObject("properties") + .startObject(fieldName) + .field("type", "knn_vector") + .field("dimension", dimension) + .startObject("method") + .field("engine", "faiss") + .field("space_type", "cosinesimil") + .field("name", "hnsw") + .endObject() + .endObject() + .endObject() + .endObject() + .toString(); + + createKnnIndex( + indexEnabled, + Settings.builder().put("index.knn", true).put("index.knn.derived_source.enabled", true).build(), + mapping + ); + createKnnIndex( + indexDisabled, + Settings.builder().put("index.knn", true).put("index.knn.derived_source.enabled", false).build(), + mapping + ); + + // Zero vector should be rejected for cosinesimil + Float[] zeroVector = new Float[dimension]; + for (int j = 0; j < dimension; j++) { + zeroVector[j] = 0.0f; + } + + ResponseException enabledEx = expectThrows( + ResponseException.class, + () -> addKnnDoc(indexEnabled, "0", fieldName, zeroVector) + ); + assertTrue( + "Derived enabled: should reject zero vector for cosinesimil", + enabledEx.getMessage().contains("zero vector is not supported") + ); + + ResponseException disabledEx = expectThrows( + ResponseException.class, + () -> addKnnDoc(indexDisabled, "0", fieldName, zeroVector) + ); + assertTrue( + "Derived disabled: should reject zero vector for cosinesimil", + disabledEx.getMessage().contains("zero vector is not supported") + ); + + // Normal vectors should still work after the rejection + Float[] normalVector = new Float[] { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f }; + addKnnDoc(indexEnabled, "1", fieldName, normalVector); + addKnnDoc(indexDisabled, "1", fieldName, normalVector); + + refreshAllIndices(); + + Map sourceEnabled = getKnnDoc(indexEnabled, "1"); + List vectorEnabled = (List) sourceEnabled.get(fieldName); + assertNotNull("Normal vector should be retrievable after zero vector rejection", vectorEnabled); + assertEquals(dimension, vectorEnabled.size()); + for (int j = 0; j < dimension; j++) { + assertEquals( + "Normal vector dim " + j + " should match original", + normalVector[j], + vectorEnabled.get(j).floatValue(), + 1e-4f + ); + } + + // Verify after force merge + forceMergeKnnIndex(indexEnabled, 1); + refreshAllIndices(); + + Map mergedSource = getKnnDoc(indexEnabled, "1"); + List mergedVector = (List) mergedSource.get(fieldName); + for (int j = 0; j < dimension; j++) { + assertEquals( + "After merge - normal vector dim " + j + " should match original", + normalVector[j], + mergedVector.get(j).floatValue(), + 1e-4f + ); + } + + deleteKNNIndex(indexEnabled); + deleteKNNIndex(indexDisabled); + } + + /** + * Test that merging segments with different norm doc values correctly reconstructs _source. + * Scenario: two separate flushes create two segments each with their own norm doc values, + * then a force merge combines them into one segment. After merge, _source reconstruction + * must still return the original (non-normalized) vectors for documents from both segments. + */ + @SneakyThrows + public void testCosineSimDerivedSourceMultiSegmentMerge() { + String indexName = "cosine-derived-multi-segment"; + String fieldName = "test_vector"; + int dimension = 8; + + String mapping = XContentFactory.jsonBuilder() + .startObject() + .startObject("properties") + .startObject(fieldName) + .field("type", "knn_vector") + .field("dimension", dimension) + .startObject("method") + .field("engine", "faiss") + .field("space_type", "cosinesimil") + .field("name", "hnsw") + .endObject() + .endObject() + .endObject() + .endObject() + .toString(); + + createKnnIndex( + indexName, + Settings.builder().put("index.knn", true).put("index.knn.derived_source.enabled", true).build(), + mapping + ); + + Random random = new Random(42); + int docsPerSegment = 5; + float[][] allVectors = new float[docsPerSegment * 2][dimension]; + + // Segment 1: index first batch and flush + for (int i = 0; i < docsPerSegment; i++) { + for (int j = 0; j < dimension; j++) { + allVectors[i][j] = random.nextFloat() * 10 - 5; + } + Float[] vector = new Float[dimension]; + for (int j = 0; j < dimension; j++) { + vector[j] = allVectors[i][j]; + } + addKnnDoc(indexName, String.valueOf(i), fieldName, vector); + } + flushIndex(indexName); + + // Segment 2: index second batch and flush + for (int i = docsPerSegment; i < docsPerSegment * 2; i++) { + for (int j = 0; j < dimension; j++) { + allVectors[i][j] = random.nextFloat() * 10 - 5; + } + Float[] vector = new Float[dimension]; + for (int j = 0; j < dimension; j++) { + vector[j] = allVectors[i][j]; + } + addKnnDoc(indexName, String.valueOf(i), fieldName, vector); + } + flushIndex(indexName); + + // Verify _source before merge (each segment has its own norm doc values) + refreshAllIndices(); + for (int i = 0; i < docsPerSegment * 2; i++) { + Map source = getKnnDoc(indexName, String.valueOf(i)); + List sourceVector = (List) source.get(fieldName); + assertNotNull("Before merge: vector should not be null for doc " + i, sourceVector); + for (int j = 0; j < dimension; j++) { + assertEquals( + "Before merge - Doc " + i + " dim " + j, + allVectors[i][j], + sourceVector.get(j).floatValue(), + 1e-4f + ); + } + } + + // Force merge into a single segment + forceMergeKnnIndex(indexName, 1); + refreshAllIndices(); + + // Verify _source after merge (norm doc values from both segments must be preserved) + for (int i = 0; i < docsPerSegment * 2; i++) { + Map source = getKnnDoc(indexName, String.valueOf(i)); + List sourceVector = (List) source.get(fieldName); + assertNotNull("After merge: vector should not be null for doc " + i, sourceVector); + for (int j = 0; j < dimension; j++) { + assertEquals( + "After merge - Doc " + i + " dim " + j, + allVectors[i][j], + sourceVector.get(j).floatValue(), + 1e-4f + ); + } + } + + deleteKNNIndex(indexName); + } } From 640f548d8cd0daa5842f921a457f368cd5bb5414 Mon Sep 17 00:00:00 2001 From: Takuo Kuroki Date: Sun, 26 Apr 2026 23:26:57 +0900 Subject: [PATCH 2/4] Unify vector normalization logic across query and build paths Consolidates the two previously separate checks that decided whether a vector field requires L2 normalization into a single entry point on VectorTransformerFactory. Both the query-time path (KNNVectorFieldType. transformQueryVector) and the native-index build path (KNN80DocValuesConsumer. addKNNBinaryField) now go through the same factory/transformer pair. Changes: - VectorTransformer: add wrap(KNNFloatVectorValues) and wrap(KNNByteVectorValues) default no-op methods so callers can uniformly apply transformations to a stream of vector values. - NormalizeVectorTransformer: override wrap(KNNFloatVectorValues) to return a NormalizingKNNFloatVectorValues. - VectorTransformerFactory: add getVectorTransformer(FieldInfo) overload that resolves SpaceType and MethodComponentContext directly from field attributes (SPACE_TYPE / MODEL_ID / PARAMETERS) and falls back to ModelUtil cluster state lookup for model-based fields. MethodComponentContext.parse is reused by narrowing the JSON map to name/parameters first, which allows the Lucene flat and Lucene SQ 1-bit conditions to be evaluated the same way as at query time. - KNN80DocValuesConsumer: replace the field-local needsNormalizationForFaissBuild helper and ModelUtil plumbing with a single call to VectorTransformerFactory.getVectorTransformer(field).wrap(...). Unused imports (StringUtils, SpaceType, KNNConstants, ModelMetadata, ModelUtil, NormalizingKNNFloatVectorValues) are removed. Tests: - NormalizeVectorTransformerTests: cover wrap(KNNFloatVectorValues). - VectorTransformerFactoryTests: cover getVectorTransformer(FieldInfo) for Faiss cosine, Faiss L2, missing attributes, Lucene cosine + flat, Lucene cosine + SQ 1-bit, and Lucene cosine + HNSW without encoder. Related: #3083 Signed-off-by: Takuo Kuroki --- .../KNN80Codec/KNN80DocValuesConsumer.java | 49 +++------ .../mapper/NormalizeVectorTransformer.java | 11 +++ .../knn/index/mapper/VectorTransformer.java | 28 ++++++ .../mapper/VectorTransformerFactory.java | 99 +++++++++++++++++++ .../NormalizeVectorTransformerTests.java | 19 ++++ .../mapper/VectorTransformerFactoryTests.java | 80 +++++++++++++++ 6 files changed, 249 insertions(+), 37 deletions(-) diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java index 5357e2790d..7486fba413 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java @@ -6,7 +6,6 @@ package org.opensearch.knn.index.codec.KNN80Codec; import lombok.extern.log4j.Log4j2; -import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.codecs.DocValuesConsumer; @@ -16,18 +15,16 @@ import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.opensearch.common.StopWatch; -import org.opensearch.knn.common.KNNConstants; -import org.opensearch.knn.index.SpaceType; import org.opensearch.knn.index.VectorDataType; import org.opensearch.knn.index.codec.nativeindex.NativeIndexWriter; import org.opensearch.knn.index.engine.KNNEngine; import org.opensearch.knn.index.mapper.KNNVectorFieldMapper; +import org.opensearch.knn.index.mapper.VectorTransformer; +import org.opensearch.knn.index.mapper.VectorTransformerFactory; +import org.opensearch.knn.index.vectorvalues.KNNByteVectorValues; import org.opensearch.knn.index.vectorvalues.KNNFloatVectorValues; import org.opensearch.knn.index.vectorvalues.KNNVectorValues; import org.opensearch.knn.index.vectorvalues.KNNVectorValuesFactory; -import org.opensearch.knn.index.vectorvalues.NormalizingKNNFloatVectorValues; -import org.opensearch.knn.indices.ModelMetadata; -import org.opensearch.knn.indices.ModelUtil; import org.opensearch.knn.plugin.stats.KNNGraphValue; import java.io.IOException; @@ -76,13 +73,15 @@ public void addKNNBinaryField(FieldInfo field, DocValuesProducer valuesProducer, final VectorDataType vectorDataType = extractVectorDataType(field); KNNVectorValues knnVectorValues = KNNVectorValuesFactory.getVectorValues(vectorDataType, valuesProducer.getBinary(field)); - // Faiss does not natively support cosine similarity. When the field is configured for - // Faiss + cosine, we store the original (unnormalized) vectors in doc values and only - // normalize when feeding vectors to the native index build path. This keeps doc values - // consistent with what the user indexed and enables derived source to return the original - // vector. See issue #3083. - if (needsNormalizationForFaissBuild(field, vectorDataType)) { - knnVectorValues = new NormalizingKNNFloatVectorValues((KNNFloatVectorValues) knnVectorValues); + // Apply the field-configured VectorTransformer to the stream of vectors fed into the native + // builder. For Faiss + cosine this yields a normalized stream while BinaryDocValues keep the + // original vectors untouched. For all other combinations the factory returns a no-op and + // knnVectorValues is passed through unchanged. See issue #3083. + final VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(field); + if (knnVectorValues instanceof KNNFloatVectorValues floatVectorValues) { + knnVectorValues = transformer.wrap(floatVectorValues); + } else if (knnVectorValues instanceof KNNByteVectorValues byteVectorValues) { + knnVectorValues = transformer.wrap(byteVectorValues); } final KNNVectorValues finalVectorValues = knnVectorValues; @@ -95,30 +94,6 @@ public void addKNNBinaryField(FieldInfo field, DocValuesProducer valuesProducer, } } - /** - * Returns true when the vectors fed to the native (Faiss) build must be L2-normalized because - * the user-facing space type is cosine similarity. Model-based fields resolve the space type - * through {@link ModelUtil#getModelMetadata(String)} which reads from cluster state (no network I/O). - */ - private boolean needsNormalizationForFaissBuild(FieldInfo field, VectorDataType vectorDataType) { - if (vectorDataType != VectorDataType.FLOAT) { - return false; - } - if (extractKNNEngine(field) != KNNEngine.FAISS) { - return false; - } - final String spaceTypeStr = field.getAttribute(KNNConstants.SPACE_TYPE); - if (StringUtils.isNotEmpty(spaceTypeStr)) { - return SpaceType.COSINESIMIL.getValue().equals(spaceTypeStr); - } - final String modelId = field.getAttribute(KNNConstants.MODEL_ID); - if (StringUtils.isNotEmpty(modelId)) { - final ModelMetadata modelMetadata = ModelUtil.getModelMetadata(modelId); - return modelMetadata != null && modelMetadata.getSpaceType() == SpaceType.COSINESIMIL; - } - return false; - } - /** * Merges in the fields from the readers in mergeState * diff --git a/src/main/java/org/opensearch/knn/index/mapper/NormalizeVectorTransformer.java b/src/main/java/org/opensearch/knn/index/mapper/NormalizeVectorTransformer.java index b9bd4c5e12..63dacbe112 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/NormalizeVectorTransformer.java +++ b/src/main/java/org/opensearch/knn/index/mapper/NormalizeVectorTransformer.java @@ -5,6 +5,8 @@ package org.opensearch.knn.index.mapper; import org.apache.lucene.util.VectorUtil; +import org.opensearch.knn.index.vectorvalues.KNNFloatVectorValues; +import org.opensearch.knn.index.vectorvalues.NormalizingKNNFloatVectorValues; import java.util.Arrays; @@ -38,6 +40,15 @@ public void transform(byte[] vector) { throw new UnsupportedOperationException("Byte array normalization is not supported"); } + /** + * Returns a {@link NormalizingKNNFloatVectorValues} that L2-normalizes vectors on demand + * without mutating the underlying data. + */ + @Override + public KNNFloatVectorValues wrap(final KNNFloatVectorValues delegate) { + return new NormalizingKNNFloatVectorValues(delegate); + } + private void validateVector(float[] vector) { if (vector == null || vector.length == 0) { throw new IllegalArgumentException("Vector cannot be null or empty"); diff --git a/src/main/java/org/opensearch/knn/index/mapper/VectorTransformer.java b/src/main/java/org/opensearch/knn/index/mapper/VectorTransformer.java index ceecd2002c..90154b25c6 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/VectorTransformer.java +++ b/src/main/java/org/opensearch/knn/index/mapper/VectorTransformer.java @@ -4,6 +4,9 @@ */ package org.opensearch.knn.index.mapper; +import org.opensearch.knn.index.vectorvalues.KNNByteVectorValues; +import org.opensearch.knn.index.vectorvalues.KNNFloatVectorValues; + /** * Defines operations for transforming vectors in the k-NN search context. * Implementations can modify vectors while preserving their dimensional properties @@ -35,4 +38,29 @@ default void transform(final byte[] vector) { throw new IllegalArgumentException("Input vector cannot be null"); } } + + /** + * Wraps a {@link KNNFloatVectorValues} stream so that each vector returned by {@code getVector()} + * is transformed on demand. Default implementation is a pass-through. + * + *

Used by codec-layer components (e.g. {@code KNN80DocValuesConsumer}) to apply vector + * transformations to the stream of vectors fed into native index builders, without mutating + * the original vectors stored in {@code BinaryDocValues}. + * + * @param delegate the underlying stream of float vectors + * @return a stream that applies the transformation on the fly; returns {@code delegate} unchanged + * for no-op implementations + */ + default KNNFloatVectorValues wrap(final KNNFloatVectorValues delegate) { + return delegate; + } + + /** + * Wraps a {@link KNNByteVectorValues} stream. Default implementation is a pass-through. + * Kept symmetric with {@link #wrap(KNNFloatVectorValues)} so callers can apply transformations + * uniformly regardless of the vector element type. + */ + default KNNByteVectorValues wrap(final KNNByteVectorValues delegate) { + return delegate; + } } diff --git a/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java b/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java index b443daf7d4..d90d4427b2 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java +++ b/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java @@ -7,10 +7,22 @@ import lombok.AccessLevel; import lombok.NoArgsConstructor; +import org.apache.commons.lang3.StringUtils; +import org.apache.lucene.index.FieldInfo; +import org.opensearch.common.xcontent.XContentHelper; +import org.opensearch.core.common.bytes.BytesArray; +import org.opensearch.core.xcontent.DeprecationHandler; +import org.opensearch.core.xcontent.MediaTypeRegistry; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.knn.common.FieldInfoExtractor; +import org.opensearch.knn.common.KNNConstants; import org.opensearch.knn.index.SpaceType; import org.opensearch.knn.index.engine.KNNEngine; import org.opensearch.knn.index.engine.MethodComponentContext; +import org.opensearch.knn.indices.ModelMetadata; +import org.opensearch.knn.indices.ModelUtil; +import java.util.HashMap; import java.util.Map; import static org.opensearch.knn.common.KNNConstants.ENCODER_SQ; @@ -55,6 +67,93 @@ public static VectorTransformer getVectorTransformer( return shouldNormalizeVector(knnEngine, spaceType, methodComponentContext) ? DEFAULT_VECTOR_TRANSFORMER : NOOP_VECTOR_TRANSFORMER; } + /** + * Returns the {@link VectorTransformer} for a field by reading the required metadata directly + * from the {@link FieldInfo}. Intended for codec-layer callers that do not have access to a + * resolved {@link MethodComponentContext}. + * + *

Space type resolution: + *

+ * + *

{@link MethodComponentContext} resolution: + *

+ * + * @param fieldInfo field metadata from the Lucene segment + * @return a {@link VectorTransformer}, possibly {@link #NOOP_VECTOR_TRANSFORMER} + */ + public static VectorTransformer getVectorTransformer(final FieldInfo fieldInfo) { + final KNNEngine engine = FieldInfoExtractor.extractKNNEngine(fieldInfo); + final SpaceType spaceType = resolveSpaceTypeFromFieldInfo(fieldInfo); + if (spaceType == null) { + return NOOP_VECTOR_TRANSFORMER; + } + final MethodComponentContext methodCtx = resolveMethodComponentContextFromFieldInfo(fieldInfo); + return getVectorTransformer(engine, spaceType, methodCtx); + } + + private static SpaceType resolveSpaceTypeFromFieldInfo(final FieldInfo fieldInfo) { + final String spaceTypeStr = fieldInfo.getAttribute(KNNConstants.SPACE_TYPE); + if (StringUtils.isNotEmpty(spaceTypeStr)) { + return SpaceType.getSpace(spaceTypeStr); + } + final String modelId = fieldInfo.getAttribute(KNNConstants.MODEL_ID); + if (StringUtils.isNotEmpty(modelId)) { + final ModelMetadata metadata = ModelUtil.getModelMetadata(modelId); + return metadata != null ? metadata.getSpaceType() : null; + } + return null; + } + + private static MethodComponentContext resolveMethodComponentContextFromFieldInfo(final FieldInfo fieldInfo) { + final String parametersString = fieldInfo.getAttribute(KNNConstants.PARAMETERS); + if (StringUtils.isNotEmpty(parametersString)) { + try { + final Map parsed = XContentHelper.createParser( + NamedXContentRegistry.EMPTY, + DeprecationHandler.THROW_UNSUPPORTED_OPERATION, + new BytesArray(parametersString), + MediaTypeRegistry.getDefaultMediaType() + ).map(); + // The JSON written by EngineFieldMapper contains top-level keys beyond NAME/PARAMETERS + // (e.g. space_type, vector_data_type, index_description). MethodComponentContext.parse + // rejects unknown keys, so we narrow down to the two fields it understands before parsing. + final Map methodMap = new HashMap<>(); + if (parsed.containsKey(KNNConstants.NAME)) { + methodMap.put(KNNConstants.NAME, parsed.get(KNNConstants.NAME)); + } + if (parsed.containsKey(KNNConstants.PARAMETERS)) { + methodMap.put(KNNConstants.PARAMETERS, parsed.get(KNNConstants.PARAMETERS)); + } + if (!methodMap.isEmpty()) { + return MethodComponentContext.parse(methodMap); + } + } catch (Exception e) { + // If parsing fails for any reason, fall through to other resolution paths. + } + } + final String modelId = fieldInfo.getAttribute(KNNConstants.MODEL_ID); + if (StringUtils.isNotEmpty(modelId)) { + final ModelMetadata metadata = ModelUtil.getModelMetadata(modelId); + if (metadata != null && metadata.getMethodComponentContext() != null) { + return metadata.getMethodComponentContext(); + } + } + return null; + } + private static boolean shouldNormalizeVector( final KNNEngine knnEngine, final SpaceType spaceType, diff --git a/src/test/java/org/opensearch/knn/index/mapper/NormalizeVectorTransformerTests.java b/src/test/java/org/opensearch/knn/index/mapper/NormalizeVectorTransformerTests.java index 7e92dc5f18..ce6b543f78 100644 --- a/src/test/java/org/opensearch/knn/index/mapper/NormalizeVectorTransformerTests.java +++ b/src/test/java/org/opensearch/knn/index/mapper/NormalizeVectorTransformerTests.java @@ -6,6 +6,12 @@ package org.opensearch.knn.index.mapper; import org.opensearch.knn.KNNTestCase; +import org.opensearch.knn.index.vectorvalues.KNNFloatVectorValues; +import org.opensearch.knn.index.vectorvalues.KNNVectorValues; +import org.opensearch.knn.index.vectorvalues.NormalizingKNNFloatVectorValues; +import org.opensearch.knn.index.vectorvalues.TestVectorValues; + +import java.util.List; public class NormalizeVectorTransformerTests extends KNNTestCase { private final NormalizeVectorTransformer transformer = new NormalizeVectorTransformer(); @@ -52,6 +58,19 @@ public void testNormalizeTransformer_noInplaceUpdate_withValidVector_thenSuccess assertEquals(1.0f, calculateMagnitude(transformedVector), DELTA); } + public void testWrap_returnsNormalizingKNNFloatVectorValues() throws Exception { + KNNVectorValues delegate = TestVectorValues.createKNNFloatVectorValues(List.of(new float[] { -3.0f, 4.0f })); + KNNFloatVectorValues wrapped = transformer.wrap((KNNFloatVectorValues) delegate); + + assertTrue(wrapped instanceof NormalizingKNNFloatVectorValues); + + wrapped.nextDoc(); + float[] normalized = wrapped.getVector(); + assertEquals(-0.6f, normalized[0], DELTA); + assertEquals(0.8f, normalized[1], DELTA); + assertEquals(1.0f, calculateMagnitude(normalized), DELTA); + } + private float calculateMagnitude(float[] vector) { float magnitude = 0.0f; for (float value : vector) { diff --git a/src/test/java/org/opensearch/knn/index/mapper/VectorTransformerFactoryTests.java b/src/test/java/org/opensearch/knn/index/mapper/VectorTransformerFactoryTests.java index 29e3ca41a2..8388cb2537 100644 --- a/src/test/java/org/opensearch/knn/index/mapper/VectorTransformerFactoryTests.java +++ b/src/test/java/org/opensearch/knn/index/mapper/VectorTransformerFactoryTests.java @@ -5,9 +5,16 @@ package org.opensearch.knn.index.mapper; +import org.apache.lucene.index.FieldInfo; +import org.opensearch.common.xcontent.XContentFactory; import org.opensearch.knn.KNNTestCase; +import org.opensearch.knn.common.KNNConstants; import org.opensearch.knn.index.SpaceType; +import org.opensearch.knn.index.VectorDataType; +import org.opensearch.knn.index.codec.KNNCodecTestUtil; import org.opensearch.knn.index.engine.KNNEngine; +import org.opensearch.knn.index.engine.KNNMethodConfigContext; +import org.opensearch.knn.index.engine.KNNMethodContext; import org.opensearch.knn.index.engine.MethodComponentContext; import java.util.HashMap; @@ -84,6 +91,79 @@ public void testFaissCosine_withMethodComponentContext_returnsNormalizer() { assertTrue(transformer instanceof NormalizeVectorTransformer); } + public void testGetVectorTransformer_fromFieldInfo_faissCosine_returnsNormalizer() throws Exception { + FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.FAISS, SpaceType.COSINESIMIL, METHOD_HNSW, new HashMap<>()); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + assertTrue(transformer instanceof NormalizeVectorTransformer); + } + + public void testGetVectorTransformer_fromFieldInfo_faissL2_returnsNoop() throws Exception { + FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.FAISS, SpaceType.L2, METHOD_HNSW, new HashMap<>()); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + assertSame(VectorTransformerFactory.NOOP_VECTOR_TRANSFORMER, transformer); + } + + public void testGetVectorTransformer_fromFieldInfo_missingAttributes_returnsNoop() { + FieldInfo fieldInfo = KNNCodecTestUtil.FieldInfoBuilder.builder("test_field").build(); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + assertSame(VectorTransformerFactory.NOOP_VECTOR_TRANSFORMER, transformer); + } + + public void testGetVectorTransformer_fromFieldInfo_luceneCosineFlat_returnsNormalizer() throws Exception { + FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.LUCENE, SpaceType.COSINESIMIL, METHOD_FLAT, new HashMap<>()); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + assertTrue(transformer instanceof NormalizeVectorTransformer); + } + + public void testGetVectorTransformer_fromFieldInfo_luceneCosineSQOneBit_returnsNormalizer() throws Exception { + // Build a nested encoder as a plain Map because the test helper JSON-serializes + // the library parameters map; only plain Maps (not MethodComponentContext instances) serialize + // correctly. The parser in VectorTransformerFactory reconstructs MethodComponentContext from the + // nested Map. + Map encoderMap = new HashMap<>(); + encoderMap.put(KNNConstants.NAME, ENCODER_SQ); + encoderMap.put(KNNConstants.PARAMETERS, Map.of(LUCENE_SQ_BITS, 1)); + Map hnswParams = new HashMap<>(Map.of(METHOD_ENCODER_PARAMETER, encoderMap)); + FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.LUCENE, SpaceType.COSINESIMIL, METHOD_HNSW, hnswParams); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + assertTrue(transformer instanceof NormalizeVectorTransformer); + } + + public void testGetVectorTransformer_fromFieldInfo_luceneCosineHnswNoEncoder_returnsNoop() throws Exception { + FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.LUCENE, SpaceType.COSINESIMIL, METHOD_HNSW, new HashMap<>()); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + assertSame(VectorTransformerFactory.NOOP_VECTOR_TRANSFORMER, transformer); + } + + /** + * Builds a FieldInfo whose attributes match what EngineFieldMapper writes, so that the test exercises + * the same PARAMETERS JSON format used at runtime. + */ + private static FieldInfo buildFieldInfoWithParameters( + KNNEngine engine, + SpaceType spaceType, + String methodName, + Map methodParameters + ) throws Exception { + KNNMethodContext knnMethodContext = new KNNMethodContext( + engine, + spaceType, + new MethodComponentContext(methodName, methodParameters) + ); + KNNMethodConfigContext knnMethodConfigContext = KNNMethodConfigContext.builder() + .vectorDataType(VectorDataType.FLOAT) + .versionCreated(org.opensearch.Version.CURRENT) + .build(); + String parametersString = XContentFactory.jsonBuilder() + .map(engine.getKNNLibraryIndexingContext(knnMethodContext, knnMethodConfigContext).getLibraryParameters()) + .toString(); + return KNNCodecTestUtil.FieldInfoBuilder.builder("test_field") + .addAttribute(KNNConstants.KNN_ENGINE, engine.getName()) + .addAttribute(KNNConstants.SPACE_TYPE, spaceType.getValue()) + .addAttribute(KNNConstants.PARAMETERS, parametersString) + .build(); + } + private static void validateTransformer(SpaceType spaceType, KNNEngine engine, VectorTransformer transformer) { if (spaceType == SpaceType.COSINESIMIL && engine == KNNEngine.FAISS) { assertTrue( From 49edb23de93a00873554abda1bae6e9fd65236f9 Mon Sep 17 00:00:00 2001 From: Takuo Kuroki Date: Mon, 27 Apr 2026 15:05:31 +0900 Subject: [PATCH 3/4] Make MethodComponentContext resolution opt-in in getVectorTransformer(FieldInfo) Parsing the PARAMETERS attribute JSON on every vector flush/merge adds a cost that is only worth paying when a caller actually needs the Lucene flat / SQ 1-bit conditions to be evaluated. The build path in KNN80DocValuesConsumer only uses engine + space type (Faiss + cosine), so it can safely skip the parse. Add a boolean resolveMethodContext flag to VectorTransformerFactory.getVectorTransformer(FieldInfo, boolean): - false: reuse just engine + space type (cheap, covers the Faiss + cosine case that drives the codec-layer wrap). - true: additionally reconstruct MethodComponentContext from the PARAMETERS attribute, enabling the Lucene-specific conditions. Tests exercise this path for Lucene + cosine + flat and Lucene + cosine + SQ 1-bit. Callers updated: - KNN80DocValuesConsumer: passes false. - VectorTransformerFactoryTests: passes true to keep coverage of the Lucene flat / SQ 1-bit paths. Signed-off-by: Takuo Kuroki --- .../codec/KNN80Codec/KNN80DocValuesConsumer.java | 2 +- .../knn/index/mapper/VectorTransformerFactory.java | 8 +++++--- .../index/mapper/VectorTransformerFactoryTests.java | 12 ++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java index 7486fba413..1def614f24 100644 --- a/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java +++ b/src/main/java/org/opensearch/knn/index/codec/KNN80Codec/KNN80DocValuesConsumer.java @@ -77,7 +77,7 @@ public void addKNNBinaryField(FieldInfo field, DocValuesProducer valuesProducer, // builder. For Faiss + cosine this yields a normalized stream while BinaryDocValues keep the // original vectors untouched. For all other combinations the factory returns a no-op and // knnVectorValues is passed through unchanged. See issue #3083. - final VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(field); + final VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(field, false); if (knnVectorValues instanceof KNNFloatVectorValues floatVectorValues) { knnVectorValues = transformer.wrap(floatVectorValues); } else if (knnVectorValues instanceof KNNByteVectorValues byteVectorValues) { diff --git a/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java b/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java index d90d4427b2..fcadbc5ceb 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java +++ b/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java @@ -94,14 +94,16 @@ public static VectorTransformer getVectorTransformer( * @param fieldInfo field metadata from the Lucene segment * @return a {@link VectorTransformer}, possibly {@link #NOOP_VECTOR_TRANSFORMER} */ - public static VectorTransformer getVectorTransformer(final FieldInfo fieldInfo) { + public static VectorTransformer getVectorTransformer(final FieldInfo fieldInfo, boolean resolveMethodComponentContext) { final KNNEngine engine = FieldInfoExtractor.extractKNNEngine(fieldInfo); final SpaceType spaceType = resolveSpaceTypeFromFieldInfo(fieldInfo); if (spaceType == null) { return NOOP_VECTOR_TRANSFORMER; } - final MethodComponentContext methodCtx = resolveMethodComponentContextFromFieldInfo(fieldInfo); - return getVectorTransformer(engine, spaceType, methodCtx); + if(resolveMethodComponentContext) { + return getVectorTransformer(engine, spaceType, resolveMethodComponentContextFromFieldInfo(fieldInfo)); + } + return getVectorTransformer(engine, spaceType, null); } private static SpaceType resolveSpaceTypeFromFieldInfo(final FieldInfo fieldInfo) { diff --git a/src/test/java/org/opensearch/knn/index/mapper/VectorTransformerFactoryTests.java b/src/test/java/org/opensearch/knn/index/mapper/VectorTransformerFactoryTests.java index 8388cb2537..f9b432ffc8 100644 --- a/src/test/java/org/opensearch/knn/index/mapper/VectorTransformerFactoryTests.java +++ b/src/test/java/org/opensearch/knn/index/mapper/VectorTransformerFactoryTests.java @@ -93,25 +93,25 @@ public void testFaissCosine_withMethodComponentContext_returnsNormalizer() { public void testGetVectorTransformer_fromFieldInfo_faissCosine_returnsNormalizer() throws Exception { FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.FAISS, SpaceType.COSINESIMIL, METHOD_HNSW, new HashMap<>()); - VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo, true); assertTrue(transformer instanceof NormalizeVectorTransformer); } public void testGetVectorTransformer_fromFieldInfo_faissL2_returnsNoop() throws Exception { FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.FAISS, SpaceType.L2, METHOD_HNSW, new HashMap<>()); - VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo, true); assertSame(VectorTransformerFactory.NOOP_VECTOR_TRANSFORMER, transformer); } public void testGetVectorTransformer_fromFieldInfo_missingAttributes_returnsNoop() { FieldInfo fieldInfo = KNNCodecTestUtil.FieldInfoBuilder.builder("test_field").build(); - VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo, true); assertSame(VectorTransformerFactory.NOOP_VECTOR_TRANSFORMER, transformer); } public void testGetVectorTransformer_fromFieldInfo_luceneCosineFlat_returnsNormalizer() throws Exception { FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.LUCENE, SpaceType.COSINESIMIL, METHOD_FLAT, new HashMap<>()); - VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo, true); assertTrue(transformer instanceof NormalizeVectorTransformer); } @@ -125,13 +125,13 @@ public void testGetVectorTransformer_fromFieldInfo_luceneCosineSQOneBit_returnsN encoderMap.put(KNNConstants.PARAMETERS, Map.of(LUCENE_SQ_BITS, 1)); Map hnswParams = new HashMap<>(Map.of(METHOD_ENCODER_PARAMETER, encoderMap)); FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.LUCENE, SpaceType.COSINESIMIL, METHOD_HNSW, hnswParams); - VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo, true); assertTrue(transformer instanceof NormalizeVectorTransformer); } public void testGetVectorTransformer_fromFieldInfo_luceneCosineHnswNoEncoder_returnsNoop() throws Exception { FieldInfo fieldInfo = buildFieldInfoWithParameters(KNNEngine.LUCENE, SpaceType.COSINESIMIL, METHOD_HNSW, new HashMap<>()); - VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo); + VectorTransformer transformer = VectorTransformerFactory.getVectorTransformer(fieldInfo, true); assertSame(VectorTransformerFactory.NOOP_VECTOR_TRANSFORMER, transformer); } From 254214971cbdae96492c1f1a8cdf34077ca74f2b Mon Sep 17 00:00:00 2001 From: Takuo Kuroki Date: Mon, 27 Apr 2026 16:54:26 +0900 Subject: [PATCH 4/4] Tighten derived source cosine ITs to exact float equality BinaryDocValues now hold the original, unnormalized vectors bit-for-bit, so the retrieved _source vectors should match the indexed vectors exactly. The previous 1e-4 tolerance was needed when norm*denorm roundtripping introduced rounding; it no longer applies. Replace the 1e-4f delta with 0.0f in the three cosine-derived-source tests and update the surrounding comment to reflect the new contract. Signed-off-by: Takuo Kuroki --- .../index/mapper/VectorTransformerFactory.java | 1 + .../org/opensearch/knn/integ/DerivedSourceIT.java | 15 ++++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java b/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java index fcadbc5ceb..fdd3aa6702 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java +++ b/src/main/java/org/opensearch/knn/index/mapper/VectorTransformerFactory.java @@ -92,6 +92,7 @@ public static VectorTransformer getVectorTransformer( * * * @param fieldInfo field metadata from the Lucene segment + * * @return a {@link VectorTransformer}, possibly {@link #NOOP_VECTOR_TRANSFORMER} */ public static VectorTransformer getVectorTransformer(final FieldInfo fieldInfo, boolean resolveMethodComponentContext) { diff --git a/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java b/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java index d3e5c33fb0..d3103b5f17 100644 --- a/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java +++ b/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java @@ -886,13 +886,14 @@ public void testCosineSimDerivedSourceReturnsOriginalVector() { assertNotNull("Derived source disabled: vector should not be null for doc " + i, sourceVectorDisabled); assertEquals(dimension, sourceVectorEnabled.size()); - // Compare with original vector - allow small floating point tolerance due to norm * denorm roundtrip + // Compare with original vector. Normalization now happens only on the native index build path, + // so BinaryDocValues keep the user-indexed vectors bit-for-bit. for (int j = 0; j < dimension; j++) { assertEquals( "Doc " + i + " dim " + j + ": derived source vector should match original", originalVectors[i][j], sourceVectorEnabled.get(j).floatValue(), - 1e-4f + 0.0f ); } } @@ -910,7 +911,7 @@ public void testCosineSimDerivedSourceReturnsOriginalVector() { "After merge - Doc " + i + " dim " + j + ": derived source vector should match original", originalVectors[i][j], sourceVector.get(j).floatValue(), - 1e-4f + 0.0f ); } } @@ -999,7 +1000,7 @@ public void testCosineSimDerivedSourceWithZeroVector() { "Normal vector dim " + j + " should match original", normalVector[j], vectorEnabled.get(j).floatValue(), - 1e-4f + 0.0f ); } @@ -1014,7 +1015,7 @@ public void testCosineSimDerivedSourceWithZeroVector() { "After merge - normal vector dim " + j + " should match original", normalVector[j], mergedVector.get(j).floatValue(), - 1e-4f + 0.0f ); } @@ -1097,7 +1098,7 @@ public void testCosineSimDerivedSourceMultiSegmentMerge() { "Before merge - Doc " + i + " dim " + j, allVectors[i][j], sourceVector.get(j).floatValue(), - 1e-4f + 0.0f ); } } @@ -1116,7 +1117,7 @@ public void testCosineSimDerivedSourceMultiSegmentMerge() { "After merge - Doc " + i + " dim " + j, allVectors[i][j], sourceVector.get(j).floatValue(), - 1e-4f + 0.0f ); } }