1 change: 1 addition & 0 deletions eng/packages/TestOnly.props
@@ -11,6 +11,7 @@
<PackageVersion Include="Microsoft.Diagnostics.Tracing.TraceEvent" Version="3.1.3" />
<PackageVersion Include="Microsoft.Extensions.Configuration.EnvironmentVariables" Version="9.0.0" />
<PackageVersion Include="Microsoft.Extensions.Configuration.UserSecrets" Version="9.0.0" />
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="$(MicrosoftMLTokenizersVersion)" />
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="$(MicrosoftMLTokenizersVersion)" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.InMemory" Version="$(MicrosoftSemanticKernelConnectorsVersion)" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.SqliteVec" Version="$(MicrosoftSemanticKernelConnectorsVersion)" />
@@ -128,7 +128,11 @@ public IngestionDocumentHeader(string markdown)
/// <summary>
/// Gets or sets the level of the header. Values outside the range 0 to 10 (inclusive) are rejected.
/// </summary>
public int? Level { get; set; }
public int? Level
{
get => field;
set => field = Throw.IfOutOfRange(value.GetValueOrDefault(), min: 0, max: 10, nameof(value));
}
}

/// <summary>
@@ -0,0 +1,266 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.ML.Tokenizers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion.Chunkers;

internal sealed class ElementsChunker
{
private readonly Tokenizer _tokenizer;
private readonly int _maxTokensPerChunk;
private StringBuilder? _currentChunk;

internal ElementsChunker(IngestionChunkerOptions options)
{
_ = Throw.IfNull(options);

_tokenizer = options.Tokenizer;
_maxTokensPerChunk = options.MaxTokensPerChunk;
}

// Goals:
// 1. Create chunks that do not exceed _maxTokensPerChunk when tokenized.
// 2. Maintain context in each chunk.
// 3. If a single IngestionDocumentElement exceeds _maxTokensPerChunk, it should be split intelligently (e.g., paragraphs can be split into sentences, tables into rows).
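// High-level flow (a summary of the logic below): elements are appended to _currentChunk until adding
// the next one would exceed the budget, at which point the chunk is committed and a fresh one starts
// with the same context. Tables and oversized elements get the dedicated splitting handled below.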
internal IEnumerable<IngestionChunk<string>> Process(IngestionDocument document, string context, List<IngestionDocumentElement> elements)
{
// Not using yield return here as we use ref structs.
List<IngestionChunk<string>> chunks = [];

// Token count != character count, but StringBuilder will grow as needed.
_currentChunk ??= new(capacity: _maxTokensPerChunk);

int contextTokenCount = CountTokens(context.AsSpan());
int totalTokenCount = contextTokenCount;

// If the context alone already consumes the whole token budget, no element can fit.
if (contextTokenCount >= _maxTokensPerChunk)
{
ThrowTokenCountExceeded();
}

_currentChunk = _currentChunk.Append(context);

for (int elementIndex = 0; elementIndex < elements.Count; elementIndex++)
{
IngestionDocumentElement element = elements[elementIndex];
string? semanticContent = element switch
{
// Image exposes:
// - Markdown: ![Alt Text](url) which is not very useful for embedding.
// - AlternativeText: usually a short description of the image, can be null or empty. It is usually less than 50 words.
// - Text: result of OCR, can be longer, but also can be null or empty. It can be several hundred words.
// We prefer AlternativeText over Text, as it is usually more relevant.
IngestionDocumentImage image => image.AlternativeText ?? image.Text,
_ => element.GetMarkdown()
};

if (string.IsNullOrEmpty(semanticContent))
{
continue; // An image can come with Markdown, but no AlternativeText or Text.
}

int elementTokenCount = CountTokens(semanticContent.AsSpan());
if (elementTokenCount + totalTokenCount <= _maxTokensPerChunk)
{
totalTokenCount += elementTokenCount;
AppendNewLineAndSpan(_currentChunk, semanticContent.AsSpan());
}
else if (element is IngestionDocumentTable table)
{
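// The table does not fit as a whole, so it is rebuilt row by row as Markdown. Whenever the budget is
// reached, the accumulated rows are flushed and the header row (plus separator) is repeated at the top
// of the next fragment, so every emitted piece is still a well-formed table.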
ValueStringBuilder tableBuilder = new(initialCapacity: 8000);
AddMarkdownTableRow(table, rowIndex: 0, ref tableBuilder);
AddMarkdownTableSeparatorRow(columnCount: table.Cells.GetLength(1), ref tableBuilder);

int headerLength = tableBuilder.Length;
int headerTokenCount = CountTokens(tableBuilder.AsSpan());

// We can't respect the limit if the context and the table header alone already use up the token budget.
if (contextTokenCount + headerTokenCount >= _maxTokensPerChunk)
{
tableBuilder.Dispose();
ThrowTokenCountExceeded();
}

if (headerTokenCount + totalTokenCount >= _maxTokensPerChunk)
{
// We can't add the header row, so commit what we have accumulated so far.
Commit();
}

totalTokenCount += headerTokenCount;
int tableLength = headerLength;

int rowCount = table.Cells.GetLength(0);
for (int rowIndex = 1; rowIndex < rowCount; rowIndex++)
{
AddMarkdownTableRow(table, rowIndex, ref tableBuilder);

int lastRowTokens = CountTokens(tableBuilder.AsSpan(tableLength));

// Appending this row would exceed the limit.
if (totalTokenCount + lastRowTokens > _maxTokensPerChunk)
{
// We append the table as long as it's not just the header.
if (rowIndex != 1)
{
AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length));
}

// And commit the table we built so far.
Commit();

// Erase previous rows and keep only the header.
tableBuilder.Length = headerLength;
tableLength = headerLength;
totalTokenCount += headerTokenCount;

if (totalTokenCount + lastRowTokens > _maxTokensPerChunk)
{
// This row is simply too big even for a fresh chunk:
tableBuilder.Dispose();
ThrowTokenCountExceeded();
}

AddMarkdownTableRow(table, rowIndex, ref tableBuilder);
}

tableLength = tableBuilder.Length;
totalTokenCount += lastRowTokens;
}

AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length));
tableBuilder.Dispose();
}
else
{
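// The element alone does not fit into what is left of the current chunk, so it is split at token
// boundaries, preferring to break at the last newline that still fits, and committed piece by piece.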
ReadOnlySpan<char> remainingContent = semanticContent.AsSpan();

while (!remainingContent.IsEmpty)
{
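// Ask the tokenizer how many characters of remainingContent fit into the tokens still available.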
int index = _tokenizer.GetIndexByTokenCount(
text: remainingContent,
maxTokenCount: _maxTokensPerChunk - totalTokenCount,
out string? normalizedText,
out int tokenCount,
considerNormalization: false); // We don't normalize, just append as-is to keep original content.

// Some tokens fit into the remaining budget.
if (index > 0)
{
// We could try to split by sentences or other delimiters, but it's complicated.
// For simplicity, we will just split at the last new line that fits.
// Our promise is not to go over the max token count, not to create perfect chunks.
int newLineIndex = remainingContent.Slice(0, index).LastIndexOf('\n');
if (newLineIndex > 0)
{
index = newLineIndex + 1; // We want to include the new line character (works for "\r\n" as well).
tokenCount = CountTokens(remainingContent.Slice(0, index));
}

totalTokenCount += tokenCount;
ReadOnlySpan<char> spanToAppend = remainingContent.Slice(0, index);
AppendNewLineAndSpan(_currentChunk, spanToAppend);
remainingContent = remainingContent.Slice(index);
}
else if (totalTokenCount == contextTokenCount)
{
// We are at the beginning of a chunk, and even a single token does not fit.
ThrowTokenCountExceeded();
}

if (!remainingContent.IsEmpty)
{
Commit();
}
}
}

if (totalTokenCount == _maxTokensPerChunk)
{
Commit();
}
}

if (totalTokenCount > contextTokenCount)
{
chunks.Add(new(_currentChunk.ToString(), document, context));
}

_currentChunk = _currentChunk.Clear();

return chunks;

void Commit()
{
chunks.Add(new(_currentChunk.ToString(), document, context));

// We keep the context in the current chunk as it's the same for all elements.
_currentChunk = _currentChunk.Remove(
startIndex: context.Length,
length: _currentChunk.Length - context.Length);
totalTokenCount = contextTokenCount;
}

static void ThrowTokenCountExceeded()
=> throw new InvalidOperationException("Can't fit in the current chunk. Consider increasing max tokens per chunk.");
}

private static void AppendNewLineAndSpan(StringBuilder stringBuilder, ReadOnlySpan<char> chars)
{
// Don't start an empty chunk (no context provided) with a new line.
if (stringBuilder.Length > 0)
{
stringBuilder.AppendLine();
}

#if NET
stringBuilder.Append(chars);
#else
stringBuilder.Append(chars.ToString());
#endif
}

private static void AddMarkdownTableRow(IngestionDocumentTable table, int rowIndex, ref ValueStringBuilder vsb)
{
for (int columnIndex = 0; columnIndex < table.Cells.GetLength(1); columnIndex++)
{
vsb.Append('|');
vsb.Append(' ');
string? cellContent = table.Cells[rowIndex, columnIndex] switch
{
null => null,
IngestionDocumentImage img => img.AlternativeText ?? img.Text,
IngestionDocumentElement other => other.GetMarkdown()
};
vsb.Append(cellContent);
vsb.Append(' ');
}

vsb.Append('|');
vsb.Append(Environment.NewLine);
}

private static void AddMarkdownTableSeparatorRow(int columnCount, ref ValueStringBuilder vsb)
{
const int DashCount = 3; // The dash count does not need to match the header length.
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
vsb.Append('|');
vsb.Append(' ');
vsb.Append('-', DashCount);
vsb.Append(' ');
}

vsb.Append('|');
vsb.Append(Environment.NewLine);
}

private int CountTokens(ReadOnlySpan<char> input)
=> _tokenizer.CountTokens(input, considerNormalization: false);
}
@@ -0,0 +1,84 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
using Microsoft.Extensions.DataIngestion.Chunkers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Splits documents into chunks based on headers and their corresponding levels, preserving the header context.
/// </summary>
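/// <remarks>
/// Illustrative behavior (a sketch, not an exhaustive contract): for a document containing "# A",
/// paragraph P1, "## B", paragraph P2, the chunker yields one chunk with context "# A" containing P1,
/// followed by one chunk with context "# A ## B" containing P2.
/// </remarks>
/// <example>
/// A minimal usage sketch; how <see cref="IngestionChunkerOptions"/> is configured is left out on purpose:
/// <code>
/// // Assumed: 'options' is an IngestionChunkerOptions carrying a Microsoft.ML.Tokenizers tokenizer
/// // and a MaxTokensPerChunk budget.
/// var chunker = new HeaderChunker(options);
/// await foreach (var chunk in chunker.ProcessAsync(document))
/// {
///     // Each chunk carries the joined header context and the elements that fit under the token budget.
/// }
/// </code>
/// </example>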
public sealed class HeaderChunker : IngestionChunker<string>
{
private const int MaxHeaderLevel = 10;
private readonly ElementsChunker _elementsChunker;

/// <summary>
/// Initializes a new instance of the <see cref="HeaderChunker"/> class.
/// </summary>
/// <param name="options">The options for the chunker.</param>
public HeaderChunker(IngestionChunkerOptions options)
{
_elementsChunker = new(options);
}

/// <inheritdoc/>
#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously
public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document,
#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
[EnumeratorCancellation] CancellationToken cancellationToken = default)
{
_ = Throw.IfNull(document);

List<IngestionDocumentElement> elements = new(20);
string?[] headers = new string?[MaxHeaderLevel + 1];

foreach (IngestionDocumentElement element in document.EnumerateContent())
{
cancellationToken.ThrowIfCancellationRequested();

if (element is IngestionDocumentHeader header)
{
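// A new header closes the current group: flush the accumulated elements as chunks under the
// current header context before the context is updated.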
foreach (var chunk in SplitIntoChunks(document, headers, elements))
{
yield return chunk;
}

int headerLevel = header.Level.GetValueOrDefault();
headers[headerLevel] = header.GetMarkdown();
headers.AsSpan(headerLevel + 1).Clear(); // clear all lower level headers

continue; // don't add headers to the elements list, they are part of the context
}

elements.Add(element);
}

// Take care of any elements remaining after the last header.
foreach (var chunk in SplitIntoChunks(document, headers, elements))
{
yield return chunk;
}
}

private IEnumerable<IngestionChunk<string>> SplitIntoChunks(IngestionDocument document, string?[] headers, List<IngestionDocumentElement> elements)
{
if (elements.Count > 0)
{
string chunkHeader = string.Join(" ", headers.Where(h => !string.IsNullOrEmpty(h)));

foreach (var chunk in _elementsChunker.Process(document, chunkHeader, elements))
{
yield return chunk;
}

elements.Clear();
}
}
}