1 change: 1 addition & 0 deletions eng/packages/TestOnly.props
@@ -11,6 +11,7 @@
<PackageVersion Include="Microsoft.Diagnostics.Tracing.TraceEvent" Version="3.1.3" />
<PackageVersion Include="Microsoft.Extensions.Configuration.EnvironmentVariables" Version="9.0.0" />
<PackageVersion Include="Microsoft.Extensions.Configuration.UserSecrets" Version="9.0.0" />
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="$(MicrosoftMLTokenizersVersion)" />
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="$(MicrosoftMLTokenizersVersion)" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.InMemory" Version="$(MicrosoftSemanticKernelConnectorsVersion)" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.SqliteVec" Version="$(MicrosoftSemanticKernelConnectorsVersion)" />
@@ -128,7 +128,11 @@ public IngestionDocumentHeader(string markdown)
/// <summary>
/// Gets or sets the level of the header. Values outside the range 0 to 10 (inclusive) are rejected.
/// </summary>
public int? Level { get; set; }
public int? Level
{
get => field;
set => field = Throw.IfOutOfRange(value.GetValueOrDefault(), min: 0, max: 10, nameof(value));
}
}

/// <summary>
@@ -0,0 +1,266 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.ML.Tokenizers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion.Chunkers;

internal sealed class ElementsChunker
{
private readonly Tokenizer _tokenizer;
private readonly int _maxTokensPerChunk;
private StringBuilder? _currentChunk;

internal ElementsChunker(IngestionChunkerOptions options)
{
_ = Throw.IfNull(options);

_tokenizer = options.Tokenizer;
_maxTokensPerChunk = options.MaxTokensPerChunk;
}

// Goals:
// 1. Create chunks that do not exceed _maxTokensPerChunk when tokenized.
// 2. Maintain context in each chunk.
// 3. If a single IngestionDocumentElement exceeds _maxTokensPerChunk, it should be split intelligently (e.g., paragraphs can be split into sentences, tables into rows).
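// High-level flow (a summary of the logic below): elements are appended to _currentChunk until adding
// the next one would exceed the budget, at which point the chunk is committed and a fresh one starts
// with the same context. Tables and oversized elements get the dedicated splitting handled below.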
internal IEnumerable<IngestionChunk<string>> Process(IngestionDocument document, string context, List<IngestionDocumentElement> elements)
{
// Not using yield return here as we use ref structs.
List<IngestionChunk<string>> chunks = [];

// Token count != character count, but StringBuilder will grow as needed.
_currentChunk ??= new(capacity: _maxTokensPerChunk);

int contextTokenCount = CountTokens(context.AsSpan());
int totalTokenCount = contextTokenCount;

// If the context alone already consumes the whole token budget, no element can fit.
if (contextTokenCount >= _maxTokensPerChunk)
{
ThrowTokenCountExceeded();
}

_currentChunk = _currentChunk.Append(context);

for (int elementIndex = 0; elementIndex < elements.Count; elementIndex++)
{
IngestionDocumentElement element = elements[elementIndex];
string? semanticContent = element switch
{
// Image exposes:
// - Markdown: ![Alt Text](url) which is not very useful for embedding.
// - AlternativeText: usually a short description of the image, can be null or empty. It is usually less than 50 words.
// - Text: result of OCR, can be longer, but also can be null or empty. It can be several hundred words.
// We prefer AlternativeText over Text, as it is usually more relevant.
IngestionDocumentImage image => image.AlternativeText ?? image.Text,
_ => element.GetMarkdown()
};

if (string.IsNullOrEmpty(semanticContent))
{
continue; // An image can come with Markdown, but no AlternativeText or Text.
}

int elementTokenCount = CountTokens(semanticContent.AsSpan());
if (elementTokenCount + totalTokenCount <= _maxTokensPerChunk)
{
totalTokenCount += elementTokenCount;
AppendNewLineAndSpan(_currentChunk, semanticContent.AsSpan());
}
else if (element is IngestionDocumentTable table)
{
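// The table does not fit as a whole, so it is rebuilt row by row as Markdown. Whenever the budget is
// reached, the accumulated rows are flushed and the header row (plus separator) is repeated at the top
// of the next fragment, so every emitted piece is still a well-formed table.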
ValueStringBuilder tableBuilder = new(initialCapacity: 8000);
AddMarkdownTableRow(table, rowIndex: 0, ref tableBuilder);
AddMarkdownTableSeparatorRow(columnCount: table.Cells.GetLength(1), ref tableBuilder);

int headerLength = tableBuilder.Length;
int headerTokenCount = CountTokens(tableBuilder.AsSpan());

// We can't respect the limit if the context and the table header alone already use up the token budget.
if (contextTokenCount + headerTokenCount >= _maxTokensPerChunk)
{
tableBuilder.Dispose();
ThrowTokenCountExceeded();
}

if (headerTokenCount + totalTokenCount >= _maxTokensPerChunk)
{
// We can't add the header row, so commit what we have accumulated so far.
Commit();
}

totalTokenCount += headerTokenCount;
int tableLength = headerLength;

int rowCount = table.Cells.GetLength(0);
for (int rowIndex = 1; rowIndex < rowCount; rowIndex++)
{
AddMarkdownTableRow(table, rowIndex, ref tableBuilder);

int lastRowTokens = CountTokens(tableBuilder.AsSpan(tableLength));

// Appending this row would exceed the limit.
if (totalTokenCount + lastRowTokens > _maxTokensPerChunk)
{
// We append the table as long as it's not just the header.
if (rowIndex != 1)
{
AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length));
}

// And commit the table we built so far.
Commit();

// Erase previous rows and keep only the header.
tableBuilder.Length = headerLength;
tableLength = headerLength;
totalTokenCount += headerTokenCount;

if (totalTokenCount + lastRowTokens > _maxTokensPerChunk)
{
// This row is simply too big even for a fresh chunk:
tableBuilder.Dispose();
ThrowTokenCountExceeded();
}

AddMarkdownTableRow(table, rowIndex, ref tableBuilder);
}

tableLength = tableBuilder.Length;
totalTokenCount += lastRowTokens;
}

AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length));
tableBuilder.Dispose();
}
else
{
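// The element alone does not fit into what is left of the current chunk, so it is split at token
// boundaries, preferring to break at the last newline that still fits, and committed piece by piece.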
ReadOnlySpan<char> remainingContent = semanticContent.AsSpan();

while (!remainingContent.IsEmpty)
{
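// Ask the tokenizer how many characters of remainingContent fit into the tokens still available.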
int index = _tokenizer.GetIndexByTokenCount(
text: remainingContent,
maxTokenCount: _maxTokensPerChunk - totalTokenCount,
out string? normalizedText,
out int tokenCount,
considerNormalization: false); // We don't normalize, just append as-is to keep original content.

// Some tokens fit into the remaining budget.
if (index > 0)
{
// We could try to split by sentences or other delimiters, but it's complicated.
// For simplicity, we will just split at the last new line that fits.
// Our promise is not to go over the max token count, not to create perfect chunks.
int newLineIndex = remainingContent.Slice(0, index).LastIndexOf('\n');
if (newLineIndex > 0)
{
index = newLineIndex + 1; // We want to include the new line character (works for "\r\n" as well).
tokenCount = CountTokens(remainingContent.Slice(0, index));
}

totalTokenCount += tokenCount;
ReadOnlySpan<char> spanToAppend = remainingContent.Slice(0, index);
AppendNewLineAndSpan(_currentChunk, spanToAppend);
remainingContent = remainingContent.Slice(index);
}
else if (totalTokenCount == contextTokenCount)
{
// We are at the beginning of a chunk, and even a single token does not fit.
ThrowTokenCountExceeded();
}

if (!remainingContent.IsEmpty)
{
Commit();
}
}
}

if (totalTokenCount == _maxTokensPerChunk)
{
Commit();
}
}

if (totalTokenCount > contextTokenCount)
{
chunks.Add(new(_currentChunk.ToString(), document, context));
}

_currentChunk = _currentChunk.Clear();

return chunks;

void Commit()
{
chunks.Add(new(_currentChunk.ToString(), document, context));

// We keep the context in the current chunk as it's the same for all elements.
_currentChunk = _currentChunk.Remove(
startIndex: context.Length,
length: _currentChunk.Length - context.Length);
totalTokenCount = contextTokenCount;
}

static void ThrowTokenCountExceeded()
=> throw new InvalidOperationException("Can't fit in the current chunk. Consider increasing max tokens per chunk.");
}

private static void AppendNewLineAndSpan(StringBuilder stringBuilder, ReadOnlySpan<char> chars)
{
// Don't start an empty chunk (no context provided) with a new line.
if (stringBuilder.Length > 0)
{
stringBuilder.AppendLine();
}

#if NET
stringBuilder.Append(chars);
#else
stringBuilder.Append(chars.ToString());
#endif
}

private static void AddMarkdownTableRow(IngestionDocumentTable table, int rowIndex, ref ValueStringBuilder vsb)
{
for (int columnIndex = 0; columnIndex < table.Cells.GetLength(1); columnIndex++)
{
vsb.Append('|');
vsb.Append(' ');
string? cellContent = table.Cells[rowIndex, columnIndex] switch
{
null => null,
IngestionDocumentImage img => img.AlternativeText ?? img.Text,
IngestionDocumentElement other => other.GetMarkdown()
};
vsb.Append(cellContent);
vsb.Append(' ');
}

vsb.Append('|');
vsb.Append(Environment.NewLine);
}

private static void AddMarkdownTableSeparatorRow(int columnCount, ref ValueStringBuilder vsb)
{
const int DashCount = 3; // The dash count does not need to match the header length.
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
vsb.Append('|');
vsb.Append(' ');
vsb.Append('-', DashCount);
vsb.Append(' ');
}

vsb.Append('|');
vsb.Append(Environment.NewLine);
}

private int CountTokens(ReadOnlySpan<char> input)
=> _tokenizer.CountTokens(input, considerNormalization: false);
}
@@ -0,0 +1,84 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
using Microsoft.Extensions.DataIngestion.Chunkers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Splits documents into chunks based on headers and their corresponding levels, preserving the header context.
/// </summary>
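/// <remarks>
/// Illustrative behavior (a sketch, not an exhaustive contract): for a document containing "# A",
/// paragraph P1, "## B", paragraph P2, the chunker yields one chunk with context "# A" containing P1,
/// followed by one chunk with context "# A ## B" containing P2.
/// </remarks>
/// <example>
/// A minimal usage sketch; how <see cref="IngestionChunkerOptions"/> is configured is left out on purpose:
/// <code>
/// // Assumed: 'options' is an IngestionChunkerOptions carrying a Microsoft.ML.Tokenizers tokenizer
/// // and a MaxTokensPerChunk budget.
/// var chunker = new HeaderChunker(options);
/// await foreach (var chunk in chunker.ProcessAsync(document))
/// {
///     // Each chunk carries the joined header context and the elements that fit under the token budget.
/// }
/// </code>
/// </example>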
public sealed class HeaderChunker : IngestionChunker<string>
{
private const int MaxHeaderLevel = 10;
private readonly ElementsChunker _elementsChunker;

/// <summary>
/// Initializes a new instance of the <see cref="HeaderChunker"/> class.
/// </summary>
/// <param name="options">The options for the chunker.</param>
public HeaderChunker(IngestionChunkerOptions options)
{
_elementsChunker = new(options);
}

/// <inheritdoc/>
#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously
public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document,
#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously
[EnumeratorCancellation] CancellationToken cancellationToken = default)
{
_ = Throw.IfNull(document);

List<IngestionDocumentElement> elements = new(20);
string?[] headers = new string?[MaxHeaderLevel + 1];

foreach (IngestionDocumentElement element in document.EnumerateContent())
{
cancellationToken.ThrowIfCancellationRequested();

if (element is IngestionDocumentHeader header)
{
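// A new header closes the current group: flush the accumulated elements as chunks under the
// current header context before the context is updated.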
foreach (var chunk in SplitIntoChunks(document, headers, elements))
{
yield return chunk;
}

int headerLevel = header.Level.GetValueOrDefault();
headers[headerLevel] = header.GetMarkdown();
headers.AsSpan(headerLevel + 1).Clear(); // clear all lower level headers

continue; // don't add headers to the elements list, they are part of the context
}

elements.Add(element);
}

// Take care of any elements remaining after the last header.
foreach (var chunk in SplitIntoChunks(document, headers, elements))
{
yield return chunk;
}
}

private IEnumerable<IngestionChunk<string>> SplitIntoChunks(IngestionDocument document, string?[] headers, List<IngestionDocumentElement> elements)
{
if (elements.Count > 0)
{
string chunkHeader = string.Join(" ", headers.Where(h => !string.IsNullOrEmpty(h)));

foreach (var chunk in _elementsChunker.Process(document, chunkHeader, elements))
{
yield return chunk;
}

elements.Clear();
}
}
}