Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions embeddings/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions embeddings/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2021"
# For local dev with ../../candle, add a [patch] section to use path deps.
[dependencies]
tokenizers = "0.15.2"
unicode-segmentation = "1"
hf-hub = { git = "https://github.com/huggingface/hf-hub.git", rev = "ac22200ea0b5af4d8c362f699be0340647b19060", default-features = false,features = ["ureq"] }
anyhow = "1.0.81"
serde_json = "1.0.114"
Expand Down
63 changes: 63 additions & 0 deletions embeddings/manticoresearch_text_embeddings.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@
#include <ostream>
#include <new>

/// Identifiers for the available chunking strategies. Each is a plain
/// `uint32_t` so it can be stored directly in `ChunkSettings::strategy`
/// when the settings are handed across the FFI.
static constexpr uint32_t STRATEGY_NONE = 0;      ///< No chunking.
static constexpr uint32_t STRATEGY_FIXED = 1;
static constexpr uint32_t STRATEGY_RECURSIVE = 2;
static constexpr uint32_t STRATEGY_SENTENCE = 3;

struct TextModelResult {
void *m_pModel;
char *m_szError;
Expand Down Expand Up @@ -66,6 +75,58 @@ using ValidateApiKeyFn = char*(*)(const TextModelWrapper*);
/// for returning owned strings to C/C++.
using FreeStringFn = void(*)(char*);

/// One emitted chunk's byte span into the original input document.
/// Both bounds are pointer-sized unsigned integers.
struct ChunkSpan {
/// Where the chunk begins; presumably a byte offset from the start of the
/// document — TODO confirm.
uintptr_t start;
/// Where the chunk ends. NOTE(review): whether this bound is inclusive or
/// exclusive is not visible in this header — confirm against the producer.
uintptr_t end;
};

/// Maps one input document to its run of chunks in the flat embeddings/spans
/// arrays: the document's chunks are `[first, first + count)`.
struct DocChunks {
/// Index of the document's first chunk in the flat arrays.
uintptr_t first;
/// Number of consecutive chunks, starting at `first`, that belong to the
/// document.
uintptr_t count;
};

/// Result of [`TextModelWrapper::make_vect_embeddings_chunked`]: a flat array of
/// chunk embeddings, a parallel array of byte spans, and a per-input-document
/// grouping so the C++ caller can rebuild "these N chunks belong to document i".
///
/// NOTE(review): there is no separate length field for `m_tSpans`. Since the
/// spans are described as parallel to the embeddings, their count is presumably
/// `emb_len` — confirm against the producer.
struct ChunkedVecResult {
/// Error message for the call; presumably null on success — TODO confirm.
char *m_szError;
/// Flat array of chunk embeddings.
const FloatVec *m_tEmbedding;
/// Presumably the number of elements in `m_tEmbedding` — TODO confirm.
uintptr_t emb_len;
/// Presumably the allocated capacity behind `m_tEmbedding` — TODO confirm.
uintptr_t emb_cap;
/// Byte spans, parallel to the embeddings array.
const ChunkSpan *m_tSpans;
/// Presumably the allocated capacity behind `m_tSpans` — TODO confirm.
uintptr_t spans_cap;
/// Per-input-document grouping of chunks.
const DocChunks *m_tDocs;
/// Presumably the number of elements in `m_tDocs` — TODO confirm.
uintptr_t docs_len;
/// Presumably the allocated capacity behind `m_tDocs` — TODO confirm.
uintptr_t docs_cap;
};

/// Chunking parameters. `#[repr(C)]` — passed straight across the FFI by the
/// daemon, which owns the DDL surface and validates against the model.
/// All four fields are `uint32_t`; `MakeVectEmbeddingsChunkedFn` receives this
/// struct through a `const ChunkSettings*`.
struct ChunkSettings {
/// One of the `STRATEGY_*` constants. `STRATEGY_NONE` ⇒ no chunking.
uint32_t strategy;
/// Target chunk size in tokens. `0` ⇒ use the model's max. Always clamped to
/// the model's real input limit.
uint32_t max_tokens;
/// Token overlap between consecutive chunks. `0` ⇒ none.
uint32_t overlap_tokens;
/// Hard cap on chunks per document. `0` ⇒ unlimited. Overflow merges the
/// tail into the last chunk (matches OpenSearch's `max_chunk_limit`).
uint32_t max_chunks;
};

/// Function-pointer type for the chunked embedding call; it returns a
/// `ChunkedVecResult` by value.
/// Parameters, in order:
///  - `const TextModelWrapper*`: the model wrapper the call operates on.
///  - `const StringItem*` and `uintptr_t`: presumably the input items and
///    their count — TODO confirm.
///  - `const ChunkSettings*`: the chunking parameters.
///  - `int32_t`: NOTE(review): its meaning is not visible in this header —
///    confirm against the implementation.
using MakeVectEmbeddingsChunkedFn = ChunkedVecResult(*)(const TextModelWrapper*,
const StringItem*,
uintptr_t,
const ChunkSettings*,
int32_t);

/// Function-pointer type that takes a `ChunkedVecResult` by value and returns
/// nothing. Presumably releases the buffers held by the result — TODO confirm.
using FreeChunkedResultFn = void(*)(ChunkedVecResult);

struct EmbedLib {
uintptr_t version;
const char *version_str;
Expand All @@ -77,6 +138,8 @@ struct EmbedLib {
GetLenFn get_max_input_size;
ValidateApiKeyFn validate_api_key;
FreeStringFn free_string;
MakeVectEmbeddingsChunkedFn make_vect_embeddings_chunked;
FreeChunkedResultFn free_chunked_result;
};

extern "C" {
Expand Down
Loading
Loading