Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions lib/ruby_llm/embedding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ def initialize(vectors:, model:, input_tokens: 0)
@input_tokens = input_tokens
end

def self.embed(text, # rubocop:disable Metrics/ParameterLists
def self.embed(text = nil, # rubocop:disable Metrics/ParameterLists
model: nil,
provider: nil,
image: nil,
video: nil,
assume_model_exists: false,
context: nil,
dimensions: nil)
Expand All @@ -22,8 +24,31 @@ def self.embed(text, # rubocop:disable Metrics/ParameterLists
model, provider_instance = Models.resolve(model, provider: provider, assume_exists: assume_model_exists,
config: config)
model_id = model.id
args = set_embedding_params(
provider_instance,
text: text,
model_id: model_id,
dimensions: dimensions,
image: image,
video: video
)

provider_instance.embed(text, model: model_id, dimensions:)
provider_instance.embed(**args)
end

# Builds the keyword-argument hash for a provider's #embed call.
#
# Optional multimodal inputs (image/video) are forwarded only when the
# provider's #embed signature declares a matching keyword, so providers
# without multimodal support keep working unchanged.
#
# @param provider_instance [Object] provider responding to #embed
# @param text [String, Array<String>, nil] input text(s)
# @param model_id [String, nil] resolved model identifier
# @param dimensions [Integer, nil] requested output dimensionality
# @param image [Object, nil] optional image input
# @param video [Object, nil] optional video input
# @return [Hash] keyword arguments for provider_instance.embed
def self.set_embedding_params(provider_instance, # rubocop:disable Metrics/ParameterLists
                              text: nil,
                              model_id: nil,
                              dimensions: nil,
                              image: nil,
                              video: nil)
  # Introspect the provider's #embed signature to learn which keywords it accepts.
  supported = provider_instance.method(:embed).parameters.map(&:last)
  { model: model_id }.tap do |args|
    args[:text] = text if text
    args[:dimensions] = dimensions if dimensions
    args[:image] = image if image && supported.include?(:image)
    args[:video] = video if video && supported.include?(:video)
  end
end
end
end
8 changes: 6 additions & 2 deletions lib/ruby_llm/provider.rb
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ def list_models
parse_list_models_response response, slug, capabilities
end

# Issues an embedding request against the provider's API.
#
# @param model [String] embedding model id
# @param text [String, Array<String>, nil] input text(s)
# @param image [Object, nil] optional image input (multimodal providers only)
# @param video [Object, nil] optional video input (multimodal providers only)
# @param dimensions [Integer, nil] requested output dimensionality
# @return [Embedding] parsed embedding result
def embed(model:, text: nil, image: nil, video: nil, dimensions: nil)
  # Providers without multimodal support keep the narrower
  # render_embedding_payload signature, so only forward image/video
  # keywords when a multimodal input was actually supplied.
  payload = if image || video
              render_embedding_payload(text, model:, image:, video:, dimensions:)
            else
              render_embedding_payload(text, model:, dimensions:)
            end
  response = @connection.post(embedding_url(model:), payload)
  parse_embedding_response(response, model:, text:)
end
Expand Down
63 changes: 59 additions & 4 deletions lib/ruby_llm/providers/vertexai/embeddings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,76 @@ def embedding_url(model:)
"projects/#{@config.vertexai_project_id}/locations/#{@config.vertexai_location}/publishers/google/models/#{model}:predict" # rubocop:disable Layout/LineLength
end

# Builds the JSON payload for Vertex AI's :predict embeddings endpoint.
# Delegates to the multimodal renderer when an image or video is given;
# otherwise emits one { text: } instance per input string.
#
# @param text [String, Array<String>, nil] input text(s)
# @param model [String] model id (unused here; kept for interface parity)
# @param image [Object, nil] optional image input
# @param video [Object, nil] optional video input
# @param dimensions [Integer, nil] requested output dimensionality
# @return [Hash] request payload
def render_embedding_payload(text, model:, image: nil, video: nil, dimensions: nil) # rubocop:disable Lint/UnusedMethodArgument
  # NOTE(review): previous code used ActiveSupport's #present? — confirm
  # ActiveSupport is a runtime dependency; plain truthiness avoids it.
  return render_multimodal_payload(text:, image:, video:, dimensions:) if image || video

  payload = { instances: [text].flatten.map { |t| { text: t.to_s } } }
  payload[:parameters] = { outputDimensionality: dimensions } if dimensions
  payload
end

# Builds a single multimodal instance combining any of text, image, and
# video inputs for the multimodalembedding model.
#
# Note: the diff left a stale duplicate `instances:` line (old text-only
# renderer) inside the hash literal; it is removed here.
#
# @param text [String, nil] optional text input
# @param image [Object, nil] optional image input
# @param video [Object, nil] optional video input
# @param dimensions [Integer, nil] requested output dimensionality
# @return [Hash] request payload with a single combined instance
def render_multimodal_payload(text:, image:, video:, dimensions:)
  instance = {}
  instance[:text] = text.to_s if text
  add_image_instance(instance, image: image)
  add_video_instance(instance, video: video)

  payload = { instances: [instance] }
  payload[:parameters] = { outputDimensionality: dimensions } if dimensions
  payload
end

# Adds a base64-encoded image entry to the multimodal instance.
# Accepts an IO-like object (anything responding to #read) or raw bytes.
#
# @param instance [Hash] instance hash mutated in place
# @param image [Object, nil] image input; nil/empty values are ignored
def add_image_instance(instance, image:)
  # Plain-Ruby presence check — the original used ActiveSupport's
  # #present?, which may not be loaded at runtime (NOTE(review): confirm).
  return if image.nil? || (image.respond_to?(:empty?) && image.empty?)

  require 'base64' # lazy: only needed when an image is actually supplied
  image_data = image.respond_to?(:read) ? image.read : image
  instance[:image] = { bytesBase64Encoded: Base64.strict_encode64(image_data) }
end

# Adds a video entry to the multimodal instance. A `gs://` URI is passed
# through as gcsUri; any other input (IO-like or raw bytes) is
# base64-encoded inline.
#
# @param instance [Hash] instance hash mutated in place
# @param video [String, #read, nil] video input; nil/empty values are ignored
def add_video_instance(instance, video:)
  # Plain-Ruby presence check — the original used ActiveSupport's
  # #present?, which may not be loaded at runtime (NOTE(review): confirm).
  return if video.nil? || (video.respond_to?(:empty?) && video.empty?)

  if video.is_a?(String) && video.start_with?('gs://')
    instance[:video] = { gcsUri: video }
  else
    require 'base64' # lazy: only needed for inline video bytes
    video_data = video.respond_to?(:read) ? video.read : video
    instance[:video] = { bytesBase64Encoded: Base64.strict_encode64(video_data) }
  end
end

# Parses the Vertex AI predict response into an Embedding.
#
# Note: the diff left the old text-only extraction lines above the
# branch, computing vectors that were immediately overwritten; that dead
# work is removed here.
#
# @param response [Faraday::Response] raw API response
# @param model [String] model id used for the request
# @param text [String, Array<String>, nil] original input, used to decide
#   whether to unwrap a single vector
# @return [Embedding]
def parse_embedding_response(response, model:, text:)
  predictions = response.body['predictions']

  # NOTE(review): exact match misses versioned ids such as
  # 'multimodalembedding@001' — confirm intended model naming.
  if model == 'multimodalembedding'
    vectors = parse_multimodal_embeddings(predictions)
  else
    vectors = predictions&.map { |p| p.dig('embeddings', 'values') }
    # Single (non-batch) input: unwrap the lone vector from the array.
    vectors = vectors.first if vectors&.length == 1 && !text.is_a?(Array)
  end
  # Token usage is not surfaced by this endpoint's predictions payload.
  Embedding.new(vectors:, model:, input_tokens: 0)
end

# Extracts per-modality embedding vectors from the first prediction of a
# multimodalembedding response.
#
# @param predictions [Array<Hash>, nil] raw predictions array
# @return [Hash] vectors keyed by :text/:image/:video; absent modalities
#   are omitted
def parse_multimodal_embeddings(predictions)
  %i[text image video].each_with_object({}) do |modality, vectors|
    values = predictions&.dig(0, "#{modality}Embedding")
    vectors[modality] = values if values
  end
end
end
end
end
Expand Down
178 changes: 178 additions & 0 deletions spec/ruby_llm/providers/vertex_ai/embeddings_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# frozen_string_literal: true

require 'spec_helper'

RSpec.describe RubyLLM::Providers::VertexAI::Embeddings do
# Minimal host class that mixes in the module under test so its private
# helpers can be exercised without the full provider stack.
let(:test_class) do
  Class.new do
    include RubyLLM::Providers::VertexAI::Embeddings

    attr_reader :config

    def initialize(config)
      @config = config
    end
  end
end

# Stubbed configuration supplying the project id and location used to
# build the :predict endpoint URL.
let(:config) do
  double( # rubocop:disable RSpec/VerifiedDoubles
    'config',
    vertexai_project_id: 'test-project',
    vertexai_location: 'us-central1'
  )
end

let(:embeddings) do
  test_class.new(config)
end

describe '#embedding_url' do
  it 'constructs the correct URL' do
    # URL should interpolate the configured project id and location.
    url = embeddings.send(:embedding_url, model: 'multimodalembedding')
    expect(url).to eq(
      'projects/test-project/locations/us-central1/publishers/google/models/multimodalembedding:predict'
    )
  end
end

describe '#render_embedding_payload' do
  context 'with text only' do
    it 'renders single text payload with a text embedding model' do
      payload = embeddings.send(:render_embedding_payload, 'Hello!', model: 'gemini-embedding-001')
      expect(payload[:instances]).to eq([{ text: 'Hello!' }])
    end

    it 'renders single text payload with a multimodal embedding model' do
      payload = embeddings.send(:render_embedding_payload, 'Hello!', model: 'multimodalembedding')
      expect(payload[:instances]).to eq([{ text: 'Hello!' }])
    end

    it 'renders multiple batch text payloads' do
      payload = embeddings.send(:render_embedding_payload, ['Hello!', 'Hi there!'], model: 'text-embedding-005')
      expect(payload[:instances]).to eq([{ text: 'Hello!' }, { text: 'Hi there!' }])
    end
  end

  context 'with multimodal content payload' do
    it 'renders payload with multimodal content' do
      image_data = 'fake_image_data'
      video_data = 'fake_video_data'

      payload = embeddings.send(
        :render_embedding_payload,
        'Test multimodal input',
        model: 'multimodalembedding',
        image: image_data,
        video: video_data
      )
      expect(payload[:instances].first[:text]).to eq('Test multimodal input')
      expect(payload[:instances].first).to have_key(:image)
      expect(payload[:instances].first).to have_key(:video)
    end

    # Fixed typo in the example description: "wtih" -> "with".
    it 'renders payload with dimensions parameter' do
      payload = embeddings.send(
        :render_embedding_payload,
        'Hello!',
        model: 'multimodalembedding',
        dimensions: 512
      )
      expect(payload[:parameters]).to eq({ outputDimensionality: 512 })
    end
  end
end

describe '#parse_embedding_response' do
  context 'with text only embeddings' do
    it 'parses text only embedding response' do
      embedding_response = instance_double(
        Faraday::Response, body: {
          'predictions' => [
            'embeddings' => {
              'statistics' => {
                'truncated' => false,
                'token_count' => 6
              },
              'values' => ['...']
            }
          ]
        }
      )
      embedding = embeddings.send(
        :parse_embedding_response,
        embedding_response,
        model: 'gemini-embedding-001',
        text: 'Hello!'
      )
      # Single (non-array) input: vector is unwrapped from the batch array.
      expect(embedding.vectors).to eq(['...'])
      expect(embedding.model).to eq('gemini-embedding-001')
    end

    it 'parses batch text embedding response' do
      embedding_response = instance_double(
        Faraday::Response, body: {
          'predictions' => [
            {
              'embeddings' => {
                'statistics' => {
                  'token_count' => 8,
                  'truncated' => false
                },
                'values' => ['0.1,....']
              }
            },
            {
              'embeddings' => {
                'statistics' => {
                  'token_count' => 3,
                  'truncated' => false
                },
                'values' => ['0.2,....']
              }
            }
          ]
        }
      )
      embedding = embeddings.send(
        :parse_embedding_response,
        embedding_response,
        model: 'text-embedding-004',
        text: ['Hello!', 'Hi there!']
      )
      # Array input: one vector per prediction, order preserved.
      expect(embedding.vectors).to eq([['0.1,....'], ['0.2,....']])
      expect(embedding.model).to eq('text-embedding-004')
    end
  end

  context 'with multimodal embeddings' do
    it 'parses multimodal embedding response' do
      embedding_response = instance_double(
        Faraday::Response, body: {
          'predictions' => [
            {
              'textEmbedding' => ['0.1,....'],
              'imageEmbedding' => ['0.2,....'],
              'videoEmbedding' => ['0.3,....']
            }
          ]
        }
      )
      embedding = embeddings.send(
        :parse_embedding_response,
        embedding_response,
        model: 'multimodalembedding',
        text: 'Multimodal input'
      )
      # Multimodal responses return a hash keyed by modality, not an array.
      expect(embedding.vectors).to eq(
        {
          text: ['0.1,....'],
          image: ['0.2,....'],
          video: ['0.3,....']
        }
      )
      expect(embedding.model).to eq('multimodalembedding')
    end
  end
end
end