Skip to content

Commit 04bb5c6

Browse files
authored
Merge 79e42f9 into 897d04e
2 parents 897d04e + 79e42f9 commit 04bb5c6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+5551
-171
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,4 @@ id_ed25519.pub
2020
*.model
2121
.cline_storage
2222
*.egg-info
23+
CLAUDE.md
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2.
5+
// Same structure as AIE2+ variant but uses LUT-based getTanhBf16.
6+
7+
#define NOCPP
8+
9+
#include "../aie_kernel_utils.h"
10+
#include "lut_based_ops.h"
11+
12+
#include <aie_api/aie.hpp>
13+
#include <stdint.h>
14+
#include <type_traits>
15+
16+
// Per-tile intermediate buffers for the two GEMV passes: phase 0 writes
// left_buf (the SiLU input in dual_gemv_silu_mul_bf16), phase 1 writes
// right_buf (the elementwise multiplicand). 2048 bf16 entries each;
// dual_gemv_matvec_bf16 offsets into them by row_offset, so callers must
// keep row_offset + m <= 2048 — TODO confirm against host-side tiling.
static bfloat16 left_buf[2048] __attribute__((aligned(64)));
static bfloat16 right_buf[2048] __attribute__((aligned(64)));
18+
19+
// GEMV: c[0..m) = A[m x k] * b[0..k), bf16 inputs with fp32 accumulation.
//
// A is traversed row-major through `a`, which advances continuously across
// the outer loop and is never reset — so `a` must point at an m*k
// contiguous matrix. `r` is the vector width (lanes per MAC).
//
// Assumes k is a multiple of r: the inner loop strides by r with no tail
// handling, and AIE_LOOP_MIN_ITERATION_COUNT(2) promises k >= 2*r.
// TODO(review): confirm both with callers.
template <uint32_t r>
void matvec_vectorized(uint32_t m,
                       uint32_t k,
                       const bfloat16 *__restrict a,
                       const bfloat16 *__restrict b,
                       bfloat16 *__restrict c)
{
    // Round-to-nearest-even for all bf16 conversions in this kernel.
    ::aie::set_rounding(aie::rounding_mode::conv_even);
    bfloat16 *c_end = c + m;
    const bfloat16 *b_end = b + k;
    // One output element per outer iteration; `a` keeps walking forward
    // so each iteration consumes the next row of A.
    for (; c < c_end; c++) {
        aie::accum acc = aie::zeros<accfloat, r>();
        AIE_LOOP_MIN_ITERATION_COUNT(2)
        for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
            aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
            aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
            acc = aie::mac(acc, a_vec, b_vec);
        }
        // Horizontal reduce of the fp32 accumulator lanes, then narrow to bf16.
        *c = static_cast<bfloat16>(aie::reduce_add(acc.template to_vector<float>()));
    }
}
40+
41+
extern "C" {
42+
43+
void dual_gemv_matvec_bf16(uint32_t m,
44+
uint32_t k,
45+
uint32_t row_offset,
46+
const bfloat16 *__restrict a_in,
47+
const bfloat16 *__restrict b_in,
48+
uint32_t phase)
49+
{
50+
bfloat16 *dst = (phase == 0) ? left_buf : right_buf;
51+
dst += row_offset;
52+
matvec_vectorized<64>(m, k, a_in, b_in, dst);
53+
}
54+
55+
// Fused SiLU + elementwise multiply over the two GEMV result buffers:
//
//   c_out[i] = silu(left_buf[i]) * right_buf[i]
//
// where silu(x) = x * sigmoid(x) and sigmoid is computed via the identity
// sigmoid(x) = 0.5 * (1 + tanh(x/2)), with tanh from the LUT-based
// getTanhBf16 (AIE2 variant; see file header).
//
// Processes 16 bf16 lanes per iteration with no tail loop — assumes
// m_output is a multiple of 16. TODO(review): confirm with callers.
void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
{
    event0(); // trace/profiling event: kernel entry

    aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
    aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
    AIE_PREPARE_FOR_PIPELINING
    for (int i = 0; i < m_output; i += 16) {
        aie::vector<bfloat16, 16> left_val = aie::load_v<16>(left_buf + i);
        aie::vector<bfloat16, 16> right_val = aie::load_v<16>(right_buf + i);

        // sigmoid(x) = 0.5 * (tanh(x/2) + 1)
        aie::vector<bfloat16, 16> half_x = aie::mul(left_val, register_0_5);
        aie::vector<bfloat16, 16> tanh_half_x = getTanhBf16(half_x);
        auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
        aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
        // silu(x) = x * sigmoid(x)
        auto silu_output = aie::mul(left_val, sigmoid_approx);

        // Gate the second GEMV result with the SiLU activation.
        auto fused_output = aie::mul(silu_output.to_vector<bfloat16>(), right_val);
        aie::store_v(c_out + i, fused_output.to_vector<bfloat16>());
    }

    event1(); // trace/profiling event: kernel exit
}
78+
79+
} // extern "C"

aie_kernels/aie2/flowkv.cc

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// FlowKV decode attention kernel for AIE2+.
5+
//
6+
// Implements streaming decode attention with online softmax using a 2-tile
7+
// pipeline per KV head group:
8+
//
9+
// Score tile (CT0): Computes Q * K^T / sqrt(d) with online softmax tracking.
10+
// Maintains running max and denominator across chunks.
11+
// Outputs a packed buffer [F_c | C_c | l] to the value tile via on-chip
12+
// FIFO each chunk iteration.
13+
//
14+
// Value tile (CT1): Accumulates weighted values with correction.
15+
// Reads the packed buffer from the score tile FIFO each chunk.
16+
// Saves the denominator from the last chunk in a static buffer so that
17+
// normalize can read it after all FIFO buffers are released.
18+
// Final normalization: O = Y / l.
19+
//
20+
// Both tiles share this single .o file. Each Worker calls a different subset
21+
// of functions. Static buffers are per-tile (each tile gets its own copy).
22+
//
23+
// Packed inter-tile buffer layout (bf16):
24+
// [0 .. chunk_size*group_size - 1] : F_c scores
25+
// [chunk_size*group_size .. chunk_size*group_size + gs-1] : C_c correction
26+
// [chunk_size*group_size + gs .. chunk_size*group_size + 2*gs - 1] : l denom
27+
28+
#define NOCPP
29+
30+
#include "../aie_kernel_utils.h"
31+
32+
#include <aie_api/aie.hpp>
33+
#include <stdint.h>
34+
#include <type_traits>
35+
36+
// ---------------------------------------------------------------------------
// Score tile: static softmax state (only used by the score tile Worker)
// ---------------------------------------------------------------------------
// Online-softmax running max (m) and running denominator (l), one slot per
// query head. Sized for up to 4 query heads per KV head group; callers must
// keep num_q_heads <= 4 — TODO confirm against host-side configuration.
static float score_running_max[4] __attribute__((aligned(64)));
static float score_running_sum[4] __attribute__((aligned(64)));

// ---------------------------------------------------------------------------
// Value tile: accumulated output in f32 for precision
// ---------------------------------------------------------------------------
// Weighted-value accumulator Y in (num_q_heads, head_dim) row-major layout;
// sized for 4 heads x 64 dims.
static float value_accum[4 * 64] __attribute__((aligned(64)));

// Saved denominator from the last chunk (written by flowkv_value_accum_bf16,
// read by flowkv_value_normalize_bf16 after all FIFO buffers are released).
static float saved_denom[4] __attribute__((aligned(64)));
49+
50+
extern "C" {
51+
52+
// ============================= Score Tile ====================================
53+
54+
// Initialize softmax state at the start of a new attention computation.
55+
void flowkv_score_init_bf16(int32_t num_q_heads)
56+
{
57+
for (int h = 0; h < num_q_heads; h++) {
58+
score_running_max[h] = -1e30f;
59+
score_running_sum[h] = 0.0f;
60+
}
61+
}
62+
63+
// Compute attention scores for one K chunk and update online softmax state.
64+
// Writes results into a single packed inter-tile buffer.
65+
//
66+
// q_in: (num_q_heads, head_dim) -- query vectors for this KV group
67+
// k_chunk: (chunk_size, head_dim) -- K cache chunk
68+
// packed_out: packed buffer for inter-tile FIFO:
69+
// [0 .. cs*gs-1]: F_c scores in (chunk_size, num_q_heads) layout
70+
// [cs*gs .. cs*gs+gs-1]: C_c correction factors
71+
// [cs*gs+gs .. cs*gs+2*gs-1]: l denominators
72+
void flowkv_score_chunk_bf16(const bfloat16 *__restrict q_in, const bfloat16 *__restrict k_chunk,
73+
bfloat16 *__restrict packed_out, int32_t num_q_heads, int32_t head_dim,
74+
int32_t chunk_size)
75+
{
76+
event0();
77+
::aie::set_rounding(aie::rounding_mode::conv_even);
78+
79+
const float inv_sqrt_d = 0.125f; // 1/sqrt(64) = 1/8
80+
81+
const int32_t scores_size = chunk_size * num_q_heads;
82+
bfloat16 *scores_out = packed_out;
83+
bfloat16 *correction_out = packed_out + scores_size;
84+
bfloat16 *denom_out = packed_out + scores_size + num_q_heads;
85+
86+
for (int h = 0; h < num_q_heads; h++) {
87+
const bfloat16 *q_head = q_in + h * head_dim;
88+
float m_old = score_running_max[h];
89+
float l_old = score_running_sum[h];
90+
91+
// Phase 1: Compute dot products and find chunk-local max
92+
// Store scores as bf16 to avoid float array auto-vectorization issues
93+
bfloat16 scores_bf16[32]; // chunk_size max = 32
94+
bfloat16 m_chunk_bf16 = static_cast<bfloat16>(-1e30f);
95+
96+
for (int pos = 0; pos < chunk_size; pos++) {
97+
const bfloat16 *k_pos = k_chunk + pos * head_dim;
98+
99+
// Vectorized dot product: head_dim=64 using single accum
100+
aie::accum<accfloat, 32> acc = aie::zeros<accfloat, 32>();
101+
102+
auto q_vec0 = aie::load_v<32>(q_head);
103+
auto k_vec0 = aie::load_v<32>(k_pos);
104+
acc = aie::mac(acc, q_vec0, k_vec0);
105+
106+
auto q_vec1 = aie::load_v<32>(q_head + 32);
107+
auto k_vec1 = aie::load_v<32>(k_pos + 32);
108+
acc = aie::mac(acc, q_vec1, k_vec1);
109+
110+
bfloat16 score = static_cast<bfloat16>(
111+
aie::reduce_add(acc.to_vector<float>()) * inv_sqrt_d);
112+
113+
scores_bf16[pos] = score;
114+
if (static_cast<float>(score) > static_cast<float>(m_chunk_bf16)) {
115+
m_chunk_bf16 = score;
116+
}
117+
}
118+
119+
// Phase 2: Online softmax update using bf16 vector ops
120+
float m_chunk_f = static_cast<float>(m_chunk_bf16);
121+
float m_new = (m_chunk_f > m_old) ? m_chunk_f : m_old;
122+
bfloat16 m_new_bf16 = static_cast<bfloat16>(m_new);
123+
124+
// C_c = exp2((m_old - m_new) * log2e) via vector exp2
125+
bfloat16 corr_scaled = static_cast<bfloat16>((m_old - m_new) * 1.4453125f);
126+
aie::vector<bfloat16, 16> corr_in_vec = aie::broadcast<bfloat16, 16>(corr_scaled);
127+
aie::accum<accfloat, 16> corr_acc(corr_in_vec);
128+
aie::vector<bfloat16, 16> corr_exp = aie::exp2<bfloat16>(corr_acc.to_vector<float>());
129+
float c_correction = static_cast<float>(corr_exp[0]);
130+
131+
bfloat16 l_new_bf16 = static_cast<bfloat16>(c_correction * l_old);
132+
133+
// Compute exp2 for each score position — one at a time, no float arrays
134+
for (int pos = 0; pos < chunk_size; pos++) {
135+
bfloat16 diff = static_cast<bfloat16>(
136+
(static_cast<float>(scores_bf16[pos]) - m_new) * 1.4453125f);
137+
aie::vector<bfloat16, 16> diff_vec = aie::broadcast<bfloat16, 16>(diff);
138+
aie::accum<accfloat, 16> diff_acc(diff_vec);
139+
aie::vector<bfloat16, 16> exp_result = aie::exp2<bfloat16>(diff_acc.to_vector<float>());
140+
bfloat16 f_bf16 = exp_result[0];
141+
l_new_bf16 = static_cast<bfloat16>(static_cast<float>(l_new_bf16) + static_cast<float>(f_bf16));
142+
scores_out[pos * num_q_heads + h] = f_bf16;
143+
}
144+
145+
// Update running state
146+
score_running_max[h] = m_new;
147+
score_running_sum[h] = static_cast<float>(l_new_bf16);
148+
149+
// Write correction and denominator to packed buffer
150+
correction_out[h] = static_cast<bfloat16>(c_correction);
151+
denom_out[h] = l_new_bf16;
152+
}
153+
154+
event1();
155+
}
156+
157+
// ============================= Value Tile ====================================
158+
159+
// Initialize the value accumulator.
160+
void flowkv_value_init_bf16(int32_t num_q_heads, int32_t head_dim)
161+
{
162+
int total = num_q_heads * head_dim;
163+
for (int i = 0; i < total; i++) {
164+
value_accum[i] = 0.0f;
165+
}
166+
for (int h = 0; h < num_q_heads; h++) {
167+
saved_denom[h] = 0.0f;
168+
}
169+
}
170+
171+
// Accumulate weighted values for one chunk.
172+
// Reads scores and correction from the packed inter-tile buffer.
173+
// Saves the denominator into a static buffer for later normalization.
174+
//
175+
// packed_in: packed buffer from score tile FIFO
176+
// [0..cs*gs-1]: F_c scores
177+
// [cs*gs..cs*gs+gs-1]: C_c correction
178+
// [cs*gs+gs..cs*gs+2*gs-1]: l denom
179+
// v_chunk: (chunk_size, head_dim) -- V cache chunk from DDR
180+
// Accumulate weighted values for one chunk:
//
//   Y = C_c * Y_old + sum_pos( F_c[pos, h] * V[pos, :] )    per head h
//
// Reads scores and correction from the packed inter-tile buffer and saves
// the denominator into saved_denom for later normalization (the FIFO buffer
// may be released before normalize runs).
//
// packed_in: packed buffer from the score tile FIFO
//   [0..cs*gs-1]: F_c scores in (chunk_size, num_q_heads) layout
//   [cs*gs..cs*gs+gs-1]: C_c correction
//   [cs*gs+gs..cs*gs+2*gs-1]: l denom
// v_chunk: (chunk_size, head_dim) -- V cache chunk from DDR
//
// NOTE(review): head_dim is assumed to be a multiple of 16 (no tail loop);
// confirm with callers.
void flowkv_value_accum_bf16(const bfloat16 *__restrict packed_in, const bfloat16 *__restrict v_chunk,
                             int32_t num_q_heads, int32_t head_dim, int32_t chunk_size)
{
    event0(); // trace/profiling event: kernel entry
    ::aie::set_rounding(aie::rounding_mode::conv_even);

    // Partition the packed input buffer (see layout above).
    const int32_t scores_size = chunk_size * num_q_heads;
    const bfloat16 *scores_in = packed_in;
    const bfloat16 *correction_in = packed_in + scores_size;
    const bfloat16 *denom_in = packed_in + scores_size + num_q_heads;

    for (int h = 0; h < num_q_heads; h++) {
        float correction = static_cast<float>(correction_in[h]);
        float *y_head = value_accum + h * head_dim;

        // Save denominator for final normalization; overwritten every chunk,
        // so after the last chunk it holds the final l.
        saved_denom[h] = static_cast<float>(denom_in[h]);

        // Apply correction to the accumulated output: Y = C_c * Y_old.
        // Must happen before this chunk's contributions are added.
        aie::vector<float, 16> corr_vec = aie::broadcast<float, 16>(correction);
        for (int d = 0; d < head_dim; d += 16) {
            aie::vector<float, 16> y_vec = aie::load_v<16>(y_head + d);
            y_vec = aie::mul(y_vec, corr_vec);
            aie::store_v(y_head + d, y_vec);
        }

        // Accumulate: Y += sum_pos( F_c[pos, h] * V[pos, :] )
        for (int pos = 0; pos < chunk_size; pos++) {
            // Scores are stored transposed: position-major, head-minor.
            float f = static_cast<float>(scores_in[pos * num_q_heads + h]);
            const bfloat16 *v_pos = v_chunk + pos * head_dim;
            aie::vector<float, 16> f_vec = aie::broadcast<float, 16>(f);

            for (int d = 0; d < head_dim; d += 16) {
                aie::vector<float, 16> y_vec = aie::load_v<16>(y_head + d);
                aie::vector<bfloat16, 16> v_vec = aie::load_v<16>(v_pos + d);
                // Widen V from bf16 to f32 via an accumulator round-trip.
                aie::accum<accfloat, 16> v_acc(v_vec);
                aie::vector<float, 16> v_f32 = v_acc.to_vector<float>();
                aie::vector<float, 16> fv = aie::mul(f_vec, v_f32);
                y_vec = aie::add(y_vec, fv);
                aie::store_v(y_head + d, y_vec);
            }
        }
    }

    event1(); // trace/profiling event: kernel exit
}
226+
227+
// Normalize and produce final output: O = Y / l.
228+
// Reads the denominator from saved_denom (set by the last accum call).
229+
//
230+
// output: (num_q_heads, head_dim) -- final attention output in bf16
231+
// Final normalization of the streamed attention output: O = Y / l.
// The denominator l comes from saved_denom, which the last
// flowkv_value_accum_bf16 call filled in, so this is safe to run after the
// inter-tile FIFO buffers have been released.
//
// output: (num_q_heads, head_dim) -- final attention output in bf16
void flowkv_value_normalize_bf16(bfloat16 *__restrict output, int32_t num_q_heads, int32_t head_dim)
{
    ::aie::set_rounding(aie::rounding_mode::conv_even);

    for (int head = 0; head < num_q_heads; head++) {
        float *acc_row = value_accum + head * head_dim;
        bfloat16 *out_row = output + head * head_dim;

        // Multiply by the reciprocal rather than dividing per element.
        float recip = aie::inv(saved_denom[head]);
        aie::vector<float, 16> recip_vec = aie::broadcast<float, 16>(recip);

        for (int col = 0; col < head_dim; col += 16) {
            aie::vector<float, 16> acc_vec = aie::load_v<16>(acc_row + col);
            aie::vector<float, 16> normalized = aie::mul(acc_vec, recip_vec);
            // Narrow f32 -> bf16 through the accumulator conversion path.
            aie::accum<accfloat, 16> conv_acc(normalized);
            aie::store_v(out_row + col, conv_acc.to_vector<bfloat16>());
        }
    }
}
250+
251+
} // extern "C"

0 commit comments

Comments
 (0)