|
| 1 | +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +// FlowKV decode attention kernel for AIE2+. |
| 5 | +// |
| 6 | +// Implements streaming decode attention with online softmax using a 2-tile |
| 7 | +// pipeline per KV head group: |
| 8 | +// |
| 9 | +// Score tile (CT0): Computes Q * K^T / sqrt(d) with online softmax tracking. |
| 10 | +// Maintains running max and denominator across chunks. |
| 11 | +// Outputs a packed buffer [F_c | C_c | l] to the value tile via on-chip |
| 12 | +// FIFO each chunk iteration. |
| 13 | +// |
| 14 | +// Value tile (CT1): Accumulates weighted values with correction. |
| 15 | +// Reads the packed buffer from the score tile FIFO each chunk. |
| 16 | +// Saves the denominator from the last chunk in a static buffer so that |
| 17 | +// normalize can read it after all FIFO buffers are released. |
| 18 | +// Final normalization: O = Y / l. |
| 19 | +// |
| 20 | +// Both tiles share this single .o file. Each Worker calls a different subset |
| 21 | +// of functions. Static buffers are per-tile (each tile gets its own copy). |
| 22 | +// |
| 23 | +// Packed inter-tile buffer layout (bf16): |
| 24 | +// [0 .. chunk_size*group_size - 1] : F_c scores |
| 25 | +// [chunk_size*group_size .. chunk_size*group_size + gs-1] : C_c correction |
| 26 | +// [chunk_size*group_size + gs .. chunk_size*group_size + 2*gs - 1] : l denom |
| 27 | + |
| 28 | +#define NOCPP |
| 29 | + |
| 30 | +#include "../aie_kernel_utils.h" |
| 31 | + |
| 32 | +#include <aie_api/aie.hpp> |
| 33 | +#include <stdint.h> |
| 34 | +#include <type_traits> |
| 35 | + |
| 36 | +// --------------------------------------------------------------------------- |
| 37 | +// Score tile: static softmax state (only used by score tile Worker) |
| 38 | +// --------------------------------------------------------------------------- |
// Per-head running maximum and running softmax denominator, carried across
// chunk iterations by the score tile. Capacity is 4 heads — callers must
// pass num_q_heads <= 4 (TODO confirm this invariant at the call site).
static float score_running_max[4] __attribute__((aligned(64)));
static float score_running_sum[4] __attribute__((aligned(64)));

// ---------------------------------------------------------------------------
// Value tile: accumulated output in f32 for precision
// ---------------------------------------------------------------------------
// Un-normalized output Y, laid out as (num_q_heads, head_dim) row-major.
// Capacity is 4 heads x 64 dims — assumes head_dim <= 64 (TODO confirm).
static float value_accum[4 * 64] __attribute__((aligned(64)));

// Saved denominator from the last chunk (written by accum, read by normalize)
// This outlives the inter-tile FIFO buffers, so normalize can run after all
// packed buffers have been released back to the FIFO.
static float saved_denom[4] __attribute__((aligned(64)));
| 49 | + |
| 50 | +extern "C" { |
| 51 | + |
| 52 | +// ============================= Score Tile ==================================== |
| 53 | + |
| 54 | +// Initialize softmax state at the start of a new attention computation. |
| 55 | +void flowkv_score_init_bf16(int32_t num_q_heads) |
| 56 | +{ |
| 57 | + for (int h = 0; h < num_q_heads; h++) { |
| 58 | + score_running_max[h] = -1e30f; |
| 59 | + score_running_sum[h] = 0.0f; |
| 60 | + } |
| 61 | +} |
| 62 | + |
// Compute attention scores for one K chunk and update online softmax state.
// Writes results into a single packed inter-tile buffer.
//
// q_in: (num_q_heads, head_dim) -- query vectors for this KV group
// k_chunk: (chunk_size, head_dim) -- K cache chunk
// packed_out: packed buffer for inter-tile FIFO:
//   [0 .. cs*gs-1]: F_c scores in (chunk_size, num_q_heads) layout
//   [cs*gs .. cs*gs+gs-1]: C_c correction factors
//   [cs*gs+gs .. cs*gs+2*gs-1]: l denominators
//
// Implementation constraints visible below (TODO confirm at call sites):
//   - head_dim is assumed to be exactly 64 (inv_sqrt_d and the fixed pair of
//     32-lane MACs both bake it in; the head_dim parameter only sets strides).
//   - chunk_size must be <= 32 (scores_bf16 scratch buffer).
//   - num_q_heads must be <= 4 (score_running_max/_sum capacity).
void flowkv_score_chunk_bf16(const bfloat16 *__restrict q_in, const bfloat16 *__restrict k_chunk,
                             bfloat16 *__restrict packed_out, int32_t num_q_heads, int32_t head_dim,
                             int32_t chunk_size)
{
    event0(); // profiling marker: kernel start
    // Round-to-nearest-even for every float -> bf16 conversion below.
    ::aie::set_rounding(aie::rounding_mode::conv_even);

    // NOTE(review): hard-coded scale assumes head_dim == 64.
    const float inv_sqrt_d = 0.125f; // 1/sqrt(64) = 1/8

    // Carve the packed output buffer into its three regions (see layout above).
    const int32_t scores_size = chunk_size * num_q_heads;
    bfloat16 *scores_out = packed_out;
    bfloat16 *correction_out = packed_out + scores_size;
    bfloat16 *denom_out = packed_out + scores_size + num_q_heads;

    for (int h = 0; h < num_q_heads; h++) {
        const bfloat16 *q_head = q_in + h * head_dim;
        float m_old = score_running_max[h]; // running max from prior chunks
        float l_old = score_running_sum[h]; // running denominator from prior chunks

        // Phase 1: Compute dot products and find chunk-local max
        // Store scores as bf16 to avoid float array auto-vectorization issues
        bfloat16 scores_bf16[32]; // chunk_size max = 32
        bfloat16 m_chunk_bf16 = static_cast<bfloat16>(-1e30f);

        for (int pos = 0; pos < chunk_size; pos++) {
            const bfloat16 *k_pos = k_chunk + pos * head_dim;

            // Vectorized dot product: head_dim=64 using single accum
            // (two 32-lane MACs, then a horizontal reduce).
            aie::accum<accfloat, 32> acc = aie::zeros<accfloat, 32>();

            auto q_vec0 = aie::load_v<32>(q_head);
            auto k_vec0 = aie::load_v<32>(k_pos);
            acc = aie::mac(acc, q_vec0, k_vec0);

            auto q_vec1 = aie::load_v<32>(q_head + 32);
            auto k_vec1 = aie::load_v<32>(k_pos + 32);
            acc = aie::mac(acc, q_vec1, k_vec1);

            // score = (q . k) / sqrt(d), rounded to bf16.
            bfloat16 score = static_cast<bfloat16>(
                aie::reduce_add(acc.to_vector<float>()) * inv_sqrt_d);

            scores_bf16[pos] = score;
            // Scalar max tracking; compare in f32 so bf16 ties behave sanely.
            if (static_cast<float>(score) > static_cast<float>(m_chunk_bf16)) {
                m_chunk_bf16 = score;
            }
        }

        // Phase 2: Online softmax update using bf16 vector ops
        float m_chunk_f = static_cast<float>(m_chunk_bf16);
        float m_new = (m_chunk_f > m_old) ? m_chunk_f : m_old;
        bfloat16 m_new_bf16 = static_cast<bfloat16>(m_new);

        // C_c = exp2((m_old - m_new) * log2e) via vector exp2
        // NOTE(review): 1.4453125 is log2(e) rounded to bf16 precision
        // (exact log2(e) ~= 1.4426950) -- presumably chosen so the scale is
        // exactly representable in bf16; confirm against the reference model.
        bfloat16 corr_scaled = static_cast<bfloat16>((m_old - m_new) * 1.4453125f);
        // aie::exp2 is vector-only here, so broadcast the scalar argument
        // across 16 lanes and read back lane 0.
        aie::vector<bfloat16, 16> corr_in_vec = aie::broadcast<bfloat16, 16>(corr_scaled);
        aie::accum<accfloat, 16> corr_acc(corr_in_vec);
        aie::vector<bfloat16, 16> corr_exp = aie::exp2<bfloat16>(corr_acc.to_vector<float>());
        float c_correction = static_cast<float>(corr_exp[0]);

        // Rescale the old denominator into the new max's frame: l = C_c * l_old.
        bfloat16 l_new_bf16 = static_cast<bfloat16>(c_correction * l_old);

        // Compute exp2 for each score position — one at a time, no float arrays
        for (int pos = 0; pos < chunk_size; pos++) {
            // F_c[pos] = exp2((score - m_new) * log2e); same broadcast/lane-0
            // trick and same bf16-rounded log2(e) constant as above.
            bfloat16 diff = static_cast<bfloat16>(
                (static_cast<float>(scores_bf16[pos]) - m_new) * 1.4453125f);
            aie::vector<bfloat16, 16> diff_vec = aie::broadcast<bfloat16, 16>(diff);
            aie::accum<accfloat, 16> diff_acc(diff_vec);
            aie::vector<bfloat16, 16> exp_result = aie::exp2<bfloat16>(diff_acc.to_vector<float>());
            bfloat16 f_bf16 = exp_result[0];
            // Accumulate denominator in f32, store back as bf16 each step.
            l_new_bf16 = static_cast<bfloat16>(static_cast<float>(l_new_bf16) + static_cast<float>(f_bf16));
            // Transposed layout: F_c is written (chunk_size, num_q_heads) so
            // the value tile can read all heads for a position contiguously.
            scores_out[pos * num_q_heads + h] = f_bf16;
        }

        // Update running state
        score_running_max[h] = m_new;
        score_running_sum[h] = static_cast<float>(l_new_bf16);

        // Write correction and denominator to packed buffer
        correction_out[h] = static_cast<bfloat16>(c_correction);
        denom_out[h] = l_new_bf16;
    }

    event1(); // profiling marker: kernel end
}
| 156 | + |
| 157 | +// ============================= Value Tile ==================================== |
| 158 | + |
| 159 | +// Initialize the value accumulator. |
| 160 | +void flowkv_value_init_bf16(int32_t num_q_heads, int32_t head_dim) |
| 161 | +{ |
| 162 | + int total = num_q_heads * head_dim; |
| 163 | + for (int i = 0; i < total; i++) { |
| 164 | + value_accum[i] = 0.0f; |
| 165 | + } |
| 166 | + for (int h = 0; h < num_q_heads; h++) { |
| 167 | + saved_denom[h] = 0.0f; |
| 168 | + } |
| 169 | +} |
| 170 | + |
// Accumulate weighted values for one chunk.
// Reads scores and correction from the packed inter-tile buffer.
// Saves the denominator into a static buffer for later normalization.
//
// packed_in: packed buffer from score tile FIFO
//   [0..cs*gs-1]: F_c scores (laid out (chunk_size, num_q_heads))
//   [cs*gs..cs*gs+gs-1]: C_c correction
//   [cs*gs+gs..cs*gs+2*gs-1]: l denom
// v_chunk: (chunk_size, head_dim) -- V cache chunk from DDR
//
// Implementation constraints visible below (TODO confirm at call sites):
//   - head_dim must be a multiple of 16 (16-lane vector loops).
//   - num_q_heads <= 4 and head_dim <= 64 (value_accum/saved_denom capacity).
void flowkv_value_accum_bf16(const bfloat16 *__restrict packed_in, const bfloat16 *__restrict v_chunk,
                             int32_t num_q_heads, int32_t head_dim, int32_t chunk_size)
{
    event0(); // profiling marker: kernel start
    // Round-to-nearest-even for every conversion below.
    ::aie::set_rounding(aie::rounding_mode::conv_even);

    // Carve the packed input buffer into its three regions (see layout above).
    const int32_t scores_size = chunk_size * num_q_heads;
    const bfloat16 *scores_in = packed_in;
    const bfloat16 *correction_in = packed_in + scores_size;
    const bfloat16 *denom_in = packed_in + scores_size + num_q_heads;

    for (int h = 0; h < num_q_heads; h++) {
        float correction = static_cast<float>(correction_in[h]);
        float *y_head = value_accum + h * head_dim;

        // Save denominator for final normalization.
        // Deliberately overwritten every chunk: only the last chunk's value
        // survives, which is exactly what normalize needs after the FIFO
        // buffers are released.
        saved_denom[h] = static_cast<float>(denom_in[h]);

        // Apply correction to accumulated output: Y = C_c * Y_old
        // This MUST happen before this chunk's contributions are added:
        // it rescales prior chunks into the new running-max frame.
        aie::vector<float, 16> corr_vec = aie::broadcast<float, 16>(correction);
        for (int d = 0; d < head_dim; d += 16) {
            aie::vector<float, 16> y_vec = aie::load_v<16>(y_head + d);
            y_vec = aie::mul(y_vec, corr_vec);
            aie::store_v(y_head + d, y_vec);
        }

        // Accumulate: Y += sum_pos( F_c[pos, h] * V[pos, :] )
        for (int pos = 0; pos < chunk_size; pos++) {
            // F_c is stored transposed (chunk_size, num_q_heads); pick head h.
            float f = static_cast<float>(scores_in[pos * num_q_heads + h]);
            const bfloat16 *v_pos = v_chunk + pos * head_dim;
            aie::vector<float, 16> f_vec = aie::broadcast<float, 16>(f);

            for (int d = 0; d < head_dim; d += 16) {
                aie::vector<float, 16> y_vec = aie::load_v<16>(y_head + d);
                // Widen V from bf16 to f32 via an accumulator round-trip,
                // then fused scale-and-add into the f32 output row.
                aie::vector<bfloat16, 16> v_vec = aie::load_v<16>(v_pos + d);
                aie::accum<accfloat, 16> v_acc(v_vec);
                aie::vector<float, 16> v_f32 = v_acc.to_vector<float>();
                aie::vector<float, 16> fv = aie::mul(f_vec, v_f32);
                y_vec = aie::add(y_vec, fv);
                aie::store_v(y_head + d, y_vec);
            }
        }
    }

    event1(); // profiling marker: kernel end
}
| 226 | + |
| 227 | +// Normalize and produce final output: O = Y / l. |
| 228 | +// Reads the denominator from saved_denom (set by the last accum call). |
| 229 | +// |
| 230 | +// output: (num_q_heads, head_dim) -- final attention output in bf16 |
| 231 | +void flowkv_value_normalize_bf16(bfloat16 *__restrict output, int32_t num_q_heads, int32_t head_dim) |
| 232 | +{ |
| 233 | + ::aie::set_rounding(aie::rounding_mode::conv_even); |
| 234 | + |
| 235 | + for (int h = 0; h < num_q_heads; h++) { |
| 236 | + float inv_l = aie::inv(saved_denom[h]); |
| 237 | + aie::vector<float, 16> inv_l_vec = aie::broadcast<float, 16>(inv_l); |
| 238 | + float *y_head = value_accum + h * head_dim; |
| 239 | + bfloat16 *o_head = output + h * head_dim; |
| 240 | + |
| 241 | + for (int d = 0; d < head_dim; d += 16) { |
| 242 | + aie::vector<float, 16> y_vec = aie::load_v<16>(y_head + d); |
| 243 | + aie::vector<float, 16> scaled = aie::mul(y_vec, inv_l_vec); |
| 244 | + aie::accum<accfloat, 16> y_acc(scaled); |
| 245 | + aie::vector<bfloat16, 16> out_vec = y_acc.to_vector<bfloat16>(); |
| 246 | + aie::store_v(o_head + d, out_vec); |
| 247 | + } |
| 248 | + } |
| 249 | +} |
| 250 | + |
| 251 | +} // extern "C" |
0 commit comments