Skip to content

Commit ff41674

Browse files
committed
fixes
1 parent 7a4b207 commit ff41674

5 files changed

Lines changed: 33 additions & 16 deletions

File tree

iron/applications/llama_3.2_1b/llama_cpu.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,8 @@ def llama_forward_pass(config, state):
291291

292292

293293
def main():
294-
prompt = "The capital of France is "
295294
args = harness.parse_args()
295+
prompt = harness.get_prompt(args.prompt_len)
296296
config, state = harness.init(args.weights_path, args.tokenizer_path, prompt=prompt)
297297
print(prompt, end="", flush=True)
298298
harness.generate(config, state, llama_forward_pass, num_tokens=args.num_tokens)

iron/applications/llama_3.2_1b/llama_inference_harness.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,12 @@ def parse_args():
178178
parser.add_argument(
179179
"tokenizer_path", type=str, help="Path to the tokenizer model (tiktoken file)"
180180
)
181+
parser.add_argument(
182+
"--prompt-len",
183+
type=int,
184+
default=2048,
185+
help="Length of the input prompt in tokens (default: 2048)",
186+
)
181187
parser.add_argument(
182188
"--num-tokens",
183189
type=int,
@@ -187,6 +193,13 @@ def parse_args():
187193
return parser.parse_args()
188194

189195

196+
def get_prompt(prompt_len):
197+
with open("prompt.txt", "r") as f:
198+
prompt = f.read()
199+
prompt = prompt[:prompt_len]
200+
return prompt
201+
202+
190203
def init(
191204
weights_path,
192205
tokenizer_path,
@@ -249,12 +262,13 @@ def generate(config, state, forward_pass, num_tokens=100, use_kv_cache=True):
249262
t_decode = t_decode_end - t_decode_start
250263
sys.stderr.write("\n\n=== Performance Statistics ===\n")
251264
sys.stderr.write(f"[Prefill] Time to first token: {t_prefill:7.3f} s\n")
252-
sys.stderr.write(
253-
f"[Decode] Time per token (mean): {t_decode / (n_tokens_generated - 1):7.3f} s\n"
254-
)
255-
sys.stderr.write(
256-
f"[Decode] Tokens per second: {(n_tokens_generated - 1) / t_decode:7.3f}\n"
257-
)
265+
if n_tokens_generated > 1:
266+
sys.stderr.write(
267+
f"[Decode] Time per token (mean): {t_decode / (n_tokens_generated - 1):7.3f} s\n"
268+
)
269+
sys.stderr.write(
270+
f"[Decode] Tokens per second: {(n_tokens_generated - 1) / t_decode:7.3f}\n"
271+
)
258272
sys.stderr.write(
259273
f"[Total] Time per token (mean): {(t_prefill + t_decode) / n_tokens_generated:7.3f} s\n"
260274
)

iron/applications/llama_3.2_1b/llama_npu.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,14 +1326,15 @@ def llama_forward_pass(config, state):
13261326

13271327

13281328
def main():
1329-
global aie_ops, aie_buffers
1330-
prompt = "The capital of France is "
1331-
# with open('prompt.txt', 'r') as f:
1332-
# prompt = f.read()
1333-
# prompt = prompt[:max_seq_len]
1334-
1329+
global aie_ops, aie_buffers, max_seq_len
13351330
args = harness.parse_args()
13361331

1332+
assert (
1333+
max_seq_len >= args.prompt_len + args.num_tokens
1334+
), "max_seq_len must be at least prompt_len + num_tokens"
1335+
1336+
prompt = harness.get_prompt(args.prompt_len)
1337+
13371338
config, state = harness.init(args.weights_path, args.tokenizer_path, prompt=prompt)
13381339

13391340
aie_ops = AIELlamaOperators(config, max_seq_len)

iron/applications/llama_3.2_1b/test.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
import subprocess
66
import pytest
77
from pathlib import Path
8+
import os
89

910
test_dir = Path(__file__).parent
10-
weights_dir = Path("/srv")
11+
weights_dir = Path(os.environ.get("IRON_EXAMPLE_WEIGHTS_DIR", "/srv"))
1112

1213

1314
def generate_test_params():
14-
prompt_lengths = [2048, 13]
15+
prompt_lengths = [1024, 13]
1516
num_tokens_list = [40, 1]
1617

1718
params = []
@@ -32,7 +33,7 @@ def generate_test_params():
3233
)
3334
@pytest.mark.parametrize("prompt_len,num_tokens", params, ids=names)
3435
def test_llama_3_2_1b(prompt_len, num_tokens):
35-
command = f"python3 {test_dir}/llama_npu.py {weights_dir}/llama3.2-1b/model.safetensors {weights_dir}/llama3.2-1b/tokenizer.model --num-tokens {num_tokens}"
36+
command = f"python3 {test_dir}/llama_npu.py {weights_dir}/llama3.2-1b/model.safetensors {weights_dir}/llama3.2-1b/tokenizer.model --num-tokens {num_tokens} --prompt-len {prompt_len}"
3637

3738
result = subprocess.run(
3839
command,

iron/common/fusion.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ def load_elf(op):
240240
def patch_elf(elf_data, patches):
241241
for i, patch in patches.items():
242242
val, mask = patch
243+
val = np.uint64(val)
243244
mask = np.uint64(mask) # avoid numpy overflow errors
244245
elf_data[i] = np.uint32((elf_data[i] & ~mask) | (val & mask))
245246
return elf_data

0 commit comments

Comments (0)