#!/bin/bash

# An example run that exercises some of the code paths on the CPU (or MPS on MacBooks)
# Run as:
# bash dev/cpu_demo_run.sh

# NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your MacBook.
# Think of this run as an educational/fun demo, not something you should expect to work well.
# This is also why I hide this script away in dev/
| 10 | + |
| 11 | +# all the setup stuff |
| 12 | +export OMP_NUM_THREADS=1 |
| 13 | +NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" |
| 14 | +mkdir -p $NANOCHAT_BASE_DIR |
| 15 | +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh |
| 16 | +[ -d ".venv" ] || uv venv |
| 17 | +uv sync |
| 18 | +source .venv/bin/activate |
| 19 | +if [ -z "$WANDB_RUN" ]; then |
| 20 | + WANDB_RUN=dummy |
| 21 | +fi |
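# If you do want wandb logging, you can presumably set the run name from the
# environment before invoking this script, e.g. (hypothetical run name):
# WANDB_RUN=cpu_demo bash dev/cpu_demo_run.sh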
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
    curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
    unzip -q eval_bundle.zip
    rm eval_bundle.zip
    mv eval_bundle "$NANOCHAT_BASE_DIR"
fi

# wipe the report
python -m nanochat.report reset

# train tokenizer on ~1B characters
python -m nanochat.dataset -n 4
python -m scripts.tok_train --max_chars=1000000000
python -m scripts.tok_eval
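# Rough data-volume arithmetic (an inference from the flags, not verified here):
# covering --max_chars=1000000000 with 4 downloaded shards implies each shard
# holds at least ~1e9 / 4 = 250M characters of text.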

# train a very small 4-layer model on the CPU
# each optimization step processes a single sequence of 1024 tokens
# we only run 50 steps of optimization (bump this up to get better results)
python -m scripts.base_train \
    --depth=4 \
    --max_seq_len=1024 \
    --device_batch_size=1 \
    --total_batch_size=1024 \
    --eval_every=50 \
    --eval_tokens=4096 \
    --core_metric_every=50 \
    --core_metric_max_per_task=12 \
    --sample_every=50 \
    --num_iterations=50
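# Batch-size arithmetic behind the flags above (assuming total_batch_size counts
# tokens, as the comment implies): one micro-batch is
# device_batch_size * max_seq_len = 1 * 1024 = 1024 tokens, which already equals
# total_batch_size, so there is no gradient accumulation: each optimizer step
# sees exactly one sequence.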
python -m scripts.base_loss --device_batch_size=1 --split_tokens=4096
python -m scripts.base_eval --max-per-task=5

# midtraining
python -m scripts.mid_train \
    --max_seq_len=1024 \
    --device_batch_size=1 \
    --eval_every=50 \
    --eval_tokens=4096 \
    --total_batch_size=1024 \
    --num_iterations=100
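# (mid_train continues from the base_train checkpoint above; with eval_every=50
# and num_iterations=100 it should evaluate validation loss twice, around steps
# 50 and 100.)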
# eval results will be terrible, this is just to exercise the code paths.
# note that we lower the execution memory limit to 1MB to avoid warnings on smaller systems
python -m scripts.chat_eval --source=mid --max-new-tokens=128 --max-problems=20

# SFT
python -m scripts.chat_sft \
    --device_batch_size=1 \
    --target_examples_per_step=4 \
    --num_iterations=100 \
    --eval_steps=4 \
    --eval_metrics_max_problems=16
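# To also score the SFT checkpoint, you can presumably re-run the same eval as
# above pointed at it (assuming --source=sft is accepted, mirroring --source=mid):
# python -m scripts.chat_eval --source=sft --max-new-tokens=128 --max-problems=20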

# Chat CLI
# python -m scripts.chat_cli -p "Why is the sky blue?"

# Chat Web
# python -m scripts.chat_web
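# chat_web serves a browser UI for talking to the model; once it is up, visit
# the URL it prints on startup (likely a localhost port, but check the output).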

python -m nanochat.report generate