
Commit 33e8a27

Merge karpathy/cpu-mps-dev, adding the ability to run on CPU, on MPS, or on CUDA, with autodetect. Gnarly PR, nonzero chance I broke something.
add cpu|mps support
2 parents bb71c64 + 50bea28 commit 33e8a27

19 files changed, +266 -93 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ And a bit more about computing environments that will run nanochat:
 
 ## Running on CPU / MPS
 
-If you'd like to tinker with nanochat on your Macbook or a CPU machine, there is a work in progress [CPU|MPS PR](https://github.com/karpathy/nanochat/pull/88) up here. If you're on Macbook, use `--device_type=mps` when running `base_train.py`. See the PR and its diff for more. You're not going to get too far without GPU nodes, but at least you'll be able to run the code and maybe train a very tiny LLM with some patience.
+nanochat can be run on CPU or on MPS (if you're on a Macbook), and will automatically try to detect which device is best to run on. You're not going to get too far without GPUs, but at least you'll be able to run the code paths and maybe train a tiny LLM with some patience. For an example of how to make all the run commands much smaller (feel free to tune!), you can refer to the [dev/runcpu.sh](dev/runcpu.sh) file. You'll see that I'm essentially restricting all scripts to train smaller models, to run for a shorter number of iterations, etc. This functionality is new, slightly gnarly (it touched a lot of code), and was merged in this [CPU|MPS PR](https://github.com/karpathy/nanochat/pull/88) on Oct 21, 2025.
 
 ## Customization
 
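
As a quick sanity check before trying the CPU/MPS path, you can ask PyTorch directly which backends it sees. This is a small illustrative sketch (not part of the commit) that mirrors the CUDA-then-MPS-then-CPU preference order described above.

import torch

# mirrors nanochat's autodetect preference order: CUDA first, then MPS, then CPU
if torch.cuda.is_available():
    print("CUDA available: 'cuda' would be selected")
elif torch.backends.mps.is_available():
    print("MPS available: 'mps' would be selected")
else:
    print("no accelerator found: falling back to 'cpu'")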

dev/runcpu.sh

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+# Showing an example run for exercising some of the code paths on the CPU (or MPS on Macbooks)
+# Run as:
+# bash dev/runcpu.sh
+
+# NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your Macbook.
+# Think of this run as an educational/fun demo, not something you should expect to work well.
+# This is also why I hide this script away in dev/
+
+# all the setup stuff
+export OMP_NUM_THREADS=1
+NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
+mkdir -p $NANOCHAT_BASE_DIR
+command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+[ -d ".venv" ] || uv venv
+uv sync
+source .venv/bin/activate
+if [ -z "$WANDB_RUN" ]; then
+    WANDB_RUN=dummy
+fi
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+source "$HOME/.cargo/env"
+uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
+EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
+if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
+    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
+    unzip -q eval_bundle.zip
+    rm eval_bundle.zip
+    mv eval_bundle $NANOCHAT_BASE_DIR
+fi
+
+# wipe the report
+python -m nanochat.report reset
+
+# train tokenizer on ~1B characters
+python -m nanochat.dataset -n 4
+python -m scripts.tok_train --max_chars=1000000000
+python -m scripts.tok_eval
+
+# train a very small 4 layer model on the CPU
+# each optimization step processes a single sequence of 1024 tokens
+# we only run 50 steps of optimization (bump this to get better results)
+python -m scripts.base_train \
+    --depth=4 \
+    --max_seq_len=1024 \
+    --device_batch_size=1 \
+    --total_batch_size=1024 \
+    --eval_every=50 \
+    --eval_tokens=4096 \
+    --core_metric_every=50 \
+    --core_metric_max_per_task=12 \
+    --sample_every=50 \
+    --num_iterations=50
+python -m scripts.base_loss --device_batch_size=1 --split_tokens=4096
+python -m scripts.base_eval --max-per-task=5
+
+# midtraining
+python -m scripts.mid_train \
+    --max_seq_len=1024 \
+    --device_batch_size=1 \
+    --eval_every=50 \
+    --eval_tokens=4096 \
+    --total_batch_size=1024 \
+    --num_iterations=100
+# eval results will be terrible, this is just to execute the code paths.
+# note that we lower the execution memory limit to 1MB to avoid warnings on smaller systems
+python -m scripts.chat_eval --source=mid --max-new-tokens=128 --max-problems=20
+
+# SFT
+python -m scripts.chat_sft \
+    --device_batch_size=1 \
+    --target_examples_per_step=4 \
+    --num_iterations=100 \
+    --eval_steps=4 \
+    --eval_metrics_max_problems=16
+
+# Chat CLI
+# python -m scripts.chat_cli -p "Why is the sky blue?"
+
+# Chat Web
+# python -m scripts.chat_web
+
+python -m nanochat.report generate
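
To put the demo settings above in perspective, here is a rough back-of-the-envelope count of how many tokens this run actually trains on, a sketch using only the flags shown in the script:

# base_train: --total_batch_size=1024 tokens per step, --num_iterations=50
base_tokens = 1024 * 50
# mid_train: --total_batch_size=1024 tokens per step, --num_iterations=100
mid_tokens = 1024 * 100
print(base_tokens, mid_tokens)  # 51200 and 102400 tokens, a tiny fraction of a real pretraining run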

nanochat/common.py

Lines changed: 24 additions & 10 deletions
@@ -89,32 +89,46 @@ def get_dist_info():
     else:
         return False, 0, 0, 1
 
-def compute_init():
+def autodetect_device_type():
+    # prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU
+    if torch.cuda.is_available():
+        device_type = "cuda"
+    elif torch.backends.mps.is_available():
+        device_type = "mps"
+    else:
+        device_type = "cpu"
+    print0(f"Autodetected device type: {device_type}")
+    return device_type
+
+def compute_init(device_type="cuda"): # cuda|cpu|mps
     """Basic initialization that we keep doing over and over, so make common."""
 
-    # CUDA is currently required
-    assert torch.cuda.is_available(), "CUDA is needed for a distributed run atm"
+    assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm"
+    if device_type == "cuda":
+        assert torch.cuda.is_available(), "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'"
+    if device_type == "mps":
+        assert torch.backends.mps.is_available(), "Your PyTorch installation is not configured for MPS but device_type is 'mps'"
 
     # Reproducibility
     torch.manual_seed(42)
-    torch.cuda.manual_seed(42)
+    if device_type == "cuda":
+        torch.cuda.manual_seed(42)
     # skipping full reproducibility for now, possibly investigate slowdown later
     # torch.use_deterministic_algorithms(True)
-    # torch.backends.cudnn.deterministic = True
-    # torch.backends.cudnn.benchmark = False
 
     # Precision
-    torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls
+    if device_type == "cuda":
+        torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls
 
-    # Distributed setup: Distributed Data Parallel (DDP), optional
+    # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA
     ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
-    if ddp:
+    if ddp and device_type == "cuda":
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
         dist.init_process_group(backend="nccl", device_id=device)
         dist.barrier()
     else:
-        device = torch.device("cuda")
+        device = torch.device(device_type) # mps|cpu
 
     if ddp_rank == 0:
         logger.info(f"Distributed world size: {ddp_world_size}")

nanochat/dataloader.py

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
 from nanochat.dataset import parquets_iter_batched
 from nanochat.tokenizer import get_tokenizer
 
-def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128):
+def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda"):
     """Stream pretraining text from parquet files, tokenize, yield training batches."""
     assert split in ["train", "val"], "split must be 'train' or 'val'"
     ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
@@ -43,6 +43,6 @@ def document_batches():
         inputs_cpu = scratch[:-1].to(dtype=torch.int32)
         targets_cpu = scratch[1:]
         # Reshape to 2D and move to GPU async
-        inputs = inputs_cpu.view(B, T).to(device="cuda", dtype=torch.int32, non_blocking=True)
-        targets = targets_cpu.view(B, T).to(device="cuda", dtype=torch.int64, non_blocking=True)
+        inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True)
+        targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True)
         yield inputs, targets
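
The loader change simply threads a device string through to the final .to() calls. Below is a hedged sketch of that tail end with a hypothetical make_batch helper (the helper name and the random scratch buffer are just for illustration); non_blocking=True only buys copy overlap on CUDA with pinned host memory and is harmless on cpu/mps, which is why the same line works for all three backends.

import torch

def make_batch(scratch: torch.Tensor, B: int, T: int, device: str):
    # hypothetical helper mirroring the tail of tokenizing_distributed_data_loader:
    # inputs are tokens [0..n-1], targets are tokens [1..n], then both are copied to `device`
    inputs_cpu = scratch[:-1].to(dtype=torch.int32)
    targets_cpu = scratch[1:]
    inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True)
    targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True)
    return inputs, targets

B, T = 1, 1024
scratch = torch.randint(0, 50000, (B * T + 1,))  # stand-in for a tokenized stream
x, y = make_batch(scratch, B, T, device="cpu")
print(x.shape, y.shape, x.device)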

nanochat/execution.py

Lines changed: 5 additions & 4 deletions
@@ -146,13 +146,12 @@ def reliability_guard(maximum_memory_bytes: Optional[int] = None):
     with caution.
     """
 
-    if maximum_memory_bytes is not None:
+    if platform.uname().system != "Darwin":
+        # These resource limit calls seem to fail on macOS (Darwin), skip?
         import resource
-
         resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
         resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
-        if not platform.uname().system == "Darwin":
-            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+        resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
 
     faulthandler.disable()
 
@@ -225,6 +224,7 @@ def _unsafe_execute(code: str, timeout: float, maximum_memory_bytes: Optional[in
     rmtree = shutil.rmtree
     rmdir = os.rmdir
     chdir = os.chdir
+    unlink = os.unlink
 
     # Disable functionalities that can make destructive changes to the test.
     reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
@@ -282,6 +282,7 @@ def _unsafe_execute(code: str, timeout: float, maximum_memory_bytes: Optional[in
     shutil.rmtree = rmtree
     os.rmdir = rmdir
     os.chdir = chdir
+    os.unlink = unlink
 
 
 def execute_code(
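
Two small things happen in this file: os.unlink joins the list of functions that are saved, disabled, and restored around the untrusted code, and the rlimit calls are skipped on macOS. Here is a sketch of that save/patch/restore pattern with an arbitrary 1 GB cap; the run_sandboxed helper and the limit value are illustrative, not the commit's API.

import os
import platform

def run_sandboxed(fn):
    unlink = os.unlink  # save the real function before disabling it
    try:
        os.unlink = None  # any destructive call inside fn would now fail fast
        if platform.uname().system != "Darwin":
            # the rlimit calls are skipped on macOS, mirroring the diff above
            import resource  # Unix-only module
            one_gb = 1024 ** 3
            resource.setrlimit(resource.RLIMIT_AS, (one_gb, one_gb))
        return fn()
    finally:
        os.unlink = unlink  # always restore, as _unsafe_execute does at the end

print(run_sandboxed(lambda: "ok"))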

nanochat/gpt.py

Lines changed: 3 additions & 2 deletions
@@ -169,8 +169,6 @@ def __init__(self, config):
         cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
         self.register_buffer("cos", cos, persistent=False) # persistent=False means it's not saved to the checkpoint
         self.register_buffer("sin", sin, persistent=False)
-        # Cast the embeddings from fp32 to bf16: optim can tolerate it and it saves memory: both in the model and the activations
-        self.transformer.wte.to(dtype=torch.bfloat16)
 
     def init_weights(self):
         self.apply(self._init_weights)
@@ -184,6 +182,9 @@ def init_weights(self):
         head_dim = self.config.n_embd // self.config.n_head
         cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
         self.cos, self.sin = cos, sin
+        # Cast the embeddings from fp32 to bf16: optim can tolerate it and it saves memory: both in the model and the activations
+        if self.transformer.wte.weight.device.type == "cuda":
+            self.transformer.wte.to(dtype=torch.bfloat16)
 
     def _init_weights(self, module):
         if isinstance(module, nn.Linear):
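
The bf16 cast of the token embedding now lives in init_weights and only fires when the weights are already on a CUDA device, so CPU and MPS runs keep the fp32 table. A minimal standalone sketch of the same conditional cast (the vocabulary and embedding sizes are arbitrary):

import torch
import torch.nn as nn

wte = nn.Embedding(num_embeddings=65536, embedding_dim=768)  # fp32 by default
if torch.cuda.is_available():
    wte = wte.to("cuda")

# cast to bf16 only on CUDA: it halves the memory of the table and its activations,
# while CPU/MPS stay in fp32, where bf16 support is spottier
if wte.weight.device.type == "cuda":
    wte = wte.to(dtype=torch.bfloat16)

print(wte.weight.device, wte.weight.dtype)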

nanochat/loss_eval.py

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ def evaluate_bpb(model, batches, steps, token_bytes):
         loss2d = model(x, y, loss_reduction='none') # (B, T)
         loss2d = loss2d.view(-1) # flatten
         y = y.view(-1) # flatten
-        if (y < 0).any():
+        if (y.int() < 0).any(): # mps does not currently have kernel for < 0 for int64, only int32
             # slightly more complex code path if some target tokens are ignore_index (e.g. -1)
             # any target token < 0 is to be ignored: do NOT index token_bytes with negatives
             valid = y >= 0
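
The only change here is that the ignore_index check compares an int32 view of the targets, working around the missing int64 `< 0` kernel on MPS noted in the inline comment; the subsequent mask is left untouched. A tiny sketch of that check on a toy target vector:

import torch

y = torch.tensor([12, -1, 7, -1, 3], dtype=torch.int64)  # -1 marks ignore_index targets
# token ids comfortably fit in int32, so the downcast is safe and only used for the test
has_ignored = (y.int() < 0).any()
valid = y >= 0  # boolean mask of the real targets, computed on the original tensor
print(bool(has_ignored), valid)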

nanochat/report.py

Lines changed: 4 additions & 0 deletions
@@ -283,6 +283,10 @@ def generate(self):
             # capture bloat data for summary later (the stuff after Bloat header and until \n\n)
             bloat_data = re.search(r"### Bloat\n(.*?)\n\n", header_content, re.DOTALL)
             bloat_data = bloat_data.group(1) if bloat_data else ""
+        else:
+            start_time = None # will cause us to not write the total wall clock time
+            bloat_data = "[bloat data missing]"
+            print(f"Warning: {header_file} does not exist. Did you forget to run `nanochat reset`?")
         # process all the individual sections
         for file_name in EXPECTED_FILES:
             section_file = os.path.join(report_dir, file_name)

pyproject.toml

Lines changed: 18 additions & 11 deletions
@@ -11,6 +11,7 @@ dependencies = [
     "numpy==1.26.4",
     "psutil>=7.1.0",
     "regex>=2025.9.1",
+    "setuptools>=80.9.0",
     "tiktoken>=0.11.0",
     "tokenizers>=0.22.0",
     "torch>=2.8.0",
@@ -22,17 +23,6 @@ dependencies = [
 requires = ["maturin>=1.7,<2.0"]
 build-backend = "maturin"
 
-# target torch to cuda 12.8
-[tool.uv.sources]
-torch = [
-    { index = "pytorch-cu128" },
-]
-
-[[tool.uv.index]]
-name = "pytorch-cu128"
-url = "https://download.pytorch.org/whl/cu128"
-explicit = true
-
 [tool.maturin]
 module-name = "rustbpe"
 bindings = "pyo3"
@@ -53,3 +43,20 @@ testpaths = ["tests"]
 python_files = ["test_*.py"]
 python_classes = ["Test*"]
 python_functions = ["test_*"]
+
+# target torch to cuda 12.8
+[tool.uv.sources]
+torch = [
+    { index = "pytorch-cpu", marker = "sys_platform != 'linux'" },
+    { index = "pytorch-cu128", marker = "sys_platform == 'linux'" },
+]
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true
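
With the markers above, `uv sync` resolves CUDA 12.8 wheels on Linux and the plain CPU wheels everywhere else (the default macOS wheels are what carry MPS support). A quick way to confirm which torch build actually landed in the venv; this is a sketch, not part of the commit:

import torch

print(torch.__version__)                  # Linux wheels carry a local tag such as "+cu128" or "+cpu"
print(torch.version.cuda)                 # the CUDA toolkit version, or None on a CPU-only build
print(torch.backends.mps.is_built())      # True if this build was compiled with MPS support
print(torch.backends.mps.is_available())  # True on a Mac that can actually use MPS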

run1000.sh

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # The $1000 tier of nanochat
 # Designed to run end-to-end for $1000/24 ~= 41.6 hours on an 8XH100 node
 # A bit sparser on comments, see speedrun.sh for more detail
