-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
97 lines (65 loc) · 4.78 KB
/
Makefile
File metadata and controls
97 lines (65 loc) · 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Every target in this file names a command, not a file that its recipe
# produces, so each one is declared phony. Without the declaration, a stray
# file or directory with the same name (e.g. `pipeline`) would make the target
# look up to date and its recipe would be skipped.
.PHONY: help collect preprocess split \
        train-codet5 train-codet5-base train-codebert train-codebert-full \
        index evaluate evaluate-baseline \
        retrain retrain-dry feedback-stats \
        serve serve-prod pipeline pipeline-phase2 \
        test lint format clean setup-env upgrade-model

# Interpreter used by every `$(PYTHON) -m ...` recipe below.
# Override per invocation, e.g. `make train-codet5 PYTHON=python3.11`.
PYTHON := python
# Self-documenting help: lists every rule header of the form
# `target: ## description` found in the parsed makefiles.
#   - The target-name class includes digits so that names such as
#     `train-codet5` and `pipeline-phase2` are listed too.
#   - `grep -h` suppresses file-name prefixes when more than one makefile is
#     in $(MAKEFILE_LIST), which would otherwise end up in the first column.
#   - `[^#]*` stops at the first `## `, avoiding the non-portable `.*?`.
#   - `$$` is Make's escape for a literal `$` handed to the shell/awk.
help: ## Show this help
	@grep -hE '^[a-zA-Z0-9_-]+:[^#]*## .*$$' $(MAKEFILE_LIST) | \
		awk 'BEGIN {FS = ":[^#]*## "}; {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}'
# ─── Data Pipeline ────────────────────────────────────────────────────────────
# Runs the `src.data.collector` module with the configured interpreter.
collect: ## Fetch PR review data from GitHub (requires INSPECTAI_GITHUB_TOKEN in .env)
	$(PYTHON) -m src.data.collector
# Runs the `src.data.preprocessor` module with the configured interpreter.
preprocess: ## Parse diffs and structure data into JSONL pairs
	$(PYTHON) -m src.data.preprocessor
# Runs the `src.data.splitter` module with the configured interpreter.
split: ## Split processed data into train (80%) / valid (20%)
	$(PYTHON) -m src.data.splitter
# ─── Training ─────────────────────────────────────────────────────────────────
# Runs the `src.training.train_codet5` module with its default settings.
train-codet5: ## Fine-tune CodeT5 for review generation
	$(PYTHON) -m src.training.train_codet5
# Same module as `train-codet5`, but with `codet5_name` exported into the
# process environment for this one command only.
# NOTE(review): presumably the training config reads `codet5_name` from the
# environment (the key is lower-case here) - confirm against the settings loader.
train-codet5-base: ## Fine-tune CodeT5-BASE (Phase 2C — 220M params, slower)
	codet5_name=Salesforce/codet5-base \
		$(PYTHON) -m src.training.train_codet5
# Runs the `src.training.train_codebert` module with its default settings.
train-codebert: ## Fine-tune CodeBERT as severity classifier (fast: 400 samples, 1 epoch)
	$(PYTHON) -m src.training.train_codebert
# Same module as `train-codebert`, with MAX_TRAIN_SAMPLES and MAX_EPOCHS set
# in the environment of this one command.
# NOTE(review): MAX_TRAIN_SAMPLES=0 presumably means "no cap" - confirm in the trainer.
train-codebert-full: ## Fine-tune CodeBERT on full dataset (overnight: all samples, 3 epochs)
	MAX_TRAIN_SAMPLES=0 MAX_EPOCHS=3 \
		$(PYTHON) -m src.training.train_codebert
# Runs the `src.training.build_faiss` module with the configured interpreter.
index: ## Build FAISS vector index from training data
	$(PYTHON) -m src.training.build_faiss
# Runs `src.training.evaluate` on 50 samples, tagging the run as `phase2`.
evaluate: ## Run BLEU/ROUGE/Actionability evaluation on validation set (50 samples)
	$(PYTHON) -m src.training.evaluate \
		--samples 50 \
		--phase phase2
# Same evaluation module and sample count as `evaluate`, but tagged
# `phase1_baseline` and annotated with a free-text note via `--notes`.
evaluate-baseline: ## Run evaluation and tag as phase1_baseline (for paper comparison)
	$(PYTHON) -m src.training.evaluate \
		--samples 50 \
		--phase phase1_baseline \
		--notes "codet5-small, 1986 records, heuristic labels"
# ─── Active Learning ──────────────────────────────────────────────────────────
# Runs the `src.feedback.retrainer` module with no extra flags.
retrain: ## Run active learning cycle: pull feedback → augment data → retrain
	$(PYTHON) -m src.feedback.retrainer
# Runs the same module as `retrain`, passing `--dry-run`.
retrain-dry: ## Dry run: collect and merge data but do NOT trigger retraining
	$(PYTHON) -m src.feedback.retrainer --dry-run
# Inline Python one-liner: imports `get_feedback_store`, calls `.stats()` on
# the returned store and pretty-prints the result as JSON (indent=2).
# The backslash-newlines sit inside the double-quoted shell string, so the
# shell joins the pieces back into a single `-c` argument.
feedback-stats: ## Show feedback store statistics (acceptance rate, pending records)
	$(PYTHON) -c "from src.feedback.store import get_feedback_store; \
		import json; \
		print(json.dumps(get_feedback_store().stats(), indent=2))"
# ─── Serving ──────────────────────────────────────────────────────────────────
# Launches uvicorn on `src.api.app:app`, bound to all interfaces on port 8000
# with `--reload` enabled.
serve: ## Start FastAPI server (dev mode with auto-reload, port 8000)
	uvicorn src.api.app:app \
		--host 0.0.0.0 \
		--port 8000 \
		--reload
# Launches uvicorn on `src.api.app:app`, bound to all interfaces on port 8000
# with two worker processes and no auto-reload.
serve-prod: ## Start FastAPI server (production mode, 2 workers)
	uvicorn src.api.app:app \
		--host 0.0.0.0 \
		--port 8000 \
		--workers 2
# ─── Full Pipelines ───────────────────────────────────────────────────────────
# Runs the training stages strictly in order. Each stage gets its own
# `$(MAKE)` line: recipe lines run one after another and stop at the first
# failure, whereas passing all stages as goals of a single sub-make would let
# `make -j` schedule them concurrently (they declare no prerequisites on each
# other), e.g. training before the split has been written.
pipeline: ## Full training pipeline: preprocess → split → train → index
	$(MAKE) preprocess
	$(MAKE) split
	$(MAKE) train-codet5
	$(MAKE) train-codebert
	$(MAKE) index
# Runs the Phase 2 stages strictly in order, one `$(MAKE)` per stage, so that
# `make -j` cannot start evaluation or the server before training has
# finished (the stages declare no prerequisites on each other). `serve` is
# deliberately the last step.
pipeline-phase2: ## Phase 2 full run: full codebert + evaluate baseline + serve
	$(MAKE) train-codebert-full
	$(MAKE) evaluate-baseline
	$(MAKE) serve
# ─── Quality ──────────────────────────────────────────────────────────────────
# Invokes pytest on `tests/` in verbose mode with short tracebacks.
test: ## Run test suite
	pytest tests/ -v --tb=short
# Runs `ruff check` over the `src/` and `tests/` trees.
lint: ## Run ruff linter
	ruff check src/ tests/
# Runs `ruff format` over the `src/` and `tests/` trees.
format: ## Auto-format with ruff
	ruff format src/ tests/
# ─── Utilities ────────────────────────────────────────────────────────────────
# Two passes from the current directory:
#   1. delete every `__pycache__` directory; this pass is best-effort, with
#      stderr discarded and a failing exit status turned into success;
#   2. delete any remaining `*.pyc` files.
clean: ## Remove compiled Python files and __pycache__
	find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
	find . -name "*.pyc" -delete
# Creates `.env` from the `.env.example` template, but only when `.env` does
# not exist yet. An unconditional copy would overwrite a populated `.env`
# (and the secrets in it) if this target were run a second time.
# The whole conditional is one shell command, joined by backslash-newlines,
# because each recipe line otherwise runs in its own shell.
setup-env: ## Copy .env.example to .env (first-time setup)
	@if [ -e .env ]; then \
		echo ".env already exists — leaving it untouched (delete it first to regenerate)."; \
	else \
		cp .env.example .env && \
		echo ".env created — fill in your secrets before running."; \
	fi
# Prints manual instructions only; this target changes no files itself.
# A single printf emits the three lines, one argument per line.
upgrade-model: ## Switch to codet5-base in config (edit default.yaml then retrain)
	@printf '%s\n' \
		"Edit config/default.yaml and uncomment:" \
		" codet5_name: Salesforce/codet5-base" \
		"Then run: make train-codet5"