"""
Diagnostic script: run on remote server and paste output back.
Usage: conda run -n torch python diagnose_training.py
"""
import os
import re
import json
import glob
import numpy as np
import pandas as pd
from joblib import load
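# All paths below are relative (configs/, data/, output folders); run this from the repo root.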
print("=" * 60)
print("1. CHAMPION CONFIG")
print("=" * 60)
cfg_path = "configs/champion_model.json"
with open(cfg_path) as f:
    cfg = json.load(f)
for k, v in cfg.items():
    print(f" {k}: {v}")
print()
print("=" * 60)
print("2. DATA SPLIT SIZES (paper split)")
print("=" * 60)
x, xp, y = load("data/nasa.bin")
print(f"Total runs in .bin: {len(x)}")
test_cases = [11, 12, 15, 16]
train_pool_cases = [1, 2, 3, 4, 5, 7, 8, 9, 10, 13, 14]
# --final_retrain: val=[] so all non-test cases are train
train_cases_final = list(train_pool_cases) # all 11 non-test cases
print(f"[--final_retrain split]")
print(f" Test cases {test_cases}: {sum((xp[:,0]==c).sum() for c in test_cases)} runs")
print(f" Val cases []: 0 runs (custom_val_cases=[] with --final_retrain)")
print(f" Train cases {train_cases_final}: {sum((xp[:,0]==c).sum() for c in train_cases_final)} runs")
train_cases = train_cases_final # use full train for downstream
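# If the test target range falls outside the train range, the model has to extrapolate.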
print(f" TARGET range — train: [{y[np.isin(xp[:,0], train_cases)].min():.4f}, "
f"{y[np.isin(xp[:,0], train_cases)].max():.4f}]")
print(f" TARGET range — test: [{y[np.isin(xp[:,0], test_cases)].min():.4f}, "
f"{y[np.isin(xp[:,0], test_cases)].max():.4f}]")
# Also show what HP-search val split would be (for context)
val_cases_hp = cfg.get("_val_cases", [3, 5])
train_cases_hp = [c for c in train_pool_cases if c not in val_cases_hp]
print(f"[HP-search split (for context)]")
print(f" Val cases {val_cases_hp}: {sum((xp[:,0]==c).sum() for c in val_cases_hp)} runs")
print(f" Train cases {train_cases_hp}: {sum((xp[:,0]==c).sum() for c in train_cases_hp)} runs")
print()
print("=" * 60)
print("3. LATEST TRAINING RESULTS")
print("=" * 60)
result_dirs = sorted(glob.glob("final_optimized_all_sw250_ss125"), reverse=True)
if not result_dirs:
    result_dirs = sorted(glob.glob("final_all_sw250_ss125"), reverse=True)
if result_dirs:
    out_dir = result_dirs[0]
    print(f"Latest output folder: {out_dir}")
    # Aggregate CSV — only exists after the full run completes
    csv_files = glob.glob(f"{out_dir}/*scores.csv")
    if csv_files:
        for c in csv_files:
            df = pd.read_csv(c, sep=";")
            cols = [col for col in ["name", "rmse", "r2_score", "mae"] if col in df.columns]
            print(f"\n [FINAL] {os.path.basename(c)}:")
            print(df[cols].to_string(index=False))
    else:
        print(" (Aggregate CSV not written yet — run still in progress)")
    # Per-model JSONs — written as each model/rep completes
    per_model_jsons = sorted(glob.glob(f"{out_dir}/DL_*_rep*.json"))
    if per_model_jsons:
        rows = []
        for jpath in per_model_jsons:
            try:
                with open(jpath) as jf:
                    jr = json.load(jf)
                # Missing metrics become NaN so the DataFrame stays numeric where possible
                rows.append({
                    "name": jr.get("name", os.path.basename(jpath)),
                    "rmse": jr.get("rmse", float("nan")),
                    "r2_score": jr.get("r2_score", float("nan")),
                    "mae": jr.get("mae", float("nan")),
                    "val_root_mse": jr.get("val_root_mse", float("nan")),
                    "epochs": len(jr.get("history", {}).get("root_mse", [])),
                    "best_train_rmse": min(jr["history"]["root_mse"]) if jr.get("history", {}).get("root_mse") else float("nan"),
                })
            except Exception as e:
                rows.append({"name": jpath, "rmse": f"ERROR: {e}"})
        dfj = pd.DataFrame(rows)
        print(f"\n [PER-MODEL, {len(rows)} completed so far]:")
        print(dfj.to_string(index=False))
    else:
        print(" No per-model JSON files written yet.")
    # Grouped summary — mean ± std per architecture across reps
    if per_model_jsons:
        dfj["arch"] = dfj["name"].apply(lambda n: re.sub(r"_rep\d+$", "", n))
        numeric_cols = [c for c in ["rmse", "r2_score", "mae", "epochs", "best_train_rmse"]
                        if c in dfj.columns and pd.api.types.is_numeric_dtype(dfj[c])]
        grp = dfj.groupby("arch")[numeric_cols].agg(["mean", "std"]).round(4)
        print("\n [GROUPED SUMMARY — mean (std) across reps, sorted by rmse mean]:")
        summary_rows = []
        for arch, row in grp.iterrows():
            rmse_m = row[("rmse", "mean")]
            rmse_s = row[("rmse", "std")]
            r2_m = row[("r2_score", "mean")]
            ep_m = row[("epochs", "mean")]
            btr_m = row[("best_train_rmse", "mean")]
            summary_rows.append((rmse_m, arch, rmse_s, r2_m, ep_m, btr_m))
        for rmse_m, arch, rmse_s, r2_m, ep_m, btr_m in sorted(summary_rows):
            print(f" {arch:<35s} test_rmse={rmse_m:.4f}±{rmse_s:.4f} "
                  f"r2={r2_m:.3f} train_rmse={btr_m:.4f} epochs={ep_m:.0f}")
    # Keras model files — confirms which rep files actually exist on disk
    keras_files = glob.glob(f"{out_dir}/*.keras")
    print(f"\n Keras model files on disk ({len(keras_files)}):")
    for kf in sorted(keras_files):
        print(f" {os.path.basename(kf)}")
else:
    print(" No output folders found yet.")
print()
print("=" * 60)
print("4. PREPROCESSING STATS CHECK (train vs test scalogram)")
print("=" * 60)
os.environ["KERAS_BACKEND"] = "torch"  # must be set before helpers imports keras
from helpers import apply_full_preprocessing
scalogram = cfg.get("scalogram", "none")
print(f"Scalogram type: {scalogram}")
if scalogram != "none":
    x_arr = x.astype(np.float32)
    # Use only ~3 train runs for speed
    train_mask = np.isin(xp[:, 0], train_cases)
    test_mask = np.isin(xp[:, 0], test_cases)
    x_tr_small = x_arr[train_mask][:3]
    x_te_small = x_arr[test_mask][:3]
    y_tr_small = y[train_mask][:3]
    y_te_small = y[test_mask][:3]
    # Zero placeholders for the process parameters; this check only exercises the scalogram stats
    proc_tr = np.zeros((len(x_tr_small), 4), dtype=np.float32)
    proc_te = np.zeros((len(x_te_small), 4), dtype=np.float32)
    # Same sliding-window settings as the training run (sw250 / ss125)
    sw_cfg = {"window_size": 250, "stride": 125}
    _, _, stats_2d, _ = apply_full_preprocessing(
        [x_tr_small, proc_tr], y_tr_small, cfg, split="train",
        sliding_window_config=sw_cfg
    )
    print(f" Train preprocessing stats keys: {list(stats_2d.keys())}")
    if "scalo_mean" in stats_2d:
        print(f" scalo_mean (first 3 channels): {stats_2d['scalo_mean'].flat[:3]}")
        print(f" scalo_std (first 3 channels): {stats_2d['scalo_std'].flat[:3]}")
    else:
        print(" WARNING: scalo_mean NOT in stats — normalization never computed!")
    # Check test WITHOUT passing stats (the old bug)
    te_no_stats, _, _, _ = apply_full_preprocessing(
        [x_te_small, proc_te], y_te_small, cfg, split="test",
        preproc_stats={},  # empty = old broken behaviour
        sliding_window_config=sw_cfg
    )
    # Check test WITH stats (the fix)
    te_with_stats, _, _, _ = apply_full_preprocessing(
        [x_te_small, proc_te], y_te_small, cfg, split="test",
        preproc_stats=stats_2d,  # correct: reuse the train stats
        sliding_window_config=sw_cfg
    )
    arr_no = te_no_stats[0] if isinstance(te_no_stats, list) else te_no_stats
    arr_with = te_with_stats[0] if isinstance(te_with_stats, list) else te_with_stats
    print(f"\n Test scalogram WITHOUT stats: mean={arr_no.mean():.4f}, std={arr_no.std():.4f}")
    print(f" Test scalogram WITH stats: mean={arr_with.mean():.4f}, std={arr_with.std():.4f}")
    print(" (After correct normalization, mean should be ~0 and std ~1)")
else:
    print(" Scalogram=none — no 2D normalization issue.")
print()
print("=" * 60)
print("5. TRAINING CURVES (from per-model JSONs)")
print("=" * 60)
if result_dirs:
    per_model_jsons = sorted(glob.glob(f"{result_dirs[0]}/DL_*_rep*.json"))
    for jpath in per_model_jsons:
        try:
            with open(jpath) as jf:
                jr = json.load(jf)
            hist = jr.get("history", {})
            train_rmse = hist.get("root_mse", [])
            val_rmse = hist.get("val_root_mse", [])
            name = jr.get("name", os.path.basename(jpath))
            if train_rmse:
                print(f" {name}: epochs={len(train_rmse)}, "
                      f"best_train={min(train_rmse):.4f} (ep {int(np.argmin(train_rmse))+1}), "
                      f"final_train={train_rmse[-1]:.4f}"
                      + (f", best_val={min(val_rmse):.4f}" if val_rmse else ""))
        except Exception as e:
            print(f" {jpath}: ERROR {e}")
else:
    print(" No output folders found yet.")
print()
print("DONE — paste all output above.")