from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

from coniferest.aadforest import AADForest
from coniferest.datasets import Dataset, DevNetDataset
from coniferest.isoforest import IsolationForest
from coniferest.label import Label
from coniferest.pineforest import PineForest
from coniferest.session.oracle import OracleSession, create_oracle_session


class Compare:
    """Benchmark several active-anomaly-detection models on one dataset.

    For every (model, random seed) pair an oracle session is run, and the
    cumulative number of true anomalies found after each iteration is
    recorded.  Results accumulate across `run()` calls so the object can be
    extended with more seeds later.
    """

    # Model classes under comparison, keyed by the display name used in plots.
    models = {
        'Isolation Forest': IsolationForest,
        'AAD': AADForest,
        'Pine Forest': PineForest,
    }

    def __init__(self, dataset: Dataset, *, iterations=100, n_jobs=-1, sampletrees_per_batch=1 << 20):
        """Store model/session configuration derived from `dataset`.

        Parameters
        ----------
        dataset : Dataset
            Object exposing `.data` (features) and `.labels` (Label values).
        iterations : int
            Number of oracle iterations per session.
        n_jobs : int
            Worker count forwarded to the models (-1 = all cores).
        sampletrees_per_batch : int
            Batch size forwarded to the models.
        """
        self.model_kwargs = {
            'n_trees': 128,
            'sampletrees_per_batch': sampletrees_per_batch,
            'n_jobs': n_jobs,
        }
        self.session_kwargs = {
            'data': dataset.data,
            'labels': dataset.labels,
            'max_iterations': iterations,
        }
        self.results = {}
        self.steps = np.arange(1, iterations + 1)
        # Overall anomaly fraction: slope of the "random guessing" baseline
        # drawn by plot().
        self.total_anomaly_fraction = np.mean(dataset.labels == Label.A)

    def get_sessions(self, random_seed):
        """Return one fresh oracle session per model, all using `random_seed`."""
        model_kwargs = self.model_kwargs | {'random_seed': random_seed}
        return {
            name: create_oracle_session(model=model(**model_kwargs), **self.session_kwargs)
            for name, model in self.models.items()
        }

    def run(self, random_seeds):
        """Run every model for every seed and merge results into `self.results`.

        Returns
        -------
        Compare
            `self`, to allow call chaining (`Compare(...).run(...).plot(...)`).

        Raises
        ------
        ValueError
            If `random_seeds` contains duplicates.
        """
        # Real input validation must not disappear under `python -O`,
        # so raise instead of using `assert`.
        if len(random_seeds) != len(set(random_seeds)):
            raise ValueError("random seeds must be different")

        results = defaultdict(dict)
        for random_seed in tqdm(random_seeds):
            for name, session in self.get_sessions(random_seed).items():
                session.run()
                # Cumulative count of true anomalies among the labels the
                # oracle has revealed, in decision order.
                anomalies = np.cumsum(np.array(list(session.known_labels.values())) == Label.A)
                results[name][random_seed] = anomalies

        self.results |= results
        return self

    def plot(self, dataset_name: str, savefig=False):
        """Plot per-model median and 5–95% band of anomalies found vs iteration.

        Also draws the theoretical random-selection baseline.  Saves to
        `{dataset_name}.pdf` when `savefig` is true.  Returns `self`.
        """
        plt.figure(figsize=(8, 6))
        plt.title(f'Dataset: {dataset_name}')

        for name, anomalies_dict in self.results.items():
            anomalies = np.stack(list(anomalies_dict.values()))
            q5, median, q95 = np.quantile(anomalies, [0.05, 0.5, 0.95], axis=0)

            plt.plot(self.steps, median, alpha=0.75, label=name)
            plt.fill_between(self.steps, q5, q95, alpha=0.5)

        # Expected yield of uniformly random candidate selection.
        plt.plot(self.steps, self.steps * self.total_anomaly_fraction, ls='--', color='grey',
                 label='Theoretical random')

        plt.xlabel('Iteration')
        plt.ylabel('Number of anomalies')
        plt.grid()
        plt.legend()
        if savefig:
            plt.savefig(f'{dataset_name}.pdf')

        return self
import pickle
from pathlib import Path

import pandas as pd


class GalaxyZoo2Dataset(Dataset):
    """Galaxy Zoo 2 features labelled from volunteer vote fractions.

    An object is anomalous when its vote fraction for `anomaly_class`
    (default 'Class6.1', "Anything odd?") is at least `anomaly_threshold`;
    every other object is labelled regular.
    """

    def __init__(self, path: Path, *, anomaly_class='Class6.1', anomaly_threshold=0.9):
        astronomaly = pd.read_parquet(path / "astronomaly.parquet")
        # C-ordered copy of the feature matrix — presumably the forest code
        # expects row-major contiguity; TODO confirm against coniferest.
        self.data = astronomaly.drop(columns=['GalaxyID', 'anomaly']).to_numpy().copy(order='C')
        ids = astronomaly['GalaxyID'].to_numpy()

        solutions = pd.read_csv(path / "training_solutions_rev1.csv", index_col="GalaxyID")
        # .loc instead of chained indexing, and an explicit bool array instead
        # of masking a NumPy array with a pandas Series (which relies on
        # implicit conversion and ignores the Series index).
        anomaly = (solutions.loc[ids, anomaly_class] >= anomaly_threshold).to_numpy()
        self.labels = np.full(anomaly.shape, Label.R)
        self.labels[anomaly] = Label.A


# --- Galaxy Zoo 2 experiment: 200 distinct seeds, results pickled for reuse ---
gz2_seeds = range(12, 212)

path = Path("/home/hombit/gz2")
dataset_obj = GalaxyZoo2Dataset(path)
compare_zoo = Compare(dataset_obj, iterations=100, n_jobs=24, sampletrees_per_batch=1 << 16).run(gz2_seeds)
compare_zoo.plot("Galaxy Zoo 2 (Anything odd? 90%)", savefig=True)
with open("galaxyzoo2_compare.pickle", "wb") as fh:
    pickle.dump(compare_zoo, fh)


# --- DevNet experiment: every available dataset in parallel via joblib ---
from joblib.parallel import delayed, Parallel

# NOTE(review): `avialble_datasets` is the attribute name as (mis)spelled in
# coniferest.datasets.DevNetDataset — keep in sync if the library fixes it.
print(DevNetDataset.avialble_datasets)

# Distinct name so the GZ2 seed range above is not silently shadowed.
devnet_seeds = range(200)
compare_delayed = delayed(
    lambda dataset: Compare(
        DevNetDataset(dataset), iterations=100, n_jobs=48, sampletrees_per_batch=1 << 16
    ).run(devnet_seeds),
)
compare_ = Parallel(
    n_jobs=len(DevNetDataset.avialble_datasets),
)(compare_delayed(dataset) for dataset in DevNetDataset.avialble_datasets)

for dataset, compare_obj in zip(DevNetDataset.avialble_datasets, compare_):
    print(f"Plot {dataset}")
    compare_obj.plot(dataset, savefig=True)

for dataset, compare_obj in zip(DevNetDataset.avialble_datasets, compare_):
    print(f"Save Compare object for {dataset}")
    with open(f'{dataset}_compare.pickle', 'wb') as fh:
        pickle.dump(compare_obj, fh)
"""Benchmark coniferest models on the Galaxy Zoo 2 dataset.

Script counterpart of the `devnet_datasets.ipynb` notebook: runs the
Compare benchmark over 200 seeds and pickles the result object.
"""

from collections import defaultdict
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from coniferest.aadforest import AADForest
from coniferest.datasets import Dataset, DevNetDataset
from coniferest.isoforest import IsolationForest
from coniferest.label import Label
from coniferest.pineforest import PineForest
from coniferest.session.oracle import OracleSession, create_oracle_session


class Compare:
    """Benchmark several active-anomaly-detection models on one dataset.

    For every (model, random seed) pair an oracle session is run, and the
    cumulative number of true anomalies found after each iteration is
    recorded.  Results accumulate across `run()` calls.
    """

    # Model classes under comparison, keyed by display name used in plots.
    models = {
        'Isolation Forest': IsolationForest,
        'AAD': AADForest,
        'Pine Forest': PineForest,
    }

    def __init__(self, dataset: Dataset, *, iterations=100, n_jobs=-1, sampletrees_per_batch=1 << 20):
        """Store model/session configuration derived from `dataset`."""
        self.model_kwargs = {
            'n_trees': 128,
            'sampletrees_per_batch': sampletrees_per_batch,
            'n_jobs': n_jobs,
        }
        self.session_kwargs = {
            'data': dataset.data,
            'labels': dataset.labels,
            'max_iterations': iterations,
        }
        self.results = {}
        self.steps = np.arange(1, iterations + 1)
        # Slope of the "random guessing" baseline drawn by plot().
        self.total_anomaly_fraction = np.mean(dataset.labels == Label.A)

    def get_sessions(self, random_seed):
        """Return one fresh oracle session per model, all using `random_seed`."""
        model_kwargs = self.model_kwargs | {'random_seed': random_seed}
        return {
            name: create_oracle_session(model=model(**model_kwargs), **self.session_kwargs)
            for name, model in self.models.items()
        }

    def run(self, random_seeds):
        """Run every model for every seed; merge into `self.results`; return self.

        Raises
        ------
        ValueError
            If `random_seeds` contains duplicates.
        """
        # Raise instead of `assert`: asserts vanish under `python -O`.
        if len(random_seeds) != len(set(random_seeds)):
            raise ValueError("random seeds must be different")

        results = defaultdict(dict)
        for random_seed in tqdm(random_seeds):
            for name, session in self.get_sessions(random_seed).items():
                session.run()
                # Cumulative true-anomaly count over revealed labels,
                # in decision order.
                anomalies = np.cumsum(np.array(list(session.known_labels.values())) == Label.A)
                results[name][random_seed] = anomalies

        self.results |= results
        return self

    def plot(self, dataset_name: str, savefig=False):
        """Plot per-model median and 5–95% band vs iteration; return self."""
        plt.figure(figsize=(8, 6))
        plt.title(f'Dataset: {dataset_name}')

        for name, anomalies_dict in self.results.items():
            anomalies = np.stack(list(anomalies_dict.values()))
            q5, median, q95 = np.quantile(anomalies, [0.05, 0.5, 0.95], axis=0)

            plt.plot(self.steps, median, alpha=0.75, label=name)
            plt.fill_between(self.steps, q5, q95, alpha=0.5)

        # Expected yield of uniformly random candidate selection.
        plt.plot(self.steps, self.steps * self.total_anomaly_fraction, ls='--', color='grey',
                 label='Theoretical random')

        plt.xlabel('Iteration')
        plt.ylabel('Number of anomalies')
        plt.grid()
        plt.legend()
        if savefig:
            plt.savefig(f'{dataset_name}.pdf')

        return self


class GalaxyZoo2Dataset(Dataset):
    """Galaxy Zoo 2 features labelled from volunteer vote fractions.

    Anomalous iff the vote fraction for `anomaly_class` is at least
    `anomaly_threshold`; regular otherwise.
    """

    def __init__(self, path: Path, *, anomaly_class='Class6.1', anomaly_threshold=0.9):
        astronomaly = pd.read_parquet(path / "astronomaly.parquet")
        # C-ordered copy — presumably the forest code expects row-major
        # contiguity; TODO confirm against coniferest.
        self.data = astronomaly.drop(columns=['GalaxyID', 'anomaly']).to_numpy().copy(order='C')
        ids = astronomaly['GalaxyID'].to_numpy()

        solutions = pd.read_csv(path / "training_solutions_rev1.csv", index_col="GalaxyID")
        # .loc instead of chained indexing, and an explicit bool array instead
        # of masking a NumPy array with a pandas Series.
        anomaly = (solutions.loc[ids, anomaly_class] >= anomaly_threshold).to_numpy()
        self.labels = np.full(anomaly.shape, Label.R)
        self.labels[anomaly] = Label.A


seeds = range(12, 212)

path = Path("/home/hombit/gz2")
dataset_obj = GalaxyZoo2Dataset(path)
compare_zoo = Compare(dataset_obj, iterations=100, n_jobs=1, sampletrees_per_batch=1 << 16).run(seeds)
+compare_zoo.plot("Galaxy Zoo 2 (Anything odd? 90%)", savefig=True) +with open("galaxyzoo2_compare.pickle", "wb") as fh: + pickle.dump(compare_zoo, fh) diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 4e6641cd..5b14187c 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -10,7 +10,7 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "coniferest" -version = "0.1.0" +version = "0.0.16" dependencies = [ "enum_dispatch", "itertools", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 283e703b..98d81534 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "coniferest" -version = "0.1.0" +version = "0.0.16" edition = "2021" [lib] diff --git a/rust/src/tree_traversal.rs b/rust/src/tree_traversal.rs index 2180d240..f2c27d93 100644 --- a/rust/src/tree_traversal.rs +++ b/rust/src/tree_traversal.rs @@ -89,8 +89,8 @@ where Ok({ let paths = PyArray1::zeros(py, data_view.nrows(), false); - // SAFETY: this call invalidates other views, but it is the only view we need - let paths_view_mut = unsafe { paths.as_array_mut() }; + let mut paths_rw = paths.readwrite(); + let paths_view_mut = paths_rw.as_array_mut(); // Here we need to dispatch `data` and run the template function calc_paths_sum_impl( @@ -146,8 +146,8 @@ where false, ); - // SAFETY: this call invalidates other views, but it is the only view we need - let values_view = unsafe { values.as_array_mut() }; + let mut values_rw = values.readwrite(); + let values_view = values_rw.as_array_mut(); // Here we need to dispatch `data` and run the template function calc_paths_sum_transpose_impl( @@ -190,10 +190,10 @@ where let delta_sum = PyArray2::zeros(py, (data_view.nrows(), data_view.ncols()), false); let hit_count = PyArray2::zeros(py, (data_view.nrows(), data_view.ncols()), false); - // SAFETY: this call invalidates other views, but it is the only view we need - let delta_sum_view = unsafe { delta_sum.as_array_mut() }; - // SAFETY: this call 
invalidates other views, but it is the only view we need - let hit_count_view = unsafe { hit_count.as_array_mut() }; + let mut delta_sum_rw = delta_sum.readwrite(); + let delta_sum_view = delta_sum_rw.as_array_mut(); + let mut hit_count_rw = hit_count.readwrite(); + let hit_count_view = hit_count_rw.as_array_mut(); calc_feature_delta_sum_impl( selectors_view, @@ -234,8 +234,8 @@ where Ok({ let leafs = PyArray2::zeros(py, (data_view.nrows(), node_offsets_view.len() - 1), false); - // SAFETY: this call invalidates other views, but it is the only view we need - let leafs_view = unsafe { leafs.as_array_mut() }; + let mut leafs_rw = leafs.readwrite(); + let leafs_view = leafs_rw.as_array_mut(); calc_apply_impl( selectors_view, diff --git a/src/coniferest/datasets/__init__.py b/src/coniferest/datasets/__init__.py index 37e53f42..4c8c1cd5 100644 --- a/src/coniferest/datasets/__init__.py +++ b/src/coniferest/datasets/__init__.py @@ -158,6 +158,8 @@ def __init__(self, name: str): if name not in self.avialble_datasets: raise ValueError(f"Dataset {name} is not available. 
Available datasets are: {self.avialble_datasets}") + self.name = name + df = pd.read_csv(self._dataset_urls[name]) # Last column is for class, the rest are features diff --git a/src/coniferest/evaluator.py b/src/coniferest/evaluator.py index d45f72b1..e46fbf01 100644 --- a/src/coniferest/evaluator.py +++ b/src/coniferest/evaluator.py @@ -108,10 +108,11 @@ def combine_selectors(cls, selectors_list): # Assign a unique sequential index to every leaf # The index is used for weighted scores leaf_mask = selectors["feature"] < 0 - leaf_count = np.count_nonzero(leaf_mask) - leaf_offsets = np.full_like(node_offsets, leaf_count) - leaf_offsets[:-1] = np.cumsum(leaf_mask)[node_offsets[:-1]] + # Each offset tells how many leafs are in all previous trees + leaf_offsets = np.zeros_like(node_offsets) + leaf_offsets[1:] = np.cumsum(leaf_mask)[node_offsets[1:] - 1] + leaf_count = leaf_offsets[-1] selectors["left"][leaf_mask] = np.arange(0, leaf_count)