scverse · felix0097 · Mar 11, 2026 · Feb 11, 2026 · Feb 11, 2026 · Feb 11, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,12 @@ and this project adheres to [Semantic Versioning][].
 [keep a changelog]: https://keepachangelog.com/en/1.0.0/
 [semantic versioning]: https://semver.org/spec/v2.0.0.html
 
+## [0.0.9]
+
+### Breaking
+- Renamed `annbatch.DatasetCollection.add_adatas` to {meth}`annbatch.DatasetCollection.add_anndatas` and renamed `adata_paths` argument to `anndata_paths` to be consistent with the rest of the API.
+- In a similar fashion, renamed the `load_adata` argument to `load_anndata` in {meth}`annbatch.DatasetCollection.add_anndatas` and {meth}`annbatch.Loader.use_collection`.
+
 ## [0.0.8]
 
 - {class}`~annbatch.Loader` acccepts an `rng` argument now
@@ -35,7 +41,7 @@ and this project adheres to [Semantic Versioning][].
 
 ## [0.0.4]
 
-- Load into memory nullables/categoricals from `obs` by default when shuffling (i.e., no custom `load_adata` argument to {meth}`annbatch.DatasetCollection.add_adatas`)
+- Load into memory nullables/categoricals from `obs` by default when shuffling (i.e., no custom `load_adata` argument to `annbatch.DatasetCollection.add_adatas`)
 
 ## [0.0.3]
 
@@ -46,12 +52,13 @@ and this project adheres to [Semantic Versioning][].
 
 ## [0.0.2]
 
+
 ### Breaking
 
 - `ZarrSparseDataset` and `ZarrDenseDataset` have been conslidated into {class}`annbatch.Loader`
-- `create_anndata_collection` and `add_to_collection` have been moved into the {meth}`annbatch.DatasetCollection.add_adatas` method
-- Default reading of input data is now fully lazy in {meth}`annbatch.DatasetCollection.add_adatas`, and therefore the shuffle process may now be slower although have better memory properties.  Use `load_adata` argument in {meth}`annbatch.DatasetCollection.add_adatas` to customize this behavior.
-- Files shuffled under the old `create_anndata_collection` will not be recognized by {class}`annbatch.DatasetCollection` and therefore are not usable with the new {class}`annbatch.Loader.use_collection` API.  At the moment, the file metadata we maintain is only for internal purposes - however, if you wish to migrate to be able to use {class}`annbatch.DatasetCollection` in conjunction with {class}`annbatch.Loader.use_collection`, the root folder of the old collection must have attrs `{"encoding-type": "annbatch-preshuffled", "encoding-version": "0.1.0"}` and be a {class}`zarr.Group`. The subfolders (i.e., datasets) must be called `dataset_([0-9]*)`. Otherwise you can use the {meth}`annbatch.Loader.add_anndatas` as before.
+- `create_anndata_collection` and `add_to_collection` have been moved into the `annbatch.DatasetCollection.add_adatas` method
+- Default reading of input data is now fully lazy in `annbatch.DatasetCollection.add_adatas`, and therefore the shuffle process may now be slower, although it has better memory properties.  Use the `load_adata` argument in `annbatch.DatasetCollection.add_adatas` to customize this behavior.
+- Files shuffled under the old `create_anndata_collection` will not be recognized by {class}`annbatch.DatasetCollection` and therefore are not usable with the new {meth}`annbatch.Loader.use_collection` API.  At the moment, the file metadata we maintain is only for internal purposes - however, if you wish to migrate to be able to use {class}`annbatch.DatasetCollection` in conjunction with {meth}`annbatch.Loader.use_collection`, the root folder of the old collection must have attrs `{"encoding-type": "annbatch-preshuffled", "encoding-version": "0.1.0"}` and be a {class}`zarr.Group`. The subfolders (i.e., datasets) must be called `dataset_([0-9]*)`. Otherwise you can use `annbatch.DatasetCollection.add_adatas` as before.
 
 ### Changed
 

diff --git a/README.md b/README.md
@@ -86,8 +86,8 @@ zarr.config.set(
 
 # Create a collection at the given path. The subgroups will all be anndata stores.
 collection = DatasetCollection("path/to/output/collection.zarr")
-collection.add_adatas(
-    adata_paths=[
+collection.add_anndatas(
+    anndata_paths=[
         "path/to/your/file1.h5ad",
         "path/to/your/file2.h5ad"
     ],
@@ -127,9 +127,9 @@ with ad.settings.override(remove_unused_categories=False):
         preload_nchunks=256,
     )
     # `use_collection` automatically uses the on-disk `X` and full `obs` in the `Loader`
-    # but the `load_adata` arg can override this behavior
+    # but the `load_anndata` arg can override this behavior
     # (see `custom_load_func` above for an example of customization).
-    ds = ds.use_collection(collection, load_adata = custom_load_func)
+    ds = ds.use_collection(collection, load_anndata = custom_load_func)
 
 # Iterate over dataloader (plugin replacement for torch.utils.DataLoader)
 for batch in ds:

diff --git a/docs/index.md b/docs/index.md
@@ -9,8 +9,8 @@ Let's go through the above example:
 ### Preprocessing
 
 ```python
-colleciton = DatasetCollection("path/to/output/store.zarr").add_adatas(
-    adata_paths=[
+collection = DatasetCollection("path/to/output/store.zarr").add_anndatas(
+    anndata_paths=[
         "path/to/your/file1.h5ad",
         "path/to/your/file2.h5ad"
     ],

diff --git a/docs/notebooks/example.ipynb b/docs/notebooks/example.ipynb
@@ -118,7 +118,7 @@
    "metadata": {},
    "source": [
     "The conversion code will take care of the following things:\n",
-    "* Align (outer join) the gene spaces across all datasets listed in `adata_paths`\n",
+    "* Align (outer join) the gene spaces across all datasets listed in `anndata_paths`\n",
     "  * The gene spaces are outer-joined based on the gene names provided in the `var_names` field of the individual `AnnData` objects.\n",
     "  * If you want to subset to specific gene space, you can provide a list of gene names via the `var_subset` parameter.\n",
     "* Shuffle the cells across all datasets (this works on larger than memory datasets as well).\n",
@@ -133,7 +133,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {
     "tags": [
      "hide-output"
@@ -178,7 +178,7 @@
     "\n",
     "\n",
     "# For CELLxGENE data, the raw counts can either be found under .raw.X or under .X (if .raw is not supplied).\n",
-    "# To have a store that only contains raw counts, we can write the following load_adata function\n",
+    "# To have a store that only contains raw counts, we can write the following load_anndata function\n",
     "def read_lazy_x_and_obs_only(path) -> ad.AnnData:\n",
     "    \"\"\"Custom load function to only load raw counts from CxG data.\"\"\"\n",
     "    # IMPORTANT: Large data should always be loaded lazily to reduce the memory footprint\n",
@@ -198,14 +198,14 @@
     "\n",
     "\n",
     "collection = DatasetCollection(zarr.open(\"annbatch_collection\", mode=\"w\"))\n",
-    "collection.add_adatas(\n",
+    "collection.add_anndatas(\n",
     "    # List all the h5ad files you want to include in the collection\n",
-    "    adata_paths=[\"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\", \"f81463b8-4986-4904-a0ea-20ff02cbb317.h5ad\"],\n",
+    "    anndata_paths=[\"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\", \"f81463b8-4986-4904-a0ea-20ff02cbb317.h5ad\"],\n",
     "    # Path to store the output collection\n",
     "    shuffle=True,  # Whether to pre-shuffle the cells of the collection\n",
     "    n_obs_per_dataset=2_097_152,  # Number of cells per dataset shard, this number is much higher than available in these datasets but is generally a good target\n",
     "    var_subset=None,  # Optionally subset the collection to a specific gene space\n",
-    "    load_adata=read_lazy_x_and_obs_only,\n",
+    "    load_anndata=read_lazy_x_and_obs_only,\n",
     ")"
    ]
   },
@@ -227,7 +227,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "tags": [
      "hide-output"
@@ -251,7 +251,7 @@
     "from annbatch import Loader\n",
     "\n",
     "\n",
-    "def _load_adata(g: zarr.Group) -> ad.AnnData:\n",
+    "def _load_anndata(g: zarr.Group) -> ad.AnnData:\n",
     "    return ad.AnnData(X=ad.io.sparse_dataset(g[\"X\"]), obs=ad.experimental.read_lazy(g).obs[[\"cell_type\"]].to_memory())\n",
     "\n",
     "\n",
@@ -265,7 +265,7 @@
     ")\n",
     "\n",
     "# Add in the shuffled data that should be used for training.\n",
-    "ds.use_collection(collection, load_adata=_load_adata)"
+    "ds.use_collection(collection, load_anndata=_load_anndata)"
    ]
   },
   {
@@ -328,7 +328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {
     "tags": [
      "hide-output"
@@ -363,11 +363,11 @@
     }
    ],
    "source": [
-    "collection.add_adatas(\n",
-    "    adata_paths=[\n",
+    "collection.add_anndatas(\n",
+    "    anndata_paths=[\n",
     "        \"866d7d5e-436b-4dbd-b7c1-7696487d452e.h5ad\",\n",
     "    ],\n",
-    "    load_adata=read_lazy_x_and_obs_only,\n",
+    "    load_anndata=read_lazy_x_and_obs_only,\n",
     ")"
    ]
   },
@@ -381,7 +381,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "venv",
+   "display_name": "annbatch",
    "language": "python",
    "name": "python3"
   },
@@ -395,7 +395,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.12.12"
   }
  },
  "nbformat": 4,

diff --git a/pyproject.toml b/pyproject.toml
@@ -168,7 +168,9 @@ run.patch = [ "subprocess" ]
 run.source = [ "annbatch" ]
 
 [tool.mypy]
-overrides = [ { module = [ "anndata.*", "cupyx.*", "cupy.*" ], ignore_missing_imports = true } ]
+[[tool.mypy.overrides]]
+module = [ "anndata.*", "cupyx.*", "cupy.*", "torch.*", "h5py.*" ]
+ignore_missing_imports = true
 
 [tool.cruft]
 skip = [