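minor: add barcode whitelist to import_values, handle `.gz` file names when deriving barcodes, index tsse plot data positionally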

kaizhang · kaizhang · commit 50864e9770ac · 2025-04-14T12:51:06.000+08:00
diff --git a/snapatac2-core/src/preprocessing/import.rs b/snapatac2-core/src/preprocessing/import.rs
@@ -318,6 +318,7 @@ pub fn import_values<A, I>(
     anndata: &A,
     values: I,
     chrom_sizes: &ChromSizes,
+    white_list: Option<&HashSet<String>>,
     chunk_size: usize,
 ) -> Result<()>
 where
@@ -373,6 +374,7 @@ where
     let chunked_values = values.chunk_by(|x| x.0.clone());
     let chunked_values = chunked_values
         .into_iter()
+        .filter(|(key, _)| white_list.map_or(true, |x| x.contains(key)))
         .progress_with(spinner)
         .chunks(chunk_size);
     let arrays = chunked_values.into_iter().map(|chunk| {
diff --git a/snapatac2-python/python/snapatac2/plotting/__init__.py b/snapatac2-python/python/snapatac2/plotting/__init__.py
@@ -94,8 +94,8 @@ def tsse(
         raise ValueError("TSS enrichment score is not computed, please run `metrics.tsse` first.")
 
     selected_cells = np.where(adata.obs["n_fragment"] >= min_fragment)[0]
-    x = adata.obs["n_fragment"][selected_cells]
-    y = adata.obs["tsse"][selected_cells]
+    x = adata.obs["n_fragment"].to_numpy()[selected_cells]
+    y = adata.obs["tsse"].to_numpy()[selected_cells]
 
     fig = kde2d(x, y, log_x=True, log_y=False)
     fig.update_layout(
diff --git a/snapatac2-python/python/snapatac2/preprocessing/_basic.py b/snapatac2-python/python/snapatac2/preprocessing/_basic.py
@@ -400,6 +400,7 @@ def import_values(
     chrom_sizes: Genome | dict[str, int],
     *,
     file: Path | None = None,
+    whitelist: Path | list[str] | None = None,
     chunk_size: int = 200,
     backend: Literal['hdf5'] = 'hdf5',
 ) -> internal.AnnData:
@@ -410,13 +411,17 @@ def import_values(
     ----------
     input_dir
         Directory containing the input files. Each file corresponds to a single cell.
+    chrom_sizes
+        A Genome object or a dictionary containing chromosome sizes, for example,
+        `{"chr1": 2393, "chr2": 2344, ...}`.
     file
         File name of the output h5ad file used to store the result. If provided,
         result will be saved to a backed AnnData, otherwise an in-memory AnnData
         is used.
-    chrom_sizes
-        A Genome object or a dictionary containing chromosome sizes, for example,
-        `{"chr1": 2393, "chr2": 2344, ...}`.
+    whitelist
+        File name or a list of barcodes. If it is a file name, each line
+        must contain a valid barcode. When provided, only barcodes in the whitelist
+        will be retained.
     chunk_size
         Increasing the chunk_size speeds up I/O but uses more memory.
     backend
@@ -433,9 +438,16 @@ def import_values(
     if len(chrom_sizes) == 0:
         raise ValueError("chrom_size cannot be empty")
 
+    if whitelist is not None:  # normalize to a set of barcode strings
+        if isinstance(whitelist, (str, Path)):
+            with open(whitelist, "r") as fl:
+                whitelist = {line.strip() for line in fl}
+        else:
+            whitelist = set(whitelist)
+
     adata = AnnData() if file is None else internal.AnnData(filename=file, backend=backend)
     internal.import_values(
-        adata, input_dir, chrom_sizes, chunk_size
+        adata, input_dir, chrom_sizes, chunk_size, whitelist  # binding order: chunk_size, then white_list
     )
     return adata
 
diff --git a/snapatac2-python/src/preprocessing.rs b/snapatac2-python/src/preprocessing.rs
@@ -12,6 +12,7 @@ use pyo3::{prelude::*, pybacked::PyBackedStr};
 use snapatac2_core::feature_count::ValueType;
 use snapatac2_core::preprocessing::SummaryType;
 use std::collections::HashMap;
+use std::ffi::OsStr;
 use std::io::{BufRead, BufReader};
 use std::path::PathBuf;
 use std::{collections::BTreeMap, collections::HashSet, ops::Deref, str::FromStr};
@@ -239,14 +240,20 @@ pub(crate) fn import_contacts(
 }
 
 #[pyfunction]
+#[pyo3(signature = (anndata, input_dir, chrom_size, chunk_size, white_list=None))]
 pub(crate) fn import_values(
     anndata: AnnDataLike,
     input_dir: PathBuf,
     chrom_size: BTreeMap<String, u64>,
     chunk_size: usize,
+    white_list: Option<HashSet<String>>,
 ) -> Result<()> {
     fn read_chrom_values(path: PathBuf) -> impl Iterator<Item = (String, BaseValue)> {
-        let barcode = path.file_stem().unwrap().to_str().unwrap().to_string();
+        // `Path::ends_with` matches whole components only, so test the extension to detect ".gz".
+        let is_gz = path.extension() == Some(OsStr::new("gz"));
+        let stem = path.file_stem().unwrap();
+        let stem = if is_gz { std::path::Path::new(stem).file_stem().unwrap() } else { stem };
+        let barcode = stem.to_str().unwrap().to_string();
         let reader = BufReader::new(utils::open_file_for_read(&path));
         reader.lines().skip(1).map(move |line| {
             let line = line.unwrap();
@@ -268,7 +275,7 @@ pub(crate) fn import_values(
 
     macro_rules! run {
         ($data:expr) => {
-            preprocessing::import_values($data, sorted_values, &chrom_sizes, chunk_size)?
+            preprocessing::import_values($data, sorted_values, &chrom_sizes, white_list.as_ref(), chunk_size)?
         };
     }