Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ members = [
"algorithms/linfa-ica",
"algorithms/linfa-bayes",
"algorithms/linfa-elasticnet",
"algorithms/linfa-pls",
"datasets",
]

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ Where does `linfa` stand right now? [Are we learning yet?](http://www.arewelearn
| [hierarchical](algorithms/linfa-hierarchical/) | Agglomerative hierarchical clustering | Tested | Unsupervised learning | Cluster and build hierarchy of clusters |
| [bayes](algorithms/linfa-bayes/) | Naive Bayes | Tested | Supervised learning | Contains Gaussian Naive Bayes |
| [ica](algorithms/linfa-ica/) | Independent component analysis | Tested | Unsupervised learning | Contains FastICA implementation |
| [pls](algorithms/linfa-pls/) | Partial Least Squares | Tested | Supervised learning | Contains PLS estimators for dimensionality reduction and regression |

We believe that only a significant community effort can nurture, build, and sustain a machine learning ecosystem in Rust - there is no other way forward.

Expand Down
40 changes: 40 additions & 0 deletions algorithms/linfa-pls/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
[package]
name = "linfa-pls"
version = "0.3.1"
edition = "2018"
authors = ["relf <remi.lafage@onera.fr>"]
description = "Partial Least Squares family methods"
# SPDX license expression; the legacy slash form "MIT/Apache-2.0" is deprecated by cargo.
license = "MIT OR Apache-2.0"

repository = "https://github.com/rust-ml/linfa"
readme = "README.md"

keywords = ["pls", "machine-learning", "linfa", "supervised"]
categories = ["algorithms", "mathematics", "science"]

[features]
default = []
# Opt-in (de)serialization support (model persistence) via serde.
serde = ["serde_crate", "ndarray/serde"]

# serde is renamed to serde_crate so the "serde" feature name above stays free.
[dependencies.serde_crate]
package = "serde"
optional = true
version = "1.0"
default-features = false
features = ["std", "derive"]

[dependencies]
ndarray = { version = "0.13", default-features = false }
ndarray-linalg = "0.12"
ndarray-stats = "0.3"
ndarray-rand = "0.11"
rand_isaac = "0.2"
num-traits = "0.2"
paste = "1.0"

linfa = { version = "0.3.1", path = "../.." }

[dev-dependencies]
linfa-datasets = { version = "0.3.1", path = "../../datasets", features = ["linnerud"] }
rand_isaac = "0.2"
approx = "0.3"
40 changes: 40 additions & 0 deletions algorithms/linfa-pls/examples/pls_regression.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use linfa::prelude::*;
use linfa_pls::{PlsRegression, Result};
use ndarray::{Array, Array1, Array2};
use ndarray_rand::rand::SeedableRng;
use ndarray_rand::rand_distr::StandardNormal;
use ndarray_rand::RandomExt;
use rand_isaac::Isaac64Rng;

#[allow(clippy::many_single_char_names)]
fn main() -> Result<()> {
    // Problem dimensions: n samples, q output features, p input features.
    let (n, q, p) = (1000, 3, 10);
    // Fixed seed so the example is reproducible.
    let mut rng = Isaac64Rng::seed_from_u64(42);

    // Random input matrix X with shape (n, p).
    let x: Array2<f64> = Array::random_using((n, p), StandardNormal, &mut rng);

    // True coefficient matrix B with shape (p, q):
    // first row all ones, second row all twos, every other entry zero.
    let mut b: Array2<f64> = Array2::zeros((p, q));
    b.row_mut(0).assign(&Array1::ones(q));
    b.row_mut(1).assign(&Array1::from_elem(q, 2.));

    // Targets Y with shape (n, q): y_j = 1*x1 + 2*x2 + noise ~ Normal(5, 1).
    let noise = Array::random_using((n, q), StandardNormal, &mut rng).mapv(|v: f64| v + 5.);
    let y = x.dot(&b) + noise;

    // Fit a PLS2 regression with 3 latent components on the synthetic dataset.
    let ds = Dataset::new(x, y);
    let pls = PlsRegression::params(3)
        .scale(true)
        .max_iterations(200)
        .fit(&ds)?;

    println!("True B (such that: Y = XB + noise)");
    println!("{:?}", b);

    // The fitted regression coefficients should approximate B.
    println!("Estimated B");
    println!("{:1.1}", pls.coefficients());
    Ok(())
}
36 changes: 36 additions & 0 deletions algorithms/linfa-pls/src/errors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use ndarray_linalg::error::LinalgError;
use std::error::Error;
use std::fmt::{self, Display};

/// Convenience alias for results produced by this crate.
pub type Result<T> = std::result::Result<T, PlsError>;

/// Errors that can occur while fitting or applying a PLS model.
/// Each variant carries a human-readable detail message.
#[derive(Debug)]
pub enum PlsError {
    /// The dataset has fewer samples than the algorithm requires.
    NotEnoughSamplesError(String),
    /// The requested number of components is invalid for the given data.
    BadComponentNumberError(String),
    /// The NIPALS power method did not converge within the iteration limit.
    PowerMethodNotConvergedError(String),
    /// An underlying linear-algebra operation failed.
    LinalgError(String),
}

impl Display for PlsError {
    /// Renders the error as "<category>: <detail message>".
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Map each variant to its category prefix, then emit once.
        let (prefix, detail) = match self {
            Self::NotEnoughSamplesError(message) => ("Not enough samples", message),
            Self::BadComponentNumberError(message) => ("Bad component number", message),
            Self::PowerMethodNotConvergedError(message) => ("Power method not converged", message),
            Self::LinalgError(message) => ("Linear algebra error", message),
        };
        write!(f, "{}: {}", prefix, detail)
    }
}

// Marker impl: the Display and Debug impls above satisfy the Error contract.
impl Error for PlsError {}

/// Convert a low-level linear-algebra failure into a `PlsError`,
/// preserving the original error text as the detail message.
impl From<LinalgError> for PlsError {
    fn from(error: LinalgError) -> PlsError {
        PlsError::LinalgError(error.to_string())
    }
}
242 changes: 242 additions & 0 deletions algorithms/linfa-pls/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
//! # Partial Least Squares
//!
//! `linfa-pls` provides an implementation of methods in the PLS (Partial Least Squares) family.
//! The PLS method is a statistical method that finds a linear relationship between
//! input variables and output variables by projecting them onto a new subspace formed
//! by newly chosen variables (aka latent variables), which are linear
//! combinations of the input variables. The subspace is chosen to maximize the
//! covariance between responses and independent variables.
//!
//! This approach is particularly useful when the original data are characterized by
//! a large number of highly collinear variables measured on a small number of samples.
//!
//! The implementation is a port of the scikit-learn 0.24 cross-decomposition module.
//!
//! ## References
//!
//! * [A survey of Partial Least Squares (PLS) methods, with emphasis on the two-block case JA Wegelin](https://stat.uw.edu/sites/default/files/files/reports/2000/tr371.pdf)
//! * [Scikit-Learn User Guide](https://scikit-learn.org/stable/modules/cross_decomposition.html#cross-decomposition)
//!
//! ## Example
//!
//! ```rust, ignore
//! use linfa::prelude::*;
//! use linfa_pls::{errors::Result, PlsRegression};
//! use ndarray::array;
//!
//! // Load the linnerud dataset: 20 samples, 3 input features, 3 output features
//! let ds = linnerud();
//!
//! // Fit PLS2 method using 2 principal components (latent variables)
//! let pls = PlsRegression::params(2).fit(&ds)?;
//!
//! // We can either apply the dimension reduction to a dataset
//! let reduced_ds = pls.transform(ds);
//!
//! // ... or predict outputs given a new input sample.
//! let exercices = array![[14., 146., 61.], [6., 80., 60.]];
//! let physio_measures = pls.predict(exercices);
//! ```
// Internal crate modules.
mod errors;
mod pls_generic;
mod pls_svd;
mod utils;

use crate::pls_generic::*;

use linfa::{traits::Fit, traits::PredictRef, traits::Transformer, DatasetBase, Float};
use ndarray::{Array2, ArrayBase, Data, Ix2};
use ndarray_linalg::{Lapack, Scalar};

// Re-export error types and the SVD-based estimator at the crate root.
pub use errors::*;
pub use pls_svd::*;

/// Generates, for a given PLS variant `$name` (e.g. `Regression`, `Canonical`,
/// `Cca`), the public wrapper types `Pls$name` / `Pls$name Params` around the
/// generic `Pls` implementation, together with their `Fit`, `Transformer` and
/// `PredictRef` impls. `paste::item!` builds the concatenated identifiers.
macro_rules! pls_algo { ($name:ident) => {
    paste::item! {
        /// Hyper-parameters for fitting the corresponding PLS estimator;
        /// wraps the generic `PlsParams` configured for this variant.
        pub struct [<Pls $name Params>]<F: Float>(PlsParams<F>);

        impl<F: Float> [<Pls $name Params>]<F> {
            /// Set the maximum number of iterations of the power method when algorithm='Nipals'. Ignored otherwise.
            pub fn max_iterations(mut self, max_iter: usize) -> Self {
                self.0.max_iter = max_iter;
                self
            }

            /// Set the tolerance used as convergence criteria in the power method: the algorithm
            /// stops whenever the squared norm of u_i - u_{i-1} is less than tol, where u corresponds
            /// to the left singular vector.
            pub fn tolerance(mut self, tolerance: F) -> Self {
                self.0.tolerance = tolerance;
                self
            }

            /// Set whether to scale the dataset
            pub fn scale(mut self, scale: bool) -> Self {
                self.0.scale = scale;
                self
            }

            /// Set the algorithm used to estimate the first singular vectors of the cross-covariance matrix.
            /// `Nipals` uses the power method while `Svd` will compute the whole SVD.
            pub fn algorithm(mut self, algorithm: Algorithm) -> Self {
                self.0.algorithm = algorithm;
                self
            }
        }

        /// A fitted PLS model of this variant; thin wrapper delegating to the generic `Pls`.
        pub struct [<Pls $name>]<F: Float>(Pls<F>);
        impl<F: Float> [<Pls $name>]<F> {
            /// Create a parameter set for this variant using `n_components` latent variables.
            pub fn params(n_components: usize) -> [<Pls $name Params>]<F> {
                [<Pls $name Params>](Pls::[<$name:lower>](n_components))
            }

            /// Singular vectors of the cross-covariance matrices
            pub fn weights(&self) -> (&Array2<F>, &Array2<F>) {
                self.0.weights()
            }

            /// Loadings of records and targets
            pub fn loadings(&self) -> (&Array2<F>, &Array2<F>) {
                self.0.loadings()
            }

            /// Projection matrices used to transform records and targets
            pub fn rotations(&self) -> (&Array2<F>, &Array2<F>) {
                self.0.rotations()
            }

            /// The coefficients of the linear model such that Y is approximated as Y = X.coefficients
            pub fn coefficients(&self) -> &Array2<F> {
                self.0.coefficients()
            }

            /// Transform the given dataset in the projected space back to the original space.
            pub fn inverse_transform(
                &self,
                dataset: DatasetBase<
                    ArrayBase<impl Data<Elem = F>, Ix2>,
                    ArrayBase<impl Data<Elem = F>, Ix2>,
                >,
            ) -> DatasetBase<Array2<F>, Array2<F>> {
                self.0.inverse_transform(dataset)
            }
        }

        impl<F: Float + Scalar + Lapack, D: Data<Elem = F>> Fit<'_, ArrayBase<D, Ix2>, ArrayBase<D, Ix2>>
            for [<Pls $name Params>]<F>
        {
            type Object = Result<[<Pls $name>]<F>>;

            /// Fit the estimator to the dataset; delegates to the generic `Pls` fit.
            fn fit(
                &self,
                dataset: &DatasetBase<ArrayBase<D, Ix2>, ArrayBase<D, Ix2>>,
            ) -> Result<[<Pls $name>]<F>> {
                let pls = self.0.fit(dataset)?;
                Ok([<Pls $name>](pls))
            }
        }

        impl<F: Float, D: Data<Elem = F>> Transformer<
            DatasetBase<ArrayBase<D, Ix2>, ArrayBase<D, Ix2>>,
            DatasetBase<Array2<F>, Array2<F>>,
        > for [<Pls $name>]<F>
        {
            /// Apply dimension reduction to the given dataset
            fn transform(
                &self,
                dataset: DatasetBase<ArrayBase<D, Ix2>, ArrayBase<D, Ix2>>,
            ) -> DatasetBase<Array2<F>, Array2<F>> {
                self.0.transform(dataset)
            }
        }

        impl<F: Float, D: Data<Elem = F>> PredictRef<ArrayBase<D, Ix2>, Array2<F>> for [<Pls $name>]<F> {
            /// Given an input matrix `X`, with shape `(n_samples, n_features)`,
            /// `predict` returns the target variable according to [<Pls $name>] method
            /// learned from the training data distribution.
            fn predict_ref<'a>(&'a self, x: &ArrayBase<D, Ix2>) -> Array2<F> {
                self.0.predict_ref(x)
            }
        }
    }
}}

// Instantiate the three PLS estimator variants: PLS2 regression,
// canonical PLS, and canonical correlation analysis (CCA).
pls_algo!(Regression);
pls_algo!(Canonical);
pls_algo!(Cca);

#[cfg(test)]
mod test {
    use super::*;
    use approx::assert_abs_diff_eq;
    use linfa::{traits::Fit, traits::Predict, traits::Transformer};
    use linfa_datasets::linnerud;
    use ndarray::array;

    /// Generates one `#[test]` per PLS variant.
    /// - The `(Svd)` arm only checks that fitting and transforming run without error.
    /// - The `($name, $expected)` arm additionally predicts on two new samples and
    ///   compares against reference values.
    macro_rules! test_pls_algo {
        (Svd) => {
            paste::item! {
                #[test]
                fn [<test_pls_svd>]() -> Result<()> {
                    let ds = linnerud();
                    let pls = PlsSvd::<f64>::params(3).fit(&ds)?;
                    let _ds1 = pls.transform(ds);
                    Ok(())
                }
            }
        };

        ($name:ident, $expected:expr) => {
            paste::item! {
                #[test]
                fn [<test_pls_$name:lower>]() -> Result<()> {
                    let ds = linnerud();
                    let pls = [<Pls $name>]::<f64>::params(2).fit(&ds)?;
                    let _ds1 = pls.transform(ds);
                    let exercices = array![[14., 146., 61.], [6., 80., 60.]];
                    let physios = pls.predict(exercices);
                    assert_abs_diff_eq!($expected, physios.targets(), epsilon=1e-2);
                    Ok(())
                }
            }
        };
    }

    // Prediction values were checked against scikit-learn 0.24.1
    test_pls_algo!(
        Canonical,
        array![
            [180.56979423, 33.29543984, 56.90850758],
            [190.854022, 38.91963398, 53.26914489]
        ]
    );
    test_pls_algo!(
        Regression,
        array![
            [172.39580643, 34.11919145, 57.15430526],
            [192.11167813, 38.05058858, 53.99844922]
        ]
    );
    test_pls_algo!(
        Cca,
        array![
            [181.56238421, 34.42502589, 57.31447865],
            [205.11767414, 40.23445194, 52.26494323]
        ]
    );
    test_pls_algo!(Svd);

    #[test]
    fn test_one_component_equivalence() -> Result<()> {
        // PlsRegression, PlsSvd and PLSCanonical should all be equivalent when n_components is 1
        let ds = linnerud();
        let regression = PlsRegression::params(1).fit(&ds)?.transform(linnerud());
        let canonical = PlsCanonical::params(1).fit(&ds)?.transform(linnerud());
        let svd = PlsSvd::<f64>::params(1).fit(&ds)?.transform(linnerud());

        assert_abs_diff_eq!(regression.records(), canonical.records(), epsilon = 1e-5);
        assert_abs_diff_eq!(svd.records(), canonical.records(), epsilon = 1e-5);
        Ok(())
    }
}
Loading