Statistics

Calculate Measures of Central Tendency

std cat-science

These examples calculate measures of central tendency for a data set contained within a Rust array. There may be no mean, median or mode to calculate for an empty set of data, so each function returns an std::option::Option⮳ to be handled by the caller.

The first example calculates the mean (the sum of all measurements divided by the number of measurements in the set) by producing an iterator of references over the data, and using std::iter::Iterator::sum⮳ and len⮳ to determine the total value and count of values respectively.

/// Prints the arithmetic mean of a fixed sample data set.
///
/// The mean is wrapped in an `Option`, so an empty data set would produce
/// `None` rather than a meaningless quotient.
fn main() {
    // Measurements to average.
    let samples = [3, 1, 6, 1, 5, 8, 1, 8, 10, 11];

    // Number of measurements, and their total converted to floating point.
    let n = samples.len();
    let total = samples.iter().sum::<i32>() as f32;

    // Only divide when there is at least one measurement.
    let mean = if n > 0 { Some(total / n as f32) } else { None };

    println!("Mean of the data is {:?}", mean);
}

The second example calculates the median using the quickselect algorithm, which avoids a full sort⮳ by sorting only partitions of the data set known to possibly contain the median. This uses std::cmp::Ord::cmp⮳ and std::cmp::Ordering⮳ to succinctly decide the next partition to examine, and split_at⮳ to choose an arbitrary pivot for the next partition at each step.

use std::cmp::Ordering;

/// Splits a slice of integers around its first element.
///
/// The first element is taken as the pivot. Every remaining element is copied,
/// in its original order, into one of two vectors:
/// - `left`: elements strictly less than the pivot
/// - `right`: elements greater than or equal to the pivot
///
/// Returns `None` for an empty slice, otherwise `Some((left, pivot, right))`.
fn partition(data: &[i32]) -> Option<(Vec<i32>, i32, Vec<i32>)> {
    // An empty slice has no pivot to partition around.
    if data.is_empty() {
        return None;
    }

    // Peel the pivot off the front; `rest` holds everything after it.
    let (head, rest) = data.split_at(1);
    let pivot = head[0];

    let mut left = Vec::new();
    let mut right = Vec::new();
    for &value in rest {
        if value < pivot {
            left.push(value);
        } else {
            // Values equal to the pivot go to the right-hand side.
            right.push(value);
        }
    }

    Some((left, pivot, right))
}

/// Selects the k-th smallest element from a slice of integers.
///
/// This function uses the quickselect algorithm to find the k-th smallest
/// element in the slice. It returns `None` if the slice is empty or if `k` is
/// out of bounds.
///
/// # Arguments
///
/// * `data` - The slice of integers to search.
/// * `k` - The index of the element to select (0-based).
fn select(data: &[i32], k: usize) -> Option<i32> {
    // Partition the data around a pivot.
    let part = partition(data);

    // Handle the result of the partition.
    match part {
        None => None,
        Some((left, pivot, right)) => {
            let pivot_idx = left.len();

            match pivot_idx.cmp(&k) {
                Ordering::Equal => Some(pivot),
                Ordering::Greater => select(&left, k),
                Ordering::Less => select(&right, k - (pivot_idx + 1)),
            }
        }
    }
}

/// Calculates the median of a slice of integers.
///
/// The median is the middle value in a sorted list of numbers. If the list has
/// an even number of elements, the median is the average of the two middle
/// numbers.
///
/// # Arguments
///
/// * `data` - The slice of integers to calculate the median from.
///
/// # Returns
///
/// Returns `None` for an empty slice, otherwise `Some` with the median as an
/// `f32`.
fn median(data: &[i32]) -> Option<f32> {
    // Get the size of the data slice.
    let size = data.len();

    match size {
        // An empty slice has no median. This case must come first: zero is
        // even, and computing `(0 / 2) - 1` below would underflow `usize`.
        0 => None,
        // If the size is even, calculate the average of the two middle
        // elements. The two middle elements are at indices (size / 2) -
        // 1 and size / 2. Use the `select` function to find these
        // elements.
        even if even % 2 == 0 => {
            let fst = select(data, (even / 2) - 1)?;
            let snd = select(data, even / 2)?;

            // Add in `i64` so that two large `i32` values cannot overflow.
            Some((i64::from(fst) + i64::from(snd)) as f32 / 2.0)
        }
        // If the size is odd, the median is the middle element.
        // The middle element is at index size / 2.
        odd => select(data, odd / 2).map(|x| x as f32),
    }
}

/// Demonstrates `partition`, `select` and `median` on a fixed sample data set.
fn main() {
    let data = [3, 1, 6, 1, 5, 8, 1, 8, 10, 11];
    // Sorted position requested from `select`.
    let k = 5;

    println!("Partition is {:?}", partition(&data));
    println!("Selection at ordered index {} is {:?}", k, select(&data, k));
    println!("Median is {:?}", median(&data));
}

The final example calculates the mode using a mutable std::collections::HashMap⮳ to collect counts of each distinct integer from the set, using a std::iter::Iterator::fold⮳ and the std::collections::hash_map::Entry⮳ API. The most frequent value in the std::collections::HashMap⮳ surfaces with std::iter::Iterator::max_by_key⮳.

use std::collections::HashMap;

/// Prints the mode of a fixed sample data set.
///
/// The mode is the value that occurs most often in the data.
fn main() {
    let data = [3, 1, 6, 1, 5, 8, 1, 8, 10, 11];

    // Tally how many times each distinct value occurs.
    let tally = data
        .iter()
        .copied()
        .fold(HashMap::<i32, i32>::new(), |mut acc, sample| {
            // Start a missing entry at zero, then count this occurrence.
            *acc.entry(sample).or_insert(0) += 1;
            acc
        });

    // Keep the value whose tally is the largest; `None` if there was no data.
    let mode = tally
        .into_iter()
        .max_by_key(|&(_, occurrences)| occurrences)
        .map(|(sample, _)| sample);

    println!("Mode of the data is {:?}", mode);
}

Compute the Standard Deviation

std cat-science

This example calculates the standard deviation and z-score of a set of measurements.

The standard deviation is defined as the square root of the variance (here calculated with f32's sqrt⮳), where the variance is the std::iter::Iterator::sum⮳ of the squared differences between each measurement and the mean, divided by the number of measurements.

The z-score is the number of standard deviations by which a single measurement lies away from the mean of the data set.

/// Calculates the mean (average) of a slice of i32 integers.
///
/// The values are summed in `i64`, so the total cannot overflow even when the
/// slice holds many large `i32` values.
///
/// # Arguments
///
/// * `data` - A slice of i32 integers.
///
/// # Returns
///
/// Returns `Some(f32)` containing the mean if the slice is not empty, otherwise
/// returns `None`.
fn mean(data: &[i32]) -> Option<f32> {
    // There is no mean of zero measurements.
    if data.is_empty() {
        return None;
    }

    // Widen each value before adding: an `i32` accumulator would panic in
    // debug builds (or wrap in release builds) once the total exceeds
    // `i32::MAX`.
    let sum: i64 = data.iter().map(|&value| i64::from(value)).sum();

    Some(sum as f32 / data.len() as f32)
}

/// Computes the standard deviation of a slice of i32 integers.
///
/// The variance is the sum of the squared distances of every value from the
/// mean, divided by the number of values; the standard deviation is its
/// square root.
///
/// # Arguments
///
/// * `data` - A slice of i32 integers.
///
/// # Returns
///
/// Returns `Some(f32)` with the standard deviation when the slice is not
/// empty, otherwise `None`.
fn std_deviation(data: &[i32]) -> Option<f32> {
    let count = data.len();

    // Without a mean, or without any values, there is nothing to compute.
    let data_mean = match mean(data) {
        Some(value) if count > 0 => value,
        _ => return None,
    };

    // Squared distance of each measurement from the mean.
    let squared_diffs = data.iter().map(|&value| {
        let diff = data_mean - value as f32;
        diff * diff
    });

    let variance = squared_diffs.sum::<f32>() / count as f32;

    Some(variance.sqrt())
}

/// Prints the mean and standard deviation of a fixed sample data set, and the
/// z-score of the measurement stored at index 4.
fn main() {
    let data = [3, 1, 6, 1, 5, 8, 1, 8, 10, 11];

    let data_mean = mean(&data);
    println!("Mean is {:?}", data_mean);

    let data_std_deviation = std_deviation(&data);
    println!("Standard deviation is {:?}", data_std_deviation);

    // The measurement whose z-score is reported.
    let sample = data[4];

    // The z-score needs both statistics; if either is missing it is `None`.
    let zscore = if let (Some(center), Some(spread)) = (data_mean, data_std_deviation) {
        Some((sample as f32 - center) / spread)
    } else {
        None
    };

    println!(
        "Z-score of data at index 4 (with value {}) is {:?}",
        sample, zscore
    );
}