coreutils/src/uu/shuf/src/nonrepeating_iterator.rs at f856a4642569080ca59fc7178a91f43714e6f1a0 · oech3/coreutils · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
// hijack HashMap for performance
type HashMap<K, V> = std::collections::HashMap<K, V, rustc_hash::FxBuildHasher>;

use std::ops::RangeInclusive;

use uucore::error::UResult;

use crate::WrappedRng;

/// An iterator that samples from an integer range without repetition.
///
/// This is based on Fisher-Yates, and it's required for backward compatibility
/// that it behaves exactly like Fisher-Yates if --random-source or --random-seed
/// is used. But we have a few tricks:
///
/// - In the beginning we use a hash table instead of an array. This way we lazily
///   keep track of swaps without allocating the entire range upfront.
///
/// - When the hash table starts to get big relative to the remaining items
///   we switch over to an array.
///
/// - We store the array backwards so that we can shrink it as we go and free excess
///   memory every now and then.
///
/// Both the hash table and the array give the same output.
///
/// There's room for optimization:
///
/// - Switching over from the hash table to the array is costly. If we happen to know
///   (through --head-count) that only few draws remain then it would be better not
///   to switch.
///
/// - If the entire range gets used then we might as well allocate an array to start
///   with. But if the user e.g. pipes through `head` rather than using --head-count
///   we can't know whether that's the case, so there's a tradeoff.
///
///   GNU decides the other way: --head-count is noticeably faster than | head.
pub(crate) struct NonrepeatingIterator<'a> {
    rng: &'a mut WrappedRng,
    values: Values,
}

enum Values {
    Full(Vec<u64>),
    Sparse(RangeInclusive<u64>, HashMap<u64, u64>),
}

impl<'a> NonrepeatingIterator<'a> {
    pub(crate) fn new(range: RangeInclusive<u64>, rng: &'a mut WrappedRng) -> Self {
        let values = Values::Sparse(range, HashMap::default());
        NonrepeatingIterator { rng, values }
    }

    fn produce(&mut self) -> UResult<u64> {
        match &mut self.values {
            Values::Full(items) => {
                let this_idx = items.len() - 1;

                let other_idx = self.rng.choose_from_range(0..=items.len() as u64 - 1)? as usize;
                // Flip the index to pretend we're going left-to-right
                let other_idx = items.len() - other_idx - 1;

                items.swap(this_idx, other_idx);

                let val = items.pop().unwrap();
                if items.len().is_power_of_two() && items.len() >= 512 {
                    items.shrink_to_fit();
                }
                Ok(val)
            }
            Values::Sparse(range, items) => {
                let this_idx = *range.start();
                let this_val = items.remove(&this_idx).unwrap_or(this_idx);

                let other_idx = self.rng.choose_from_range(range.clone())?;

                let val = if this_idx == other_idx {
                    this_val
                } else {
                    items.insert(other_idx, this_val).unwrap_or(other_idx)
                };
                *range = *range.start() + 1..=*range.end();

                Ok(val)
            }
        }
    }
}

impl Iterator for NonrepeatingIterator<'_> {
    type Item = UResult<u64>;

    fn next(&mut self) -> Option<Self::Item> {
        match &self.values {
            Values::Full(items) if items.is_empty() => return None,
            Values::Full(_) => (),
            Values::Sparse(range, _) if range.is_empty() => return None,
            Values::Sparse(range, items) => {
                let range_len = range.size_hint().0 as u64;
                if items.len() as u64 >= range_len / 8 {
                    self.values = Values::Full(hashmap_to_vec(range.clone(), items));
                }
            }
        }

        Some(self.produce())
    }
}

fn hashmap_to_vec(range: RangeInclusive<u64>, map: &HashMap<u64, u64>) -> Vec<u64> {
    let lookup = |idx| *map.get(&idx).unwrap_or(&idx);
    range.rev().map(lookup).collect()
}