diff --git a/Cargo.lock b/Cargo.lock index cd2b252..9f83ae6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,18 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" + [[package]] name = "byteorder" version = "1.5.0" @@ -18,16 +30,71 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" name = "chains-gen" version = "0.1.0" dependencies = [ + "dashmap", + "fnv", "indexmap", "rand", + "rayon", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", + "rayon", +] + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + [[package]] name = "equivalent" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "getrandom" version = "0.2.15" @@ -39,6 +106,12 @@ dependencies = [ "wasi", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.2" @@ -52,7 +125,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.2", + "rayon", ] [[package]] @@ -61,6 +135,35 @@ version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + [[package]] name = "ppv-lite86" version = "0.2.20" @@ -118,6 +221,47 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +dependencies = [ + "bitflags", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + [[package]] name = "syn" version = "2.0.96" @@ -141,6 +285,70 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/Cargo.toml b/Cargo.toml index 400769d..df098d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,5 +4,8 @@ version = "0.1.0" edition = "2024" [dependencies] -indexmap = "2.7.0" +dashmap = { version = "6.1.0", features = ["rayon"] } +fnv = "1.0.7" +indexmap = { version = "2.7.0", features = ["rayon"] } rand = "0.8.5" +rayon = "1.10.0" diff --git a/src/main.rs b/src/main.rs index 71148d3..bfcf6c7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,20 +2,21 @@ use std::time::Instant; use markov::{MarkovAllNodes, MarkovNode, MarkovToken}; +use rayon::{iter::ParallelIterator, str::ParallelString}; + mod markov; fn main() { // chain creation let content = std::fs::read_to_string("./data.txt").unwrap(); - let lines = content.lines(); - let mut all_nodes = MarkovAllNodes::new(); + let all_nodes = MarkovAllNodes::new(); let root_node = MarkovNode::new(MarkovToken::Root); let end_node = MarkovNode::new(MarkovToken::End); let start = Instant::now(); - for line in lines { + content.par_lines().for_each(|line| { let mut nodes = line .split_whitespace() .filter(|s| s.chars().all(|c| c.is_ascii_alphanumeric())) @@ -30,13 +31,20 @@ fn main() { break; } - node.conns.borrow_mut().connect(next_node.clone()); + node.conns.connect(next_node.clone()); node = next_node; } - } + }); println!("took {:?} to create chain", start.elapsed()); + // cache index maps + + let start = Instant::now(); + root_node.conns.index_map(); + all_nodes.cache_index_maps(); + println!("took {:?} to cache index maps", start.elapsed()); + // generation let mut rng = rand::thread_rng(); @@ -52,7 +60,7 @@ fn main() { .unwrap_or_else(|| root_node.clone()); loop { - let next_node = node.conns.borrow().random_weighted(&mut rng); + let next_node = node.conns.random_weighted(&mut rng); if let MarkovToken::Value(value) = &node.value { result += value; diff --git a/src/markov.rs b/src/markov.rs index f70faba..bb6f989 100644 --- a/src/markov.rs +++ b/src/markov.rs @@ -1,8 +1,13 @@ -use std::{cell::RefCell, collections::HashMap, hash::Hash, rc::Rc}; +use std::{ + hash::Hash, + sync::{Arc, OnceLock}, +}; -use indexmap::IndexMap; use rand::{distributions::WeightedIndex, prelude::Distribution, rngs::ThreadRng}; +type FnvDashMap = dashmap::DashMap; +type FnvIndexMap = indexmap::IndexMap; + #[derive(PartialEq, Eq, Hash, Debug)] pub enum MarkovToken { Root, @@ -22,9 +27,14 @@ impl MarkovToken { pub struct MarkovNode { pub value: MarkovToken, - pub conns: RefCell, + pub conns: MarkovConns, } +impl std::fmt::Debug for MarkovNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + self.value.fmt(f) + } +} impl PartialEq for MarkovNode { fn eq(&self, other: &Self) -> bool { self.value.eq(&other.value) @@ -38,55 +48,70 @@ impl Hash for MarkovNode { } impl MarkovNode { - pub fn new(token: MarkovToken) -> Rc { - Rc::new(Self { + pub fn new(token: MarkovToken) -> Arc { + let conns = MarkovConns { + conns: FnvDashMap::default(), + im_cached: OnceLock::new(), + }; + + Arc::new(Self { value: token, - conns: RefCell::new(MarkovConns(IndexMap::new())), + conns, }) } } -pub struct MarkovConns(pub IndexMap, u32>); +pub struct MarkovConns { + conns: FnvDashMap, u32>, + im_cached: OnceLock, u32>>, +} impl MarkovConns { - pub fn connect(&mut self, word: Rc) { - if let Some(count) = self.0.get_mut(&word) { - *count += 1; - } else { - self.0.insert(word, 1); - } + pub fn connect(&self, n: Arc) { + self.conns.entry(n).and_modify(|c| *c += 1).or_insert(1); } - fn index(&self, i: usize) -> Rc { - self.0.get_index(i).map(|(n, _)| n.clone()).unwrap() + pub fn index_map(&self) -> &FnvIndexMap, u32> { + self.im_cached.get_or_init(|| { + self.conns + .iter() + .map(|r| (r.key().clone(), *r.value())) + .collect::>() + }) } - pub fn random_weighted(&self, rng: &mut ThreadRng) -> Rc { - let wi = WeightedIndex::new(self.0.iter().map(|(_, c)| c)).unwrap(); // SAFETY: there should always be at least an end token + pub fn random_weighted(&self, rng: &mut ThreadRng) -> Arc { + let im = self.index_map(); + + let wi = WeightedIndex::new(im.iter().map(|(_, c)| c).copied()).unwrap(); // SAFETY: there should always be at least an end token let i = wi.sample(rng); - self.index(i) + im.get_index(i).map(|(n, _)| n.clone()).unwrap() } } -pub struct MarkovAllNodes(HashMap>); +pub struct MarkovAllNodes(FnvDashMap>); impl MarkovAllNodes { pub fn new() -> Self { - Self(HashMap::new()) + Self(FnvDashMap::default()) } - pub fn node(&mut self, word: &str) -> Rc { - if let Some(node) = self.0.get(word) { - node.clone() - } else { - let node = MarkovNode::new(MarkovToken::Value(word.to_string())); - self.0.insert(word.to_string(), node.clone()); - node - } + pub fn node(&self, word: &str) -> Arc { + self.0 + .entry(word.to_string()) + .or_insert_with(|| MarkovNode::new(MarkovToken::Value(word.to_string()))) + .value() + .clone() } - pub fn try_node(&self, word: &str) -> Option> { - self.0.get(word).cloned() + pub fn try_node(&self, word: &str) -> Option> { + self.0.get(word).map(|r| r.clone()) + } + + pub fn cache_index_maps(&self) { + self.0.iter().for_each(|e| { + e.value().conns.index_map(); + }); } }