From 9debe776f278a5497f4d5c1c51428c720211271c Mon Sep 17 00:00:00 2001 From: minish Date: Tue, 28 Jan 2025 20:17:30 -0500 Subject: [PATCH] Initial commit --- .gitignore | 1 + Cargo.lock | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 8 +++ README.md | 10 ++++ src/main.rs | 73 ++++++++++++++++++++++ src/markov.rs | 92 ++++++++++++++++++++++++++++ 6 files changed, 347 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 src/main.rs create mode 100644 src/markov.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..cd2b252 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,163 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chains-gen" +version = "0.1.0" +dependencies = [ + "indexmap", + "rand", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.15.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + +[[package]] +name = "indexmap" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "syn" +version = "2.0.96" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..400769d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "chains-gen" +version = "0.1.0" +edition = "2024" + +[dependencies] +indexmap = "2.7.0" +rand = "0.8.5" diff --git a/README.md b/README.md new file mode 100644 index 0000000..465355e --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# Chains-gen + +chains-gen is a simple little program to create a Markov chain and then generate some text using it. + +To use it, put your training data inside `data.txt` in a directory, and then run the program there. 
+
+> [!NOTE]
+> You should try to keep each "entry" of data contained in one line, not bleeding out into multiple. The results could be weird if you don't.
+
+If everything is okay you should be able to hit enter and generate sentences.
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..71148d3
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,93 @@
+use std::time::Instant;
+
+use markov::{MarkovAllNodes, MarkovNode, MarkovToken};
+
+mod markov;
+
+/// Builds a Markov chain from `./data.txt`, then prints one generated sentence
+/// for every line read from stdin until stdin reaches end of input.
+///
+/// A line that names a known word starts the sentence at that word; anything
+/// else (including an empty line) starts from the root of the chain.
+fn main() {
+    // chain creation
+
+    let content = std::fs::read_to_string("./data.txt").expect("failed to read ./data.txt");
+    let lines = content.lines();
+
+    let mut all_nodes = MarkovAllNodes::new();
+    let root_node = MarkovNode::new(MarkovToken::Root);
+    let end_node = MarkovNode::new(MarkovToken::End);
+
+    let start = Instant::now();
+    for line in lines {
+        // words containing anything but ASCII letters and digits are dropped
+        let mut nodes = line
+            .split_whitespace()
+            .filter(|s| s.chars().all(|c| c.is_ascii_alphanumeric()))
+            .map(|t| all_nodes.node(t));
+
+        let mut node = root_node.clone();
+        while !node.value.is_end() {
+            let next_node = nodes.next().unwrap_or_else(|| end_node.clone());
+
+            // it will be an empty chain so skip
+            if node.value.is_root() && next_node.value.is_end() {
+                break;
+            }
+
+            node.conns.borrow_mut().connect(next_node.clone());
+
+            node = next_node;
+        }
+    }
+    println!("took {:?} to create chain", start.elapsed());
+
+    // only the root can be left without connections (when no line had a usable
+    // word), and sampling from it would panic, so stop here
+    if root_node.conns.borrow().0.is_empty() {
+        eprintln!("data.txt has no usable words, nothing to generate");
+        return;
+    }
+
+    // generation
+
+    let mut rng = rand::thread_rng();
+    loop {
+        let mut picked_start_word = String::new();
+        let bytes_read = std::io::stdin()
+            .read_line(&mut picked_start_word)
+            .expect("failed to read from stdin");
+
+        // read_line returns 0 at end of input; stop instead of looping forever
+        if bytes_read == 0 {
+            break;
+        }
+
+        let start = Instant::now();
+
+        let mut result = String::new();
+        let mut node = all_nodes
+            .try_node(picked_start_word.trim())
+            .unwrap_or_else(|| root_node.clone());
+
+        loop {
+            let next_node = node.conns.borrow().random_weighted(&mut rng);
+
+            if let MarkovToken::Value(value) = &node.value {
+                result += value;
+                result.push(' ');
+            }
+
+            if next_node.value.is_end() {
+                break;
+            }
+
+            node = next_node;
+        }
+
+        let elapsed = start.elapsed();
+        println!("output: {result}");
+        println!("gen took {elapsed:?}");
+    }
+}
diff --git a/src/markov.rs b/src/markov.rs
new file mode 100644
index 0000000..f70faba
--- /dev/null
+++ b/src/markov.rs
@@ -0,0 +1,109 @@
+use std::{cell::RefCell, collections::HashMap, hash::Hash, rc::Rc};
+
+use indexmap::IndexMap;
+use rand::{distributions::WeightedIndex, prelude::Distribution, rngs::ThreadRng};
+
+/// A position in a chain: the start of a line, a word, or the end of a line.
+#[derive(PartialEq, Eq, Hash, Debug)]
+pub enum MarkovToken {
+    Root,
+    Value(String),
+    End,
+}
+
+impl MarkovToken {
+    /// Returns `true` for [`MarkovToken::Root`].
+    pub fn is_root(&self) -> bool {
+        matches!(self, Self::Root)
+    }
+
+    /// Returns `true` for [`MarkovToken::End`].
+    pub fn is_end(&self) -> bool {
+        matches!(self, Self::End)
+    }
+}
+
+/// A token together with the weighted connections to the nodes seen after it.
+///
+/// Equality and hashing look at `value` only; `conns` sits behind a `RefCell`
+/// so it can be updated through a shared `Rc` handle.
+pub struct MarkovNode {
+    pub value: MarkovToken,
+    pub conns: RefCell<MarkovConns>,
+}
+
+impl PartialEq for MarkovNode {
+    fn eq(&self, other: &Self) -> bool {
+        self.value.eq(&other.value)
+    }
+}
+impl Eq for MarkovNode {}
+impl Hash for MarkovNode {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.value.hash(state);
+    }
+}
+
+impl MarkovNode {
+    /// Creates a shared node for `token` with no connections yet.
+    pub fn new(token: MarkovToken) -> Rc<Self> {
+        Rc::new(Self {
+            value: token,
+            conns: RefCell::new(MarkovConns(IndexMap::new())),
+        })
+    }
+}
+
+/// Follow-up nodes mapped to how many times each one was connected.
+pub struct MarkovConns(pub IndexMap<Rc<MarkovNode>, u32>);
+
+impl MarkovConns {
+    /// Records one more occurrence of `word` as a follow-up.
+    pub fn connect(&mut self, word: Rc<MarkovNode>) {
+        *self.0.entry(word).or_insert(0) += 1;
+    }
+
+    /// Returns the node stored at position `i`; panics if `i` is out of range.
+    fn index(&self, i: usize) -> Rc<MarkovNode> {
+        self.0.get_index(i).map(|(n, _)| n.clone()).unwrap()
+    }
+
+    /// Picks a follow-up node at random, weighted by its connection count.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there are no connections to choose from.
+    pub fn random_weighted(&self, rng: &mut ThreadRng) -> Rc<MarkovNode> {
+        let wi = WeightedIndex::new(self.0.values())
+            .expect("a node that is sampled from always has at least one connection");
+        let i = wi.sample(rng);
+
+        self.index(i)
+    }
+}
+
+/// Lookup table from a word to the single shared node that represents it.
+pub struct MarkovAllNodes(HashMap<String, Rc<MarkovNode>>);
+
+impl MarkovAllNodes {
+    /// Creates an empty table.
+    pub fn new() -> Self {
+        Self(HashMap::new())
+    }
+
+    /// Returns the node for `word`, creating and storing it on first use.
+    pub fn node(&mut self, word: &str) -> Rc<MarkovNode> {
+        if let Some(node) = self.0.get(word) {
+            node.clone()
+        } else {
+            let node = MarkovNode::new(MarkovToken::Value(word.to_string()));
+            self.0.insert(word.to_string(), node.clone());
+            node
+        }
+    }
+
+    /// Returns the node for `word` if it has been seen, without creating one.
+    pub fn try_node(&self, word: &str) -> Option<Rc<MarkovNode>> {
+        self.0.get(word).cloned()
+    }
+}