From 7677d768bb3954fde7c1523f34d894e44856492c Mon Sep 17 00:00:00 2001 From: Mike Ward Date: Mon, 19 Jan 2026 03:53:37 -0600 Subject: [PATCH] examples,strings: add a new `strings.lorem` module; use it in the Markov chain text generator example (#26387) --- examples/lorem.v | 224 +++----------------------------- vlib/strings/lorem/lorem.v | 221 +++++++++++++++++++++++++++++++ vlib/strings/lorem/lorem_test.v | 66 ++++++++++ 3 files changed, 302 insertions(+), 209 deletions(-) create mode 100644 vlib/strings/lorem/lorem.v create mode 100644 vlib/strings/lorem/lorem_test.v diff --git a/examples/lorem.v b/examples/lorem.v index 923545b4d..741a2116b 100644 --- a/examples/lorem.v +++ b/examples/lorem.v @@ -2,23 +2,8 @@ Random Markov Text Generator This program generates pseudo-random text using a Markov chain built from -one of several embedded corpora. It produces structured output in the -form of paragraphs and sentences, with configurable parameters for: - -- Markov order (n-gram size) -- Words per sentence -- Sentences per paragraph -- Paragraph count -- Optional seed phrases and RNG seed -- Optional corpus selection - -Features: - -- Five built-in seed phrases, randomly chosen if no seed is provided -- Paragraphs and sentences with ±20% variability in lengths -- Automatic reseeding from corpus if seed phrases do not exist in the model -- Fully self-contained; no external corpus files required -- Can be run with no parameters and produces readable, multi-paragraph text +one of several embedded corpora in the strings module. It produces structured output in the +form of paragraphs and sentences, with configurable parameters. 
Usage: @@ -29,72 +14,11 @@ Example: ./lorem -order 2 -words 12 -sentences 4 -paragraphs 3 -corpus poe */ -import rand +import strings.lorem import flag import os - -// ---------------- Embedded Corpora ---------------- - -const corpora = { - 'lorem': lorem_corpus - 'poe': poe_corpus - 'darwin': darwin_corpus - 'bard': shakespeare_corpus -} - -const lorem_corpus = ' -lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor -incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam quis -nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore -eu fugiat nulla pariatur Excepteur sint occaecat cupidatat non proident sunt -in culpa qui officia deserunt mollit anim id est laborum -' - -const poe_corpus = ' -once upon a midnight dreary while I pondered weak and weary -over many a quaint and curious volume of forgotten lore -while I nodded nearly napping suddenly there came a tapping -as of someone gently rapping at my chamber door -' - -const darwin_corpus = ' -when we look to the individuals of the same variety or sub variety of our -older cultivated plants and animals one of the first points which strikes -us is that they generally differ much more from each other than do the -individuals of any one species or variety in a state of nature -' - -const shakespeare_corpus = ' -to be or not to be that is the question -all the worlds a stage and all the men and women merely players -the lady doth protest too much methinks -a rose by any other name would smell as sweet -et tu brute -if music be the food of love play on -now is the winter of our discontent -we are such stuff as dreams are made on -brevity is the soul of wit -some are born great some achieve greatness and some have greatness thrust upon them -cry havoc and let slip the dogs of war -all that glisters is not gold -the fault dear brutus is not in our stars but in ourselves -to thine own 
self be true -lord what fools these mortals be -shall i compare thee to a summers day -' - -// ---------------- Seed Phrases ---------------- - -const seed_phrases = [ - 'in the beginning', - 'once upon a time', - 'it was the first', - 'when we consider', - 'there was a moment', -] - -// ---------------- Main ---------------- +import rand +import time fn main() { mut fp := flag.new_flag_parser(os.args[1..]) @@ -106,17 +30,23 @@ fn main() { words_per_sentence := fp.int('words', `w`, 10, 'Words per sentence [default: 10]') sentences_per_paragraph := fp.int('sentences', `s`, 5, 'Sentences per paragraph [default: 5]') paragraphs := fp.int('paragraphs', `p`, 3, 'Paragraph count [default: 3]') - corpus_name := fp.string('corpus', `c`, 'lorem', 'Corpus name (lorem, poe, darwin, bard) [default: lorem') + corpus_name := fp.string('corpus', `c`, 'lorem', 'Corpus name (lorem, poe, darwin, bard) [default: lorem]') seed_text := fp.string('seed', `S`, '', 'Seed phrase (random if omitted)') - rng_seed := fp.int('rngseed', `r`, 0, 'RNG seed (0 = non-deterministic)') + mut rng_seed := fp.int('rngseed', `r`, 0, 'RNG seed (0 = random)') fp.finalize() or { eprintln(err) return } - text := generate_text( - order: order + if rng_seed == 0 { + t := time.now().unix_milli() + rand.seed([u32(t), u32(t >> 32)]) + rng_seed = rand.int() + } + + text := lorem.generate( + markov_order: order words_per_sentence: words_per_sentence sentences_per_paragraph: sentences_per_paragraph paragraphs: paragraphs @@ -127,127 +57,3 @@ fn main() { println(text) } - -struct LoremCfg { - order int = 2 - words_per_sentence int = 10 - sentences_per_paragraph int = 5 - paragraphs int = 3 - corpus_name string - seed_text string - rng_seed int -} - -// ---------------- Text Generation ---------------- - -fn generate_text(cfg LoremCfg) string { - if cfg.rng_seed != 0 { - rand.seed([u32(cfg.rng_seed)]) - } - - seed := match cfg.seed_text != '' { - true { cfg.seed_text } - else { random_seed_phrase() } - } - - corpus := 
select_corpus(cfg.corpus_name) - tokens := tokenize(corpus) - - if tokens.len <= cfg.order { - eprintln('corpus too small for selected order') - return '' - } - - model := build_markov(tokens, cfg.order) - - mut state := tokenize(seed) - if state.len < cfg.order { - start := rand.intn(tokens.len - cfg.order) or { 0 } - state = tokens[start..start + cfg.order].clone() - } - - mut out := []string{} - - for pi in 0 .. cfg.paragraphs { - if pi != 0 { - out << '\n\n' - } - sentences := vary(cfg.sentences_per_paragraph, 1) - - for si in 0 .. sentences { - if si != 0 { - out << ' ' - } - words := vary(cfg.words_per_sentence, 3) - mut sentence := []string{} - - for _ in 0 .. words { - key := state.join('\u0001') - nexts := model[key] or { - start := rand.intn(tokens.len - cfg.order) or { 0 } - state = tokens[start..start + cfg.order].clone() - continue - } - - next := nexts[rand.intn(nexts.len) or { 0 }] - sentence << next - - state = state[1..].clone() - state << next - } - - if sentence.len > 0 { - out << sentence.join(' ').capitalize() - out << '.' - } - } - } - - return out.join('') -} - -// ---------------- Utilities ---------------- - -fn vary(base int, min int) int { - delta := int(f32(base) * 0.2) - if delta == 0 { - return base - } - offset := rand.intn(delta * 2 + 1) or { 0 } - delta - val := base + offset - return if val < min { min } else { val } -} - -fn select_corpus(name string) string { - if name != '' { - if corpus := corpora[name] { - return corpus - } - eprintln('unknown corpus: ${name}') - exit(1) - } - - keys := corpora.keys() - key := keys[rand.intn(keys.len) or { 0 }] - return corpora[key] -} - -fn random_seed_phrase() string { - return seed_phrases[rand.intn(seed_phrases.len) or { 0 }] -} - -fn tokenize(text string) []string { - return text - .replace_each(['\n', ' ', '\t', ' ']) - .split(' ') - .filter(it.len > 0) -} - -fn build_markov(tokens []string, order int) map[string][]string { - mut model := map[string][]string{} - for i in 0 .. 
tokens.len - order { - key := tokens[i..i + order].join('\u0001') - model[key] << tokens[i + order] - } - return model -} diff --git a/vlib/strings/lorem/lorem.v b/vlib/strings/lorem/lorem.v new file mode 100644 index 000000000..2b3cfa629 --- /dev/null +++ b/vlib/strings/lorem/lorem.v @@ -0,0 +1,221 @@ +module lorem + +const lorem_corpora = { + 'lorem': lorem_corpus + 'poe': poe_corpus + 'darwin': darwin_corpus + 'bard': shakespeare_corpus +} + +const lorem_corpus = ' +lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore +eu fugiat nulla pariatur Excepteur sint occaecat cupidatat non proident sunt +in culpa qui officia deserunt mollit anim id est laborum +' + +const poe_corpus = ' +once upon a midnight dreary while I pondered weak and weary +over many a quaint and curious volume of forgotten lore +while I nodded nearly napping suddenly there came a tapping +as of someone gently rapping at my chamber door +' + +const darwin_corpus = ' +when we look to the individuals of the same variety or sub variety of our +older cultivated plants and animals one of the first points which strikes +us is that they generally differ much more from each other than do the +individuals of any one species or variety in a state of nature +' + +const shakespeare_corpus = ' +to be or not to be that is the question +all the worlds a stage and all the men and women merely players +the lady doth protest too much methinks +a rose by any other name would smell as sweet +et tu brute +if music be the food of love play on +now is the winter of our discontent +we are such stuff as dreams are made on +brevity is the soul of wit +some are born great some achieve greatness and some have greatness thrust upon them +cry havoc and let slip the dogs of war 
+all that glisters is not gold
+the fault dear brutus is not in our stars but in ourselves
+to thine own self be true
+lord what fools these mortals be
+shall i compare thee to a summers day
+'
+
+// ---------------- Seed Phrases ----------------
+
+// lorem_seed_phrases holds the phrases from which the starting state is
+// picked when `LoremCfg.seed_text` is empty.
+const lorem_seed_phrases = [
+	'in the beginning',
+	'once upon a time',
+	'it was the first',
+	'when we consider',
+	'there was a moment',
+]
+
+// ---------------- Text Generation ----------------
+
+// LoremCfg holds the parameters accepted by `generate`.
+// - `words_per_sentence`, `sentences_per_paragraph`: base sizes; each sentence
+//   and paragraph varies around them (see lorem_vary)
+// - `paragraphs`: number of paragraphs to produce
+// - `markov_order`: number of preceding words that form one chain state; must be >= 1
+// - `corpus_name`: 'lorem', 'poe', 'darwin' or 'bard'; picked by the internal
+//   PRNG when empty
+// - `seed_text`: starting phrase; one of the built-in seed phrases is picked by
+//   the internal PRNG when empty
+// - `rng_seed`: seed of the internal PRNG; 0 selects a fixed built-in seed, so
+//   identical configurations always produce identical text
+@[params]
+pub struct LoremCfg {
+pub:
+	words_per_sentence      int = 10
+	sentences_per_paragraph int = 5
+	paragraphs              int = 3
+	markov_order            int = 2
+	corpus_name             string
+	seed_text               string
+	rng_seed                int
+}
+
+// generate returns pseudo-random text produced by a Markov chain built from
+// one of the embedded corpora. Paragraphs are separated by a blank line
+// ("\n\n"), sentences by a single space; every sentence is capitalized and
+// ends with a period. Configurable through `cfg`:
+//
+// - Markov order (n-gram size)
+// - Words per sentence
+// - Sentences per paragraph
+// - Paragraph count
+// - Optional seed phrase and RNG seed
+// - Optional corpus selection
+//
+// It prints a message to stderr and returns an empty string when
+// `cfg.markov_order` is below 1 or the corpus has too few words for the
+// chosen order. An unknown `cfg.corpus_name` terminates the program
+// (see lorem_select_corpus).
+pub fn generate(cfg LoremCfg) string {
+	// Orders below 1 cannot form a usable n-gram state.
+	if cfg.markov_order < 1 {
+		eprintln('markov order must be at least 1')
+		return ''
+	}
+
+	// rng_seed == 0 falls back to a fixed seed, so the default output is
+	// reproducible from run to run.
+	initial_seed := if cfg.rng_seed != 0 { u32(cfg.rng_seed) } else { u32(123456789) }
+	mut rng := LorumRNG{
+		seed: initial_seed
+	}
+
+	seed := if cfg.seed_text != '' { cfg.seed_text } else { lorem_random_seed_phrase(mut rng) }
+
+	corpus := lorem_select_corpus(mut rng, cfg.corpus_name)
+	tokens := lorem_tokenize(corpus)
+
+	if tokens.len <= cfg.markov_order {
+		eprintln('corpus too small for selected order')
+		return ''
+	}
+
+	model := lorem_build_markov(tokens, cfg.markov_order)
+
+	// The state must hold exactly `markov_order` words, otherwise its joined
+	// key can never match a key of the model.
+	mut state := lorem_tokenize(seed)
+	if state.len < cfg.markov_order {
+		// Seed too short: start from a random n-gram of the corpus instead.
+		start := rng.intn(tokens.len - cfg.markov_order)
+		state = tokens[start..start + cfg.markov_order].clone()
+	} else if state.len > cfg.markov_order {
+		// Seed too long: keep only its trailing words.
+		state = state[state.len - cfg.markov_order..].clone()
+	}
+
+	mut out := []string{}
+
+	for pi in 0 .. cfg.paragraphs {
+		if pi != 0 {
+			out << '\n\n'
+		}
+		sentences := lorem_vary(mut rng, cfg.sentences_per_paragraph, 1)
+
+		for si in 0 .. sentences {
+			if si != 0 {
+				out << ' '
+			}
+			words := lorem_vary(mut rng, cfg.words_per_sentence, 3)
+			mut sentence := []string{}
+
+			// Loop on the produced word count, so that restarting the chain
+			// does not shorten the sentence.
+			for sentence.len < words {
+				key := state.join('\u0001')
+				nexts := model[key] or {
+					// Unknown state (seed not in the corpus, or the corpus
+					// tail was reached): restart from a random n-gram. Every
+					// n-gram picked this way has a successor in the model,
+					// so the next lookup succeeds.
+					start := rng.intn(tokens.len - cfg.markov_order)
+					state = tokens[start..start + cfg.markov_order].clone()
+					continue
+				}
+
+				next := nexts[rng.intn(nexts.len)]
+				sentence << next
+
+				// Slide the state window forward by one word.
+				state = state[1..].clone()
+				state << next
+			}
+
+			if sentence.len > 0 {
+				out << sentence.join(' ').capitalize()
+				out << '.'
+			}
+		}
+	}
+
+	return out.join('')
+}
+
+// ---------------- Utilities ----------------
+
+// LorumRNG is a simple internal PRNG used to avoid importing the `rand` module,
+// which would cause an import cycle (strings -> rand -> time -> ... -> strings).
+struct LorumRNG {
+mut:
+	seed u32
+}
+
+// intn returns a pseudo-random number in [0, max), or 0 when max <= 0.
+// It is a minimal implementation sufficient for text generation.
+fn (mut r LorumRNG) intn(max int) int {
+	if max <= 0 {
+		return 0
+	}
+	// Advance the linear congruential state.
+	r.seed = r.seed * 1664525 + 1013904223
+	// Scale the state into [0, max) through its high bits instead of using
+	// `seed % max`: the low bits of this generator repeat with very short
+	// periods (bit 0 simply alternates), so `% 2` would yield 0,1,0,1,...
+	return int((u64(r.seed) * u64(max)) >> 32)
+}
+
+// lorem_vary returns `base` shifted by a pseudo-random offset of at most 20%
+// of `base` (truncated to an integer) in either direction, but never less
+// than `min`. When that 20% truncates to zero, `base` is returned unchanged
+// and the `min` floor is not applied.
+fn lorem_vary(mut rng LorumRNG, base int, min int) int {
+	delta := int(f32(base) * 0.2)
+	if delta == 0 {
+		return base
+	}
+	// intn yields [0, 2*delta], which is shifted to [-delta, +delta].
+	offset := rng.intn(delta * 2 + 1) - delta
+	val := base + offset
+	return if val < min { min } else { val }
+}
+
+// lorem_select_corpus returns the corpus text registered under `name`.
+// An empty `name` picks one of the registered corpora using `rng`.
+// An unknown `name` prints an error to stderr and terminates the process
+// with exit code 1.
+fn lorem_select_corpus(mut rng LorumRNG, name string) string {
+	if name != '' {
+		if corpus := lorem_corpora[name] {
+			return corpus
+		}
+		eprintln('unknown corpus: ${name}')
+		exit(1)
+	}
+
+	keys := lorem_corpora.keys()
+	key := keys[rng.intn(keys.len)]
+	return lorem_corpora[key]
+}
+
+// lorem_random_seed_phrase returns one of `lorem_seed_phrases`, chosen using `rng`.
+fn lorem_random_seed_phrase(mut rng LorumRNG) string {
+	return lorem_seed_phrases[rng.intn(lorem_seed_phrases.len)]
+}
+
+// lorem_tokenize splits `text` into words on spaces, newlines and tabs,
+// dropping the empty strings produced by consecutive separators.
+fn lorem_tokenize(text string) []string {
+	return text
+		.replace_each(['\n', ' ', '\t', ' '])
+		.split(' ')
+		.filter(it.len > 0)
+}
+
+// lorem_build_markov maps every run of `order` consecutive tokens (joined with
+// '\u0001') to the list of tokens that follow that run in `tokens`. A follower
+// is listed once per occurrence, so frequent continuations are drawn more often.
+fn lorem_build_markov(tokens []string, order int) map[string][]string {
+	mut model := map[string][]string{}
+	for i in 0 .. tokens.len - order {
+		key := tokens[i..i + order].join('\u0001')
+		model[key] << tokens[i + order]
+	}
+	return model
+}
diff --git a/vlib/strings/lorem/lorem_test.v b/vlib/strings/lorem/lorem_test.v
new file mode 100644
index 000000000..e741a8f2a
--- /dev/null
+++ b/vlib/strings/lorem/lorem_test.v
@@ -0,0 +1,66 @@
+module lorem
+
+fn test_lorem_generate_basic() {
+	output := generate(LoremCfg{
+		paragraphs:              2
+		sentences_per_paragraph: 3
+		words_per_sentence:      5
+	})
+	assert output.len > 0
+	// 2 paragraphs should be separated by "\n\n"
+	assert output.count('\n\n') == 1
+}
+
+fn test_lorem_generate_deterministic() {
+	cfg := LoremCfg{
+		rng_seed:   12345
+		paragraphs: 1
+	}
+	out1 := generate(cfg)
+	out2 := generate(cfg)
+	assert out1 == out2
+}
+
+fn test_lorem_generate_counts() {
+	cfg := LoremCfg{
+		paragraphs: 3
+	}
+	output := generate(cfg)
+	// There should be 2 separators for 3 paragraphs
+	assert output.count('\n\n') == 2
+}
+
+fn test_lorem_custom_corpus() {
+	// 'bard' is shakespeare
+	cfg := LoremCfg{
+		corpus_name: 'bard'
+		rng_seed:    999
+		paragraphs:  1
+	}
+	output := generate(cfg)
+	assert output.len > 0
+	// Hard to check exact content due to randomness, but it should not crash
+}
+
+fn test_lorem_vary() {
+	// lorem_vary is private, effectively testing internal logic
+	// base 10, min 5
+	// delta = 10 * 0.2 = 2
+	// range = [-2, 2]
+	// result = 10 + [-2, 2] -> [8, 12]
+	// We can't guarantee a specific value, but we can check bounds
+	mut rng := LorumRNG{
+		seed: 12345
+	}
+	for _ in 0 .. 100 {
+		val := lorem_vary(mut rng, 10, 5)
+		assert val >= 8
+		assert val <= 12
+	}
+}
+
+fn test_lorem_tokenize() {
+	text := 'Hello\nWorld Test'
+	tokens := lorem_tokenize(text)
+	assert tokens == ['Hello', 'World', 'Test']
+}
-- 
2.39.5