| 1 | module lorem |
| 2 | |
// lorem_corpora maps each corpus name accepted by `LoremCfg.corpus_name`
// to the embedded text that the Markov model is built from.
// `lorem_select_corpus` looks names up here and also picks from these keys
// when no name is given.
const lorem_corpora = {
	'lorem': lorem_corpus
	'poe': poe_corpus
	'darwin': darwin_corpus
	'bard': shakespeare_corpus
}
| 9 | |
// lorem_corpus is the embedded text registered under the 'lorem' key.
// It is stored as whitespace-separated words without punctuation, so that
// `lorem_tokenize` can split it into plain word tokens.
const lorem_corpus = '
lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore
eu fugiat nulla pariatur Excepteur sint occaecat cupidatat non proident sunt
in culpa qui officia deserunt mollit anim id est laborum
'
| 18 | |
// poe_corpus is the embedded text registered under the 'poe' key.
// It is stored as whitespace-separated words without punctuation.
const poe_corpus = '
once upon a midnight dreary while I pondered weak and weary
over many a quaint and curious volume of forgotten lore
while I nodded nearly napping suddenly there came a tapping
as of someone gently rapping at my chamber door
'
| 25 | |
// darwin_corpus is the embedded text registered under the 'darwin' key.
// It is stored as whitespace-separated words without punctuation.
const darwin_corpus = '
when we look to the individuals of the same variety or sub variety of our
older cultivated plants and animals one of the first points which strikes
us is that they generally differ much more from each other than do the
individuals of any one species or variety in a state of nature
'
| 32 | |
// shakespeare_corpus is the embedded text registered under the 'bard' key.
// It holds one short quotation per line, lowercase and without punctuation.
// Line boundaries are not preserved by `lorem_tokenize`, so the Markov model
// sees the quotations as one continuous word stream.
const shakespeare_corpus = '
to be or not to be that is the question
all the worlds a stage and all the men and women merely players
the lady doth protest too much methinks
a rose by any other name would smell as sweet
et tu brute
if music be the food of love play on
now is the winter of our discontent
we are such stuff as dreams are made on
brevity is the soul of wit
some are born great some achieve greatness and some have greatness thrust upon them
cry havoc and let slip the dogs of war
all that glisters is not gold
the fault dear brutus is not in our stars but in ourselves
to thine own self be true
lord what fools these mortals be
shall i compare thee to a summers day
'
| 51 | |
| 52 | // ---------------- Seed Phrases ---------------- |
| 53 | |
// lorem_seed_phrases are the fallback seed texts; `lorem_random_seed_phrase`
// picks one of them when `LoremCfg.seed_text` is empty.
const lorem_seed_phrases = [
	'in the beginning',
	'once upon a time',
	'it was the first',
	'when we consider',
	'there was a moment',
]
| 61 | |
| 62 | // ---------------- Text Generation ---------------- |
| 63 | |
// LoremCfg holds the optional settings accepted by `generate`.
// - `corpus_name`: when empty, a corpus is picked using the internal RNG;
//   an unknown name prints an error and terminates the process
// - `seed_text`: when empty, one of the built-in seed phrases is picked
//   using the internal RNG
// - `rng_seed`: when 0 (not specified), a fixed default seed is used, so the
//   output is deterministic rather than random
@[params]
pub struct LoremCfg {
pub:
	// base number of words per sentence; each sentence varies by up to about 20 % around it
	words_per_sentence int = 10
	// base number of sentences per paragraph; each paragraph varies by up to about 20 % around it
	sentences_per_paragraph int = 5
	// number of paragraphs to emit (not varied)
	paragraphs int = 3
	// number of consecutive words that form one Markov state (n-gram size)
	markov_order int = 2
	// key into the embedded corpora ('lorem', 'poe', 'darwin', 'bard'); empty = pick one
	corpus_name string
	// text tokenized into the initial Markov state; empty = pick a built-in phrase
	seed_text string
	// seed for the internal generator; 0 = use the fixed default seed
	rng_seed int
}
| 79 | |
// generate returns pseudo-random text produced by a Markov chain built from
// one of several embedded corpora. It produces structured output in the
// form of paragraphs and sentences, with configurable parameters for:
//
// - Markov order (n-gram size)
// - Words per sentence
// - Sentences per paragraph
// - Paragraph count
// - Optional seed phrases and RNG seed
// - Optional corpus selection
//
// Paragraphs are separated by a blank line, sentences by a single space;
// each sentence is capitalized and terminated with a period.
// The seed text only selects the initial Markov state, it is not emitted.
// An empty string is returned (after printing a message to stderr) when
// `markov_order` is smaller than 1 or the corpus has too few words for it.
pub fn generate(cfg LoremCfg) string {
	// A state needs at least one word: with an order below 1 the slicing
	// further down (`state[1..]`, `tokens[start..start + order]`) would panic.
	if cfg.markov_order < 1 {
		eprintln('markov order must be at least 1')
		return ''
	}

	// Initialize the LCG with the provided seed.
	// rng_seed == 0 means "not specified" and maps to a fixed default seed,
	// which makes the output deterministic by default.
	initial_seed := if cfg.rng_seed != 0 { u32(cfg.rng_seed) } else { u32(123456789) }
	mut rng := LorumRNG{
		seed: initial_seed
	}

	seed := if cfg.seed_text != '' { cfg.seed_text } else { lorem_random_seed_phrase(mut rng) }

	corpus := lorem_select_corpus(mut rng, cfg.corpus_name)
	tokens := lorem_tokenize(corpus)

	if tokens.len <= cfg.markov_order {
		eprintln('corpus too small for selected order')
		return ''
	}

	model := lorem_build_markov(tokens, cfg.markov_order)

	mut state := lorem_tokenize(seed)
	if state.len > cfg.markov_order {
		// Model keys consist of exactly `markov_order` words, so a longer seed
		// could never match; keep only its trailing words as the state.
		state = state[state.len - cfg.markov_order..].clone()
	} else if state.len < cfg.markov_order {
		// Seed too short to form a state: start from a random corpus position.
		state = lorem_random_state(mut rng, tokens, cfg.markov_order)
	}

	mut out := []string{}

	for pi in 0 .. cfg.paragraphs {
		if pi != 0 {
			out << '\n\n'
		}
		sentences := lorem_vary(mut rng, cfg.sentences_per_paragraph, 1)

		for si in 0 .. sentences {
			if si != 0 {
				out << ' '
			}
			words := lorem_vary(mut rng, cfg.words_per_sentence, 3)
			mut sentence := []string{}

			for _ in 0 .. words {
				key := state.join('\u0001')
				nexts := model[key] or {
					// Unknown state (seed not in the corpus, or the end of the
					// corpus was reached): restart from a random position.
					// The restart uses up this word slot, so the sentence may
					// come out one word shorter.
					state = lorem_random_state(mut rng, tokens, cfg.markov_order)
					continue
				}

				next := nexts[rng.intn(nexts.len)]
				sentence << next

				// Slide the state window forward by one word.
				state = state[1..].clone()
				state << next
			}

			if sentence.len > 0 {
				out << sentence.join(' ').capitalize()
				out << '.'
			}
		}
	}

	return out.join('')
}

// lorem_random_state picks `order` consecutive corpus tokens to use as a Markov state.
// The start index is drawn from [0, tokens.len - order), so the chosen words
// are always followed by at least one more token in the corpus.
fn lorem_random_state(mut rng LorumRNG, tokens []string, order int) []string {
	start := rng.intn(tokens.len - order)
	return tokens[start..start + order].clone()
}
| 159 | |
| 160 | // ---------------- Utilities ---------------- |
| 161 | |
// LorumRNG is a simple internal PRNG (a linear congruential generator) used to
// avoid importing the `rand` module, which would cause an import cycle
// (strings -> rand -> time -> ... -> strings).
struct LorumRNG {
mut:
	// current generator state; replaced by the next LCG value on every `intn` call
	// that receives a positive bound
	seed u32
}
| 168 | |
// intn advances the generator by one step and returns a pseudo-random
// number in [0, max).
// A non-positive `max` yields 0 and leaves the generator state untouched.
// It is a minimal implementation, sufficient for text generation only.
fn (mut r LorumRNG) intn(max int) int {
	if max > 0 {
		multiplier := u32(1664525)
		increment := u32(1013904223)
		// The u32 arithmetic is expected to wrap around on overflow.
		r.seed = r.seed * multiplier + increment
		bound := u32(max)
		return int(r.seed % bound)
	}
	return 0
}
| 178 | |
// lorem_vary returns `base` shifted by a pseudo-random offset of at most 20 %
// of `base` (truncated to an integer) in either direction, and never less
// than `min`.
// When 20 % of `base` truncates to 0 no random number is drawn, but the lower
// bound `min` is still enforced.
fn lorem_vary(mut rng LorumRNG, base int, min int) int {
	delta := int(f32(base) * 0.2)
	mut val := base
	if delta != 0 {
		// intn yields [0, 2*delta], shifted to the symmetric range [-delta, delta].
		val += rng.intn(delta * 2 + 1) - delta
	}
	return if val < min { min } else { val }
}
| 188 | |
// lorem_select_corpus returns the corpus text registered under `name`.
// An empty `name` selects one of the registered corpora using `rng`.
// An unknown non-empty `name` prints an error to stderr and terminates the
// process with exit code 1.
fn lorem_select_corpus(mut rng LorumRNG, name string) string {
	if name != '' && name !in lorem_corpora {
		eprintln('unknown corpus: ${name}')
		exit(1)
	}

	// The RNG is only consulted when the caller did not choose a corpus.
	chosen := if name == '' {
		names := lorem_corpora.keys()
		names[rng.intn(names.len)]
	} else {
		name
	}
	return lorem_corpora[chosen]
}
| 202 | |
// lorem_random_seed_phrase returns one of the built-in seed phrases,
// selected with `rng`.
fn lorem_random_seed_phrase(mut rng LorumRNG) string {
	idx := rng.intn(lorem_seed_phrases.len)
	return lorem_seed_phrases[idx]
}
| 206 | |
// lorem_tokenize splits `text` into words separated by spaces, tabs or
// newlines. Empty pieces produced by consecutive separators are dropped.
fn lorem_tokenize(text string) []string {
	// Normalize newlines and tabs to spaces so one split covers all separators.
	flattened := text.replace('\n', ' ').replace('\t', ' ')
	mut words := []string{}
	for piece in flattened.split(' ') {
		if piece.len > 0 {
			words << piece
		}
	}
	return words
}
| 213 | |
// lorem_build_markov builds the transition table of a Markov chain of the
// given `order` from `tokens`.
// Each key is `order` consecutive tokens joined with the `\u0001` separator;
// its value lists every token that follows that sequence in `tokens`, in
// order of appearance and with repetitions.
fn lorem_build_markov(tokens []string, order int) map[string][]string {
	mut transitions := map[string][]string{}
	// Only positions that still have a following token contribute an entry.
	last := tokens.len - order
	for start in 0 .. last {
		prefix := tokens[start..start + order].join('\u0001')
		follower := tokens[start + order]
		if prefix in transitions {
			transitions[prefix] << follower
		} else {
			transitions[prefix] = [follower]
		}
	}
	return transitions
}
| 225 | |