v / vlib / strings / lorem / lorem.v
224 lines · 192 sloc · 5.93 KB · afc1f92d0a42d1e8bbdd09007825f6947955a53a
Raw
1module lorem
2
// lorem_corpora maps each corpus name to its embedded text.
// `lorem_select_corpus` looks names up here and also draws a random
// entry from its keys when no name is given.
const lorem_corpora = {
	'lorem':  lorem_corpus
	'poe':    poe_corpus
	'darwin': darwin_corpus
	'bard':   shakespeare_corpus
}
9
// lorem_corpus is the text registered under the 'lorem' key of `lorem_corpora`.
// It contains words only (no punctuation), separated by spaces and newlines.
const lorem_corpus = '
lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore
eu fugiat nulla pariatur Excepteur sint occaecat cupidatat non proident sunt
in culpa qui officia deserunt mollit anim id est laborum
'
18
// poe_corpus is the text registered under the 'poe' key of `lorem_corpora`.
// It contains words only (no punctuation), separated by spaces and newlines.
const poe_corpus = '
once upon a midnight dreary while I pondered weak and weary
over many a quaint and curious volume of forgotten lore
while I nodded nearly napping suddenly there came a tapping
as of someone gently rapping at my chamber door
'
25
// darwin_corpus is the text registered under the 'darwin' key of `lorem_corpora`.
// It contains words only (no punctuation), separated by spaces and newlines.
const darwin_corpus = '
when we look to the individuals of the same variety or sub variety of our
older cultivated plants and animals one of the first points which strikes
us is that they generally differ much more from each other than do the
individuals of any one species or variety in a state of nature
'
32
// shakespeare_corpus is the text registered under the 'bard' key of `lorem_corpora`.
// Each line is a separate short quotation; the tokenizer treats newlines as
// plain word separators, so the Markov model chains across line boundaries.
const shakespeare_corpus = '
to be or not to be that is the question
all the worlds a stage and all the men and women merely players
the lady doth protest too much methinks
a rose by any other name would smell as sweet
et tu brute
if music be the food of love play on
now is the winter of our discontent
we are such stuff as dreams are made on
brevity is the soul of wit
some are born great some achieve greatness and some have greatness thrust upon them
cry havoc and let slip the dogs of war
all that glisters is not gold
the fault dear brutus is not in our stars but in ourselves
to thine own self be true
lord what fools these mortals be
shall i compare thee to a summers day
'
51
52// ---------------- Seed Phrases ----------------
53
// lorem_seed_phrases are the candidate starting phrases;
// `lorem_random_seed_phrase` returns one of them, chosen with the RNG.
const lorem_seed_phrases = [
	'in the beginning',
	'once upon a time',
	'it was the first',
	'when we consider',
	'there was a moment',
]
61
62// ---------------- Text Generation ----------------
63
// LoremCfg holds the optional parameters of `generate`.
// - `words_per_sentence`, `sentences_per_paragraph`: base counts; `generate`
//   passes them through `lorem_vary` for every sentence / paragraph
// - `paragraphs`: number of paragraphs to produce
// - `markov_order`: number of preceding words used as the chain state
// - `corpus_name`: key of the corpus to use; when empty, a corpus is picked
//   with the internal RNG
// - `seed_text`: starting phrase; when empty, a built-in seed phrase is
//   picked with the internal RNG
// - `rng_seed`: seed of the internal RNG; 0 (the default) selects a fixed
//   built-in seed, so the output is deterministic rather than random
//   unless a different non-zero seed is supplied
@[params]
pub struct LoremCfg {
pub:
	words_per_sentence      int = 10
	sentences_per_paragraph int = 5
	paragraphs              int = 3
	markov_order            int = 2
	corpus_name             string
	seed_text               string
	rng_seed                int
}
79
// generate returns pseudo-random text produced by a Markov chain that is
// built from one of the embedded corpora. The output is a sequence of
// paragraphs separated by a blank line; every paragraph is a sequence of
// capitalized sentences, each terminated by a period.
//
// Configurable through `cfg`:
// - Markov order (n-gram size), which must be at least 1
// - Words per sentence
// - Sentences per paragraph
// - Paragraph count
// - Optional seed phrase and RNG seed
// - Optional corpus selection
//
// An empty string is returned, after a message is written to stderr, when
// `markov_order` is below 1 or when the selected corpus does not contain
// more than `markov_order` tokens.
pub fn generate(cfg LoremCfg) string {
	order := cfg.markov_order
	// The chain state is a window of `order` words. A window of zero or
	// negative size cannot be sliced or advanced, so reject it up front.
	if order < 1 {
		eprintln('markov order must be at least 1')
		return ''
	}

	// rng_seed == 0 means "not specified": fall back to a fixed seed, which
	// makes the output deterministic by default.
	initial_seed := if cfg.rng_seed != 0 { u32(cfg.rng_seed) } else { u32(123456789) }
	mut rng := LorumRNG{
		seed: initial_seed
	}

	// The seed phrase is drawn before the corpus; keep this order so the
	// sequence of RNG draws stays stable.
	seed := if cfg.seed_text != '' { cfg.seed_text } else { lorem_random_seed_phrase(mut rng) }

	corpus := lorem_select_corpus(mut rng, cfg.corpus_name)
	tokens := lorem_tokenize(corpus)

	if tokens.len <= order {
		eprintln('corpus too small for selected order')
		return ''
	}

	model := lorem_build_markov(tokens, order)

	mut state := lorem_tokenize(seed)
	if state.len > order {
		// Model keys always hold exactly `order` words, so a longer seed could
		// never match. Keep only its trailing `order` words as the state.
		state = state[state.len - order..].clone()
	} else if state.len < order {
		// Seed too short to form a state: start from a random corpus position.
		start := rng.intn(tokens.len - order)
		state = tokens[start..start + order].clone()
	}

	mut out := []string{}

	for pi in 0 .. cfg.paragraphs {
		if pi != 0 {
			out << '\n\n'
		}
		sentences := lorem_vary(mut rng, cfg.sentences_per_paragraph, 1)

		for si in 0 .. sentences {
			if si != 0 {
				out << ' '
			}
			words := lorem_vary(mut rng, cfg.words_per_sentence, 3)
			mut sentence := []string{}

			for _ in 0 .. words {
				key := state.join('\u0001')
				nexts := model[key] or {
					// Dead end: no successor is recorded for this state. Restart
					// from a random corpus position. This iteration emits no
					// word, so a sentence may end up shorter than `words`.
					start := rng.intn(tokens.len - order)
					state = tokens[start..start + order].clone()
					continue
				}

				next := nexts[rng.intn(nexts.len)]
				sentence << next

				// Slide the window: drop the oldest word, append the new one.
				state = state[1..].clone()
				state << next
			}

			if sentence.len > 0 {
				out << sentence.join(' ').capitalize()
				out << '.'
			}
		}
	}

	return out.join('')
}
159
160// ---------------- Utilities ----------------
161
// LorumRNG is a simple internal PRNG used to avoid importing the `rand` module,
// which would cause an import cycle (strings -> rand -> time -> ... -> strings).
struct LorumRNG {
mut:
	// seed is the current generator state; `intn` overwrites it on every
	// call that is given a positive `max`.
	seed u32
}
168
// intn advances the generator by one linear congruential step and returns
// the new state reduced modulo `max`, i.e. a value in [0, max).
// For `max <= 0` it returns 0 and leaves the state untouched.
// NOTE(review): the result is taken from the low bits of the state, which
// are the weakest bits of a power-of-two-modulus LCG; adequate for filler
// text, not for anything that needs good randomness.
fn (mut r LorumRNG) intn(max int) int {
	if max > 0 {
		r.seed = r.seed * 1664525 + 1013904223
		bound := u32(max)
		return int(r.seed % bound)
	}
	return 0
}
178
// lorem_vary returns `base` shifted by a pseudo-random amount of up to
// 20% of `base` (truncated to an integer) in either direction, and never
// less than `min`.
// When 20% of `base` truncates to zero, `base` is returned unchanged and
// `min` is not applied; no random number is drawn in that case.
fn lorem_vary(mut rng LorumRNG, base int, min int) int {
	spread := int(f32(base) * 0.2)
	if spread != 0 {
		// intn yields 0..2*spread, so the result covers base-spread..base+spread.
		jittered := base + (rng.intn(spread * 2 + 1) - spread)
		if jittered < min {
			return min
		}
		return jittered
	}
	return base
}
188
// lorem_select_corpus returns the corpus text registered under `name`.
// With an empty `name`, one of the registered corpora is picked using `rng`.
// A non-empty `name` that is not registered prints an error to stderr and
// terminates the program with exit code 1.
fn lorem_select_corpus(mut rng LorumRNG, name string) string {
	if name == '' {
		names := lorem_corpora.keys()
		pick := rng.intn(names.len)
		return lorem_corpora[names[pick]]
	}
	return lorem_corpora[name] or {
		eprintln('unknown corpus: ${name}')
		exit(1)
	}
}
202
// lorem_random_seed_phrase returns one entry of `lorem_seed_phrases`,
// selected with a single draw from `rng`.
fn lorem_random_seed_phrase(mut rng LorumRNG) string {
	idx := rng.intn(lorem_seed_phrases.len)
	return lorem_seed_phrases[idx]
}
206
// lorem_tokenize splits `text` into words. Newlines and tabs are treated
// like spaces, and empty pieces produced by consecutive separators are
// dropped, so the result contains only non-empty tokens.
fn lorem_tokenize(text string) []string {
	flattened := text.replace_each(['\n', ' ', '\t', ' '])
	mut words := []string{}
	for piece in flattened.split(' ') {
		if piece.len > 0 {
			words << piece
		}
	}
	return words
}
213
// lorem_build_markov builds the transition table of an `order`-word Markov
// chain over `tokens`. Each key is `order` consecutive tokens joined with
// the `\u0001` separator; its value lists every token that follows that
// sequence in `tokens`, in order of appearance and with repeats kept, so
// more frequent successors are proportionally more likely to be drawn.
fn lorem_build_markov(tokens []string, order int) map[string][]string {
	mut table := map[string][]string{}
	mut pos := 0
	for pos + order < tokens.len {
		window := tokens[pos..pos + order]
		key := window.join('\u0001')
		successor := tokens[pos + order]
		if key in table {
			table[key] << successor
		} else {
			table[key] = [successor]
		}
		pos++
	}
	return table
}
225