v2 / vlib / v / token / keywords_matcher_trie.v
181 lines · 166 sloc · 5.38 KB · 3d60410b605d001e54f280070d5f952da9de1112
Raw
1module token
2
3// KeywordsMatcherTrie provides a faster way of determining whether a given name
4// is a reserved word (belongs to a given set of previously known words `R`).
5// See the module description for more details.
6@[heap]
7pub struct KeywordsMatcherTrie {
8pub mut:
9 nodes []&TrieNode
10 min_len int = 999999
11 max_len int
12}
13
14// str returns a short representation of matcher
15pub fn (km &KeywordsMatcherTrie) str() string {
16 return 'KeywordsMatcherTrie{ /* nodes.len: ${km.nodes.len} */ min_len: ${km.min_len}, max_len: ${km.max_len} }'
17}
18
19// TrieNode is a single node from a trie, used by KeywordsMatcherTrie
20pub struct TrieNode {
21pub mut:
22 children [123]&TrieNode
23 value int = -1 // when != -1, it is a leaf node representing a match
24}
25
26// str returns a string representation of the node content
27pub fn (node &TrieNode) str() string {
28 if isnil(node) {
29 return '&TrieNode(nil)'
30 }
31 return '&TrieNode{value: ${node.value}}'
32}
33
34// find tries to find the given `word` in the set of all previously added words
35// to the KeywordsMatcherTrie instance. It returns -1 if the word was NOT found
36// there at all. If the word was found, find will return the `value` (value => 0),
37// associated with the word, when it was added.
38@[direct_array_access]
39pub fn (km &KeywordsMatcherTrie) find(word string) int {
40 wlen := word.len
41 if wlen < km.min_len || wlen > km.max_len {
42 return -1
43 }
44 node := km.nodes[wlen]
45 if node == unsafe { nil } {
46 return -1
47 }
48 return node.find(word)
49}
50
51// matches returns true when the word was already added, i.e. when it was found.
52@[inline]
53pub fn (km &KeywordsMatcherTrie) matches(word string) bool {
54 return km.find(word) != -1
55}
56
57// add_word adds the given word to the KeywordsMatcherTrie instance. It associates a non
58// negative integer value to it, so later `find` could return the value, when it succeeds.
59@[direct_array_access; markused]
60pub fn (mut km KeywordsMatcherTrie) add_word(word string, value int) {
61 wlen := word.len
62 if km.max_len < wlen {
63 km.max_len = wlen
64 }
65 if km.min_len > wlen {
66 km.min_len = wlen
67 }
68 // add more top level slots, if needed:
69 for km.nodes.len < wlen + 1 {
70 // eprintln('>>>>>>>>>>>>>> appending more nodes for word: ${word} | value: ${value} | km.nodes.len: ${km.nodes.len} | wlen: ${wlen}')
71 km.nodes << unsafe { &TrieNode(nil) }
72 }
73 if km.nodes[wlen] == unsafe { nil } {
74 km.nodes[wlen] = new_trie_node()
75 }
76 km.nodes[wlen].add_word(word, value, 0)
77}
78
79pub fn KeywordsMatcherTrie.new(cap int) KeywordsMatcherTrie {
80 mut km := KeywordsMatcherTrie{
81 nodes: []&TrieNode{cap: cap}
82 }
83 for _ in 0 .. cap {
84 km.nodes << &TrieNode(unsafe { nil })
85 }
86 return km
87}
88
89// new_keywords_matcher_trie creates a new KeywordsMatcherTrie instance from a given map
90// with string keys, and integer or enum values.
91pub fn new_keywords_matcher_trie[T](kw_map map[string]T) KeywordsMatcherTrie {
92 mut km := KeywordsMatcherTrie.new(10)
93 for k, v in kw_map {
94 km.add_word(k, int(v))
95 }
96 // dump(km.min_len)
97 // dump(km.max_len)
98 // for idx,x in km.nodes { if x != unsafe { nil } { eprintln('>> idx: ${idx} | ${ptr_str(x)}') } }
99 return km
100}
101
102// new_keywords_matcher_from_array_trie creates a new KeywordsMatcherTrie instance from a given array
103// of strings. The values for the strings, that `find` will return, will be the indexes in that array.
104pub fn new_keywords_matcher_from_array_trie(names []string) KeywordsMatcherTrie {
105 mut m := map[string]int{}
106 for i, name in names {
107 m[name] = i
108 }
109 return new_keywords_matcher_trie[int](m)
110}
111
112//
113
114// new_trie_node creates a new TrieNode instance
115pub fn new_trie_node() &TrieNode {
116 return &TrieNode{}
117}
118
119// show displays the information in `node`, in a more compact/readable format (recursively)
120pub fn (node &TrieNode) show(level int) {
121 mut non_nil_children := []int{}
122 for idx, x in node.children {
123 if x != unsafe { nil } {
124 non_nil_children << idx
125 }
126 }
127 children := non_nil_children.map(u8(it).ascii_str())
128 eprintln('> level: ${level:2} | value: ${node.value:12} | non_nil_children: ${non_nil_children.len:2} | ${children}')
129 for x in node.children {
130 if x != unsafe { nil } {
131 x.show(level + 1)
132 }
133 }
134}
135
136// add_word adds another `word` and `value` pair into the trie, starting from `node` (recursively).
137// `word_idx` is just used as an accumulator, and starts from 0 at the root of the tree.
138@[direct_array_access; markused]
139pub fn (mut node TrieNode) add_word(word string, value int, word_idx int) {
140 if word_idx < 0 || word_idx >= word.len {
141 node.value = value
142 return
143 }
144 first := u8(word[word_idx])
145 // eprintln('>> node: ${ptr_str(node)} | first: ${first} | word_idx: ${word_idx}')
146 mut child_node := node.children[first]
147 if child_node == unsafe { nil } {
148 child_node = new_trie_node()
149 node.children[first] = child_node
150 }
151 child_node.add_word(word, value, word_idx + 1)
152}
153
154// find tries to find a match for `word` to the trie (the set of all previously added words).
155// It returns -1 if there is no match, or the value associated with the previously added
156// matching word by `add_word`.
157@[direct_array_access]
158pub fn (root &TrieNode) find(word string) int {
159 wlen := word.len
160 mut node := unsafe { &TrieNode(root) }
161 mut idx := 0
162 for {
163 // eprintln('> match_keyword: `${word:20}` | node: ${ptr_str(node)} | idx: ${idx:3}')
164 if idx == wlen {
165 k := node.value
166 if k != -1 {
167 // node.show(0)
168 return k
169 }
170 return -1
171 }
172 c := word[idx]
173 child := node.children[c]
174 if child == unsafe { nil } {
175 return -1
176 }
177 node = unsafe { child }
178 idx++
179 }
180 return -1
181}
182