From 194db24829869ce040d149955725bee7097c8b13 Mon Sep 17 00:00:00 2001 From: Delyan Angelov Date: Sat, 21 Jun 2025 12:33:14 +0300 Subject: [PATCH] builtin: add a rune iterator method to strings, allowing `for i, r in s.runes_iterator() {` without first allocating an array for all the runes (#24769) --- vlib/builtin/builtin.v | 4 +- vlib/builtin/string.v | 37 +++++++++++++++++++ vlib/builtin/string_iterator_test.v | 32 ++++++++++++++++ vlib/builtin/utf8.v | 36 +++++++++--------- .../gen/c/testdata/if_else_return.c.must_have | 2 +- 5 files changed, 91 insertions(+), 20 deletions(-) create mode 100644 vlib/builtin/string_iterator_test.v diff --git a/vlib/builtin/builtin.v b/vlib/builtin/builtin.v index ba6c8c8c0..ec76cfeaa 100644 --- a/vlib/builtin/builtin.v +++ b/vlib/builtin/builtin.v @@ -74,9 +74,9 @@ fn __print_assert_failure(i &VAssertMetaInfo) { eprintln('${i.fpath}:${i.line_nr + 1}: FAIL: fn ${i.fn_name}: assert ${i.src}') if i.op.len > 0 && i.op != 'call' { if i.llabel == i.lvalue { - eprintln(' left value: ${i.llabel}') + eprintln(' left value: ${i.llabel}') } else { - eprintln(' left value: ${i.llabel} = ${i.lvalue}') + eprintln(' left value: ${i.llabel} = ${i.lvalue}') } if i.rlabel == i.rvalue { eprintln(' right value: ${i.rlabel}') diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v index 32a20e418..247a85f95 100644 --- a/vlib/builtin/string.v +++ b/vlib/builtin/string.v @@ -2979,3 +2979,40 @@ fn data_to_hex_string(data &u8, len int) string { hex[dst] = 0 return tos(hex, dst) } + +pub struct RunesIterator { +mut: + s string + i int +} + +// runes_iterator creates an iterator over all the runes in the given string `s`. +// It can be used in `for r in s.runes_iterator() {`, as a direct substitute to +// calling .runes(): `for r in s.runes() {`, which needs an intermediate allocation +// of an array. 
+pub fn (s string) runes_iterator() RunesIterator { + return RunesIterator{ + s: s + i: 0 + } +} + +// next is the method that will be called for each iteration in `for r in s.runes_iterator() {`; it returns `none` once the byte index has reached the end of the string +pub fn (mut ri RunesIterator) next() ?rune { + if ri.i >= ri.s.len { + return none + } + char_len := utf8_char_len(unsafe { ri.s.str[ri.i] }) + if char_len == 1 { + res := unsafe { ri.s.str[ri.i] } + ri.i++ + return res + } + start := &u8(unsafe { &ri.s.str[ri.i] }) + len := if ri.s.len - 1 >= ri.i + char_len { char_len } else { ri.s.len - ri.i } + ri.i += char_len + if char_len > 4 { + return 0 + } + return rune(impl_utf8_to_utf32(start, len)) +} diff --git a/vlib/builtin/string_iterator_test.v b/vlib/builtin/string_iterator_test.v new file mode 100644 index 000000000..8b87550cb --- /dev/null +++ b/vlib/builtin/string_iterator_test.v @@ -0,0 +1,32 @@ +fn check(s string) { + srunes := s.runes() + println('') + println('> s: ${s}') + println('> s.len: ${s.len:-4}') + println('> srunes.len: ${srunes.len:-4}') + mut itera_ := []rune{} + for r in s.runes_iterator() { + itera_ << r + } + println('> srunes: ${srunes}') + println('> iterated: ${itera_}') + assert srunes == itera_ +} + +fn test_ascii() { + check('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') +} + +fn test_mixed() { + check('abc,あいうえお,привет,❄☕❀💰') +} + +fn test_emoji_and_for_i_r_in_iterator() { + s := '❄☕❀💰' + check(s) + srunes := s.runes() + for i, r in s.runes_iterator() { + eprintln('> i: ${i} | r: ${r}') + assert srunes[i] == r + } +} diff --git a/vlib/builtin/utf8.v b/vlib/builtin/utf8.v index 2d0afdf64..9afc6a0eb 100644 --- a/vlib/builtin/utf8.v +++ b/vlib/builtin/utf8.v @@ -78,34 +78,36 @@ pub fn utf32_decode_to_buffer(code u32, mut buf &u8) int { // it is used in vlib/builtin/string.v, // and also in vlib/v/gen/c/cgen.v pub fn (_rune string) utf32_code() int { - if res := _rune.bytes().utf8_to_utf32() { - return int(res) + if _rune.len > 4 { + return 0 } - return 0 + return 
int(impl_utf8_to_utf32(&u8(_rune.str), _rune.len)) } // convert array of utf8 bytes to single utf32 value // will error if more than 4 bytes are submitted -@[direct_array_access] pub fn (_bytes []u8) utf8_to_utf32() !rune { - if _bytes.len == 0 { - return 0 - } - // return ASCII unchanged - if _bytes.len == 1 { - return rune(_bytes[0]) - } if _bytes.len > 4 { return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum') } + return impl_utf8_to_utf32(&u8(_bytes.data), _bytes.len) +} - mut b := u8(int(_bytes[0])) - - b = b << _bytes.len +@[direct_array_access] +fn impl_utf8_to_utf32(_bytes &u8, _bytes_len int) rune { + if _bytes_len == 0 { + return 0 + } + // return ASCII unchanged + if _bytes_len == 1 { + return unsafe { rune(_bytes[0]) } + } + mut b := u8(int(unsafe { _bytes[0] })) + b = b << _bytes_len mut res := rune(b) - mut shift := 6 - _bytes.len - for i := 1; i < _bytes.len; i++ { - c := rune(_bytes[i]) + mut shift := 6 - _bytes_len + for i := 1; i < _bytes_len; i++ { + c := rune(unsafe { _bytes[i] }) res = rune(res) << shift res |= c & 63 // 0x3f shift = 6 diff --git a/vlib/v/gen/c/testdata/if_else_return.c.must_have b/vlib/v/gen/c/testdata/if_else_return.c.must_have index 14dc1b71c..f44f0909e 100644 --- a/vlib/v/gen/c/testdata/if_else_return.c.must_have +++ b/vlib/v/gen/c/testdata/if_else_return.c.must_have @@ -5,5 +5,5 @@ _result_ok(&(string[]) { s }, (_result*)(&_t2), sizeof(string)); } else { return (_result_string){ .is_error=true, .err=_v_error(_S("empty")), .data={E_STRUCT} }; } -return _t1; +return _t2; } -- 2.39.5