From a8d75c10b5a706d6a0fe1b797327a5fa4e2f82e9 Mon Sep 17 00:00:00 2001 From: Jorge Mireles Date: Wed, 30 Jul 2025 10:11:41 -0600 Subject: [PATCH] vlib: add `archive.tar` module to enable reading of .tar ang .tar.gz files (#24995) --- examples/archive/tar_gz_reader.v | 166 ++++++++++++ vlib/archive/README.md | 3 + vlib/archive/tar/README.md | 33 +++ vlib/archive/tar/reader.v | 277 ++++++++++++++++++++ vlib/archive/tar/reader_test.v | 157 +++++++++++ vlib/archive/tar/tar.v | 43 +++ vlib/archive/tar/testdata/file-and-dir.tar | Bin 0 -> 2560 bytes vlib/archive/tar/testdata/gnu-long-nul.tar | Bin 0 -> 2560 bytes vlib/archive/tar/testdata/gnu-not-utf8.tar | Bin 0 -> 1536 bytes vlib/archive/tar/testdata/gnu-utf8.tar | Bin 0 -> 2560 bytes vlib/archive/tar/testdata/gnu.tar | Bin 0 -> 3072 bytes vlib/archive/tar/testdata/life.tar.gz | Bin 0 -> 626 bytes vlib/archive/tar/untar.v | 290 +++++++++++++++++++++ 13 files changed, 969 insertions(+) create mode 100644 examples/archive/tar_gz_reader.v create mode 100644 vlib/archive/README.md create mode 100644 vlib/archive/tar/README.md create mode 100644 vlib/archive/tar/reader.v create mode 100644 vlib/archive/tar/reader_test.v create mode 100644 vlib/archive/tar/tar.v create mode 100644 vlib/archive/tar/testdata/file-and-dir.tar create mode 100644 vlib/archive/tar/testdata/gnu-long-nul.tar create mode 100644 vlib/archive/tar/testdata/gnu-not-utf8.tar create mode 100644 vlib/archive/tar/testdata/gnu-utf8.tar create mode 100644 vlib/archive/tar/testdata/gnu.tar create mode 100644 vlib/archive/tar/testdata/life.tar.gz create mode 100644 vlib/archive/tar/untar.v diff --git a/examples/archive/tar_gz_reader.v b/examples/archive/tar_gz_reader.v new file mode 100644 index 000000000..0d01fcb91 --- /dev/null +++ b/examples/archive/tar_gz_reader.v @@ -0,0 +1,166 @@ +import archive.tar +import flag +import net.http +import os +import term + +const default_url = 'https://github.com/vlang/v/archive/refs/tags/v0.1.3.tar.gz' + +@[heap] +struct 
Context { + url string // Web starting with http:// or https://. Local starting with file:/// + chunks bool // true: decompress with callback + debug int // print debug lines + max_blocks int // if max_blocks > 0 and is reached stops early. + filename string // if filename is found as a path of a data block, stops early. +} + +fn (ctx &Context) read_last_block(mut read tar.Read) bool { + if ctx.max_blocks > 0 && ctx.max_blocks < read.get_block_number() { + read.stop_early = true + return true + } + return false +} + +fn new_context() !&Context { + mut fp := flag.new_flag_parser(os.args) + fp.application('tar_gz_reader') + fp.version('0.0.20250721') + fp.description('Reads into memory selected sections of *.tar.gz. archives from https or home_dir.') + fp.skip_executable() + ctx := &Context{ + url: fp.string('url', `u`, default_url, 'archive *.tar.gz URL, default(${default_url}). Start name with file:/// for local') + chunks: fp.bool('chunks', `c`, false, 'decompress with chunks to reduce RAM usage, default(false)') + debug: fp.int('debug', `d`, 0, 'prints blocks: 1=other, 2:+dirs, 3=+files, 4=+data, default(0=silent)') + max_blocks: fp.int('max_blocks', `m`, 0, 'maximum blocks to read, stop early. Default(0=read all)') + filename: fp.string('filename', `f`, '', 'filename content complete print, stop early. Default(empty means none)') + } + additional := fp.finalize()! + if additional.len > 0 { + println('unprocessed args ${additional.join_lines()}') + } + return ctx +} + +// Downloader downloads a *.tar.gz using HTTP chunks +struct Downloader { +mut: + chunks int + data []u8 +} + +fn new_downloader(url string) !&Downloader { + mut downloader := &Downloader{} + params := http.DownloaderParams{ + downloader: downloader + } + if url.starts_with('http://') || url.starts_with('https://') { + http.download_file_with_progress(url, '', params)! 
+ } else if url.starts_with('file:///') { + path := '${os.home_dir()}/${url[8..]}' + println('path ${path}') + downloader.data = os.read_bytes(path)! + } + return downloader +} + +fn (mut d Downloader) on_start(mut request http.Request, path string) ! {} + +fn (mut d Downloader) on_chunk(request &http.Request, chunk []u8, already_received u64, expected u64) ! { + if expected == 0 { + return + } + d.chunks++ + d.data << chunk +} + +fn (mut d Downloader) on_finish(request &http.Request, response &http.Response) ! {} + +struct FileReader implements tar.Reader { + ctx &Context +mut: + filepath string + content []u8 +} + +fn new_file_reader(ctx &Context) FileReader { + return FileReader{ + ctx: ctx + } +} + +fn (mut f FileReader) other_block(mut read tar.Read, details string) { + if f.ctx.read_last_block(mut read) { + return + } + if f.ctx.debug > 0 { + row := 'OTHER block:${read.get_block_number():6} ${read.get_special()} ${details} ${read.get_path()} ' + println(term.colorize(term.bright_yellow, row)) + } +} + +fn (mut f FileReader) dir_block(mut read tar.Read, size u64) { + if f.ctx.read_last_block(mut read) { + return + } + if f.ctx.debug > 1 { + row := 'DIR block:${read.get_block_number():6} ${read.get_path()} size:${size}' + println(term.colorize(term.green, row)) + } +} + +fn (mut f FileReader) file_block(mut read tar.Read, size u64) { + if f.ctx.read_last_block(mut read) { + return + } + path := read.get_path() + if f.ctx.debug > 2 { + row := ' FILE block:${read.get_block_number():6} ${path} size:${size}' + println(term.colorize(term.bright_blue, row)) + } + if f.ctx.filename != '' && f.filepath == '' && path.ends_with(f.ctx.filename) { + f.filepath = path + } +} + +fn (mut f FileReader) data_block(mut read tar.Read, data []u8, pending int) { + if f.ctx.read_last_block(mut read) { + return + } + path := read.get_path() + if f.ctx.debug > 3 { + println(' DATA block:${read.get_block_number():6} ${path} len:${data.len} pend:${pending}') + } + if f.ctx.filename != 
'' { + if f.filepath == path { + f.content << data + if pending == 0 { + // our file of interest data is complete + read.stop_early = true + } + } + } +} + +fn main() { + ctx := new_context()! + reader := FileReader{ + ctx: ctx + } + mut untar := tar.new_untar(reader) + mut decompressor := tar.new_decompresor(untar) + downloader := new_downloader(ctx.url)! + if ctx.chunks { + decompressor.read_chunks(downloader.data)! + } else { + decompressor.read_all(downloader.data)! + } + println('-'.repeat(80)) + println('Download: ${ctx.url} chunks:${downloader.chunks} bytes=${downloader.data.len}') + println('Untar: ${untar}') + println('Content: Path:${reader.filepath} bytes:${reader.content.len}') + println('-'.repeat(80)) + println('${reader.content.bytestr()}') + println('-'.repeat(80)) +} diff --git a/vlib/archive/README.md b/vlib/archive/README.md new file mode 100644 index 000000000..a83c50a6e --- /dev/null +++ b/vlib/archive/README.md @@ -0,0 +1,3 @@ +## Description + +`archive` is a namespace for different archive formats like `tar` or `zip`. diff --git a/vlib/archive/tar/README.md b/vlib/archive/tar/README.md new file mode 100644 index 000000000..47b02679e --- /dev/null +++ b/vlib/archive/tar/README.md @@ -0,0 +1,33 @@ +## Description + +`tar` is a module to access tar archives. + +Tape archives (tar) are a file format for storing a sequence of files that can be read and written +as streams. This module covers the reading of the basic sections of archives produced by GNU tools +like the Linux command `tar -xvf`, but in memory instead of modifying the filesystem. It parses +directories, files and file contents, and handles paths longer than 100 chars. + +### Read Efficiency + +An entire tar file can be read in memory or by chunks. Chunked reading keeps a single decompressed +[chunk](https://modules.vlang.io/compress.gzip.html#decompress_with_callback) of 32 KB at a time +and also keeps in memory a single tar block of 512 bytes at a time.
Paths become strings only when +needed, and the user's `Reader` implementation can stop the reading process early. + +### Read Example + +The tar blocks are parsed and some fields are passed to `Reader` implemented methods. + +```v +import os +import archive.tar + +fn main() { + os.chdir(@VMODROOT) or {} + path := 'archive/tar/testdata/life.tar.gz' + reader := tar.new_debug_reader() + tar.read_tar_gz_file(path, reader)! +} +``` +See also the `tar_gz_reader.v` program in the `examples` folder. + diff --git a/vlib/archive/tar/reader.v b/vlib/archive/tar/reader.v new file mode 100644 index 000000000..e7e1e5f6e --- /dev/null +++ b/vlib/archive/tar/reader.v @@ -0,0 +1,277 @@ +module tar + +import compress.gzip +import os + +// read_tar_gz_file decompresses a given local file and reads all the blocks +// with a given reader. +pub fn read_tar_gz_file(path string, reader Reader) ! { + tar_gz := os.read_bytes(path)! + all_blocks := gzip.decompress(tar_gz)! + mut untar := Untar{ + reader: reader + } + untar.read_all_blocks(all_blocks)! +} + +// Read is used by Untar to call Reader implemented methods. +// The implementor can read the block's `get_block_number()` and `get_path()` +// and can set the field `stop_early` to true to suspend the reading. +pub struct Read { +mut: + block_number int + special BlockSpecial + prefix_len int + prefix_buf [131]u8 + separator bool + path_len int + path_buf [100]u8 + + long_path &LongPath = unsafe { nil } +pub mut: + stop_early bool +} + +// set_short_path sets Read path with the tar block strings `prefix` and `path`. +// Block's `prefix` C string max length is 131 but most of the time is 0. +// Block's `path` C string max length is 100. Both `prefix` and `path` are +// linked to a V string but converted until is needed, see `get_path()`. +fn (mut b Read) set_short_path(buffer [512]u8, separator_after_prefix bool) { + // first check if TAR block has a prefix string (0 to 131 chars).
The + // prefix will be other than '' the TAR block filepath len is > 100. + b.prefix_len = 0 + for i := 345; i < 345 + 131; i++ { + letter := buffer[i] + if letter == 0 { + break // first 0 found means prefix C string is complete. + } + b.prefix_buf[b.prefix_len] = letter + b.prefix_len++ + } + + b.separator = separator_after_prefix + + // most of the time there is path for blocks like dirs and regular files: + b.path_len = 0 + for i := 0; i < 100; i++ { + letter := buffer[i] + if letter == 0 { + break // first 0 found means path C string is complete. + } + b.path_buf[b.path_len] = letter + b.path_len++ + } +} + +// set_long_path sets Read path with the long path reference. +fn (mut b Read) set_long_path(long_path &LongPath) { + b.long_path = unsafe { long_path } +} + +// get_path returns the path of this read. The path is valid for blocks of types +// directory, file and file data. +pub fn (b Read) get_path() string { + if b.long_path != unsafe { nil } { + return b.long_path.get_path() + } + + mut str := []u8{} + if b.prefix_len > 0 { + str << b.prefix_buf[0..b.prefix_len] + } + if b.prefix_len > 0 && b.separator { + str << `/` + } + if b.path_len > 0 { + str << b.path_buf[0..b.path_len] + } + return str.bytestr() +} + +// get_block_number returns the consecutive number of this read. +pub fn (b Read) get_block_number() int { + return b.block_number +} + +// get_special returns the special type of the Read. +pub fn (b Read) get_special() BlockSpecial { + return b.special +} + +// str returns a string representation with block number, path, special type and stop early. +pub fn (r Read) str() string { + return '(block_number:${r.block_number} path:${r.get_path()} special:${r.special} stop_early:${r.stop_early})' +} + +// Reader is used to read by Untar to parse the blocks. +pub interface Reader { +mut: + // dir_block is called when untar reads a block of type directory. + // Call `Read.get_path()` to get the full name of the directory. 
+ // `size` field is zero for directories. + // The implementor can set Read's field `stop_early` to suspend the reader. + dir_block(mut read Read, size u64) + + // file_block is called when untar reads a block of type filename. + // Call `Read.get_path()` to get the full name of the file. + // `size` is the expected file size in bytes to be read later. + // The implementor can set Read's field `stop_early` to suspend the reader. + file_block(mut read Read, size u64) + + // file_block is called when untar reads a block of type filedata. + // Call `Read.get_path()` to get the full name of the file data belongs to. + // The `data` size is 512 bytes or less. `pending` indicates how many bytes are left to read. + // The implementor can inspect the data and use the pending value + // to set Read's field `stop_early` to suspend the reader. + data_block(mut read Read, data []u8, pending int) + + // other_block is called when untar reads a block type other than directory, + // filename or filedata. `Read.get_header()` and 'details' give more info about the block. + // `block device` or `FIFO`. + // The implementor can set Read's field `stop_early` to suspend the reader. + other_block(mut read Read, details string) +} + +// DebugReader implements a Reader and prints rows for blocks read +// as directories, files, file data blocks and special blocks. 
+pub struct DebugReader implements Reader {
+}
+
+// new_debug_reader returns a DebugReader, a Reader that prints one row per block it is given.
+pub fn new_debug_reader() &DebugReader {
+	return &DebugReader{}
+}
+
+// dir_block prints the block number and the path of a directory block.
+fn (mut t DebugReader) dir_block(mut read Read, size u64) {
+	println('DIR #${read.get_block_number()} ${read.get_path()}')
+}
+
+// file_block prints the block number, the path and the announced size of a file block.
+fn (mut t DebugReader) file_block(mut read Read, size u64) {
+	println('FILE #${read.get_block_number()} path:${read.get_path()} size:${size}')
+}
+
+// data_block prints the block number, the owning path, the length of this data part
+// and how many bytes of the file are still pending.
+fn (mut t DebugReader) data_block(mut read Read, data []u8, pending int) {
+	println('DATA #${read.get_block_number()} ${read.get_path()} size:${data.len} pending:${pending}')
+}
+
+// other_block prints the block number, the special type and the details of any other block.
+fn (mut t DebugReader) other_block(mut read Read, details string) {
+	println('OTHER #${read.get_block_number()} special:${read.special} ${details}')
+}
+
+// ReadResult is returned by ReadResultFn
+pub enum ReadResult {
+	@continue   // the block was consumed, keep sending blocks
+	stop_early  // the Reader set `Read.stop_early`
+	end_of_file // `read_all_blocks` consumed every block it was given
+	end_archive // two consecutive blank blocks were read
+	overflow    // ChunksReader could not buffer a chunk, or a block read failed
+}
+
+// ReadResultFn is the signature of a function that consumes one tar block,
+// like `Untar.read_single_block`.
+type ReadResultFn = fn (block []u8) !ReadResult
+
+// Decompressor decompresses a tar.gz byte array and hands the tar blocks to its Untar.
+@[heap]
+pub struct Decompressor {
+mut:
+	untar &Untar
+}
+
+// new_decompressor returns a Decompressor to decompress a tar.gz file.
+// A given Untar with a registered Reader will read the blocks.
+pub fn new_decompressor(untar &Untar) &Decompressor {
+	return &Decompressor{
+		untar: untar
+	}
+}
+
+// new_decompresor is the original, misspelled name of `new_decompressor`.
+// It is kept as an alias so that existing callers keep compiling.
+pub fn new_decompresor(untar &Untar) &Decompressor {
+	return new_decompressor(untar)
+}
+
+// read_all decompresses the given `tar_gz` array with all the tar blocks.
+// Then calls untar method `read_all_blocks` to read all the blocks at once.
+// Returns the read result of the untar, or an error when the decompression
+// or the reading of a block fails.
+pub fn (mut d Decompressor) read_all(tar_gz []u8) !ReadResult {
+	all_blocks := gzip.decompress(tar_gz)!
+	return d.untar.read_all_blocks(all_blocks)!
+}
+
+// read_chunks decompresses the given `tar_gz` array by chunks of
+// 32768 bytes which can hold up to 64 tar blocks of 512 bytes each.
+// Then calls untar method `read_single_block` through a ChunksReader dispatcher.
+// A read result is returned which can be of the type stop early or an error.
+pub fn (mut d Decompressor) read_chunks(tar_gz []u8) !ReadResult {
+	mut reader := &ChunksReader{
+		read_block_fn: d.untar.read_single_block
+	}
+	// the callback tells the decompressor how to go on: chunk.len to continue, 0 to suspend
+	callback := fn (chunk []u8, mut reader ChunksReader) int {
+		result := reader.read_blocks(chunk)
+		if result == .continue {
+			return chunk.len // go for more
+		}
+		return 0 // suspend
+	}
+	gzip.decompress_with_callback(tar_gz, callback, reader) or {
+		// NOTE(review): presumably a callback that returned 0 surfaces here as an error;
+		// confirm against compress.gzip. A result still at .continue means that no block
+		// asked to stop, so the error is passed on to the caller.
+		if reader.result == .continue {
+			return err
+		}
+		return reader.result
+	}
+	return reader.result
+}
+
+// ChunksReader has a reusable fixed buffer with maximum length of decompressed chunk
+// of 32768 bytes plus a maximum previous pending tar block of 512 bytes.
+struct ChunksReader {
+mut:
+	read_block_fn  ReadResultFn = unsafe { nil } // receives every complete 512 bytes block
+	buffer         [32768 + 512]u8
+	chunks_counter int // number of chunks received by read_blocks
+	pending        int // number of buffered bytes not sent yet (end of the valid data)
+	result         ReadResult // result of the last block sent
+}
+
+// read_blocks receives a chunk like those of 32k from a gzip decompressor. The chunk is
+// assumed to be a TAR archive section and is cut in 512 bytes blocks that are sent to
+// the untar reader one by one. The untar reader result informs this process to continue or
+// stop early. The remaining bytes of an incomplete block are kept at the start of the
+// buffer and are sent to the untar reader prepended to the next chunk.
+// Returns .overflow when the chunk does not fit in the buffer or when a block read fails.
+fn (mut d ChunksReader) read_blocks(chunk []u8) ReadResult {
+	d.chunks_counter++
+	total := d.pending + chunk.len
+	if total > d.buffer.len {
+		assert false, 'Should not occur buffer overflow ${total}'
+		return .overflow
+	}
+
+	// append new chunk after previous incomplete block bytes not sent yet
+	for i, ch in chunk {
+		d.buffer[i + d.pending] = ch
+	}
+	d.pending += chunk.len
+
+	mut cut := 0
+	for {
+		if cut + 512 > d.pending {
+			// after sending all complete blocks move the remaining not sent bytes
+			// to the start of the reused buffer to be prepended before next chunk.
+			// Byte `i` goes to `i - cut`: the first leftover byte lands on index 0.
+			for i := cut; i < d.pending; i++ {
+				d.buffer[i - cut] = d.buffer[i]
+			}
+			d.pending -= cut
+			return .continue
+		}
+
+		// send a complete block
+		block := d.buffer[cut..cut + 512]
+		cut += 512
+		d.result = d.read_block_fn(block) or {
+			// NOTE: any error of the block reader (e.g. a checksum error) ends up
+			// here and is reported as .overflow
+			assert false, 'Should not occur buffer overflow'
+			return .overflow
+		}
+		match d.result {
+			.continue {
+				// try next cut or leave a remaining
+			}
+			else {
+				break // untar error or stop_early
+			}
+		}
+	}
+	return d.result
+}
diff --git a/vlib/archive/tar/reader_test.v b/vlib/archive/tar/reader_test.v new file mode 100644 index 000000000..fcf24b3a7 --- /dev/null +++ b/vlib/archive/tar/reader_test.v @@ -0,0 +1,157 @@ +module tar + +import os + +fn testsuite_begin() { + os.chdir(@VMODROOT) or {} +} + +const testdata = 'vlib/archive/tar/testdata' + +// files copied from golang: https://github.com/golang/go/blob/master/src/archive/tar/testdata/file-and-dir.tar +fn test_golang_testdata() { + // [ ] dir | 0 bytes | folder + // [ ] small.txt | 5 bytes | file + r1 := new_test_reader('file-and-dir.tar', false)! + assert r1.dirs[0] == 'dir/' + assert r1.files['small.txt'] == 5 + assert r1.data['small.txt'] == 'Kilts'.bytes() + assert r1.other[0] == 'block:4 special:blank_1 continue' + assert r1.other[1] == 'block:5 special:blank_2 end_archive' + + // [ ] small.txt | 5 bytes | file + // [ ] small2.txt | 11 bytes | file + r2 := new_test_reader('gnu.tar', false)!
+ assert r2.dirs.len == 0 + assert r2.files['small.txt'] == 5 + assert r2.files['small2.txt'] == 11 + assert r2.data['small.txt'] == 'Kilts'.bytes() + assert r2.data['small2.txt'] == 'Google.com\n'.bytes() + + // [ ] h1bye | 0 bytes + r3 := new_test_reader('gnu-not-utf8.tar', false)! + r3_filename := [u8(`h`), `i`, 0x80, 0x81, 0x82, 0x83, `b`, `y`, `e`].bytestr() + r3_file_len := r3.files[r3_filename] or { assert false, 'file not found: ${r3_filename}' } + assert r3_file_len == 0 + assert r3.other.len == 2 + + // [ ] 0123456789 | 0 bytes + r4 := new_test_reader('gnu-long-nul.tar', false)! + assert r4.dirs.len == 0 + r4_filename := '0123456789' + r4_file_len := r4.files[r4_filename] or { + assert false, 'file ${r4_filename} not found in ${r4.files.keys()}' + } + assert r4_file_len == 0 + assert r4.other[0] == 'block:1 special:long_name size:161' + assert r4.other[1] == 'block:2 special:long_name data_part:161' + + // [ ] ☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹ | 0 bytes + r5 := new_test_reader('gnu-utf8.tar', false)! + r5_filename := '☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹' + r5_file_len := r5.files[r5_filename] or { assert false, 'file not found: ${r5_filename}' } + assert r5_file_len == 0 + assert r5.other[0] == 'block:1 special:long_name size:163' + assert r5.other[1] == 'block:2 special:long_name data_part:163' +} + +fn test_long_long_short() { + // test long path (human) substitute another long path (chimp) then a normal path (cat) + r1 := new_test_reader_gz('life.tar.gz', false)! 
+ + mammal := 'life/Animalia/Chordata/Mammalia' + human := '${mammal}/Primates_Haplorhini_Simiiformes/Hominidae_Homininae_Hominini/Homo/Homo sapiens.txt' + chimp := '${mammal}/Primates_Haplorhini_Simiiformes/Hominidae_Homininae_Hominini/Pan/Pan troglodytes.txt' + cat := '${mammal}/Carnivora_Feliformia/Felidae_Felinae/Felis/Felis catus.txt' + assert human.len > 100 + assert chimp.len > 100 + assert cat.len <= 100 + assert r1.files[human] == 35 + assert r1.files[chimp] == 40 + assert r1.files[cat] == 33 + assert r1.texts[human] == 'https://en.wikipedia.org/wiki/Human' + assert r1.texts[chimp] == 'https://en.wikipedia.org/wiki/Chimpanzee' + assert r1.texts[cat] == 'https://en.wikipedia.org/wiki/Cat' +} + +struct TestReader { + debug bool +mut: + dirs []string + files map[string]u64 + data map[string][]u8 + texts map[string]string + other []string + + last_file string + last_data []u8 +} + +// new_test_reader reads files *.tar +fn new_test_reader(tar_file string, debug bool) !&TestReader { + mut reader := &TestReader{ + debug: debug + } + mut untar := Untar{ + reader: reader + } + all_blocks := os.read_bytes('${testdata}/${tar_file}')! + untar.read_all_blocks(all_blocks)! + return reader +} + +// new_test_reader_gz reads files *.tar.gz +fn new_test_reader_gz(tar_gz_file string, debug bool) !&TestReader { + mut reader := &TestReader{ + debug: debug + } + mut untar := Untar{ + reader: reader + } + mut decompressor := new_decompresor(untar) + tar_gz := os.read_bytes('${testdata}/${tar_gz_file}')! + decompressor.read_all(tar_gz)! 
+ + return reader +} + +fn (mut t TestReader) dir_block(mut read Read, size u64) { + t.dirs << read.get_path() + if t.debug { + println('DIR #${read.get_block_number()} ${read.get_path()}') + } +} + +fn (mut t TestReader) file_block(mut read Read, size u64) { + t.last_file = read.get_path() + t.files[t.last_file] = size + if t.debug { + println('FILE #${read.get_block_number()} ${read.get_path()}') + } +} + +fn (mut t TestReader) data_block(mut read Read, data []u8, pending int) { + path := read.get_path() + if t.debug { + println('DATA #${read.get_block_number()} ${path}') + } + if t.last_file == path { + t.last_data << data + if pending == 0 { + t.data[t.last_file] = t.last_data.clone() + t.texts[path] = t.last_data.bytestr() + if t.debug { + println('TEXT #${read.get_block_number()} ${t.last_data.bytestr()}') + } + t.last_file = '' + t.last_data.clear() + } + } +} + +fn (mut t TestReader) other_block(mut read Read, details string) { + t.other << 'block:${read.block_number} special:${read.special} ${details}' + if t.debug { + println('OTHER #${read.get_block_number()} special:${read.special} ${details}') + } +} diff --git a/vlib/archive/tar/tar.v b/vlib/archive/tar/tar.v new file mode 100644 index 000000000..ee464995d --- /dev/null +++ b/vlib/archive/tar/tar.v @@ -0,0 +1,43 @@ +module tar + +// ustart header block octets +// Field | Offset | Length +// -------------------------- +// name | 0 | 100 +// mode | 100 | 8 +// uid | 108 | 8 +// gid | 116 | 8 +// size | 124 | 12 +// mtime | 136 | 12 +// chksum | 148 | 8 +// typeflag | 156 | 1 +// linkname | 157 | 100 +// magic | 257 | 6 +// version | 263 | 2 +// uname | 265 | 32 +// gname | 297 | 32 +// devmajor | 329 | 8 +// devminor | 337 | 8 +// prefix | 345 | 155 + +pub enum BlockHeader as u8 { + file = u8(`0`) // 0x30 + hard_link = u8(`1`) // 0x31 + sym_link = u8(`2`) // 0x32 + char_dev = u8(`3`) // 0x33 + block_dev = u8(`4`) // 0x34 + dir = u8(`5`) // 0x35 + fifo = u8(`6`) // 0x36 + long_name = u8(`L`) // 0x4c = 76 
dec + global = u8(`g`) // 0x67 pax +} + +pub enum BlockSpecial { + no // for headers `0`,`5` or data blocks + blank_1 // first blank block: continue + blank_2 // second blank block: end of archiv + ignore // for headers `1`, `2`, `3`, `4`, `6` + long_name // for header `L` + global // for header `g` + unknown // for not header defined +} diff --git a/vlib/archive/tar/testdata/file-and-dir.tar b/vlib/archive/tar/testdata/file-and-dir.tar new file mode 100644 index 0000000000000000000000000000000000000000..c18d4283e38097edf81165094a4635a473693e94 GIT binary patch literal 2560 zcmXTUP0Y#BE2$`9pdB!P00ta1P}meE2U5adU}$JyY|NlwKszTPOfD@hNh|_dj}Re` zh3+}>Org6j@64Q%V!GQpYT;lBft1W5eOkK(J!_!xL0r`QZ(we2V#J_eN^6IK%pc+T SpB_OxYR6~@jE2DA3;_VVvLvAZ literal 0 HcmV?d00001 diff --git a/vlib/archive/tar/testdata/gnu-long-nul.tar b/vlib/archive/tar/testdata/gnu-long-nul.tar new file mode 100644 index 0000000000000000000000000000000000000000..28bc812aa60e81ea324297c81c738486acffc09c GIT binary patch literal 2560 zcmdPX*VA|K$9U5Ztq0(Jv@FdDKtv;8zq|ijXv5BIw^6DOh^2UK)_HbK1>@VQ0c5`qsHR zJrb1zXEcV16&lMRW}rdtKd=NSXg->JkvP|Esp4`g&CK_h+FMmo7oR?iU7RP&svn2t z!9Ke4)upeR_aRYKtT+(g`B!B>fS>t?p7IZ9V9LKWlK+)w+iY|SVQ_tY3I4Ddrx1w) M;($0H4*b6ZFWOBnBLDyZ literal 0 HcmV?d00001 diff --git a/vlib/archive/tar/testdata/life.tar.gz b/vlib/archive/tar/testdata/life.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..8abb599883cf1fbd64b34f5fddb65e0ac427fc12 GIT binary patch literal 626 zcmV-&0*(D2iwFp~xrS%}18iw#WiE7Kascg`?QYsI6ozvtxdLK4{*d--6Vk*FCRN%6 zAZw)HHHlFY_I33;&O}!_l{6jkpnD$)Y?`P99-q@UNlSNIWXZHJ!Wb1sZWMbs9!R)i zl1s$}HwokY?x~vixO(<&XPY!lKKo`WX%? 
zu)ESXTUWc)M^`y_>zk@*pS3>5)W6}>|AiD*8jp+^rN*cR=;NT@0mL+j%WK< zxuh15uv{<(|52FKKTUi##y`_W^!^RY;XekmY?i%U`g*f;_1#485d>=~556CdjO+Yp zEile?@V}hsc#b&uZ@lkL?KiajN}0a?!&BPYoh#~g)_r@8U*Z)L!+&4@T0Y(X>74)m z@4wd6|3d&sQWoP2_|Nn+a literal 0 HcmV?d00001 diff --git a/vlib/archive/tar/untar.v b/vlib/archive/tar/untar.v new file mode 100644 index 000000000..fcefd7957 --- /dev/null +++ b/vlib/archive/tar/untar.v @@ -0,0 +1,290 @@ +module tar + +// Untar uses a reader to parse the contents of a unix tar file. +// Reuses a fixed array of 512 bytes to parse each TAR block. +@[heap] +pub struct Untar { +mut: + reader Reader + max_blocks int + buffer [512]u8 // data to parse block + read Read // last read to send/receive to/from reader implementation + + state State // true when reading data blocks or long names + size int // remaining data size during state_data + + long_path &LongPath = unsafe { nil } // not nil to hold a file long_name + + blank_block int = -1 // last no-data block with all-zeros +} + +enum State { + header + data + long_path +} + +// new_untar builds a untar with a given Reader. +pub fn new_untar(reader Reader) &Untar { + return &Untar{ + reader: reader + } +} + +// str returns a string representation with max_blocks and last read. +pub fn (u Untar) str() string { + return 'max_blocks:${u.max_blocks} last_read:${u.read}' +} + +// read_all_blocks parses the data blocks of any decompressed *.tar.gz array. +// The data blocks length must be divisible by 512. +pub fn (mut u Untar) read_all_blocks(blocks []u8) !ReadResult { + if blocks.len % 512 != 0 { + return error('data_blocks size is not a multiple of 512') + } + u.max_blocks = blocks.len / 512 + for i := 0; i < blocks.len; i += 512 { + result := u.read_single_block(blocks[i..i + 512])! + if result != .continue { + return result + } + } + return .end_of_file +} + +// read_single_block parses one data block at a time. +// The data block length must be 512. 
Two consecutive no data blocks
+// of 512 zeroes return an .end_archive result.
+pub fn (mut u Untar) read_single_block(block []u8) !ReadResult {
+	if block.len != 512 {
+		return error('data_block size is not 512')
+	}
+	u.read.block_number++ // 1,2,3...
+
+	// copy the block into the reused buffer and detect an all-zero block on the way
+	mut is_blank_block := true
+	for i in 0 .. 512 {
+		u.buffer[i] = block[i]
+		if block[i] != 0 {
+			is_blank_block = false
+		}
+	}
+	match u.state {
+		.header {
+			if is_blank_block {
+				// current non-data block is a blank block
+				prev_block := u.read.block_number - 1
+				result := if u.blank_block == prev_block {
+					// two consecutive blank blocks
+					u.read.special = .blank_2
+					ReadResult.end_archive
+				} else {
+					// first blank block
+					u.read.special = .blank_1
+					ReadResult.continue
+				}
+				u.read.path_len = 0
+				u.reader.other_block(mut u.read, '${result}')
+				u.blank_block = u.read.block_number
+				return result
+			}
+			u.read_header()!
+		}
+		.data {
+			u.read_data()
+		}
+		.long_path {
+			u.read_long_path()
+		}
+	}
+	return if u.read.stop_early {
+		.stop_early
+	} else {
+		.continue
+	}
+}
+
+// read_header parses the header block held in the buffer: it reads the entry size and
+// the type flag, notifies the Reader and selects the state for the blocks that follow.
+// A long name collected from a preceding `L` block names the next entry only: it is
+// released as soon as that entry is complete, so it can not leak into later entries.
+// Returns an error when the checksum of a directory or file header does not match.
+fn (mut u Untar) read_header() ! {
+	u.size = int(u.extract_octal(124, 12))
+	header := u.buffer[156] // pos 0x9c
+	block_header := BlockHeader.from(header) or {
+		u.read.special = .unknown
+		u.read.path_len = 0
+		u.reader.other_block(mut u.read, 'size:${u.size}')
+		u.clear_long_path()
+		return
+	}
+	match block_header {
+		.dir {
+			if !u.checksum_ok() {
+				return error('Checksum error: directory reading:${u.read}')
+			}
+			u.read.special = .no
+			u.set_entry_path(false)
+			u.reader.dir_block(mut u.read, u64(u.size))
+			// no data block follows a directory, its long name is done here
+			u.clear_long_path()
+		}
+		.file {
+			if !u.checksum_ok() {
+				return error('Checksum error file reading:${u.read}')
+			}
+			u.read.special = .no
+			u.set_entry_path(true)
+			u.reader.file_block(mut u.read, u64(u.size))
+			if u.size > 0 {
+				// read_data releases the long name after the last data block
+				u.state = .data
+			} else {
+				// an empty file has no data block, release the long name here
+				u.clear_long_path()
+			}
+		}
+		.long_name {
+			u.read.special = .long_name
+			u.reader.other_block(mut u.read, 'size:${u.size}')
+			if u.size > 0 {
+				u.state = .long_path
+				u.long_path = new_long_path(u.size)
+			}
+		}
+		.hard_link, .sym_link, .char_dev, .block_dev, .fifo {
+			u.read.special = .ignore
+			u.reader.other_block(mut u.read, block_header.str())
+			// a long name that preceded an ignored entry must not name the next one
+			u.clear_long_path()
+		}
+		.global {
+			u.read.special = .global
+			u.read.set_short_path(u.buffer, false)
+			u.reader.other_block(mut u.read, 'size:${u.size}')
+			if u.size > 0 {
+				u.state = .data
+			}
+		}
+	}
+}
+
+// set_entry_path sets the path of the current directory or file header in the read:
+// the long name collected from a preceding `L` block when there is one, otherwise
+// the short `prefix` and `name` fields of the header block itself.
+fn (mut u Untar) set_entry_path(separator_after_prefix bool) {
+	if u.long_path != unsafe { nil } {
+		u.read.set_long_path(u.long_path)
+	} else {
+		u.read.set_short_path(u.buffer, separator_after_prefix)
+	}
+}
+
+// clear_long_path releases the long name in the untar and in the read.
+fn (mut u Untar) clear_long_path() {
+	u.long_path = unsafe { nil }
+	u.read.long_path = unsafe { nil }
+}
+
+// read_data calls Reader.data_block for implementor to collect data parts as file content.
+// `u.size` holds the bytes still expected; after the last part the long name is released
+// and the state goes back to header.
+fn (mut u Untar) read_data() {
+	if u.size > 0 {
+		part := if u.size > 512 { 512 } else { u.size }
+		u.size -= 512
+		pending := if u.size > 0 { u.size } else { 0 }
+		data_part := u.buffer[0..part]
+		u.reader.data_block(mut u.read, data_part, pending)
+	}
+	if u.size <= 0 {
+		u.long_path = unsafe { nil }
+		u.read.long_path = unsafe { nil } // real clear
+		u.state = .header
+	}
+}
+
+// read_long_path appends the current block to the long name announced by an `L` header
+// and reports it to Reader.other_block. After the last part the state goes back to header.
+fn (mut u Untar) read_long_path() {
+	if u.size > 0 {
+		part := if u.size > 512 { 512 } else { u.size }
+		u.size -= 512
+		data_part := u.buffer[0..part]
+		if u.long_path != unsafe { nil } {
+			// this long path field collects the data parts as file long name
+			u.long_path.append(data_part)
+			u.reader.other_block(mut u.read, 'data_part:${data_part.len}')
+		}
+	}
+	if u.size <= 0 {
+		u.state = .header
+	}
+}
+
+// extract_path returns the bytes of the buffer from position 0 up to the first zero
+// byte (or the end of the buffer) as a string.
+// NOTE(review): no caller of this method is visible in this patch.
+fn (mut u Untar) extract_path() string {
+	mut name := []u8{}
+	mut i := 0
+	for {
+		if i >= u.buffer.len {
+			break
+		}
+		letter := u.buffer[i]
+		if letter == 0 {
+			break
+		}
+		name << letter
+		i++
+	}
+	return name.bytestr()
+}
+
+// checksum_ok verifies the validity for dir and files blocks: it adds up the 512 bytes
+// of the buffer, counting the 8 bytes of the checksum field (148 to 155) as spaces, and
+// compares the sum with the octal number stored in that field.
+fn (mut u Untar) checksum_ok() bool {
+	mut v := u64(0)
+	for n := 0; n < 512; n++ {
+		if n < 148 || n > 155 {
+			v += u.buffer[n]
+		} else {
+			v += 0x20
+		}
+	}
+	parse := u.extract_octal(148, 8)
+	return v == parse
+}
+
+// extract_octal reads an octal number at block position `pos` with a given number of `digits`.
+// Leading bytes that are not octal digits are skipped; the first non octal byte after the
+// digits ends the number. Returns 0 when the field has no octal digit.
+fn (mut u Untar) extract_octal(pos int, digits int) u64 {
+	mut i := u64(0)
+	mut p := pos
+	mut n := digits
+	// skip the leading non octal bytes
+	for {
+		if (u.buffer[p] < `0` || u.buffer[p] > `7`) && n > 0 {
+			p++
+			n--
+		} else {
+			break
+		}
+	}
+	// accumulate the octal digits
+	for {
+		if u.buffer[p] >= `0` && u.buffer[p] <= `7` && n > 0 {
+			i *= 8
+			i += u8(u.buffer[p] - `0`)
+			p++
+			n--
+		} else {
+			break
+		}
+	}
+	return i
+}
+
+// LongPath collects the name announced by a long name block.
+@[heap]
+struct LongPath {
+mut:
+	name     []u8 // zero filled, sized by new_long_path
+	last_pos int  // position in name where the next part is appended
+}
+
+// new_long_path builds a LongPath with a fixed maximum name size
+fn new_long_path(size int) &LongPath {
+	return &LongPath{
+		name: []u8{len: size}
+	}
+}
+
+// append copies the data to the name, starting at the last position.
+// Data that does not fit in the remaining space of name is ignored.
+fn (mut l LongPath) append(data []u8) {
+	if l.name.len >= l.last_pos + data.len {
+		for i, d in data {
+			l.name[l.last_pos + i] = d
+		}
+		l.last_pos += data.len
+	}
+}
+
+// get_path returns the string from name appended as C string.
+fn (l LongPath) get_path() string {
+	mut s := []u8{}
+	for n in l.name {
+		if n == 0 {
+			break
+		}
+		s << n
+	}
+	return s.bytestr()
+}
-- 2.39.5