v/vlib/encoding/csv/reader.v

// Copyright (c) 2019-2023 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module csv

// Once interfaces are further along, the idea is to have something similar to
// Go's io.Reader & bufio.Reader rather than reading the whole file into a string;
// Reader would then satisfy that interface. It is designed to be easily adapted.

// CommentIsDelimiterError is returned by read_record when the Reader's comment
// byte is the same as its delimiter byte.
struct CommentIsDelimiterError {
	Error
}

// msg returns the fixed, human readable description of the error.
fn (e CommentIsDelimiterError) msg() string {
	return 'encoding.csv: comment cannot be the same as delimiter'
}

// InvalidDelimiterError is returned by read_record when the configured
// delimiter is rejected by valid_delim, or when a record cannot be split
// into fields.
struct InvalidDelimiterError {
	Error
}

// msg returns the fixed, human readable description of the error.
fn (e InvalidDelimiterError) msg() string {
	return 'encoding.csv: invalid delimiter'
}

// EndOfFileError is returned by read_line once the whole input has been consumed.
struct EndOfFileError {
	Error
}

// msg returns the fixed, human readable description of the error.
fn (e EndOfFileError) msg() string {
	return 'encoding.csv: end of file'
}

// InvalidLineEndingError is returned by read_line when the input contains
// neither a `\n` nor a `\r` line terminator.
struct InvalidLineEndingError {
	Error
}

// msg returns the fixed, human readable description of the error.
fn (e InvalidLineEndingError) msg() string {
	return 'encoding.csv: could not find any valid line endings'
}

// Reader holds the state needed to parse CSV records out of an in-memory string.
struct Reader {
	// not used yet
	// has_header        bool
	// headings          []string
	// data is the complete CSV text that is being parsed
	data      string
	// delimiter is the byte that separates the fields of a record
	delimiter u8
	// comment marks comment lines: read_record skips a line whose first byte
	// equals it (unless the line is part of a quoted field that spans lines)
	comment   u8
mut:
	// is_mac_pre_osx_le is set by read_line when the data contains no `\n`
	// and lines are terminated by a bare `\r` instead
	is_mac_pre_osx_le bool
	// row_pos is the byte offset into `data` at which the next line starts
	row_pos           int
}

// ReaderConfig holds the optional settings that can be passed to new_reader.
[params]
pub struct ReaderConfig {
	// delimiter is the field separator; it defaults to a comma
	delimiter u8 = `,`
	// comment is the byte that starts a comment line; it defaults to `#`
	comment   u8 = `#`
}

// new_reader creates a Reader that parses the CSV text in `data`.
// The delimiter and comment bytes are taken from `config`, so both can be
// customised by the caller; without overrides they are `,` and `#`.
pub fn new_reader(data string, config ReaderConfig) &Reader {
	reader := &Reader{
		delimiter: config.delimiter
		comment: config.comment
		data: data
	}
	return reader
}

// read parses the next row of the CSV data.
// On success it returns one string per column of that row; any error raised
// while reading the record is passed on to the caller unchanged.
pub fn (mut r Reader) read() ![]string {
	return r.read_record()!
}

// Once we have multi dimensional array
// pub fn (mut r Reader) read_all() ?[][]string {
// 	mut records := []string{}
// 	for {
// 		record := r.read_record() or {
// 			if err.error == err_eof.error {
// 				return records
// 			} else {
// 				return err
// 			}
// 		}
// 		records << record
// 	}
// 	return records
// }
// read_line returns the next physical line of `r.data` without its line
// terminator and advances `r.row_pos` to the start of the following line.
// Lines are terminated by `\n`; a `\r` directly before it is stripped, so
// `\r\n` input works as well. When the data contains no `\n` at all but does
// contain `\r`, the reader switches to bare `\r` terminators (pre OS X Mac).
// A final line that has no terminator is returned in full.
// It returns EndOfFileError once all data has been consumed, and
// InvalidLineEndingError when the data contains no line terminator of any kind.
fn (mut r Reader) read_line() !string {
	// `>=` rather than `==`: after an unterminated final line row_pos ends up
	// one past r.data.len
	if r.row_pos >= r.data.len {
		return &EndOfFileError{}
	}
	le := if r.is_mac_pre_osx_le { '\r' } else { '\n' }
	mut i := r.data.index_after(le, r.row_pos)
	if i == -1 {
		if r.row_pos == 0 {
			// check for pre osx mac line endings
			i = r.data.index_after('\r', r.row_pos)
			if i != -1 {
				r.is_mac_pre_osx_le = true
			} else {
				// no valid line endings found
				return &InvalidLineEndingError{}
			}
		} else {
			// The last line has no terminator, so it runs up to the end of the
			// data. The slice below excludes index i, hence len and not len - 1,
			// which would cut off the final byte of the line.
			i = r.data.len
		}
	}
	mut line := r.data[r.row_pos..i]
	r.row_pos = i + 1
	// normalize win line endings (remove extra \r)
	if !r.is_mac_pre_osx_le && (line.len >= 1 && line[line.len - 1] == `\r`) {
		line = line[..line.len - 1]
	}
	return line
}

// read_record parses the next CSV record and returns its fields.
// Empty lines and lines that start with the comment byte are skipped, unless
// they occur inside a quoted field that spans several lines. Inside a quoted
// field a doubled quote (`""`) stands for one literal quote.
// It returns CommentIsDelimiterError when the comment and delimiter bytes are
// equal, InvalidDelimiterError when the delimiter is rejected by valid_delim or
// the first field of a record is a quoted field that is not followed by the
// delimiter, and it passes on every error returned by read_line.
fn (mut r Reader) read_record() ![]string {
	if r.delimiter == r.comment {
		return &CommentIsDelimiterError{}
	}
	if !valid_delim(r.delimiter) {
		return &InvalidDelimiterError{}
	}
	// need_read: another physical line has to be fetched before parsing goes on
	mut need_read := true
	// keep_raw: the line being fetched continues an unterminated quoted field,
	// so line breaks, empty lines and comment lines belong to the field
	mut keep_raw := false
	// line: the part of the current record that has not been consumed yet
	mut line := ''
	mut fields := []string{}
	// i: index of the delimiter that ended the most recent unquoted field,
	// -1 as long as no such delimiter was found
	mut i := -1
	for {
		if need_read {
			l := r.read_line()!
			if l.len <= 0 {
				// empty line: kept as a line break inside a quoted field, skipped otherwise
				if keep_raw {
					line += '\n'
				}
				continue
			} else if l[0] == r.comment {
				// comment line: kept verbatim inside a quoted field, skipped otherwise
				if keep_raw {
					line += '\n' + l
				}
				continue
			} else {
				// regular line: when it continues a quoted field, restore the
				// line break that read_line removed before appending it
				if keep_raw {
					line += '\n'
				}
				line += l
			}
			need_read = false
			keep_raw = false
		}
		if line.len == 0 || line[0] != `"` { // not quoted
			j := line.index(r.delimiter.ascii_str()) or {
				// no further delimiter: the rest of the line is the last field
				fields << line[..line.len]
				break
			}
			i = j
			fields << line[..i]
			line = line[i + 1..]
			continue
		} else { // quoted
			// need_more stays true while no closing quote has been found
			mut need_more := true
			mut has_double_quotes := false
			// j: index of the closing quote, counted after the opening quote is removed
			mut j := 0
			// n starts at 1 in order to skip the opening quote
			mut n := 1
			for n < line.len {
				if line[n] == `"` {
					if n == line.len - 1 || line[n + 1] != `"` {
						// a quote that is not followed by another quote closes the field
						need_more = false
						j = n - 1
						break
					} else {
						// doubled quote: step over both characters
						has_double_quotes = true
						n++
					}
				}
				n++
			}
			if need_more {
				// the field is not closed on this line, append the next one
				need_read = true
				keep_raw = true
				continue
			}
			// drop the opening quote; the closing quote is now at index j
			line = line[1..]
			if j + 1 == line.len {
				// the closing quote ends the line: last field of the record
				fields << if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }
				break
			}
			next := line[j + 1]
			if next == r.delimiter {
				fields << if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }
				if j + 2 == line.len {
					// the delimiter is the last character: an empty field follows
					line = ''
				} else {
					line = line[j + 2..]
				}
				continue
			}
		}
		// Only reached when a closing quote is followed by something else than
		// the delimiter. That is an error if no field was parsed yet; otherwise
		// the loop goes on with the remainder of the line.
		if i <= -1 && fields.len == 0 {
			return &InvalidDelimiterError{}
		}
	}
	return fields
}

// valid_delim reports whether `b` may be used as a field delimiter.
// The NUL byte, the double quote, and the two line ending bytes (`\r`, `\n`)
// are rejected; every other byte is accepted.
fn valid_delim(b u8) bool {
	if b == 0 || b == `"` {
		return false
	}
	if b == `\r` || b == `\n` {
		return false
	}
	return true
}
all: 2023 copyright 2023-03-28 23:55:57 +03:00			`// Copyright (c) 2019-2023 Alexander Medvednikov. All rights reserved.`
encoding.csv module 2019-08-14 09:45:56 +03:00			`// Use of this source code is governed by an MIT license`
			`// that can be found in the LICENSE file.`
encoding.csv: add write support 2019-08-17 15:51:20 +03:00			`module csv`

encoding.csv module 2019-08-14 09:45:56 +03:00			`// Once interfaces are further along the idea would be to have something similar to`
			`// go's io.reader & bufio.reader rather than reading the whole file into string, this`
			`// would then satisfy that interface. I designed it this way to be easily adapted.`
docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 12:54:10 +03:00			`struct CommentIsDelimiterError {`
			`Error`
add custom errors to encoding lib (#9513) 2021-03-30 15:27:26 +03:00			`}`

docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 12:54:10 +03:00			`fn (err CommentIsDelimiterError) msg() string {`
			`return 'encoding.csv: comment cannot be the same as delimiter'`
add custom errors to encoding lib (#9513) 2021-03-30 15:27:26 +03:00			`}`

docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 12:54:10 +03:00			`struct InvalidDelimiterError {`
			`Error`
add custom errors to encoding lib (#9513) 2021-03-30 15:27:26 +03:00			`}`

docs, builtin, encoding.csv: update error implementations (#13440) 2022-02-12 12:54:10 +03:00			`fn (err InvalidDelimiterError) msg() string {`
			`return 'encoding.csv: invalid delimiter'`
			`}`

			`struct EndOfFileError {`
			`Error`
			`}`

			`fn (err EndOfFileError) msg() string {`
			`return 'encoding.csv: end of file'`
			`}`

			`struct InvalidLineEndingError {`
			`Error`
			`}`

			`fn (err InvalidLineEndingError) msg() string {`
			`return 'encoding.csv: could not find any valid line endings'`
add custom errors to encoding lib (#9513) 2021-03-30 15:27:26 +03:00			`}`
encoding.csv module 2019-08-14 09:45:56 +03:00
encoding.csv: re-encapsulate fields in Writer/Reader (fix #15558) (#15570) 2022-08-28 11:13:43 +03:00			`struct Reader {`
encoding.csv module 2019-08-14 09:45:56 +03:00			`// not used yet`
			`// has_header bool`
			`// headings []string`
encoding.csv: re-encapsulate fields in Writer/Reader (fix #15558) (#15570) 2022-08-28 11:13:43 +03:00			`data string`
			`delimiter u8`
			`comment u8`
			`mut:`
encoding.csv module 2019-08-14 09:45:56 +03:00			`is_mac_pre_osx_le bool`
			`row_pos int`
			`}`

encoding.csv: allow passing a custom delimiter to the `new_reader` function (#13910) 2022-04-03 19:13:43 +03:00			`[params]`
			`pub struct ReaderConfig {`
all: ~500 more byte=>u8 2022-04-15 18:25:45 +03:00			delimiter u8 = `,`
			comment u8 = `#`
encoding.csv: allow passing a custom delimiter to the `new_reader` function (#13910) 2022-04-03 19:13:43 +03:00			`}`

			`// new_reader initializes a Reader with string data to parse and,`
			`// optionally, a custom delimiter.`
			`pub fn new_reader(data string, config ReaderConfig) &Reader {`
encoding.csv module 2019-08-14 09:45:56 +03:00			`return &Reader{`
			`data: data`
encoding.csv: allow passing a custom delimiter to the `new_reader` function (#13910) 2022-04-03 19:13:43 +03:00			`delimiter: config.delimiter`
			`comment: config.comment`
encoding.csv module 2019-08-14 09:45:56 +03:00			`}`
			`}`

encoding/csv: improve Reader docs (#6828) 2020-11-14 20:49:36 +03:00			`// read reads a row from the CSV data.`
			`// If successful, the result holds an array of each column's data.`
all: change optional to result in most of the libraries (#16123) 2022-10-20 22:14:33 +03:00			`pub fn (mut r Reader) read() ![]string {`
			`l := r.read_record()!`
encoding.csv module 2019-08-14 09:45:56 +03:00			`return l`
			`}`

			`// Once we have multi dimensional array`
parser: check `(mut f Foo)` syntax 2020-05-17 14:51:18 +03:00			`// pub fn (mut r Reader) read_all() ?[][]string {`
all: update`import ()` and `[]array` 2020-04-26 14:49:31 +03:00			`// mut records := []string{}`
encoding.csv module 2019-08-14 09:45:56 +03:00			`// for {`
			`// record := r.read_record() or {`
all: update repo to use the new error handling syntax (#8950) 2021-02-28 23:20:21 +03:00			`// if err.error == err_eof.error {`
encoding.csv module 2019-08-14 09:45:56 +03:00			`// return records`
			`// } else {`
all: update repo to use the new error handling syntax (#8950) 2021-02-28 23:20:21 +03:00			`// return err`
encoding.csv module 2019-08-14 09:45:56 +03:00			`// }`
			`// }`
			`// records << record`
			`// }`
			`// return records`
			`// }`
all: change optional to result in most of the libraries (#16123) 2022-10-20 22:14:33 +03:00			`fn (mut r Reader) read_line() !string {`
encoding.csv module 2019-08-14 09:45:56 +03:00			`// last record`
			`if r.row_pos == r.data.len {`
encoding.csv: remove unnecessary IError() cast 2022-10-27 11:30:08 +03:00			`return &EndOfFileError{}`
encoding.csv module 2019-08-14 09:45:56 +03:00			`}`
			`le := if r.is_mac_pre_osx_le { '\r' } else { '\n' }`
			`mut i := r.data.index_after(le, r.row_pos)`
			`if i == -1 {`
			`if r.row_pos == 0 {`
			`// check for pre osx mac line endings`
			`i = r.data.index_after('\r', r.row_pos)`
			`if i != -1 {`
			`r.is_mac_pre_osx_le = true`
			`} else {`
			`// no valid line endings found`
encoding.csv: remove unnecessary IError() cast 2022-10-27 11:30:08 +03:00			`return &InvalidLineEndingError{}`
encoding.csv module 2019-08-14 09:45:56 +03:00			`}`
csv: handle missing line ending 2020-04-21 01:02:55 +03:00			`} else {`
			`// No line ending on file`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`i = r.data.len - 1`
encoding.csv module 2019-08-14 09:45:56 +03:00			`}`
			`}`
compiler/vlib: replace substr/left/right with `[start..end]` everywhere 2019-10-27 10:03:15 +03:00			`mut line := r.data[r.row_pos..i]`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`r.row_pos = i + 1`
encoding.csv module 2019-08-14 09:45:56 +03:00			`// normalize win line endings (remove extra \r)`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			if !r.is_mac_pre_osx_le && (line.len >= 1 && line[line.len - 1] == `\r`) {
			`line = line[..line.len - 1]`
encoding.csv module 2019-08-14 09:45:56 +03:00			`}`
			`return line`
			`}`

all: change optional to result in most of the libraries (#16123) 2022-10-20 22:14:33 +03:00			`fn (mut r Reader) read_record() ![]string {`
encoding.csv module 2019-08-14 09:45:56 +03:00			`if r.delimiter == r.comment {`
encoding.csv: remove unnecessary IError() cast 2022-10-27 11:30:08 +03:00			`return &CommentIsDelimiterError{}`
encoding.csv module 2019-08-14 09:45:56 +03:00			`}`
encoding.csv: add write support 2019-08-17 15:51:20 +03:00			`if !valid_delim(r.delimiter) {`
encoding.csv: remove unnecessary IError() cast 2022-10-27 11:30:08 +03:00			`return &InvalidDelimiterError{}`
encoding.csv: add write support 2019-08-17 15:51:20 +03:00			`}`
csv: fix field multiple lines error 2020-05-10 15:19:26 +03:00			`mut need_read := true`
			`mut keep_raw := false`
encoding.csv module 2019-08-14 09:45:56 +03:00			`mut line := ''`
all: update`import ()` and `[]array` 2020-04-26 14:49:31 +03:00			`mut fields := []string{}`
encoding.csv module 2019-08-14 09:45:56 +03:00			`mut i := -1`
			`for {`
csv: fix field multiple lines error 2020-05-10 15:19:26 +03:00			`if need_read {`
all: change optional to result in most of the libraries (#16123) 2022-10-20 22:14:33 +03:00			`l := r.read_line()!`
csv: fix field multiple lines error 2020-05-10 15:19:26 +03:00			`if l.len <= 0 {`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`if keep_raw {`
			`line += '\n'`
			`}`
csv: fix field multiple lines error 2020-05-10 15:19:26 +03:00			`continue`
			`} else if l[0] == r.comment {`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`if keep_raw {`
			`line += '\n' + l`
			`}`
csv: fix field multiple lines error 2020-05-10 15:19:26 +03:00			`continue`
			`} else {`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`if keep_raw {`
			`line += '\n'`
			`}`
csv: fix field multiple lines error 2020-05-10 15:19:26 +03:00			`line += l`
			`}`
			`need_read = false`
			`keep_raw = false`
			`}`
csv: fix parse error of last empty field on unquoted line (#10083) 2021-05-13 17:51:07 +03:00			if line.len == 0 \|\| line[0] != `"` { // not quoted
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`j := line.index(r.delimiter.ascii_str()) or {`
encoding.csv module 2019-08-14 09:45:56 +03:00			`// last`
csv: fix missing last column 2020-04-20 22:49:05 +03:00			`fields << line[..line.len]`
encoding.csv module 2019-08-14 09:45:56 +03:00			`break`
			`}`
cgen: fix returning optional consts; fix csv test 2020-04-08 18:21:36 +03:00			`i = j`
compiler/vlib: replace substr/left/right with `[start..end]` everywhere 2019-10-27 10:03:15 +03:00			`fields << line[..i]`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`line = line[i + 1..]`
encoding.csv module 2019-08-14 09:45:56 +03:00			`continue`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`} else { // quoted`
csv: fix csv fields with double quotes (#10399) 2021-06-10 19:24:20 +03:00			`mut need_more := true`
			`mut has_double_quotes := false`
			`mut j := 0`
			`mut n := 1`
			`for n < line.len {`
			if line[n] == `"` {
			if n == line.len - 1 \|\| line[n + 1] != `"` {
			`need_more = false`
			`j = n - 1`
			`break`
			`} else {`
			`has_double_quotes = true`
			`n++`
			`}`
			`}`
			`n++`
			`}`
			`if need_more {`
csv: fix field multiple lines error 2020-05-10 15:19:26 +03:00			`need_read = true`
			`keep_raw = true`
			`continue`
csv: fix last-field-empty error 2020-04-29 17:50:02 +03:00			`}`
csv: fix field multiple lines error 2020-05-10 15:19:26 +03:00			`line = line[1..]`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`if j + 1 == line.len {`
csv: fix last-field-empty error 2020-04-29 17:50:02 +03:00			`// last record`
csv: fix csv fields with double quotes (#10399) 2021-06-10 19:24:20 +03:00			`fields << if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }`
csv: fix last-field-empty error 2020-04-29 17:50:02 +03:00			`break`
			`}`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			`next := line[j + 1]`
csv: fix last-field-empty error 2020-04-29 17:50:02 +03:00			`if next == r.delimiter {`
csv: fix csv fields with double quotes (#10399) 2021-06-10 19:24:20 +03:00			`fields << if has_double_quotes { line[..j].replace('""', '"') } else { line[..j] }`
csv: fix error of read() (#9193) 2021-03-08 17:57:02 +03:00			`if j + 2 == line.len {`
csv: fix parse error of last empty field on unquoted line (#10083) 2021-05-13 17:51:07 +03:00			`line = ''`
			`} else {`
			`line = line[j + 2..]`
csv: fix error of read() (#9193) 2021-03-08 17:57:02 +03:00			`}`
csv: fix last-field-empty error 2020-04-29 17:50:02 +03:00			`continue`
encoding.csv module 2019-08-14 09:45:56 +03:00			`}`
			`}`
			`if i <= -1 && fields.len == 0 {`
encoding.csv: remove unnecessary IError() cast 2022-10-27 11:30:08 +03:00			`return &InvalidDelimiterError{}`
encoding.csv module 2019-08-14 09:45:56 +03:00			`}`
			`}`
			`return fields`
			`}`
encoding.csv: add write support 2019-08-17 15:51:20 +03:00
all: ~500 more byte=>u8 2022-04-15 18:25:45 +03:00			`fn valid_delim(b u8) bool {`
all: byte.str() => byte.ascii_str() 2021-01-05 21:14:35 +03:00			return b != 0 && b != `"` && b != `\r` && b != `\n`
encoding.csv: add write support 2019-08-17 15:51:20 +03:00			`}`