2020-08-09 05:13:34 +03:00
|
|
|
module html
|
|
|
|
|
|
|
|
import os
|
2020-12-09 22:08:15 +03:00
|
|
|
import strings
|
2020-08-09 05:13:34 +03:00
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// LexicalAttributes groups the mutable scanner state that `Parser` carries
// from one input byte to the next.
struct LexicalAttributes {
mut:
	// tag currently being assembled by the scanner
	current_tag &Tag
	// true between a `<` and the `>` that ends the tag
	open_tag bool
	// true while the content of a code tag (see `code_tags`) is being skipped over
	open_code bool
	// 0 when not inside a quoted string, 1 inside "...", 2 inside '...'
	open_string int
	// true while inside a `<!-- ... -->` comment
	open_comment bool
	is_attribute bool
	// name of the code tag whose closing tag is being awaited
	opened_code_type string
	// incremented once per line by `parse_html`
	line_count int
	// accumulates the bytes of the lexeme currently being read
	lexeme_builder strings.Builder = strings.new_builder(100)
	// tag names whose content is kept verbatim instead of being parsed
	code_tags map[string]bool = {
		'script': true
		'style':  true
	}
}
|
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// Parser reads HTML text, turns it into a flat list of tags and builds a
// `DocumentObjectModel` from that list.
pub struct Parser {
mut:
	// document model filled in by `parse_html` / `get_dom`
	dom DocumentObjectModel
	// scanner state shared between successive `split_parse` calls
	lexical_attributes LexicalAttributes = LexicalAttributes{
		current_tag: &Tag{}
	}
	filename string = 'direct-parse'
	// set by `init` so that later calls to it do nothing
	initialized bool
	// tags collected so far, in the order they were generated
	tags []&Tag
	// destination used by `print_debug`
	debug_file os.File
}
|
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// add_code_tag registers `name` as a code tag, i.e. a tag whose content the
// parser skips over instead of parsing.
// For example, after `add_code_tag('script')` the content of every `script`
// tag is kept as-is, so a `<` or `>` inside it does not confuse the parser.
// An empty `name` is ignored.
pub fn (mut parser Parser) add_code_tag(name string) {
	if name.len > 0 {
		parser.lexical_attributes.code_tags[name] = true
	}
}
|
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// builder_str returns everything currently held in the lexeme builder as a string.
[inline]
fn (parser Parser) builder_str() string {
	pending := parser.lexical_attributes.lexeme_builder.after(0)
	return pending
}
|
|
|
|
|
|
|
|
// print_debug writes a non-empty `data` line to the parser's debug file.
// It is only active in debug builds and panics if the write fails.
[if debug]
fn (mut parser Parser) print_debug(data string) {
	$if debug {
		if data.len == 0 {
			return
		}
		parser.debug_file.writeln(data) or { panic(err) }
	}
}
|
|
|
|
|
|
|
|
// verify_end_comment reports whether the buffered lexeme ends with `--`,
// i.e. whether a following `>` would terminate an HTML comment.
// When `remove` is true and the marker is present, the two trailing dashes
// are dropped from the lexeme builder.
fn (mut parser Parser) verify_end_comment(remove bool) bool {
	lexeme := parser.builder_str()
	// `ends_with` is simply false for a lexeme shorter than two bytes, whereas
	// indexing `lexeme[lexeme.len - 2]` would panic on such input.
	is_end_comment := lexeme.ends_with('--')
	if is_end_comment && remove {
		parser.lexical_attributes.lexeme_builder.go_back(2)
	}
	return is_end_comment
}
|
|
|
|
|
|
|
|
// blank_string returns true when `data` contains nothing but spaces and tabs.
// An empty string counts as blank.
fn blank_string(data string) bool {
	for b in data {
		// 9 is a horizontal tab, 32 is a space; any other byte makes the string non-blank
		if b != 9 && b != 32 {
			return false
		}
	}
	return true
}
|
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// init prepares the parser for its first use: it resets the tag list and the
// tag being assembled, creates a fresh document model and marks the parser
// as initialized. Later calls return immediately.
fn (mut parser Parser) init() {
	if parser.initialized {
		return
	}
	parser.tags = []&Tag{}
	parser.lexical_attributes.current_tag = &Tag{}
	parser.dom = DocumentObjectModel{
		debug_file: parser.debug_file
		root: &Tag{}
	}
	parser.dom.close_tags['/!document'] = true
	// NOTE: `add_code_tag` returns early on an empty name, so this call registers nothing.
	parser.add_code_tag('')
	parser.initialized = true
}
|
|
|
|
|
|
|
|
// generate_tag finishes the tag being assembled: it is appended to
// `parser.tags` when it has a name or content, and a fresh empty tag takes
// its place. Nothing happens while a tag is still open.
fn (mut parser Parser) generate_tag() {
	if parser.lexical_attributes.open_tag {
		return
	}
	has_name := parser.lexical_attributes.current_tag.name != ''
	has_content := parser.lexical_attributes.current_tag.content != ''
	if has_name || has_content {
		parser.tags << parser.lexical_attributes.current_tag
	}
	parser.lexical_attributes.current_tag = &Tag{}
}
|
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// split_parse feeds one fragment of HTML to the parser, byte by byte.
// The scanner state lives in `parser.lexical_attributes`, so a document may be
// passed in several consecutive fragments, even when a tag is cut in the middle.
// Finished tags are appended to `parser.tags`; call `finalize` or `get_dom`
// afterwards to flush the last pending tag.
//
// Each byte is handled by the first matching state, in this order:
// inside a code tag, inside a comment, inside a quoted string, inside a tag,
// at a `<`, and finally plain content.
pub fn (mut parser Parser) split_parse(data string) {
	parser.init()
	for chr in data {
		is_quote := chr == `"` || chr == `'`
		// 1 for a double quote, 2 for a single quote, 0 for any other byte;
		// compared against `open_string` to find the matching closing quote
		string_code := match chr {
			`"` { 1 }
			`'` { 2 }
			else { 0 }
		}
		if parser.lexical_attributes.open_code {
			// Content of a code tag: keep every byte, track quoted strings and
			// watch for the closing tag of the code tag that was opened.
			parser.lexical_attributes.lexeme_builder.write_u8(chr)
			if parser.lexical_attributes.open_string > 0
				&& parser.lexical_attributes.open_string == string_code {
				parser.lexical_attributes.open_string = 0
			} else if is_quote {
				parser.lexical_attributes.open_string = string_code
			} else if chr == `>` {
				// only a `>` can complete the closing tag, so only check then
				name_close_tag := '</$parser.lexical_attributes.opened_code_type>'
				if parser.builder_str().to_lower().ends_with(name_close_tag) {
					parser.lexical_attributes.open_code = false
					// drop the closing tag from the buffer so that only the code
					// text remains and becomes content on the next `<`
					parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len)
					parser.lexical_attributes.current_tag.closed = true
					parser.lexical_attributes.current_tag.close_type = .new_tag
				}
			}
		} else if parser.lexical_attributes.open_comment {
			if chr == `>` && parser.verify_end_comment(false) {
				// `-->` reached: discard the comment text and leave the tag
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
				parser.lexical_attributes.open_comment = false
				parser.lexical_attributes.open_tag = false
			} else {
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			}
		} else if parser.lexical_attributes.open_string > 0 {
			if parser.lexical_attributes.open_string == string_code {
				// matching closing quote: the buffer now holds the quoted text
				parser.lexical_attributes.open_string = 0
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
				temp_lexeme := parser.builder_str()
				if parser.lexical_attributes.current_tag.last_attribute != '' {
					// value of the attribute that was followed by `=`; strip the quotes
					lattr := parser.lexical_attributes.current_tag.last_attribute
					nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
					parser.lexical_attributes.current_tag.attributes[lattr] = nval
					parser.lexical_attributes.current_tag.last_attribute = ''
				} else {
					// quoted text without a preceding `name=`: stored as a
					// value-less attribute, quotes included
					parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = ''
				}
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
			} else {
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			}
		} else if parser.lexical_attributes.open_tag {
			if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote {
				// a quote at the start of a lexeme opens a quoted string
				parser.lexical_attributes.open_string = string_code
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			} else if chr == `>` {
				// end of the tag
				complete_lexeme := parser.builder_str().to_lower()
				// a trailing `/` (as in `<br/>`) marks the tag as closed
				parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0
					&& complete_lexeme[complete_lexeme.len - 1] == `/`)
				if complete_lexeme.len > 0 && complete_lexeme[0] == `/` {
					parser.dom.close_tags[complete_lexeme] = true
				}
				if parser.lexical_attributes.current_tag.name == '' {
					parser.lexical_attributes.current_tag.name = complete_lexeme
				} else if complete_lexeme.len > 0 && complete_lexeme != '/' {
					// An empty lexeme (e.g. right after a quoted value) is not an
					// attribute; registering it would add a bogus '' key to the tag.
					parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
				}
				parser.lexical_attributes.open_tag = false
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
				// content of a code tag is skipped until its closing tag
				if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags {
					parser.lexical_attributes.open_code = true
					parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name
				}
			} else if chr !in [u8(9), ` `, `=`, `\n`] {
				// any byte other than tab, space, `=` and newline extends the lexeme
				parser.lexical_attributes.lexeme_builder.write_u8(chr)
			} else if chr != 10 {
				// Tab, space or `=` ends the current lexeme. A newline (10) is
				// dropped without ending it, so a name may continue after a line break.
				complete_lexeme := parser.builder_str().to_lower()
				// Consecutive delimiters produce empty lexemes; skip them so that no
				// '' attribute is created and a pending `name=` keeps waiting for
				// its value (e.g. `href= "x"`).
				if complete_lexeme.len > 0 {
					if parser.lexical_attributes.current_tag.name == '' {
						parser.lexical_attributes.current_tag.name = complete_lexeme
					} else {
						parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
						parser.lexical_attributes.current_tag.last_attribute = ''
						if chr == `=` {
							// remember the name so the following quoted string becomes its value
							parser.lexical_attributes.current_tag.last_attribute = complete_lexeme
						}
					}
				}
				parser.lexical_attributes.lexeme_builder.go_back_to(0)
			}
			if parser.builder_str() == '!--' {
				parser.lexical_attributes.open_comment = true
			}
		} else if chr == `<` {
			// start of a new tag: whatever was buffered is text content
			temp_string := parser.builder_str()
			if parser.lexical_attributes.lexeme_builder.len >= 1 {
				// 47 is `/`: text that follows a closing tag becomes its own
				// 'text' tag, unless it is only spaces and tabs
				if parser.lexical_attributes.current_tag.name.len > 1
					&& parser.lexical_attributes.current_tag.name[0] == 47
					&& !blank_string(temp_string) {
					parser.tags << &Tag{
						name: 'text'
						content: temp_string
					}
				} else {
					parser.lexical_attributes.current_tag.content = temp_string
				}
			}
			parser.lexical_attributes.lexeme_builder.go_back_to(0)
			parser.generate_tag()
			parser.lexical_attributes.open_tag = true
		} else {
			// plain content outside of any tag
			parser.lexical_attributes.lexeme_builder.write_u8(chr)
		}
	}
}
|
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// parse_html parses a complete HTML document given as one string.
// The text is split into lines, each line is passed to `split_parse`, and the
// document model is then constructed from the collected tags.
pub fn (mut parser Parser) parse_html(data string) {
	parser.init()
	for line in data.split_into_lines() {
		parser.lexical_attributes.line_count++
		parser.split_parse(line)
	}
	// flush the last pending tag before building the model
	parser.generate_tag()
	parser.dom.debug_file = parser.debug_file
	parser.dom.construct(parser.tags)
}
|
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// finalize ends the parsing stage by flushing the tag that is still being
// assembled into the tag list.
[inline]
pub fn (mut parser Parser) finalize() {
	parser.generate_tag()
}
|
|
|
|
|
2020-12-09 22:08:15 +03:00
|
|
|
// get_dom returns the parser's document model. If the model has not been
// constructed yet, the pending tag is flushed and the model is built from the
// collected tags first.
pub fn (mut parser Parser) get_dom() DocumentObjectModel {
	if parser.dom.constructed {
		return parser.dom
	}
	parser.generate_tag()
	parser.dom.construct(parser.tags)
	return parser.dom
}
|