From f6a85d5305649e050632bc9b4d3227b5e575d1ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Victor=20Oliveira=20Couto?= Date: Sat, 8 Aug 2020 23:13:34 -0300 Subject: [PATCH] net.html: create html parser module (#6076) --- vlib/net/html/README.md | 109 ++++++++++++ vlib/net/html/data_structures.v | 88 ++++++++++ vlib/net/html/dom.v | 216 ++++++++++++++++++++++++ vlib/net/html/dom_test.v | 60 +++++++ vlib/net/html/parser.v | 283 ++++++++++++++++++++++++++++++++ vlib/net/html/parser_test.v | 52 ++++++ vlib/net/html/tag.v | 86 ++++++++++ 7 files changed, 894 insertions(+) create mode 100644 vlib/net/html/README.md create mode 100644 vlib/net/html/data_structures.v create mode 100644 vlib/net/html/dom.v create mode 100644 vlib/net/html/dom_test.v create mode 100644 vlib/net/html/parser.v create mode 100644 vlib/net/html/parser_test.v create mode 100644 vlib/net/html/tag.v diff --git a/vlib/net/html/README.md b/vlib/net/html/README.md new file mode 100644 index 0000000000..dcdef90f79 --- /dev/null +++ b/vlib/net/html/README.md @@ -0,0 +1,109 @@ +# V HTML + +An HTML parser written in V + +## Usage + +If the description below isn't enough, see the test files. + +### Parser + +Responsible for reading HTML, either as one full string or as split fragments, and returning all the Tag objects of that HTML, or a DocumentObjectModel that tries to reconstruct the HTML tree. + +#### split_parse(data string) +This is the main function called by the parse method to parse your HTML fragment by fragment. + +#### parse_html(data string, is_file bool) +This function is called with either a filename or a complete HTML data string. + +#### add_code_tag(name string) +This function registers a tag whose content the parser should ignore. 
For example, if you have an html or XML with a custom tag, like `" + mut parser := Parser{} + parser.parse_html(temp_html, false) + assert parser.get_tags()[2].get_content().len == 101 +} + +/*fn test_download_source() { + println('Fetching github data in pastebin') + resp := http.get('https://pastebin.com/raw/5snUQgqN') or { + println('failed to fetch data from the server') + return + } + println('Finalized fetching, start parsing') + mut parser := Parser{} + parser.parse_html(resp.text, false) + assert parser.get_tags().len == 2244 +}*/
diff --git a/vlib/net/html/tag.v b/vlib/net/html/tag.v
new file mode 100644
index 0000000000..6a48628f93
--- /dev/null
+++ b/vlib/net/html/tag.v
@@ -0,0 +1,98 @@
+module html
+
+// CloseTagType selects how `Tag.str()` renders a closed tag:
+// `.in_name` closes inside the opening tag (`/>`), `.new_tag` emits a separate closing tag.
+enum CloseTagType {
+	in_name
+	new_tag
+}
+
+// Tag is a single element of the parsed document, linked to its parent and children.
+[ref_only]
+pub struct Tag {
+mut:
+	name string = ''
+	attributes map[string]string // attributes will be like map[name]value
+	last_attribute string = ''
+	content string = ''
+	children []&Tag
+	parent &Tag = C.NULL
+	position_in_parent int = 0
+	closed bool = false
+	close_type CloseTagType = .in_name
+}
+
+// add_parent sets `t` as the parent of this tag and records `position` as `position_in_parent`.
+fn (mut tag Tag) add_parent(t &Tag, position int) {
+	tag.position_in_parent = position
+	tag.parent = t
+}
+
+// add_child appends `t` to the children of this tag and returns the new number of children.
+fn (mut tag Tag) add_child(t &Tag) int {
+	mut children := tag.children
+	children << t
+	tag.children = children
+	return tag.children.len
+}
+
+// get_children returns the direct children of this tag.
+pub fn (tag Tag) get_children() []Tag_ptr {
+	return tag.children
+}
+
+// get_parent returns the parent of this tag (`C.NULL` unless `add_parent` was called).
+pub fn (tag Tag) get_parent() &Tag {
+	return tag.parent
+}
+
+// get_name returns the name of this tag.
+pub fn (tag Tag) get_name() string {
+	return tag.name
+}
+
+// get_content returns the content stored directly in this tag.
+pub fn (tag Tag) get_content() string {
+	return tag.content
+}
+
+// get_attributes returns the attributes of this tag as a name -> value map.
+pub fn (tag Tag) get_attributes() map[string]string {
+	return tag.attributes
+}
+
+// text returns the content of this tag (with newlines removed) followed by the text of all
+// its children, recursively. A tag whose name starts with `br` yields a single '\n' instead.
+pub fn (tag Tag) text() string {
+	if tag.name.len >= 2 && tag.name[0..2] == 'br' {
+		return '\n'
+	}
+	mut to_return := tag.content.replace('\n', '')
+	for index := 0; index < tag.children.len; index++ {
+		to_return += tag.children[index].text()
+	}
+	return to_return
+}
+
+// str serializes this tag, its attributes, content and children back into markup.
+// Attributes with an empty value are written as a bare name; a tag closed with `.in_name`
+// is written self-closed (`/>`), every other tag gets a closing `</name>` after its children.
+pub fn (tag &Tag) str() string {
+	mut to_return := '<$tag.name'
+	for key in tag.attributes.keys() {
+		to_return += ' $key'
+		value := tag.attributes[key]
+		if value.len > 0 {
+			to_return += '="${value}"'
+		}
+	}
+	to_return += if tag.closed && tag.close_type == .in_name { '/>' } else { '>' }
+	to_return += '$tag.content'
+	for index := 0; index < tag.children.len; index++ {
+		to_return += tag.get_children()[index].str()
+	}
+	if !tag.closed || tag.close_type == .new_tag {
+		to_return += '</$tag.name>'
+	}
+	return to_return
+}