-- Copyright Michal Hoftich, 2022
-- HTML parser inspired by https://browser.engineering/html.html
-- but then redone using https://html.spec.whatwg.org/multipage/parsing.html
--
-- The main purpose of this module is to create a useful DOM for later processing
-- using LuaXML functions. Either for cleanup, or for translation to output formats,
-- for example LaTeX.
--
-- It should be possible to serialize DOM back to the original HTML code.
--
-- We attempt to do some basic fixes, like to close paragraphs or list items that
-- aren't closed correctly in the original code. We don't fix tables or
-- formatting elements (see https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements)
-- as these features don't seem necessary for the purpose of this module. We may change
-- this policy in the future, if it turns out that they are necessary.
--
--
local M = {}

-- use local copies of utf8 functions
local ucodepoint = utf8.codepoint
local utfchar = utf8.char

-- Convert a codepoint to an UTF-8 string.
-- Returns an empty string for nil and for negative values (EOF is -1).
local function uchar(codepoint)
  if codepoint and codepoint > -1 then
    return utfchar(codepoint)
  end
  return ""
end

-- declare namespaces
local xmlns = {
  HTML = "http://www.w3.org/1999/xhtml",
  MathML = "http://www.w3.org/1998/Math/MathML",
  SVG = "http://www.w3.org/2000/svg",
  XLink = "http://www.w3.org/1999/xlink",
  XML = "http://www.w3.org/XML/1998/namespace",
  XMLNS = "http://www.w3.org/2000/xmlns/",
}

-- we must make search tree for named entities, as their support
-- is quite messy
local named_entities
if kpse then
  named_entities = require "luaxml-namedentities"
else
  named_entities = require "luaxml.namedentities"
end

-- Root of the entity search tree. Every node may have a `children` table
-- keyed by the next character; nodes that end an entity name also carry
-- `entity` (the name) and `char` (the replacement text).
local entity_tree = {children = {}}

-- Return the child of `tree` for `char`, creating it when it doesn't exist.
local function update_tree(tree, char)
  local children = tree.children or {}
  local current = children[char] or {}
  children[char] = current
  tree.children = children
  return current
end

-- loop over named entities and update tree
for entity, char in pairs(named_entities) do
  local tree = entity_tree
  -- the inner `char` shadows the replacement text only inside this loop
  for char in entity:gmatch(".") do
    tree = update_tree(tree,char)
  end
  tree.entity = entity
  tree.char = char
end

-- Walk the entity tree along the characters in `tbl`.
-- Returns the node that was reached (it has `char` only when the characters
-- form a complete entity name), or nil when the path doesn't exist.
local function search_entity_tree(tbl)
  -- get named entity for the list of characters
  local tree = entity_tree
  for _,char in ipairs(tbl) do
    if tree.children then
      tree = tree.children[char]
      if not tree then return nil end
    else
      return nil
    end
  end
  -- print("tree", tree.char)
  return tree
end

-- declare basic node types
local Root = {
  _type = "root",
  xmlns = xmlns.HTML
}

-- Create the document root node.
function Root:init()
  local o = {}
  setmetatable(o, self)
  self.__index = self
  self.__tostring = function (x) return "_ROOT" end
  o.children = {}
  return o
end

-- Append `node` to the list of children; shared by all node types.
function Root:add_child(node)
  table.insert(self.children, node)
end

local Doctype = {
  _type = "doctype"
}

-- Create a doctype node with the given name.
function Doctype:init(name, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  -- NOTE(review): both branches return an empty string here. This looks like
  -- markup that was lost from the string literals - compare with the
  -- upstream file before relying on tostring() of doctype nodes.
  self.__tostring = function (x)
    if x.data then
      return ""
    else
      return ""
    end
  end
  self.add_child = Root.add_child
  o.parent = parent
  o.name = name
  o.children = {}
  return o
end

-- Store additional doctype data on the receiver.
function Doctype:add_data(data)
  self.data = data
end

local Text = {
  _type = "text"
}

-- Create a text node.
function Text:init(text, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  o.text = text
  self.__tostring = function (x) return "'" .. x.text .. "'" end
  self.add_child = Root.add_child
  o.parent = parent
  o.children = {}
  return o
end

local Comment = {
  _type = "comment"
}

-- Create a comment node.
function Comment:init(text, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  o.text = text
  -- NOTE(review): returns an empty string, the comment markup seems to be
  -- lost from this literal - compare with the upstream file.
  self.__tostring = function (x) return "" end
  self.add_child = Root.add_child
  o.parent = parent
  o.children = {}
  return o
end

local Element = {
  _type = "element"
}

-- Create an element node. `tag` is a string or a table of characters.
function Element:init(tag, parent)
  local o = {}
  setmetatable(o, self)
  self.__index = self
  -- tag can be table with unicode characters
  if type(tag) == "table" then
    o.tag = table.concat(tag)
  else
    o.tag = tag
  end
  -- serialize the start tag including attributes
  self.__tostring = function(x)
    local attr = {}
    for _, el in ipairs(x.attr) do
      -- handle attributes
      -- values that contain a double quote are wrapped in apostrophes
      local value
      if el.value:match('"') then
        value = "'" .. el.value .. "'"
      else
        value = '"' .. el.value .. '"'
      end
      attr[#attr+1] = el.name .. "=" .. value
    end
    local closing = ">"
    if x.self_closing then
      closing = " />"
    end
    if #attr > 0 then
      return "<" .. x.tag .. " " .. table.concat(attr, " ") .. closing
    else
      return "<" .. x.tag .. closing
    end
  end
  self.add_child = Root.add_child
  o.children = {}
  o.attr = {}
  o.parent = parent
  -- default xmlns
  o.xmlns = xmlns.HTML
  return o
end

-- state machine functions
-- each function takes HtmlParser as an argument
local HtmlStates = {}

-- declare codepoints for more efficient processing
local less_than = ucodepoint("<")
local greater_than = ucodepoint(">")
local amperesand = ucodepoint("&")
local exclam = ucodepoint("!")
local question = ucodepoint("?")
local solidus = ucodepoint("/")
local equals = ucodepoint("=")
local quoting = ucodepoint('"')
local apostrophe = ucodepoint("'")
local semicolon = ucodepoint(";")
local hyphen = ucodepoint("-")
local dash = ucodepoint("-")
local numbersign = ucodepoint("#")
local smallx = ucodepoint("x")
local bigx = ucodepoint("X")
local right_square = ucodepoint("]")
local EOF = -1 -- special character, meaning end of stream
local null = 0

-- The following predicates return true on match; the ones without an
-- explicit `return false` return nil otherwise.

-- A-Z
local function is_upper_alpha(codepoint)
  if (64 < codepoint and codepoint < 91) then
    return true
  end
end

-- a-z
local function is_lower_alpha(codepoint)
  if (96 < codepoint and codepoint < 123) then
    return true
  end
end

local function is_alpha(codepoint)
  -- detect if codepoint is an ASCII letter
  if is_upper_alpha(codepoint) or is_lower_alpha(codepoint) then
    return true
  end
  return false
end

-- 0-9
local function is_numeric(codepoint)
  if 47 < codepoint and codepoint < 58 then
    return true
  end
end

-- A-F
local function is_upper_hex(codepoint)
  if 64 < codepoint and codepoint < 71 then
    return true
  end
end

-- a-f
local function is_lower_hex(codepoint)
  if 96 < codepoint and codepoint < 103 then
    return true
  end
end

local function is_hexadecimal(codepoint)
  if is_numeric(codepoint) or is_lower_hex(codepoint) or is_upper_hex(codepoint) then
    return true
  end
end

local function is_alphanumeric(codepoint)
  return is_alpha(codepoint) or is_numeric(codepoint)
end

local function is_space(codepoint)
  -- detect space characters
  -- (tab, line feed, form feed and space)
  if codepoint==0x0009 or codepoint==0x000A or codepoint==0x000C or codepoint==0x0020 then
    return true
  end
  return false
end

local function is_surrogate(codepoint)
  return 0xD800 <= codepoint and codepoint <= 0xDFFF
end

-- Replacements for numeric character references in the 0x80-0x9F range,
-- used by numeric_reference_end_state. Declared local, it is only used in
-- this file and must not leak into the global environment.
local character_entity_replace_table = {
  [0x80] = 0x20AC,
  [0x82] = 0x201A,
  [0x83] = 0x0192,
  [0x84] = 0x201E,
  [0x85] = 0x2026,
  [0x86] = 0x2020,
  [0x87] = 0x2021,
  [0x88] = 0x02C6,
  [0x89] = 0x2030,
  [0x8A] = 0x0160,
  [0x8B] = 0x2039,
  [0x8C] = 0x0152,
  [0x8E] = 0x017D,
  [0x91] = 0x2018,
  [0x92] = 0x2019,
  [0x93] = 0x201C,
  [0x94] = 0x201D,
  [0x95] = 0x2022,
  [0x96] = 0x2013,
  [0x97] = 0x2014,
  [0x98] = 0x02DC,
  [0x99] = 0x2122,
  [0x9A] = 0x0161,
  [0x9B] = 0x203A,
  [0x9C] = 0x0153,
  [0x9E] = 0x017E,
  [0x9F] = 0x0178
}

-- Replace the NULL codepoint with U+FFFD, pass anything else through.
local function fix_null(codepoint)
  if codepoint == null then
    return 0xFFFD
  else
    return codepoint
  end
end

HtmlStates.data = function(parser)
  -- this is the default state
  local codepoint = parser.codepoint
  -- print("codepoint", parser.codepoint)
  if codepoint == less_than then
    -- start of tag
    return "tag_open"
  elseif codepoint == amperesand then
    -- we must save the current state
    -- what we will return to after entity
    parser.return_state = "data"
    return "character_reference"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
  end
  return "data"
end

HtmlStates.tag_open = function(parser)
  -- parse tag contents
  local codepoint = parser.codepoint
  if codepoint == exclam then
    return "markup_declaration_open"
  elseif codepoint == solidus then
    return "end_tag_open"
  elseif codepoint == question then
    parser:start_token("comment",{data={}})
    return "bogus_comment"
  elseif is_alpha(codepoint) then
    local data = {
      name = {},
      attr = {},
      current_attr_name = {},
      current_attr_value = {},
      self_closing = false
    }
    parser:start_token("start_tag", data)
    return parser:tokenize("tag_name")
  elseif codepoint == EOF then
    -- eof-before-tag-name: the "<" that brought us here is plain text
    -- (this used to emit ">", which isn't in the input at all)
    parser:emit_character("<")
    parser:emit_eof()
  else
    -- invalid tag
    -- emit "<" and reconsume current character as data
    parser:emit_character("<")
    return parser:tokenize("data")
  end
end

HtmlStates.character_reference = function(parser)
  -- parse HTML entities
  -- initialize temp buffer
  parser.temp_buffer = {"&"}
  local codepoint = parser.codepoint
  if is_alphanumeric(codepoint) then
    return parser:tokenize("named_character_reference")
  elseif codepoint == numbersign then
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "numeric_character_reference"
  else
    -- not a reference, the ampersand is text
    parser:flush_temp_buffer()
    return parser:tokenize(parser.return_state)
  end
end

HtmlStates.named_character_reference = function(parser)
  -- named entity parsing is pretty complicated
  -- https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
  local codepoint = parser.codepoint
  -- test if the current entity name is included in the named entity list
  local search_table = {}
  -- first char in temp buffer is &, which we don't want to lookup in the search tree
  for i=2, #parser.temp_buffer do
    search_table[#search_table+1] = parser.temp_buffer[i]
  end
  if codepoint == semicolon then
    -- close named entity
    local entity = search_entity_tree(search_table)
    if entity and entity.char then
      parser:add_entity(entity.char)
    else
      -- if the current name doesn't correspond to any named entity, flush everything into text
      parser:flush_temp_buffer()
      return parser:tokenize(parser.return_state)
    end
    return parser.return_state
  else
    local char = uchar(codepoint)
    -- try if the current entity name is in the named entity search tree
    table.insert(search_table, char)
    local entity = search_entity_tree(search_table)
    if entity then
      -- keep parsing name entity while we match a name
      table.insert(parser.temp_buffer, char)
      return "named_character_reference"
    else
      -- here this will be more complicated
      if #search_table > 1 then
        local token = parser.current_token
        if token.type == "start_tag" and (codepoint == equals or is_alphanumeric(codepoint)) then
          -- in attribute value, flush characters and retokenize
          parser:flush_temp_buffer()
          return parser:tokenize(parser.return_state)
        else
          -- try to get entity for characters preceding the current character
          table.remove(search_table)
          local newentity = search_entity_tree(search_table)
          if newentity and newentity.char then
            parser:add_entity(newentity.char)
          else
            -- we need to find if parts of the current substring match a named entity
            -- for example &notit; -> ¬it; but &notin; -> ∉
            local rest = {}
            -- loop over the table with characters, and try to find if it matches entity
            for i = #search_table, 1,-1 do
              local removed_char = table.remove(search_table)
              -- keep the removed characters in their original order, they
              -- follow the matched entity in the output
              table.insert(rest, 1, removed_char)
              newentity = search_entity_tree(search_table)
              if newentity and newentity.char then
                parser:add_entity(newentity.char)
                parser.temp_buffer = rest
                break
              end
            end
            -- replace temporary buffer with characters that followed the matched entity
            parser:flush_temp_buffer()
          end
          return parser:tokenize(parser.return_state)
        end
      else
        -- search table contains only the current character
        parser:flush_temp_buffer()
        return parser:tokenize(parser.return_state)
      end
    end
  end
end

HtmlStates.numeric_character_reference = function(parser)
  -- this variable will hold the number
  local codepoint = parser.codepoint
  parser.character_reference_code = 0
  if codepoint == smallx or codepoint == bigx then
    -- hexadecimal entity
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "hexadecimal_character_reference_start"
  else
    -- try decimal entity
    return parser:tokenize("decimal_character_reference_start")
  end
end

-- At least one hexadecimal digit must follow "&#x", otherwise it is text.
HtmlStates.hexadecimal_character_reference_start = function(parser)
  local codepoint = parser.codepoint
  if is_hexadecimal(codepoint) then
    return parser:tokenize("hexadecimal_character_reference")
  else
    parser:flush_temp_buffer()
    return parser:tokenize(parser.return_state)
  end
end

-- At least one decimal digit must follow "&#", otherwise it is text.
HtmlStates.decimal_character_reference_start = function(parser)
  local codepoint = parser.codepoint
  if is_numeric(codepoint) then
    return parser:tokenize("decimal_character_reference")
  else
    parser:flush_temp_buffer()
    return parser:tokenize(parser.return_state)
  end
end

HtmlStates.decimal_character_reference = function(parser)
  local codepoint = parser.codepoint
  -- helper functions for easier working with the character_reference_code
  local function multiply(number)
    parser.character_reference_code = parser.character_reference_code * number
  end
  local function add(number)
    local code = parser.character_reference_code + number
    -- everything above 0x10FFFF ends as U+FFFD in numeric_reference_end_state,
    -- so saturate here - a very long run of digits would otherwise overflow
    -- the integer and could wrap around to a valid or negative value
    if code > 0x10FFFF then code = 0x110000 end
    parser.character_reference_code = code
  end
  if is_numeric(codepoint) then
    multiply(10)
    add(codepoint - 0x30)
  elseif codepoint == semicolon then
    return "numeric_reference_end_state"
  else
    -- this adds current entity
    parser:tokenize("numeric_reference_end_state")
    -- now tokenize the current character
    return parser:tokenize(parser.return_state)
  end
  return "decimal_character_reference"
end

HtmlStates.hexadecimal_character_reference = function(parser)
  local codepoint = parser.codepoint
  -- helper functions for easier working with the character_reference_code
  local function multiply(number)
    parser.character_reference_code = parser.character_reference_code * number
  end
  local function add(number)
    local code = parser.character_reference_code + number
    -- saturate above the Unicode range, see decimal_character_reference
    if code > 0x10FFFF then code = 0x110000 end
    parser.character_reference_code = code
  end
  if is_numeric(codepoint) then
    multiply(16)
    add(codepoint - 0x30)
  elseif is_upper_hex(codepoint) then
    -- "A" is 0x41, so subtracting 0x37 gives 10
    multiply(16)
    add(codepoint - 0x37)
  elseif is_lower_hex(codepoint) then
    -- "a" is 0x61, so subtracting 0x57 gives 10
    multiply(16)
    add(codepoint - 0x57)
  elseif codepoint == semicolon then
    return "numeric_reference_end_state"
  else
    -- this adds current entity
    parser:tokenize("numeric_reference_end_state")
    -- now tokenize the current character
    return parser:tokenize(parser.return_state)
  end
  return "hexadecimal_character_reference"
end

HtmlStates.numeric_reference_end_state = function(parser)
  -- in this state, we don't need to
  local character = parser.character_reference_code
  -- we need to clean invalid character codes
  if character == 0x00 or character > 0x10FFFF or is_surrogate(character) then
    character = 0xFFFD
  -- should we add special support for "noncharacter"? I think we can pass them to the output anyway
  elseif character_entity_replace_table[character] then
    character = character_entity_replace_table[character]
  end
  parser:add_entity(uchar(character))
  return parser.return_state
end

HtmlStates.markup_declaration_open = function(parser)
  -- NOTE(review): this part of the file is damaged. The text between
  -- "started by" and "is consumed as token.data" is missing, and with it
  -- apparently the rest of this state, the states that followed it and the
  -- definition of doctype_eof() that is called below. The remaining tokens
  -- are kept exactly as they were found, including the unbalanced "end" -
  -- restore this region from the upstream file.
  -- started by is consumed as token.data
    return "consume_doctype_data"
  end
end

HtmlStates.consume_doctype_data = function(parser)
  -- this state just reads everything inside doctype as data
  local codepoint = parser.codepoint
  if codepoint == greater_than then
    parser:emit()
    return "data"
  elseif codepoint == EOF then
    doctype_eof(parser)
  else
    parser:append_token_data("data", uchar(codepoint))
    return "consume_doctype_data"
  end
end

-- Collect the tag name; upper case ASCII letters are lowercased.
HtmlStates.tag_name = function(parser)
  local codepoint = parser.codepoint
  codepoint = fix_null(codepoint)
  if is_space(codepoint) then
    return "before_attribute_name"
  elseif codepoint == solidus then
    return "self_closing_tag"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  elseif is_upper_alpha(codepoint) then
    local lower = string.lower(uchar(codepoint))
    parser:append_token_data("name", lower)
  elseif codepoint==EOF then
    parser:emit()
    parser:emit_eof()
  else
    local char = uchar(codepoint)
    parser:append_token_data("name", char)
  end
  return "tag_name"
end

-- We are after "/" in a tag; only "/>" marks the token as self closing.
HtmlStates.self_closing_tag = function(parser)
  local codepoint = parser.codepoint
  if codepoint == greater_than then
    parser.current_token.self_closing = true
    parser:emit()
    return "data"
  else
    return parser:tokenize("before_attribute_name")
  end
end

HtmlStates.before_attribute_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    -- ignore spacing
    return "before_attribute_name"
  elseif codepoint == solidus or codepoint == greater_than then
    -- reconsume in after_attribute_name
    return parser:tokenize("after_attribute_name")
  elseif codepoint == equals then
    -- ToDo: handle https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-equals-sign-before-attribute-name
    -- (nothing is returned in this branch)
  else
    -- start new attribute
    parser:start_attribute()
    return parser:tokenize("attribute_name")
  end
end

HtmlStates.attribute_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) or codepoint == solidus or codepoint == greater_than then
    return parser:tokenize("after_attribute_name")
  elseif codepoint == equals then
    return "before_attribute_value"
  elseif is_upper_alpha(codepoint) then
    -- lowercase attribute names
    local lower = string.lower(uchar(codepoint))
    parser:append_token_data("current_attr_name", lower)
    return "attribute_name"
  else
    parser:append_token_data("current_attr_name", uchar(codepoint))
    return "attribute_name"
  end
end

HtmlStates.after_attribute_name = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "after_attribute_name"
  elseif codepoint == equals then
    return "before_attribute_value"
  elseif codepoint == solidus then
    return "self_closing_tag"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    -- the previous attribute had no value and a new one starts here
    parser:start_attribute()
    return parser:tokenize("attribute_name")
  end
end

-- Decide how the attribute value is delimited.
HtmlStates.before_attribute_value = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_value"
  elseif codepoint == quoting then
    return "attribute_value_quoting"
  elseif codepoint == apostrophe then
    return "attribute_value_apostrophe"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    return parser:tokenize("attribute_value_unquoted")
  end
end

-- Attribute value in double quotes.
HtmlStates.attribute_value_quoting = function(parser)
  local codepoint = parser.codepoint
  if codepoint == quoting then
    return "after_attribute_value_quoting"
  elseif codepoint == amperesand then
    parser.return_state = "attribute_value_quoting"
    return "character_reference"
  else
    parser:append_token_data("current_attr_value", uchar(codepoint))
    return "attribute_value_quoting"
  end
end
-- Attribute value in apostrophes.
HtmlStates.attribute_value_apostrophe = function(parser)
  local codepoint = parser.codepoint
  if codepoint == apostrophe then
    return "after_attribute_value_quoting"
  elseif codepoint == amperesand then
    parser.return_state = "attribute_value_apostrophe"
    return "character_reference"
  else
    parser:append_token_data("current_attr_value", uchar(codepoint))
    return "attribute_value_apostrophe"
  end
end

-- Attribute value without quotes, it ends with a space or ">".
HtmlStates.attribute_value_unquoted = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_name"
  elseif codepoint == amperesand then
    parser.return_state = "attribute_value_unquoted"
    return "character_reference"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    parser:append_token_data("current_attr_value", uchar(codepoint))
    return "attribute_value_unquoted"
  end
end

-- Used after both quoted forms of the attribute value.
HtmlStates.after_attribute_value_quoting = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) then
    return "before_attribute_name"
  elseif codepoint == solidus then
    return "self_closing_tag"
  elseif codepoint == greater_than then
    parser:emit()
    return "data"
  else
    return parser:tokenize("before_attribute_name")
  end
end

HtmlStates.rcdata = function(parser)
  -- like the data state, but "<" is handled by rcdata_less_than and
  -- NULL is replaced by U+FFFD
  local codepoint = parser.codepoint
  -- print("codepoint", parser.codepoint)
  codepoint = fix_null(codepoint)
  if codepoint == less_than then
    -- start of tag
    return "rcdata_less_than"
  elseif codepoint == amperesand then
    -- we must save the current state
    -- what we will return to after entity
    parser.return_state = "rcdata"
    return "character_reference"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
  end
  return "rcdata"
end

-- The characters that looked like an end tag are just text: discard the
-- token and emit `text` instead.
local function discard_rcdata_end_tag(parser, text)
  parser:discard_token()
  parser:emit_character(text)
end

HtmlStates.rcdata_less_than = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    return "rcdata_end_tag_open"
  else
    discard_rcdata_end_tag(parser, "<")
    return parser:tokenize("rcdata")
  end
end

HtmlStates.rcdata_end_tag_open = function(parser)
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    parser:start_token("end_tag", {name={}})
    parser.temp_buffer = {}
    return parser:tokenize("rcdata_end_tag_name")
  else
    -- NOTE(review): this part of the file is damaged. The text inside the
    -- string literal below is missing, and with it apparently the end of
    -- this state and the states that followed it; what continues after the
    -- literal is the tail of some other state. The tokens are kept exactly
    -- as they were found (the "else ... elseif" chain isn't valid Lua) -
    -- restore this region from the upstream file.
    discard_rcdata_end_tag(parser, "")
    return "script_data"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
    return "script_data_escaped"
  end
end

HtmlStates.script_data_escaped_less_than_sign = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    parser.temp_buffer = {}
    return "script_data_escaped_end_tag_open"
  elseif is_alpha(codepoint) then
    parser.temp_buffer = {}
    parser:emit_character("<")
    return parser:tokenize("script_data_double_escape_start")
  else
    parser:emit_character("<")
    return parser:tokenize("script_data_escaped")
  end
end

HtmlStates.script_data_escaped_end_tag_open = function(parser)
  local codepoint = parser.codepoint
  if is_alpha(codepoint) then
    parser:start_token("end_tag", {name={}})
    return parser:tokenize("script_data_escaped_end_tag_name")
  else
    -- NOTE(review): the file is damaged here in the same way as in
    -- rcdata_end_tag_open - the string literal lost its content and the code
    -- that follows it is the tail of some other state. Tokens are kept
    -- exactly as they were found; restore this region from the upstream file.
    parser:emit_character("")
    return "script_data"
  elseif codepoint == EOF then
    parser:emit_eof()
  else
    parser:emit_character(uchar(codepoint))
    return "script_data_double_escaped"
  end
end

HtmlStates.script_data_double_escaped_less_than_sign = function(parser)
  local codepoint = parser.codepoint
  if codepoint == solidus then
    -- script_data_double_escape_end collects the tag name that follows "</"
    -- in the temp buffer, so it must start empty
    parser.temp_buffer = {}
    -- the solidus is a character token. This used to call parser:emit("/");
    -- everywhere else in this file parser:emit() is called without
    -- arguments after a tag token is finished, so that call was presumably
    -- not emitting the character.
    parser:emit_character("/")
    return "script_data_double_escape_end"
  else
    return parser:tokenize("script_data_double_escaped")
  end
end

-- We are after "</" inside of double escaped script data. Letters are
-- collected (lowercased) in the temp buffer; on a space, "/" or ">" the
-- collected name decides where to continue: "script" ends the double
-- escaping, any other name doesn't.
-- https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
HtmlStates.script_data_double_escape_end = function(parser)
  local codepoint = parser.codepoint
  if is_space(codepoint) or codepoint == solidus or codepoint == greater_than then
    local current_tag = table.concat(parser.temp_buffer)
    parser:emit_character(uchar(codepoint))
    if current_tag == "script" then
      return "script_data_escaped"
    else
      return "script_data_double_escaped"
    end
  elseif is_upper_alpha(codepoint) then
    parser:emit_character(uchar(codepoint))
    -- adding 0x20 to the codepoint gives the lower case letter
    table.insert(parser.temp_buffer, uchar(codepoint + 0x20))
    return "script_data_double_escape_end"
  elseif is_lower_alpha(codepoint) then
    parser:emit_character(uchar(codepoint))
    table.insert(parser.temp_buffer, uchar(codepoint))
    return "script_data_double_escape_end"
  else
    return parser:tokenize("script_data_double_escaped")
  end
end

-- formatting elements needs special treatment
local formatting_element_names ={
  a = true, b = true, big = true, code = true, em = true, font = true, i = true,
  nobr = true, s = true, small = true, strike = true, strong = true, tt = true, u = true
}

local function is_formatting_element(name)
  return formatting_element_names[name]
end

-- elements from the "special" category, as a set keyed by tag name
local special_elements = {}

local special_elements_list = {"address", "applet", "area", "article", "aside",
"base", "basefont", "bgsound", "blockquote", "body", "br", "button", "caption",
"center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt",
"embed", "fieldset", "figcaption", "figure", "footer", "form", "frame",
"frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup",
"hr", "html", "iframe", "img", "input", "keygen", "li", "link", "listing",
"main", "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript",
"object", "ol", "p", "param", "plaintext", "pre", "script", "section",
"select", "source", "style", "summary", "table", "tbody", "td", "template",
"textarea", "tfoot", "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp",
"mi","mo","mn","ms","mtext", "annotation-xml","foreignObject","desc", "title"
}

for k,v in ipairs(special_elements_list) do
  special_elements[v] = true
end

local function is_special(name)
  return special_elements[name]
end

-- these lists are used in HtmlParser:generate_implied_endtags()
-- ("rt" and "rtc" were misspelled as "rd" and "trc", so the ruby elements
-- were never matched)
local implied_endtags = {dd=true, dt=true, li = true, optgroup = true, option = true, p = true, rb = true, rp = true, rt = true, rtc = true}
local implied_endtags_thoroughly = {dd=true, dt=true, li = true, optgroup = true, option = true, p = true,
  rb = true, rp = true, rt = true, rtc = true, caption = true, colgroup = true, tbody = true, td = true,
  tfoot = true, th = true, thead = true, tr = true
}

-- find if unfinished tags list contain a tag
-- it fails if any element from element_list is matched before that tag
-- (the list is searched from the most recently opened element)
local function is_in_scope(parser, target, element_list)
  for i = #parser.unfinished, 1, -1 do
    local node = parser.unfinished[i]
    local tag = node.tag
    if tag == target then
      return true
    elseif element_list[tag] then
      return false
    end
  end
  return false
end

local particular_scope_elements = {
  applet = true, caption = true, html = true, table = true, td = true,
  th = true, marquee = true, object = true, template = true, mi = true,
  mo = true, mn = true, ms = true, mtext = true, ["annotation-xml"] = true,
  foreignObject = true, desc = true, title = true,
}

local function is_in_particular_scope(parser, target)
  return is_in_scope(parser, target, particular_scope_elements)
end

-- derived scope lists
--
-- list_item scope
local list_item_scope_elements = {ol = true, ul = true}
for k,v in pairs(particular_scope_elements) do list_item_scope_elements[k] = v end

local function is_in_list_item_scope(parser, target)
  return is_in_scope(parser, target, list_item_scope_elements)
end

-- button scope
local button_scope_elements = {button = true}
for k,v in pairs(particular_scope_elements) do button_scope_elements[k] = v end

local function is_in_button_scope(parser, target)
  return is_in_scope(parser, target, button_scope_elements)
end

-- table scope
local table_scope_elements = {html = true, table = true, template = true}

local function is_in_table_scope(parser, target)
  return is_in_scope(parser, target, table_scope_elements)
end

-- select scope
local function is_in_select_scope(parser, target)
  -- this scope is specific, because it supports all tags except two
  for i = #parser.unfinished, 1, -1 do
    local node = parser.unfinished[i]
    local tag = node.tag
    if tag == target then
      return true
    elseif tag == "optgroup" or tag == "option" then
      -- only these two tags are supported
    else
      return false
    end
  end
  return false
end

-- List of active formatting elements
-- https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
-- we don't implement it yet, but maybe in the future.

local HtmlTreeStates = {}

local HtmlParser = {}

-- Create a parser object for the HTML string `body`.
function HtmlParser:init(body)
  local o ={}
  setmetatable(o, self)
  self.__index = self
  o.body = self:normalize_newlines(body) -- HTML string
  o.position = 0 -- position in the parsed string
  o.unfinished = {} -- insert Root node into the list of opened elements
  o.Document = Root:init()
  o.default_state = "data" -- default state machine state
  o.state = o.default_state -- working state of the machine
  o.return_state = o.default_state -- special state set by entities parsing
  o.temp_buffer = {} -- keep temporary data
  o.current_token = {type="start"} -- currently processed token
  o.insertion_mode = "initial" -- tree construction state
  o.head_pointer = nil -- pointer to the Head element
  o.form_pointer = nil
  o.active_formatting = {} -- list of active formatting elements
  o.scripting_flag = false -- we will not support scripting
  return o
end

-- Convert CRLF and lone CR line endings to LF.
-- Returns only the normalized string.
function HtmlParser:normalize_newlines(body)
  -- we must normalize newlines
  -- gsub returns also the number of replacements; the parentheses keep it
  -- from leaking to the caller as a second return value
  return (body:gsub("\r\n", "\n"):gsub("\r", "\n"))
end

-- declare void elements
local self_closing_tags_list = {"area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr"}

local self_closing_tags = {}
for _,v in ipairs(self_closing_tags_list) do self_closing_tags[v] = true end

-- Run the tokenizer over every codepoint of the body and return the result
-- of self:finish().
function HtmlParser:parse()
  -- we assume utf8 input, you must convert it yourself if the source is
  -- in a different encoding
  self.text = {}
  self.state = self.default_state
  -- this should enable us to pass over some characters that we want to ignore
  -- for example scripts, css, etc.
  self.ignored_pos = -1
  for pos, ucode in utf8.codes(self.body) do
    -- save buffer info and require the tokenize function
    if pos > self.ignored_pos then
      self.position = pos
      self.codepoint = ucode
      self.character = uchar(ucode)
      self.state = self:tokenize(self.state) or self.state -- if tokenizer don't return new state, assume that it continues in the current state
    end
  end
  return self:finish()
end

function HtmlParser:tokenize(state)
  local state = state or self.state
  local ucode = self.codepoint
  local text = self.text
  self.last_position = self.position
  self.element_state = false
  -- execute state machine object and return new state
  local fn = HtmlStates[state] or function(parser) return self.default_state end
  local newstate = fn(self)
  -- this should enable changing state from elements that needs special treatment, like