diff options
Diffstat (limited to 'src/services/html_converter/html_tree_converter.service.js')
| -rw-r--r-- | src/services/html_converter/html_tree_converter.service.js | 97 |
1 files changed, 97 insertions, 0 deletions
diff --git a/src/services/html_converter/html_tree_converter.service.js b/src/services/html_converter/html_tree_converter.service.js new file mode 100644 index 00000000..6a8796c4 --- /dev/null +++ b/src/services/html_converter/html_tree_converter.service.js @@ -0,0 +1,97 @@ +import { getTagName } from './utility.service.js' + +/** + * This is a not-so-tiny purpose-built HTML parser/processor. This parses html + * and converts it into a tree structure representing tag openers/closers and + * children. + * + * Structure follows this pattern: [opener, [...children], closer] except root + * node which is just [...children]. Text nodes can only be within children and + * are represented as strings. + * + * Intended use is to convert HTML structure and then recursively iterate over it + * most likely using a map. Very useful for dynamically rendering html replacing + * tags with JSX elements in a render function. + * + * known issue: doesn't handle CDATA so CDATA might not work well + * known issue: doesn't handle HTML comments + * + * @param {Object} input - input data + * @return {string} processed html + */ +export const convertHtmlToTree = (html = '') => { + // Elements that are implicitly self-closing + // https://developer.mozilla.org/en-US/docs/Glossary/empty_element + const emptyElements = new Set([ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr' + ]) + // TODO For future - also parse HTML5 multi-source components? + + const buffer = [] // Current output buffer + const levels = [['', buffer]] // How deep we are in tags and which tags were there + let textBuffer = '' // Current line content + let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag + + const getCurrentBuffer = () => { + return levels[levels.length - 1][1] + } + + const flushText = () => { // Processes current line buffer, adds it to output buffer and clears line buffer + if (textBuffer === '') return + getCurrentBuffer().push(textBuffer) + textBuffer = '' + } + + const handleSelfClosing = (tag) => { + getCurrentBuffer().push([tag]) + } + + const handleOpen = (tag) => { + const curBuf = getCurrentBuffer() + const newLevel = [tag, []] + levels.push(newLevel) + curBuf.push(newLevel) + } + + const handleClose = (tag) => { + const currentTag = levels[levels.length - 1] + if (getTagName(levels[levels.length - 1][0]) === getTagName(tag)) { + currentTag.push(tag) + levels.pop() + } else { + getCurrentBuffer().push(tag) + } + } + + for (let i = 0; i < html.length; i++) { + const char = html[i] + if (char === '<' && tagBuffer === null) { + flushText() + tagBuffer = char + } else if (char !== '>' && tagBuffer !== null) { + tagBuffer += char + } else if (char === '>' && tagBuffer !== null) { + tagBuffer += char + const tagFull = tagBuffer + tagBuffer = null + const tagName = getTagName(tagFull) + if (tagFull[1] === '/') { + handleClose(tagFull) + } else if (emptyElements.has(tagName) || tagFull[tagFull.length - 2] === '/') { + // self-closing + handleSelfClosing(tagFull) + } else { + handleOpen(tagFull) + } + } else { + textBuffer += char + } + } + if (tagBuffer) { + textBuffer += tagBuffer + } + + flushText() + return buffer +} |
