1 files changed, 97 insertions, 0 deletions
diff --git a/src/services/html_converter/html_tree_converter.service.js b/src/services/html_converter/html_tree_converter.service.js
new file mode 100644
index 00000000..6a8796c4
--- /dev/null
+++ b/src/services/html_converter/html_tree_converter.service.js
@@ -0,0 +1,97 @@
+import { getTagName } from './utility.service.js'
+
+/**
+ * Purpose-built, minimal HTML parser: scans the input one character at a time
+ * and converts it into a nested array tree of tag openers, children and
+ * closers.
+ *
+ * An element that was opened is [opener, [...children], closer] (the closer is
+ * absent if the tag was never closed); a self-closing or empty element is
+ * [tag]. The root is just [...children]. Text nodes are plain strings and only
+ * appear within children, as do closing tags that do not match the open tag.
+ *
+ * Intended use: convert the HTML once, then walk the tree recursively (e.g.
+ * with map) to render it, replacing tags with JSX elements where needed.
+ *
+ * Known issues: CDATA sections and HTML comments are not handled, and a '>'
+ * inside an attribute value ends the tag early.
+ *
+ * @param {string} [html=''] - HTML source to parse
+ * @return {Array} root list of children (strings and nested arrays)
+ */
+export const convertHtmlToTree = (html = '') => {
+  // Void elements: they never get a closer, so they are treated as self-closing
+  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
+  const emptyElements = new Set([
+    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
+  ])
+  // TODO For future - also parse HTML5 multi-source components?
+
+  const buffer = [] // Root children list; this is the array that gets returned
+  const levels = [['', buffer]] // Stack of open nodes [opener, children]; the root uses '' as its opener
+  let textBuffer = '' // Text accumulated since the last tag
+  let tagBuffer = null // Tag being read (from '<' onwards); null when not inside a tag
+
+  const getCurrentBuffer = () => { // Children list of the innermost open node
+    return levels[levels.length - 1][1]
+  }
+
+  const flushText = () => { // Appends pending text (if any) to the current children list and resets it
+    if (textBuffer === '') return
+    getCurrentBuffer().push(textBuffer)
+    textBuffer = ''
+  }
+
+  const handleSelfClosing = (tag) => { // Adds a childless node: [tag]
+    getCurrentBuffer().push([tag])
+  }
+
+  const handleOpen = (tag) => { // Attaches a new node to the tree and descends into it
+    const curBuf = getCurrentBuffer()
+    const newLevel = [tag, []] // The same array is shared by the stack and the tree
+    levels.push(newLevel)
+    curBuf.push(newLevel)
+  }
+
+  const handleClose = (tag) => { // Closes the innermost node when the tag names match
+    const currentTag = levels[levels.length - 1] // May be the root entry ['', buffer]
+    if (getTagName(levels[levels.length - 1][0]) === getTagName(tag)) {
+      currentTag.push(tag)
+      levels.pop() // NOTE(review): would pop the root if getTagName('') equals the closer's name - confirm
+    } else {
+      getCurrentBuffer().push(tag) // Unmatched closer is kept as a plain string
+    }
+  }
+
+  for (let i = 0; i < html.length; i++) {
+    const char = html[i]
+    if (char === '<' && tagBuffer === null) { // Tag starts: emit pending text first
+      flushText()
+      tagBuffer = char
+    } else if (char !== '>' && tagBuffer !== null) { // Inside a tag: keep collecting
+      tagBuffer += char
+    } else if (char === '>' && tagBuffer !== null) { // Tag complete: classify it
+      tagBuffer += char
+      const tagFull = tagBuffer
+      tagBuffer = null
+      const tagName = getTagName(tagFull)
+      if (tagFull[1] === '/') { // '</...' is a closing tag
+        handleClose(tagFull)
+      } else if (emptyElements.has(tagName) || tagFull[tagFull.length - 2] === '/') {
+        // void element, or a tag ending in '/>'
+        handleSelfClosing(tagFull)
+      } else {
+        handleOpen(tagFull)
+      }
+    } else { // Plain text outside of any tag
+      textBuffer += char
+    }
+  }
+  if (tagBuffer) { // Input ended mid-tag: treat the unfinished fragment as text
+    textBuffer += tagBuffer
+  }
+
+  flushText()
+  return buffer
+}