aboutsummaryrefslogtreecommitdiff
path: root/src/services/html_converter/html_tree_converter.service.js
diff options
context:
space:
mode:
author Henry Jameson <me@hjkos.com> 2022-01-24 19:12:17 +0200
committer Henry Jameson <me@hjkos.com> 2022-01-24 19:12:17 +0200
commit 9ea0f10abb195799842baf2f4b8711d1744b37bb (patch)
tree 2c64119cc7c6bc989e65189712ce81cbbb40c91a /src/services/html_converter/html_tree_converter.service.js
parent a96a62929d723c3676174dfd71c0db4462599a12 (diff)
parent 182fcca5da9fa284f46f5ca1c8b1790353dec316 (diff)
Merge remote-tracking branch 'origin/develop' into settings-and-filtering
* origin/develop: (169 commits) Improve the user card for deactivated users Update CHANGELOG.md Update CHANGELOG.md Allow canceling a follow request Simple policy reasons for instance specific policies entity_normalizer: Escape name when parsing user Translated using Weblate (Spanish) Translated using Weblate (Catalan) Translated using Weblate (Korean) Translated using Weblate (Japanese (ja_PEDANTIC)) Translated using Weblate (Indonesian) Translated using Weblate (Esperanto) Translated using Weblate (Vietnamese) Translated using Weblate (Italian) Translated using Weblate (Vietnamese) Translated using Weblate (Indonesian) Translated using Weblate (Italian) Translated using Weblate (Vietnamese) Translated using Weblate (Indonesian) Translated using Weblate (Chinese (Simplified)) ...
Diffstat (limited to 'src/services/html_converter/html_tree_converter.service.js')
-rw-r--r-- src/services/html_converter/html_tree_converter.service.js 97
1 files changed, 97 insertions, 0 deletions
diff --git a/src/services/html_converter/html_tree_converter.service.js b/src/services/html_converter/html_tree_converter.service.js
new file mode 100644
index 00000000..6a8796c4
--- /dev/null
+++ b/src/services/html_converter/html_tree_converter.service.js
@@ -0,0 +1,97 @@
+import { getTagName } from './utility.service.js'
+
+/**
+ * This is a not-so-tiny purpose-built HTML parser/processor. It parses an HTML
+ * string and converts it into a tree structure representing tag openers,
+ * children and closers.
+ *
+ * Structure follows this pattern: [opener, [...children], closer], except for
+ * the root node, which is just [...children]. Self-closing/empty elements are
+ * represented as [tag]. Text nodes can only appear within children and are
+ * represented as strings. A closing tag that does not match the innermost open
+ * tag is kept as a plain string child; a tag that is never closed stays as
+ * [opener, [...children]] without a closer.
+ *
+ * Intended use is to convert an HTML structure and then recursively iterate
+ * over it, most likely using a map. Very useful for dynamically rendering HTML,
+ * replacing tags with JSX elements in a render function.
+ *
+ * known issue: doesn't handle CDATA so CDATA might not work well
+ * known issue: doesn't handle HTML comments
+ * known issue: a '>' inside an attribute value ends the tag early
+ *
+ * @param {string} [html=''] - HTML source to parse
+ * @return {Array} root-level children of the parsed tree
+ */
+export const convertHtmlToTree = (html = '') => {
+  // Elements that are implicitly self-closing
+  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
+  const emptyElements = new Set([
+    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
+  ])
+  // TODO For future - also parse HTML5 multi-source components?
+
+  const buffer = [] // Root output buffer (this is the returned tree)
+  // Stack of currently open levels as [opener, children]; the first entry is a
+  // sentinel for the root whose children array is the output buffer itself
+  const levels = [['', buffer]]
+  let textBuffer = '' // Text accumulated since the last tag
+  let tagBuffer = null // Tag being read, if null = we are not currently reading a tag
+
+  const getCurrentBuffer = () => levels[levels.length - 1][1]
+
+  // Moves accumulated text (if any) into the current children list
+  const flushText = () => {
+    if (textBuffer === '') return
+    getCurrentBuffer().push(textBuffer)
+    textBuffer = ''
+  }
+
+  const handleSelfClosing = (tag) => {
+    getCurrentBuffer().push([tag])
+  }
+
+  const handleOpen = (tag) => {
+    const newLevel = [tag, []]
+    getCurrentBuffer().push(newLevel)
+    levels.push(newLevel)
+  }
+
+  const handleClose = (tag) => {
+    const currentLevel = levels[levels.length - 1]
+    // The root sentinel must never be closed/popped: that would leave `levels`
+    // empty and make every later buffer access throw. This could happen if
+    // getTagName yields the same value for the sentinel's '' and for a stray
+    // or malformed closer (e.g. '</>') - NOTE(review): verify against getTagName
+    const isRoot = levels.length === 1
+    if (!isRoot && getTagName(currentLevel[0]) === getTagName(tag)) {
+      currentLevel.push(tag)
+      levels.pop()
+    } else {
+      // Stray or mismatched closer: keep it as a plain string child
+      getCurrentBuffer().push(tag)
+    }
+  }
+
+  for (const char of html) {
+    if (tagBuffer === null) {
+      if (char === '<') {
+        flushText()
+        tagBuffer = char
+      } else {
+        textBuffer += char
+      }
+      continue
+    }
+
+    tagBuffer += char
+    if (char !== '>') continue
+
+    // '>' ends the tag we were reading: classify it
+    const tagFull = tagBuffer
+    tagBuffer = null
+    const tagName = getTagName(tagFull)
+    if (tagFull[1] === '/') {
+      handleClose(tagFull)
+    } else if (emptyElements.has(tagName) || tagFull[tagFull.length - 2] === '/') {
+      handleSelfClosing(tagFull)
+    } else {
+      handleOpen(tagFull)
+    }
+  }
+
+  // Input ended in the middle of a tag: treat the unfinished tag as text
+  if (tagBuffer !== null) {
+    textBuffer += tagBuffer
+  }
+
+  flushText()
+  return buffer
+}