aboutsummaryrefslogtreecommitdiff
path: root/src/services/html_converter/html_line_converter.service.js
diff options
context:
space:
mode:
Diffstat (limited to 'src/services/html_converter/html_line_converter.service.js')
-rw-r--r--src/services/html_converter/html_line_converter.service.js136
1 files changed, 136 insertions, 0 deletions
diff --git a/src/services/html_converter/html_line_converter.service.js b/src/services/html_converter/html_line_converter.service.js
new file mode 100644
index 00000000..5eeaa7cb
--- /dev/null
+++ b/src/services/html_converter/html_line_converter.service.js
@@ -0,0 +1,136 @@
+import { getTagName } from './utility.service.js'
+
+/**
+ * This is a tiny purpose-built HTML parser/processor. It detects every kind of
+ * visual newline and converts the entire HTML string into an array structure.
+ *
+ * Lines with visible text are represented as objects with two properties:
+ * text (the visual line) and level (a snapshot of the open-tag stack, innermost
+ * first). Everything else (tags, '\n', blank text) is kept as a plain string.
+ * Intended usage: .map() every entry to a string, then .join('') back into html.
+ *
+ * Generally this isn't very useful except for when you really need to either
+ * modify visual lines (greentext i.e. simple quoting) or do something with
+ * first/last line.
+ *
+ * known issue: doesn't handle CDATA so nested CDATA might not work well
+ *
+ * @param {string} [html=''] - HTML source to split into visual lines
+ * @return {(string|{ level: Array, text: string })[]} processed html as a list.
+ */
+export const convertHtmlToLines = (html = '') => {
+ // Elements that are implicitly self-closing (they never get a closing tag)
+ // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
+ const emptyElements = new Set([
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+ 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
+ ])
+ // Block-level elements (each one starts/ends a visual line)
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
+ const blockElements = new Set([
+ 'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd',
+ 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main',
+ 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'
+ ])
+ // br is very weird in a way that it's technically not block-level, it's
+ // essentially converted to a \n (or \r\n). There's also wbr but it doesn't
+ // guarantee a linebreak, it only suggests one, so it is not listed here.
+ const linebreakElements = new Set(['br'])
+
+ const visualLineElements = new Set([ // every tag that produces a visual line break
+ ...blockElements.values(),
+ ...linebreakElements.values()
+ ])
+
+ // Visual-line elements that are not empty elements, i.e. without <hr> and <br>
+ const nonEmptyElements = new Set(visualLineElements)
+ // Set difference: visualLineElements minus emptyElements
+ for (let elem of emptyElements) {
+ nonEmptyElements.delete(elem)
+ }
+
+ // All elements that we are recognizing; any other tag is treated as line text
+ const allElements = new Set([
+ ...nonEmptyElements.values(),
+ ...emptyElements.values()
+ ])
+
+ let buffer = [] // Current output buffer (this is the returned list)
+ const level = [] // Stack of currently open recognized tags, innermost at index 0
+ let textBuffer = '' // Current line content
+ let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
+
+ const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer
+ if (textBuffer.trim().length > 0) { // line has something other than whitespace
+ buffer.push({ level: [...level], text: textBuffer }) // copy the stack so later shifts don't alter this entry
+ } else {
+ buffer.push(textBuffer) // blank (possibly empty) line is kept as a plain string
+ }
+ textBuffer = ''
+ }
+
+ const handleBr = (tag) => { // ends the current line and emits tag as-is; used for <br>, '\n' and self-closed tags
+ flush()
+ buffer.push(tag)
+ }
+
+ const handleOpen = (tag) => { // handles opening tags: ends the current line and descends one level
+ flush()
+ buffer.push(tag)
+ level.unshift(getTagName(tag))
+ }
+
+ const handleClose = (tag) => { // handles closing tags
+ if (level[0] === getTagName(tag)) { // presumably getTagName gives the same name for <x> and </x> - TODO confirm
+ flush()
+ buffer.push(tag)
+ level.shift()
+ } else { // Broken case: does not close the innermost open tag, so keep it as line text
+ textBuffer += tag
+ }
+ }
+
+ for (let i = 0; i < html.length; i++) { // single pass, one character at a time
+ const char = html[i]
+ if (char === '<' && tagBuffer === null) { // start of a tag
+ tagBuffer = char
+ } else if (char !== '>' && tagBuffer !== null) { // inside a tag: everything (even '<' and '\n') is collected
+ tagBuffer += char
+ } else if (char === '>' && tagBuffer !== null) { // end of a tag: classify it
+ tagBuffer += char
+ const tagFull = tagBuffer
+ tagBuffer = null
+ const tagName = getTagName(tagFull)
+ if (allElements.has(tagName)) {
+ if (linebreakElements.has(tagName)) {
+ handleBr(tagFull)
+ } else if (nonEmptyElements.has(tagName)) {
+ if (tagFull[1] === '/') { // '</...' is a closing tag
+ handleClose(tagFull)
+ } else if (tagFull[tagFull.length - 2] === '/') {
+ // self-closing ('.../>'): breaks the line but opens no new level
+ handleBr(tagFull)
+ } else {
+ handleOpen(tagFull)
+ }
+ } else { // recognized empty element other than br (e.g. hr, img): stays in the line text
+ textBuffer += tagFull
+ }
+ } else { // tag we do not recognize: stays in the line text
+ textBuffer += tagFull
+ }
+ } else if (char === '\n') { // literal newline outside a tag also ends the line
+ handleBr(char)
+ } else {
+ textBuffer += char
+ }
+ }
+ if (tagBuffer) { // input ended inside an unterminated tag: keep what was read as text
+ textBuffer += tagBuffer
+ }
+
+ flush()
+
+ return buffer
+}