aboutsummaryrefslogtreecommitdiff
path: root/src/services/html_converter/html_line_converter.service.js
blob: 9c3d1f19f841c7d53618d6d92f0793fd8660fd3e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import { getTagName } from './utility.service.js'

/**
 * Minimal, purpose-built HTML scanner that splits markup into "visual lines".
 *
 * The input is walked character by character. Every recognised block-level
 * tag, every <br> and every literal "\n" ends the current line and is emitted
 * as its own plain-string entry. The text collected between those boundaries
 * is emitted as:
 *   - an object `{ level, text }` when it has non-whitespace content, where
 *     `text` is the raw line (inline tags included) and `level` is a snapshot
 *     of the enclosing recognised block tags, innermost first;
 *   - a plain string (possibly empty) when it is blank.
 *
 * Intended usage: run the result through .map() with a function that returns
 * a string for every entry, then .join('') to get HTML back. Useful when
 * visual lines have to be modified individually (greentext, i.e. simple
 * quoting) or when the first/last line needs special treatment.
 *
 * Known issue: CDATA is not handled, so nested CDATA might not work well.
 *
 * @param {string} [html=''] - HTML markup to split
 * @return {(string|{ level: Array, text: string })[]} list of separators
 *   (tags/newlines/blank text) and visual-line objects, in document order.
 */
export const convertHtmlToLines = (html = '') => {
  // Elements that are implicitly self-closing
  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
  const emptyElements = new Set([
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
  ])
  // Block-level elements (each one starts/ends a visual line)
  // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
  const blockElements = new Set([
    'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd',
    'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main',
    'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'
  ])
  // <br> is not block-level but behaves like a newline. <wbr> only suggests
  // a break, so it is deliberately not listed here.
  const linebreakElements = new Set(['br'])

  const visualLineElements = new Set([...blockElements, ...linebreakElements])

  // Line-making elements that can have content, i.e. everything above except
  // the empty ones (<hr>, <br>)
  const nonEmptyElements = new Set(
    [...visualLineElements].filter((name) => !emptyElements.has(name))
  )

  // Every element name this scanner reacts to
  const allElements = new Set([...nonEmptyElements, ...emptyElements])

  const output = [] // Resulting list
  const openTags = [] // Stack of currently open recognised tags, innermost first
  let line = '' // Text of the visual line being collected
  let pendingTag = null // Tag being read; null while outside of a tag

  // Moves the collected line into the output and starts a new one.
  const emitLine = () => {
    const hasContent = line.trim().length > 0
    output.push(hasContent ? { level: [...openTags], text: line } : line)
    line = ''
  }

  // Ends the current line, then emits `token` as a separate entry.
  const emitSeparator = (token) => {
    emitLine()
    output.push(token)
  }

  // Decides what to do with one complete "<...>" tag.
  const processTag = (tag) => {
    const name = getTagName(tag)

    if (!allElements.has(name)) {
      // Unknown/inline tag: part of the line's text
      line += tag
      return
    }
    if (linebreakElements.has(name)) {
      emitSeparator(tag)
      return
    }
    if (!nonEmptyElements.has(name)) {
      // Recognised empty element that doesn't break the line (e.g. <img>)
      line += tag
      return
    }

    if (tag[1] === '/') {
      // Closing tag: only honoured when it matches the innermost open tag,
      // otherwise (broken markup) it is kept as text
      if (openTags[0] !== getTagName(tag)) {
        line += tag
        return
      }
      emitSeparator(tag)
      openTags.shift()
      return
    }

    if (tag[tag.length - 2] === '/') {
      // Self-closing form: breaks the line without opening a level
      emitSeparator(tag)
      return
    }

    // Opening tag
    emitSeparator(tag)
    openTags.unshift(getTagName(tag))
  }

  for (let pos = 0; pos < html.length; pos++) {
    const ch = html[pos]

    if (pendingTag !== null) {
      // Inside a tag: swallow everything up to and including the next '>'
      pendingTag += ch
      if (ch === '>') {
        const completed = pendingTag
        pendingTag = null
        processTag(completed)
      }
      continue
    }

    if (ch === '<') {
      pendingTag = ch
    } else if (ch === '\n') {
      emitSeparator(ch)
    } else {
      line += ch
    }
  }

  // An unterminated tag at the end of input is treated as text
  if (pendingTag) {
    line += pendingTag
  }

  emitLine()

  return output
}