1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
import { getTagName } from './utility.service.js'
/**
* This is a tiny purpose-built HTML parser/processor. This basically detects
* any type of visual newline and converts entire HTML into a array structure.
*
* Text nodes are represented as object with single property - text - containing
* the visual line. Intended usage is to process the array with .map() in which
* map function returns a string and resulting array can be converted back to html
* with a .join('').
*
* Generally this isn't very useful except for when you really need to either
* modify visual lines (greentext i.e. simple quoting) or do something with
* first/last line.
*
* known issue: doesn't handle CDATA so nested CDATA might not work well
*
* @param {Object} input - input data
* @return {(string|{ text: string })[]} processed html in form of a list.
*/
export const convertHtmlToLines = (html = '') => {
// Elements that are implicitly self-closing
// https://developer.mozilla.org/en-US/docs/Glossary/empty_element
const emptyElements = new Set([
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
])
// Block-level element (they make a visual line)
// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
const blockElements = new Set([
'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd',
'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main',
'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'
])
// br is very weird in a way that it's technically not block-level, it's
// essentially converted to a \n (or \r\n). There's also wbr but it doesn't
// guarantee linebreak, only suggest it.
const linebreakElements = new Set(['br'])
const visualLineElements = new Set([
...blockElements.values(),
...linebreakElements.values()
])
// All block-level elements that aren't empty elements, i.e. not <hr>
const nonEmptyElements = new Set(visualLineElements)
// Difference
for (const elem of emptyElements) {
nonEmptyElements.delete(elem)
}
// All elements that we are recognizing
const allElements = new Set([
...nonEmptyElements.values(),
...emptyElements.values()
])
const buffer = [] // Current output buffer
const level = [] // How deep we are in tags and which tags were there
let textBuffer = '' // Current line content
let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer
if (textBuffer.trim().length > 0) {
buffer.push({ level: [...level], text: textBuffer })
} else {
buffer.push(textBuffer)
}
textBuffer = ''
}
const handleBr = (tag) => { // handles single newlines/linebreaks/selfclosing
flush()
buffer.push(tag)
}
const handleOpen = (tag) => { // handles opening tags
flush()
buffer.push(tag)
level.unshift(getTagName(tag))
}
const handleClose = (tag) => { // handles closing tags
if (level[0] === getTagName(tag)) {
flush()
buffer.push(tag)
level.shift()
} else { // Broken case
textBuffer += tag
}
}
for (let i = 0; i < html.length; i++) {
const char = html[i]
if (char === '<' && tagBuffer === null) {
tagBuffer = char
} else if (char !== '>' && tagBuffer !== null) {
tagBuffer += char
} else if (char === '>' && tagBuffer !== null) {
tagBuffer += char
const tagFull = tagBuffer
tagBuffer = null
const tagName = getTagName(tagFull)
if (allElements.has(tagName)) {
if (linebreakElements.has(tagName)) {
handleBr(tagFull)
} else if (nonEmptyElements.has(tagName)) {
if (tagFull[1] === '/') {
handleClose(tagFull)
} else if (tagFull[tagFull.length - 2] === '/') {
// self-closing
handleBr(tagFull)
} else {
handleOpen(tagFull)
}
} else {
textBuffer += tagFull
}
} else {
textBuffer += tagFull
}
} else if (char === '\n') {
handleBr(char)
} else {
textBuffer += char
}
}
if (tagBuffer) {
textBuffer += tagBuffer
}
flush()
return buffer
}
|