1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
/**
* This is a not-so-tiny purpose-built HTML parser/processor. This parses html
* and converts it into a tree structure representing tag openers/closers and
* children.
*
* Structure follows this pattern: [opener, [...children], closer] except root
* node which is just [...children]. Text nodes can only be within children and
* are represented as strings.
*
* Intended use is to convert HTML structure and then recursively iterate over it
* most likely using a map. Very useful for dynamically rendering html replacing
* tags with JSX elements in a render function.
*
* known issue: doesn't handle CDATA so CDATA might not work well
* known issue: doesn't handle HTML comments
*
* @param {Object} input - input data
* @return {string} processed html
*/
export const convertHtmlToTree = (html) => {
// Elements that are implicitly self-closing
// https://developer.mozilla.org/en-US/docs/Glossary/empty_element
const emptyElements = new Set([
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
])
// TODO For future - also parse HTML5 multi-source components?
const buffer = [] // Current output buffer
const levels = [['', buffer]] // How deep we are in tags and which tags were there
let textBuffer = '' // Current line content
let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
const getCurrentBuffer = () => {
return levels[levels.length - 1][1]
}
const flushText = () => { // Processes current line buffer, adds it to output buffer and clears line buffer
if (textBuffer === '') return
getCurrentBuffer().push(textBuffer)
textBuffer = ''
}
const handleSelfClosing = (tag) => {
getCurrentBuffer().push([tag])
}
const handleOpen = (tag) => {
const curBuf = getCurrentBuffer()
const newLevel = [tag, []]
levels.push(newLevel)
curBuf.push(newLevel)
}
const handleClose = (tag) => {
const currentTag = levels[levels.length - 1]
if (getTagName(levels[levels.length - 1][0]) === getTagName(tag)) {
currentTag.push(tag)
levels.pop()
} else {
getCurrentBuffer().push(tag)
}
}
for (let i = 0; i < html.length; i++) {
const char = html[i]
if (char === '<' && tagBuffer === null) {
flushText()
tagBuffer = char
} else if (char !== '>' && tagBuffer !== null) {
tagBuffer += char
} else if (char === '>' && tagBuffer !== null) {
tagBuffer += char
const tagFull = tagBuffer
tagBuffer = null
const tagName = getTagName(tagFull)
if (tagFull[1] === '/') {
handleClose(tagFull)
} else if (emptyElements.has(tagName) || tagFull[tagFull.length - 2] === '/') {
// self-closing
handleSelfClosing(tagFull)
} else {
handleOpen(tagFull)
}
} else {
textBuffer += char
}
}
if (tagBuffer) {
textBuffer += tagBuffer
}
flushText()
return buffer
}
// Extracts tag name from tag, i.e. <span a="b"> => span
export const getTagName = (tag) => {
const result = /(?:<\/(\w+)>|<(\w+)\s?.*?\/?>)/gi.exec(tag)
return result && (result[1] || result[2])
}
export const processTextForEmoji = (text, emojis, processor) => {
const buffer = []
let textBuffer = ''
for (let i = 0; i < text.length; i++) {
const char = text[i]
if (char === ':') {
const next = text.slice(i + 1)
let found = false
for (let emoji of emojis) {
if (next.slice(0, emoji.shortcode.length + 1) === (emoji.shortcode + ':')) {
found = emoji
break
}
}
if (found) {
buffer.push(textBuffer)
textBuffer = ''
buffer.push(processor(found))
i += found.shortcode.length + 1
} else {
textBuffer += char
}
} else {
textBuffer += char
}
}
if (textBuffer) buffer.push(textBuffer)
return buffer
}
export const getAttrs = tag => {
const innertag = tag
.substring(1, tag.length - 1)
.replace(new RegExp('^' + getTagName(tag)), '')
.replace(/\/?$/, '')
.trim()
const attrs = Array.from(innertag.matchAll(/([a-z0-9-]+)(?:=("[^"]+?"|'[^']+?'))?/gi))
.map(([trash, key, value]) => [key, value])
.map(([k, v]) => {
if (!v) return [k, true]
return [k, v.substring(1, v.length - 1)]
})
return Object.fromEntries(attrs)
}
|