From cc00af7a3102034b05ebcd4aa1fd01c6f467184a Mon Sep 17 00:00:00 2001 From: Henry Jameson Date: Thu, 10 Jun 2021 18:52:01 +0300 Subject: Hellthread(tm) Certified --- .../html_converter/html_line_converter.service.js | 102 ++++++++++++++ .../html_converter/html_tree_converter.service.js | 146 +++++++++++++++++++++ .../mini_html_converter.service.js | 138 ------------------- .../tiny_post_html_processor.service.js | 94 ------------- 4 files changed, 248 insertions(+), 232 deletions(-) create mode 100644 src/services/html_converter/html_line_converter.service.js create mode 100644 src/services/html_converter/html_tree_converter.service.js delete mode 100644 src/services/mini_html_converter/mini_html_converter.service.js delete mode 100644 src/services/tiny_post_html_processor/tiny_post_html_processor.service.js (limited to 'src/services') diff --git a/src/services/html_converter/html_line_converter.service.js b/src/services/html_converter/html_line_converter.service.js new file mode 100644 index 00000000..80482c9a --- /dev/null +++ b/src/services/html_converter/html_line_converter.service.js @@ -0,0 +1,102 @@ +/** + * This is a tiny purpose-built HTML parser/processor. This basically detects + * any type of visual newline and converts entire HTML into a array structure. + * + * Text nodes are represented as object with single property - text - containing + * the visual line. Intended usage is to process the array with .map() in which + * map function returns a string and resulting array can be converted back to html + * with a .join(''). + * + * Generally this isn't very useful except for when you really need to either + * modify visual lines (greentext i.e. simple quoting) or do something with + * first/last line. + * + * known issue: doesn't handle CDATA so nested CDATA might not work well + * + * @param {Object} input - input data + * @return {(string|{ text: string })[]} processed html in form of a list. + */ +export const convertHtmlToLines = (html) => { + const handledTags = new Set(['p', 'br', 'div']) + const openCloseTags = new Set(['p', 'div']) + + let buffer = [] // Current output buffer + const level = [] // How deep we are in tags and which tags were there + let textBuffer = '' // Current line content + let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag + + // Extracts tag name from tag, i.e. => span + const getTagName = (tag) => { + const result = /(?:<\/(\w+)>|<(\w+)\s?[^/]*?\/?>)/gi.exec(tag) + return result && (result[1] || result[2]) + } + + const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer + if (textBuffer.trim().length > 0) { + buffer.push({ text: textBuffer }) + } else { + buffer.push(textBuffer) + } + textBuffer = '' + } + + const handleBr = (tag) => { // handles single newlines/linebreaks/selfclosing + flush() + buffer.push(tag) + } + + const handleOpen = (tag) => { // handles opening tags + flush() + buffer.push(tag) + level.push(tag) + } + + const handleClose = (tag) => { // handles closing tags + flush() + buffer.push(tag) + if (level[level.length - 1] === tag) { + level.pop() + } + } + + for (let i = 0; i < html.length; i++) { + const char = html[i] + if (char === '<' && tagBuffer === null) { + tagBuffer = char + } else if (char !== '>' && tagBuffer !== null) { + tagBuffer += char + } else if (char === '>' && tagBuffer !== null) { + tagBuffer += char + const tagFull = tagBuffer + tagBuffer = null + const tagName = getTagName(tagFull) + if (handledTags.has(tagName)) { + if (tagName === 'br') { + handleBr(tagFull) + } else if (openCloseTags.has(tagName)) { + if (tagFull[1] === '/') { + handleClose(tagFull) + } else if (tagFull[tagFull.length - 2] === '/') { + // self-closing + handleBr(tagFull) + } else { + handleOpen(tagFull) + } + } + } else { + textBuffer += tagFull + } + } else if (char === '\n') { + handleBr(char) + } else { + textBuffer += char + } + } + if (tagBuffer) { + textBuffer += tagBuffer + } + + flush() + + return buffer +} diff --git a/src/services/html_converter/html_tree_converter.service.js b/src/services/html_converter/html_tree_converter.service.js new file mode 100644 index 00000000..badd473a --- /dev/null +++ b/src/services/html_converter/html_tree_converter.service.js @@ -0,0 +1,146 @@ +/** + * This is a not-so-tiny purpose-built HTML parser/processor. This parses html + * and converts it into a tree structure representing tag openers/closers and + * children. + * + * Structure follows this pattern: [opener, [...children], closer] except root + * node which is just [...children]. Text nodes can only be within children and + * are represented as strings. + * + * Intended use is to convert HTML structure and then recursively iterate over it + * most likely using a map. Very useful for dynamically rendering html replacing + * tags with JSX elements in a render function. + * + * known issue: doesn't handle CDATA so CDATA might not work well + * known issue: doesn't handle HTML comments + * + * @param {Object} input - input data + * @return {string} processed html + */ +export const convertHtmlToTree = (html) => { + // Elements that are implicitly self-closing + // https://developer.mozilla.org/en-US/docs/Glossary/empty_element + const emptyElements = new Set([ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', + 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr' + ]) + // TODO For future - also parse HTML5 multi-source components? + + const buffer = [] // Current output buffer + const levels = [['', buffer]] // How deep we are in tags and which tags were there + let textBuffer = '' // Current line content + let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag + + const getCurrentBuffer = () => { + return levels[levels.length - 1][1] + } + + const flushText = () => { // Processes current line buffer, adds it to output buffer and clears line buffer + if (textBuffer === '') return + getCurrentBuffer().push(textBuffer) + textBuffer = '' + } + + const handleSelfClosing = (tag) => { + getCurrentBuffer().push([tag]) + } + + const handleOpen = (tag) => { + const curBuf = getCurrentBuffer() + const newLevel = [tag, []] + levels.push(newLevel) + curBuf.push(newLevel) + } + + const handleClose = (tag) => { + const currentTag = levels[levels.length - 1] + if (getTagName(levels[levels.length - 1][0]) === getTagName(tag)) { + currentTag.push(tag) + levels.pop() + } else { + getCurrentBuffer().push(tag) + } + } + + for (let i = 0; i < html.length; i++) { + const char = html[i] + if (char === '<' && tagBuffer === null) { + flushText() + tagBuffer = char + } else if (char !== '>' && tagBuffer !== null) { + tagBuffer += char + } else if (char === '>' && tagBuffer !== null) { + tagBuffer += char + const tagFull = tagBuffer + tagBuffer = null + const tagName = getTagName(tagFull) + if (tagFull[1] === '/') { + handleClose(tagFull) + } else if (emptyElements.has(tagName) || tagFull[tagFull.length - 2] === '/') { + // self-closing + handleSelfClosing(tagFull) + } else { + handleOpen(tagFull) + } + } else { + textBuffer += char + } + } + if (tagBuffer) { + textBuffer += tagBuffer + } + + flushText() + return buffer +} + +// Extracts tag name from tag, i.e. => span +export const getTagName = (tag) => { + const result = /(?:<\/(\w+)>|<(\w+)\s?.*?\/?>)/gi.exec(tag) + return result && (result[1] || result[2]) +} + +export const processTextForEmoji = (text, emojis, processor) => { + const buffer = [] + let textBuffer = '' + for (let i = 0; i < text.length; i++) { + const char = text[i] + if (char === ':') { + const next = text.slice(i + 1) + let found = false + for (let emoji of emojis) { + if (next.slice(0, emoji.shortcode.length + 1) === (emoji.shortcode + ':')) { + found = emoji + break + } + } + if (found) { + buffer.push(textBuffer) + textBuffer = '' + buffer.push(processor(found)) + i += found.shortcode.length + 1 + } else { + textBuffer += char + } + } else { + textBuffer += char + } + } + if (textBuffer) buffer.push(textBuffer) + return buffer +} + +export const getAttrs = tag => { + const innertag = tag + .substring(1, tag.length - 1) + .replace(new RegExp('^' + getTagName(tag)), '') + .replace(/\/?$/, '') + .trim() + const attrs = Array.from(innertag.matchAll(/([a-z0-9-]+)(?:=("[^"]+?"|'[^']+?'))?/gi)) + .map(([trash, key, value]) => [key, value]) + .map(([k, v]) => { + if (!v) return [k, true] + return [k, v.substring(1, v.length - 1)] + }) + return Object.fromEntries(attrs) +} diff --git a/src/services/mini_html_converter/mini_html_converter.service.js b/src/services/mini_html_converter/mini_html_converter.service.js deleted file mode 100644 index 900752cd..00000000 --- a/src/services/mini_html_converter/mini_html_converter.service.js +++ /dev/null @@ -1,138 +0,0 @@ -/** - * This is a not-so-tiny purpose-built HTML parser/processor. It was made for use - * with StatusBody component for purpose of replacing tags with vue components - * - * known issue: doesn't handle CDATA so nested CDATA might not work well - * - * @param {Object} input - input data - * @param {(string) => string} lineProcessor - function that will be called on every line - * @param {{ key[string]: (string) => string}} tagProcessor - map of processors for tags - * @return {string} processed html - */ -export const convertHtml = (html) => { - // Elements that are implicitly self-closing - // https://developer.mozilla.org/en-US/docs/Glossary/empty_element - const emptyElements = new Set([ - 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', - 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr' - ]) - // TODO For future - also parse HTML5 multi-source components? - - const buffer = [] // Current output buffer - const levels = [['', buffer]] // How deep we are in tags and which tags were there - let textBuffer = '' // Current line content - let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag - - const getCurrentBuffer = () => { - return levels[levels.length - 1][1] - } - - const flushText = () => { // Processes current line buffer, adds it to output buffer and clears line buffer - if (textBuffer === '') return - getCurrentBuffer().push(textBuffer) - textBuffer = '' - } - - const handleSelfClosing = (tag) => { - getCurrentBuffer().push([tag]) - } - - const handleOpen = (tag) => { - const curBuf = getCurrentBuffer() - const newLevel = [tag, []] - levels.push(newLevel) - curBuf.push(newLevel) - } - - const handleClose = (tag) => { - const currentTag = levels[levels.length - 1] - if (getTagName(levels[levels.length - 1][0]) === getTagName(tag)) { - currentTag.push(tag) - levels.pop() - } else { - getCurrentBuffer().push(tag) - } - } - - for (let i = 0; i < html.length; i++) { - const char = html[i] - if (char === '<' && tagBuffer === null) { - flushText() - tagBuffer = char - } else if (char !== '>' && tagBuffer !== null) { - tagBuffer += char - } else if (char === '>' && tagBuffer !== null) { - tagBuffer += char - const tagFull = tagBuffer - tagBuffer = null - const tagName = getTagName(tagFull) - if (tagFull[1] === '/') { - handleClose(tagFull) - } else if (emptyElements.has(tagName) || tagFull[tagFull.length - 2] === '/') { - // self-closing - handleSelfClosing(tagFull) - } else { - handleOpen(tagFull) - } - } else { - textBuffer += char - } - } - if (tagBuffer) { - textBuffer += tagBuffer - } - - flushText() - return buffer -} - -// Extracts tag name from tag, i.e. => span -export const getTagName = (tag) => { - const result = /(?:<\/(\w+)>|<(\w+)\s?.*?\/?>)/gi.exec(tag) - return result && (result[1] || result[2]) -} - -export const processTextForEmoji = (text, emojis, processor) => { - const buffer = [] - let textBuffer = '' - for (let i = 0; i < text.length; i++) { - const char = text[i] - if (char === ':') { - const next = text.slice(i + 1) - let found = false - for (let emoji of emojis) { - if (next.slice(0, emoji.shortcode.length + 1) === (emoji.shortcode + ':')) { - found = emoji - break - } - } - if (found) { - buffer.push(textBuffer) - textBuffer = '' - buffer.push(processor(found)) - i += found.shortcode.length + 1 - } else { - textBuffer += char - } - } else { - textBuffer += char - } - } - if (textBuffer) buffer.push(textBuffer) - return buffer -} - -export const getAttrs = tag => { - const innertag = tag - .substring(1, tag.length - 1) - .replace(new RegExp('^' + getTagName(tag)), '') - .replace(/\/?$/, '') - .trim() - const attrs = Array.from(innertag.matchAll(/([a-z0-9-]+)(?:=("[^"]+?"|'[^']+?'))?/gi)) - .map(([trash, key, value]) => [key, value]) - .map(([k, v]) => { - if (!v) return [k, true] - return [k, v.substring(1, v.length - 1)] - }) - return Object.fromEntries(attrs) -} diff --git a/src/services/tiny_post_html_processor/tiny_post_html_processor.service.js b/src/services/tiny_post_html_processor/tiny_post_html_processor.service.js deleted file mode 100644 index de6f20ef..00000000 --- a/src/services/tiny_post_html_processor/tiny_post_html_processor.service.js +++ /dev/null @@ -1,94 +0,0 @@ -/** - * This is a tiny purpose-built HTML parser/processor. This basically detects any type of visual newline and - * allows it to be processed, useful for greentexting, mostly - * - * known issue: doesn't handle CDATA so nested CDATA might not work well - * - * @param {Object} input - input data - * @param {(string) => string} processor - function that will be called on every line - * @return {string} processed html - */ -export const processHtml = (html, processor) => { - const handledTags = new Set(['p', 'br', 'div']) - const openCloseTags = new Set(['p', 'div']) - - let buffer = '' // Current output buffer - const level = [] // How deep we are in tags and which tags were there - let textBuffer = '' // Current line content - let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag - - // Extracts tag name from tag, i.e. => span - const getTagName = (tag) => { - const result = /(?:<\/(\w+)>|<(\w+)\s?[^/]*?\/?>)/gi.exec(tag) - return result && (result[1] || result[2]) - } - - const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer - if (textBuffer.trim().length > 0) { - buffer += processor(textBuffer) - } else { - buffer += textBuffer - } - textBuffer = '' - } - - const handleBr = (tag) => { // handles single newlines/linebreaks/selfclosing - flush() - buffer += tag - } - - const handleOpen = (tag) => { // handles opening tags - flush() - buffer += tag - level.push(tag) - } - - const handleClose = (tag) => { // handles closing tags - flush() - buffer += tag - if (level[level.length - 1] === tag) { - level.pop() - } - } - - for (let i = 0; i < html.length; i++) { - const char = html[i] - if (char === '<' && tagBuffer === null) { - tagBuffer = char - } else if (char !== '>' && tagBuffer !== null) { - tagBuffer += char - } else if (char === '>' && tagBuffer !== null) { - tagBuffer += char - const tagFull = tagBuffer - tagBuffer = null - const tagName = getTagName(tagFull) - if (handledTags.has(tagName)) { - if (tagName === 'br') { - handleBr(tagFull) - } else if (openCloseTags.has(tagName)) { - if (tagFull[1] === '/') { - handleClose(tagFull) - } else if (tagFull[tagFull.length - 2] === '/') { - // self-closing - handleBr(tagFull) - } else { - handleOpen(tagFull) - } - } - } else { - textBuffer += tagFull - } - } else if (char === '\n') { - handleBr(char) - } else { - textBuffer += char - } - } - if (tagBuffer) { - textBuffer += tagBuffer - } - - flush() - - return buffer -} -- cgit v1.2.3-70-g09d2