Merge remote-tracking branch 'origin/develop' into settings-and-filtering

* origin/develop: (169 commits) Improve the user card for deactivated users Update CHANGELOG.md Update CHANGELOG.md Allow canceling a follow request Simple policy reasons for instance specific policies entity_normalizer: Escape name when parsing user Translated using Weblate (Spanish) Translated using Weblate (Catalan) Translated using Weblate (Korean) Translated using Weblate (Japanese (ja_PEDANTIC)) Translated using Weblate (Indonesian) Translated using Weblate (Esperanto) Translated using Weblate (Vietnamese) Translated using Weblate (Italian) Translated using Weblate (Vietnamese) Translated using Weblate (Indonesian) Translated using Weblate (Italian) Translated using Weblate (Vietnamese) Translated using Weblate (Indonesian) Translated using Weblate (Chinese (Simplified)) ...
author: Henry Jameson <me@hjkos.com> 2022-01-24 19:12:17 +0200
committer: Henry Jameson <me@hjkos.com> 2022-01-24 19:12:17 +0200
commit: 9ea0f10abb195799842baf2f4b8711d1744b37bb (patch)
tree: 2c64119cc7c6bc989e65189712ce81cbbb40c91a /src/services/html_converter
parent: a96a62929d723c3676174dfd71c0db4462599a12 (diff)
parent: 182fcca5da9fa284f46f5ca1c8b1790353dec316 (diff)
3 files changed, 306 insertions, 0 deletions
diff --git a/src/services/html_converter/html_line_converter.service.js b/src/services/html_converter/html_line_converter.service.js
new file mode 100644
index 00000000..5eeaa7cb
--- /dev/null
+++ b/src/services/html_converter/html_line_converter.service.js
@@ -0,0 +1,136 @@
+import { getTagName } from './utility.service.js'
+
+/**
+ * This is a tiny purpose-built HTML parser/processor. It detects any type of
+ * visual newline and converts the entire HTML into an array structure.
+ *
+ * Lines with visible text become objects `{ level, text }` where `text` is the
+ * visual line and `level` lists the enclosing block-level tags, innermost first;
+ * tags, newlines and whitespace-only runs stay plain strings. Intended usage:
+ * process the array with .map() returning strings, then .join('') back to html.
+ *
+ * Generally this isn't very useful except for when you really need to either
+ * modify visual lines (greentext i.e. simple quoting) or do something with
+ * first/last line.
+ *
+ * known issue: doesn't handle CDATA so nested CDATA might not work well
+ *
+ * @param {string} html - HTML string to process, defaults to ''
+ * @return {(string|{ level: string[], text: string })[]} processed html in form of a list.
+ */
+export const convertHtmlToLines = (html = '') => {
+  // Elements that are implicitly self-closing
+  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
+  const emptyElements = new Set([
+    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
+  ])
+  // Block-level element (they make a visual line)
+  // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
+  const blockElements = new Set([
+    'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd',
+    'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main',
+    'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'
+  ])
+  // br is very weird in a way that it's technically not block-level, it's
+  // essentially converted to a \n (or \r\n). There's also wbr but it doesn't
+  // guarantee linebreak, only suggest it.
+  const linebreakElements = new Set(['br'])
+
+  const visualLineElements = new Set([
+    ...blockElements.values(),
+    ...linebreakElements.values()
+  ])
+
+  // All visual line elements that aren't empty elements, i.e. not <hr> or <br>
+  const nonEmptyElements = new Set(visualLineElements)
+  // Set difference: visualLineElements minus emptyElements
+  for (let elem of emptyElements) {
+    nonEmptyElements.delete(elem)
+  }
+
+  // All elements that we are recognizing (visual line elements + empty elements)
+  const allElements = new Set([
+    ...nonEmptyElements.values(),
+    ...emptyElements.values()
+  ])
+
+  let buffer = [] // Current output buffer
+  const level = [] // Names of currently open block-level tags, innermost first
+  let textBuffer = '' // Current line content
+  let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
+
+  const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer
+    if (textBuffer.trim().length > 0) {
+      buffer.push({ level: [...level], text: textBuffer }) // copy of level, so later changes don't affect this entry
+    } else {
+      buffer.push(textBuffer) // empty or whitespace-only content is kept as a plain string
+    }
+    textBuffer = ''
+  }
+
+  const handleBr = (tag) => { // handles single newlines/linebreaks/selfclosing: ends current line, emits tag as is
+    flush()
+    buffer.push(tag)
+  }
+
+  const handleOpen = (tag) => { // handles opening tags: ends current line, emits tag and goes one level deeper
+    flush()
+    buffer.push(tag)
+    level.unshift(getTagName(tag))
+  }
+
+  const handleClose = (tag) => { // handles closing tags
+    if (level[0] === getTagName(tag)) {
+      flush()
+      buffer.push(tag)
+      level.shift()
+    } else { // Broken case: closer doesn't match innermost open tag, keep it as part of the text
+      textBuffer += tag
+    }
+  }
+
+  for (let i = 0; i < html.length; i++) {
+    const char = html[i]
+    if (char === '<' && tagBuffer === null) {
+      tagBuffer = char
+    } else if (char !== '>' && tagBuffer !== null) {
+      tagBuffer += char
+    } else if (char === '>' && tagBuffer !== null) {
+      tagBuffer += char
+      const tagFull = tagBuffer
+      tagBuffer = null
+      const tagName = getTagName(tagFull)
+      if (allElements.has(tagName)) {
+        if (linebreakElements.has(tagName)) {
+          handleBr(tagFull)
+        } else if (nonEmptyElements.has(tagName)) {
+          if (tagFull[1] === '/') {
+            handleClose(tagFull)
+          } else if (tagFull[tagFull.length - 2] === '/') {
+            // self-closing
+            handleBr(tagFull)
+          } else {
+            handleOpen(tagFull)
+          }
+        } else {
+          textBuffer += tagFull // recognized empty element that isn't a linebreak (e.g. <hr>, <img>): kept in the line
+        }
+      } else {
+        textBuffer += tagFull // unrecognized tag: kept in the line
+      }
+    } else if (char === '\n') { // only reached outside of a tag: literal newline also ends the line
+      handleBr(char)
+    } else {
+      textBuffer += char
+    }
+  }
+  if (tagBuffer) { // unterminated tag at the end of input is kept as text
+    textBuffer += tagBuffer
+  }
+
+  flush()
+
+  return buffer
+}
diff --git a/src/services/html_converter/html_tree_converter.service.js b/src/services/html_converter/html_tree_converter.service.js
new file mode 100644
index 00000000..6a8796c4
--- /dev/null
+++ b/src/services/html_converter/html_tree_converter.service.js
@@ -0,0 +1,97 @@
+import { getTagName } from './utility.service.js'
+
+/**
+ * This is a not-so-tiny purpose-built HTML parser/processor. This parses html
+ * and converts it into a tree structure representing tag openers/closers and
+ * children.
+ *
+ * Structure follows this pattern: [opener, [...children], closer] except root
+ * node which is just [...children] and empty/self-closing tags which are
+ * [tag]. Text nodes can only be within children and are represented as strings.
+ *
+ * Intended use is to convert HTML structure and then recursively iterate over it
+ * most likely using a map. Very useful for dynamically rendering html replacing
+ * tags with JSX elements in a render function.
+ *
+ * known issue: doesn't handle CDATA so CDATA might not work well
+ * known issue: doesn't handle HTML comments
+ *
+ * @param {string} html - HTML string to parse, defaults to ''
+ * @return {Array} root children: strings (text) and nested arrays (tags)
+ */
+export const convertHtmlToTree = (html = '') => {
+  // Elements that are implicitly self-closing
+  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
+  const emptyElements = new Set([
+    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
+  ])
+  // TODO For future - also parse HTML5 multi-source components?
+
+  const buffer = [] // Current output buffer
+  const levels = [['', buffer]] // Stack of open [opener, children] nodes; first entry is the root
+  let textBuffer = '' // Current line content
+  let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
+
+  const getCurrentBuffer = () => {
+    return levels[levels.length - 1][1] // children array of the innermost open node
+  }
+
+  const flushText = () => { // Adds pending text (if any) to current children and clears it
+    if (textBuffer === '') return
+    getCurrentBuffer().push(textBuffer)
+    textBuffer = ''
+  }
+
+  const handleSelfClosing = (tag) => {
+    getCurrentBuffer().push([tag]) // single-element node: no children, no closer
+  }
+
+  const handleOpen = (tag) => {
+    const curBuf = getCurrentBuffer()
+    const newLevel = [tag, []]
+    levels.push(newLevel)
+    curBuf.push(newLevel) // same array is shared: the closer is appended to it later in handleClose
+  }
+
+  const handleClose = (tag) => {
+    const currentTag = levels[levels.length - 1]
+    if (getTagName(levels[levels.length - 1][0]) === getTagName(tag)) { // NOTE(review): both sides are null for root + unparsable closer like '</>', which would pop the root level - confirm
+      currentTag.push(tag)
+      levels.pop()
+    } else {
+      getCurrentBuffer().push(tag) // closer doesn't match innermost open tag: kept as a plain string child
+    }
+  }
+
+  for (let i = 0; i < html.length; i++) {
+    const char = html[i]
+    if (char === '<' && tagBuffer === null) {
+      flushText()
+      tagBuffer = char
+    } else if (char !== '>' && tagBuffer !== null) {
+      tagBuffer += char
+    } else if (char === '>' && tagBuffer !== null) {
+      tagBuffer += char
+      const tagFull = tagBuffer
+      tagBuffer = null
+      const tagName = getTagName(tagFull)
+      if (tagFull[1] === '/') {
+        handleClose(tagFull)
+      } else if (emptyElements.has(tagName) || tagFull[tagFull.length - 2] === '/') {
+        // self-closing
+        handleSelfClosing(tagFull)
+      } else {
+        handleOpen(tagFull)
+      }
+    } else {
+      textBuffer += char
+    }
+  }
+  if (tagBuffer) { // unterminated tag at the end of input is kept as text
+    textBuffer += tagBuffer
+  }
+
+  flushText()
+  return buffer
+}
diff --git a/src/services/html_converter/utility.service.js b/src/services/html_converter/utility.service.js
new file mode 100644
index 00000000..4d0c36c2
--- /dev/null
+++ b/src/services/html_converter/utility.service.js
@@ -0,0 +1,73 @@
+/**
+ * Extract tag name from tag opener/closer (letter case is returned as written).
+ *
+ * @param {String} tag - tag string, e.g. '<a href="...">' or '</a>'
+ * @return {String|null} - tag name, e.g. "a"; null if no tag pattern matches
+ */
+export const getTagName = (tag) => {
+  const result = /(?:<\/(\w+)>|<(\w+)\s?.*?\/?>)/gi.exec(tag) // group 1 = closer name, group 2 = opener/self-closing name
+  return result && (result[1] || result[2])
+}
+
+/**
+ * Extract attributes from tag opener. Only quoted, non-empty values are read.
+ *
+ * @param {String} tag - tag string, e.g. '<a href="...">'
+ * @return {Object} - map of attribute name => attribute value (quotes stripped);
+ *   attributes without a (recognized) value are represented as boolean true
+ */
+export const getAttrs = tag => {
+  const innertag = tag
+    .substring(1, tag.length - 1) // drop first and last character (the '<' and '>')
+    .replace(new RegExp('^' + getTagName(tag)), '') // drop leading tag name
+    .replace(/\/?$/, '') // drop trailing '/' of self-closing tags
+    .trim()
+  const attrs = Array.from(innertag.matchAll(/([a-z0-9-]+)(?:=("[^"]+?"|'[^']+?'))?/gi))
+    .map(([trash, key, value]) => [key, value]) // discard the full match, keep the two capture groups
+    .map(([k, v]) => {
+      if (!v) return [k, true] // no value captured
+      return [k, v.substring(1, v.length - 1)] // strip surrounding quotes
+    })
+  return Object.fromEntries(attrs)
+}
+
+/**
+ * Finds :shortcode: occurrences in text and replaces them using {processor}
+ *
+ * @param {String} text - original text to find emojis in
+ * @param {{ url: String, shortcode: String }[]} emojis - list of shortcodes to find
+ * @param {Function} processor - function to call on each encountered emoji,
+ *   function is passed the matching entry of {emojis} ({ url, shortcode })
+ *   return value will be inserted into resulting array instead of :shortcode:
+ * @return {Array} resulting array with non-emoji parts of text (possibly '' right
+ *   before an emoji) and whatever {processor} returned for emoji
+ */
+export const processTextForEmoji = (text, emojis, processor) => {
+  const buffer = []
+  let textBuffer = ''
+  for (let i = 0; i < text.length; i++) {
+    const char = text[i]
+    if (char === ':') {
+      const next = text.slice(i + 1) // rest of the text after this ':'
+      let found = false
+      for (let emoji of emojis) { // first entry in list order whose 'shortcode:' follows wins
+        if (next.slice(0, emoji.shortcode.length + 1) === (emoji.shortcode + ':')) {
+          found = emoji
+          break
+        }
+      }
+      if (found) {
+        buffer.push(textBuffer) // pushed even when empty
+        textBuffer = ''
+        buffer.push(processor(found))
+        i += found.shortcode.length + 1 // jump to the closing ':', loop's i++ then moves past it
+      } else {
+        textBuffer += char
+      }
+    } else {
+      textBuffer += char
+    }
+  }
+  if (textBuffer) buffer.push(textBuffer)
+  return buffer
+}
author	Henry Jameson <me@hjkos.com>	2022-01-24 19:12:17 +0200
committer	Henry Jameson <me@hjkos.com>	2022-01-24 19:12:17 +0200
commit	9ea0f10abb195799842baf2f4b8711d1744b37bb (patch)
tree	2c64119cc7c6bc989e65189712ce81cbbb40c91a /src/services/html_converter
parent	a96a62929d723c3676174dfd71c0db4462599a12 (diff)
parent	182fcca5da9fa284f46f5ca1c8b1790353dec316 (diff)