refactored line converter, untied its logic from greentexting, better

handling of broken cases
author: Henry Jameson <me@hjkos.com> 2021-06-13 13:29:26 +0300
committer: Henry Jameson <me@hjkos.com> 2021-06-13 15:24:29 +0300
commit: bebafa1a2c38972245d37de70f4aec4bfb2083fd (patch)
tree: 733c738d15bda28210053fdfb80b789c8a9dd708 /src
parent: e825021ef1ae7a672b275227a6a1ff44d5f522bc (diff)
2 files changed, 54 insertions, 11 deletions
diff --git a/src/components/rich_content/rich_content.jsx b/src/components/rich_content/rich_content.jsx
index e188763f..328e9201 100644
--- a/src/components/rich_content/rich_content.jsx
+++ b/src/components/rich_content/rich_content.jsx
@@ -246,6 +246,7 @@ const getLinkData = (attrs, children, index) => {
  */
 export const preProcessPerLine = (html, greentext, handleLinks) => {
   const lastMentions = []
+  const greentextHandle = new Set(['p', 'div'])
 
   let nonEmptyIndex = -1
   const newHtml = convertHtmlToLines(html).reverse().map((item, index, array) => {
@@ -256,7 +257,14 @@ export const preProcessPerLine = (html, greentext, handleLinks) => {
     nonEmptyIndex += 1
 
     // Greentext stuff
-    if (greentext && (string.includes('&gt;') || string.includes('&lt;'))) {
+    if (
+      // Only if greentext is engaged
+      greentext &&
+        // Only handle p's and divs. Don't want to affect blockquotes, code, etc.
+        item.level.every(l => greentextHandle.has(l)) &&
+        // Only if line contains an escaped '>' or '<' somewhere (cheap pre-check)
+        (string.includes('&gt;') || string.includes('&lt;'))
+    ) {
       const cleanedString = string.replace(/<[^>]+?>/gi, '') // remove all tags
         .replace(/@\w+/gi, '') // remove mentions (even failed ones)
         .trim()
diff --git a/src/services/html_converter/html_line_converter.service.js b/src/services/html_converter/html_line_converter.service.js
index e448d5cd..f43d162a 100644
--- a/src/services/html_converter/html_line_converter.service.js
+++ b/src/services/html_converter/html_line_converter.service.js
@@ -19,9 +19,42 @@ import { getTagName } from './utility.service.js'
  * @return {(string|{ text: string })[]} processed html in form of a list.
  */
 export const convertHtmlToLines = (html) => {
-  const ignoredTags = new Set(['code', 'blockquote'])
-  const handledTags = new Set(['p', 'br', 'div', 'pre', 'code', 'blockquote'])
-  const openCloseTags = new Set(['p', 'div', 'pre', 'code', 'blockquote'])
+  // Elements that are implicitly self-closing
+  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
+  const emptyElements = new Set([
+    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
+  ])
+  // Block-level elements (each of them makes a visual line)
+  // https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
+  const blockElements = new Set([
+    'address', 'article', 'aside', 'blockquote', 'details', 'dialog', 'dd',
+    'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'main',
+    'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul'
+  ])
+  // br is weird in that it's technically not block-level; it's essentially
+  // converted to a \n (or \r\n). There's also wbr, but it doesn't guarantee
+  // a line break, it only suggests one.
+  const linebreakElements = new Set(['br'])
+
+  const visualLineElements = new Set([
+    ...blockElements.values(),
+    ...linebreakElements.values()
+  ])
+
+  // All visual-line elements that aren't empty elements, i.e. not <hr> or <br>
+  const nonEmptyElements = new Set(visualLineElements)
+  // Difference
+  for (let elem of emptyElements) {
+    nonEmptyElements.delete(elem)
+  }
+
+  // All elements that we are recognizing
+  const allElements = new Set([
+    ...nonEmptyElements.values(),
+    ...emptyElements.values()
+  ])
 
   let buffer = [] // Current output buffer
   const level = [] // How deep we are in tags and which tags were there
@@ -29,8 +62,8 @@ export const convertHtmlToLines = (html) => {
   let tagBuffer = null // Current tag buffer, if null = we are not currently reading a tag
 
   const flush = () => { // Processes current line buffer, adds it to output buffer and clears line buffer
-    if (textBuffer.trim().length > 0 && !level.some(l => ignoredTags.has(l))) {
-      buffer.push({ text: textBuffer })
+    if (textBuffer.trim().length > 0) {
+      buffer.push({ level: [...level], text: textBuffer })
     } else {
       buffer.push(textBuffer)
     }
@@ -49,10 +82,12 @@ export const convertHtmlToLines = (html) => {
   }
 
   const handleClose = (tag) => { // handles closing tags
-    flush()
-    buffer.push(tag)
     if (level[0] === getTagName(tag)) {
+      flush()
+      buffer.push(tag)
       level.shift()
+    } else { // Broken case
+      textBuffer += tag
     }
   }
 
@@ -67,10 +102,10 @@ export const convertHtmlToLines = (html) => {
       const tagFull = tagBuffer
       tagBuffer = null
       const tagName = getTagName(tagFull)
-      if (handledTags.has(tagName)) {
-        if (tagName === 'br') {
+      if (allElements.has(tagName)) {
+        if (linebreakElements.has(tagName)) {
           handleBr(tagFull)
-        } else if (openCloseTags.has(tagName)) {
+        } else if (nonEmptyElements.has(tagName)) {
           if (tagFull[1] === '/') {
             handleClose(tagFull)
           } else if (tagFull[tagFull.length - 2] === '/') {
author	Henry Jameson <me@hjkos.com>	2021-06-13 13:29:26 +0300
committer	Henry Jameson <me@hjkos.com>	2021-06-13 15:24:29 +0300
commit	bebafa1a2c38972245d37de70f4aec4bfb2083fd (patch)
tree	733c738d15bda28210053fdfb80b789c8a9dd708 /src
parent	e825021ef1ae7a672b275227a6a1ff44d5f522bc (diff)