2021-10-06 19:40:33 +00:00
|
|
|
import h2m from 'h2m'
|
2021-02-14 15:40:36 +00:00
|
|
|
import { decode } from 'he'
|
2021-10-06 19:40:33 +00:00
|
|
|
import { htmlDecode } from 'js-htmlencode'
|
|
|
|
import { HTMLElement, parse, TextNode } from 'node-html-parser'
|
2021-02-11 12:02:53 +00:00
|
|
|
|
2021-03-30 15:21:18 +00:00
|
|
|
// Inline formatting tags. `deepClean` collapses element children of these
// tags into plain text instead of recursing into them.
const noChildren: string[] = ['strong', 'b', 'em', 'i', 'u', 's']

// Tags whose whitespace is normalised: `cleanText` trims text directly inside
// them and `rearrangeWhitespace` moves edge whitespace outside of them.
// Consists of every `noChildren` tag followed by headings, list items and links.
const trimNodes: string[] = noChildren.concat([
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
  'li',
  'a',
])
|
2021-02-14 15:40:36 +00:00
|
|
|
/**
 * Builds a fresh text node from `node`, trimming its raw text when the parent
 * tag is one of `trimNodes`.
 *
 * @param node text node whose `rawText` is copied
 * @param parentType tag name of the enclosing element; compared
 *   case-insensitively. A falsy value means "do not trim".
 * @returns a new `TextNode`; the input node is left untouched
 */
const cleanText = (node: TextNode, parentType: string): TextNode => {
  const shouldTrim =
    Boolean(parentType) && trimNodes.includes(parentType.toLowerCase())

  if (shouldTrim) {
    return new TextNode(node.rawText.trim())
  }
  return new TextNode(node.rawText)
}
|
2021-02-12 13:06:29 +00:00
|
|
|
|
2021-02-14 15:40:36 +00:00
|
|
|
/**
 * Recursively rebuilds `node` as a new element tree with normalised content.
 *
 * - Attributes are re-serialised as `key="value"` pairs (values are written
 *   as-is, without escaping); spaces inside `href` values become `%20`.
 * - Element children of a `noChildren` tag are replaced by a text node holding
 *   their `innerText`; element children of any other tag are cleaned
 *   recursively.
 * - Text children are copied through `cleanText`.
 * - Children that are neither elements nor text nodes are not copied.
 *
 * @param node element to copy; it is not modified
 * @returns the rebuilt element
 */
const deepClean = (node: HTMLElement): HTMLElement => {
  const serialised: string[] = []
  for (const [name, raw] of Object.entries(node.attributes)) {
    const value = name === 'href' && raw ? raw.replace(/ /g, '%20') : raw
    serialised.push(`${name}="${value}"`)
  }

  const result = new HTMLElement(
    node.tagName,
    {},
    serialised.join(' '),
    node.parentNode
  )

  // The tag of `node` decides how element children are handled, so the check
  // is the same for every child and can be evaluated once.
  const flattenChildren =
    Boolean(node.tagName) && noChildren.includes(node.tagName.toLowerCase())

  for (const child of node.childNodes) {
    if (child instanceof HTMLElement) {
      const replacement = flattenChildren
        ? cleanText(new TextNode(child.innerText), node.tagName)
        : deepClean(child)
      result.childNodes.push(replacement)
    } else if (child instanceof TextNode) {
      result.childNodes.push(cleanText(child, node.tagName))
    }
  }

  return result
}
|
2021-02-11 12:02:53 +00:00
|
|
|
|
2021-10-06 19:40:33 +00:00
|
|
|
/**
 * Prepares raw HTML for Markdown conversion by removing wrapper tags and
 * moving whitespace out of the tags listed in `trimNodes`.
 *
 * Steps:
 * 1. Every `<span …>`/`</span>` and `<div …>`/`</div>` tag is removed (the
 *    content between them is kept).
 * 2. Every space character is replaced by `&nbsp;`.
 * 3. For each tag in `trimNodes`, a space or `&nbsp;` directly after the
 *    attribute-less opening tag is moved in front of it, and one directly
 *    before the closing tag is moved behind it (as a plain space).
 *
 * Step 3 is repeated until the string no longer changes, so whitespace nested
 * inside any number of tags ends up fully outside of them. The loop always
 * terminates: a moved space only ever travels left across opening tags or
 * right across closing tags, and there are finitely many of both.
 *
 * @param html HTML fragment; defaults to the empty string
 * @returns the rewritten HTML string
 */
const rearrangeWhitespace = (html = ''): string => {
  let content = html
    .replace(/<span[^>]*>/gm, '')
    .split('</span>')
    .join('')
    .replace(/<div[^>]*>/gm, '')
    .split('</div>')
    .join('')
    .split(' ')
    .join('&nbsp;')

  let previous: string
  do {
    previous = content
    for (const trimNode of trimNodes) {
      const open = `<${trimNode}>`
      const close = `</${trimNode}>`

      content = content.split(`${open} `).join(` ${open}`)
      content = content.split(` ${close}`).join(`${close} `)
      content = content.split(`${open}&nbsp;`).join(` ${open}`)
      content = content.split(`&nbsp;${close}`).join(`${close} `)
    }
    // One pass can only move whitespace across an outer tag when that tag is
    // handled after the inner one, so keep going until nothing moves.
  } while (content !== previous)

  return content
}
|
|
|
|
|
2021-10-06 19:40:33 +00:00
|
|
|
/**
 * Decodes HTML entities in `html`, parses the result and returns the
 * serialised output of `deepClean` for the parsed tree.
 *
 * @param html HTML fragment; defaults to the empty string
 * @returns the cleaned markup (`outerHTML` of the rebuilt root)
 */
export const clean = (html = ''): string => {
  const decoded = decode(html)
  const root = parse(decoded)
  return deepClean(root).outerHTML
}
|
|
|
|
|
2021-02-11 13:30:42 +00:00
|
|
|
/**
 * Shape of the argument received by the override callbacks in `overides`.
 */
interface Node {
  /** Not read in this file; presumably the tag name — confirm against h2m. */
  name: string
  /** Attribute map; the overrides read `href`, `src` and `title` from it. */
  attrs: Record<string, string>
  /** Not read in this file; presumably set for content inside `<pre>` — confirm against h2m. */
  isInPreNode: boolean
  /** Text the overrides place inside the Markdown they generate. */
  md: string
}
|
|
|
|
// Value passed to h2m as its `converter` option (see `toMarkdown`).
const converter = 'MarkdownExtra'

// Creates the callback for an ATX heading of the given level:
// `level` hash characters, a space, the node text and a trailing newline.
const heading =
  (level: number) =>
  ({ md }: Node): string =>
    `${'#'.repeat(level)} ${md}\n`

// Per-tag Markdown renderers handed to h2m. The identifier is passed by
// shorthand (`{ overides, converter }`), so it doubles as the option key —
// renaming it changes the key h2m receives.
const overides = {
  // Link: text in brackets, target taken from `href`.
  a: ({ md, attrs }: Node) => `[${md}](${attrs.href})`,
  // Image: `title` (or nothing) as the bracket text, `src` as the target.
  img: ({ attrs }: Node) => `![${attrs.title || ''}](${attrs.src})`,
  // Emphasis.
  i: ({ md }: Node) => `*${md}*`,
  // Strong emphasis.
  b: ({ md }: Node) => `**${md}**`,
  h1: heading(1),
  h2: heading(2),
  h3: heading(3),
  h4: heading(4),
  h5: heading(5),
  h6: heading(6),
}
|
|
|
|
|
2021-10-06 19:40:33 +00:00
|
|
|
/**
 * Converts an HTML fragment to Markdown.
 *
 * Pipeline: `rearrangeWhitespace` → `clean` → `h2m` (using the `overides`
 * callbacks and the `converter` setting) → `htmlDecode` on the resulting
 * Markdown.
 *
 * @param html HTML fragment to convert
 * @returns the Markdown text, or `''` when `html` is missing or empty
 */
export const toMarkdown = (html?: string): string => {
  // Nothing to convert: skip the pipeline for '' and undefined (and for a
  // null slipping in from untyped callers, which would otherwise throw).
  if (!html) return ''

  const rearranged = rearrangeWhitespace(html)
  const trimmed = clean(rearranged)
  const markdown = h2m(trimmed, { overides, converter })

  return htmlDecode(markdown)
}
|