import h2m from 'h2m'
import { decode } from 'he'
import { htmlDecode } from 'js-htmlencode'
import { HTMLElement, parse, TextNode } from 'node-html-parser'

/**
 * Inline formatting tags. When `deepClean` visits one of these, any element
 * children are flattened to their `innerText` instead of being kept as nested
 * elements.
 */
const noChildren = ['strong', 'b', 'em', 'i', 'u', 's']

/**
 * Tags whose direct text children are trimmed by `cleanText`, and around which
 * `rearrangeWhitespace` relocates leading whitespace.
 */
const trimNodes = [...noChildren, 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'a']

/**
 * How many times the whitespace-relocation pass runs. The original code
 * spelled the same pass out four times in a row; the count is preserved here.
 */
const WHITESPACE_PASSES = 4

/**
 * Builds a fresh text node from `node`, trimming its raw text when the parent
 * tag is listed in `trimNodes`.
 *
 * @param node - Text node to copy.
 * @param parentType - Tag name of the parent element (matched case-insensitively);
 *   a falsy value disables trimming.
 * @returns A new `TextNode` holding the (possibly trimmed) raw text.
 */
const cleanText = (node: TextNode, parentType: string): TextNode => {
  const shouldTrim =
    parentType && trimNodes.includes(parentType.toLowerCase())
  return new TextNode(shouldTrim ? node.rawText.trim() : node.rawText)
}

/**
 * Recursively rebuilds `node` as a new element tree.
 *
 * - Attributes are re-serialised as `key="value"`; spaces inside a non-empty
 *   `href` are replaced with `%20`.
 * - Text children go through `cleanText`.
 * - Element children are recursed into, unless `node` itself is one of the
 *   `noChildren` tags, in which case they are replaced by their `innerText`.
 * - Child nodes that are neither `HTMLElement` nor `TextNode` are dropped.
 *
 * @param node - Element to copy.
 * @returns The rebuilt element.
 */
const deepClean = (node: HTMLElement): HTMLElement => {
  const attributes = Object.entries(node.attributes)
    .map(([key, val]) => {
      // Use a local instead of reassigning the callback parameter.
      const value = key === 'href' && val ? val.replace(/ /g, '%20') : val
      return `${key}="${value}"`
    })
    .join(' ')

  const cleaned = new HTMLElement(node.tagName, {}, attributes, node.parentNode)

  // Depends only on `node`, so it is evaluated once rather than per child.
  const flattenElements =
    Boolean(node.tagName) && noChildren.includes(node.tagName.toLowerCase())

  node.childNodes.forEach((childNode) => {
    if (childNode instanceof HTMLElement) {
      cleaned.childNodes.push(
        flattenElements
          ? cleanText(new TextNode(childNode.innerText), node.tagName)
          : deepClean(childNode)
      )
    } else if (childNode instanceof TextNode) {
      cleaned.childNodes.push(cleanText(childNode, node.tagName))
    }
  })

  return cleaned
}

/**
 * Runs one whitespace-relocation pass over `content`, applying the same four
 * split/join substitutions for every tag in `trimNodes`, in order.
 *
 * NOTE(review): as written, the second substitution (space -> space) is a
 * no-op and the fourth turns every `&nbsp;` into a plain space. These look
 * like template literals that lost a closing-tag fragment; confirm the
 * intended literals against version control before relying on this.
 */
const relocateEdgeWhitespace = (content: string): string =>
  trimNodes.reduce(
    (acc, trimNode) =>
      acc
        .split(`<${trimNode}> `)
        .join(` <${trimNode}>`)
        .split(` `)
        .join(` `)
        .split(`<${trimNode}>&nbsp;`)
        .join(` <${trimNode}>`)
        .split(`&nbsp;`)
        .join(` `),
    content
  )

/**
 * Pre-processes raw HTML before parsing: applies a set of literal
 * replacements, then runs `relocateEdgeWhitespace` `WHITESPACE_PASSES` times.
 *
 * NOTE(review): the regex `/]*>/gm` matches every `>` (plus any `]`
 * characters directly before it) and `.split('').join('')` leaves the string
 * unchanged. Both look like literals whose tag names were stripped; they are
 * kept verbatim here and should be checked against the intended source.
 *
 * @param html - Raw HTML; defaults to the empty string.
 * @returns The rewritten HTML string.
 */
const rearrangeWhitespace = (html = ''): string => {
  const prepared = html
    .replace(/]*>/gm, '')
    .split('')
    .join('')
    .replace(/]*>/gm, '')
    .split('')
    .join('')
    .split(' ')
    .join('&nbsp;')

  // Fixed number of passes, expressed without a `for` statement so the loop
  // does not trip lint rules (this resolves the former FIXME).
  return Array.from({ length: WHITESPACE_PASSES }).reduce<string>(
    (content) => relocateEdgeWhitespace(content),
    prepared
  )
}

/**
 * Decodes HTML entities in `html`, parses it, rebuilds the tree with
 * `deepClean` and serialises the result.
 *
 * @param html - HTML to clean; defaults to the empty string.
 * @returns The `outerHTML` of the cleaned tree.
 */
export const clean = (html = ''): string =>
  deepClean(parse(decode(html))).outerHTML

/** Shape of the node object received by the converter callbacks in `overides`. */
interface Node {
  name: string
  attrs: { [key: string]: string }
  isInPreNode: boolean
  md: string
}

/** Converter name passed to `h2m`. */
const converter = 'MarkdownExtra'

/**
 * Per-tag Markdown renderers passed to `h2m`.
 * The identifier spelling (`overides`) is also the option key handed to `h2m`
 * via shorthand below, so it must not be renamed without checking h2m's API.
 */
const overides = {
  a: (node: Node) => `[${node.md}](${node.attrs.href})`,
  img: (node: Node) => `![${node.attrs.title || ''}](${node.attrs.src})`,
  i: (node: Node) => `*${node.md}*`,
  b: (node: Node) => `**${node.md}**`,
  h1: (node: Node) => `# ${node.md}\n`,
  h2: (node: Node) => `## ${node.md}\n`,
  h3: (node: Node) => `### ${node.md}\n`,
  h4: (node: Node) => `#### ${node.md}\n`,
  h5: (node: Node) => `##### ${node.md}\n`,
  h6: (node: Node) => `###### ${node.md}\n`,
}

/**
 * Converts an HTML fragment to Markdown: rearranges whitespace, cleans the
 * tree, converts with `h2m` and finally HTML-decodes the result.
 *
 * @param html - HTML to convert. `undefined` and the empty string both
 *   short-circuit to `''` without running the pipeline.
 * @returns The Markdown text.
 */
export const toMarkdown = (html?: string): string => {
  if (!html) return ''

  const rearranged = rearrangeWhitespace(html)
  const trimmed = clean(rearranged)
  const markdown = h2m(trimmed, { overides, converter })
  return htmlDecode(markdown)
}