fix: failing html parsing
This commit is contained in:
parent
a867b116c8
commit
2a2259a260
|
@ -1,7 +1,7 @@
|
||||||
import * as h2m from 'h2m'
|
import h2m from 'h2m'
|
||||||
import { htmlDecode } from 'js-htmlencode'
|
|
||||||
import { decode } from 'he'
|
import { decode } from 'he'
|
||||||
import { parse, HTMLElement, TextNode } from 'node-html-parser'
|
import { htmlDecode } from 'js-htmlencode'
|
||||||
|
import { HTMLElement, parse, TextNode } from 'node-html-parser'
|
||||||
|
|
||||||
const noChildren = ['strong', 'b', 'em', 'i', 'u', 's']
|
const noChildren = ['strong', 'b', 'em', 'i', 'u', 's']
|
||||||
const trimNodes = [...noChildren, 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'a']
|
const trimNodes = [...noChildren, 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'a']
|
||||||
|
@ -40,45 +40,48 @@ const deepClean = (node: HTMLElement): HTMLElement => {
|
||||||
return cleaned
|
return cleaned
|
||||||
}
|
}
|
||||||
|
|
||||||
const rearrangeWhitespace = (html: string = ''): string => {
|
const rearrangeWhitespace = (html = ''): string => {
|
||||||
let content = html
|
let content = html
|
||||||
.replace(/<span[^>]*>/gm, '')
|
.replace(/<span[^>]*>/gm, '')
|
||||||
.split('</span>').join('')
|
.split('</span>')
|
||||||
.replace(/<div[^>]*>/gm, '')
|
.join('')
|
||||||
.split('</div>').join('')
|
.replace(/<div[^>]*>/gm, '')
|
||||||
.split(' ').join('&nbsp;')
|
.split('</div>')
|
||||||
|
.join('')
|
||||||
|
.split(' ')
|
||||||
|
.join('&nbsp;')
|
||||||
|
|
||||||
// FIXME: Make a loop that doesn't break linting
|
// FIXME: Make a loop that doesn't break linting
|
||||||
trimNodes.forEach((trimNode) => {
|
trimNodes.forEach((trimNode) => {
|
||||||
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
|
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
|
||||||
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
|
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
|
||||||
content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`)
|
content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`)
|
||||||
content = content.split(`&nbsp;</${trimNode}>`).join(`</${trimNode}> `)
|
content = content.split(`&nbsp;</${trimNode}>`).join(`</${trimNode}> `)
|
||||||
})
|
})
|
||||||
|
|
||||||
trimNodes.forEach((trimNode) => {
|
trimNodes.forEach((trimNode) => {
|
||||||
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
|
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
|
||||||
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
|
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
|
||||||
content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`)
|
content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`)
|
||||||
content = content.split(`&nbsp;</${trimNode}>`).join(`</${trimNode}> `)
|
content = content.split(`&nbsp;</${trimNode}>`).join(`</${trimNode}> `)
|
||||||
})
|
})
|
||||||
trimNodes.forEach((trimNode) => {
|
trimNodes.forEach((trimNode) => {
|
||||||
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
|
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
|
||||||
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
|
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
|
||||||
content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`)
|
content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`)
|
||||||
content = content.split(`&nbsp;</${trimNode}>`).join(`</${trimNode}> `)
|
content = content.split(`&nbsp;</${trimNode}>`).join(`</${trimNode}> `)
|
||||||
})
|
})
|
||||||
trimNodes.forEach((trimNode) => {
|
trimNodes.forEach((trimNode) => {
|
||||||
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
|
content = content.split(`<${trimNode}> `).join(` <${trimNode}>`)
|
||||||
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
|
content = content.split(` </${trimNode}>`).join(`</${trimNode}> `)
|
||||||
content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`)
|
content = content.split(`<${trimNode}>&nbsp;`).join(` <${trimNode}>`)
|
||||||
content = content.split(`&nbsp;</${trimNode}>`).join(`</${trimNode}> `)
|
content = content.split(`&nbsp;</${trimNode}>`).join(`</${trimNode}> `)
|
||||||
})
|
})
|
||||||
|
|
||||||
return content
|
return content
|
||||||
}
|
}
|
||||||
|
|
||||||
export const clean = (html: string = ''): string =>
|
export const clean = (html = ''): string =>
|
||||||
deepClean(parse(decode(html))).outerHTML
|
deepClean(parse(decode(html))).outerHTML
|
||||||
|
|
||||||
interface Node {
|
interface Node {
|
||||||
|
@ -93,15 +96,15 @@ const overides = {
|
||||||
img: (node: Node) => `![${node.attrs.title || ''}](${node.attrs.src})`,
|
img: (node: Node) => `![${node.attrs.title || ''}](${node.attrs.src})`,
|
||||||
i: (node: Node) => `*${node.md}*`,
|
i: (node: Node) => `*${node.md}*`,
|
||||||
b: (node: Node) => `**${node.md}**`,
|
b: (node: Node) => `**${node.md}**`,
|
||||||
'h1': (node: Node) => `# ${node.md}\n`,
|
h1: (node: Node) => `# ${node.md}\n`,
|
||||||
'h2': (node: Node) => `## ${node.md}\n`,
|
h2: (node: Node) => `## ${node.md}\n`,
|
||||||
'h3': (node: Node) => `### ${node.md}\n`,
|
h3: (node: Node) => `### ${node.md}\n`,
|
||||||
'h4': (node: Node) => `#### ${node.md}\n`,
|
h4: (node: Node) => `#### ${node.md}\n`,
|
||||||
'h5': (node: Node) => `##### ${node.md}\n`,
|
h5: (node: Node) => `##### ${node.md}\n`,
|
||||||
'h6': (node: Node) => `###### ${node.md}\n`,
|
h6: (node: Node) => `###### ${node.md}\n`,
|
||||||
}
|
}
|
||||||
|
|
||||||
export const toMarkdown = (html: string): string => {
|
export const toMarkdown = (html?: string): string => {
|
||||||
const rearranged = rearrangeWhitespace(html)
|
const rearranged = rearrangeWhitespace(html)
|
||||||
const trimmed = clean(rearranged)
|
const trimmed = clean(rearranged)
|
||||||
const markdown = h2m(trimmed, { overides, converter })
|
const markdown = h2m(trimmed, { overides, converter })
|
||||||
|
|
Loading…
Reference in New Issue