feat: Ombyggd parsning av nyhetsbrev (#65)

* feat: 🎸 Ombyggd parsning av nyhetsbrev
This commit is contained in:
Johan Öbrink 2021-02-14 16:40:36 +01:00 committed by GitHub
parent 905b893ca7
commit a5dfb704f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 163 additions and 45 deletions

View File

@ -1,6 +1,6 @@
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
testEnvironment: 'jsdom',
transform: {
'.(ts|tsx)': 'ts-jest'
},

File diff suppressed because one or more lines are too long

View File

@ -1,37 +1,57 @@
import * as h2m from 'h2m'
import { htmlDecode } from 'js-htmlencode'
import { decode } from 'he'
import {
parse, HTMLElement, TextNode,
} from 'node-html-parser'
export const trim = (html: string = ''): string => {
const trimmed = html
.replace(/ /g, ' ')
.split('>')
.map((token) => token.trim())
.join('>')
.split('</')
.map((token) => token.trim())
.join('</')
let result = trimmed
const rxSpaces = /href="(.*)"/g
let matches = trimmed.match(rxSpaces)
if (matches) {
// eslint-disable-next-line no-restricted-syntax
for (const match of matches) {
result = result.replace(match, match.replace(/ /g, '%20'))
}
}
const rxSpacesSing = /href='(.*)'/g
matches = trimmed.match(rxSpacesSing)
if (matches) {
// eslint-disable-next-line no-restricted-syntax
for (const match of matches) {
result = result.replace(match, match.replace(/ /g, '%20'))
}
}
return result
// Tag names whose direct text children have surrounding whitespace trimmed:
// inline formatting, headings, list items and links.
const trimNodes = [
  'strong', 'b', 'em', 'i', 'u', 's',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'li', 'a',
]
/**
 * Creates a copy of a text node.
 *
 * When `parentType` (compared case-insensitively) is listed in `trimNodes`,
 * the copy holds the raw text with leading and trailing whitespace removed;
 * otherwise the raw text is copied unchanged.
 *
 * @param node Text node to copy.
 * @param parentType Tag name of the element that contains `node`.
 * @returns A new `TextNode` holding the (possibly trimmed) raw text.
 */
const cleanText = (node: TextNode, parentType: string): TextNode => {
  // A falsy parent tag name never matches, so the text is kept as-is.
  if (parentType && trimNodes.includes(parentType.toLowerCase())) {
    return new TextNode(node.rawText.trim())
  }
  return new TextNode(node.rawText)
}
/**
 * Recursively rebuilds `node` as a cleaned copy.
 *
 * Attributes are re-serialized as `key="value"` pairs. Spaces in `href`
 * values are replaced with `%20`, and double quotes inside any value are
 * escaped as `&quot;` so they cannot terminate the quoted attribute early.
 *
 * Element children are cleaned recursively and text children are passed
 * through `cleanText` together with this node's tag name. Children of any
 * other node type are not copied.
 *
 * @param node Element to copy.
 * @returns A new element with the same tag name, the re-serialized
 *          attributes and the cleaned children.
 */
const deepClean = (node: HTMLElement): HTMLElement => {
  const attributes = Object.entries(node.attributes)
    .map(([key, val]) => {
      const value = val || ''
      const encoded = key === 'href' ? value.replace(/ /g, '%20') : value
      // The value is wrapped in double quotes below, so embedded ones must be escaped.
      return `${key}="${encoded.replace(/"/g, '&quot;')}"`
    })
    .join(' ')
  const cleaned = new HTMLElement(node.tagName, {}, attributes, node.parentNode)
  node.childNodes.forEach((childNode) => {
    if (childNode instanceof HTMLElement) {
      cleaned.childNodes.push(deepClean(childNode))
    } else if (childNode instanceof TextNode) {
      cleaned.childNodes.push(cleanText(childNode, node.tagName))
    }
  })
  return cleaned
}
/**
 * Cleans an HTML string before it is converted further.
 *
 * The input is entity-decoded, parsed into a node tree, rebuilt through
 * `deepClean`, and serialized back to a string.
 *
 * @param html HTML to clean; defaults to an empty string.
 * @returns The outer HTML of the cleaned tree.
 */
export const clean = (html: string = ''): string => {
  const decoded = decode(html)
  const root = parse(decoded)
  return deepClean(root).outerHTML
}
interface Node {
name: string
attrs: { [key: string]: string }
@ -44,7 +64,7 @@ const overides = {
}
export const toMarkdown = (html: string): string => {
const trimmed = trim(html)
const trimmed = clean(html)
const markdown = h2m(trimmed, { overides, converter })
const decoded = htmlDecode(markdown)
return decoded

View File

@ -36,10 +36,13 @@
"typescript": "^4.1.3"
},
"dependencies": {
"@types/he": "^1.1.1",
"camelcase-keys": "^6.2.2",
"events": "^3.2.0",
"h2m": "^0.7.0",
"he": "^1.2.0",
"js-htmlencode": "^0.3.0",
"luxon": "^1.25.0"
"luxon": "^1.25.0",
"node-html-parser": "^2.1.0"
}
}

View File

@ -5,9 +5,14 @@
"declaration": true,
"outDir": "./dist",
"strict": true,
"moduleResolution": "node",
"allowSyntheticDefaultImports": true
},
"include": ["lib"],
"exclude": ["node_modules", "**/__tests__/*", "**/*.test.ts"]
}
"include": [
"lib"
],
"exclude": [
"node_modules",
"**/__tests__/*",
"**/*.test.ts"
]
}

View File

@ -556,6 +556,11 @@
dependencies:
"@types/node" "*"
"@types/he@^1.1.1":
version "1.1.1"
resolved "https://registry.yarnpkg.com/@types/he/-/he-1.1.1.tgz#19e14033c4ee8f1a702c74dcc6182664839ac2b7"
integrity sha512-jpzrsR1ns0n3kyWt92QfOUQhIuJGQ9+QGa7M62rO6toe98woQjnsnzjdMtsQXCdvjjmqjS2ZBCC7xKw0cdzU+Q==
"@types/istanbul-lib-coverage@*", "@types/istanbul-lib-coverage@^2.0.0", "@types/istanbul-lib-coverage@^2.0.1":
version "2.0.3"
resolved "https://registry.npmjs.org/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.3.tgz"
@ -2259,6 +2264,11 @@ has@^1.0.3:
dependencies:
function-bind "^1.1.1"
he@1.2.0, he@^1.2.0:
version "1.2.0"
resolved "https://registry.yarnpkg.com/he/-/he-1.2.0.tgz#84ae65fa7eafb165fddb61566ae14baf05664f0f"
integrity sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==
hosted-git-info@^2.1.4:
version "2.8.8"
resolved "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-2.8.8.tgz"
@ -3379,6 +3389,13 @@ node-fetch@^2.6.1:
resolved "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz"
integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw==
node-html-parser@^2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/node-html-parser/-/node-html-parser-2.1.0.tgz#36345804d743a5a1f672d4821a53f6b0e60629a9"
integrity sha512-kbCNfqjrwHAbG+mevL8aqjwVtF0Qv66XurWHoGLOc5G9rPR1L3k602jfeczAUUBldLNnCrdsDmO5G5nqAoMW+g==
dependencies:
he "1.2.0"
node-int64@^0.4.0:
version "0.4.0"
resolved "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz"