import { Readability } from '@mozilla/readability'
import * as turndownPluginGfm from 'turndown-plugin-gfm'
import TurndownService from 'turndown'

/**
 * Default HTML parser: builds a document from markup with the DOMParser
 * found on the global `window`. Declared with `let` because `setDomParser`
 * reassigns this binding.
 *
 * @param {string} html - Markup to parse.
 * @returns {Document} Result of `parseFromString(html, 'text/html')`.
 */
let getParsedDomFromHtml = (html) =>
  new window.DOMParser().parseFromString(html, 'text/html')

/**
 * Replace the module-level HTML parser used by this file.
 *
 * @param {Function} fn - Called with an HTML string wherever the module
 *   parses markup; its return value is used as the parsed document.
 */
export function setDomParser(fn) {
  getParsedDomFromHtml = fn
}

/**
 * Extract a title and readable text from an HTML string.
 *
 * Readability is tried first on a cleaned copy of the document. When that
 * leaves the text or the title empty, BOTH values are recomputed by a
 * heuristic fallback that picks a main container and converts its markup to
 * Markdown with Turndown (GFM plugin).
 *
 * @param {string} html - Raw HTML markup.
 * @returns {{textContent?: (string|null), title?: string}} An empty object
 *   when `html` is not a string; otherwise the extracted text and title,
 *   either of which may be empty when nothing usable was found.
 */
export function extractArticleFromHTML(html) {
  if (html?.constructor !== String) {
    return {}
  }

  const useFirefoxReader = true

  // Marker appended to paragraphs and headings so their boundaries can be
  // turned back into blank lines in the plain text returned by Readability.
  // Is it unique enough? :D
  const lineBreakPlaceholder = ':::s-l=br-ph:::'

  // Tags stripped from the document before it is handed to Readability.
  const nonContentSelectors = [
    'script',
    'area',
    'command',
    'embed',
    'input',
    'keygen',
    'link',
    'param',
    'source',
    'track',
    'video',
    'audio',
    'svg',
    'iframe',
    'style',
  ]

  // Removes every element matching `nonContentSelectors`; returns the same dom.
  function removeNonContentElements(dom) {
    dom
      .querySelectorAll(nonContentSelectors.join(', '))
      .forEach((el) => el.remove())
    return dom
  }

  // Parses the markup, tags block boundaries with the placeholder and strips
  // non-content elements.
  function preparedDom(markup) {
    const dom = getParsedDomFromHtml(markup)
    dom.querySelectorAll('p, h1, h2, h3, h4, h5, h6').forEach((el) => {
      el.innerHTML += lineBreakPlaceholder
    })
    return removeNonContentElements(dom)
  }

  // Text of an element (or undefined for a missing element). Falls back to
  // `textContent` because an injected DOM implementation may not provide
  // `innerText`.
  function textOf(el) {
    return el?.innerText ?? el?.textContent
  }

  let textContent = ''
  let title = ''

  if (useFirefoxReader) {
    try {
      // Constructed inside the try so that a failure while preparing the
      // document is logged and the fallback below still gets a chance.
      const parsed = new Readability(preparedDom(html)).parse()
      if (parsed) {
        textContent = parsed.textContent
        title = parsed.title
        if (textContent) {
          textContent = textContent
            .split(`\n`)
            .map((line) => line.trim())
            .join(`\n`)
            .split(lineBreakPlaceholder)
            .join(`\n\n`)
            .replace(/\n\n+/g, `\n\n`)
            .trim()
        }
      }
    } catch (e) {
      console.error(e)
    }
  }

  if (!textContent || !title) {
    const dom = getParsedDomFromHtml(html)

    // Drop containers that are empty or hold only a short fragment ending in
    // a word character (at most 25 characters after trimming).
    dom.querySelectorAll('div, span').forEach((el) => {
      const rawText = textOf(el)
      if (!rawText) {
        el.remove()
        return
      }
      const trimmed = rawText.trim()
      if (trimmed.length <= 25 && /\w$/.test(trimmed)) {
        el.remove()
      }
    })

    // Drop links without visible text.
    dom.querySelectorAll('a').forEach((el) => {
      if (!textOf(el)?.trim()) {
        el.remove()
      }
    })

    dom.querySelectorAll('img, picture, video').forEach((el) => {
      el.remove()
    })

    // Selectors are tried in priority order, not document order.
    const container =
      dom.querySelector('main') ||
      dom.querySelector('article') ||
      dom.querySelector('#main') ||
      dom.querySelector('#content') ||
      dom.querySelector('body')

    const rawTitle =
      textOf(dom.querySelector('h1')) ||
      textOf(dom.querySelector('title')) ||
      textOf(dom.querySelector('h2, h3, h4, h5, h6'))

    title = rawTitle?.trim()
    textContent = container?.innerHTML
      ? new TurndownService()
          .use(turndownPluginGfm.gfm)
          .turndown(container.innerHTML)
      : null
  }

  return {
    textContent,
    title,
  }
}
