/**
 * Class for parsing HTML into a form suitable for ProseMirror slices.
 */

class HTML2ProsemirrorParser {

  /** Matches one line break: "\r\n", a lone "\r" or a lone "\n". */
  private static readonly LINE_BREAK = /\r\n?|\n/;

  /**
   * Tags that get a single trailing <br> appended before their children are lifted to the parent.
   * DIV is currently excluded because Word adds div for comments and page breaks, which cannot be filtered properly.
   */
  private static readonly TAGS_ENDING_WITH_BREAK: ReadonlySet<string> = new Set<string>(["P", "H1", "H2", "H3", "UL", "LI", "OL"]);

  private readonly domParser: DOMParser;

  /** Formatting tags (upper-case node names) that are kept in the tree, i.e. never unwrapped. */
  private readonly tagsToIgnore = new Set<string>(["STRONG", "I", "EM", "B", "U", "BR"]);

  /** FIFO work queue of elements that still have to be unwrapped; only meaningful while parseHTML is running. */
  private dequeue: Element[] = [];

  public constructor() {
    this.domParser = new DOMParser();
  }

  /**
   * Parses the given HTML and flattens its body in a way suitable for further processing by prosemirror:
   * unwanted nodes are removed and every element that is not a formatting tag is replaced by its children
   * (with line breaks inserted where block elements ended).
   * @param html The HTML to parse.
   * @returns The parsed document (its body is modified in place), or 'null' if the parser did not yield a document.
   */
  public parseHTML(html: string): Document | null {
    const doc = this.domParser.parseFromString(html, "text/html");
    if (!doc) {
      return null;
    }

    const body = doc.body;
    this.filterUnwantedNodes(body);

    // Always start from a fresh queue so that leftovers of a previous (failed) run can never be processed
    // against this document. 'children' only contains element nodes, so no node type check is needed.
    this.dequeue = Array.from(body.children);
    try {
      // process as long as there are unwrapped nodes
      while (this.dequeue.length > 0) {
        this.processSingleElement(this.dequeue.shift(), doc);
      }
    } finally {
      // do not keep references to DOM nodes after the run, not even when unwrapping threw
      this.dequeue = [];
    }

    // Is required for copy&paste from internal library -> Leads to crash when copy&paste from a word file (with line breaks)
    // EDIT: Currently, there is no need to process \n or \r\n in text nodes. The 'crash' when copying from a Word file could not be
    // reproduced. Still, this code might be important, so it should not be deleted for the moment. If this bug occurs, describe how to
    // reproduce it and try to comment in this line.
    // this.postProcessTextnodes(doc.body, doc);

    return doc;
  }

  /**
   * Replaces every direct text child of the given node that contains a line break by a sequence of
   * text nodes and <br> elements. Only direct children are handled, the method does not recurse.
   * @param node The node whose direct children are processed
   * @param doc Document used for creating the new nodes
   */
  private postProcessTextnodes(node: Node, doc: Document): void {
    // Iterate over a snapshot: 'childNodes' is a live list and is modified inside the loop.
    for (const child of Array.from(node.childNodes)) {
      if (!this.isTextNode(child) || !this.hasNewLine(child)) {
        continue;
      }
      const text = child.textContent;
      if (text === null) {
        continue;
      }
      const replacements = this.insertBreaks(text.split(HTML2ProsemirrorParser.LINE_BREAK), doc);
      replacements.forEach(replacement => node.insertBefore(replacement, child));
      node.removeChild(child);
    }
  }

  /**
   * Creates the nodes representing the given text parts with a <br> element between two consecutive parts.
   * Empty parts produce no text node, but the break following them is still emitted.
   * @param splittedText The text parts, already split at line breaks
   * @param doc Document used for creating the new nodes
   * @returns The newly created (not yet attached) nodes in document order
   */
  private insertBreaks(splittedText: string[], doc: Document): Array<Node> {
    const newNodes: Array<Node> = [];
    const lastIdx = splittedText.length - 1;
    splittedText.forEach((block, textPartIdx) => {
      if (block.length > 0) {
        newNodes.push(doc.createTextNode(block));
      }
      if (textPartIdx !== lastIdx) {
        newNodes.push(doc.createElement("br"));
      }
    });
    return newNodes;
  }

  /**
   * Checks whether the text content of the node contains a line break.
   * Detects the same separators postProcessTextnodes splits at ("\r\n", "\r" and "\n").
   */
  private hasNewLine(node: Node): boolean {
    const text = node.textContent;
    return text ? HTML2ProsemirrorParser.LINE_BREAK.test(text) : false;
  }

  private isTextNode(node: Node): boolean {
    return node.nodeType === Node.TEXT_NODE;
  }


  /**
   * Takes a single element and unwraps its children to its parent.
   * Children are not unwrapped if they are HTML Formatting Elements.
   * NOTE(review): as a consequence, elements nested inside a formatting element are only queued
   * when that formatting element itself gets processed - confirm that this is intended.
   * @param element The element to unwrap.
   * @param doc Document where the elements are contained within.
   */
  private processSingleElement(element: Element | undefined, doc: Document): void {
    if (element === undefined) {
      return;
    }
    // Queue the element children first: after unwrapping they are no longer reachable through 'element'.
    for (const child of element.children) {
      if (!this.tagsToIgnore.has(child.nodeName)) {
        this.dequeue.push(child);
      }
    }
    // Remember the parent now, unwrapping detaches the element from it.
    const parent = element.parentNode;
    const parentChanged = this.unwrapChildrenToDirectParentConsinderingLinebreaks(element, doc);
    // if parent changed we must possibly go for another unwrap
    // add it again to the dequeue
    if (parentChanged && parent && parent !== doc.body) {
      this.dequeue.push(parent as Element);
    }
  }

  /**
   * Recursively filters unwanted nodes from the given node.
   * Which nodes are unwanted is decided by isUnwantedNode.
   * @param node The node to filter
   */
  private filterUnwantedNodes(node: Node): void {
    // Snapshot of the live child list, because children are removed while iterating.
    for (const child of Array.from(node.childNodes)) {
      if (this.isUnwantedNode(child)) {
        node.removeChild(child);
      } else {
        this.filterUnwantedNodes(child);
      }
    }
  }

  /**
   * Checks if a node is unwanted, i.e. must be filtered
   * @param node The node to check
   * @returns true if the node is unwanted, false otherwise
   */
  private isUnwantedNode(node: Node): boolean {
    // Empty nodes like <span> </span> are often used in web (e.g. Wikipedia), we should not delete them.
    // If there are empty unwanted nodes, e.g. from word files, this check should be adapted for different use cases.
    // if (node.nodeType === Node.TEXT_NODE && this.isEmptyText(node)) {
      // return true;
    // }

    /*
     * Nodes that are
     * 4 -> CDATASection
     * 5 -> EntityReference
     * 6 -> Entity
     * 7 -> ProcessingInstruction
     * 8 -> Comment
     * are unwanted
     */
    if (node.nodeType >= Node.CDATA_SECTION_NODE && node.nodeType <= Node.COMMENT_NODE) {
      return true;
    }

    // filter some microsoft word specific html elements
    if (node.nodeType !== Node.ELEMENT_NODE) {
      return false;
    }
    const classList = (node as HTMLElement).classList;
    return classList.contains("MsoCommentReference") || classList.contains("MsoCommentText");
  }

  /**
   * Checks whether the given node is a text node or has a text node anywhere below it.
   * The content of the text node is not inspected, i.e. whitespace-only text counts as text.
   * @param node The root of the subtree to check, may be 'null'
   * @returns true if a text node was found, false otherwise (also for 'null')
   */
  private subtreeHasText(node: Node | null): boolean {
    if (!node) {
      return false;
    }
    if (node.nodeType === Node.TEXT_NODE) {
      return true;
    }
    for (const child of node.childNodes) {
      if (this.subtreeHasText(child)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Move all children of element to the parent of element inserting linebreaks where necessary
   * @param element The element to lift child nodes up to the parent
   * @param doc document for creating new linebreak nodes
   * @return 'true' if parent of element is changed, 'false' otherwise
   */
  private unwrapChildrenToDirectParentConsinderingLinebreaks(element: Element, doc: Document): boolean {
    if (!element || !element.parentNode) {
      return false;
    }

    // convert paragraphs (<p> tags) and other tags to breaks
    if (HTML2ProsemirrorParser.TAGS_ENDING_WITH_BREAK.has(element.nodeName)) {
      element.appendChild(doc.createElement("br"));
    } else if (this.hasNodeALogicalSibling(element)
      && this.subtreeHasText(element)
      && this.subtreeHasText(element.nextSibling)) {
      // add two breaks between logical blocks
      element.appendChild(doc.createElement("br"));
      element.appendChild(doc.createElement("br"));
    }

    // try to unroot children of element if the element is none of the tags to ignore,
    // return true if parent has changed
    if (this.tagsToIgnore.has(element.nodeName)) {
      return false;
    }
    return this.unrootChild(element);
  }

  /**
   * Move all children of the given node to its parent node and remove the (then empty) node itself
   * @param unrootee Node from which children will be unrooted
   * @return 'true' if unrootee's parent node was modified, 'false' if unrootee has no parent
   */
  private unrootChild(unrootee: Element): boolean {
    const parentNode = unrootee.parentNode;
    if (!parentNode) {
      return false;
    }
    // Inserting each child before the unrootee keeps the original order of the children.
    while (unrootee.firstChild) {
      parentNode.insertBefore(unrootee.firstChild, unrootee);
    }
    // The unrootee is always removed, so the parent is always modified at this point.
    parentNode.removeChild(unrootee);
    return true;
  }

  /**
   * Checks whether the node and its directly following sibling are both elements
   * carrying the attribute logicalBlock="true".
   */
  private hasNodeALogicalSibling(node: Node): boolean {
    if (node.nodeType !== Node.ELEMENT_NODE || node.nextSibling?.nodeType !== Node.ELEMENT_NODE) {
      return false;
    }
    const currentNode = node as HTMLElement;
    const sibling = node.nextSibling as HTMLElement;
    return (currentNode.getAttribute("logicalBlock") === "true" && sibling.getAttribute("logicalBlock") === "true");
  }

  /**
   * Checks whether the node is a text node consisting of nothing but non-breaking spaces, tabs and linebreaks.
   * Regular spaces are not stripped, so a text node containing a plain space is not considered empty.
   */
  private isEmptyText(node: Node): boolean {
    if (node.nodeType !== Node.TEXT_NODE) {
      return false;
    }

    // filter non-breaking space, tabs and linebreaks.
    // (MS word generates a lot of spans only containing a single &nbsp;)
    const TRIM_WHITESPACES = /([\u00A0])|(\t)|(\r?\n)/g;
    const trimmedText = node.textContent?.replace(TRIM_WHITESPACES, '');
    return trimmedText?.length === 0;
  }
}

export default HTML2ProsemirrorParser;
