import cheerio from 'cheerio';
import { useEffect, useState } from 'react';

import { sanitizeWhitespace } from '../util/html';

/**
 * One entry of an article summary.
 *
 * Every field is optional. The parsers in this file set `header` together with
 * at most one of `subheader` or `sentence`; the paragraph-only parser sets
 * `sentence` alone.
 */
export interface SummarySection {
  /** Text of the heading element that opens this section. */
  header?: string;
  /** Text of a lower-level heading found directly after the `header` element. */
  subheader?: string;
  /** Paragraph text up to, but not including, its first period. */
  sentence?: string;
}

/**
 * Everything the `useSummary` hook extracts from a single article's HTML.
 */
export interface SummaryContent {
  /** Text of `div.preview`, passed through `sanitizeWhitespace`. */
  preview?: string;
  /** `src` of the first `img` inside `.header.graphic-header`, when present. */
  headerImage?: string;
  /** Text of the first `h1` inside `.post-header`. */
  headline?: string;
  /** Text of the first `h3` inside `.post-header`. */
  subheader?: string;
  /**
   * `src` of the first `img` in each `.image-wrapper` of the last `div.post`;
   * wrappers without a usable `src` are skipped.
   */
  images: string[];
  /** Per-heading (or per-paragraph) summary entries, in document order. */
  sections: SummarySection[];
  /**
   * Markup returned by `.html()` for the last `div.post`, or for `body`
   * (falling back to `div`) when the summary is skipped. Empty string when
   * nothing matched.
   */
  body: string;
}

/**
 * Builds one summary section per `h1` found inside the last `div.post`.
 *
 * The element directly after each headline decides what accompanies the
 * header: an `h2`, `h3` or `h4` (tried in that order) supplies `subheader`;
 * failing that, a `p` supplies `sentence`, cut at its first period. A headline
 * followed by none of these is still emitted with only `header` set, and a
 * warning is logged.
 *
 * @param $ - Cheerio root of the loaded article markup.
 * @returns Sections in document order, one per `h1`.
 */
const parseWithHeadlines = ($: cheerio.Root): SummarySection[] => {
  const results: SummarySection[] = [];

  $('div.post')
    .last()
    .find('h1')
    .each((_index, element) => {
      const heading = $(element);
      const entry: SummarySection = { header: heading.text() };

      // Look for a lower-level heading right after the headline, biggest first.
      for (const tag of ['h2', 'h3', 'h4']) {
        const following = heading.next(tag);
        if (following.length > 0) {
          entry.subheader = following.first().text();
          results.push(entry);
          return;
        }
      }

      // No heading follows: use the opening sentence of the next paragraph.
      const paragraph = heading.next('p');
      if (paragraph.length > 0) {
        const [opening] = (paragraph.first().text() || '').split('.');
        entry.sentence = opening;
      } else {
        // Header-only sections are still worth showing.
        console.warn('Unable to parse content section from headline');
      }
      results.push(entry);
    });

  return results;
};

/**
 * Builds one summary section per `h2` found inside the last `div.post`.
 *
 * The element directly after each `h2` decides what accompanies the header:
 * an `h3` or `h4` (tried in that order) supplies `subheader`; failing that, a
 * `p` supplies `sentence`, cut at its first period. An `h2` followed by none
 * of these is still emitted with only `header` set, and a warning is logged.
 *
 * @param $ - Cheerio root of the loaded article markup.
 * @returns Sections in document order, one per `h2`.
 */
const parseWithSubheaders = ($: cheerio.Root): SummarySection[] => {
  const results: SummarySection[] = [];

  $('div.post')
    .last()
    .find('h2')
    .each((_index, element) => {
      const heading = $(element);
      const entry: SummarySection = { header: heading.text() };

      // Look for a lower-level heading right after the subheader, biggest first.
      for (const tag of ['h3', 'h4']) {
        const following = heading.next(tag);
        if (following.length > 0) {
          entry.subheader = following.first().text();
          results.push(entry);
          return;
        }
      }

      // No heading follows: use the opening sentence of the next paragraph.
      const paragraph = heading.next('p');
      if (paragraph.length > 0) {
        const [opening] = (paragraph.first().text() || '').split('.');
        entry.sentence = opening;
      } else {
        // Header-only sections are still worth showing.
        console.warn('Unable to parse content section from subheader');
      }
      results.push(entry);
    });

  return results;
};

/**
 * Builds one summary section per `h3` found inside the last `div.post`.
 *
 * The element directly after each `h3` decides what accompanies the header:
 * an `h4` supplies `subheader`; failing that, a `p` supplies `sentence`, cut
 * at its first period. An `h3` followed by neither is still emitted with only
 * `header` set, and a warning is logged.
 *
 * @param $ - Cheerio root of the loaded article markup.
 * @returns Sections in document order, one per `h3`.
 */
const parseWithSectionHeaders = ($: cheerio.Root): SummarySection[] => {
  const sections: SummarySection[] = [];
  const sectionHeaders = $('div.post').last().find('h3');
  sectionHeaders.each((idx, sectionHeader) => {
    const current = $(sectionHeader);
    const section: SummarySection = { header: current.text() };

    // Maybe we have h4's. Query the sibling of THIS header only: asking the
    // whole `h3` collection would return an `h4` that follows any header and
    // attach it to every section.
    const smallHeaders = current.next('h4');
    if (smallHeaders.length) {
      section.subheader = smallHeaders.first().text();
      sections.push(section);
      return;
    }

    // Grab the first sentence of the next paragraph.
    const paragraphs = current.next('p');
    if (paragraphs.length) {
      section.sentence = (paragraphs.first().text() || '').split('.')[0];
      sections.push(section);
      return;
    }

    // Still want to show something.
    console.warn('Unable to parse content section from section header');
    sections.push(section);
  });

  return sections;
};

/**
 * Builds one summary section per `h4` found inside the last `div.post`.
 *
 * When a `p` directly follows the `h4`, its text up to the first period
 * becomes `sentence`. Otherwise the section carries only `header` and a
 * warning is logged.
 *
 * @param $ - Cheerio root of the loaded article markup.
 * @returns Sections in document order, one per `h4`.
 */
const parseWithSmallHeaders = ($: cheerio.Root): SummarySection[] => {
  const results: SummarySection[] = [];
  const post = $('div.post').last();

  post.find('h4').each((_index, element) => {
    const heading = $(element);
    const entry: SummarySection = { header: heading.text() };
    const paragraph = heading.next('p');

    if (paragraph.length > 0) {
      // Keep just the opening sentence of the paragraph.
      const [opening] = (paragraph.first().text() || '').split('.');
      entry.sentence = opening;
    } else {
      // Header-only sections are still worth showing.
      console.warn('Unable to parse content section from small header');
    }
    results.push(entry);
  });

  return results;
};

/**
 * Builds one summary section per `p` found inside the last `div.post`, for
 * articles that have no headings to anchor on.
 *
 * Each section carries only `sentence`: the paragraph text up to its first
 * period.
 *
 * @param $ - Cheerio root of the loaded article markup.
 * @returns Sections in document order, one per paragraph.
 */
const parseWithParagraphs = ($: cheerio.Root): SummarySection[] =>
  $('div.post')
    .last()
    .find('p')
    .toArray()
    .map((element): SummarySection => {
      const [opening] = ($(element).text() || '').split('.');
      return { sentence: opening };
    });

/**
 * React hook that parses an article's HTML into `SummaryContent`.
 *
 * Parsing runs inside an effect, so the hook returns `undefined` until the
 * effect has run, and re-parses whenever `article` or `skipSummary` changes.
 *
 * @param article - Raw article HTML; passed through `sanitizeWhitespace` before loading.
 * @param skipSummary - When true, `body` is taken from the whole document
 *   (`body`, falling back to `div`) instead of the last `div.post`.
 * @returns The parsed content, or `undefined` before the first parse completes.
 */
const useSummary = (article: string, skipSummary: boolean): SummaryContent | undefined => {
  const [content, setContent] = useState<SummaryContent | undefined>();

  useEffect(() => {
    const $ = cheerio.load(sanitizeWhitespace(article));
    const post = $('div.post').last();

    // Header and image metadata that sits outside the post body.
    const preview = sanitizeWhitespace($('div.preview').text() || '');
    const headerImage = $('.header.graphic-header').find('img').first().attr('src');
    const postHeader = $('.post-header');
    const headline = postHeader.find('h1').first().text();
    const subheader = postHeader.find('h3').first().text();

    // One image URL per wrapper in the post; wrappers without a src are skipped.
    const images: string[] = [];
    for (const wrapper of post.find('.image-wrapper').toArray()) {
      const src = $(wrapper).find('img').first().attr('src');
      if (src) {
        images.push(src);
      }
    }

    // The highest heading level present in the post selects the parser.
    const strategies: Array<[string, string, ($: cheerio.Root) => SummarySection[]]> = [
      ['h1', 'Article has headlines', parseWithHeadlines],
      ['h2', 'Article has subheaders', parseWithSubheaders],
      ['h3', 'Article has section headers', parseWithSectionHeaders],
      ['h4', 'Article has small headers', parseWithSmallHeaders],
    ];
    const chosen = strategies.find(([tag]) => post.find(tag).length > 0);

    let sections: SummarySection[];
    if (chosen) {
      const [, message, parse] = chosen;
      console.debug(message);
      sections = parse($);
    } else {
      console.debug('Falling back to paragraph parsing');
      sections = parseWithParagraphs($);
    }

    const body = skipSummary ? $('body').html() || $('div').html() || '' : post.html() || '';

    // TODO: Support other newsletters.
    setContent({
      preview,
      headerImage,
      headline,
      subheader,
      images,
      sections,
      body,
    });
  }, [article, skipSummary]);

  return content;
};

export default useSummary;
