/* eslint-disable max-statements */
import nlp from "compromise/three";

import {
	ListStackType,
	ListTypes,
	MergedLine,
	MergedParagraph,
	ProcessedTextContent,
	WeightedNumber,
	compareNumberedListItems,
	deduplicateAndMergeItems,
	getContextualListMatch,
	getListMatches,
	weightedMedian
} from "./pdf-utils-common";

/**
 * Groups consecutive lines into paragraphs / list items and prepares their indexing information.
 *
 * A group is flushed (merged into one entry) whenever:
 *  - the incoming line opens something other than a plain paragraph, judged by the first
 *    result of `getListMatches`, or
 *  - one of the layout heuristics below says the line can't continue the previous one
 *    (vertical gap, font/size change, short previous line, trailing colon, outdent).
 *
 * A line that follows the previous one onto the next page with the same font, the same height
 * and an equal-or-greater left offset is always treated as a continuation.
 *
 * @param lines - Lines to merge. Adjacent entries are compared with each other, so they are
 *   expected to be in reading order.
 * @returns One processed entry per merged group, after post-processing by `fixIndexes`.
 */
export function mergeLines(lines: MergedLine[]): ProcessedTextContent[] {
	const currentGroup: MergedLine[] = [];
	const mergedLineGroups: ProcessedTextContent[] = [];

	// This stack tells us the current list context we are in
	const listStack: Array<ListStackType> = [];

	// Merges the current group into a single paragraph, calculates its index and empties the group
	const mergeLinesInternal = (): void => {
		if (currentGroup.length === 0) {
			return;
		}

		const paragraph = mergeLinesIntoParagraph(currentGroup, mergedLineGroups);
		updateListStack(paragraph, listStack);
		mergedLineGroups.push(processParagraph(paragraph, listStack));
		currentGroup.length = 0;
	};

	// Walk the lines and decide, for each one, whether it extends the current group or starts a new one.
	// The bound is strictly `< lines.length`: the previous `<=` read one slot past the end on every call.
	for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
		const line = lines[lineIdx];
		const previousLine = lines[lineIdx - 1];

		// Indexed access may still yield undefined (holes), so skip anything that isn't a line
		if (!line) {
			continue;
		}

		const listMatches = getListMatches(line.str);

		// We are entering a new list, merge previous items
		if (listMatches[0]?.listType !== "paragraph") {
			mergeLinesInternal();
		}

		if (previousLine && currentGroup.length > 0) {
			// Page breaks are problematic, we can have massive layout changes between them
			// so if a piece of text between two pages has the same font, we assume it was a continuation
			const hasPageChangedWithSameFont =
				previousLine.fontName === line.fontName &&
				line.height === previousLine.height &&
				line.left >= previousLine.left &&
				line.pageNumber - previousLine.pageNumber === 1;

			if (hasPageChangedWithSameFont) {
				currentGroup.push(line);
				continue;
			}

			// Split when there is more than 1 line of space between the lines
			const topDifference = Math.floor(
				Math.abs(previousLine.absoluteTop + previousLine.height - line.absoluteTop)
			);
			const hasLineHeightIncreased = topDifference > line.height;

			// Split when we are in a paragraph and the font name has changed (Prominent in CIS documents)
			// Or when the height is bigger than the previous line
			const isFontSizeSignificantlyDifferent = Math.abs(line.height - previousLine.height) > 2;
			const isFontAndHeightDifferent =
				line.fontName !== previousLine.fontName && line.height !== previousLine.height;

			// If the previous line ended early, then it means that the current line is not a part of the paragraph
			const previousLineEndedEarly = previousLine.xGridEnd <= 7 && previousLine.xGridStart === 0;

			// If a line ends with a colon, then it's a new paragraph
			const previousLineEndsWithAColon = previousLine.str.endsWith(":");

			// If the current line starts further left than the previous one, then it's a new paragraph
			const indentsMismatch = previousLine.xGridStart > line.xGridStart;

			if (
				previousLineEndsWithAColon ||
				hasLineHeightIncreased ||
				isFontAndHeightDifferent ||
				isFontSizeSignificantlyDifferent ||
				previousLineEndedEarly ||
				indentsMismatch
			) {
				mergeLinesInternal();
			}
		}
		currentGroup.push(line);
	}

	// Handle the last line group
	mergeLinesInternal();

	return fixIndexes(mergedLineGroups);
}

/**
 * PDF parsing isn't perfect, and PDF formats are not standardized, so the indexes need a
 * post-processing pass. This matters most for numbered lists that carry a depth ("d.d" format).
 *
 * For every numbered entry whose depth is greater than 1 but whose index has a different
 * number of segments, the index is rebuilt as `depth` copies of the entry's list type. The
 * entries that directly follow it are re-parented under the rebuilt index for as long as
 * they had a longer index than the entry originally had and their `indexJoin` started with
 * the entry's original `indexJoin`.
 *
 * The array and its entries are mutated in place.
 *
 * @param contents - Processed entries in document order.
 * @returns The same `contents` array.
 */
function fixIndexes(contents: ProcessedTextContent[]): ProcessedTextContent[] {
	let cursor = 0;

	while (cursor < contents.length) {
		const entry = contents[cursor]!;
		const { listMatch } = entry;

		const needsRepair =
			listMatch.listType === "numbered" &&
			listMatch.depth > 1 &&
			entry.index.length !== listMatch.depth;

		if (!needsRepair) {
			cursor++;
			continue;
		}

		// Remember what the entry looked like before the repair; followers are matched against that
		const staleJoin = entry.indexJoin;
		const staleLength = entry.index.length;
		const repairedIndex: string[] = new Array(listMatch.depth).fill(listMatch.listType);

		// Fix the entry itself
		entry.index = repairedIndex;
		entry.indexJoin = repairedIndex.join(".");

		// Re-parent every directly following entry that was nested under the stale index.
		// The cursor ends on the last re-parented entry (or stays put), so the outer loop
		// looks at that entry once more before moving on.
		for (
			let follower = contents[cursor + 1];
			follower && follower.index.length > staleLength && follower.indexJoin.startsWith(staleJoin);
			follower = contents[cursor + 1]
		) {
			follower.index = [...repairedIndex, follower.listMatch.listType];
			follower.indexJoin = follower.index.join(".");
			cursor++;
		}
	}

	return contents;
}

/**
 * Brings the list stack in line with the paragraph that was just merged.
 *
 * A stack entry is built from the paragraph's list match plus its position (`left`,
 * `absoluteTop`) and `lineHeight`. Entries are popped off the stack for as long as
 * `hasListContextDecreased` reports that the new entry has left their context; an entry with
 * the same `key` that remains on top is popped as well. The new entry is then pushed.
 *
 * @param line - The merged paragraph to register.
 * @param listStack - Current list context; mutated in place.
 */
function updateListStack(line: MergedParagraph, listStack: ListStackType[]) {
	const incoming: ListStackType = {
		...line.listMatch,
		height: line.lineHeight,
		left: line.left,
		top: line.absoluteTop
	};

	if (listStack.length > 0) {
		// Unwind until we reach a context the incoming entry can live in
		while (hasListContextDecreased(incoming, listStack)) {
			listStack.pop();
		}

		// An entry with the same key on top of the stack is replaced by the incoming one
		const survivor = listStack.at(-1);
		if (survivor?.key === incoming.key) {
			listStack.pop();
		}
	}

	listStack.push(incoming);
}

/**
 * Collapses a group of lines into one paragraph.
 *
 * The line strings are joined with single spaces and the list match is recomputed on the
 * joined text via `getContextualListMatch`. The paragraph's `str` is the string carried by
 * that match, and its geometry comes from `getMedianMergedTextContentProperties`.
 *
 * @param mergeTextContents - The lines of the group; must not be empty.
 * @param mergedLineGroups - Entries merged so far, passed on to `getContextualListMatch`.
 * @returns The merged paragraph.
 * @throws Error when `mergeTextContents` is empty.
 */
function mergeLinesIntoParagraph(
	mergeTextContents: MergedLine[],
	mergedLineGroups: ProcessedTextContent[]
): MergedParagraph {
	const head = mergeTextContents[0];
	const tail = mergeTextContents[mergeTextContents.length - 1];

	if (!head || !tail) {
		throw new Error("Empty text contents");
	}

	const fragments: string[] = [];
	for (const content of mergeTextContents) {
		fragments.push(content.str);
	}

	const listMatch = getContextualListMatch(fragments.join(" "), mergedLineGroups);
	const layout = getMedianMergedTextContentProperties(mergeTextContents);

	return {
		str: listMatch.str,
		absoluteTop: head.absoluteTop,
		transform: [],
		hasEOL: false,
		listMatch,
		...layout
	};
}

/**
 * Turns a merged paragraph into its final processed form using the current list stack.
 *
 * @param line - The merged paragraph.
 * @param listStack - Current list context; read only.
 * @returns The paragraph extended with:
 *  - `id`: the stack's delimiters passed through `deduplicateAndMergeItems`,
 *  - `index` / `indexJoin`: the stack's list types, as an array and joined with ".",
 *  - `parsedStr`: the result of running `nlp` on the list match's string.
 */
function processParagraph(line: MergedParagraph, listStack: ListStackType[]): ProcessedTextContent {
	const index = listStack.map(({ listType }) => listType);
	const delimiters = listStack.map(({ delimiter }) => delimiter);

	const id = deduplicateAndMergeItems(delimiters);
	const indexJoin = index.join(".");
	const parsedStr = nlp(line.listMatch.str);

	return { ...line, id, index, indexJoin, parsedStr };
}

/**
 * Derives the shared layout properties of a group of lines.
 *
 * `top`, `left`, `width` and `height` describe the bounding box around all lines.
 * `lineHeight` is the median of the line heights, weighted by each line's string length.
 * `fontName` and `dir` are taken from the first line, `pageNumber` from the last one.
 *
 * @param textContents - The lines of the group; must not be empty.
 * @returns The layout properties described above.
 * @throws Error when `textContents` is empty.
 */
function getMedianMergedTextContentProperties(textContents: MergedLine[]) {
	const head = textContents[0];
	const tail = textContents[textContents.length - 1];

	if (!head || !tail) {
		throw new Error("Empty text contents");
	}

	const weightedHeights: WeightedNumber[] = [];
	let boxTop = Infinity;
	let boxLeft = Infinity;
	let boxBottom = -Infinity;
	let boxRight = -Infinity;

	for (const entry of textContents) {
		const bottomEdge = entry.top + entry.height;
		const rightEdge = entry.left + entry.width;

		// Grow the bounding box so that it contains this line
		boxTop = entry.top < boxTop ? entry.top : boxTop;
		boxLeft = entry.left < boxLeft ? entry.left : boxLeft;
		boxBottom = bottomEdge > boxBottom ? bottomEdge : boxBottom;
		boxRight = rightEdge > boxRight ? rightEdge : boxRight;

		// Longer lines get more say in the median line height
		weightedHeights.push({ number: entry.height, weight: entry.str.length });
	}

	return {
		top: boxTop,
		left: boxLeft,
		width: boxRight - boxLeft,
		pageNumber: tail.pageNumber,
		//@todo - height will be wrong for items which span multiple pages
		height: boxBottom - boxTop,
		fontName: head.fontName,
		lineHeight: weightedMedian(weightedHeights),
		dir: head.dir
	};
}

// List types that hasListContextDecreased treats as enumerated parents
const enumeratedLists = new Set<ListTypes>(["alphabet", "alphabet-numbers", "numbered", "roman"]);

// List types that hasListContextDecreased allows as children of an enumerated list
const nonEnumeratedLists = new Set<ListTypes>(["paragraph", "unordered"]);

/**
 * Decides whether `currentItem` has left the list context represented by the top of `stack`,
 * i.e. whether the caller should pop the stack (updateListStack pops for as long as this
 * returns true).
 *
 * This is a pretty complex function. We can't rely on the document having a proper list
 * structure or indentation, so we make some assumptions and define our own rules that a
 * document is expected to adhere to. The rules are evaluated in order and the first one
 * that reaches a verdict wins:
 *
 *  1. Empty stack: never decreased.
 *  2. The bottom of the stack is a paragraph: always decreased.
 *  3. Indentation: a clear move to the left, more than one line height further down,
 *     is a decrease; a clear move to the right under the same vertical condition is not.
 *  4. Numbered items are compared against the closest numbered item on the stack.
 *  5. Non-enumerated items directly under an enumerated item are not a decrease.
 *  6. A clearly taller item is a decrease; a clearly shorter one is not.
 *  7. Same key and same left offset as an ancestor (top of stack excluded) is a decrease.
 *  8. Otherwise: not decreased.
 *
 * Notable examples of documents that violate a clean structure are RBI TRM and CIS benchmarks
 *
 * @param currentItem - The stack entry for the item being added.
 * @param stack - The current list stack; read only.
 * @returns true when the top of the stack should be popped for `currentItem`.
 */
// eslint-disable-next-line complexity
function hasListContextDecreased(currentItem: ListStackType, stack: ListStackType[]) {
	const lastItem = stack.at(-1);
	if (!lastItem) {
		return false;
	}

	// A paragraph at level 1 can't have a list item at level 2 (it doesn't make sense).
	// This looks at the bottom of the stack, so it keeps returning true while that
	// paragraph is still on the stack, whatever the current item is.
	if (stack.at(0)?.listType === "paragraph") {
		return true;
	}

	// Check if indentation has changed significantly.
	// leftDifference > 0 means the current item starts further left than the top of the stack.
	// Both verdicts also require the current item to start more than its own height below it.
	const leftDifference = lastItem.left - currentItem.left;
	const topDifference = currentItem.top - lastItem.top;
	const { height } = currentItem;
	if (leftDifference > 2 && topDifference > height) {
		return true;
	} else if (leftDifference < -2 && topDifference > height) {
		return false;
	}

	// For numbered lists, we check if there is an item with a lower depth.
	// When the stack holds no numbered item at all, the rules below decide instead.
	if (currentItem.listType === "numbered") {
		const lastNumberedItem = stack.findLast(item => item.listType === "numbered");

		if (lastNumberedItem) {
			// If the items are in the same depth check if they actually increment or not
			if (lastNumberedItem.depth === currentItem.depth) {
				// NOTE(review): presumably a result <= 0 means the stacked delimiter does not come
				// after the current one (a sibling) - confirm against compareNumberedListItems
				const sizeComparison = compareNumberedListItems(
					lastNumberedItem.delimiter,
					currentItem.delimiter
				);
				return sizeComparison <= 0;
				// If the current item is of a lower depth, then de-indent
			} else if (lastNumberedItem.depth > currentItem.depth) {
				return true;
				// If we have a deeply nested numbered list, but the last item is not a numbered list then de-indent
			} else if (lastItem.listType !== "numbered") {
				return true;
			} else {
				// Deeper than the numbered item on top of the stack: a nested child
				return false;
			}
		}
	}

	// We allow non enumerated lists to be a child of enumerated lists
	if (nonEnumeratedLists.has(currentItem.listType) && enumeratedLists.has(lastItem.listType)) {
		return false;
	}

	// Items which are much bigger or smaller than previous denote a new context:
	// taller by 2 or more is a decrease, shorter by 2 or more is not
	const heightDifference = currentItem.height - lastItem.height;
	if (heightDifference >= 2) {
		return true;
	} else if (heightDifference <= -2) {
		return false;
	}

	// We can't have the exact same list key inside the same list as a deeply nested child.
	// Only ancestors are checked here; the top of the stack is excluded by the slice.
	if (
		stack.slice(0, -1).some(item => item.key === currentItem.key && item.left === currentItem.left)
	) {
		return true;
	}

	// In all other cases we don't de-indent
	return false;
}
