/**
 * Extracts line groups from a PDF document. It merges words into lines and
 * also tries to group data into columns, in order to detect tables.
 */
import * as pdfJs from "pdfjs-dist";
import { PDFDocumentProxy, PageViewport } from "pdfjs-dist";
import { TextItem, TextMarkedContent } from "pdfjs-dist/types/src/display/api";

import {
	MergedLine,
	NEXT_WORD_BREAK_THRESHOLD,
	PageMargins,
	ParsedPage,
	WeightedNumber,
	getPageMargins,
	weightedMedian
} from "./pdf-utils-common";

/**
 * Walks every page of the document and folds consecutive text items into
 * merged lines.
 *
 * A pending group is closed (and turned into a line) when the current item is
 * flagged with an end of line, or when the horizontal gap to the next item is
 * larger than `item.height * NEXT_WORD_BREAK_THRESHOLD`. Whatever is still
 * pending when a page ends is flushed as well, so a group never spans pages.
 *
 * @param document The loaded PDF document
 * @returns The merged lines of all pages, the margins returned by
 * `getPageMargins`, and the number of pages that were read
 */
export async function getLineGroupsFromDocument(document: PDFDocumentProxy) {
	const pages = await getPagesFromDocument(document);
	const pageMargins = getPageMargins(pages);
	const lineGroups: MergedLine[] = [];

	// Turns a group of items into a line and stores it, unless the group was empty
	const flush = (items: TextItem[], pageIdx: number) => {
		const line = mergeTextItemsIntoLine(items, pageMargins, pageIdx);
		if (line) {
			lineGroups.push(line);
		}
	};

	pages.forEach((page, pageIdx) => {
		const { contents } = page;
		let pending: TextItem[] = [];

		contents.forEach((item, itemIdx) => {
			if (!item || !("str" in item)) {
				return;
			}

			const nextItem = contents[itemIdx + 1];
			const gap = getDistanceFromNextItem(item, nextItem);

			// Footnotes or super/subscripts sit right before taller text at the
			// start of a group and contain digits; those are dropped entirely
			const isFootnoteMark =
				nextItem !== undefined &&
				gap !== null &&
				pending.length === 0 &&
				!item.hasEOL &&
				nextItem.height > item.height &&
				gap < item.height &&
				/\d+/.test(item.str);
			if (isFootnoteMark) {
				return;
			}

			// A next item that is too far to the right hints at a table cell,
			// so the group is closed there just like at an end of line
			const isTooFarFromNext = gap !== null && gap > item.height * NEXT_WORD_BREAK_THRESHOLD;
			if (!item.hasEOL && !isTooFarFromNext) {
				pending.push(item);
				return;
			}

			flush([...pending, item], pageIdx);
			pending = [];
		});

		// Whatever is left over on this page forms the last group
		flush(pending, pageIdx);
	});

	return { lineGroups, pageMargins, numPages: pages.length };
}

/**
 * PDFJs by default doesn't calculate transforms properly from the page viewport,
 * so we mutate each kept item's transform to make it work.
 *
 * The items are filtered along the way:
 * - entries without a `str` property are dropped
 * - items whose transform has a non-zero second component are considered
 *   rotated and are dropped (with a console warning)
 * - whitespace-only or zero-height items are folded into the previously kept
 *   item as a single trailing space; if there is no previous item yet, they
 *   are kept like any other item
 *
 * Note that the kept items are the same objects that were passed in: their
 * `transform`, `str` and `hasEOL` may be modified in place.
 *
 * @param textItems The raw items of a page's text content
 * @param viewPort The viewport whose transform is applied to every kept item
 * @returns The kept text items, in their original order
 */
function fixAndFilterTextItemTransform(
	textItems: (TextItem | TextMarkedContent)[],
	viewPort: PageViewport
) {
	const filteredItems: TextItem[] = [];

	for (const item of textItems) {
		if (!item || !("str" in item)) {
			continue;
		}

		const [, rotation] = item.transform;

		// Discard all rotated text and items
		if (rotation !== 0) {
			// eslint-disable-next-line no-console
			console.warn("Discarding rotated text", item.str);
			continue;
		}

		// Skip whitespace chars and merge them into the previous item
		if (item.str.trim().length === 0 || item.height === 0) {
			const previousItem = filteredItems.at(-1);

			if (previousItem) {
				previousItem.str += " ";
				// Keep a line break that was already recorded on the previous item:
				// overwriting it with the whitespace item's flag would lose the
				// break and glue two lines together later on
				previousItem.hasEOL = previousItem.hasEOL || item.hasEOL;
				continue;
			}
		}

		// Update the item transform with the correct, viewport-aware one
		item.transform = pdfJs.Util.transform(viewPort.transform, item.transform) as number[];

		filteredItems.push(item);
	}

	return filteredItems;
}

/**
 * Loads every page of the document, one after the other, together with its
 * scale-1 viewport and its text items (already filtered and re-transformed
 * by `fixAndFilterTextItemTransform`).
 *
 * @param document The loaded PDF document
 * @returns One parsed entry per page, in page order
 */
async function getPagesFromDocument(document: PDFDocumentProxy) {
	const pageCount = document.numPages;
	const parsedPages: ParsedPage[] = [];

	// PDF page numbers are 1-based
	let pageNumber = 1;
	while (pageNumber <= pageCount) {
		const page = await document.getPage(pageNumber);
		const viewport = page.getViewport({ scale: 1 });
		const textContent = await page.getTextContent();
		const contents = fixAndFilterTextItemTransform(textContent.items, viewport);

		parsedPages.push({ page, viewport, contents });
		pageNumber += 1;
	}

	return parsedPages;
}

/**
 * Collapses a group of text items into a single line: the texts are
 * concatenated (and trimmed) and the layout values come from
 * `getMedianTextItemProperties`. The line additionally carries an
 * `absoluteTop`, which offsets the line's top by `pageNumber` page heights.
 *
 * @param textContents The items that make up the line
 * @param pageMargins The margins used to compute the layout values
 * @param pageNumber The index of the page the items belong to
 * @returns The merged line, or `null` when there are no items
 */
function mergeTextItemsIntoLine(
	textContents: TextItem[],
	pageMargins: PageMargins,
	pageNumber: number
): MergedLine | null {
	if (!textContents.length) {
		return null;
	}

	const fragments = textContents.map(({ str }) => str);
	const layout = getMedianTextItemProperties(textContents, pageMargins);
	const pageOffset = pageNumber * pageMargins.height;

	const line: MergedLine = {
		str: fragments.join("").trim(),
		hasEOL: true,
		absoluteTop: layout.top + pageOffset,
		transform: [],
		pageNumber,
		...layout
	};

	return line;
}

/**
 * Computes the layout properties of a line from the items it is made of.
 *
 * Left and right are the extremes over all items, while top and height are
 * weighted medians (weighted by text length), so that a few super/subscript
 * items do not skew the line. The font name is the one covering the most
 * characters, and the text direction is taken from the first item.
 *
 * @param textContents The items of the line; must not be empty
 * @param pageMargins The margins the positions are made relative to, and
 * whose width is used to quantize the horizontal extent
 * @returns The line's position, size, font, direction and grid columns
 * @throws Error when `textContents` is empty
 */
function getMedianTextItemProperties(textContents: TextItem[], pageMargins: PageMargins) {
	const [firstItem] = textContents;

	if (!firstItem) {
		throw new Error("Empty text contents");
	}

	// Left and right values are always going to be min/max based because
	// a left element in a line is the only left element
	//
	// But top and bottom can be different because a line can have
	// subscript or superscript items
	let minLeft = Infinity;
	let maxRight = -Infinity;
	const heights: WeightedNumber[] = [];
	const tops: WeightedNumber[] = [];
	const fontNameCounts: Record<string, number> = {};

	textContents.forEach(item => {
		const [, , , , itemLeft, itemTop] = item.transform as number[];

		// Only skip items whose coordinates are missing or not a number.
		// A coordinate of exactly 0 is a valid position and must not be
		// dropped, which a plain truthiness check would do.
		if (
			itemLeft === undefined ||
			itemTop === undefined ||
			Number.isNaN(itemLeft) ||
			Number.isNaN(itemTop)
		) {
			return;
		}

		tops.push({
			number: itemTop,
			weight: item.str.length
		});

		if (itemLeft < minLeft) {
			minLeft = itemLeft;
		}

		const right = itemLeft + item.width;
		if (right > maxRight) {
			maxRight = right;
		}

		heights.push({
			number: Math.floor(item.height),
			weight: item.str.length
		});

		fontNameCounts[item.fontName] = (fontNameCounts[item.fontName] ?? 0) + item.str.length;
	});

	// Some lines come with a mix of fonts, we find the most common one
	// (the first item's font wins ties and is the fallback)
	const medianFontName = Object.entries(fontNameCounts).reduce((acc, [fontName, count]) => {
		if (count > (fontNameCounts[acc] ?? 0)) {
			return fontName;
		}
		return acc;
	}, firstItem.fontName);

	const top = weightedMedian(tops) - pageMargins.top;
	const left = minLeft - pageMargins.left;
	const right = maxRight - pageMargins.left;

	// Due to various font sizes in a same line, we find the most common height
	const medianHeight = weightedMedian(heights);

	return {
		top,
		left,
		width: Math.floor(maxRight - minLeft),
		height: medianHeight,
		lineHeight: medianHeight,
		fontName: medianFontName,
		dir: firstItem.dir,

		// Quantize the xGridStart and xGridEnd to 10
		xGridStart: Math.floor((left / pageMargins.width) * 10),
		xGridEnd: Math.ceil((right / pageMargins.width) * 10)
	};
}

/**
 * Measures the horizontal gap between the end of `item` and the start of
 * `nextItem`, using the x/y translation stored at indexes 4 and 5 of their
 * transforms.
 *
 * The gap is only computed when both items are on the same line, meaning
 * their vertical positions differ by less than the height of `item` in
 * either direction.
 *
 * @param item The current item
 * @param nextItem The item that follows it, if any
 * @returns The gap (negative when the items overlap or the next item starts
 * further left), or `null` when there is no next item or it is on another line
 */
function getDistanceFromNextItem(item: TextItem, nextItem?: TextItem): number | null {
	if (!nextItem) {
		return null;
	}

	// We do not want to compute items in a different line. The difference is
	// taken as an absolute value so that an item far above the current one is
	// rejected exactly like an item far below it.
	const topDifference = Math.abs(nextItem.transform[5] - item.transform[5]);
	if (topDifference < item.height) {
		return nextItem.transform[4] - item.transform[4] - item.width;
	}

	return null;
}
