import { IndexedParagraph, MergedLine, PageMargins } from "./pdf-utils-common";

/**
 * Uses heuristics to remove header and footer items from the text content.
 *
 * An item is a removal candidate when it starts within the top 10% of the page
 * height or ends within the bottom 10%. Candidates are grouped by their
 * normalised text (see `normaliseString`), and a group is removed only when it
 * has strictly more members than `Math.ceil(numPages * 0.9)`.
 *
 * Note: `items` is modified in place and the same array instance is returned.
 *
 * @param items lines to clean; only `str`, `top` and `height` are read
 * @param pageMargins page geometry; only `height` is read
 * @param numPages page count used to scale the repetition threshold
 * @returns the `items` array with the detected header/footer lines removed
 */
export function removeHeaderandFooterItems(
	items: MergedLine[],
	pageMargins: PageMargins,
	numPages: number
) {
	const headerOrFooterThreshold = 0.9;

	// Anything starting above topThreshold or ending below bottomThreshold lies in
	// the outer 10% band at the top or bottom of the page
	const topThreshold = pageMargins.height * (1 - headerOrFooterThreshold);
	const bottomThreshold = pageMargins.height * headerOrFooterThreshold;

	// Group the items that fall in those bands by their normalised text
	const normalisedSets: Map<string, MergedLine[]> = new Map();
	items.forEach(item => {
		const isCandidate = item.top < topThreshold || item.top + item.height > bottomThreshold;
		if (!isCandidate) {
			return;
		}

		const normalisedStr = normaliseString(item.str);
		const group = normalisedSets.get(normalisedStr);

		if (group) {
			group.push(item);
		} else {
			normalisedSets.set(normalisedStr, [item]);
		}
	});

	// If an element repeats across multiple pages it will have a count proportional to the number of pages;
	// we use this heuristic to pick the items that are repeated across (almost) all pages.
	// NOTE(review): the comparison is strict, so for documents with fewer than 10 pages a text
	// that appears exactly once per page is kept — confirm this is the intended behaviour.
	const duplicateCountThreshold = Math.ceil(numPages * headerOrFooterThreshold);
	const itemsToRemove = new Set<MergedLine>();
	normalisedSets.forEach(group => {
		if (group.length > duplicateCountThreshold) {
			group.forEach(item => itemsToRemove.add(item));
		}
	});

	if (itemsToRemove.size === 0) {
		return items;
	}

	// Compact the array in place in a single pass instead of an indexOf + splice
	// per removed item, which was quadratic in the number of items.
	let keptCount = 0;
	items.forEach(item => {
		if (!itemsToRemove.has(item)) {
			items[keptCount] = item;
			keptCount += 1;
		}
	});
	items.length = keptCount;

	return items;
}

/**
 * Drops paragraphs that look like junk rather than real text.
 *
 * A paragraph is kept only when its `str` has no heavily repeated content
 * (per `hasRepeatedContent`) and contains at least one ASCII letter.
 *
 * @param items paragraphs to filter; the input array is not modified
 * @returns a new array holding only the paragraphs that passed both checks
 */
export function removeJunkContentFromTextContent(items: IndexedParagraph[]): IndexedParagraph[] {
	const containsLetter = /[a-z]/i;

	return items.filter(paragraph => {
		if (hasRepeatedContent(paragraph.str)) {
			return false;
		}
		return containsLetter.test(paragraph.str);
	});
}

// Reports whether the string contains some chunk of text repeated at least
// five times back to back on a single line (e.g. "aaaaa", "......" or "------").
// Callers treat such strings as filler rather than meaningful content.
function hasRepeatedContent(str: string): boolean {
	// One captured chunk followed by four or more further copies of itself
	const repeatedRun = /(.+)\1{4,}/;
	return str.search(repeatedRun) !== -1;
}

// Normalise text contents so they can be indexed to check for duplication.
// Page counters such as "Page 2 of 200" or "Page 1" all normalise to "page",
// so the same header/footer text taken from different pages gets the same key.
function normaliseString(str: string) {
	return (
		str
			// Locale-independent lower-casing, so the key does not vary with the host locale
			.toLowerCase()
			// only allow alphanumeric characters
			.replace(/[^a-z0-9]/g, "")
			// Remove a trailing page counter: either a bare number ("page12") or the
			// "<n> of <m>" form, which reads "2of200" once the spaces have been stripped
			.replace(/(?:\d+of)?\d+$/, "")
	);
}
