import * as pdfJs from "pdfjs-dist";
import { PDFDocumentProxy } from "pdfjs-dist";
import Worker from "pdfjs-dist/build/pdf.worker.mjs?worker&url";

import { readFile } from "../read-file";

import { createIndex } from "./create-index";
import { getLineGroupsFromDocument } from "./get-line-groups";
import { mergeLines } from "./merge-lines";
import { removeHeaderandFooterItems, removeJunkContentFromTextContent } from "./pdf-cleanup-utils";

// The PDF parser needs a worker source, so register the imported `Worker` value
// as pdf.js's global `workerSrc`. This is skipped when `import.meta.env.MODE`
// is "test", i.e. in the unit test environment.
if (import.meta.env.MODE !== "test") {
	pdfJs.GlobalWorkerOptions.workerSrc = Worker;
}

/**
 * Opens the PDF file carried by a custom event.
 *
 * The file found at `e.detail.value` is read with `readFile`, and the result is
 * handed to pdf.js as the document `data`, with `isEvalSupported` set to `false`.
 *
 * @param e - Custom event whose `detail.value` is the file to load.
 * @returns The promise of the pdf.js loading task, which resolves to the document proxy.
 */
export async function getPDFDocumentFromEvent(
	e: CustomEvent<{ value: File }>
): Promise<pdfJs.PDFDocumentProxy> {
	const { value: pdfFile } = e.detail;
	const data = await readFile(pdfFile);

	const loadingTask = pdfJs.getDocument({ isEvalSupported: false, data });

	return loadingTask.promise;
}

/**
 * Resolves once the runtime reports an idle period, so that long-running work
 * can hand control back between steps.
 *
 * `requestIdleCallback` is not implemented everywhere, so when it is missing
 * this falls back to a zero-delay timeout instead of throwing a `ReferenceError`.
 */
function waitForIdle(): Promise<void> {
	return new Promise<void>(resolve => {
		if (typeof requestIdleCallback === "function") {
			requestIdleCallback(() => resolve());
		} else {
			setTimeout(() => resolve(), 0);
		}
	});
}

/**
 * Runs the text-extraction pipeline over a loaded PDF document.
 *
 * Steps, in order:
 * 1. `getLineGroupsFromDocument` collects the line groups, page margins and page count.
 * 2. `removeHeaderandFooterItems` filters the line groups using those margins and the page count.
 * 3. `mergeLines` merges the remaining line groups into paragraphs.
 * 4. `createIndex` is applied to the paragraphs and its result is passed through
 *    `removeJunkContentFromTextContent`.
 *
 * @param document - The pdf.js document proxy to process.
 * @returns The value produced by `removeJunkContentFromTextContent` for the created index.
 */
export async function getParagraphsFromDocument(document: PDFDocumentProxy) {
	const { lineGroups, pageMargins, numPages } = await getLineGroupsFromDocument(document);

	const contentLineGroups = removeHeaderandFooterItems(lineGroups, pageMargins, numPages);

	// Document processing can take a while, so pause until the runtime is idle before the next step.
	await waitForIdle();

	const paragraphs = mergeLines(contentLineGroups);

	// Pause again before building the index, for the same reason.
	await waitForIdle();

	return removeJunkContentFromTextContent(createIndex(paragraphs));
}
