import { IndexedParagraph, ProcessedTextContent, SentenceType } from "./pdf-utils-common";

/**
 * Builds a flat index of sentences from the processed lines of a document.
 *
 * Every truthy entry of `lines` yields one item per sentence of its `parsedStr`
 * (or a single item for the whole line when `splitSentences` is false). Each item
 * copies the fields of its source line and additionally carries:
 * - `id`: an identifier that is unique within the returned array,
 * - `displayId`: the id of the source line exactly as it was given (may be
 *   duplicated across items, or missing),
 * - `str`: the trimmed text of the sentence,
 * - `type`: `SentenceType.ACTIVITY` when `verbs()` reports at least one match for
 *   the sentence, `SentenceType.INFORMATION` otherwise.
 *
 * Id scheme: the first sentence of a line uses the line id, the n-th following
 * sentence uses `<line id>.<n>`. Lines without an id use their position in `lines`
 * as the line id. When the resulting id has already been handed out, a
 * `.<counter>` suffix (counter shared across the whole call) is appended until the
 * id is free.
 *
 * @param lines - Processed lines of the document; falsy entries are skipped.
 * @param splitSentences - When true (default), iterate over `parsedStr` with its own
 * `forEach`; when false, treat the whole `parsedStr` as a single sentence.
 * @returns The indexed items in document order.
 */
export function createIndex(
	lines: ProcessedTextContent[],
	splitSentences = true
): IndexedParagraph[] {
	const indexedItems: IndexedParagraph[] = [];

	// Some statements have the exact same ID in the same document, so every id that
	// has been handed out is remembered here and clashes get an extra numeric suffix.
	const idSet = new Set<string>();
	let uniqueSentenceIdx = 0;

	for (let idx = 0; idx < lines.length; idx++) {
		const line = lines[idx];
		if (!line) {
			continue;
		}

		const { listMatch, parsedStr, id } = line;

		// Resolve the fallback BEFORE deriving per-sentence ids; otherwise the 2nd, 3rd, ...
		// sentence of a line without an id would be labelled "undefined.<n>".
		const baseId = id || String(idx);

		// Compromise library's forEach function is different than regular forEach,
		// so the sentence position is counted by hand.
		let sentenceIdx = 0;

		// If we want to split sentences we use the native iteration of the `compromise`
		// document, otherwise the whole document is wrapped in a one-element JS array.
		(splitSentences ? parsedStr : [parsedStr]).forEach(sentence => {
			// NOTE(review): presumably the typings visible here do not expose `verbs()` on
			// the callback argument - confirm before removing the directive below.
			//@ts-expect-error
			const hasVerbs = sentence.verbs().length > 0;
			const type = hasVerbs ? SentenceType.ACTIVITY : SentenceType.INFORMATION;
			const text = sentence.text();

			const candidateId = sentenceIdx === 0 ? baseId : `${baseId}.${sentenceIdx}`;

			// Keep suffixing until the id is really unused: a generated id such as "1.1"
			// may itself clash with the second sentence of an earlier line "1".
			let sentenceId = candidateId;
			while (idSet.has(sentenceId)) {
				sentenceId = `${candidateId}.${uniqueSentenceIdx}`;
				uniqueSentenceIdx++;
			}
			// Register every id that is handed out, including fallback and suffixed ones.
			idSet.add(sentenceId);

			indexedItems.push({
				...line,
				id: sentenceId,
				displayId: line.id,
				str: text.trim(),
				listMatch,
				type
			});

			sentenceIdx++;
		});
	}

	return indexedItems;
}
