"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.createMatches = exports.atomize = void 0;
const sortByReference_1 = require("./sortByReference");
// Characters that delimit the parts of a reference: "-", ".", "(", ")" and "/".
// Used as the split pattern by atomize() and splitForSorting().
const separators = /[-\.\(\)\/]/g;
/**
 * Breaks a reference string into its smallest pieces ("atoms").
 *
 * The string is first cut at every separator character, then each resulting
 * segment is cut again wherever a digit is directly followed by a non-digit
 * or a non-digit is directly followed by a digit. Empty pieces are dropped.
 *
 * @param {string} str - The reference text, e.g. "B.1.3.2beigefügten".
 * @returns {string[]} The atoms in their original order, e.g. ['B', '1', '3', '2', 'beigefügten'].
 */
function atomize(str) {
    // Zero-width boundary between a digit and a non-digit, in either order.
    const digitBoundary = /(?<=\d)(?=\D)|(?<=\D)(?=\d)/;
    const atoms = [];
    for (const segment of str.split(separators)) {
        for (const piece of segment.split(digitBoundary)) {
            if (piece.length > 0) {
                atoms.push(piece);
            }
        }
    }
    return atoms;
}
exports.atomize = atomize;
/**
 * Groups the elements of an array by a key computed for each element.
 *
 * @param {Array} array - The elements to group.
 * @param {Function} accessor - Called exactly once per element; its return value
 *   (converted to a property key) names the group the element is placed in.
 * @returns {Object} A plain object mapping each key to the array of elements
 *   that produced it, with elements kept in their original order.
 */
function groupBy(array, accessor) {
    const hasOwn = Object.prototype.hasOwnProperty;
    return array.reduce((groups, value) => {
        const key = accessor(value);
        // Only own properties count as existing groups. A plain truthiness check would
        // mistake inherited members such as "constructor" or "toString" for a group and
        // then fail on push().
        if (!hasOwn.call(groups, key)) {
            // defineProperty (rather than assignment) so that a "__proto__" key creates a
            // regular own property instead of replacing the object's prototype.
            Object.defineProperty(groups, key, {
                value: [],
                writable: true,
                enumerable: true,
                configurable: true,
            });
        }
        groups[key].push(value);
        return groups;
    }, {});
}
/**
 * Tells whether a string is a roman numeral between I and XX (case-insensitive).
 *
 * One opening parenthesis at the very start and one closing parenthesis at the
 * very end are ignored, so "(iv)" is treated like "iv".
 *
 * @param {string} str - The candidate text.
 * @returns {boolean} True when the text, without the surrounding parentheses, is one of I..XX.
 */
function isRomanNumeral(str) {
    const romanNumeralRegex = /^(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)$/i;
    let candidate = str;
    if (candidate.startsWith('(')) {
        candidate = candidate.slice(1);
    }
    if (candidate.endsWith(')')) {
        candidate = candidate.slice(0, -1);
    }
    return romanNumeralRegex.test(candidate);
}
/**
 * Cuts a reference at every separator character so it can be compared piece by piece.
 *
 * @param {string} subject - The reference text, e.g. "1.6(ii)".
 * @returns {string[]} The non-empty pieces between separators, e.g. ['1', '6', 'ii'].
 */
function splitForSorting(subject) {
    const pieces = [];
    for (const piece of subject.split(separators)) {
        if (piece.length > 0) {
            pieces.push(piece);
        }
    }
    return pieces;
}
/**
 * Scans a text for references to attachments such as "Anlage 1.6(ii)" or "Schedule B"
 * and returns them grouped and sorted.
 *
 * Every non-empty line is searched once per supported language. A hit consists of a prefix
 * keyword followed by a reference that is assembled from the first one or two
 * whitespace-separated tokens after the keyword. Hits with the same prefix and reference
 * are merged into a single entry that lists every line they were found in.
 *
 * @param {string} rawText - The text to analyse; lines are separated by "\r" or "\n".
 * @param {boolean} [explain=false] - Not read by this function; kept so existing callers keep working.
 * @returns {{languages: string[], analysis: Array<{prefix: string, reference: string, language: string, fullMatches: string[]}>}}
 *   `analysis` holds one entry per distinct "prefix reference" pair, sorted by prefix and then by
 *   reference; `languages` lists the distinct languages of those entries in the same order.
 */
function createMatches(rawText, explain = false) {
    const languages = ['en', 'de'];
    const regexByLanguage = {
        de: /(?<prefix>Anlage|Anhang)\s+(?<paragraph>§\s*)*\s*(?<reference1>\S*\s*)\s*(?<reference2>\S*\s*)\s*(?<reference3>\S*\s*)/g,
        en: /(?<prefix>Schedule|Annex|Exhibit|Appendix)\s+(?<paragraph>§\s*)*\s*(?<reference1>\S*\s*)\s*(?<reference2>\S*\s*)\s*(?<reference3>\S*\s*)/g,
    };
    // NOTE(review): 'Appendix' is matched by the English regex but is not listed here, so
    // indexOf() returns -1 for it and Appendix entries sort before every other prefix.
    // Confirm whether that ordering is intended.
    const prefixesForSorting = ['Anlage', 'Anhang', 'Schedule', 'Annex', 'Exhibit'];

    // The line split does not depend on the language, so it is done once up front.
    const lines = rawText
        .replace(/\u200E/g, '') // Remove hidden character present in some word files
        .split(/[\r\n]/) // split into lines
        .filter(line => line.length > 0); // remove empty lines

    const allMatches = languages.map(language => createMatchesByLanguage(language)).flat();
    const byFullName = groupBy(allMatches, match => `${match.prefix} ${match.reference}`);
    const aggregatedMatches = Object.values(byFullName).map(findings => ({
        prefix: findings[0].prefix,
        reference: findings[0].reference,
        language: findings[0].language,
        fullMatches: findings.map(match => match.fullMatch)
    }));

    aggregatedMatches.sort((a, b) => sortByPrefix(a, b) || (0, sortByReference_1.sortElementWise)(splitForSorting(a.reference), splitForSorting(b.reference)));
    return {
        languages: [...new Set(aggregatedMatches.map(match => match.language))],
        analysis: aggregatedMatches
    };

    /** Orders two entries by the position of their prefix in prefixesForSorting. */
    function sortByPrefix(a, b) {
        return prefixesForSorting.indexOf(a.prefix) - prefixesForSorting.indexOf(b.prefix);
    }

    /** Collects every usable hit of one language's regex, line by line. */
    function createMatchesByLanguage(language) {
        const regex = regexByLanguage[language];
        const matches = [];
        for (const line of lines) {
            regex.lastIndex = 0;
            let result;
            while ((result = regex.exec(line)) !== null) {
                if (result.groups == null) {
                    continue;
                }
                const { paragraph: paragraphSymbol, prefix, reference1, reference2 } = result.groups;
                // We add 1 to the index because we want to start searching one character after the last match.
                // This helps with cases like "Anlage Anlage 1.6(ii)" where the first match is "Anlage Anlage", which is
                // not a valid reference, and the regex could then no longer match just "Anlage 1.6(ii)". Restarting one
                // character after the first match lets the regex see "nlage Anlage 1.6(ii)" and match "Anlage 1.6(ii)".
                regex.lastIndex = result.index + 1;
                const reference = buildReference(reference1, reference2);
                if (reference === null) {
                    continue;
                }
                matches.push({
                    language,
                    fullMatch: line,
                    prefix,
                    reference: paragraphSymbol != null ? paragraphSymbol + reference : reference
                });
            }
        }
        return matches;
    }

    /**
     * Assembles the reference from the first two tokens after the prefix.
     * Returns null when no acceptable reference can be derived.
     */
    function buildReference(referencePart1, referencePart2) {
        let firstPart = referencePart1;
        let secondPart = referencePart2;
        // A trailing comma, colon or semicolon on the first part tells us that the second part does not belong
        // to the reference. The same goes for a tab character, which happens in a table of contents where the
        // reference is in the first column and the page number in the second.
        if (/[,:;]$/.test(firstPart.trim()) || /\t/.test(firstPart)) {
            secondPart = '';
            // The second part is already discarded, so the trailing punctuation can go as well.
            firstPart = firstPart.trim().replace(/[,:;]$/, '');
        }
        // We also had a case where part 2 was just "- ". That is not a reference.
        if (/^\-\s+/.test(secondPart)) {
            secondPart = '';
        }
        let combinedReference = firstPart.trim() + secondPart.trim();
        if (!isGoodReference(combinedReference)) {
            combinedReference = firstPart.trim();
        }
        if (!isGoodReference(combinedReference)) {
            // All up to now failed, so assume the text is something like "als Anlage B.1.3.2beigefügten Erklärungen",
            // missing a blank. Atomizing this gives us ['B', '1', '3', '2', 'beigefügten'].
            const atoms = atomize(combinedReference);
            // At least two atoms are needed so that something is left after removing one.
            if (atoms.length > 1) {
                const lastAtom = atoms[atoms.length - 1];
                // Only the last atom is removed. If there are multiple bad atoms, we're out of luck.
                // The atom is document text, so it is removed as a literal suffix; building a RegExp from it
                // would throw on input such as "1*" or "2[a" and mis-match on input such as "a|b".
                if (!isReferenceAtom(lastAtom) && combinedReference.endsWith(lastAtom)) {
                    combinedReference = combinedReference.slice(0, -lastAtom.length);
                }
            }
            // We tried everything; give up on this match.
            if (!isGoodReference(combinedReference)) {
                return null;
            }
        }
        // trim trailing dot, comma, semicolon or colon
        return combinedReference.replace(/[\.\,\;\:]\s*$/, '').trim();
    }

    /** A reference is good when it is non-empty and every one of its atoms is a reference atom. */
    function isGoodReference(str) {
        if (str.length === 0) {
            return false;
        }
        return atomize(str).every(isReferenceAtom);
    }

    /** An atom is acceptable when it is all digits, one repeated letter, or a roman numeral. */
    function isReferenceAtom(str) {
        // str consists only of digits
        if (/^\d+$/.test(str)) {
            return true;
        }
        // str is a single letter or the same letter repeated
        if (/^([a-zA-Z])\1*$/.test(str)) {
            return true;
        }
        // str is a roman numeral
        return isRomanNumeral(str);
    }
}
exports.createMatches = createMatches;
