-- Runs all data-integrity checks against the current database contents.
--
-- Returns: one TEXT row per violated rule (the rule's description). An empty
-- result set means every check passed. A rule is reported at most once, no
-- matter how many rows violate it, and rules are evaluated in the order in
-- which they appear below.
--
-- The function only reads data (every check is an EXISTS probe); it never
-- modifies any table.
CREATE OR REPLACE FUNCTION run_integrity_check()
RETURNS SETOF TEXT
LANGUAGE plpgsql
AS $$
BEGIN

    -- SECTION: CHECKS ON A SINGLE TABLE (can be reworked into CHECK CONSTRAINT after they pass)

    -- Mutation-level measurement types must not be used for sequence-level measurement.
    IF EXISTS (
        SELECT * FROM measurement
        WHERE sequence_id IS NOT NULL AND type IN ('DDG', 'DOMAINOME_DDG', 'DOMAINOME_FITNESS', 'DTM')
    ) THEN
        RETURN NEXT 'Mutation-level measurement types must not be used for sequence-level measurement.';
    END IF;

    -- UNIPROTKB and MEGASCALE accessions must be unique and have "unique" column set to true.
    IF EXISTS (
        -- Any (type, accession) pair that occurs more than once.
        SELECT 1 FROM protein_reference
        WHERE type IN ('UNIPROTKB', 'MEGASCALE')
        GROUP BY type, accession
        HAVING COUNT(*) > 1
    ) OR EXISTS (
        -- IS NOT TRUE covers both NULL and false. Keeping it as a single predicate
        -- behind AND matters: AND binds tighter than OR, so a trailing
        -- `OR "unique" = false` would flag references of every other type as well.
        SELECT 1 FROM protein_reference
        WHERE type IN ('UNIPROTKB', 'MEGASCALE') AND "unique" IS NOT TRUE
    ) THEN
        RETURN NEXT 'UNIPROTKB and MEGASCALE accessions must be unique and have "unique" column set to true.';
    END IF;

    -- Measurements must have a numeric value.
    IF EXISTS (
        -- A single offending row is enough for EXISTS; no aggregation is needed.
        SELECT 1
        FROM measurement
        WHERE measurement.num_value IS NULL
    ) THEN
        RETURN NEXT 'Measurements must have a numeric value.';
    END IF;


    -- SECTION: INTEGRITY CHECKS.

    -- Each sequence must have a protein.
    IF EXISTS (
        SELECT * FROM sequence
        LEFT JOIN protein_sequence ON sequence.id = protein_sequence.sequence_id
        WHERE protein_id IS NULL
    ) THEN
        RETURN NEXT 'Each sequence must have a protein.';
    END IF;

    -- Each protein must have a sequence.
    IF EXISTS (
        SELECT * FROM protein
        LEFT JOIN protein_sequence ON protein.id = protein_sequence.protein_id
        WHERE sequence_id IS NULL
    ) THEN
        RETURN NEXT 'Each protein must have a sequence.';
    END IF;

    -- Each mutant must have a mutation (a substitution, an insertion or a deletion).
    IF EXISTS (
        SELECT * FROM mutant
        LEFT JOIN substitution ON mutant.id = substitution.mutant_id
        LEFT JOIN insertion ON mutant.id = insertion.mutant_id
        LEFT JOIN deletion ON mutant.id = deletion.mutant_id
        WHERE substitution.mutant_id IS NULL AND insertion.mutant_id IS NULL AND deletion.mutant_id IS NULL
    ) THEN
        RETURN NEXT 'Each mutant must have a mutation.';
    END IF;

    -- Each mutant must have a measurement.
    IF EXISTS (
        SELECT * FROM mutant
        LEFT JOIN measurement ON mutant.id = measurement.mutant_id
        WHERE measurement.id IS NULL
    ) THEN
        RETURN NEXT 'Each mutant must have a measurement.';
    END IF;

    -- Each dataset must have a measurement.
    IF EXISTS (
        SELECT * FROM dataset
        LEFT JOIN dataset_measurement ON dataset.id = dataset_measurement.dataset_id
        WHERE dataset_measurement.measurement_id IS NULL
    ) THEN
        RETURN NEXT 'Each dataset must have a measurement.';
    END IF;

    -- Each protein must have a UNIPROTKB or MEGASCALE accession.
    -- TODO they should be exclusive
    IF EXISTS (
        -- The type filter lives in ON (not WHERE) so that proteins having only
        -- references of other types still yield a NULL-extended row.
        SELECT * FROM protein
        LEFT JOIN protein_reference REF ON protein.id = REF.protein_id AND REF.type IN ('UNIPROTKB', 'MEGASCALE')
        WHERE REF.accession IS NULL
    ) THEN
        RETURN NEXT 'Each protein must have a UNIPROTKB or MEGASCALE accession.';
    END IF;

    -- Each void must be a tunnel or a pocket, according to its type.
    IF EXISTS (
        -- NOT IN evaluates to NULL for a NULL type, so the NULL case is tested explicitly.
        SELECT 1 FROM void WHERE type IS NULL OR type NOT IN ('POCKET', 'TUNNEL')
    ) OR EXISTS (
        SELECT * FROM void
        LEFT JOIN tunnel ON void.id = tunnel.id
        WHERE void.type = 'TUNNEL' AND tunnel.id IS NULL
    ) OR EXISTS (
        SELECT * FROM void
        LEFT JOIN pocket ON void.id = pocket.id
        WHERE void.type = 'POCKET' AND pocket.id IS NULL
    ) THEN
        RETURN NEXT 'Each void must be a tunnel or a pocket, according to its type.';
    END IF;

    -- Each void must have only residues which are assigned the assembly of the void.
    IF EXISTS (
        SELECT
            void.*,
            void_residues,
            assembly_residues
        FROM void
        -- Subqueries in FROM carry an alias: PostgreSQL 15 and earlier require one.
        LEFT JOIN LATERAL (
            SELECT array_agg(residue_id) AS void_residues
            FROM void_residue
            WHERE void_residue.void_id = void.id
        ) AS void_res ON TRUE
        LEFT JOIN LATERAL (
            SELECT array_agg(residue.id) AS assembly_residues
            FROM assembly
            INNER JOIN chain ON assembly.id = chain.assembly_id
            INNER JOIN residue ON chain.id = residue.chain_id
            WHERE assembly.id = void.assembly_id
        ) AS assembly_res ON TRUE
        -- array_agg over zero rows is NULL. A void without residues (NULL on the
        -- left) is fine and stays unflagged, but an assembly without residues must
        -- count as the empty set, otherwise the comparison is NULL and the
        -- violation goes unnoticed.
        WHERE NOT void_residues <@ COALESCE(assembly_residues, '{}')
    ) THEN
        RETURN NEXT 'Each void must have only residues which are assigned the assembly of the void.';
    END IF;

    -- Each tunnel must have only *bottleneck* residues which are assigned to the assembly of the void.
    IF EXISTS (
        SELECT
            tunnel.*,
            bottleneck_residues,
            assembly_residues
        FROM tunnel
        INNER JOIN void ON tunnel.id = void.id
        LEFT JOIN LATERAL (
            SELECT array_agg(residue_id) AS bottleneck_residues
            FROM tunnel_bottleneck_residue
            WHERE tunnel_bottleneck_residue.tunnel_id = tunnel.id
        ) AS bottleneck_res ON TRUE
        LEFT JOIN LATERAL (
            SELECT array_agg(residue.id) AS assembly_residues
            FROM assembly
            INNER JOIN chain ON assembly.id = chain.assembly_id
            INNER JOIN residue ON chain.id = residue.chain_id
            WHERE assembly.id = void.assembly_id
        ) AS assembly_res ON TRUE
        -- Same NULL handling as in the void residue check above.
        WHERE NOT bottleneck_residues <@ COALESCE(assembly_residues, '{}')
    ) THEN
        RETURN NEXT 'Each tunnel must have only *bottleneck* residues which are assigned to the assembly of the void.';
    END IF;


    -- SECTION: POSITION VALIDITY
    -- Positions are 1-based and must fall inside the sequence they refer to.

    -- Position validity: sequence_residue_mapping.
    IF EXISTS (
        SELECT sequence_id, length(sequence) AS sequence_length, position
        FROM sequence
        INNER JOIN sequence_residue_mapping MAP ON sequence.id = MAP.sequence_id
        WHERE position NOT BETWEEN 1 AND length(sequence)
    ) THEN
        RETURN NEXT 'Position validity: sequence_residue_mapping.';
    END IF;

    -- Position validity: sequence_feature.
    IF EXISTS (
        -- We check only position, as we suppose position_array and position_range are not used.
        -- If they are ever used, they are reported as an error, and the condition must be updated.
        SELECT sequence_id, length(sequence) AS sequence_length, FEAT.*
        FROM sequence
        INNER JOIN sequence_feature FEAT ON sequence.id = FEAT.sequence_id
        WHERE
            position NOT BETWEEN 1 AND length(sequence)
            OR position_array IS NOT NULL
            OR position_range IS NOT NULL
    ) THEN
        RETURN NEXT 'Position validity: sequence_feature.';
    END IF;

    -- Position validity: substitution.
    IF EXISTS (
        SELECT sequence.id AS sequence_id, length(sequence), substitution.*
        FROM sequence
        INNER JOIN mutant ON sequence.id = mutant.source_id
        INNER JOIN substitution ON mutant.id = substitution.mutant_id
        WHERE position NOT BETWEEN 1 AND length(sequence)
    ) THEN
        RETURN NEXT 'Position validity: substitution.';
    END IF;

    -- Position validity: deletion.
    IF EXISTS (
        SELECT sequence.id AS sequence_id, length(sequence), deletion.*
        FROM sequence
        INNER JOIN mutant ON sequence.id = mutant.source_id
        INNER JOIN deletion ON mutant.id = deletion.mutant_id
        -- The whole deleted stretch must fit, so the last admissible start
        -- position shrinks by the length of the deleted amino acids.
        WHERE position NOT BETWEEN 1 AND length(sequence) - length(amino_acids) + 1
    ) THEN
        RETURN NEXT 'Position validity: deletion.';
    END IF;

    -- Position validity: insertion.
    IF EXISTS (
        SELECT sequence.id AS sequence_id, length(sequence), insertion.*
        FROM sequence
        INNER JOIN mutant ON sequence.id = mutant.source_id
        INNER JOIN insertion ON mutant.id = insertion.mutant_id
        WHERE position NOT BETWEEN 0 AND length(sequence)  -- insertion at the beginning has position 0
    ) THEN
        RETURN NEXT 'Position validity: insertion.';
    END IF;

    RETURN;
END;
$$;

-- Runs run_integrity_check() and reports the outcome:
--   * no violations  -> emits a NOTICE and returns normally;
--   * any violation  -> raises an EXCEPTION whose message lists every violated
--                       rule on its own line.
CREATE OR REPLACE PROCEDURE report_integrity_check()
LANGUAGE plpgsql
AS $$
DECLARE
    line_break CONSTANT TEXT := chr(10);
    failures TEXT[];
BEGIN
    SELECT array_agg(failure)
    INTO failures
    FROM run_integrity_check() AS failure;

    -- array_agg yields NULL (not an empty array) when the check returned no rows.
    IF failures IS NULL THEN
        RAISE NOTICE 'Integrity check passed';
        RETURN;
    END IF;

    RAISE EXCEPTION 'Integrity check failed:%', line_break || array_to_string(failures, line_break);
END;
$$;

-- To run, execute:
-- CALL report_integrity_check();
