-- ddg_histogram: distribution of DDG measurement values over predefined bins.
-- Counts individual measurements, not mutations.
-- Considered rows: measurements of type 'DDG' with a non-NULL num_value that
-- belong to an active experiment.
-- Bins are 2.0 wide and include their upper bound, e.g. (-10, -8]. The two
-- outermost bins are open-ended and labelled with identical bounds:
-- '-12 -12' holds values <= -12 and '8 8' holds values > 8.
-- Output: interval_min / interval_max (text, taken from the bin label) and
-- count (measurements in the bin). Bins without measurements yield no row.
DROP VIEW IF EXISTS ddg_histogram;
CREATE VIEW ddg_histogram AS
WITH binned AS (
    -- Label every qualifying measurement with its bin. Branches are evaluated
    -- top to bottom, so each branch only has to state its upper bound.
    SELECT
        CASE
            WHEN mes.num_value <= -12.0 THEN '-12 -12'
            WHEN mes.num_value <= -10.0 THEN '-12 -10'
            WHEN mes.num_value <= -8.0 THEN '-10 -8'
            WHEN mes.num_value <= -6.0 THEN '-8 -6'
            WHEN mes.num_value <= -4.0 THEN '-6 -4'
            WHEN mes.num_value <= -2.0 THEN '-4 -2'
            WHEN mes.num_value <= 0.0 THEN '-2 0'
            WHEN mes.num_value <= 2.0 THEN '0 2'
            WHEN mes.num_value <= 4.0 THEN '2 4'
            WHEN mes.num_value <= 6.0 THEN '4 6'
            WHEN mes.num_value <= 8.0 THEN '6 8'
            WHEN mes.num_value > 8.0 THEN '8 8'
        END AS bin
    FROM measurement AS mes
    INNER JOIN experiment AS exp
        ON exp.id = mes.experiment_id
    WHERE exp.active IS TRUE
      AND mes.type = 'DDG'
      AND mes.num_value IS NOT NULL
)
SELECT
    split_part(bin, ' ', 1) AS interval_min,
    split_part(bin, ' ', 2) AS interval_max,
    count(*) AS count
FROM binned
WHERE bin IS NOT NULL
GROUP BY bin
-- Lowest bin first: sort on the numeric lower bound, then on the upper bound
-- so that the open-ended '-12 -12' bin precedes '-12 -10'.
ORDER BY
    CAST(split_part(bin, ' ', 1) AS numeric),
    CAST(split_part(bin, ' ', 2) AS numeric);

-- dtm_histogram: distribution of DTM measurement values over predefined bins.
-- Considered rows: measurements of type 'DTM' with a non-NULL num_value that
-- belong to an active experiment; each measurement row is counted once.
-- Bins are 5.0 wide and include their upper bound, e.g. (-10, -5]. The two
-- outermost bins are open-ended and labelled with identical bounds:
-- '-15 -15' holds values <= -15 and '15 15' holds values > 15.
-- Output: interval_min / interval_max (text, taken from the bin label) and
-- count (measurements in the bin). Bins without measurements yield no row.
DROP VIEW IF EXISTS dtm_histogram;
CREATE VIEW dtm_histogram AS
WITH binned AS (
    -- Label every qualifying measurement with its bin. Branches are evaluated
    -- top to bottom, so each branch only has to state its upper bound.
    SELECT
        CASE
            WHEN mes.num_value <= -15.0 THEN '-15 -15'
            WHEN mes.num_value <= -10.0 THEN '-15 -10'
            WHEN mes.num_value <= -5.0 THEN '-10 -5'
            WHEN mes.num_value <= 0.0 THEN '-5 0'
            WHEN mes.num_value <= 5.0 THEN '0 5'
            WHEN mes.num_value <= 10.0 THEN '5 10'
            WHEN mes.num_value <= 15.0 THEN '10 15'
            WHEN mes.num_value > 15.0 THEN '15 15'
        END AS bin
    FROM measurement AS mes
    INNER JOIN experiment AS exp
        ON exp.id = mes.experiment_id
    WHERE exp.active IS TRUE
      AND mes.type = 'DTM'
      AND mes.num_value IS NOT NULL
)
SELECT
    split_part(bin, ' ', 1) AS interval_min,
    split_part(bin, ' ', 2) AS interval_max,
    count(*) AS count
FROM binned
GROUP BY bin
-- Lowest bin first: sort on the numeric lower bound, then on the upper bound
-- so that the open-ended '-15 -15' bin precedes '-15 -10'.
ORDER BY
    CAST(split_part(bin, ' ', 1) AS numeric),
    CAST(split_part(bin, ' ', 2) AS numeric);


-- protein_statistics: per-protein row count and share of the total.
-- A row is counted for every measurement of an active experiment that can be
-- joined through mutant -> protein_sequence -> protein.
-- Output: name (protein name), count (joined rows for that protein) and
-- percentage (count relative to the sum of all counts, times 100).
-- Rows are grouped by protein.id and sorted by count, largest first.
DROP VIEW IF EXISTS protein_statistics;
CREATE VIEW protein_statistics AS
SELECT
    p.name,
    COUNT(*) AS count,
    -- The window SUM runs over all groups, i.e. it is the grand total.
    (COUNT(*) / SUM(COUNT(*)) OVER ()) * 100 AS percentage
FROM protein AS p
INNER JOIN protein_sequence AS ps
    ON ps.protein_id = p.id
INNER JOIN mutant AS mu
    ON mu.source_id = ps.sequence_id
INNER JOIN measurement AS m
    ON m.mutant_id = mu.id
INNER JOIN experiment AS e
    ON e.id = m.experiment_id
WHERE e.active IS TRUE
GROUP BY p.id
ORDER BY count DESC;


-- interpro_statistics: the ten most represented InterPro accessions.
-- Contains only the InterPro accession; resolving it to a family name
-- requires a query to the InterPro database.
-- A row is counted for every measurement of an active experiment that can be
-- joined through mutant -> protein_sequence -> protein_reference, restricted
-- to references of type 'INTERPRO'.
-- NOTE(review): COUNT(*) counts joined measurement rows, so a mutant with
-- several measurements contributes several times. The original header
-- described this view as counting different mutations, not measurements -
-- confirm which of the two is intended.
-- Output: accession, count and percentage. The percentage is relative to the
-- total over all InterPro accessions, not only the ten returned, because the
-- window SUM is evaluated before LIMIT is applied.
DROP VIEW IF EXISTS interpro_statistics;
CREATE VIEW interpro_statistics AS
SELECT
    pr.accession,
    COUNT(*) AS count,
    (COUNT(*) / SUM(COUNT(*)) OVER ()) * 100.0 AS percentage
FROM experiment AS e
INNER JOIN measurement AS m
    -- The inner join to mutant below already drops measurements whose
    -- mutant_id is NULL, so no separate IS NOT NULL filter is needed.
    ON m.experiment_id = e.id
INNER JOIN mutant AS mu
    ON mu.id = m.mutant_id
INNER JOIN protein_sequence AS ps
    ON ps.sequence_id = mu.source_id
INNER JOIN protein_reference AS pr
    ON pr.protein_id = ps.protein_id
WHERE e.active IS TRUE
  AND pr.type = 'INTERPRO'
GROUP BY pr.accession
-- accession is the tie-breaker: without it, which rows make the top ten is
-- arbitrary whenever several accessions share the same count at the cutoff.
ORDER BY count DESC, pr.accession
LIMIT 10;

-- mutation_statistics: amino acid statistics.
-- For every (source_aa, target_aa) pair, count is the number of distinct
-- active experiments that have a measurement on a mutant with that change.
-- '-' stands for the missing side of the change:
--   substitution: source_aa -> target_aa
--   insertion:    '-'       -> each inserted character
--   deletion:     each deleted character -> '-'
-- The three branches are disjoint by construction, hence UNION ALL.
-- Assumes amino_acids stores one character per residue - TODO confirm.
DROP VIEW IF EXISTS mutation_statistics;
CREATE VIEW mutation_statistics AS
(
    SELECT
        substitution.source_aa,
        substitution.target_aa,
        COUNT(DISTINCT measurement.experiment_id) AS count
    FROM substitution
    INNER JOIN mutant
        ON mutant.id = substitution.mutant_id
    INNER JOIN measurement
        ON measurement.mutant_id = mutant.id
    INNER JOIN experiment
        ON experiment.id = measurement.experiment_id
       AND experiment.active IS TRUE
    GROUP BY substitution.source_aa, substitution.target_aa
)
UNION ALL
(
    -- string_to_array() splits into single characters only when the delimiter
    -- is NULL; an empty-string delimiter returns the whole string as one
    -- element, which would report a multi-residue insertion as one amino acid.
    SELECT
        '-' AS source_aa,
        unnest(string_to_array(amino_acids, NULL)) AS target_aa,
        COUNT(DISTINCT measurement.experiment_id) AS count
    FROM insertion
    INNER JOIN mutant
        ON mutant.id = insertion.mutant_id
    INNER JOIN measurement
        ON measurement.mutant_id = mutant.id
    INNER JOIN experiment
        ON experiment.id = measurement.experiment_id
       AND experiment.active IS TRUE
    GROUP BY target_aa
)
UNION ALL
(
    -- Same per-character split as for insertions, applied to the deleted
    -- residues (NULL delimiter = one array element per character).
    SELECT
        unnest(string_to_array(amino_acids, NULL)) AS source_aa,
        '-' AS target_aa,
        COUNT(DISTINCT measurement.experiment_id) AS count
    FROM deletion
    INNER JOIN mutant
        ON mutant.id = deletion.mutant_id
    INNER JOIN measurement
        ON measurement.mutant_id = mutant.id
    INNER JOIN experiment
        ON experiment.id = measurement.experiment_id
       AND experiment.active IS TRUE
    GROUP BY source_aa
);