/* --------------------------------------------------------------
   FILE: 00_data_setup_and_staging.sql
   PURPOSE: End-to-end professional cleaning pipeline
   DATASET: hospital_readmissions.csv
   DB: hospital_readmissions
   OUTPUT: core_readmissions_clean (analysis-ready) + export
   DIALECT: MySQL 8.x
-------------------------------------------------------------- */

-- 0) DATABASE SETUP
CREATE DATABASE IF NOT EXISTS hospital_readmissions;
USE hospital_readmissions;

-- Recommended for scripting
SET SQL_SAFE_UPDATES = 0;

-- 1) STAGING LAYER (RAW LOAD)
-- ----------------------------------------------------------
DROP TABLE IF EXISTS stg_hospital_readmissions;

CREATE TABLE stg_hospital_readmissions (
    stg_row_id           INT AUTO_INCREMENT PRIMARY KEY,

    patient_age          VARCHAR(30),
    time_in_hospital     SMALLINT,
    n_lab_procedures     SMALLINT,
    n_procedures         SMALLINT,
    n_medications        SMALLINT,
    n_outpatient         SMALLINT,
    n_inpatient          SMALLINT,
    n_emergency          SMALLINT,

    medical_specialty    VARCHAR(100),
    diag_1               VARCHAR(30),
    diag_2               VARCHAR(30),
    diag_3               VARCHAR(30),

    glucose_test         VARCHAR(20),
    A1Ctest              VARCHAR(20),
    change_status        VARCHAR(20),
    diabetes_med         VARCHAR(20),
    readmitted           VARCHAR(20)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- [IMPORT NOTE]
-- Import hospital_readmissions.csv into stg_hospital_readmissions using Workbench Import Wizard.
SELECT		*
FROM		stg_hospital_readmissions;
-- 2) QUICK SANITY CHECKS
-- ----------------------------------------------------------
SELECT COUNT(*) AS total_rows FROM stg_hospital_readmissions;

-- Missing / blank counts for text fields
SELECT
    SUM(patient_age IS NULL OR TRIM(patient_age) = '') AS patient_age_blank,
    SUM(medical_specialty IS NULL OR TRIM(medical_specialty) = '') AS medical_specialty_blank,
    SUM(diag_1 IS NULL OR TRIM(diag_1) = '') AS diag_1_blank,
    SUM(diag_2 IS NULL OR TRIM(diag_2) = '') AS diag_2_blank,
    SUM(diag_3 IS NULL OR TRIM(diag_3) = '') AS diag_3_blank,
    SUM(glucose_test IS NULL OR TRIM(glucose_test) = '') AS glucose_test_blank,
    SUM(A1Ctest IS NULL OR TRIM(A1Ctest) = '') AS A1Ctest_blank,
    SUM(change_status IS NULL OR TRIM(change_status) = '') AS change_status_blank,
    SUM(diabetes_med IS NULL OR TRIM(diabetes_med) = '') AS diabetes_med_blank,
    SUM(readmitted IS NULL OR TRIM(readmitted) = '') AS readmitted_blank
FROM stg_hospital_readmissions;

-- Basic domain check for numeric negatives (should be 0)
SELECT
    SUM(time_in_hospital < 0) AS time_in_hospital_neg,
    SUM(n_lab_procedures < 0) AS n_lab_procedures_neg,
    SUM(n_procedures < 0) AS n_procedures_neg,
    SUM(n_medications < 0) AS n_medications_neg,
    SUM(n_outpatient < 0) AS n_outpatient_neg,
    SUM(n_inpatient < 0) AS n_inpatient_neg,
    SUM(n_emergency < 0) AS n_emergency_neg
FROM stg_hospital_readmissions;

-- 3) STANDARDIZE TEXT “MISSING” TOKENS
-- ----------------------------------------------------------
-- Only apply to VARCHAR columns (not numeric fields).
UPDATE stg_hospital_readmissions
SET patient_age = NULL
WHERE patient_age IS NOT NULL AND LOWER(TRIM(patient_age)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET medical_specialty = NULL
WHERE medical_specialty IS NOT NULL AND LOWER(TRIM(medical_specialty)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET diag_1 = NULL
WHERE diag_1 IS NOT NULL AND LOWER(TRIM(diag_1)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET diag_2 = NULL
WHERE diag_2 IS NOT NULL AND LOWER(TRIM(diag_2)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET diag_3 = NULL
WHERE diag_3 IS NOT NULL AND LOWER(TRIM(diag_3)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET glucose_test = NULL
WHERE glucose_test IS NOT NULL AND LOWER(TRIM(glucose_test)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET A1Ctest = NULL
WHERE A1Ctest IS NOT NULL AND LOWER(TRIM(A1Ctest)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET change_status = NULL
WHERE change_status IS NOT NULL AND LOWER(TRIM(change_status)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET diabetes_med = NULL
WHERE diabetes_med IS NOT NULL AND LOWER(TRIM(diabetes_med)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

UPDATE stg_hospital_readmissions
SET readmitted = NULL
WHERE readmitted IS NOT NULL AND LOWER(TRIM(readmitted)) IN ('', 'na', 'n/a', 'none', 'null', 'unknown');

-- 4) STANDARDIZE CATEGORICAL VALUES
-- ----------------------------------------------------------
UPDATE stg_hospital_readmissions
SET readmitted =
    CASE
        WHEN readmitted IS NULL THEN NULL
        WHEN LOWER(TRIM(readmitted)) = 'yes' THEN 'Yes'
        WHEN LOWER(TRIM(readmitted)) = 'no'  THEN 'No'
        ELSE NULL
    END;

UPDATE stg_hospital_readmissions
SET diabetes_med =
    CASE
        WHEN diabetes_med IS NULL THEN NULL
        WHEN LOWER(TRIM(diabetes_med)) = 'yes' THEN 'Yes'
        WHEN LOWER(TRIM(diabetes_med)) = 'no'  THEN 'No'
        ELSE NULL
    END;

UPDATE stg_hospital_readmissions
SET change_status =
    CASE
        WHEN change_status IS NULL THEN NULL
        WHEN LOWER(TRIM(change_status)) = 'yes' THEN 'Yes'
        WHEN LOWER(TRIM(change_status)) = 'no'  THEN 'No'
        ELSE NULL
    END;

UPDATE stg_hospital_readmissions
SET A1Ctest =
    CASE
        WHEN A1Ctest IS NULL THEN NULL
        WHEN LOWER(TRIM(A1Ctest)) = 'high'   THEN 'High'
        WHEN LOWER(TRIM(A1Ctest)) = 'normal' THEN 'Normal'
        WHEN LOWER(TRIM(A1Ctest)) = 'no'     THEN 'No'
        ELSE NULL
    END;

UPDATE stg_hospital_readmissions
SET glucose_test =
    CASE
        WHEN glucose_test IS NULL THEN NULL
        WHEN LOWER(TRIM(glucose_test)) = 'high'   THEN 'High'
        WHEN LOWER(TRIM(glucose_test)) = 'normal' THEN 'Normal'
        WHEN LOWER(TRIM(glucose_test)) = 'no'     THEN 'No'
        ELSE NULL
    END;

-- 5) DUPLICATE CHECK (DIAGNOSTIC)
-- ----------------------------------------------------------
SELECT
    patient_age, time_in_hospital, n_lab_procedures, n_procedures, n_medications,
    n_outpatient, n_inpatient, n_emergency, medical_specialty,
    diag_1, diag_2, diag_3, glucose_test, A1Ctest, change_status, diabetes_med, readmitted,
    COUNT(*) AS dups
FROM stg_hospital_readmissions
GROUP BY
    patient_age, time_in_hospital, n_lab_procedures, n_procedures, n_medications,
    n_outpatient, n_inpatient, n_emergency, medical_specialty,
    diag_1, diag_2, diag_3, glucose_test, A1Ctest, change_status, diabetes_med, readmitted
HAVING COUNT(*) > 1;

-- 6) OUTLIER FLAGS (IQR APPROX) + OPTIONAL CAPPING
-- ----------------------------------------------------------
WITH ordered AS (
    SELECT
        time_in_hospital, n_lab_procedures, n_procedures, n_medications,
        n_outpatient, n_inpatient, n_emergency,
        PERCENT_RANK() OVER (ORDER BY time_in_hospital)  AS pr_time,
        PERCENT_RANK() OVER (ORDER BY n_lab_procedures)  AS pr_lab,
        PERCENT_RANK() OVER (ORDER BY n_procedures)      AS pr_proc,
        PERCENT_RANK() OVER (ORDER BY n_medications)     AS pr_med,
        PERCENT_RANK() OVER (ORDER BY n_outpatient)      AS pr_out,
        PERCENT_RANK() OVER (ORDER BY n_inpatient)       AS pr_in,
        PERCENT_RANK() OVER (ORDER BY n_emergency)       AS pr_emg
    FROM stg_hospital_readmissions
),
quartiles AS (
    SELECT
        MIN(CASE WHEN pr_time >= 0.25 THEN time_in_hospital END) AS q1_time,
        MIN(CASE WHEN pr_time >= 0.75 THEN time_in_hospital END) AS q3_time,

        MIN(CASE WHEN pr_lab >= 0.25 THEN n_lab_procedures END) AS q1_lab,
        MIN(CASE WHEN pr_lab >= 0.75 THEN n_lab_procedures END) AS q3_lab,

        MIN(CASE WHEN pr_proc >= 0.25 THEN n_procedures END) AS q1_proc,
        MIN(CASE WHEN pr_proc >= 0.75 THEN n_procedures END) AS q3_proc,

        MIN(CASE WHEN pr_med >= 0.25 THEN n_medications END) AS q1_med,
        MIN(CASE WHEN pr_med >= 0.75 THEN n_medications END) AS q3_med,

        MIN(CASE WHEN pr_out >= 0.25 THEN n_outpatient END) AS q1_out,
        MIN(CASE WHEN pr_out >= 0.75 THEN n_outpatient END) AS q3_out,

        MIN(CASE WHEN pr_in >= 0.25 THEN n_inpatient END) AS q1_in,
        MIN(CASE WHEN pr_in >= 0.75 THEN n_inpatient END) AS q3_in,

        MIN(CASE WHEN pr_emg >= 0.25 THEN n_emergency END) AS q1_emg,
        MIN(CASE WHEN pr_emg >= 0.75 THEN n_emergency END) AS q3_emg
    FROM ordered
)
SELECT
    q1_time, q3_time,
    q1_lab, q3_lab,
    q1_proc, q3_proc,
    q1_med, q3_med,
    q1_out, q3_out,
    q1_in, q3_in,
    q1_emg, q3_emg
FROM quartiles;

-- 7) BUILD CORE CLEAN TABLE (ANALYSIS-READY)
-- ----------------------------------------------------------
DROP TABLE IF EXISTS core_readmissions_clean;
CREATE TABLE core_readmissions_clean AS
WITH ordered AS (
    SELECT
        stg_row_id,
        TRIM(patient_age) AS age,
        time_in_hospital,
        n_lab_procedures,
        n_procedures,
        n_medications,
        n_outpatient,
        n_inpatient,
        n_emergency,
        NULLIF(TRIM(medical_specialty), '') AS medical_specialty,
        NULLIF(TRIM(diag_1), '') AS diag_1,
        NULLIF(TRIM(diag_2), '') AS diag_2,
        NULLIF(TRIM(diag_3), '') AS diag_3,
        TRIM(glucose_test) AS glucose_test,
        TRIM(A1Ctest) AS A1Ctest,
        TRIM(change_status) AS change_status,
        TRIM(diabetes_med) AS diabetes_med,
        TRIM(readmitted) AS readmitted,

        PERCENT_RANK() OVER (ORDER BY time_in_hospital)  AS pr_time,
        PERCENT_RANK() OVER (ORDER BY n_lab_procedures)  AS pr_lab,
        PERCENT_RANK() OVER (ORDER BY n_procedures)      AS pr_proc,
        PERCENT_RANK() OVER (ORDER BY n_medications)     AS pr_med,
        PERCENT_RANK() OVER (ORDER BY n_outpatient)      AS pr_out,
        PERCENT_RANK() OVER (ORDER BY n_inpatient)       AS pr_in,
        PERCENT_RANK() OVER (ORDER BY n_emergency)       AS pr_emg
    FROM stg_hospital_readmissions
),
quartiles AS (
    SELECT
        MIN(CASE WHEN pr_time >= 0.25 THEN time_in_hospital END) AS q1_time,
        MIN(CASE WHEN pr_time >= 0.75 THEN time_in_hospital END) AS q3_time,

        MIN(CASE WHEN pr_lab >= 0.25 THEN n_lab_procedures END) AS q1_lab,
        MIN(CASE WHEN pr_lab >= 0.75 THEN n_lab_procedures END) AS q3_lab,

        MIN(CASE WHEN pr_proc >= 0.25 THEN n_procedures END) AS q1_proc,
        MIN(CASE WHEN pr_proc >= 0.75 THEN n_procedures END) AS q3_proc,

        MIN(CASE WHEN pr_med >= 0.25 THEN n_medications END) AS q1_med,
        MIN(CASE WHEN pr_med >= 0.75 THEN n_medications END) AS q3_med,

        MIN(CASE WHEN pr_out >= 0.25 THEN n_outpatient END) AS q1_out,
        MIN(CASE WHEN pr_out >= 0.75 THEN n_outpatient END) AS q3_out,

        MIN(CASE WHEN pr_in >= 0.25 THEN n_inpatient END) AS q1_in,
        MIN(CASE WHEN pr_in >= 0.75 THEN n_inpatient END) AS q3_in,

        MIN(CASE WHEN pr_emg >= 0.25 THEN n_emergency END) AS q1_emg,
        MIN(CASE WHEN pr_emg >= 0.75 THEN n_emergency END) AS q3_emg
    FROM ordered
)
SELECT
    o.stg_row_id,
    o.age,
    o.time_in_hospital,
    o.n_lab_procedures,
    o.n_procedures,
    o.n_medications,
    o.n_outpatient,
    o.n_inpatient,
    o.n_emergency,
    o.medical_specialty,
    o.diag_1,
    o.diag_2,
    o.diag_3,
    o.glucose_test,
    o.A1Ctest,
    o.change_status,
    o.diabetes_med,
    o.readmitted,

    -- outlier flags (do NOT remove by default)
    CASE
        WHEN o.time_in_hospital IS NULL THEN 0
        WHEN o.time_in_hospital < (q1_time - 1.5*(q3_time-q1_time))
          OR o.time_in_hospital > (q3_time + 1.5*(q3_time-q1_time)) THEN 1
        ELSE 0
    END AS outlier_time_in_hospital,

    CASE
        WHEN o.n_lab_procedures IS NULL THEN 0
        WHEN o.n_lab_procedures < (q1_lab - 1.5*(q3_lab-q1_lab))
          OR o.n_lab_procedures > (q3_lab + 1.5*(q3_lab-q1_lab)) THEN 1
        ELSE 0
    END AS outlier_n_lab_procedures,

    CASE
        WHEN o.n_medications IS NULL THEN 0
        WHEN o.n_medications < (q1_med - 1.5*(q3_med-q1_med))
          OR o.n_medications > (q3_med + 1.5*(q3_med-q1_med)) THEN 1
        ELSE 0
    END AS outlier_n_medications

FROM ordered o
CROSS JOIN quartiles
WHERE NOT (
    o.age IS NULL AND
    o.time_in_hospital IS NULL AND
    o.n_lab_procedures IS NULL AND
    o.n_procedures IS NULL AND
    o.n_medications IS NULL AND
    o.n_outpatient IS NULL AND
    o.n_inpatient IS NULL AND
    o.n_emergency IS NULL AND
    o.medical_specialty IS NULL AND
    o.diag_1 IS NULL AND
    o.diag_2 IS NULL AND
    o.diag_3 IS NULL AND
    o.glucose_test IS NULL AND
    o.A1Ctest IS NULL AND
    o.change_status IS NULL AND
    o.diabetes_med IS NULL AND
    o.readmitted IS NULL
);

SELECT COUNT(*) AS core_rows FROM core_readmissions_clean;

-- 8) CONSISTENCY CHECKS (DIAGNOSTIC)
-- ----------------------------------------------------------
SELECT *
FROM core_readmissions_clean
WHERE diabetes_med = 'No' AND change_status = 'Yes';

-- 9) EXPORT (CSV)
-- ----------------------------------------------------------
SELECT
    'stg_row_id','age','time_in_hospital','n_lab_procedures','n_procedures','n_medications',
    'n_outpatient','n_inpatient','n_emergency','medical_specialty','diag_1','diag_2','diag_3',
    'glucose_test','A1Ctest','change_status','diabetes_med','readmitted',
    'outlier_time_in_hospital','outlier_n_lab_procedures','outlier_n_medications'
UNION ALL
SELECT
    stg_row_id, age, time_in_hospital, n_lab_procedures, n_procedures, n_medications,
    n_outpatient, n_inpatient, n_emergency, medical_specialty, diag_1, diag_2, diag_3,
    glucose_test, A1Ctest, change_status, diabetes_med, readmitted,
    outlier_time_in_hospital, outlier_n_lab_procedures, outlier_n_medications
FROM core_readmissions_clean
INTO OUTFILE 'C:/ProgramData/MySQL/MySQL Server 9.5/Uploads/core_readmissions_clean.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
ESCAPED BY '"'
LINES TERMINATED BY '\n';