/* 	Project: Banking Credit Risk & Customer Behaviour Analysis
	File 01: 00_data_setup_and_staging.sql
	DataSets:
		1) UCI Credit Card Default Dataset (customer-level)
		2) Credit Card Transactions / Fraud Dataset (transaction-level)
	Output:
		1) core_transaction
        2) core_customers
*/			
-- 1) Create database + use it
CREATE DATABASE IF NOT EXISTS banking_credit_risk;
USE banking_credit_risk;

-- 2) STAGING LAYER (RAW DATA – NO BUSINESS LOGIC)
-- -----------------------------------------------------------------
-- [2.1] Staging table: UCI Credit Card (customer-level)
DROP TABLE IF EXISTS stg_uci_credit_card;

CREATE TABLE stg_uci_credit_card (
    id                          INT,
    limit_bal                   DECIMAL(12,2),
    sex                         TINYINT,
    education                   TINYINT,
    marriage                    TINYINT,
    age                         TINYINT,

    pay_0                       TINYINT,
    pay_2                       TINYINT,
    pay_3                       TINYINT,
    pay_4                       TINYINT,
    pay_5                       TINYINT,
    pay_6                       TINYINT,

    bill_amt1                   DECIMAL(12,2),
    bill_amt2                   DECIMAL(12,2),
    bill_amt3                   DECIMAL(12,2),
    bill_amt4                   DECIMAL(12,2),
    bill_amt5                   DECIMAL(12,2),
    bill_amt6                   DECIMAL(12,2),

    pay_amt1                    DECIMAL(12,2),
    pay_amt2                    DECIMAL(12,2),
    pay_amt3                    DECIMAL(12,2),
    pay_amt4                    DECIMAL(12,2),
    pay_amt5                    DECIMAL(12,2),
    pay_amt6                    DECIMAL(12,2),

    default_payment_next_month  TINYINT
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- [2.2] Staging table: Credit Card Fraud (transaction-level)
DROP TABLE IF EXISTS stg_creditcard;
CREATE TABLE stg_creditcard (
    time_sec    INT,

    v1          DOUBLE,  v2  DOUBLE,  v3  DOUBLE,  v4  DOUBLE,  v5  DOUBLE,  v6  DOUBLE,  v7  DOUBLE,
    v8          DOUBLE,  v9  DOUBLE,  v10 DOUBLE,  v11 DOUBLE,  v12 DOUBLE,  v13 DOUBLE,  v14 DOUBLE,
    v15         DOUBLE,  v16 DOUBLE,  v17 DOUBLE,  v18 DOUBLE,  v19 DOUBLE,  v20 DOUBLE,  v21 DOUBLE,
    v22         DOUBLE,  v23 DOUBLE,  v24 DOUBLE,  v25 DOUBLE,  v26 DOUBLE,  v27 DOUBLE,  v28 DOUBLE,

    amount      DECIMAL(12,2),
    class       TINYINT
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- 3) IMPORT STEP (manual via Workbench):
--    Import into stg_uci_credit_card and stg_creditcard using "Table Data Import Wizard"
SELECT		*
FROM		stg_uci_credit_card;
SELECT		*
FROM		stg_creditcard;

-- 4) BASIC IMPORT CHECKS
-- ---------------------------------------------------------
-- [4.1] UCI credit card
SELECT COUNT(*) AS total_rows
FROM stg_uci_credit_card;

SELECT *
FROM stg_uci_credit_card
LIMIT 10;

-- [4.2] Fraud Creditcard
SELECT COUNT(*) AS total_rows
FROM stg_creditcard;

SELECT *
FROM stg_creditcard
LIMIT 10;

-- 5) DATA QUALITY CHECKS (STAGING)
-- ---------------------------------------------------------
-- [5.1] Missing (NULL) checks: UCI important columns
SELECT
    SUM(id IS NULL OR id = '') AS id_nulls,
    SUM(limit_bal IS NULL OR limit_bal = '') AS limit_bal_nulls,
    SUM(sex IS NULL OR sex = '') AS sex_nulls,
    SUM(education IS NULL OR education = '') AS education_nulls,
    SUM(marriage IS NULL OR marriage = '') AS marriage_nulls,
    SUM(age IS NULL OR age = '') AS age_nulls,
    SUM(default_payment_next_month IS NULL OR default_payment_next_month = '') AS default_flag_nulls
FROM stg_uci_credit_card;

-- [5.2] Missing (NULL) checks: Fraud important columns
SELECT
    SUM(time_sec IS NULL OR time_sec = '') AS time_nulls,
    SUM(amount IS NULL OR amount = '') AS amount_nulls,
    SUM(class IS NULL OR class = '') AS class_nulls
FROM stg_creditcard;

-- 6) Check Duplicate
-- [6.1] Customers by ID (UCI)
SELECT 		id, 
			COUNT(*) AS dup_count
FROM 		stg_uci_credit_card
GROUP BY 	id
HAVING 		COUNT(*) > 1;

-- [6.2] Full-row duplicates (Fraud) via grouping all columns (heavy but accurate)
SELECT 		time_sec, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, 
			v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, amount, class, 
            COUNT(*) AS dup_count 
FROM stg_creditcard 
GROUP BY 	time_sec, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, 
			v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, amount, class 
HAVING COUNT(*) > 1;

-- [6.3] Total number of distinct duplicates Fraud creditcard
SELECT		COUNT(*) AS total_dup
FROM(
SELECT		1
FROM 		stg_creditcard
GROUP BY
    time_sec, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
    v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
    v21, v22, v23, v24, v25, v26, v27, v28,
    amount, class
HAVING COUNT(*) > 1
) d;

-- [6.4] Total number of duplicates Fraud creditcard
SELECT		SUM(dup_count -1) AS total_dups
FROM		(
SELECT		COUNT(*) AS dup_count
FROM		stg_creditcard
GROUP BY	time_sec, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
			v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
			v21, v22, v23, v24, v25, v26, v27, v28,
			amount, class
HAVING		COUNT(*) > 1
) d;
/* 
Findings:
   - 298 duplicate groups detected
   - 420 extra duplicate rows identified (SUM(dup_count - 1))
Action Plan:
   - Deduplication will be performed in the CORE layer
*/

-- 7) Validation and Standardization
-- [7.1] Validation and Standardization UCI Credit card
SELECT		*
FROM		stg_uci_credit_card;

-- [7.2] limit_bal non negative check
SELECT		
		SUM(limit_bal < 0) AS limit_check
FROM	stg_uci_credit_card;

-- [7.3] Sex Standardization
SELECT		DISTINCT sex
FROM		stg_uci_credit_card;
SET			SQL_SAFE_UPDATES = 0;

ALTER TABLE		stg_uci_credit_card
ADD COLUMN		sex_label VARCHAR(10);

UPDATE		stg_uci_credit_card
SET			sex_label =	
CASE
		WHEN	sex = 1 THEN 'Male'
        WHEN	sex = 2 THEN 'Female'
		ELSE 	NULL
        END;

SELECT		DISTINCT sex_label
FROM		stg_uci_credit_card;

-- [7.4] Education Standardization
SELECT		DISTINCT education
FROM		stg_uci_credit_card;

ALTER TABLE		stg_uci_credit_card
ADD COLUMN		education_label VARCHAR(30);

UPDATE		stg_uci_credit_card
SET			education_label =
CASE
			WHEN education = 1 THEN 'Graduate School'
            WHEN education = 2 THEN 'University'
            WHEN education = 3 THEN 'High School'
            WHEN education = 4 THEN 'Others'
            WHEN education = 5 THEN 'Unknown'
            WHEN education = 6 THEN 'Unknown'
            WHEN education = 0 THEN 'Unknown'
            ELSE NULL
            END;

SELECT		DISTINCT education_label
FROM		stg_uci_credit_card;

-- [7.5] Marriage Standardization
SELECT		DISTINCT marriage
FROM		stg_uci_credit_card;

ALTER TABLE		stg_uci_credit_card
ADD COLUMN		marriage_label VARCHAR(13);

UPDATE			stg_uci_credit_card
SET				marriage_label =
CASE			
				WHEN marriage = 1 THEN 'Married'
                WHEN marriage = 2 THEN 'Single'
                WHEN marriage = 3 THEN 'Others'
                WHEN marriage = 0 THEN 'Others'
                ELSE NULL
                END;
                
SELECT	DISTINCT marriage_label
FROM		stg_uci_credit_card;

-- [7.6] Validation and Standardization Fraud Credit card
-- [7.7] Validation of none numeric time_sec and amount
SELECT		
			SUM(time_sec < 0) AS time_sec_check,
            SUM(amount < 0) AS amount_check
FROM		stg_creditcard;

-- [7.8] default.payment.next.month Standardization
SELECT		DISTINCT default_payment_next_month
FROM		stg_uci_credit_card;
	
ALTER TABLE     stg_uci_credit_card
ADD COLUMN		default_payment_next_month_label VARCHAR(10);

UPDATE			stg_uci_credit_card
SET				default_payment_next_month_label = 
CASE			
				WHEN default_payment_next_month = 1 THEN 'Yes'
                WHEN default_payment_next_month = 0 THEN 'No'
                ELSE NULL
                END;

SELECT			DISTINCT default_payment_next_month_label
FROM			stg_uci_credit_card;

-- [7.9] Standardization Class
SELECT		DISTINCT class
FROM		stg_creditcard;

ALTER TABLE		stg_creditcard
ADD COLUMN		class_label VARCHAR(30);

UPDATE			stg_creditcard
SET				class_label =
CASE		
		WHEN class = 0 THEN 'Legitimate transaction'
        WHEN class = 1 THEN 'Fraudulent transaction'
        ELSE NULL
        END;
SELECT			DISTINCT class_label
FROM			stg_creditcard;        

-- 8) CORE LAYER (clean, ready for analysis)
-- ---------------------------------------------------------
-- [8.1] Core customers (one row per customer + business-friendly names)
DESCRIBE	stg_uci_credit_card;	
DROP TABLE IF EXISTS core_customers;
CREATE TABLE core_customers AS
SELECT
			CAST(id AS UNSIGNED) 					AS customer_id,
            CAST(limit_bal AS DECIMAL(12, 2))   	AS credit_limit,
            
            -- We are keeping both columns code+label
            CAST(sex AS UNSIGNED)					AS sex_code,
            NULLIF(TRIM(sex_label), '')				AS sex,
            
            CAST(education AS UNSIGNED)				AS education_code,
            NULLIF(TRIM(education_label), '')		AS	education_level,
            
            CAST(marriage AS UNSIGNED)				AS marriage_code,
            NULLIF(TRIM(marriage_label), '')		AS marital_status,
            
            CAST(age AS UNSIGNED)					AS age,
            
            -- Repayment Status
            CAST(pay_0 AS SIGNED)					AS pay_0,
            CAST(pay_2 AS SIGNED)                        AS pay_2,
			CAST(pay_3 AS SIGNED)                        AS pay_3,
			CAST(pay_4 AS SIGNED)                        AS pay_4,
			CAST(pay_5 AS SIGNED)                        AS pay_5,
			CAST(pay_6 AS SIGNED)                        AS pay_6,

			-- Bills
            CAST(bill_amt1 AS DECIMAL(12,2))			 AS	bill_amt1,
            CAST(bill_amt2 AS DECIMAL(12,2))             AS bill_amt2,
			CAST(bill_amt3 AS DECIMAL(12,2))             AS bill_amt3,
			CAST(bill_amt4 AS DECIMAL(12,2))             AS bill_amt4,
			CAST(bill_amt5 AS DECIMAL(12,2))             AS bill_amt5,
			CAST(bill_amt6 AS DECIMAL(12,2))             AS bill_amt6,

			-- Payment
            CAST(pay_amt1 AS DECIMAL(12,2))              AS pay_amt1,
			CAST(pay_amt2 AS DECIMAL(12,2))              AS pay_amt2,
			CAST(pay_amt3 AS DECIMAL(12,2))              AS pay_amt3,
			CAST(pay_amt4 AS DECIMAL(12,2))              AS pay_amt4,
			CAST(pay_amt5 AS DECIMAL(12,2))              AS pay_amt5,
			CAST(pay_amt6 AS DECIMAL(12,2))              AS pay_amt6,

			-- Target
            CAST(default_payment_next_month AS UNSIGNED) AS default_payment_next_month_code,
			NULLIF(TRIM(default_payment_next_month_label), '') AS default_payment_next_month

FROM		stg_uci_credit_card
WHERE		1=1
			AND			id IS NOT NULL
			AND			default_payment_next_month IN (0, 1);

SELECT		*
FROM		core_customers;

-- [8.2] Core transactions (one row per transaction + keep all features)
DESCRIBE		stg_creditcard;
DROP TABLE IF EXISTS core_transaction;
DROP TABLE IF EXISTS core_transaction;
CREATE TABLE core_transaction AS
SELECT
    time_sec,
    v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
    v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
    v21, v22, v23, v24, v25, v26, v27, v28,
    amount,
    class_code,
    class
FROM (
    SELECT
        CAST(time_sec AS UNSIGNED)        AS time_sec,
        CAST(v1 AS DOUBLE)               AS v1,
        CAST(v2 AS DOUBLE)               AS v2,
        CAST(v3 AS DOUBLE)               AS v3,
        CAST(v4 AS DOUBLE)               AS v4,
        CAST(v5 AS DOUBLE)               AS v5,
        CAST(v6 AS DOUBLE)               AS v6,
        CAST(v7 AS DOUBLE)               AS v7,
        CAST(v8 AS DOUBLE)               AS v8,
        CAST(v9 AS DOUBLE)               AS v9,
        CAST(v10 AS DOUBLE)              AS v10,
        CAST(v11 AS DOUBLE)              AS v11,
        CAST(v12 AS DOUBLE)              AS v12,
        CAST(v13 AS DOUBLE)              AS v13,
        CAST(v14 AS DOUBLE)              AS v14,
        CAST(v15 AS DOUBLE)              AS v15,
        CAST(v16 AS DOUBLE)              AS v16,
        CAST(v17 AS DOUBLE)              AS v17,
        CAST(v18 AS DOUBLE)              AS v18,
        CAST(v19 AS DOUBLE)              AS v19,
        CAST(v20 AS DOUBLE)              AS v20,
        CAST(v21 AS DOUBLE)              AS v21,
        CAST(v22 AS DOUBLE)              AS v22,
        CAST(v23 AS DOUBLE)              AS v23,
        CAST(v24 AS DOUBLE)              AS v24,
        CAST(v25 AS DOUBLE)              AS v25,
        CAST(v26 AS DOUBLE)              AS v26,
        CAST(v27 AS DOUBLE)              AS v27,
        CAST(v28 AS DOUBLE)              AS v28,

        CAST(amount AS DECIMAL(12,2))    AS amount,

        CAST(class AS UNSIGNED)          AS class_code,
        NULLIF(TRIM(class_label), '')    AS class,

        ROW_NUMBER() OVER (
            PARTITION BY
                CAST(time_sec AS UNSIGNED),
                CAST(v1 AS DOUBLE), CAST(v2 AS DOUBLE), CAST(v3 AS DOUBLE), CAST(v4 AS DOUBLE),
                CAST(v5 AS DOUBLE), CAST(v6 AS DOUBLE), CAST(v7 AS DOUBLE), CAST(v8 AS DOUBLE),
                CAST(v9 AS DOUBLE), CAST(v10 AS DOUBLE), CAST(v11 AS DOUBLE), CAST(v12 AS DOUBLE),
                CAST(v13 AS DOUBLE), CAST(v14 AS DOUBLE), CAST(v15 AS DOUBLE), CAST(v16 AS DOUBLE),
                CAST(v17 AS DOUBLE), CAST(v18 AS DOUBLE), CAST(v19 AS DOUBLE), CAST(v20 AS DOUBLE),
                CAST(v21 AS DOUBLE), CAST(v22 AS DOUBLE), CAST(v23 AS DOUBLE), CAST(v24 AS DOUBLE),
                CAST(v25 AS DOUBLE), CAST(v26 AS DOUBLE), CAST(v27 AS DOUBLE), CAST(v28 AS DOUBLE),
                CAST(amount AS DECIMAL(12,2)),
                CAST(class AS UNSIGNED),
                NULLIF(TRIM(class_label), '')
            ORDER BY time_sec
        ) AS rn
    FROM stg_creditcard
    WHERE class IN (0,1)
) t
WHERE rn = 1;

-- 9)INDEXES
-- ---------------------------------------------------------
-- [9.1] Customers: common filters
CREATE INDEX idx_core_customers_default_flag 
ON core_customers (default_payment_next_month);

CREATE INDEX idx_core_customers_credit_limit 
ON core_customers (credit_limit);

-- [9.2] Transactions: common filters
CREATE INDEX idx_core_transaction_fraud_flag 
ON core_transaction (class);

CREATE INDEX idx_core_transaction_time_sec  
ON core_transaction (time_sec);

CREATE INDEX idx_core_transaction_amount    
ON core_transaction (amount);

-- 10) VALIDATION
-- ---------------------------------------------------------
-- [10.1] Validate core_customers
--     - Row counts vs staging_core_customers
SELECT		'stg_uci_credit_card' AS table_name,
			COUNT(*) AS row_count
FROM		stg_uci_credit_card
UNION ALL
SELECT		'core_customers',
			COUNT(*)
FROM		core_customers;

-- [10.2] Row counts vs staging_core_transaction
SELECT		'stg_credit_card' AS table_name,
			COUNT(*) AS row_counts
FROM		stg_creditcard
UNION ALL
SELECT		'core_creditcard',
			COUNT(*)
FROM		core_transaction;

-- 11) Exporting the clean datasets to csv format
-- [11.1] Exporting the core_customers
SELECT		'customer_id', 'credit_limit', 'sex_code', 'sex', 'education_code', 'education_level',
			'marriage_code', 'marital_status', 'age', 'pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5',	
            'pay_6', 'bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6', 
            'pay_amt1', 'pay_amt2', 'pay_amt3', 'pay_amt4', 'pay_amt5', 'pay_amt6', 
            'default_payment_next_month_code', 'default_payment_next_month'
UNION ALL
SELECT		customer_id, credit_limit, sex_code, sex, education_code, education_level, marriage_code, 
			marital_status, age, pay_0, pay_2, pay_3, pay_4, pay_5, pay_6, bill_amt1, bill_amt2, bill_amt3, 
            bill_amt4, bill_amt5, bill_amt6, pay_amt1, pay_amt2, pay_amt3, pay_amt4, pay_amt5, pay_amt6, 
            default_payment_next_month_code, default_payment_next_month            
FROM		core_customers
INTO OUTFILE 'C:/ProgramData/MySQL/MySQL Server 9.5/Uploads/cleaned_UCI_credit_card.xlsx'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';

-- [11.1] Exporting the core_transaction
SELECT		'time_sec',	'v1', 'v2', 'v3', 'v4',	'v5', 'v6',	'v7', 'v8', 'v9', 'v10', 'v11', 'v12',
			'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19', 'v20', 'v21', 'v22', 'v23', 'v24',	'v25',
            'v26', 'v27', 'v28', 'amount', 'class_code','class'
UNION ALL
SELECT		time_sec, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
			v20, v21, v22, v23, v24, v25, v26, v27, v28, amount, class_code, class
FROM		core_transaction
INTO OUTFILE 'C:/ProgramData/MySQL/MySQL Server 9.5/Uploads/cleaned_credit_card_transactions.csv'
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n';