OpenSource_ETL_Project/kit_eda
2025-08-23 01:09:59 -06:00

134 lines
4.6 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* ==========================================================
KIT EDA POSTGRESQL 15 QUERIES CLAVE
Autor: NeoAnd
📌 Instrucciones rápidas:
-- Ejecuta:
SELECT run_eda_auto_export('public','clientes');
-- Luego en DBeaver:
-- Busca en el navegador las tablas temporales que empiezan con "tmp_"
-- Ej: tmp_total_rows, tmp_nulls, tmp_top10_micolumna, tmp_stats_micolumna, tmp_time_midate, tmp_corr
Ejecuta por bloque (Cmd+Enter en Mac, Ctrl+Enter en Windows).
========================================================== */
CREATE OR REPLACE FUNCTION run_eda_auto_export(p_schema TEXT, p_table TEXT)
RETURNS void AS
DECLARE
r RECORD;
col_num TEXT[];
col_cat TEXT[];
col_date TEXT[];
sql_cmd TEXT;
BEGIN
-- 🔍 Clasificación automática de columnas
SELECT array_agg(column_name) FILTER (WHERE data_type LIKE '%int%' OR data_type LIKE '%numeric%' OR data_type LIKE '%double%')
INTO col_num
FROM information_schema.columns
WHERE table_schema = p_schema AND table_name = p_table;
SELECT array_agg(column_name) FILTER (WHERE data_type IN ('character varying','text','uuid'))
INTO col_cat
FROM information_schema.columns
WHERE table_schema = p_schema AND table_name = p_table;
SELECT array_agg(column_name) FILTER (WHERE data_type LIKE '%date%' OR data_type LIKE '%timestamp%')
INTO col_date
FROM information_schema.columns
WHERE table_schema = p_schema AND table_name = p_table;
-- 📊 1. Conteo total
EXECUTE format('DROP TABLE IF EXISTS tmp_total_rows;
CREATE TEMP TABLE tmp_total_rows AS
SELECT COUNT(*) AS total FROM %I.%I',
p_schema, p_table);
-- 📊 2. Nulos por columna
sql_cmd := '';
FOR r IN
SELECT column_name
FROM information_schema.columns
WHERE table_schema = p_schema AND table_name = p_table
LOOP
sql_cmd := sql_cmd || format(
'SELECT %L AS columna, COUNT(*) AS nulls FROM %I.%I WHERE %I IS NULL UNION ALL ',
r.column_name, p_schema, p_table, r.column_name
);
END LOOP;
sql_cmd := left(sql_cmd, length(sql_cmd)-11); -- quitar último UNION ALL
EXECUTE format('DROP TABLE IF EXISTS tmp_nulls;
CREATE TEMP TABLE tmp_nulls AS %s', sql_cmd);
-- 📊 3. Top 10 valores más frecuentes por columna categórica
IF col_cat IS NOT NULL THEN
FOREACH r IN ARRAY col_cat LOOP
EXECUTE format(
'DROP TABLE IF EXISTS tmp_top10_%I;
CREATE TEMP TABLE tmp_top10_%I AS
SELECT %I, COUNT(*) AS freq
FROM %I.%I
GROUP BY %I
ORDER BY freq DESC
LIMIT 10',
r, r, r, p_schema, p_table, r
);
END LOOP;
END IF;
-- 📊 4. Estadísticas descriptivas de columnas numéricas
IF col_num IS NOT NULL THEN
FOREACH r IN ARRAY col_num LOOP
EXECUTE format(
'DROP TABLE IF EXISTS tmp_stats_%I;
CREATE TEMP TABLE tmp_stats_%I AS
SELECT MIN(%I) AS minimo,
MAX(%I) AS maximo,
AVG(%I) AS media,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY %I) AS mediana
FROM %I.%I',
r, r, r, r, r, r, r, p_schema, p_table
);
END LOOP;
END IF;
-- 📊 5. Distribución temporal por columnas de fecha
IF col_date IS NOT NULL THEN
FOREACH r IN ARRAY col_date LOOP
EXECUTE format(
'DROP TABLE IF EXISTS tmp_time_%I;
CREATE TEMP TABLE tmp_time_%I AS
SELECT date_trunc(''day'', %I) AS dia, COUNT(*)
FROM %I.%I
GROUP BY dia
ORDER BY dia',
r, r, r, p_schema, p_table
);
END LOOP;
END IF;
-- 📊 6. Correlaciones numéricas
IF array_length(col_num,1) >= 2 THEN
EXECUTE 'DROP TABLE IF EXISTS tmp_corr;
CREATE TEMP TABLE tmp_corr (col_x TEXT, col_y TEXT, correlacion NUMERIC)';
FOR i IN 1..array_length(col_num,1)-1 LOOP
FOR j IN i+1..array_length(col_num,1) LOOP
EXECUTE format(
'INSERT INTO tmp_corr
SELECT %L, %L, corr(%I,%I)
FROM %I.%I',
col_num[i], col_num[j], col_num[i], col_num[j], p_schema, p_table
);
END LOOP;
END LOOP;
END IF;
RAISE NOTICE '✅ Tablas temporales creadas. Revísalas en el navegador de DBeaver (prefijo tmp_)';
END;
LANGUAGE plpgsql;