-- File: rss/migrations/001_utils_normalize_url.sql (PL/pgSQL)

-- normalize_url(in_url text) -> text
--
-- Returns a canonical form of a URL so that equivalent links compare equal:
--   * surrounding whitespace (spaces, tabs, newlines) and the #fragment are dropped;
--   * a missing scheme defaults to http://;
--   * scheme and host are lower-cased, a leading "www." is removed and the
--     default port (:80 for http, :443 for https) is removed;
--   * common tracking parameters (utm_*, gclid, fbclid, mc_cid, mc_eid, ref,
--     ref_src, yclid, igshid) are removed from the query string;
--   * repeated "/" in the path are collapsed and a trailing "/" is removed
--     unless the path is the root.
-- Path and query keep their original letter case; the query string is never
-- touched by the slash handling, so URLs embedded in parameters survive intact.
-- Returns NULL for NULL, empty or whitespace-only input.
CREATE OR REPLACE FUNCTION normalize_url(in_url text)
RETURNS text
LANGUAGE plpgsql
-- The result depends only on the argument, so the function can safely be used
-- in expression indexes and is eligible for planner constant-folding.
IMMUTABLE
AS $$
DECLARE
    u           text := btrim(in_url, E' \t\r\n');
    scheme_host text;
    path_q      text;   -- everything after scheme+host: path plus optional ?query
    path_part   text;
    query_part  text;
    qpos        integer;
BEGIN
    IF u IS NULL OR u = '' THEN
        RETURN NULL;
    END IF;

    -- Drop the fragment (#...).
    u := regexp_replace(u, '#.*$', '');

    -- If there is no scheme, assume http. The check is case-insensitive so that
    -- "HTTP://..." is recognised as already having a scheme.
    IF u !~* '^[a-z][a-z0-9+.-]*://' THEN
        u := 'http://' || u;
    END IF;

    -- Split scheme+host from path+query.
    -- e.g. https://example.com:443/foo?bar -> scheme_host = https://example.com:443
    --                                         path_q      = /foo?bar
    -- The host stops at "/" or "?" so that "http://host?x=1" is split correctly.
    scheme_host := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/?]*');
    path_q      := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/?]*(.*)$');

    -- Normalise scheme and host: lower-case, strip "www.", strip default port.
    scheme_host := lower(scheme_host);
    scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1');
    scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1');
    scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1');

    -- Separate path and query so each is cleaned with its own rules.
    qpos := position('?' IN path_q);
    IF qpos > 0 THEN
        path_part  := left(path_q, qpos - 1);
        query_part := substr(path_q, qpos + 1);
    ELSE
        path_part  := path_q;
        query_part := '';
    END IF;

    -- Path: collapse repeated slashes, default to the root, and drop the
    -- trailing slash unless the path is the root itself.
    path_part := regexp_replace(path_part, '/{2,}', '/', 'g');
    IF path_part = '' THEN
        path_part := '/';
    ELSIF path_part <> '/' THEN
        path_part := regexp_replace(path_part, '/+$', '');
    END IF;

    -- Query: remove tracking parameters. A leading "&" is prepended so every
    -- parameter, including the first, is uniformly introduced by "&"; each match
    -- consumes its own separator, which lets adjacent trackers be removed too.
    query_part := regexp_replace(
        '&' || query_part,
        '&(utm_[^=&]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)=[^&]*',
        '',
        'gi'
    );
    -- Tidy leftover separators: collapse "&&" runs and strip leading/trailing "&".
    query_part := regexp_replace(query_part, '&{2,}', '&', 'g');
    query_part := btrim(query_part, '&');

    IF query_part <> '' THEN
        RETURN scheme_host || path_part || '?' || query_part;
    END IF;
    RETURN scheme_host || path_part;
END;
$$;