-- normalize_url(in_url): return a canonical form of a URL so that equivalent
-- URLs compare equal.
--
-- Normalization steps:
--   * trim surrounding spaces; NULL or empty input yields NULL
--   * drop the fragment (#...)
--   * assume "http://" when the input has no scheme
--   * lowercase scheme and host, strip a leading "www."
--   * drop the default port (:80 for http, :443 for https)
--   * collapse repeated "/" in the path and strip the trailing "/" (the root
--     path is always rendered as "/")
--   * remove common tracking parameters (utm_*, gclid, fbclid, mc_cid, mc_eid,
--     ref, ref_src, yclid, igshid) and empty parameters from the query string
--
-- The path and the query string are normalized separately, so slashes inside
-- query values (e.g. ?next=http://other.example//x/) are left untouched.
--
-- Parameters:
--   in_url  URL to normalize; may be NULL.
-- Returns:
--   The normalized URL, or NULL when in_url is NULL or blank.
--
-- Declared IMMUTABLE because the result depends only on the argument, which
-- allows use in expression indexes and generated columns.
CREATE OR REPLACE FUNCTION normalize_url(in_url text)
RETURNS text
LANGUAGE plpgsql
IMMUTABLE
AS $$
DECLARE
    u           text := trim(in_url);
    scheme_host text;   -- scheme + authority, e.g. https://example.com:443
    rest        text;   -- everything after the authority (path and query)
    path_part   text;
    query_part  text;   -- query string without the leading '?'
    q_pos       integer;
BEGIN
    IF u IS NULL OR u = '' THEN
        RETURN NULL;
    END IF;

    -- Drop the fragment.
    u := regexp_replace(u, '#.*$', '');

    -- Split scheme+authority from path+query. The scheme is matched in any
    -- letter case, and the authority ends at the first '/' OR '?', so a URL
    -- such as https://example.com?x=1 does not leak its query into the host.
    -- e.g. https://example.com:443/foo?bar
    --        -> scheme_host = https://example.com:443 ; rest = /foo?bar
    scheme_host := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/?]*');
    IF scheme_host IS NULL THEN
        -- No scheme present: assume http.
        u := 'http://' || u;
        scheme_host := substring(u FROM '^[A-Za-z][A-Za-z0-9+.-]*://[^/?]*');
    END IF;
    rest := substr(u, length(scheme_host) + 1);

    -- Split path from query at the first '?'.
    q_pos := position('?' IN rest);
    IF q_pos > 0 THEN
        path_part  := left(rest, q_pos - 1);
        query_part := substr(rest, q_pos + 1);
    ELSE
        path_part  := rest;
        query_part := '';
    END IF;

    -- Normalize scheme and host: lowercase, strip "www." and default ports.
    -- NOTE(review): lower() is applied to the whole authority, so any
    -- userinfo (user:pass@) is lowercased too, as in the previous version.
    scheme_host := lower(scheme_host);
    scheme_host := regexp_replace(scheme_host, '^(https?://)www\.', '\1');
    scheme_host := regexp_replace(scheme_host, '^http://([^/:]+):80$', 'http://\1');
    scheme_host := regexp_replace(scheme_host, '^https://([^/:]+):443$', 'https://\1');

    -- Normalize the path only: collapse repeated slashes, strip trailing
    -- slashes, and render an empty path as the root.
    path_part := regexp_replace(path_part, '/{2,}', '/', 'g');
    path_part := regexp_replace(path_part, '/+$', '');
    IF path_part = '' THEN
        path_part := '/';
    END IF;

    -- Rebuild the query keeping the original parameter order, dropping empty
    -- parameters (left by "&&", "?&" or a trailing "&") and tracking
    -- parameters. A parameter counts as a tracker only when its name is
    -- followed by '=', matching the previous behaviour.
    query_part := (
        SELECT COALESCE(string_agg(p.param, '&' ORDER BY p.ord), '')
        FROM unnest(string_to_array(query_part, '&'))
             WITH ORDINALITY AS p(param, ord)
        WHERE p.param <> ''
          AND p.param !~* '^(utm_[^=]+|gclid|fbclid|mc_cid|mc_eid|ref|ref_src|yclid|igshid)='
    );

    IF query_part <> '' THEN
        RETURN scheme_host || path_part || '?' || query_part;
    END IF;

    RETURN scheme_host || path_part;
END;
$$;